/* vim:set ts=2 sw=2 sts=2 et: */ /** * \author Marcus Holland-Moritz (github@mhxnet.de) * \copyright Copyright (c) Marcus Holland-Moritz * * This file is part of dwarfs. * * dwarfs is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * dwarfs is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with dwarfs. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace dwarfs::writer::internal { using namespace dwarfs::internal; namespace { // TODO: move out of here class job_tracker { public: explicit job_tracker(folly::Function&& on_jobs_done) : on_jobs_done_{std::move(on_jobs_done)} {} void start_job() { std::lock_guard lock(mx_); ++active_; } void finish_job() { bool all_done = false; { std::lock_guard lock(mx_); assert(active_ > 0); --active_; all_done = active_ == 0; } if (all_done) { on_jobs_done_(); } } private: std::mutex mx_; size_t active_{0}; folly::Function on_jobs_done_; }; template int distance(std::array const& a, std::array const& b) { int d = 0; for (size_t i = 0; i < N; ++i) { d += folly::popcount(a[i] ^ b[i]); } return d; } #ifdef DWARFS_MULTIVERSIONING #ifdef __clang__ __attribute__((target_clones("avx512vpopcntdq", "popcnt", "default"))) #else __attribute__((target_clones("popcnt", "default"))) #endif #endif int distance(std::array const& a, std::array const& b) { return distance(a, b); } template void order_by_shortest_path(size_t count, GetI&& geti, GetK&& getk, Swap&& swapper) { for (size_t i = 0; i < count - 1; ++i) { auto bi = geti(i); int best_distance = std::numeric_limits::max(); size_t best_index = 0; for (size_t k = i + 1; k < count; ++k) { auto bk = getk(k); auto d = distance(*bi, *bk); if (d < best_distance) { best_distance = d; best_index = k; if (best_distance <= 1) { break; } } } if (best_index > 0 && i + 1 != best_index) { swapper(i + 1, best_index); } } } template class basic_centroid { public: static_assert(Bits % (8 * sizeof(BitsType)) == 0); static constexpr size_t const array_size = Bits / (8 * sizeof(BitsType)); using value_type = std::array; using bits_type = folly::Bits; basic_centroid() { std::fill(centroid_.begin(), centroid_.end(), 0); std::fill(bitcounts_.begin(), bitcounts_.end(), 0); } value_type const& value() const { return centroid_; }; void add(value_type const& vec) { ++veccount_; for (size_t bit = 0; bit < Bits; ++bit) { bitcounts_[bit] += bits_type::test(vec.data(), bit) ? 1 : 0; if (bitcounts_[bit] > veccount_ / 2) { bits_type::set(centroid_.data(), bit); } else { bits_type::clear(centroid_.data(), bit); } } } auto distance_to(value_type const& vec) const { return distance(centroid_, vec); } private: value_type centroid_; std::array bitcounts_; CountsType veccount_; }; template struct basic_cluster { using centroid_type = basic_centroid; using index_value_type = IndexValueType; using index_type = std::vector; basic_cluster() = default; explicit basic_cluster(index_type&& index) : index{std::move(index)} {} centroid_type centroid; index_type index; }; template struct basic_cluster_tree_node { using cluster_type = ClusterType; using index_type = typename cluster_type::index_type; using index_value_type = typename cluster_type::index_value_type; using cluster_pointer = std::unique_ptr; using children_vector = std::vector>; basic_cluster_tree_node() : v{std::make_unique()} {} explicit basic_cluster_tree_node(index_type&& index) : v{std::make_unique(std::move(index))} {} children_vector const& children() const { return std::get(v); } children_vector& children() { return std::get(v); } cluster_type const& cluster() const { return *std::get(v); } cluster_type& cluster() { return *std::get(v); } bool is_leaf() const { return std::holds_alternative(v); } std::string description() const { if (is_leaf()) { return fmt::format("{} items", cluster().index.size()); } else { return fmt::format("{} children", children().size()); } } index_value_type first_index() const { if (is_leaf()) { return cluster().index.front(); } return children().front().first_index(); } index_value_type last_index() const { if (is_leaf()) { return cluster().index.back(); } return children().back().last_index(); } std::variant v; }; } // namespace template class similarity_ordering_ final : public similarity_ordering::impl { public: using index_value_type = similarity_ordering::index_value_type; using index_type = std::vector; using duplicates_map = std::unordered_map; using nilsimsa_element_view = basic_array_similarity_element_view<256, uint64_t>; using nilsimsa_cluster = basic_cluster<256, uint64_t, uint32_t, index_value_type>; using nilsimsa_cluster_tree_node = basic_cluster_tree_node; similarity_ordering_(logger& lgr, progress& prog, worker_group& wg, similarity_ordering_options const& opts) : LOG_PROXY_INIT(lgr) , prog_{prog} , wg_{wg} , opts_{opts} {} void order_nilsimsa(nilsimsa_element_view const& ev, receiver rec, std::optional index) const override; private: index_type build_index(similarity_element_view const& ev) const; duplicates_map find_duplicates(similarity_element_view const& ev, index_type& index) const; template size_t total_distance(basic_array_similarity_element_view const& ev, index_type const& index) const; template void order_cluster(basic_array_similarity_element_view const& ev, index_type& index) const; template size_t order_tree_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev) const; template void cluster_by_distance( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, int max_distance) const; template void cluster_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, std::shared_ptr jt, int max_distance) const; template void cluster(basic_cluster_tree_node>& root, basic_array_similarity_element_view const& ev, std::shared_ptr jt) const; template void collect_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, duplicates_map& dup, index_type& ordered, std::string const& indent) const; template void order_impl( receiver&& rec, std::optional idx, basic_array_similarity_element_view const& ev) const; LOG_PROXY_DECL(LoggerPolicy); progress& prog_; worker_group& wg_; similarity_ordering_options const opts_; }; template auto similarity_ordering_::build_index( similarity_element_view const& ev) const -> index_type { index_type index; { auto tt = LOG_TIMED_TRACE; index.reserve(ev.size()); for (index_value_type i = 0; i < ev.size(); ++i) { if (ev.exists(i)) { index.push_back(i); } } index.shrink_to_fit(); tt << opts_.context << "build index: " << ev.size() << " -> " << index.size(); } return index; } template auto similarity_ordering_::find_duplicates( similarity_element_view const& ev, index_type& index) const -> duplicates_map { duplicates_map dm; { auto tt = LOG_TIMED_TRACE; std::sort(index.begin(), index.end(), [&ev](auto a, auto b) { return ev.bitvec_less(a, b); }); tt << opts_.context << "sort index of " << index.size() << " elements"; } { auto tt = LOG_TIMED_TRACE; if (!index.empty()) { auto src = index.begin(); auto dst = src; while (++src != index.end()) { if (ev.bits_equal(*dst, *src)) { dm[*dst].push_back(*src); } else if (++dst != src) { *dst = std::move(*src); } } index.erase(++dst, index.end()); } tt << opts_.context << "find duplicates: " << index.size() << " unique / " << dm.size() << " groups"; } return dm; } template template size_t similarity_ordering_::total_distance( basic_array_similarity_element_view const& ev, index_type const& index) const { size_t td = 0; if (!index.empty()) { auto* prev = &ev.get_bits(index[0]); for (size_t i = 1; i < index.size(); ++i) { auto& curr = ev.get_bits(index[i]); td += distance(*prev, curr); prev = &curr; } } return td; } template template void similarity_ordering_::order_cluster( basic_array_similarity_element_view const& ev, index_type& index) const { using bitvec_type = typename basic_array_similarity_element_view::bitvec_type; if (!index.empty()) { // TODO: try simulated annealing again? reproducibly? std::sort(index.begin(), index.end(), [&ev](auto a, auto b) { return ev.order_less(a, b); }); std::vector bits; bits.reserve(index.size()); for (auto i : index) { bits.push_back(&ev.get_bits(i)); } auto getter = [&bits](auto i) { return bits[i]; }; auto swapper = [&bits, &index](auto i, auto k) { std::swap(bits[i], bits[k]); std::swap(index[i], index[k]); }; order_by_shortest_path(index.size(), getter, getter, swapper); } } template template size_t similarity_ordering_::order_tree_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev) const { using node_type = std::decay_t; using bitvec_type = typename basic_array_similarity_element_view::bitvec_type; if (node.is_leaf()) { auto& cluster = node.cluster(); return std::accumulate( cluster.index.begin(), cluster.index.end(), size_t(0), [&ev](size_t acc, size_t i) { return acc + ev.weight(i); }); } auto& children = node.children(); std::vector< std::tuple> bits; bits.reserve(children.size()); size_t total_weight = 0; for (auto& cn : children) { auto weight = order_tree_rec(cn, ev); bits.emplace_back(&ev.get_bits(cn.first_index()), &ev.get_bits(cn.last_index()), &cn, weight); total_weight += weight; } // all children of this node are ordered now std::stable_sort(bits.begin(), bits.end(), [](auto const& a, auto const& b) { return std::get<3>(a) > std::get<3>(b); }); order_by_shortest_path( bits.size(), [&bits](auto i) { return std::get<1>(bits[i]); }, [&bits](auto k) { return std::get<0>(bits[k]); }, [&bits](auto i, auto k) { std::swap(bits[i], bits[k]); }); std::vector ordered_children; ordered_children.reserve(children.size()); for (auto& b : bits) { ordered_children.emplace_back(std::move(*std::get<2>(b))); } children.swap(ordered_children); return total_weight; } template template void similarity_ordering_::cluster_by_distance( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, int max_distance) const { using node_type = std::decay_t; using cluster_type = typename node_type::cluster_type; typename node_type::children_vector children; auto td = LOG_TIMED_DEBUG; for (auto i : node.cluster().index) { auto const& vec = ev.get_bits(i); cluster_type* match = nullptr; int best_distance = std::numeric_limits::max(); cluster_type* best_match = nullptr; for (auto& c : children) { auto& cluster = c.cluster(); auto d = cluster.centroid.distance_to(vec); if (d <= max_distance) { match = &cluster; break; } else if (d < best_distance) { best_distance = d; best_match = &cluster; } } if (!match) { if (children.size() < opts_.max_children) { auto& nn = children.emplace_back(); match = &nn.cluster(); } else { match = best_match; } } match->centroid.add(vec); match->index.push_back(i); } td << opts_.context << "cluster_by_distance: " << node.cluster().index.size() << " -> " << children.size() << ")"; node.v = std::move(children); } template template void similarity_ordering_::cluster_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, std::shared_ptr jt, int max_distance) const { cluster_by_distance(node, ev, max_distance); for (auto& cn : node.children()) { if (max_distance > 1 && cn.cluster().index.size() > opts_.max_cluster_size) { jt->start_job(); wg_.add_job([this, &cn, &ev, jt, md = max_distance / 2] { cluster_rec(cn, ev, jt, md); jt->finish_job(); }); } else if (cn.cluster().index.size() > 1) { jt->start_job(); wg_.add_job([this, &index = cn.cluster().index, &ev, jt] { order_cluster(ev, index); jt->finish_job(); }); } } } template template void similarity_ordering_::cluster( basic_cluster_tree_node< basic_cluster>& root, basic_array_similarity_element_view const& ev, std::shared_ptr jt) const { jt->start_job(); wg_.add_job([this, &root, &ev, jt] { cluster_rec(root, ev, jt, Bits / 2); jt->finish_job(); }); } template template void similarity_ordering_::collect_rec( basic_cluster_tree_node< basic_cluster>& node, basic_array_similarity_element_view const& ev, duplicates_map& dup, index_type& ordered, std::string const& indent) const { if (node.is_leaf()) { for (auto e : node.cluster().index) { LOG_TRACE << opts_.context << indent << " " << ev.description(e) << " -> " << node.cluster().centroid.distance_to(ev.get_bits(e)); ordered.push_back(e); if (auto it = dup.find(e); it != dup.end()) { auto& dupvec = it->second; std::sort(dupvec.begin(), dupvec.end(), [&ev](auto a, auto b) { return ev.order_less(a, b); }); for (auto i : dupvec) { LOG_TRACE << opts_.context << indent << " + " << ev.description(i) << " -> " << node.cluster().centroid.distance_to(ev.get_bits(i)); ordered.push_back(i); } } } } else { // TODO: order children, probably do this as a separate (parallel) // step before collecting for (auto const& [i, cn] : ranges::views::enumerate(node.children())) { LOG_TRACE << opts_.context << indent << "[" << i << "] " << cn.description(); collect_rec(cn, ev, dup, ordered, indent + " "); } } } template template void similarity_ordering_::order_impl( receiver&& rec, std::optional idx, basic_array_similarity_element_view const& ev) const { index_type index; if (idx) { index = *idx; } else { index = build_index(ev); } LOG_DEBUG << opts_.context << "total distance before ordering: " << total_distance(ev, index); size_t size_hint = index.size(); auto duplicates = find_duplicates(ev, index); auto root = std::make_shared(std::move(index)); auto jt = std::make_shared( [this, size_hint, &ev, rec = std::move(rec), root, dup = std::move(duplicates)]() mutable { { auto tv = LOG_TIMED_VERBOSE; order_tree_rec(*root, ev); tv << opts_.context << "nilsimsa recursive ordering finished"; } index_type rv; rv.reserve(size_hint); collect_rec(*root, ev, dup, rv, ""); LOG_DEBUG << opts_.context << "total distance after ordering: " << total_distance(ev, rv); rec.set_value(std::move(rv)); }); cluster(*root, ev, jt); } template void similarity_ordering_::order_nilsimsa( nilsimsa_element_view const& ev, receiver rec, std::optional index) const { wg_.add_job( [this, rec = std::move(rec), idx = std::move(index), &ev]() mutable { order_impl(std::move(rec), std::move(idx), ev); }); } similarity_ordering::similarity_ordering( logger& lgr, progress& prog, internal::worker_group& wg, similarity_ordering_options const& opts) : impl_(make_unique_logging_object(lgr, prog, wg, opts)) {} } // namespace dwarfs::writer::internal