/* vim:set ts=2 sw=2 sts=2 et: */ /** * \author Marcus Holland-Moritz (github@mhxnet.de) * \copyright Copyright (c) Marcus Holland-Moritz * * This file is part of dwarfs. * * dwarfs is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * dwarfs is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with dwarfs. If not, see . */ #include #include #include #include #include namespace dwarfs::writer::internal { /** * Simple locality sensitive hashing function * * There are probably much better LSH algorithms available, but this * one is fast and easy enough to understand. * * It builds histogram of `2**hist_bits` buckets. The buckets are * indexed by the `hist_bits` LSBs of a rolling 32-bit hash, so we * end up inserting a value for every 4-byte substring of the data. * * Assuming that the distribution of these substrings are going to * be similar across similar files, the return value composed of the * indices of the 4 most common buckets. */ class similarity::impl { static constexpr size_t hist_bits = 8; static constexpr uint32_t mask = (1 << hist_bits) - 1; public: impl() { for (size_t i = 0; i < vec_.size(); ++i) { vec_[i].first = 0; vec_[i].second = i; } } void update(uint8_t const* data, size_t size) { for (size_t off = 0; off < size; ++off) { val_ = (val_ << 8) | data[off]; if (size_ + off >= 3) { auto hv = folly::hash::jenkins_rev_mix32(val_); ++vec_[hv & mask].first; } } size_ += size; } uint32_t finalize() { std::partial_sort(vec_.begin(), vec_.begin() + 4, vec_.end(), [](const auto& a, const auto& b) { return a.first > b.first || (a.first == b.first && a.second < b.second); }); return (vec_[0].second << 24) | (vec_[1].second << 16) | (vec_[2].second << 8) | (vec_[3].second << 0); } private: std::array, 1 << hist_bits> vec_; uint32_t val_{0}; size_t size_{0}; }; similarity::similarity() : impl_{std::make_unique()} {} similarity::~similarity() = default; void similarity::update(uint8_t const* data, size_t size) { impl_->update(data, size); } uint32_t similarity::finalize() const { return impl_->finalize(); } } // namespace dwarfs::writer::internal