/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <cassert>
#include <concepts>
#include <cstdint>
#include <latch>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include <fmt/format.h>
#include <fmt/ranges.h>
#include <folly/String.h>
#include <folly/container/F14Map.h>
#include <nlohmann/json.hpp>
#include <range/v3/view/drop.hpp>
#include <dwarfs/checksum.h>
#include <dwarfs/error.h>
#include <dwarfs/logger.h>
#include <dwarfs/mmif.h>
#include <dwarfs/os_access.h>
#include <dwarfs/util.h>
#include <dwarfs/internal/worker_group.h>
#include <dwarfs/writer/internal/entry.h>
#include <dwarfs/writer/internal/file_scanner.h>
#include <dwarfs/writer/internal/inode.h>
#include <dwarfs/writer/internal/inode_manager.h>
#include <dwarfs/writer/internal/progress.h>
namespace dwarfs::writer::internal {
using namespace dwarfs::internal;
namespace {
constexpr size_t const kLargeFileThreshold = 1024 * 1024;
constexpr size_t const kLargeFileStartHashSize = 4096;
} // namespace
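// file_scanner_ is the LoggerPolicy-specific implementation behind
// file_scanner::impl. It is instantiated for all logger policies via
// make_unique_logging_object in the file_scanner constructor at the
// bottom of this file.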
template <typename LoggerPolicy>
class file_scanner_ final : public file_scanner::impl {
public:
file_scanner_(logger& lgr, worker_group& wg, os_access const& os,
inode_manager& im, progress& prog,
file_scanner::options const& opts);
void scan(file* p) override;
void finalize(uint32_t& inode_num) override;
uint32_t num_unique() const override { return num_unique_; }
void dump(std::ostream& os) const override;
private:
void scan_dedupe(file* p);
void hash_file(file* p);
void add_inode(file* p, int lineno);
  template <typename Lookup>
void finalize_hardlinks(Lookup&& lookup);
  template <bool UniqueOnly = false, typename KeyType>
  void finalize_files(folly::F14FastMap<KeyType, inode::files_vector>& fmap,
                      uint32_t& inode_num, uint32_t& obj_num);
  template <bool Unique, typename KeyType>
  void
  finalize_inodes(std::vector<std::pair<KeyType, inode::files_vector>>& ent,
                  uint32_t& inode_num, uint32_t& obj_num);
  template <typename T>
std::string format_key(T const& key) const {
return fmt::format("{}", key);
}
  template <typename T>
  std::string format_key(T const* key) const {
    return fmt::format("{}", reinterpret_cast<void const*>(key));
  }
std::string format_key(std::string_view key) const {
return fmt::format("{}", folly::hexlify(key));
}
void dump_value(std::ostream& os, std::integral auto val) const {
os << fmt::format("{}", val);
}
  void dump_value(std::ostream& os, std::shared_ptr<std::latch> const&) const {
os << "null";
}
void dump_value(std::ostream& os, file const* p) const;
void dump_value(std::ostream& os, inode::files_vector const& vec) const;
void dump_inodes(std::ostream& os) const;
void dump_inode_create_info(std::ostream& os) const;
  template <typename T>
void dump_map(std::ostream& os, std::string_view name, T const& map) const;
LOG_PROXY_DECL(LoggerPolicy);
worker_group& wg_;
os_access const& os_;
inode_manager& im_;
progress& prog_;
file_scanner::options const opts_;
uint32_t num_unique_{0};
  folly::F14FastMap<uint64_t, inode::files_vector> hardlinks_;
std::mutex mutable mx_;
  // The pair stores the file size and optionally a hash of the first
  // 4 KiB of the file. If there's a collision, the worst that can
  // happen is that we unnecessarily hash a file that is not a duplicate.
  folly::F14FastMap<std::pair<uint64_t, uint64_t>, inode::files_vector>
      unique_size_;
// We need this lookup table to later find the unique_size_ entry
// given just a file pointer.
  folly::F14FastMap<file const*, uint64_t> file_start_hash_;
  folly::F14FastMap<std::pair<uint64_t, uint64_t>, std::shared_ptr<std::latch>>
      first_file_hashed_;
  folly::F14FastMap<uint64_t, inode::files_vector> by_raw_inode_;
  folly::F14FastMap<std::string_view, inode::files_vector> by_hash_;
struct inode_create_info {
inode const* i;
file const* f;
int line;
};
  std::vector<inode_create_info> debug_inode_create_;
};
// The `unique_size_` table holds an entry for each file size we
// discover, and optionally - for large files - an XXH3 hash of the
// first 4 KiB of the file.
//
// - When we first discover a new file size (+hash), we know for
// sure that this file is *not* a duplicate of a file we've seen
// before. Thus, we can immediately create a new inode, and we can
// immediately start similarity scanning for this inode.
//
// - When we discover the second file of a particular size (+hash), we
// must fully hash both files (using the user-provided algorithm)
// to see if they're identical. We already have an inode for the
// first file, so we must delay the creation of a new inode until
// we know that the second file is not a duplicate.
//
// - Exactly the same applies for subsequent files.
//
// - We must ensure that `by_hash_` is only consulted for subsequent
//   files once the first file's hash has been computed and stored.
//   Otherwise, if a subsequent file's hash computation finishes
//   before the first file's, we would wrongly conclude that the
//   subsequent file is not a duplicate.
//
// - So subsequent files must wait for the first file unless we
// know up front that the first file's hash has already been
// stored. As long as the first file's hash has not been stored,
// it is still present in `unique_size_`. It will be removed
// from `unique_size_` after its hash has been stored.
//
// - The optional hash value of the first 4 KiB of a large file is
// useful if there are a lot of large files with the same size.
// One potential scenario is uncompressed images which are very
// likely to have the same size, but very unlikely to have the
//   same contents. The choice of 4 KiB is arbitrary, as is the
//   threshold of 1 MiB for "large files". The 4 KiB hash is computed
//   synchronously, so it could become a bottleneck; however, this
//   should happen rarely enough not to be a problem.
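//
// A rough usage sketch (hypothetical caller code; the actual driver
// lives elsewhere in dwarfs):
//
//   file_scanner fs(lgr, wg, os, im, prog, opts);
//   for (file* f : files) {
//     fs.scan(f); // must only be called from a single thread
//   }
//   wg.wait(); // wait for all background hashing/scanning jobs
//   uint32_t inode_num = 0;
//   fs.finalize(inode_num);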
template <typename LoggerPolicy>
file_scanner_<LoggerPolicy>::file_scanner_(logger& lgr, worker_group& wg,
                                           os_access const& os,
                                           inode_manager& im, progress& prog,
                                           file_scanner::options const& opts)
: LOG_PROXY_INIT(lgr)
, wg_(wg)
, os_(os)
, im_(im)
, prog_(prog)
, opts_{opts} {}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::scan(file* p) {
// This method is supposed to be called from a single thread only.
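  // Hard links are tracked by raw inode number. Only the first file of a
  // group is scanned; all subsequent files just link to it and are
  // resolved later in finalize_hardlinks().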
if (p->num_hard_links() > 1) {
auto& vec = hardlinks_[p->raw_inode_num()];
vec.push_back(p);
if (vec.size() > 1) {
p->hardlink(vec[0], prog_);
++prog_.files_scanned;
return;
}
}
p->create_data();
prog_.original_size += p->size();
if (opts_.hash_algo) {
scan_dedupe(p);
} else {
prog_.current.store(p);
p->scan(nullptr, prog_, opts_.hash_algo); // TODO
by_raw_inode_[p->raw_inode_num()].push_back(p);
add_inode(p, __LINE__);
}
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::finalize(uint32_t& inode_num) {
uint32_t obj_num = 0;
assert(first_file_hashed_.empty());
if (opts_.hash_algo) {
finalize_hardlinks([this](file const* p) -> inode::files_vector& {
if (auto it = by_hash_.find(p->hash()); it != by_hash_.end()) {
return it->second;
}
auto const size = p->size();
uint64_t hash{0};
if (size >= kLargeFileThreshold) [[unlikely]] {
auto it = file_start_hash_.find(p);
DWARFS_CHECK(it != file_start_hash_.end(),
"internal error: missing start hash for large file");
hash = it->second;
}
return unique_size_.at({size, hash});
});
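    // The call order below matters: inode and object numbers are assigned
    // in this order, so unique files come first, then files without a hash
    // (invalid files), then hashed duplicates.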
    finalize_files<true>(unique_size_, inode_num, obj_num);
finalize_files(by_raw_inode_, inode_num, obj_num);
finalize_files(by_hash_, inode_num, obj_num);
} else {
finalize_hardlinks([this](file const* p) -> inode::files_vector& {
return by_raw_inode_.at(p->raw_inode_num());
});
finalize_files(by_raw_inode_, inode_num, obj_num);
}
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::scan_dedupe(file* p) {
// We need no lock yet, as `unique_size_` is only manipulated from
// this thread.
uint64_t size = p->size();
uint64_t start_hash{0};
LOG_TRACE << "scanning file " << p->path_as_string() << " [size=" << size
<< "]";
if (size >= kLargeFileThreshold) {
if (!p->is_invalid()) {
try {
auto mm = os_.map_file(p->fs_path(), kLargeFileStartHashSize);
checksum cs(checksum::algorithm::XXH3_64);
cs.update(mm->addr(), kLargeFileStartHashSize);
cs.finalize(&start_hash);
} catch (...) {
LOG_ERROR << "failed to map file " << p->path_as_string() << ": "
<< exception_str(std::current_exception())
<< ", creating empty file";
++prog_.errors;
p->set_invalid();
}
}
file_start_hash_.emplace(p, start_hash);
}
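  // For files below the large file threshold (and for invalid files),
  // start_hash stays 0, so the key effectively degenerates to the size.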
auto const unique_key = std::make_pair(size, start_hash);
auto [it, is_new] = unique_size_.emplace(unique_key, inode::files_vector());
if (is_new) {
// A file (size, start_hash) that has never been seen before. We can safely
// create a new inode and we'll keep track of the file.
it->second.push_back(p);
{
std::lock_guard lock(mx_);
add_inode(p, __LINE__);
}
} else {
// This file (size, start_hash) has been seen before, so this is potentially
// a duplicate.
    std::shared_ptr<std::latch> latch;
if (it->second.empty()) {
      // This is the third or any later file of this (size, start_hash).
std::lock_guard lock(mx_);
if (auto ffi = first_file_hashed_.find(unique_key);
ffi != first_file_hashed_.end()) {
latch = ffi->second;
}
} else {
// This is the second file of this (size, start_hash). We now need to
// hash both the first and second file and ensure that the first file's
// hash is stored to `by_hash_` first. We set up a latch to synchronize
// insertion into `by_hash_`.
      latch = std::make_shared<std::latch>(1);
{
std::lock_guard lock(mx_);
DWARFS_CHECK(first_file_hashed_.emplace(unique_key, latch).second,
"internal error: first file hashed latch already exists");
}
// Add a job for the first file
wg_.add_job([this, p = it->second.front(), latch, unique_key] {
hash_file(p);
{
std::lock_guard lock(mx_);
assert(p->get_inode());
if (p->is_invalid()) [[unlikely]] {
by_raw_inode_[p->raw_inode_num()].push_back(p);
} else {
auto& ref = by_hash_[p->hash()];
DWARFS_CHECK(ref.empty(),
"internal error: unexpected existing hash");
ref.push_back(p);
}
latch->count_down();
DWARFS_CHECK(first_file_hashed_.erase(unique_key) > 0,
"internal error: missing first file hashed latch");
}
});
// Clear files vector, but don't delete the hash table entry, to indicate
// that files of this (size, start_hash) *must* be hashed.
it->second.clear();
}
// Add a job for any subsequent files
wg_.add_job([this, p, latch] {
hash_file(p);
if (latch) {
// Wait until the first file of this (size, start_hash) has been added
// to `by_hash_`.
latch->wait();
}
{
std::unique_lock lock(mx_);
if (p->is_invalid()) [[unlikely]] {
add_inode(p, __LINE__);
by_raw_inode_[p->raw_inode_num()].push_back(p);
} else {
auto& ref = by_hash_[p->hash()];
if (ref.empty()) {
// This is *not* a duplicate. We must allocate a new inode.
add_inode(p, __LINE__);
} else {
auto inode = ref.front()->get_inode();
assert(inode);
p->set_inode(inode);
++prog_.files_scanned;
++prog_.duplicate_files;
prog_.saved_by_deduplication += p->size();
}
ref.push_back(p);
}
}
});
}
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::hash_file(file* p) {
if (p->is_invalid()) {
return;
}
auto const size = p->size();
  std::unique_ptr<mmif> mm;
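  // An empty file needs no mapping; scan() below copes with a null mapping.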
if (size > 0) {
try {
mm = os_.map_file(p->fs_path(), size);
} catch (...) {
LOG_ERROR << "failed to map file " << p->path_as_string() << ": "
<< exception_str(std::current_exception())
<< ", creating empty file";
++prog_.errors;
p->set_invalid();
return;
}
}
prog_.current.store(p);
p->scan(mm.get(), prog_, opts_.hash_algo);
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::add_inode(file* p, int lineno) {
assert(!p->get_inode());
auto inode = im_.create_inode();
p->set_inode(inode);
if (opts_.debug_inode_create) {
debug_inode_create_.push_back({inode.get(), p, lineno});
}
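  // Kick off (similarity) scanning of the new inode in the background.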
im_.scan_background(wg_, os_, std::move(inode), p);
}
template <typename LoggerPolicy>
template <typename Lookup>
void file_scanner_<LoggerPolicy>::finalize_hardlinks(Lookup&& lookup) {
auto tv = LOG_TIMED_VERBOSE;
for (auto& kv : hardlinks_) {
auto& hlv = kv.second;
if (hlv.size() > 1) {
auto& fv = lookup(hlv.front());
for (auto p : ranges::views::drop(hlv, 1)) {
p->set_inode(fv.front()->get_inode());
fv.push_back(p);
}
}
}
  auto const num_hardlink_groups = hardlinks_.size();
  hardlinks_.clear();
  tv << "finalized " << num_hardlink_groups << " hardlink groups";
}
template <typename LoggerPolicy>
template <bool UniqueOnly, typename KeyType>
void file_scanner_<LoggerPolicy>::finalize_files(
    folly::F14FastMap<KeyType, inode::files_vector>& fmap, uint32_t& inode_num,
    uint32_t& obj_num) {
  std::vector<std::pair<KeyType, inode::files_vector>> ent;
auto tv = LOG_TIMED_VERBOSE;
ent.reserve(fmap.size());
fmap.eraseInto(
fmap.begin(), fmap.end(), [&ent](KeyType&& k, inode::files_vector&& fv) {
if (!fv.empty()) {
if constexpr (UniqueOnly) {
DWARFS_CHECK(fv.size() == fv.front()->refcount(), "internal error");
}
ent.emplace_back(std::move(k), std::move(fv));
}
});
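  // Sort by key so that the assignment of inode numbers is deterministic.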
std::sort(ent.begin(), ent.end(),
[](auto& left, auto& right) { return left.first < right.first; });
DWARFS_CHECK(fmap.empty(), "expected file map to be empty");
  finalize_inodes<true>(ent, inode_num, obj_num);
  if constexpr (!UniqueOnly) {
    finalize_inodes<false>(ent, inode_num, obj_num);
  }
tv << "finalized " << ent.size() << (UniqueOnly ? " unique" : "") << " files";
}
template <typename LoggerPolicy>
template <bool Unique, typename KeyType>
void file_scanner_<LoggerPolicy>::finalize_inodes(
    std::vector<std::pair<KeyType, inode::files_vector>>& ent,
    uint32_t& inode_num, uint32_t& obj_num) {
  uint32_t const obj_num_before = obj_num;
auto tv = LOG_TIMED_VERBOSE;
for (auto& p : ent) {
auto& files = p.second;
if constexpr (Unique) {
      DWARFS_CHECK(!files.empty(),
                   fmt::format("internal error in finalize_inodes: empty files "
                               "vector for key {}",
                               format_key(p.first)));
      // If the vector holds more entries than the front file's refcount, it
      // contains more than one distinct file and is therefore not unique;
      // this holds regardless of how the files are ordered.
      if (files.size() > files.front()->refcount()) {
        continue;
      }
++num_unique_;
} else {
if (files.empty()) {
// This is fine: the !Unique version is *always* called after the Unique
// version, which will have moved the unique file vectors.
continue;
}
DWARFS_CHECK(files.size() > 1, "unexpected non-duplicate file");
// needed for reproducibility
std::sort(files.begin(), files.end(), [](file const* a, file const* b) {
return a->less_revpath(*b);
});
}
for (auto fp : files) {
      // need to check because hardlinks share the same inode number
if (!fp->inode_num()) {
fp->set_inode_num(inode_num);
++inode_num;
}
}
auto fp = files.front();
auto inode = fp->get_inode();
assert(inode);
inode->set_num(obj_num);
inode->set_files(std::move(files));
++obj_num;
}
tv << "finalized " << (obj_num - obj_num_before) << (Unique ? " " : " non-")
<< "unique inodes";
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::dump_value(std::ostream& os,
                                             file const* p) const {
  std::shared_ptr<inode> ino = p->get_inode();
auto ino_num = p->inode_num();
os << "{\n"
<< " \"ptr\": \""
<< fmt::format("{}", reinterpret_cast(p)) << "\",\n"
<< " \"path\": " << nlohmann::json{p->path_as_string()}.dump()
<< ",\n"
<< " \"size\": " << fmt::format("{}", p->size()) << ",\n"
<< " \"refcnt\": " << fmt::format("{}", p->refcount()) << ",\n"
<< " \"hash\": \"" << folly::hexlify(p->hash()) << "\",\n"
<< " \"invalid\": " << (p->is_invalid() ? "true" : "false") << ",\n"
<< " \"inode_num\": "
<< (ino_num ? fmt::format("{}", *ino_num) : "null") << ",\n"
<< " \"inode\": \""
<< fmt::format("{}", reinterpret_cast(ino.get())) << "\"\n"
<< " }";
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::dump_value(
    std::ostream& os, inode::files_vector const& vec) const {
os << "[\n";
bool first = true;
for (auto p : vec) {
if (!first) {
os << ",\n";
}
first = false;
os << " ";
dump_value(os, p);
}
os << "\n ]";
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::dump_inodes(std::ostream& os) const {
os << " \"inodes\": [\n";
auto inodes = im_.sortable_span();
inodes.all();
bool first = true;
for (auto const& ino : inodes) {
if (!first) {
os << ",\n";
}
first = false;
os << " {\n"
<< " \"ptr\": \""
<< fmt::format("{}", reinterpret_cast(ino.get())) << "\",\n"
<< " \"files\": ";
dump_value(os, ino->all());
os << "\n }";
}
os << "\n ]";
}
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::dump_inode_create_info(
std::ostream& os) const {
os << " \"inode_create_info\": [\n";
bool first = true;
for (auto const& ici : debug_inode_create_) {
if (!first) {
os << ",\n";
}
first = false;
os << " {\n"
<< " \"inode\": \""
<< fmt::format("{}", reinterpret_cast(ici.i)) << "\",\n"
<< " \"file\": ";
dump_value(os, ici.f);
os << ",\n"
<< " \"line\": " << fmt::format("{}", ici.line) << "\n"
<< " }";
}
os << "\n ]";
}
template <typename LoggerPolicy>
template <typename T>
void file_scanner_<LoggerPolicy>::dump_map(std::ostream& os,
std::string_view name,
T const& map) const {
os << " \"" << name << "\": {\n";
bool first = true;
for (auto const& [k, v] : map) {
if (!first) {
os << ",\n";
}
first = false;
os << " \"" << format_key(k) << "\": ";
dump_value(os, v);
}
os << "\n }";
}
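// Dump the complete scanner state as JSON for debugging purposes.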
template <typename LoggerPolicy>
void file_scanner_<LoggerPolicy>::dump(std::ostream& os) const {
std::lock_guard lock(mx_);
os << "{\n";
dump_map(os, "hardlinks", hardlinks_);
os << ",\n";
dump_map(os, "unique_size", unique_size_);
os << ",\n";
dump_map(os, "file_start_hash", file_start_hash_);
os << ",\n";
dump_map(os, "first_file_hashed", first_file_hashed_);
os << ",\n";
dump_map(os, "by_raw_inode", by_raw_inode_);
os << ",\n";
dump_map(os, "by_hash", by_hash_);
os << ",\n";
dump_inode_create_info(os);
os << ",\n";
dump_inodes(os);
os << "\n}\n";
}
file_scanner::file_scanner(logger& lgr, worker_group& wg, os_access const& os,
inode_manager& im, progress& prog,
options const& opts)
    : impl_{make_unique_logging_object<impl, file_scanner_, logger_policies>(
          lgr, wg, os, im, prog, opts)} {}
} // namespace dwarfs::writer::internal