/* vim:set ts=2 sw=2 sts=2 et: */ /** * \author Marcus Holland-Moritz (github@mhxnet.de) * \copyright Copyright (c) Marcus Holland-Moritz * * This file is part of dwarfs. * * dwarfs is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * dwarfs is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with dwarfs. If not, see . */ #include #include #include #include #include #include #include namespace dwarfs::internal { class legacy_string_table : public string_table::impl { public: explicit legacy_string_table(string_table::LegacyTableView v) : v_{v} {} std::string lookup(size_t index) const override { return std::string(v_[index]); } std::vector unpack() const override { throw std::runtime_error("cannot unpack legacy string table"); } bool is_packed() const override { return false; } size_t unpacked_size() const override { return std::accumulate(v_.begin(), v_.end(), 0, [](auto n, auto s) { return n + s.size(); }); } private: string_table::LegacyTableView v_; }; template class packed_string_table : public string_table::impl { public: packed_string_table(logger& lgr, [[maybe_unused]] std::string_view name, string_table::PackedTableView v) : v_{v} , buffer_{v_.buffer().data()} { LOG_PROXY(debug_logger_policy, lgr); if constexpr (PackedData) { auto ti = LOG_TIMED_DEBUG; auto st = v_.symtab(); DWARFS_CHECK(st, "symtab unexpectedly unset"); dec_ = std::make_unique(); auto read = fsst_import( dec_.get(), reinterpret_cast(st->data())); if (read != st->size()) { DWARFS_THROW(runtime_error, fmt::format("read {0} symtab bytes, expected {1}", read, st->size())); } ti << "imported dictionary for " << name << " string table"; } if constexpr (PackedIndex) { auto ti = LOG_TIMED_DEBUG; DWARFS_CHECK(v_.packed_index(), "index unexpectedly not packed"); index_.resize(v_.index().size() + 1); std::partial_sum(v_.index().begin(), v_.index().end(), index_.begin() + 1); ti << "unpacked index for " << name << " string table (" << sizeof(index_.front()) * index_.capacity() << " bytes)"; } } std::string lookup(size_t index) const override { auto beg = buffer_; auto end = buffer_; if constexpr (PackedIndex) { beg += index_[index]; end += index_[index + 1]; } else { beg += v_.index()[index]; end += v_.index()[index + 1]; } if constexpr (PackedData) { thread_local std::string out; size_t size = end - beg; out.resize(8 * size); auto outlen = fsst_decompress( dec_.get(), size, reinterpret_cast(beg), out.size(), reinterpret_cast(out.data())); out.resize(outlen); return out; } return std::string(beg, end); } std::vector unpack() const override { std::vector v; auto size = PackedIndex ? index_.size() : v_.index().size(); if (size > 0) { v.reserve(size - 1); for (size_t i = 0; i < size - 1; ++i) { v.emplace_back(lookup(i)); } } return v; } bool is_packed() const override { return true; } size_t unpacked_size() const override { size_t unpacked = 0; auto size = PackedIndex ? index_.size() : v_.index().size(); for (size_t i = 0; i < size - 1; ++i) { unpacked += lookup(i).size(); } return unpacked; } private: string_table::PackedTableView v_; char const* const buffer_; std::vector index_; std::unique_ptr dec_; }; string_table::string_table(LegacyTableView v) : impl_{std::make_unique(v)} {} namespace { std::unique_ptr build_string_table(logger& lgr, std::string_view name, string_table::PackedTableView v) { if (v.symtab()) { if (v.packed_index()) { return std::make_unique>(lgr, name, v); } else { return std::make_unique>(lgr, name, v); } } else { if (v.packed_index()) { return std::make_unique>(lgr, name, v); } else { return std::make_unique>(lgr, name, v); } } } } // namespace string_table::string_table(logger& lgr, std::string_view name, PackedTableView v) : impl_{build_string_table(lgr, name, v)} {} template thrift::metadata::string_table string_table::pack_generic(std::span input, pack_options const& options) { auto size = input.size(); bool pack_data = options.pack_data; size_t total_input_size = 0; std::string buffer; std::string symtab; std::vector out_len_vec; std::vector out_ptr_vec; if (input.empty()) { pack_data = false; } if (pack_data) { std::vector len_vec; std::vector ptr_vec; len_vec.reserve(size); ptr_vec.reserve(size); for (auto const& s : input) { ptr_vec.emplace_back(reinterpret_cast(s.data())); len_vec.emplace_back(s.size()); total_input_size += s.size(); } std::unique_ptr<::fsst_encoder_t, decltype(&::fsst_destroy)> enc{ ::fsst_create(size, len_vec.data(), ptr_vec.data(), 0), &::fsst_destroy}; symtab.resize(sizeof(::fsst_decoder_t)); auto symtab_size = ::fsst_export( enc.get(), reinterpret_cast(symtab.data())); symtab.resize(symtab_size); if (symtab.size() < total_input_size or options.force_pack_data) { out_len_vec.resize(size); out_ptr_vec.resize(size); buffer.resize(options.force_pack_data ? total_input_size : total_input_size - symtab.size()); size_t num_compressed = 0; do { num_compressed = ::fsst_compress( enc.get(), size, len_vec.data(), ptr_vec.data(), buffer.size(), reinterpret_cast(buffer.data()), out_len_vec.data(), out_ptr_vec.data()); if (num_compressed == size) { break; } buffer.resize(2 * buffer.size()); } while (options.force_pack_data); pack_data = num_compressed == size; } else { pack_data = false; } } else { for (auto const& s : input) { total_input_size += s.size(); } } thrift::metadata::string_table output; if (pack_data) { // store compressed size_t compressed_size = (out_ptr_vec.back() - out_ptr_vec.front()) + out_len_vec.back(); DWARFS_CHECK(reinterpret_cast(out_ptr_vec.front()) == buffer.data(), "string table compression pointer mismatch"); // TODO: only enable this in debug mode DWARFS_CHECK(compressed_size == std::accumulate(out_len_vec.begin(), out_len_vec.end(), static_cast(0)), "string table compression pointer mismatch"); buffer.resize(compressed_size); output.buffer()->swap(buffer); output.symtab() = std::move(symtab); output.index()->resize(size); std::copy(out_len_vec.begin(), out_len_vec.end(), output.index()->begin()); } else { // store uncompressed output.buffer()->reserve(total_input_size); output.index()->reserve(size); for (auto const& s : input) { output.buffer().value() += s; output.index()->emplace_back(s.size()); } } output.packed_index() = options.pack_index; if (!options.pack_index) { output.index()->insert(output.index()->begin(), 0); std::partial_sum(output.index()->begin(), output.index()->end(), output.index()->begin()); } return output; } thrift::metadata::string_table string_table::pack(std::span input, pack_options const& options) { return pack_generic(input, options); } thrift::metadata::string_table string_table::pack(std::span input, pack_options const& options) { return pack_generic(input, options); } } // namespace dwarfs::internal