/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author     Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright  Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
 */

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <exception>
#include <future>
#include <iterator>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
#include <system_error>
#include <tuple>
#include <utility>
#include <vector>

#include <folly/container/EvictingCacheMap.h>
#include <folly/stats/Histogram.h>

#include <range/v3/view/enumerate.hpp>

#include <dwarfs/error.h>
#include <dwarfs/logger.h>
#include <dwarfs/performance_monitor.h>
#include <dwarfs/util.h>

#include <dwarfs/reader/internal/block_cache.h>
#include <dwarfs/reader/internal/inode_reader_v2.h>
#include <dwarfs/reader/internal/offset_cache.h>

namespace dwarfs::reader::internal {

namespace {

/**
 * Offset cache configuration
 *
 * The offset cache is a small cache that improves both random
 * and sequential read speed in large, fragmented files.
 *
 * Due to the way file metadata is organized, accessing a random
 * location inside a file requires iteration over all chunks until
 * the correct offset is found. When sequentially reading a file in
 * multiple requests, this becomes an O(n**2) operation.
 *
 * For files with a small enough number of chunks, performing the
 * linear scan isn't really a problem. For very fragmented files,
 * it can definitely be an issue.
 *
 * The offset cache saves absolute file offsets every
 * `offset_cache_chunk_index_interval` chunks, so it'll only be
 * used for files with at least that many chunks in the first
 * place. The saved offsets can be used to find a nearby chunk
 * using binary search instead of a linear scan. From that chunk,
 * the requested offset can be found using a linear scan.
 *
 * For the most common use case, sequential reads, the cache entry
 * includes the last chunk index along with its absolute offset,
 * so both the binary search and the linear scan can be completely
 * avoided when a subsequent read request starts at the end of the
 * previous read request.
 *
 * The `offset_cache_updater_max_inline_offsets` constant defines
 * how many (offset, index) pairs can be stored "inline" (i.e.
 * without requiring any memory allocations) by the cache updater
 * while performing the read request. 4 is plenty.
 *
 * Last but not least, `offset_cache_size` defines the number of
 * inodes that can live in the cache simultaneously. The number
 * of cached offsets for each inode is not limited.
 */
constexpr size_t const offset_cache_chunk_index_interval = 256;
constexpr size_t const offset_cache_updater_max_inline_offsets = 4;
constexpr size_t const offset_cache_size = 64;
constexpr size_t const readahead_cache_size = 64;
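// Illustrative example, assuming the defaults above: a file consisting of
// 100'000 chunks stores an absolute offset every 256 chunks, i.e. roughly
// 390 (offset, index) pairs. A random read then binary-searches those ~390
// entries and linearly scans at most 255 chunks, instead of scanning up to
// 100'000 chunks from the start of the file. A sequential read that resumes
// exactly where the previous request ended hits the cached (last index,
// offset) pair and skips both the search and the scan entirely.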
} // namespace

template <typename LoggerPolicy>
class inode_reader_ final : public inode_reader_v2::impl {
 public:
  inode_reader_(logger& lgr, block_cache&& bc,
                inode_reader_options const& opts,
                std::shared_ptr<performance_monitor const> perfmon
                [[maybe_unused]])
      : cache_(std::move(bc))
      , opts_{opts}
      , LOG_PROXY_INIT(lgr)
      // clang-format off
      PERFMON_CLS_PROXY_INIT(perfmon, "inode_reader_v2")
      PERFMON_CLS_TIMER_INIT(read, "offset", "size")
      PERFMON_CLS_TIMER_INIT(read_string, "offset", "size")
      PERFMON_CLS_TIMER_INIT(readv_iovec, "offset", "size")
      PERFMON_CLS_TIMER_INIT(readv_future, "offset", "size")
      // clang-format on
      , offset_cache_{offset_cache_size}
      , readahead_cache_{readahead_cache_size}
      , iovec_sizes_(1, 0, 256) {}

  ~inode_reader_() override {
    std::lock_guard lock(iovec_sizes_mutex_);
    if (iovec_sizes_.computeTotalCount() > 0) {
      LOG_VERBOSE << "iovec size p90: "
                  << iovec_sizes_.getPercentileEstimate(0.9);
      LOG_VERBOSE << "iovec size p95: "
                  << iovec_sizes_.getPercentileEstimate(0.95);
      LOG_VERBOSE << "iovec size p99: "
                  << iovec_sizes_.getPercentileEstimate(0.99);
    }
  }

  std::string read_string(uint32_t inode, size_t size, file_off_t offset,
                          chunk_range chunks,
                          std::error_code& ec) const override;

  size_t read(char* buf, uint32_t inode, size_t size, file_off_t offset,
              chunk_range chunks, std::error_code& ec) const override;

  size_t readv(iovec_read_buf& buf, uint32_t inode, size_t size,
               file_off_t offset, chunk_range chunks,
               std::error_code& ec) const override;

  std::vector<std::future<block_range>>
  readv(uint32_t inode, size_t size, file_off_t offset, chunk_range chunks,
        std::error_code& ec) const override;

  void dump(std::ostream& os, const std::string& indent,
            chunk_range chunks) const override;

  void set_num_workers(size_t num) override { cache_.set_num_workers(num); }

  void set_cache_tidy_config(cache_tidy_config const& cfg) override {
    cache_.set_tidy_config(cfg);
  }

  size_t num_blocks() const override { return cache_.block_count(); }

 private:
  using offset_cache_type =
      basic_offset_cache<uint32_t, file_off_t, size_t,
                         offset_cache_chunk_index_interval,
                         offset_cache_updater_max_inline_offsets>;

  using readahead_cache_type = folly::EvictingCacheMap<uint32_t, file_off_t>;

  std::vector<std::future<block_range>>
  read_internal(uint32_t inode, size_t size, file_off_t offset,
                chunk_range chunks, std::error_code& ec) const;

  template <typename StoreFunc>
  size_t read_internal(uint32_t inode, size_t size, file_off_t read_offset,
                       chunk_range chunks, std::error_code& ec,
                       const StoreFunc& store) const;

  void do_readahead(uint32_t inode, chunk_range::iterator it,
                    chunk_range::iterator end, file_off_t read_offset,
                    size_t size, file_off_t it_offset) const;

  block_cache cache_;
  inode_reader_options const opts_;
  LOG_PROXY_DECL(LoggerPolicy);
  PERFMON_CLS_PROXY_DECL
  PERFMON_CLS_TIMER_DECL(read)
  PERFMON_CLS_TIMER_DECL(read_string)
  PERFMON_CLS_TIMER_DECL(readv_iovec)
  PERFMON_CLS_TIMER_DECL(readv_future)
  mutable offset_cache_type offset_cache_;
  mutable std::mutex readahead_cache_mutex_;
  mutable readahead_cache_type readahead_cache_;
  mutable std::mutex iovec_sizes_mutex_;
  mutable folly::Histogram<size_t> iovec_sizes_;
};

template <typename LoggerPolicy>
void inode_reader_<LoggerPolicy>::dump(std::ostream& os,
                                       const std::string& indent,
                                       chunk_range chunks) const {
  for (auto const& [index, chunk] : ranges::views::enumerate(chunks)) {
    os << indent << " [" << index << "] -> (block=" << chunk.block()
       << ", offset=" << chunk.offset() << ", size=" << chunk.size() << ")\n";
  }
}
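// Warm the block cache with chunks beyond the current read position. The
// per-inode readahead cache tracks how far previous requests have already
// read ahead, so overlapping sequential reads don't re-request blocks that
// an earlier readahead pass has covered.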
template <typename LoggerPolicy>
void inode_reader_<LoggerPolicy>::do_readahead(
    uint32_t inode, chunk_range::iterator it, chunk_range::iterator end,
    file_off_t const read_offset, size_t const size,
    file_off_t it_offset) const {
  LOG_TRACE << "readahead (" << inode << "): " << read_offset << "/" << size
            << "/" << it_offset;

  file_off_t readahead_pos{0};
  file_off_t const current_offset = read_offset + size;
  file_off_t const readahead_until = current_offset + opts_.readahead;

  {
    std::lock_guard lock(readahead_cache_mutex_);

    if (read_offset > 0) {
      if (auto i = readahead_cache_.find(inode);
          i != readahead_cache_.end()) {
        readahead_pos = i->second;
      }

      if (readahead_until <= readahead_pos) {
        return;
      }
    }

    readahead_cache_.set(inode, readahead_until);
  }

  while (it != end) {
    if (it_offset + it->size() >= readahead_pos) {
      cache_.get(it->block(), it->offset(), it->size());
    }

    it_offset += it->size();

    if (it_offset >= readahead_until) {
      break;
    }

    ++it;
  }
}

template <typename LoggerPolicy>
std::vector<std::future<block_range>>
inode_reader_<LoggerPolicy>::read_internal(uint32_t inode, size_t const size,
                                           file_off_t const read_offset,
                                           chunk_range chunks,
                                           std::error_code& ec) const {
  std::vector<std::future<block_range>> ranges;

  auto offset = read_offset;

  if (offset < 0) {
    // This is exactly how lseek(2) behaves when seeking before the start of
    // the file.
    ec = std::make_error_code(std::errc::invalid_argument);
    return ranges;
  }

  // request ranges from block cache
  if (size == 0 || chunks.empty()) {
    ec.clear();
    return ranges;
  }

  auto it = chunks.begin();
  auto end = chunks.end();

  size_t it_index = 0;
  file_off_t it_offset = 0;

  offset_cache_type::value_type oc_ent;
  offset_cache_type::updater oc_upd;

  // Check if we can find this inode in the offset cache
  if (offset > 0 &&
      chunks.size() >= offset_cache_type::chunk_index_interval) {
    oc_ent = offset_cache_.find(inode, chunks.size());

    std::tie(it_index, it_offset) = oc_ent->find(offset, oc_upd);

    std::advance(it, it_index);
    offset -= it_offset;
  }

  // search for the first chunk that contains data from this request
  while (it < end) {
    size_t chunksize = it->size();

    if (static_cast<size_t>(offset) < chunksize) {
      break;
    }

    offset -= chunksize;
    it_offset += chunksize;
    ++it;
    oc_upd.add_offset(++it_index, it_offset);
  }

  if (it == end) {
    // Offset behind end of file. This is exactly how lseek(2) and read(2)
    // behave when seeking behind the end of the file and reading past EOF.
    ec.clear();
    return ranges;
  }

  size_t num_read = 0;

  while (it != end) {
    size_t const chunksize = it->size();
    size_t const copyoff = it->offset() + offset;
    size_t copysize = chunksize - offset;

    DWARFS_CHECK(copysize > 0, "unexpected zero-sized chunk");

    if (num_read + copysize > size) {
      copysize = size - num_read;
    }

    ranges.emplace_back(cache_.get(it->block(), copyoff, copysize));

    num_read += copysize;

    if (num_read == size) {
      if (oc_ent) {
        oc_ent->update(oc_upd, it_index, it_offset, chunksize);
        offset_cache_.set(inode, std::move(oc_ent));
      }

      if (opts_.readahead > 0) {
        do_readahead(inode, it, end, read_offset, size, it_offset);
      }

      break;
    }

    offset = 0;
    it_offset += chunksize;
    ++it;
    oc_upd.add_offset(++it_index, it_offset);
  }

  return ranges;
}
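// Variant of read_internal() that resolves the block futures and hands each
// completed block_range to the `store` callback, which decides how the data
// is consumed; the callers below use it to either memcpy into a contiguous
// buffer (read) or to collect iovec entries (readv).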
template <typename LoggerPolicy>
template <typename StoreFunc>
size_t inode_reader_<LoggerPolicy>::read_internal(
    uint32_t inode, size_t size, file_off_t offset, chunk_range chunks,
    std::error_code& ec, const StoreFunc& store) const {
  auto ranges = read_internal(inode, size, offset, chunks, ec);

  if (ec) {
    return 0;
  }

  try {
    // now fill the buffer
    size_t num_read = 0;
    for (auto& r : ranges) {
      auto br = r.get();
      store(num_read, br);
      num_read += br.size();
    }
    return num_read;
  } catch (...) {
    LOG_ERROR << exception_str(std::current_exception());
  }

  ec = std::make_error_code(std::errc::io_error);
  return 0;
}

template <typename LoggerPolicy>
std::string
inode_reader_<LoggerPolicy>::read_string(uint32_t inode, size_t size,
                                         file_off_t offset,
                                         chunk_range chunks,
                                         std::error_code& ec) const {
  PERFMON_CLS_SCOPED_SECTION(read_string)
  PERFMON_SET_CONTEXT(static_cast<size_t>(offset), size);

  auto ranges = read_internal(inode, size, offset, chunks, ec);

  std::string res;

  if (!ec) {
    try {
      std::vector<block_range> brs;
      brs.reserve(ranges.size());
      size_t total_size{0};
      for (auto& r : ranges) {
        auto br = r.get();
        total_size += br.size();
        brs.emplace_back(std::move(br));
      }
      res.reserve(total_size);
      for (auto const& br : brs) {
        res.append(reinterpret_cast<char const*>(br.data()), br.size());
      }
    } catch (...) {
      LOG_ERROR << exception_str(std::current_exception());
      ec = std::make_error_code(std::errc::io_error);
    }
  }

  return res;
}

template <typename LoggerPolicy>
size_t inode_reader_<LoggerPolicy>::read(char* buf, uint32_t inode,
                                         size_t size, file_off_t offset,
                                         chunk_range chunks,
                                         std::error_code& ec) const {
  PERFMON_CLS_SCOPED_SECTION(read)
  PERFMON_SET_CONTEXT(static_cast<size_t>(offset), size);

  return read_internal(inode, size, offset, chunks, ec,
                       [&](size_t num_read, const block_range& br) {
                         ::memcpy(buf + num_read, br.data(), br.size());
                       });
}

template <typename LoggerPolicy>
std::vector<std::future<block_range>>
inode_reader_<LoggerPolicy>::readv(uint32_t inode, size_t const size,
                                   file_off_t offset, chunk_range chunks,
                                   std::error_code& ec) const {
  PERFMON_CLS_SCOPED_SECTION(readv_future)
  PERFMON_SET_CONTEXT(static_cast<size_t>(offset), size);

  return read_internal(inode, size, offset, chunks, ec);
}

template <typename LoggerPolicy>
size_t inode_reader_<LoggerPolicy>::readv(iovec_read_buf& buf,
                                          uint32_t inode, size_t size,
                                          file_off_t offset,
                                          chunk_range chunks,
                                          std::error_code& ec) const {
  PERFMON_CLS_SCOPED_SECTION(readv_iovec)
  PERFMON_SET_CONTEXT(static_cast<size_t>(offset), size);

  auto rv = read_internal(inode, size, offset, chunks, ec,
                          [&](size_t, const block_range& br) {
                            auto& iov = buf.buf.emplace_back();
                            iov.iov_base = const_cast<uint8_t*>(br.data());
                            iov.iov_len = br.size();
                            buf.ranges.emplace_back(br);
                          });

  {
    std::lock_guard lock(iovec_sizes_mutex_);
    iovec_sizes_.addValue(buf.buf.size());
  }

  return rv;
}

inode_reader_v2::inode_reader_v2(
    logger& lgr, block_cache&& bc, inode_reader_options const& opts,
    std::shared_ptr<performance_monitor const> perfmon)
    : impl_(make_unique_logging_object<impl, inode_reader_, logger_policies>(
          lgr, std::move(bc), opts, std::move(perfmon))) {}

} // namespace dwarfs::reader::internal