From 8271bb15dffc3639385001ca487da83610719079 Mon Sep 17 00:00:00 2001 From: sajibreadd Date: Thu, 24 Apr 2025 15:23:40 +0200 Subject: [PATCH] 1. remote link damage identification with reverse parent scrubbing - Remote link damage identification becomes tricky if the inode is cached. - Try to open the link normally; if opening fails, mark the link as damaged. - If it opens successfully, damage may still exist but be masked because the inode is cached. In that case take the opened inode and scrub its ancestors recursively. If any ancestor is damaged, the remote link is marked as damaged. - While scrubbing, flags are maintained in the inode, e.g. whether the scrub is backward, forward, or both. - This backward scrubbing only works in a read-only scrub, i.e. without the repair flag and with the mds_scrub_hard_link ceph option enabled. - A new damage type is introduced, which makes it possible to identify multiple links pointing to the same inode; this was not possible previously. 2. mds_damage_log_to_file and mds_damage_log_file are used to write damage entries persistently to a file, since it is not safe to keep them only in memory. 3. A missing dirfrag can make scrub recur repeatedly, so a `from_scrub` flag is used to identify when a dirfrag fetch comes from the scrub path. Fixes: https://tracker.ceph.com/issues/68611 Signed-off-by: Md Mahamudur Rahaman Sajib --- src/common/options/mds.yaml.in | 24 +++ src/mds/CDir.cc | 25 ++-- src/mds/CDir.h | 12 +- src/mds/CInode.cc | 28 +++- src/mds/CInode.h | 11 ++ src/mds/DamageTable.cc | 189 +++++++++++++++++++++--- src/mds/DamageTable.h | 54 ++++++- src/mds/MDCache.cc | 8 +- src/mds/MDSDaemon.cc | 3 + src/mds/MDSRank.cc | 74 +++++----- src/mds/ScrubHeader.h | 14 ++ src/mds/ScrubStack.cc | 258 ++++++++++++++++++++++++++++----- src/mds/ScrubStack.h | 16 +- 13 files changed, 591 insertions(+), 125 deletions(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 6234b96cdc7b9..b43672218a685 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1555,3 +1555,27 @@ options: - mds flags: - runtime +- name: mds_damage_log_to_file + type: bool + level: advanced + desc: send mds damage lines to a file + fmt_desc: Determines whether damage entries should also be written to a file. + default: false + see_also: + - log_file + with_legacy: true +- name: mds_damage_log_file + type: str + level: advanced + desc: path to log file where damage will be written + fmt_desc: The location of the log file where MDS damage entries will be written.
+ daemon_default: /var/log/ceph/$cluster-$name-damages.log + with_legacy: true +- name: mds_scrub_hard_link + type: bool + level: advanced + desc: force scrubbing hard link + default: false + services: + - mds + with_legacy: true \ No newline at end of file diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a8aaf11c0512c..00d03af345829 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1585,8 +1585,8 @@ void CDir::fetch(std::string_view dname, snapid_t last, mdcache->mds->balancer->hit_dir(this, META_POP_FETCH); } -void CDir::fetch_keys(const std::vector& keys, MDSContext *c) -{ +void CDir::fetch_keys(const std::vector &keys, MDSContext *c, + bool from_scrub) { dout(10) << __func__ << " " << keys.size() << " keys on " << *this << dendl; ceph_assert(is_auth()); ceph_assert(!is_complete()); @@ -1643,7 +1643,7 @@ void CDir::fetch_keys(const std::vector& keys, MDSContext *c) } auth_pin(this); - _omap_fetch(&str_keys, c); + _omap_fetch(&str_keys, c, from_scrub); if (mdcache->mds->logger) mdcache->mds->logger->inc(l_mds_dir_fetch_keys); @@ -1698,6 +1698,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { map omap; bufferlist btbl; int ret1, ret2, ret3; + bool from_scrub = false; C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) : CDirIOContext(d), fin(f), @@ -1719,7 +1720,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { return; } - dir->_omap_fetched(hdrbl, omap, complete, keys, r); + dir->_omap_fetched(hdrbl, omap, complete, keys, r, from_scrub); if (fin) fin->complete(r); } @@ -1728,8 +1729,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { } }; -void CDir::_omap_fetch(std::set *keys, MDSContext *c) -{ +void CDir::_omap_fetch(std::set *keys, MDSContext *c, bool from_scrub) { C_IO_Dir_OMAP_Fetched *fin = new C_IO_Dir_OMAP_Fetched(this, c); object_t oid = get_ondisk_object(); object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool()); @@ -1737,6 +1737,7 @@ void CDir::_omap_fetch(std::set *keys, MDSContext *c) rd.omap_get_header(&fin->hdrbl, &fin->ret1); if (keys) { fin->complete = false; + fin->from_scrub = from_scrub; fin->keys.swap(*keys); rd.omap_get_vals_by_keys(fin->keys, &fin->omap, &fin->ret2); } else { @@ -1989,9 +1990,9 @@ CDentry *CDir::_load_dentry( return dn; } -void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, - bool complete, const std::set& keys, int r) -{ +void CDir::_omap_fetched(bufferlist &hdrbl, map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub) { LogChannelRef clog = mdcache->mds->clog; dout(10) << "_fetched header " << hdrbl.length() << " bytes " << omap.size() << " keys for " << *this << dendl; @@ -2006,7 +2007,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, clog->error() << "dir " << dirfrag() << " object missing on disk; some " "files may be lost (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } @@ -2020,14 +2021,14 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, << ": " << err.what() << dendl; clog->warn() << "Corrupt fnode header in " << dirfrag() << ": " << err.what() << " (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } if (!p.end()) { clog->warn() << "header buffer of dir " << dirfrag() << " has " << hdrbl.length() - p.get_off() << " extra bytes (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } } diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7cc4dc7ffcf83..2c124350d0f9a 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -483,7 +483,8 @@ class CDir : public 
MDSCacheObject, public Counter { void fetch(MDSContext *c, bool ignore_authpinnability=false) { fetch("", CEPH_NOSNAP, c, ignore_authpinnability); } - void fetch_keys(const std::vector& keys, MDSContext *c); + void fetch_keys(const std::vector &keys, MDSContext *c, + bool from_scrub = false); #if 0 // unused? void wait_for_commit(Context *c, version_t v=0); @@ -653,7 +654,8 @@ class CDir : public MDSCacheObject, public Counter { friend class C_IO_Dir_Committed; friend class C_IO_Dir_Commit_Ops; - void _omap_fetch(std::set *keys, MDSContext *fin=nullptr); + void _omap_fetch(std::set *keys, MDSContext *fin = nullptr, + bool from_scrub = false); void _omap_fetch_more(version_t omap_version, bufferlist& hdrbl, std::map& omap, MDSContext *fin); CDentry *_load_dentry( @@ -671,8 +673,10 @@ class CDir : public MDSCacheObject, public Counter { */ void go_bad(bool complete); - void _omap_fetched(ceph::buffer::list& hdrbl, std::map& omap, - bool complete, const std::set& keys, int r); + void _omap_fetched(ceph::buffer::list &hdrbl, + std::map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub = false); // -- commit -- void _commit(version_t want, int op_prio); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 71b6081be7de2..6ff81082bb99a 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -5198,7 +5198,6 @@ void CInode::scrub_info_create() const { dout(25) << __func__ << dendl; ceph_assert(!scrub_infop); - // break out of const-land to set up implicit initial state CInode *me = const_cast(this); const auto& pi = me->get_projected_inode(); @@ -5231,23 +5230,46 @@ void CInode::scrub_initialize(ScrubHeaderRef& header) // right now we don't handle remote inodes } +void CInode::set_forward_scrub(bool forward_scrub) { + scrub_infop->forward_scrub = forward_scrub; +} + +void CInode::scrub_add_remote_link( + std::vector> &&remote_links) { + + for (auto& [remote_link_path, remote_ino]: remote_links) { + scrub_infop->remote_links.emplace_back(std::move(remote_link_path), + remote_ino); + } +} + +void CInode::scrub_reset_remote_links() { + scrub_infop->remote_links.clear(); +} + +std::vector> && +CInode::scrub_move_remote_links() { + return std::move(scrub_infop->remote_links); +} + void CInode::scrub_aborted() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->scrub_in_progress = false; scrub_infop->header->dec_num_pending(); + scrub_infop->remote_links.clear(); scrub_maybe_delete_info(); } void CInode::scrub_finished() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->last_scrub_version = get_version(); scrub_infop->last_scrub_stamp = ceph_clock_now(); scrub_infop->last_scrub_dirty = true; scrub_infop->scrub_in_progress = false; + scrub_infop->remote_links.clear(); + scrub_infop->forward_scrub = true; scrub_infop->header->dec_num_pending(); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 6f965bffa8ea9..71d57a452f8e7 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -305,6 +305,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> remote_links; + bool forward_scrub = true; fragset_t queued_frags; @@ -458,6 +460,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> &&remote_links); + + void scrub_reset_remote_links(); + + std::vector> &&scrub_move_remote_links(); + + void set_forward_scrub(bool forward_scrub); + fragset_t& scrub_queued_frags() { ceph_assert(scrub_infop); return scrub_infop->queued_frags; diff --git a/src/mds/DamageTable.cc 
b/src/mds/DamageTable.cc index 2079d23333a83..f603116241bc0 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -29,6 +29,12 @@ namespace { * Record damage to a particular dirfrag, implicitly affecting * any dentries within it. */ +inline std::ostream& operator<<(std::ostream& os, const DamageEntry& entry) +{ + entry.print(os); + return os; +} + class DirFragDamage : public DamageEntry { public: @@ -123,6 +129,28 @@ class BacktraceDamage : public DamageEntry f->close_section(); } }; + +class RemoteLinkDamage : public DamageEntry { +public: + inodeno_t ino; + std::string head_path; + RemoteLinkDamage(inodeno_t ino_, const std::string &head_path_ = "") + : ino(ino_), head_path(head_path_) {} + + damage_entry_type_t get_type() const override { + return DAMAGE_ENTRY_REMOTE_LINK; + } + + void dump(Formatter *f) const override { + f->open_object_section("remote_link_damage"); + f->dump_string("damage_type", "remote_link"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_string("path", path); + f->dump_string("head_path", head_path); + f->close_section(); + } +}; } DamageEntry::~DamageEntry() @@ -132,28 +160,34 @@ bool DamageTable::notify_dentry( inodeno_t ino, frag_t frag, snapid_t snap_id, std::string_view dname, std::string_view path) { - if (oversized()) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } // Special cases: damage to these dirfrags is considered fatal to // the MDS rank that owns them. - if ( - (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) - || - (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank) - ) { + if ((MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) || + (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank)) { derr << "Damage to dentries in fragment " << frag << " of ino " << ino << "is fatal because it is a system directory for this rank" << dendl; return true; } - auto& df_dentries = dentries[DirFragIdent(ino, frag)]; - if (auto [it, inserted] = df_dentries.try_emplace(DentryIdent(dname, snap_id)); inserted) { - auto entry = std::make_shared(ino, frag, dname, snap_id); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino, frag, dname, snap_id); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto &df_dentries = dentries[DirFragIdent(ino, frag)]; + if (auto [it, inserted] = + df_dentries.try_emplace(DentryIdent(dname, snap_id)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -171,15 +205,24 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, return true; } - if (oversized()) { + bool over_sized = oversized(); + + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); inserted) { - DamageEntryRef entry = std::make_shared(ino, frag); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + DamageEntryRef entry = std::make_shared(ino, frag); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -187,15 +230,47 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) { - if (oversized()) { + 
bool over_sized = oversized(); + if (!log_to_file && over_sized) { + return true; + } + + auto entry = std::make_shared(ino); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } + } + + return false; +} + +bool DamageTable::notify_remote_link_damaged(inodeno_t ino, + const std::string &path, + const std::string &head_path) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { - auto entry = std::make_shared(ino); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino, head_path); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto& df_remote_links = remote_links[ino]; + if (auto [it, inserted] = df_remote_links.try_emplace(path); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -263,6 +338,10 @@ bool DamageTable::is_remote_damaged( return remotes.count(ino) > 0; } +bool DamageTable::is_remote_link_damaged(const inodeno_t ino) const { + return remote_links.count(ino) > 0; +} + void DamageTable::dump(Formatter *f) const { f->open_array_section("damage_table"); @@ -293,6 +372,19 @@ void DamageTable::erase(damage_entry_id_t damage_id) } else if (type == DAMAGE_ENTRY_BACKTRACE) { auto backtrace_entry = std::static_pointer_cast(entry); remotes.erase(backtrace_entry->ino); + } else if (type == DAMAGE_ENTRY_REMOTE_LINK) { + auto remote_link_entry = std::static_pointer_cast(entry); + auto df_remote_link_it = remote_links.find(remote_link_entry->ino); + if (df_remote_link_it != remote_links.end()) { + auto damage_it = df_remote_link_it->second.find(entry->path); + if (damage_it != df_remote_link_it->second.end()) { + df_remote_link_it->second.erase(entry->path); + } + if(df_remote_link_it->second.empty()) { + remote_links.erase(df_remote_link_it); + } + } + remote_links.erase(remote_link_entry->ino); } else { derr << "Invalid type " << type << dendl; ceph_abort(); @@ -301,3 +393,52 @@ void DamageTable::erase(damage_entry_id_t damage_id) by_id.erase(by_id_entry); } +bool DamageTable::open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path) { + namespace fs = std::filesystem; + + // Reset the stream in case it was previously used + if (fout.is_open()) { + fout.close(); + } + fout.clear(); // clear any error flags + + const fs::path dir = file_path.parent_path(); + + // Create parent directories if needed + if (!dir.empty()) { + std::error_code ec; + + if (!fs::exists(dir, ec)) { + if (ec) { + dout(0) << "error checking existence of damage dir: " << dir << " (" + << ec.message() << ")" << dendl; + return false; + } + + if (!fs::create_directories(dir, ec)) { + dout(0) << "failed to create directories for damage file: " << dir + << " (" << ec.message() << ")" << dendl; + return false; + } + } + } + + // Open in append mode so we keep previous log contents + fout.open(file_path, std::ios::out | std::ios::app); + + if (!fout.is_open()) { + dout(0) << "failed to open damage file: " << file_path << dendl; + return false; + } + + return true; +} + +void DamageTable::clear() { + dirfrags.clear(); + dentries.clear(); + remotes.clear(); + remote_links.clear(); + by_id.clear(); +} diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h index 
a1b96fe221864..35dc7d89596a7 100644 --- a/src/mds/DamageTable.h +++ b/src/mds/DamageTable.h @@ -16,8 +16,12 @@ #ifndef DAMAGE_TABLE_H_ #define DAMAGE_TABLE_H_ +#include +#include +#include #include +#include "common/Formatter.h" #include "mdstypes.h" #include "include/random.h" @@ -30,7 +34,8 @@ typedef enum { DAMAGE_ENTRY_DIRFRAG, DAMAGE_ENTRY_DENTRY, - DAMAGE_ENTRY_BACKTRACE + DAMAGE_ENTRY_BACKTRACE, + DAMAGE_ENTRY_REMOTE_LINK } damage_entry_type_t; @@ -47,6 +52,11 @@ class DamageEntry virtual damage_entry_type_t get_type() const = 0; virtual void dump(Formatter *f) const = 0; + void print(std::ostream &os) const { + JSONFormatter jf; + dump(&jf); + jf.flush(os); + } damage_entry_id_t id; utime_t reported_at; @@ -121,10 +131,13 @@ class DentryIdent class DamageTable { public: - explicit DamageTable(const mds_rank_t rank_) - : rank(rank_) - { + explicit DamageTable(const mds_rank_t rank_, bool log_to_file_, + const std::string &log_file_) + : rank(rank_), log_to_file(log_to_file_), log_file(log_file_) { ceph_assert(rank_ != MDS_RANK_NONE); + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } } /** @@ -156,6 +169,9 @@ class DamageTable */ bool notify_remote_damaged(inodeno_t ino, std::string_view path); + bool notify_remote_link_damaged(inodeno_t ino, const std::string &path, + const std::string &head_path = ""); + void remove_dentry_damage_entry(CDir *dir); void remove_dirfrag_damage_entry(CDir *dir); @@ -171,10 +187,33 @@ class DamageTable bool is_remote_damaged(const inodeno_t ino) const; + bool is_remote_link_damaged(const inodeno_t ino) const; + void dump(Formatter *f) const; void erase(damage_entry_id_t damage_id); + void set_log_to_file(bool _log_to_file) { + log_to_file = _log_to_file; + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } + } + + void set_log_file(const std::string &_log_file) { + log_file = _log_file; + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } + } + + void clear(); + + private: + bool open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path); + std::ofstream fout; + protected: // I need to know my MDS rank so that I can check if // metadata items are part of my mydir. @@ -194,10 +233,17 @@ class DamageTable // (i.e. have probably/possibly missing backtraces) std::map remotes; + // Map of all links which could not be resolved + // (i.e. have probably/possibly missing primary inodes) + std::map> remote_links; + // All damage, by ID. This is a secondary index // to the dirfrag, dentry, remote maps. It exists // to enable external tools to unambiguously operate // on particular entries. 
std::map by_id; + bool log_to_file = false; + std::string log_file = ""; + bool log_file_opened = false; }; #endif // DAMAGE_TABLE_H_ diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5480e6dcd5efe..6675a28d96908 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8493,6 +8493,12 @@ int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { dout(4) << "traverse: remote dentry points to damaged ino " << *dn << dendl; + std::string path; + dn->get_dir()->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + mds->damage_table.notify_remote_link_damaged(dnl->get_remote_ino(), + path); return -CEPHFS_EIO; } open_remote_dentry(dn, true, cf.build(), @@ -8817,7 +8823,7 @@ void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext path += dn->get_name(); } - bool fatal = mds->damage_table.notify_remote_damaged(ino, path); + bool fatal = mds->damage_table.notify_remote_link_damaged(ino, path); if (fatal) { mds->damaged(); ceph_abort(); // unreachable, damaged() respawns us diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index e97fd2cf83f8f..67718aae08947 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -402,6 +402,9 @@ void MDSDaemon::set_up_admin_socket() asok_hook, "Remove a damage table entry"); ceph_assert(r == 0); + r = admin_socket->register_command("damage clear", asok_hook, + "clear the damage list"); + ceph_assert(r == 0); r = admin_socket->register_command("osdmap barrier name=target_epoch,type=CephInt", asok_hook, "Wait until the MDS has this OSD map epoch"); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index aa6a8c162f4f5..1ea06d0f80400 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -482,43 +482,30 @@ class C_Drop_Cache : public MDSInternalContext { } }; -MDSRank::MDSRank( - mds_rank_t whoami_, - ceph::fair_mutex &mds_lock_, - LogChannelRef &clog_, - CommonSafeTimer &timer_, - Beacon &beacon_, - std::unique_ptr& mdsmap_, - Messenger *msgr, - MonClient *monc_, - MgrClient *mgrc, - Context *respawn_hook_, - Context *suicide_hook_, - boost::asio::io_context& ioc) : - cct(msgr->cct), mds_lock(mds_lock_), clog(clog_), - timer(timer_), mdsmap(mdsmap_), - objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)), - damage_table(whoami_), sessionmap(this), - op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker, - g_conf()->osd_num_op_tracker_shard), - progress_thread(this), whoami(whoami_), - purge_queue(g_ceph_context, whoami_, - mdsmap_->get_metadata_pool(), objecter, - new LambdaContext([this](int r) { - std::lock_guard l(mds_lock); - handle_write_error(r); - } - ) - ), - metrics_handler(cct, this), - beacon(beacon_), - messenger(msgr), monc(monc_), mgrc(mgrc), - respawn_hook(respawn_hook_), - suicide_hook(suicide_hook_), - inject_journal_corrupt_dentry_first(g_conf().get_val("mds_inject_journal_corrupt_dentry_first")), - starttime(mono_clock::now()), - ioc(ioc) -{ +MDSRank::MDSRank(mds_rank_t whoami_, ceph::fair_mutex &mds_lock_, + LogChannelRef &clog_, + CommonSafeTimer &timer_, Beacon &beacon_, + std::unique_ptr &mdsmap_, Messenger *msgr, + MonClient *monc_, MgrClient *mgrc, Context *respawn_hook_, + Context *suicide_hook_, boost::asio::io_context &ioc) + : cct(msgr->cct), mds_lock(mds_lock_), clog(clog_), timer(timer_), + mdsmap(mdsmap_), objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)), + damage_table(whoami_, g_conf().get_val("mds_damage_log_to_file"), + 
g_conf().get_val("mds_damage_log_file")), + sessionmap(this), + op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker, + g_conf()->osd_num_op_tracker_shard), + progress_thread(this), whoami(whoami_), + purge_queue(g_ceph_context, whoami_, mdsmap_->get_metadata_pool(), + objecter, new LambdaContext([this](int r) { + std::lock_guard l(mds_lock); + handle_write_error(r); + })), + metrics_handler(cct, this), beacon(beacon_), messenger(msgr), monc(monc_), + mgrc(mgrc), respawn_hook(respawn_hook_), suicide_hook(suicide_hook_), + inject_journal_corrupt_dentry_first( + g_conf().get_val("mds_inject_journal_corrupt_dentry_first")), + starttime(mono_clock::now()), ioc(ioc) { hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self()); // The metadata pool won't change in the whole life time @@ -2929,6 +2916,9 @@ void MDSRankDispatcher::handle_asok_command( goto out; } damage_table.erase(id); + } else if (command == "damage clear") { + std::lock_guard l(mds_lock); + damage_table.clear(); } else { r = -CEPHFS_ENOSYS; } @@ -3866,6 +3856,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_inject_rename_corrupt_dentry_first", "mds_inject_journal_corrupt_dentry_first", "mds_session_metadata_threshold", + "mds_damage_log_to_file", + "mds_damage_log_file", NULL }; return KEYS; @@ -3936,6 +3928,14 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s if (changed.count("mds_inject_journal_corrupt_dentry_first")) { inject_journal_corrupt_dentry_first = g_conf().get_val("mds_inject_journal_corrupt_dentry_first"); } + if (changed.count("mds_damage_log_to_file")) { + damage_table.set_log_to_file( + g_conf().get_val("mds_damage_log_to_file")); + } + if (changed.count("mds_damage_log_file")) { + damage_table.set_log_file( + g_conf().get_val("mds_damage_log_file")); + } finisher->queue(new LambdaContext([this, changed](int) { std::scoped_lock lock(mds_lock); diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h index a5d35f61ce428..0a27ab5ee4871 100644 --- a/src/mds/ScrubHeader.h +++ b/src/mds/ScrubHeader.h @@ -64,6 +64,18 @@ class ScrubHeader { } unsigned get_num_pending() const { return num_pending; } + void inc_scrubbed_inode_count() { ++scrubbed_inode_count; } + + uint64_t get_scrubbed_inode_count() const { return scrubbed_inode_count; } + + void inc_scrubbed_remote_link_count(uint64_t val = 1) { + scrubbed_remote_link_count += val; + } + + uint64_t get_scrubbed_remote_link_count() const { + return scrubbed_remote_link_count; + } + protected: const std::string tag; bool is_tag_internal; @@ -76,6 +88,8 @@ class ScrubHeader { bool repaired = false; // May be set during scrub if repairs happened unsigned epoch_last_forwarded = 0; unsigned num_pending = 0; + uint64_t scrubbed_inode_count = 0; + uint64_t scrubbed_remote_link_count = 0; }; typedef std::shared_ptr ScrubHeaderRef; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 742c464f4d37a..d8d2260d879e0 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -59,12 +59,18 @@ void ScrubStack::dequeue(MDSCacheObject *obj) stack_size--; } -int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) -{ +int ScrubStack::_enqueue( + MDSCacheObject *obj, ScrubHeaderRef &header, bool top, + std::vector> &&remote_links) { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); if (CInode *in = dynamic_cast(obj)) { if (in->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *in << "}" << ", already in scrubbing" << dendl; + if 
(!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + } else { + in->set_forward_scrub(true); + } return -CEPHFS_EBUSY; } if(in->state_test(CInode::STATE_PURGING)) { @@ -75,6 +81,11 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl; in->scrub_initialize(header); + if (!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + in->set_forward_scrub(false); + } + } else if (CDir *dir = dynamic_cast(obj)) { if (dir->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl; @@ -103,7 +114,7 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) scrub_stack.push_front(&obj->item_scrub); else scrub_stack.push_back(&obj->item_scrub); - return 0; + return 1; } int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) @@ -209,49 +220,53 @@ void ScrubStack::kick_off_scrubs() if (scrubs_in_progress == 0) { set_state(STATE_IDLE); } - return; } assert(state == STATE_RUNNING || state == STATE_IDLE); set_state(STATE_RUNNING); - if (CInode *in = dynamic_cast(*it)) { + if (CInode *in = dynamic_cast(*it)) { dout(20) << __func__ << " examining " << *in << dendl; ++it; if (!validate_inode_auth(in)) - continue; + continue; if (!in->is_dir()) { - // it's a regular file, symlink, or hard link - dequeue(in); // we only touch it this once, so remove from stack - - scrub_file_inode(in); + // it's a regular file, symlink, or hard link + dequeue(in); // we only touch it this once, so remove from stack + scrub_file_inode(in); + } else if (in->scrub_info()->forward_scrub) { + bool added_children = false; + bool done = false; // it's done, so pop it off the stack + scrub_dir_inode(in, &added_children, &done); + if (done) { + dout(20) << __func__ << " dir inode, done" << dendl; + in->set_forward_scrub(false); + dequeue(in); + } + if (added_children) { + // dirfrags were queued at top of stack + it = scrub_stack.begin(); + } + } else if (!in->scrub_info()->remote_links.empty()){ + dequeue(in); + scrub_dir_inode_final(in); } else { - bool added_children = false; - bool done = false; // it's done, so pop it off the stack - scrub_dir_inode(in, &added_children, &done); - if (done) { - dout(20) << __func__ << " dir inode, done" << dendl; - dequeue(in); - } - if (added_children) { - // dirfrags were queued at top of stack - it = scrub_stack.begin(); - } + dequeue(in); } - } else if (CDir *dir = dynamic_cast(*it)) { - auto next = it; - ++next; + } else if (CDir *dir = dynamic_cast(*it)) { + ++it; + bool added_children = false; bool done = false; // it's done, so pop it off the stack - scrub_dirfrag(dir, &done); + scrub_dirfrag(dir, &added_children, &done); if (done) { - dout(20) << __func__ << " dirfrag, done" << dendl; - ++it; // child inodes were queued at bottom of stack - dequeue(dir); - } else { - it = next; + dout(20) << __func__ << " dirfrag, done" << dendl; + dequeue(dir); + } + if (added_children) { + it = scrub_stack.begin(); } } else { ceph_assert(0 == "dentry in scrub stack"); @@ -341,7 +356,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub()); } else if (dir->get_version() == 0) { dout(20) << __func__ << " barebones " << *dir << dendl; - dir->fetch_keys({}, gather.new_sub()); + dir->fetch_keys({}, gather.new_sub(), true); } else { _enqueue(dir, header, true); 
queued.insert_raw(dir->get_frag()); @@ -392,9 +407,10 @@ class C_InodeValidated : public MDSInternalContext ScrubStack *stack; CInode::validated_data result; CInode *target; + MDCache* mdcache; C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) - : MDSInternalContext(mds), stack(stack_), target(target_) + : MDSInternalContext(mds), stack(stack_), target(target_), mdcache(mds->mdcache) { stack->scrubs_in_progress++; } @@ -408,16 +424,119 @@ class C_InodeValidated : public MDSInternalContext void ScrubStack::scrub_dir_inode_final(CInode *in) { dout(20) << __func__ << " " << *in << dendl; + ScrubHeaderRef header = in->scrub_info()->header; + if (!in->scrub_info()->forward_scrub && + !in->scrub_info()->remote_links.empty()) { + auto parent = in->get_projected_parent_dn(); + if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) || + (parent && mdcache->mds->damage_table.is_dentry_damaged( + parent->get_dir(), parent->get_name(), parent->last))) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino); + header->inc_scrubbed_remote_link_count(); + } + in->scrub_reset_remote_links(); + in->scrub_finished(); + return; + } + } C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); in->validate_disk_state(&fin->result, fin); return; } -void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) +void ScrubStack::add_remote_link_damage(const std::string &path, + inodeno_t ino) { + CInode* remote_inode = mdcache->get_inode(ino); + std::string head_path = ""; + if (remote_inode) { + remote_inode->make_path_string(head_path); + } + bool fatal = mdcache->mds->damage_table.notify_remote_link_damaged(ino, path, + head_path); + if (fatal) { + mdcache->mds->damaged(); + ceph_abort(); // unreachable, damaged() respawns us + } +} + +class C_RemoteInodeOpenned : public MDSInternalContext { +public: + ScrubStack *stack; + CDentry *dn; + ScrubHeaderRef header; + inodeno_t ino; + MDCache* mdcache; + C_RemoteInodeOpenned(MDSRank *mds, ScrubStack *stack_, + ScrubHeaderRef &header_, CDentry *dn_, inodeno_t ino_) + : MDSInternalContext(mds), stack(stack_), header(header_), dn(dn_), + ino(ino_), mdcache(stack_->mdcache) { + stack->scrubs_in_progress++; + header->inc_num_pending(); + dn->get(MDSCacheObject::PIN_SCRUBQUEUE); + } + void finish(int r) override { + std::string path; + CDir *dir = dn->get_dir(); + CInode *remote_inode = nullptr; + + stack->scrubs_in_progress--; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (r < 0 || !(dnl->is_remote() && dnl->get_remote_ino() == ino)) { + goto safe_exit; + } + remote_inode = mds->mdcache->get_inode(dnl->get_remote_ino()); + if (!remote_inode) { + std::string path; + if (dir) { + dir->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + } + stack->add_remote_link_damage(path, ino); + header->inc_scrubbed_remote_link_count(); + goto safe_exit; + } + stack->_enqueue(remote_inode, header, true, + {std::make_pair(std::move(path), ino)}); + stack->kick_off_scrubs(); + safe_exit: + dn->put(MDSCacheObject::PIN_SCRUBQUEUE); + header->dec_num_pending(); + } +}; + +CInode *ScrubStack::remote_link_checkup(CDentry *dn, ScrubHeaderRef &header) { + + CDentry::linkage_t *dnl = dn->get_linkage(); + CInode *remote_inode = mdcache->get_inode(dnl->get_remote_ino()); + if (!remote_inode) { + if (mdcache->mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { + dout(4) << "scrub: remote dentry points to damaged ino " << *dn << dendl; + 
std::string path; + dn->get_dir()->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + mdcache->mds->damage_table.notify_remote_link_damaged( + dnl->get_remote_ino(), path); + return nullptr; + } + MDSContext *ctx = + (!header->get_repair() && g_conf()->mds_scrub_hard_link) + ? (MDSContext *)(new C_RemoteInodeOpenned( + mdcache->mds, this, header, dn, dnl->get_remote_ino())) + : (MDSContext *)(new C_MDSInternalNoop()); + + mdcache->open_remote_dentry(dn, true, ctx); + } + return remote_inode; +} + +void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done) { ceph_assert(dir != NULL); - dout(10) << __func__ << " " << *dir << dendl; if (!dir->is_complete()) { @@ -455,9 +574,24 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) continue; } if (dnl->is_primary()) { - _enqueue(dnl->get_inode(), header, false); + if (_enqueue(dnl->get_inode(), header, true) == 1) { + *added_children = true; + } } else if (dnl->is_remote()) { - // TODO: check remote linkage + auto remote_ino = dnl->get_remote_ino(); + CInode *remote_inode = remote_link_checkup(dn, header); + if (remote_inode && !header->get_repair() && + g_conf()->mds_scrub_hard_link) { + std::string remote_path; + dir->get_inode()->make_path_string(remote_path); + remote_path += "/"; + remote_path += dn->get_name(); + if (_enqueue(remote_inode, header, true, + {std::make_pair(std::move(remote_path), remote_ino)}) == + 1) { + *added_children = true; + } + } } } } @@ -479,6 +613,24 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) void ScrubStack::scrub_file_inode(CInode *in) { + ScrubHeaderRef header = in->scrub_info()->header; + if (!in->scrub_info()->forward_scrub && + !in->scrub_info()->remote_links.empty()) { + auto parent = in->get_projected_parent_dn(); + if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) || + (parent && mdcache->mds->damage_table.is_dentry_damaged( + parent->get_dir(), parent->get_name(), parent->last))) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino); + header->inc_scrubbed_remote_link_count(); + } + in->scrub_reset_remote_links(); + in->scrub_finished(); + return; + } + } + C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); // At this stage the DN is already past scrub_initialize, so // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned @@ -489,7 +641,7 @@ void ScrubStack::_validate_inode_done(CInode *in, int r, const CInode::validated_data &result) { LogChannelRef clog = mdcache->mds->clog; - const ScrubHeaderRefConst header = in->scrub_info()->header; + ScrubHeaderRef header = in->scrub_info()->header; std::string path; if (!result.passed_validation) { @@ -537,7 +689,32 @@ void ScrubStack::_validate_inode_done(CInode *in, int r, dout(10) << __func__ << " scrub passed on inode " << *in << dendl; } + if (!in->scrub_info()->remote_links.empty()) { + if (!result.passed_validation) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino); + header->inc_scrubbed_remote_link_count(); + } + } else { + CDentry *pdn = in->get_parent_dn(); + if (pdn) { + CInode *diri = pdn->get_dir()->get_inode(); + _enqueue(diri, header, true, std::move(in->scrub_move_remote_links())); + } else { + header->inc_scrubbed_remote_link_count( + in->scrub_info()->remote_links.size()); + } + } + } + in->scrub_reset_remote_links(); + + if (in->scrub_info()->forward_scrub) { + _enqueue(in, 
header, true); + } + in->scrub_finished(); + header->inc_scrubbed_inode_count(); } void ScrubStack::complete_control_contexts(int r) { @@ -637,7 +814,8 @@ void ScrubStack::scrub_status(Formatter *f) { if (scrubbing_map.empty()) *css << "no active scrubs running"; else - *css << state << " (waiting for more scrubs)"; + *css << state << " (waiting for more scrubs, " << stack_size + << " inodes in the stack)"; } else if (state == STATE_RUNNING) { if (clear_stack) { *css << "ABORTING"; @@ -676,6 +854,10 @@ void ScrubStack::scrub_status(Formatter *f) { f->dump_stream("path") << "#" << header->get_origin(); f->dump_string("tag", header->get_tag()); + f->dump_unsigned("scrubbed_inode_count", + header->get_scrubbed_inode_count()); + f->dump_unsigned("scrubbed_remote_link_count", + header->get_scrubbed_remote_link_count()); CachedStackStringStream optcss; if (header->get_recursive()) { diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index 756ebd9cb0e95..789fe38a70695 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -154,8 +154,12 @@ class ScrubStack { friend std::ostream &operator<<(std::ostream &os, const State &state); friend class C_InodeValidated; + friend class C_RemoteInodeOpenned; + friend class C_RemoteLinkCheckFinished; - int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top); + int _enqueue( + MDSCacheObject *obj, ScrubHeaderRef &header, bool top, + std::vector> &&remote_links = {}); /** * Remove the inode/dirfrag from the stack. */ @@ -188,6 +192,12 @@ class ScrubStack { */ void scrub_file_inode(CInode *in); + /** + * Check a remote (hard link) dentry; if its inode is not cached, open it asynchronously. + * @param dn The remote dentry to check + */ + CInode *remote_link_checkup(CDentry *dn, ScrubHeaderRef &header); + /** * Callback from completion of CInode::validate_disk_state * @param in The inode we were validating @@ -211,9 +221,10 @@ class ScrubStack { * scrub of the dirfrag. * * @param dir The dirfrag to scrub (must be auth) + * @param added_children set to true if we pushed some of our children * @param done set to true if we started to do final scrub */ - void scrub_dirfrag(CDir *dir, bool *done); + void scrub_dirfrag(CDir *dir, bool *added_children, bool *done); /** * Scrub a directory-representing dentry. * @@ -267,6 +278,7 @@ class ScrubStack { void handle_scrub(const cref_t &m); void handle_scrub_stats(const cref_t &m); + void add_remote_link_damage(const std::string &path, inodeno_t ino); State state = STATE_IDLE; bool clear_stack = false;
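
Usage sketch (illustrative, not part of the patch): the commands below assume a file system named "cephfs" with the scrub driven from rank 0, and the log path is a placeholder; adjust both for your cluster.

  # Persist damage findings to a file in addition to the in-memory damage table.
  ceph config set mds mds_damage_log_to_file true
  ceph config set mds mds_damage_log_file /var/log/ceph/mds-damages.log
  # Enable backward (reverse-parent) scrubbing of hard links; per this patch it only
  # applies to read-only scrubs, i.e. scrubs started without the repair flag.
  ceph config set mds mds_scrub_hard_link true
  # Run a recursive, non-repair scrub and inspect the results.
  ceph tell mds.cephfs:0 scrub start / recursive
  ceph tell mds.cephfs:0 scrub status
  ceph tell mds.cephfs:0 damage ls
  # Drop all in-memory damage entries ("damage clear" is the command added by this patch).
  ceph tell mds.cephfs:0 damage clear

With mds_damage_log_to_file enabled, entries are still appended to the file even when the in-memory table is oversized, which is what makes the file a safer record than memory for long scrubs. Each entry is written as one JSON line; a remote-link entry would look roughly like the following (the id/ino/path values here are made up for illustration):

  {"damage_type":"remote_link","id":3141592653,"ino":1099511627776,"path":"/dir/hardlink","head_path":"/dir/file"}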