From a6b248eeb49d053426b9ec0961626086a20b8185 Mon Sep 17 00:00:00 2001 From: sajibreadd-croit Date: Fri, 19 Dec 2025 08:55:27 +0100 Subject: [PATCH 1/3] mds: scrub pins more inodes than the mds_cache_memory_limit When scrubbing a dirfrag, its children are pushed to the back of the scrub stack. Instead, follow the same strategy already used when scrubbing a directory inode: push the children to the front of the scrub stack, and in kick_off_scrubs always start scrubbing from the front of the stack. This prevents ScrubStack from pinning a whole level of the file-system tree. Fixes: https://tracker.ceph.com/issues/71167 Signed-off-by: Md Mahamudur Rahaman Sajib --- src/mds/ScrubStack.cc | 26 +++++++++++++------------- src/mds/ScrubStack.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 742c464f4d37a..71d510669c2af 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -103,7 +103,7 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) scrub_stack.push_front(&obj->item_scrub); else scrub_stack.push_back(&obj->item_scrub); - return 0; + return 1; } int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) @@ -242,16 +242,16 @@ void ScrubStack::kick_off_scrubs() } } } else if (CDir *dir = dynamic_cast(*it)) { - auto next = it; - ++next; + ++it; + bool added_children = false; bool done = false; // it's done, so pop it off the stack - scrub_dirfrag(dir, &done); + scrub_dirfrag(dir, &added_children, &done); if (done) { - dout(20) << __func__ << " dirfrag, done" << dendl; - ++it; // child inodes were queued at bottom of stack - dequeue(dir); - } else { - it = next; + dout(20) << __func__ << " dirfrag, done" << dendl; + dequeue(dir); + } + if (added_children) { + it = scrub_stack.begin(); } } else { ceph_assert(0 == "dentry in scrub stack"); @@ -414,10 +414,9 @@ void ScrubStack::scrub_dir_inode_final(CInode *in) return; } -void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) +void 
ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done) { ceph_assert(dir != NULL); - dout(10) << __func__ << " " << *dir << dendl; if (!dir->is_complete()) { @@ -455,7 +454,9 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) continue; } if (dnl->is_primary()) { - _enqueue(dnl->get_inode(), header, false); + if (_enqueue(dnl->get_inode(), header, true) == 1) { + *added_children = true; + } } else if (dnl->is_remote()) { // TODO: check remote linkage } @@ -476,7 +477,6 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done) *done = true; dout(10) << __func__ << " done" << dendl; } - void ScrubStack::scrub_file_inode(CInode *in) { C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index 756ebd9cb0e95..4723d3b90fff0 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -213,7 +213,7 @@ class ScrubStack { * @param dir The dirfrag to scrub (must be auth) * @param done set to true if we started to do final scrub */ - void scrub_dirfrag(CDir *dir, bool *done); + void scrub_dirfrag(CDir *dir, bool *added_children, bool *done); /** * Scrub a directory-representing dentry. 
* From 314eadcdf3b9ed142b5a2d046ad36d78b6c1de81 Mon Sep 17 00:00:00 2001 From: sajibreadd-croit Date: Fri, 19 Dec 2025 09:28:12 +0100 Subject: [PATCH 2/3] mds: gracefully terminate missed dir object scrubbing Fixes: https://tracker.ceph.com/issues/68611 Signed-off-by: Md Mahamudur Rahaman Sajib --- src/mds/CDir.cc | 25 +++++++++++++------------ src/mds/CDir.h | 12 ++++++++---- src/mds/ScrubStack.cc | 2 +- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a8aaf11c0512c..00d03af345829 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1585,8 +1585,8 @@ void CDir::fetch(std::string_view dname, snapid_t last, mdcache->mds->balancer->hit_dir(this, META_POP_FETCH); } -void CDir::fetch_keys(const std::vector& keys, MDSContext *c) -{ +void CDir::fetch_keys(const std::vector &keys, MDSContext *c, + bool from_scrub) { dout(10) << __func__ << " " << keys.size() << " keys on " << *this << dendl; ceph_assert(is_auth()); ceph_assert(!is_complete()); @@ -1643,7 +1643,7 @@ void CDir::fetch_keys(const std::vector& keys, MDSContext *c) } auth_pin(this); - _omap_fetch(&str_keys, c); + _omap_fetch(&str_keys, c, from_scrub); if (mdcache->mds->logger) mdcache->mds->logger->inc(l_mds_dir_fetch_keys); @@ -1698,6 +1698,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { map omap; bufferlist btbl; int ret1, ret2, ret3; + bool from_scrub = false; C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) : CDirIOContext(d), fin(f), @@ -1719,7 +1720,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { return; } - dir->_omap_fetched(hdrbl, omap, complete, keys, r); + dir->_omap_fetched(hdrbl, omap, complete, keys, r, from_scrub); if (fin) fin->complete(r); } @@ -1728,8 +1729,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { } }; -void CDir::_omap_fetch(std::set *keys, MDSContext *c) -{ +void CDir::_omap_fetch(std::set *keys, MDSContext *c, bool from_scrub) { C_IO_Dir_OMAP_Fetched *fin = new C_IO_Dir_OMAP_Fetched(this, c); object_t 
oid = get_ondisk_object(); object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool()); @@ -1737,6 +1737,7 @@ void CDir::_omap_fetch(std::set *keys, MDSContext *c) rd.omap_get_header(&fin->hdrbl, &fin->ret1); if (keys) { fin->complete = false; + fin->from_scrub = from_scrub; fin->keys.swap(*keys); rd.omap_get_vals_by_keys(fin->keys, &fin->omap, &fin->ret2); } else { @@ -1989,9 +1990,9 @@ CDentry *CDir::_load_dentry( return dn; } -void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, - bool complete, const std::set& keys, int r) -{ +void CDir::_omap_fetched(bufferlist &hdrbl, map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub) { LogChannelRef clog = mdcache->mds->clog; dout(10) << "_fetched header " << hdrbl.length() << " bytes " << omap.size() << " keys for " << *this << dendl; @@ -2006,7 +2007,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, clog->error() << "dir " << dirfrag() << " object missing on disk; some " "files may be lost (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } @@ -2020,14 +2021,14 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, << ": " << err.what() << dendl; clog->warn() << "Corrupt fnode header in " << dirfrag() << ": " << err.what() << " (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } if (!p.end()) { clog->warn() << "header buffer of dir " << dirfrag() << " has " << hdrbl.length() - p.get_off() << " extra bytes (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } } diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7cc4dc7ffcf83..2c124350d0f9a 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -483,7 +483,8 @@ class CDir : public MDSCacheObject, public Counter { void fetch(MDSContext *c, bool ignore_authpinnability=false) { fetch("", CEPH_NOSNAP, c, ignore_authpinnability); } - void fetch_keys(const std::vector& keys, MDSContext *c); + void fetch_keys(const std::vector &keys, 
MDSContext *c, + bool from_scrub = false); #if 0 // unused? void wait_for_commit(Context *c, version_t v=0); @@ -653,7 +654,8 @@ class CDir : public MDSCacheObject, public Counter { friend class C_IO_Dir_Committed; friend class C_IO_Dir_Commit_Ops; - void _omap_fetch(std::set *keys, MDSContext *fin=nullptr); + void _omap_fetch(std::set *keys, MDSContext *fin = nullptr, + bool from_scrub = false); void _omap_fetch_more(version_t omap_version, bufferlist& hdrbl, std::map& omap, MDSContext *fin); CDentry *_load_dentry( @@ -671,8 +673,10 @@ class CDir : public MDSCacheObject, public Counter { */ void go_bad(bool complete); - void _omap_fetched(ceph::buffer::list& hdrbl, std::map& omap, - bool complete, const std::set& keys, int r); + void _omap_fetched(ceph::buffer::list &hdrbl, + std::map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub = false); // -- commit -- void _commit(version_t want, int op_prio); diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 71d510669c2af..e28e513a72400 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -341,7 +341,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub()); } else if (dir->get_version() == 0) { dout(20) << __func__ << " barebones " << *dir << dendl; - dir->fetch_keys({}, gather.new_sub()); + dir->fetch_keys({}, gather.new_sub(), true); } else { _enqueue(dir, header, true); queued.insert_raw(dir->get_frag()); From de31182bc3e1d63119e433c3e383d8d2110c3576 Mon Sep 17 00:00:00 2001 From: sajibreadd-croit Date: Fri, 19 Dec 2025 09:57:39 +0100 Subject: [PATCH 3/3] mds: remote link scrub fix using backward scrubbing from head inode 1. remote link damage identification with reverse parent scrubbing - remote link identification becomes tricky if inode is cached. 
- Try to open the link normally; if there is an issue while opening, mark it as damaged - If it is opened successfully, it is still possible that there is damage but the inode is cached, which is why opening succeeded. In that case take the opened inode and scrub its ancestors recursively. If any of the ancestors is damaged, the remote link is marked as damaged. - while scrubbing, some flags are maintained in the inode, e.g. whether the scrub is backward or forward or both - This backward scrubbing only works in a read-only scrub, i.e. without the repair flag and with the mds_force_hard_link_scrubbing config option turned on. - A new type of damage is introduced, with which multiple links pointing to the same inode can be identified, which was not possible previously. 2. mds_damage_log_to_file and mds_damage_log_file are used to write damages to a file persistently, as it is not safe to keep them only in memory Signed-off-by: Md Mahamudur Rahaman Sajib --- src/common/options/mds.yaml.in | 24 ++++ src/mds/CDir.cc | 6 +- src/mds/CInode.cc | 28 +++- src/mds/CInode.h | 11 ++ src/mds/DamageTable.cc | 186 ++++++++++++++++++++++---- src/mds/DamageTable.h | 57 +++++++- src/mds/MDCache.cc | 8 +- src/mds/MDSDaemon.cc | 3 + src/mds/MDSRank.cc | 17 ++- src/mds/ScrubHeader.h | 14 ++ src/mds/ScrubStack.cc | 235 ++++++++++++++++++++++++++++----- src/mds/ScrubStack.h | 16 ++- 12 files changed, 538 insertions(+), 67 deletions(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 6234b96cdc7b9..e05ef685759ea 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1555,3 +1555,27 @@ options: - mds flags: - runtime +- name: mds_damage_log_to_file + type: bool + level: advanced + desc: send mds damage lines to a file + fmt_desc: Determines if damages should appear in a file. 
+ default: false + see_also: + - log_file + with_legacy: true +- name: mds_damage_log_file + type: str + level: advanced + desc: path to log file where damage will be written + fmt_desc: The location of the logging file for where damage of mds will be written. + daemon_default: /var/log/ceph/$cluster-$name-damages.log + with_legacy: true +- name: mds_force_hard_link_scrubbing + type: bool + level: advanced + desc: force scrubbing hard link + default: false + services: + - mds + with_legacy: true \ No newline at end of file diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 00d03af345829..4a089dc61d06e 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -2007,7 +2007,7 @@ void CDir::_omap_fetched(bufferlist &hdrbl, map &omap, clog->error() << "dir " << dirfrag() << " object missing on disk; some " "files may be lost (" << get_path() << ")"; - go_bad(complete | from_scrub); + go_bad(complete || from_scrub); return; } @@ -2021,14 +2021,14 @@ void CDir::_omap_fetched(bufferlist &hdrbl, map &omap, << ": " << err.what() << dendl; clog->warn() << "Corrupt fnode header in " << dirfrag() << ": " << err.what() << " (" << get_path() << ")"; - go_bad(complete | from_scrub); + go_bad(complete || from_scrub); return; } if (!p.end()) { clog->warn() << "header buffer of dir " << dirfrag() << " has " << hdrbl.length() - p.get_off() << " extra bytes (" << get_path() << ")"; - go_bad(complete | from_scrub); + go_bad(complete || from_scrub); return; } } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 71b6081be7de2..6fdc20d7f2f9a 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -5198,7 +5198,6 @@ void CInode::scrub_info_create() const { dout(25) << __func__ << dendl; ceph_assert(!scrub_infop); - // break out of const-land to set up implicit initial state CInode *me = const_cast(this); const auto& pi = me->get_projected_inode(); @@ -5231,23 +5230,46 @@ void CInode::scrub_initialize(ScrubHeaderRef& header) // right now we don't handle remote inodes } +void 
CInode::set_forward_scrub(bool forward_scrub) { + scrub_infop->forward_scrub = forward_scrub; +} + +void CInode::scrub_add_remote_link( + std::vector> &&remote_links) { + + for (auto &p : remote_links) { + scrub_infop->remote_links.emplace_back(std::move(p)); + } +} + +void CInode::scrub_reset_remote_links() { + scrub_infop->remote_links.clear(); +} + +std::vector> && +CInode::scrub_move_remote_links() { + return std::move(scrub_infop->remote_links); +} + void CInode::scrub_aborted() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->scrub_in_progress = false; scrub_infop->header->dec_num_pending(); + scrub_infop->remote_links.clear(); + scrub_infop->forward_scrub = true; scrub_maybe_delete_info(); } void CInode::scrub_finished() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->last_scrub_version = get_version(); scrub_infop->last_scrub_stamp = ceph_clock_now(); scrub_infop->last_scrub_dirty = true; scrub_infop->scrub_in_progress = false; + scrub_infop->remote_links.clear(); + scrub_infop->forward_scrub = true; scrub_infop->header->dec_num_pending(); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 6f965bffa8ea9..71d57a452f8e7 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -305,6 +305,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> remote_links; + bool forward_scrub = true; fragset_t queued_frags; @@ -458,6 +460,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> &&remote_links); + + void scrub_reset_remote_links(); + + std::vector> &&scrub_move_remote_links(); + + void set_forward_scrub(bool forward_scrub); + fragset_t& scrub_queued_frags() { ceph_assert(scrub_infop); return scrub_infop->queued_frags; diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc index 2079d23333a83..944cba980188e 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -29,6 +29,12 @@ namespace { * Record damage to a 
particular dirfrag, implicitly affecting * any dentries within it. */ +inline std::ostream& operator<<(std::ostream& os, const DamageEntry& entry) +{ + entry.print(os); + return os; +} + class DirFragDamage : public DamageEntry { public: @@ -123,6 +129,28 @@ class BacktraceDamage : public DamageEntry f->close_section(); } }; + +class RemoteLinkDamage : public DamageEntry { +public: + inodeno_t ino; + std::string head_path; + RemoteLinkDamage(inodeno_t ino_, const std::string &head_path_ = "") + : ino(ino_), head_path(head_path_) {} + + damage_entry_type_t get_type() const override { + return DAMAGE_ENTRY_REMOTE_LINK; + } + + void dump(Formatter *f) const override { + f->open_object_section("remote_link_damage"); + f->dump_string("damage_type", "remote_link"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_string("path", path); + f->dump_string("head_path", head_path); + f->close_section(); + } +}; } DamageEntry::~DamageEntry() @@ -132,28 +160,34 @@ bool DamageTable::notify_dentry( inodeno_t ino, frag_t frag, snapid_t snap_id, std::string_view dname, std::string_view path) { - if (oversized()) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } // Special cases: damage to these dirfrags is considered fatal to // the MDS rank that owns them. 
- if ( - (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) - || - (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank) - ) { + if ((MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) || + (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank)) { derr << "Damage to dentries in fragment " << frag << " of ino " << ino << "is fatal because it is a system directory for this rank" << dendl; return true; } - auto& df_dentries = dentries[DirFragIdent(ino, frag)]; - if (auto [it, inserted] = df_dentries.try_emplace(DentryIdent(dname, snap_id)); inserted) { - auto entry = std::make_shared(ino, frag, dname, snap_id); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino, frag, dname, snap_id); + entry->path = path; + if (log_to_file && log_file_opened) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto &df_dentries = dentries[DirFragIdent(ino, frag)]; + if (auto [it, inserted] = + df_dentries.try_emplace(DentryIdent(dname, snap_id)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -171,15 +205,24 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, return true; } - if (oversized()) { + bool over_sized = oversized(); + + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); inserted) { - DamageEntryRef entry = std::make_shared(ino, frag); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + DamageEntryRef entry = std::make_shared(ino, frag); + entry->path = path; + if (log_to_file && log_file_opened) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -187,15 +230,47 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t 
frag, bool DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) { - if (oversized()) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { - auto entry = std::make_shared(ino); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino); + entry->path = path; + if (log_to_file && log_file_opened) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } + } + + return false; +} + +bool DamageTable::notify_remote_link_damaged(inodeno_t ino, + const std::string &path, + const std::string &head_path) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { + return true; + } + + auto entry = std::make_shared(ino, head_path); + entry->path = path; + if (log_to_file && log_file_opened) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto& df_remote_links = remote_links[ino]; + if (auto [it, inserted] = df_remote_links.try_emplace(path); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -263,6 +338,10 @@ bool DamageTable::is_remote_damaged( return remotes.count(ino) > 0; } +bool DamageTable::is_remote_link_damaged(const inodeno_t ino) const { + return remote_links.count(ino) > 0; +} + void DamageTable::dump(Formatter *f) const { f->open_array_section("damage_table"); @@ -293,6 +372,15 @@ void DamageTable::erase(damage_entry_id_t damage_id) } else if (type == DAMAGE_ENTRY_BACKTRACE) { auto backtrace_entry = std::static_pointer_cast(entry); remotes.erase(backtrace_entry->ino); + } else if (type == DAMAGE_ENTRY_REMOTE_LINK) { + auto remote_link_entry = std::static_pointer_cast(entry); + auto df_remote_link_it = remote_links.find(remote_link_entry->ino); + if (df_remote_link_it != 
remote_links.end()) { + df_remote_link_it->second.erase(entry->path); + if(df_remote_link_it->second.empty()) { + remote_links.erase(df_remote_link_it); + } + } } else { derr << "Invalid type " << type << dendl; ceph_abort(); @@ -301,3 +389,53 @@ void DamageTable::erase(damage_entry_id_t damage_id) by_id.erase(by_id_entry); } +void DamageTable::open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path) { + namespace fs = std::filesystem; + close_damage_log_file(); + + const fs::path dir = file_path.parent_path(); + + if (!dir.empty()) { + std::error_code ec; + + if (!fs::exists(dir, ec)) { + if (ec) { + derr << "error checking existence of damage dir: " << dir << " (" + << ec.message() << ")" << dendl; + return; + } + + if (!fs::create_directories(dir, ec)) { + derr << "failed to create directories for damage file: " << dir << " (" + << ec.message() << ")" << dendl; + return; + } + } + } + + fout.open(file_path, std::ios::out | std::ios::app); + + if (!fout.is_open()) { + derr << "failed to open damage file: " << file_path << dendl; + return; + } + + log_file_opened = true; +} + +void DamageTable::close_damage_log_file() { + if (fout.is_open()) { + fout.close(); + } + fout.clear(); + log_file_opened = false; +} + +void DamageTable::clear() { + dirfrags.clear(); + dentries.clear(); + remotes.clear(); + remote_links.clear(); + by_id.clear(); +} diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h index a1b96fe221864..5524828278463 100644 --- a/src/mds/DamageTable.h +++ b/src/mds/DamageTable.h @@ -16,8 +16,12 @@ #ifndef DAMAGE_TABLE_H_ #define DAMAGE_TABLE_H_ +#include +#include +#include #include +#include "common/Formatter.h" #include "mdstypes.h" #include "include/random.h" @@ -30,7 +34,8 @@ typedef enum { DAMAGE_ENTRY_DIRFRAG, DAMAGE_ENTRY_DENTRY, - DAMAGE_ENTRY_BACKTRACE + DAMAGE_ENTRY_BACKTRACE, + DAMAGE_ENTRY_REMOTE_LINK } damage_entry_type_t; @@ -47,6 +52,11 @@ class DamageEntry virtual damage_entry_type_t get_type() const = 0; 
virtual void dump(Formatter *f) const = 0; + void print(std::ostream &os) const { + JSONFormatter jf; + dump(&jf); + jf.flush(os); + } damage_entry_id_t id; utime_t reported_at; @@ -121,10 +131,13 @@ class DentryIdent class DamageTable { public: - explicit DamageTable(const mds_rank_t rank_) - : rank(rank_) - { + explicit DamageTable(const mds_rank_t rank_, bool log_to_file_, + const std::string &log_file_) + : rank(rank_), log_to_file(log_to_file_), log_file(log_file_) { ceph_assert(rank_ != MDS_RANK_NONE); + if (log_to_file) { + open_damage_log_file(fout, log_file); + } } /** @@ -156,6 +169,9 @@ class DamageTable */ bool notify_remote_damaged(inodeno_t ino, std::string_view path); + bool notify_remote_link_damaged(inodeno_t ino, const std::string &path, + const std::string &head_path = ""); + void remove_dentry_damage_entry(CDir *dir); void remove_dirfrag_damage_entry(CDir *dir); @@ -171,10 +187,36 @@ class DamageTable bool is_remote_damaged(const inodeno_t ino) const; + bool is_remote_link_damaged(const inodeno_t ino) const; + void dump(Formatter *f) const; void erase(damage_entry_id_t damage_id); + void set_log_to_file(bool _log_to_file) { + log_to_file = _log_to_file; + if (log_to_file) { + open_damage_log_file(fout, log_file); + } else { + close_damage_log_file(); + } + } + + void set_log_file(const std::string &_log_file) { + log_file = _log_file; + if (log_to_file) { + open_damage_log_file(fout, log_file); + } + } + + void clear(); + + private: + void open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path); + void close_damage_log_file(); + std::ofstream fout; + protected: // I need to know my MDS rank so that I can check if // metadata items are part of my mydir. @@ -194,10 +236,17 @@ class DamageTable // (i.e. have probably/possibly missing backtraces) std::map remotes; + // Map of all links which could not be resolved + // (i.e. have probably/possibly missing primary inodes) + std::map> remote_links; + // All damage, by ID. 
This is a secondary index // to the dirfrag, dentry, remote maps. It exists // to enable external tools to unambiguously operate // on particular entries. std::map by_id; + bool log_to_file = false; + std::string log_file = ""; + bool log_file_opened = false; }; #endif // DAMAGE_TABLE_H_ diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5480e6dcd5efe..6675a28d96908 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8493,6 +8493,12 @@ int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { dout(4) << "traverse: remote dentry points to damaged ino " << *dn << dendl; + std::string path; + dn->get_dir()->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + mds->damage_table.notify_remote_link_damaged(dnl->get_remote_ino(), + path); return -CEPHFS_EIO; } open_remote_dentry(dn, true, cf.build(), @@ -8817,7 +8823,7 @@ void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext path += dn->get_name(); } - bool fatal = mds->damage_table.notify_remote_damaged(ino, path); + bool fatal = mds->damage_table.notify_remote_link_damaged(ino, path); if (fatal) { mds->damaged(); ceph_abort(); // unreachable, damaged() respawns us diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index e97fd2cf83f8f..67718aae08947 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -402,6 +402,9 @@ void MDSDaemon::set_up_admin_socket() asok_hook, "Remove a damage table entry"); ceph_assert(r == 0); + r = admin_socket->register_command("damage clear", asok_hook, + "clear the damage list"); + ceph_assert(r == 0); r = admin_socket->register_command("osdmap barrier name=target_epoch,type=CephInt", asok_hook, "Wait until the MDS has this OSD map epoch"); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index aa6a8c162f4f5..462125216e4d0 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -498,7 +498,9 @@ MDSRank::MDSRank( 
cct(msgr->cct), mds_lock(mds_lock_), clog(clog_), timer(timer_), mdsmap(mdsmap_), objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)), - damage_table(whoami_), sessionmap(this), + damage_table(whoami_, g_conf()->mds_damage_log_to_file, + g_conf()->mds_damage_log_file), + sessionmap(this), op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker, g_conf()->osd_num_op_tracker_shard), progress_thread(this), whoami(whoami_), @@ -2929,6 +2931,9 @@ void MDSRankDispatcher::handle_asok_command( goto out; } damage_table.erase(id); + } else if (command == "damage clear") { + std::lock_guard l(mds_lock); + damage_table.clear(); } else { r = -CEPHFS_ENOSYS; } @@ -3866,6 +3871,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_inject_rename_corrupt_dentry_first", "mds_inject_journal_corrupt_dentry_first", "mds_session_metadata_threshold", + "mds_damage_log_to_file", + "mds_damage_log_file", NULL }; return KEYS; @@ -3936,6 +3943,14 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s if (changed.count("mds_inject_journal_corrupt_dentry_first")) { inject_journal_corrupt_dentry_first = g_conf().get_val("mds_inject_journal_corrupt_dentry_first"); } + if (changed.count("mds_damage_log_to_file")) { + damage_table.set_log_to_file( + g_conf().get_val("mds_damage_log_to_file")); + } + if (changed.count("mds_damage_log_file")) { + damage_table.set_log_file( + g_conf().get_val("mds_damage_log_file")); + } finisher->queue(new LambdaContext([this, changed](int) { std::scoped_lock lock(mds_lock); diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h index a5d35f61ce428..0a27ab5ee4871 100644 --- a/src/mds/ScrubHeader.h +++ b/src/mds/ScrubHeader.h @@ -64,6 +64,18 @@ class ScrubHeader { } unsigned get_num_pending() const { return num_pending; } + void inc_scrubbed_inode_count() { ++scrubbed_inode_count; } + + uint64_t get_scrubbed_inode_count() const { return scrubbed_inode_count; } + + void inc_scrubbed_remote_link_count(uint64_t 
val = 1) { + scrubbed_remote_link_count += val; + } + + uint64_t get_scrubbed_remote_link_count() const { + return scrubbed_remote_link_count; + } + protected: const std::string tag; bool is_tag_internal; @@ -76,6 +88,8 @@ class ScrubHeader { bool repaired = false; // May be set during scrub if repairs happened unsigned epoch_last_forwarded = 0; unsigned num_pending = 0; + uint64_t scrubbed_inode_count = 0; + uint64_t scrubbed_remote_link_count = 0; }; typedef std::shared_ptr ScrubHeaderRef; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index e28e513a72400..753c790c66918 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -59,12 +59,18 @@ void ScrubStack::dequeue(MDSCacheObject *obj) stack_size--; } -int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) -{ +int ScrubStack::_enqueue( + MDSCacheObject *obj, ScrubHeaderRef &header, bool top, bool *added, + std::vector> &&remote_links) { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); if (CInode *in = dynamic_cast(obj)) { if (in->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *in << "}" << ", already in scrubbing" << dendl; + if (!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + } else { + in->set_forward_scrub(true); + } return -CEPHFS_EBUSY; } if(in->state_test(CInode::STATE_PURGING)) { @@ -75,6 +81,11 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl; in->scrub_initialize(header); + if (!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + in->set_forward_scrub(false); + } + } else if (CDir *dir = dynamic_cast(obj)) { if (dir->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl; @@ -103,7 +114,12 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) 
scrub_stack.push_front(&obj->item_scrub); else scrub_stack.push_back(&obj->item_scrub); - return 1; + + if (added) { + *added = true; + } + + return 0; } int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) @@ -209,39 +225,43 @@ void ScrubStack::kick_off_scrubs() if (scrubs_in_progress == 0) { set_state(STATE_IDLE); } - return; } assert(state == STATE_RUNNING || state == STATE_IDLE); set_state(STATE_RUNNING); - if (CInode *in = dynamic_cast(*it)) { + if (CInode *in = dynamic_cast(*it)) { dout(20) << __func__ << " examining " << *in << dendl; ++it; if (!validate_inode_auth(in)) - continue; + continue; if (!in->is_dir()) { - // it's a regular file, symlink, or hard link - dequeue(in); // we only touch it this once, so remove from stack - - scrub_file_inode(in); + // it's a regular file, symlink, or hard link + dequeue(in); // we only touch it this once, so remove from stack + scrub_file_inode(in); + } else if (in->scrub_info()->forward_scrub) { + bool added_children = false; + bool done = false; // it's done, so pop it off the stack + scrub_dir_inode(in, &added_children, &done); + if (done) { + dout(20) << __func__ << " dir inode, done" << dendl; + in->set_forward_scrub(false); + dequeue(in); + } + if (added_children) { + // dirfrags were queued at top of stack + it = scrub_stack.begin(); + } + } else if (!in->scrub_info()->remote_links.empty()){ + dequeue(in); + scrub_dir_inode_final(in); } else { - bool added_children = false; - bool done = false; // it's done, so pop it off the stack - scrub_dir_inode(in, &added_children, &done); - if (done) { - dout(20) << __func__ << " dir inode, done" << dendl; - dequeue(in); - } - if (added_children) { - // dirfrags were queued at top of stack - it = scrub_stack.begin(); - } + dequeue(in); } - } else if (CDir *dir = dynamic_cast(*it)) { + } else if (CDir *dir = dynamic_cast(*it)) { ++it; bool added_children = false; bool done = false; // it's done, so pop it off the stack @@ -392,9 +412,10 @@ class 
C_InodeValidated : public MDSInternalContext ScrubStack *stack; CInode::validated_data result; CInode *target; + MDCache* mdcache; C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) - : MDSInternalContext(mds), stack(stack_), target(target_) + : MDSInternalContext(mds), stack(stack_), target(target_), mdcache(mds->mdcache) { stack->scrubs_in_progress++; } @@ -408,12 +429,109 @@ class C_InodeValidated : public MDSInternalContext void ScrubStack::scrub_dir_inode_final(CInode *in) { dout(20) << __func__ << " " << *in << dendl; + ScrubHeaderRef header = in->scrub_info()->header; + if (!in->scrub_info()->forward_scrub && + !in->scrub_info()->remote_links.empty()) { + auto parent = in->get_projected_parent_dn(); + if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) || + (parent && mdcache->mds->damage_table.is_dentry_damaged( + parent->get_dir(), parent->get_name(), parent->last))) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino, header); + } + in->scrub_finished(); + return; + } + } C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); in->validate_disk_state(&fin->result, fin); return; } +void ScrubStack::add_remote_link_damage(const std::string &path, inodeno_t ino, + ScrubHeaderRef &header) { + CInode* remote_inode = mdcache->get_inode(ino); + std::string head_path = ""; + if (remote_inode) { + remote_inode->make_path_string(head_path); + } + mdcache->mds->damage_table.notify_remote_link_damaged(ino, path, head_path); + header->inc_scrubbed_remote_link_count(); +} + +class C_RemoteInodeOpened : public MDSInternalContext { +public: + ScrubStack *stack; + CDentry *dn; + ScrubHeaderRef header; + inodeno_t ino; + MDCache* mdcache; + C_RemoteInodeOpened(MDSRank *mds, ScrubStack *stack_, + ScrubHeaderRef &header_, CDentry *dn_, inodeno_t ino_) + : MDSInternalContext(mds), stack(stack_), header(header_), dn(dn_), + ino(ino_), 
mdcache(stack_->mdcache) { + stack->scrubs_in_progress++; + header->inc_num_pending(); + dn->get(MDSCacheObject::PIN_SCRUBQUEUE); + } + void finish(int r) override { + std::string path; + CDir *dir = dn->get_dir(); + CInode *remote_inode = nullptr; + + stack->scrubs_in_progress--; + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (r < 0 || !(dnl->is_remote() && dnl->get_remote_ino() == ino)) { + goto safe_exit; + } + remote_inode = mds->mdcache->get_inode(dnl->get_remote_ino()); + if (!remote_inode) { + std::string path; + if (dir) { + dir->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + } + stack->add_remote_link_damage(path, ino, header); + goto safe_exit; + } + stack->_enqueue(remote_inode, header, true, nullptr, + {std::make_pair(std::move(path), ino)}); + stack->kick_off_scrubs(); + safe_exit: + dn->put(MDSCacheObject::PIN_SCRUBQUEUE); + header->dec_num_pending(); + } +}; + +CInode *ScrubStack::remote_link_checkup(CDentry *dn, ScrubHeaderRef &header) { + + CDentry::linkage_t *dnl = dn->get_linkage(); + CInode *remote_inode = mdcache->get_inode(dnl->get_remote_ino()); + if (!remote_inode) { + if (mdcache->mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { + dout(4) << "scrub: remote dentry points to damaged ino " << *dn << dendl; + std::string path; + dn->get_dir()->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + mdcache->mds->damage_table.notify_remote_link_damaged( + dnl->get_remote_ino(), path); + return nullptr; + } + MDSContext *ctx = + (!header->get_repair() && g_conf()->mds_force_hard_link_scrubbing) + ? 
(MDSContext *)(new C_RemoteInodeOpened( + mdcache->mds, this, header, dn, dnl->get_remote_ino())) + : (MDSContext *)(new C_MDSInternalNoop()); + + mdcache->open_remote_dentry(dn, true, ctx); + } + return remote_inode; +} + void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done) { ceph_assert(dir != NULL); @@ -454,11 +572,19 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done) continue; } if (dnl->is_primary()) { - if (_enqueue(dnl->get_inode(), header, true) == 1) { - *added_children = true; - } + _enqueue(dnl->get_inode(), header, true, added_children); } else if (dnl->is_remote()) { - // TODO: check remote linkage + auto remote_ino = dnl->get_remote_ino(); + CInode *remote_inode = remote_link_checkup(dn, header); + if (remote_inode && !header->get_repair() && + g_conf()->mds_force_hard_link_scrubbing) { + std::string remote_path; + dir->get_inode()->make_path_string(remote_path); + remote_path += "/"; + remote_path += dn->get_name(); + _enqueue(remote_inode, header, true, added_children, + {std::make_pair(std::move(remote_path), remote_ino)}); + } } } } @@ -477,8 +603,25 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done) *done = true; dout(10) << __func__ << " done" << dendl; } + void ScrubStack::scrub_file_inode(CInode *in) { + ScrubHeaderRef header = in->scrub_info()->header; + if (!in->scrub_info()->forward_scrub && + !in->scrub_info()->remote_links.empty()) { + auto parent = in->get_projected_parent_dn(); + if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) || + (parent && mdcache->mds->damage_table.is_dentry_damaged( + parent->get_dir(), parent->get_name(), parent->last))) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino, header); + } + in->scrub_finished(); + return; + } + } + C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); // At this stage the DN is already past 
scrub_initialize, so // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned @@ -489,7 +632,7 @@ void ScrubStack::_validate_inode_done(CInode *in, int r, const CInode::validated_data &result) { LogChannelRef clog = mdcache->mds->clog; - const ScrubHeaderRefConst header = in->scrub_info()->header; + ScrubHeaderRef header = in->scrub_info()->header; std::string path; if (!result.passed_validation) { @@ -537,7 +680,34 @@ void ScrubStack::_validate_inode_done(CInode *in, int r, dout(10) << __func__ << " scrub passed on inode " << *in << dendl; } - in->scrub_finished(); + if (!in->scrub_info()->remote_links.empty()) { + if (!result.passed_validation) { + for (auto &[remote_link_path, remote_ino] : + in->scrub_info()->remote_links) { + add_remote_link_damage(remote_link_path, remote_ino, header); + } + } else { + CDentry *pdn = in->get_parent_dn(); + if (pdn) { + CInode *diri = pdn->get_dir()->get_inode(); + _enqueue(diri, header, true, nullptr, + std::move(in->scrub_move_remote_links())); + } else { + header->inc_scrubbed_remote_link_count( + in->scrub_info()->remote_links.size()); + } + } + } + in->scrub_reset_remote_links(); + + if (in->scrub_info()->forward_scrub && in->is_dir()) { + in->scrub_finished(); + _enqueue(in, header, true); + } else { + in->scrub_finished(); + } + + header->inc_scrubbed_inode_count(); } void ScrubStack::complete_control_contexts(int r) { @@ -637,7 +807,8 @@ void ScrubStack::scrub_status(Formatter *f) { if (scrubbing_map.empty()) *css << "no active scrubs running"; else - *css << state << " (waiting for more scrubs)"; + *css << state << " (waiting for more scrubs, " << stack_size + << "inodes in the stack)"; } else if (state == STATE_RUNNING) { if (clear_stack) { *css << "ABORTING"; @@ -676,6 +847,10 @@ void ScrubStack::scrub_status(Formatter *f) { f->dump_stream("path") << "#" << header->get_origin(); f->dump_string("tag", header->get_tag()); + f->dump_unsigned("scrubbed_inode_count", + header->get_scrubbed_inode_count()); + 
f->dump_unsigned("scrubbed_remote_link_count", + header->get_scrubbed_remote_link_count()); CachedStackStringStream optcss; if (header->get_recursive()) { diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index 4723d3b90fff0..85412b07b39c4 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -154,8 +154,13 @@ class ScrubStack { friend std::ostream &operator<<(std::ostream &os, const State &state); friend class C_InodeValidated; + friend class C_RemoteInodeOpened; + friend class C_RemoteLinkCheckFinished; - int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top); + int _enqueue( + MDSCacheObject *obj, ScrubHeaderRef &header, bool top, + bool *added = nullptr, + std::vector> &&remote_links = {}); /** * Remove the inode/dirfrag from the stack. */ @@ -188,6 +193,12 @@ class ScrubStack { */ void scrub_file_inode(CInode *in); + /** + * Scrub a file inode. + * @param dn The remote dentry to identify + */ + CInode *remote_link_checkup(CDentry *dn, ScrubHeaderRef &header); + /** * Callback from completion of CInode::validate_disk_state * @param in The inode we were validating @@ -211,6 +222,7 @@ class ScrubStack { * scrub of the dirfrag. * * @param dir The dirfrag to scrub (must be auth) + * @param added_children set to true if we pushed some of our children * @param done set to true if we started to do final scrub */ void scrub_dirfrag(CDir *dir, bool *added_children, bool *done); @@ -267,6 +279,8 @@ class ScrubStack { void handle_scrub(const cref_t &m); void handle_scrub_stats(const cref_t &m); + void add_remote_link_damage(const std::string &path, inodeno_t ino, + ScrubHeaderRef &header); State state = STATE_IDLE; bool clear_stack = false;