diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 6234b96cdc7b9..b43672218a685 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1555,3 +1555,27 @@ options: - mds flags: - runtime +- name: mds_damage_log_to_file + type: bool + level: advanced + desc: send mds damage lines to a file + fmt_desc: Determines if damages should appear in a file. + default: false + see_also: + - log_file + with_legacy: true +- name: mds_damage_log_file + type: str + level: advanced + desc: path to log file where damage will be written + fmt_desc: The location of the logging file for where damage of mds will be written. + daemon_default: /var/log/ceph/$cluster-$name-damages.log + with_legacy: true +- name: mds_scrub_hard_link + type: bool + level: advanced + desc: force scrubbing hard link + default: false + services: + - mds + with_legacy: true \ No newline at end of file diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a8aaf11c0512c..00d03af345829 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1585,8 +1585,8 @@ void CDir::fetch(std::string_view dname, snapid_t last, mdcache->mds->balancer->hit_dir(this, META_POP_FETCH); } -void CDir::fetch_keys(const std::vector& keys, MDSContext *c) -{ +void CDir::fetch_keys(const std::vector &keys, MDSContext *c, + bool from_scrub) { dout(10) << __func__ << " " << keys.size() << " keys on " << *this << dendl; ceph_assert(is_auth()); ceph_assert(!is_complete()); @@ -1643,7 +1643,7 @@ void CDir::fetch_keys(const std::vector& keys, MDSContext *c) } auth_pin(this); - _omap_fetch(&str_keys, c); + _omap_fetch(&str_keys, c, from_scrub); if (mdcache->mds->logger) mdcache->mds->logger->inc(l_mds_dir_fetch_keys); @@ -1698,6 +1698,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { map omap; bufferlist btbl; int ret1, ret2, ret3; + bool from_scrub = false; C_IO_Dir_OMAP_Fetched(CDir *d, MDSContext *f) : CDirIOContext(d), fin(f), @@ -1719,7 +1720,7 @@ class C_IO_Dir_OMAP_Fetched 
: public CDirIOContext { return; } - dir->_omap_fetched(hdrbl, omap, complete, keys, r); + dir->_omap_fetched(hdrbl, omap, complete, keys, r, from_scrub); if (fin) fin->complete(r); } @@ -1728,8 +1729,7 @@ class C_IO_Dir_OMAP_Fetched : public CDirIOContext { } }; -void CDir::_omap_fetch(std::set *keys, MDSContext *c) -{ +void CDir::_omap_fetch(std::set *keys, MDSContext *c, bool from_scrub) { C_IO_Dir_OMAP_Fetched *fin = new C_IO_Dir_OMAP_Fetched(this, c); object_t oid = get_ondisk_object(); object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool()); @@ -1737,6 +1737,7 @@ void CDir::_omap_fetch(std::set *keys, MDSContext *c) rd.omap_get_header(&fin->hdrbl, &fin->ret1); if (keys) { fin->complete = false; + fin->from_scrub = from_scrub; fin->keys.swap(*keys); rd.omap_get_vals_by_keys(fin->keys, &fin->omap, &fin->ret2); } else { @@ -1989,9 +1990,9 @@ CDentry *CDir::_load_dentry( return dn; } -void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, - bool complete, const std::set& keys, int r) -{ +void CDir::_omap_fetched(bufferlist &hdrbl, map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub) { LogChannelRef clog = mdcache->mds->clog; dout(10) << "_fetched header " << hdrbl.length() << " bytes " << omap.size() << " keys for " << *this << dendl; @@ -2006,7 +2007,7 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, clog->error() << "dir " << dirfrag() << " object missing on disk; some " "files may be lost (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } @@ -2020,14 +2021,14 @@ void CDir::_omap_fetched(bufferlist& hdrbl, map& omap, << ": " << err.what() << dendl; clog->warn() << "Corrupt fnode header in " << dirfrag() << ": " << err.what() << " (" << get_path() << ")"; - go_bad(complete); + go_bad(complete | from_scrub); return; } if (!p.end()) { clog->warn() << "header buffer of dir " << dirfrag() << " has " << hdrbl.length() - p.get_off() << " extra bytes (" << get_path() << ")"; - 
go_bad(complete); + go_bad(complete | from_scrub); return; } } diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7cc4dc7ffcf83..2c124350d0f9a 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -483,7 +483,8 @@ class CDir : public MDSCacheObject, public Counter { void fetch(MDSContext *c, bool ignore_authpinnability=false) { fetch("", CEPH_NOSNAP, c, ignore_authpinnability); } - void fetch_keys(const std::vector& keys, MDSContext *c); + void fetch_keys(const std::vector &keys, MDSContext *c, + bool from_scrub = false); #if 0 // unused? void wait_for_commit(Context *c, version_t v=0); @@ -653,7 +654,8 @@ class CDir : public MDSCacheObject, public Counter { friend class C_IO_Dir_Committed; friend class C_IO_Dir_Commit_Ops; - void _omap_fetch(std::set *keys, MDSContext *fin=nullptr); + void _omap_fetch(std::set *keys, MDSContext *fin = nullptr, + bool from_scrub = false); void _omap_fetch_more(version_t omap_version, bufferlist& hdrbl, std::map& omap, MDSContext *fin); CDentry *_load_dentry( @@ -671,8 +673,10 @@ class CDir : public MDSCacheObject, public Counter { */ void go_bad(bool complete); - void _omap_fetched(ceph::buffer::list& hdrbl, std::map& omap, - bool complete, const std::set& keys, int r); + void _omap_fetched(ceph::buffer::list &hdrbl, + std::map &omap, + bool complete, const std::set &keys, int r, + bool from_scrub = false); // -- commit -- void _commit(version_t want, int op_prio); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 71b6081be7de2..6ff81082bb99a 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -5198,7 +5198,6 @@ void CInode::scrub_info_create() const { dout(25) << __func__ << dendl; ceph_assert(!scrub_infop); - // break out of const-land to set up implicit initial state CInode *me = const_cast(this); const auto& pi = me->get_projected_inode(); @@ -5231,23 +5230,46 @@ void CInode::scrub_initialize(ScrubHeaderRef& header) // right now we don't handle remote inodes } +void CInode::set_forward_scrub(bool forward_scrub) { + 
scrub_infop->forward_scrub = forward_scrub; +} + +void CInode::scrub_add_remote_link( + std::vector> &&remote_links) { + + for (auto& [remote_link_path, remote_ino]: remote_links) { + scrub_infop->remote_links.emplace_back(std::move(remote_link_path), + remote_ino); + } +} + +void CInode::scrub_reset_remote_links() { + scrub_infop->remote_links.clear(); +} + +std::vector> && +CInode::scrub_move_remote_links() { + return std::move(scrub_infop->remote_links); +} + void CInode::scrub_aborted() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->scrub_in_progress = false; scrub_infop->header->dec_num_pending(); + scrub_infop->remote_links.clear(); scrub_maybe_delete_info(); } void CInode::scrub_finished() { dout(20) << __func__ << dendl; ceph_assert(scrub_is_in_progress()); - scrub_infop->last_scrub_version = get_version(); scrub_infop->last_scrub_stamp = ceph_clock_now(); scrub_infop->last_scrub_dirty = true; scrub_infop->scrub_in_progress = false; + scrub_infop->remote_links.clear(); + scrub_infop->forward_scrub = true; scrub_infop->header->dec_num_pending(); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 6f965bffa8ea9..71d57a452f8e7 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -305,6 +305,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> remote_links; + bool forward_scrub = true; fragset_t queued_frags; @@ -458,6 +460,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter> &&remote_links); + + void scrub_reset_remote_links(); + + std::vector> &&scrub_move_remote_links(); + + void set_forward_scrub(bool forward_scrub); + fragset_t& scrub_queued_frags() { ceph_assert(scrub_infop); return scrub_infop->queued_frags; diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc index 2079d23333a83..f603116241bc0 100644 --- a/src/mds/DamageTable.cc +++ b/src/mds/DamageTable.cc @@ -29,6 +29,12 @@ namespace { * Record damage to a particular dirfrag, implicitly 
affecting * any dentries within it. */ +inline std::ostream& operator<<(std::ostream& os, const DamageEntry& entry) +{ + entry.print(os); + return os; +} + class DirFragDamage : public DamageEntry { public: @@ -123,6 +129,28 @@ class BacktraceDamage : public DamageEntry f->close_section(); } }; + +class RemoteLinkDamage : public DamageEntry { +public: + inodeno_t ino; + std::string head_path; + RemoteLinkDamage(inodeno_t ino_, const std::string &head_path_ = "") + : ino(ino_), head_path(head_path_) {} + + damage_entry_type_t get_type() const override { + return DAMAGE_ENTRY_REMOTE_LINK; + } + + void dump(Formatter *f) const override { + f->open_object_section("remote_link_damage"); + f->dump_string("damage_type", "remote_link"); + f->dump_int("id", id); + f->dump_int("ino", ino); + f->dump_string("path", path); + f->dump_string("head_path", head_path); + f->close_section(); + } +}; } DamageEntry::~DamageEntry() @@ -132,28 +160,34 @@ bool DamageTable::notify_dentry( inodeno_t ino, frag_t frag, snapid_t snap_id, std::string_view dname, std::string_view path) { - if (oversized()) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } // Special cases: damage to these dirfrags is considered fatal to // the MDS rank that owns them. 
- if ( - (MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) - || - (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank) - ) { + if ((MDS_INO_IS_MDSDIR(ino) && MDS_INO_MDSDIR_OWNER(ino) == rank) || + (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == rank)) { derr << "Damage to dentries in fragment " << frag << " of ino " << ino << "is fatal because it is a system directory for this rank" << dendl; return true; } - auto& df_dentries = dentries[DirFragIdent(ino, frag)]; - if (auto [it, inserted] = df_dentries.try_emplace(DentryIdent(dname, snap_id)); inserted) { - auto entry = std::make_shared(ino, frag, dname, snap_id); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino, frag, dname, snap_id); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto &df_dentries = dentries[DirFragIdent(ino, frag)]; + if (auto [it, inserted] = + df_dentries.try_emplace(DentryIdent(dname, snap_id)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -171,15 +205,24 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, return true; } - if (oversized()) { + bool over_sized = oversized(); + + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); inserted) { - DamageEntryRef entry = std::make_shared(ino, frag); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + DamageEntryRef entry = std::make_shared(ino, frag); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = dirfrags.try_emplace(DirFragIdent(ino, frag)); + inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -187,15 +230,47 @@ bool DamageTable::notify_dirfrag(inodeno_t ino, frag_t frag, bool 
DamageTable::notify_remote_damaged(inodeno_t ino, std::string_view path) { - if (oversized()) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { + return true; + } + + auto entry = std::make_shared(ino); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } + } + + return false; +} + +bool DamageTable::notify_remote_link_damaged(inodeno_t ino, + const std::string &path, + const std::string &head_path) { + bool over_sized = oversized(); + if (!log_to_file && over_sized) { return true; } - if (auto [it, inserted] = remotes.try_emplace(ino); inserted) { - auto entry = std::make_shared(ino); - entry->path = path; - it->second = entry; - by_id[entry->id] = std::move(entry); + auto entry = std::make_shared(ino, head_path); + entry->path = path; + if (log_to_file) { + fout << *entry << std::endl; + } + + if (!over_sized) { + auto& df_remote_links = remote_links[ino]; + if (auto [it, inserted] = df_remote_links.try_emplace(path); inserted) { + it->second = entry; + by_id[entry->id] = std::move(entry); + } } return false; @@ -263,6 +338,10 @@ bool DamageTable::is_remote_damaged( return remotes.count(ino) > 0; } +bool DamageTable::is_remote_link_damaged(const inodeno_t ino) const { + return remote_links.count(ino) > 0; +} + void DamageTable::dump(Formatter *f) const { f->open_array_section("damage_table"); @@ -293,6 +372,19 @@ void DamageTable::erase(damage_entry_id_t damage_id) } else if (type == DAMAGE_ENTRY_BACKTRACE) { auto backtrace_entry = std::static_pointer_cast(entry); remotes.erase(backtrace_entry->ino); + } else if (type == DAMAGE_ENTRY_REMOTE_LINK) { + auto remote_link_entry = std::static_pointer_cast(entry); + auto df_remote_link_it = remote_links.find(remote_link_entry->ino); + if (df_remote_link_it != remote_links.end()) { + auto damage_it = 
df_remote_link_it->second.find(entry->path); + if (damage_it != df_remote_link_it->second.end()) { + df_remote_link_it->second.erase(entry->path); + } + if(df_remote_link_it->second.empty()) { + remote_links.erase(df_remote_link_it); + } + } + remote_links.erase(remote_link_entry->ino); } else { derr << "Invalid type " << type << dendl; ceph_abort(); @@ -301,3 +393,52 @@ void DamageTable::erase(damage_entry_id_t damage_id) by_id.erase(by_id_entry); } +bool DamageTable::open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path) { + namespace fs = std::filesystem; + + // Reset the stream in case it was previously used + if (fout.is_open()) { + fout.close(); + } + fout.clear(); // clear any error flags + + const fs::path dir = file_path.parent_path(); + + // Create parent directories if needed + if (!dir.empty()) { + std::error_code ec; + + if (!fs::exists(dir, ec)) { + if (ec) { + dout(0) << "error checking existence of damage dir: " << dir << " (" + << ec.message() << ")" << dendl; + return false; + } + + if (!fs::create_directories(dir, ec)) { + dout(0) << "failed to create directories for damage file: " << dir + << " (" << ec.message() << ")" << dendl; + return false; + } + } + } + + // Open in append mode so we keep previous log contents + fout.open(file_path, std::ios::out | std::ios::app); + + if (!fout.is_open()) { + dout(0) << "failed to open damage file: " << file_path << dendl; + return false; + } + + return true; +} + +void DamageTable::clear() { + dirfrags.clear(); + dentries.clear(); + remotes.clear(); + remote_links.clear(); + by_id.clear(); +} diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h index a1b96fe221864..35dc7d89596a7 100644 --- a/src/mds/DamageTable.h +++ b/src/mds/DamageTable.h @@ -16,8 +16,12 @@ #ifndef DAMAGE_TABLE_H_ #define DAMAGE_TABLE_H_ +#include +#include +#include #include +#include "common/Formatter.h" #include "mdstypes.h" #include "include/random.h" @@ -30,7 +34,8 @@ typedef enum { 
DAMAGE_ENTRY_DIRFRAG, DAMAGE_ENTRY_DENTRY, - DAMAGE_ENTRY_BACKTRACE + DAMAGE_ENTRY_BACKTRACE, + DAMAGE_ENTRY_REMOTE_LINK } damage_entry_type_t; @@ -47,6 +52,11 @@ class DamageEntry virtual damage_entry_type_t get_type() const = 0; virtual void dump(Formatter *f) const = 0; + void print(std::ostream &os) const { + JSONFormatter jf; + dump(&jf); + jf.flush(os); + } damage_entry_id_t id; utime_t reported_at; @@ -121,10 +131,13 @@ class DentryIdent class DamageTable { public: - explicit DamageTable(const mds_rank_t rank_) - : rank(rank_) - { + explicit DamageTable(const mds_rank_t rank_, bool log_to_file_, + const std::string &log_file_) + : rank(rank_), log_to_file(log_to_file_), log_file(log_file_) { ceph_assert(rank_ != MDS_RANK_NONE); + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } } /** @@ -156,6 +169,9 @@ class DamageTable */ bool notify_remote_damaged(inodeno_t ino, std::string_view path); + bool notify_remote_link_damaged(inodeno_t ino, const std::string &path, + const std::string &head_path = ""); + void remove_dentry_damage_entry(CDir *dir); void remove_dirfrag_damage_entry(CDir *dir); @@ -171,10 +187,33 @@ class DamageTable bool is_remote_damaged(const inodeno_t ino) const; + bool is_remote_link_damaged(const inodeno_t ino) const; + void dump(Formatter *f) const; void erase(damage_entry_id_t damage_id); + void set_log_to_file(bool _log_to_file) { + log_to_file = _log_to_file; + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } + } + + void set_log_file(const std::string &_log_file) { + log_file = _log_file; + if (log_to_file) { + log_file_opened = open_damage_log_file(fout, log_file); + } + } + + void clear(); + + private: + bool open_damage_log_file(std::ofstream &fout, + const std::filesystem::path &file_path); + std::ofstream fout; + protected: // I need to know my MDS rank so that I can check if // metadata items are part of my mydir. @@ -194,10 +233,17 @@ class DamageTable // (i.e. 
have probably/possibly missing backtraces) std::map remotes; + // Map of all links which could not be resolved + // (i.e. have probably/possibly missing primary inodes) + std::map> remote_links; + // All damage, by ID. This is a secondary index // to the dirfrag, dentry, remote maps. It exists // to enable external tools to unambiguously operate // on particular entries. std::map by_id; + bool log_to_file = false; + std::string log_file = ""; + bool log_file_opened = false; }; #endif // DAMAGE_TABLE_H_ diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5480e6dcd5efe..6675a28d96908 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8493,6 +8493,12 @@ int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { dout(4) << "traverse: remote dentry points to damaged ino " << *dn << dendl; + std::string path; + dn->get_dir()->get_inode()->make_path_string(path); + path += "/"; + path += dn->get_name(); + mds->damage_table.notify_remote_link_damaged(dnl->get_remote_ino(), + path); return -CEPHFS_EIO; } open_remote_dentry(dn, true, cf.build(), @@ -8817,7 +8823,7 @@ void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext path += dn->get_name(); } - bool fatal = mds->damage_table.notify_remote_damaged(ino, path); + bool fatal = mds->damage_table.notify_remote_link_damaged(ino, path); if (fatal) { mds->damaged(); ceph_abort(); // unreachable, damaged() respawns us diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc index e97fd2cf83f8f..67718aae08947 100644 --- a/src/mds/MDSDaemon.cc +++ b/src/mds/MDSDaemon.cc @@ -402,6 +402,9 @@ void MDSDaemon::set_up_admin_socket() asok_hook, "Remove a damage table entry"); ceph_assert(r == 0); + r = admin_socket->register_command("damage clear", asok_hook, + "clear the damage list"); + ceph_assert(r == 0); r = admin_socket->register_command("osdmap barrier name=target_epoch,type=CephInt", asok_hook, "Wait until the MDS 
has this OSD map epoch"); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index aa6a8c162f4f5..1ea06d0f80400 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -482,43 +482,30 @@ class C_Drop_Cache : public MDSInternalContext { } }; -MDSRank::MDSRank( - mds_rank_t whoami_, - ceph::fair_mutex &mds_lock_, - LogChannelRef &clog_, - CommonSafeTimer &timer_, - Beacon &beacon_, - std::unique_ptr& mdsmap_, - Messenger *msgr, - MonClient *monc_, - MgrClient *mgrc, - Context *respawn_hook_, - Context *suicide_hook_, - boost::asio::io_context& ioc) : - cct(msgr->cct), mds_lock(mds_lock_), clog(clog_), - timer(timer_), mdsmap(mdsmap_), - objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)), - damage_table(whoami_), sessionmap(this), - op_tracker(g_ceph_context, g_conf()->mds_enable_op_tracker, - g_conf()->osd_num_op_tracker_shard), - progress_thread(this), whoami(whoami_), - purge_queue(g_ceph_context, whoami_, - mdsmap_->get_metadata_pool(), objecter, - new LambdaContext([this](int r) { - std::lock_guard l(mds_lock); - handle_write_error(r); - } - ) - ), - metrics_handler(cct, this), - beacon(beacon_), - messenger(msgr), monc(monc_), mgrc(mgrc), - respawn_hook(respawn_hook_), - suicide_hook(suicide_hook_), - inject_journal_corrupt_dentry_first(g_conf().get_val("mds_inject_journal_corrupt_dentry_first")), - starttime(mono_clock::now()), - ioc(ioc) -{ +MDSRank::MDSRank(mds_rank_t whoami_, ceph::fair_mutex &mds_lock_, + LogChannelRef &clog_, + CommonSafeTimer &timer_, Beacon &beacon_, + std::unique_ptr &mdsmap_, Messenger *msgr, + MonClient *monc_, MgrClient *mgrc, Context *respawn_hook_, + Context *suicide_hook_, boost::asio::io_context &ioc) + : cct(msgr->cct), mds_lock(mds_lock_), clog(clog_), timer(timer_), + mdsmap(mdsmap_), objecter(new Objecter(g_ceph_context, msgr, monc_, ioc)), + damage_table(whoami_, g_conf().get_val("mds_damage_log_to_file"), + g_conf().get_val("mds_damage_log_file")), + sessionmap(this), + op_tracker(g_ceph_context, 
g_conf()->mds_enable_op_tracker, + g_conf()->osd_num_op_tracker_shard), + progress_thread(this), whoami(whoami_), + purge_queue(g_ceph_context, whoami_, mdsmap_->get_metadata_pool(), + objecter, new LambdaContext([this](int r) { + std::lock_guard l(mds_lock); + handle_write_error(r); + })), + metrics_handler(cct, this), beacon(beacon_), messenger(msgr), monc(monc_), + mgrc(mgrc), respawn_hook(respawn_hook_), suicide_hook(suicide_hook_), + inject_journal_corrupt_dentry_first( + g_conf().get_val("mds_inject_journal_corrupt_dentry_first")), + starttime(mono_clock::now()), ioc(ioc) { hb = g_ceph_context->get_heartbeat_map()->add_worker("MDSRank", pthread_self()); // The metadata pool won't change in the whole life time @@ -2929,6 +2916,9 @@ void MDSRankDispatcher::handle_asok_command( goto out; } damage_table.erase(id); + } else if (command == "damage clear") { + std::lock_guard l(mds_lock); + damage_table.clear(); } else { r = -CEPHFS_ENOSYS; } @@ -3866,6 +3856,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_inject_rename_corrupt_dentry_first", "mds_inject_journal_corrupt_dentry_first", "mds_session_metadata_threshold", + "mds_damage_log_to_file", + "mds_damage_log_file", NULL }; return KEYS; @@ -3936,6 +3928,14 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s if (changed.count("mds_inject_journal_corrupt_dentry_first")) { inject_journal_corrupt_dentry_first = g_conf().get_val("mds_inject_journal_corrupt_dentry_first"); } + if (changed.count("mds_damage_log_to_file")) { + damage_table.set_log_to_file( + g_conf().get_val("mds_damage_log_to_file")); + } + if (changed.count("mds_damage_log_file")) { + damage_table.set_log_file( + g_conf().get_val("mds_damage_log_file")); + } finisher->queue(new LambdaContext([this, changed](int) { std::scoped_lock lock(mds_lock); diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h index a5d35f61ce428..0a27ab5ee4871 100644 --- a/src/mds/ScrubHeader.h +++ 
b/src/mds/ScrubHeader.h @@ -64,6 +64,18 @@ class ScrubHeader { } unsigned get_num_pending() const { return num_pending; } + void inc_scrubbed_inode_count() { ++scrubbed_inode_count; } + + uint64_t get_scrubbed_inode_count() const { return scrubbed_inode_count; } + + void inc_scrubbed_remote_link_count(uint64_t val = 1) { + scrubbed_remote_link_count += val; + } + + uint64_t get_scrubbed_remote_link_count() const { + return scrubbed_remote_link_count; + } + protected: const std::string tag; bool is_tag_internal; @@ -76,6 +88,8 @@ class ScrubHeader { bool repaired = false; // May be set during scrub if repairs happened unsigned epoch_last_forwarded = 0; unsigned num_pending = 0; + uint64_t scrubbed_inode_count = 0; + uint64_t scrubbed_remote_link_count = 0; }; typedef std::shared_ptr ScrubHeaderRef; diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 742c464f4d37a..d8d2260d879e0 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -59,12 +59,18 @@ void ScrubStack::dequeue(MDSCacheObject *obj) stack_size--; } -int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) -{ +int ScrubStack::_enqueue( + MDSCacheObject *obj, ScrubHeaderRef &header, bool top, + std::vector> &&remote_links) { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); if (CInode *in = dynamic_cast(obj)) { if (in->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *in << "}" << ", already in scrubbing" << dendl; + if (!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + } else { + in->set_forward_scrub(true); + } return -CEPHFS_EBUSY; } if(in->state_test(CInode::STATE_PURGING)) { @@ -75,6 +81,11 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl; in->scrub_initialize(header); + if (!remote_links.empty()) { + in->scrub_add_remote_link(std::move(remote_links)); + in->set_forward_scrub(false); + } + 
} else if (CDir *dir = dynamic_cast(obj)) { if (dir->scrub_is_in_progress()) { dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl; @@ -103,7 +114,7 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top) scrub_stack.push_front(&obj->item_scrub); else scrub_stack.push_back(&obj->item_scrub); - return 0; + return 1; } int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top) @@ -209,49 +220,53 @@ void ScrubStack::kick_off_scrubs() if (scrubs_in_progress == 0) { set_state(STATE_IDLE); } - return; } assert(state == STATE_RUNNING || state == STATE_IDLE); set_state(STATE_RUNNING); - if (CInode *in = dynamic_cast(*it)) { + if (CInode *in = dynamic_cast(*it)) { dout(20) << __func__ << " examining " << *in << dendl; ++it; if (!validate_inode_auth(in)) - continue; + continue; if (!in->is_dir()) { - // it's a regular file, symlink, or hard link - dequeue(in); // we only touch it this once, so remove from stack - - scrub_file_inode(in); + // it's a regular file, symlink, or hard link + dequeue(in); // we only touch it this once, so remove from stack + scrub_file_inode(in); + } else if (in->scrub_info()->forward_scrub) { + bool added_children = false; + bool done = false; // it's done, so pop it off the stack + scrub_dir_inode(in, &added_children, &done); + if (done) { + dout(20) << __func__ << " dir inode, done" << dendl; + in->set_forward_scrub(false); + dequeue(in); + } + if (added_children) { + // dirfrags were queued at top of stack + it = scrub_stack.begin(); + } + } else if (!in->scrub_info()->remote_links.empty()){ + dequeue(in); + scrub_dir_inode_final(in); } else { - bool added_children = false; - bool done = false; // it's done, so pop it off the stack - scrub_dir_inode(in, &added_children, &done); - if (done) { - dout(20) << __func__ << " dir inode, done" << dendl; - dequeue(in); - } - if (added_children) { - // dirfrags were queued at top of stack - it = scrub_stack.begin(); - } + 
dequeue(in); } - } else if (CDir *dir = dynamic_cast(*it)) { - auto next = it; - ++next; + } else if (CDir *dir = dynamic_cast(*it)) { + ++it; + bool added_children = false; bool done = false; // it's done, so pop it off the stack - scrub_dirfrag(dir, &done); + scrub_dirfrag(dir, &added_children, &done); if (done) { - dout(20) << __func__ << " dirfrag, done" << dendl; - ++it; // child inodes were queued at bottom of stack - dequeue(dir); - } else { - it = next; + dout(20) << __func__ << " dirfrag, done" << dendl; + dequeue(dir); + } + if (added_children) { + it = scrub_stack.begin(); } } else { ceph_assert(0 == "dentry in scrub stack"); @@ -341,7 +356,7 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done) dir->add_waiter(CDir::WAIT_UNFREEZE, gather.new_sub()); } else if (dir->get_version() == 0) { dout(20) << __func__ << " barebones " << *dir << dendl; - dir->fetch_keys({}, gather.new_sub()); + dir->fetch_keys({}, gather.new_sub(), true); } else { _enqueue(dir, header, true); queued.insert_raw(dir->get_frag()); @@ -392,9 +407,10 @@ class C_InodeValidated : public MDSInternalContext ScrubStack *stack; CInode::validated_data result; CInode *target; + MDCache* mdcache; C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) - : MDSInternalContext(mds), stack(stack_), target(target_) + : MDSInternalContext(mds), stack(stack_), target(target_), mdcache(mds->mdcache) { stack->scrubs_in_progress++; } @@ -408,16 +424,119 @@ class C_InodeValidated : public MDSInternalContext void ScrubStack::scrub_dir_inode_final(CInode *in) { dout(20) << __func__ << " " << *in << dendl; + ScrubHeaderRef header = in->scrub_info()->header; + if (!in->scrub_info()->forward_scrub && + !in->scrub_info()->remote_links.empty()) { + auto parent = in->get_projected_parent_dn(); + if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) || + (parent && mdcache->mds->damage_table.is_dentry_damaged( + parent->get_dir(), parent->get_name(), parent->last))) { 
+      for (auto &[remote_link_path, remote_ino] :
+           in->scrub_info()->remote_links) {
+        add_remote_link_damage(remote_link_path, remote_ino);
+        header->inc_scrubbed_remote_link_count();
+      }
+      in->scrub_reset_remote_links();
+      in->scrub_finished();
+      return;
+    }
+  }
   C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in);
   in->validate_disk_state(&fin->result, fin);
   return;
 }
 
-void ScrubStack::scrub_dirfrag(CDir *dir, bool *done)
+void ScrubStack::add_remote_link_damage(const std::string &path,
+                                        inodeno_t ino) {
+  CInode* remote_inode = mdcache->get_inode(ino);
+  std::string head_path = "";  // stays empty when the inode is not cached
+  if (remote_inode) {
+    remote_inode->make_path_string(head_path);
+  }
+  bool fatal = mdcache->mds->damage_table.notify_remote_link_damaged(ino, path,
+                                                                     head_path);
+  if (fatal) {
+    mdcache->mds->damaged();
+    ceph_abort();  // unreachable, damaged() respawns us
+  }
+}
+
+class C_RemoteInodeOpenned : public MDSInternalContext {
+public:
+  ScrubStack *stack;
+  CDentry *dn;
+  ScrubHeaderRef header;
+  inodeno_t ino;
+  MDCache* mdcache;
+  C_RemoteInodeOpenned(MDSRank *mds, ScrubStack *stack_,
+                       ScrubHeaderRef &header_, CDentry *dn_, inodeno_t ino_)
+      : MDSInternalContext(mds), stack(stack_), dn(dn_), header(header_),
+        ino(ino_), mdcache(stack_->mdcache) {
+    stack->scrubs_in_progress++;  // undone in finish()
+    header->inc_num_pending();
+    dn->get(MDSCacheObject::PIN_SCRUBQUEUE);  // released at safe_exit
+  }
+  void finish(int r) override {
+    std::string path;
+    CDir *dir = dn->get_dir();
+    CInode *remote_inode = nullptr;
+
+    stack->scrubs_in_progress--;
+    CDentry::linkage_t *dnl = dn->get_projected_linkage();
+    if (r < 0 || !(dnl->is_remote() && dnl->get_remote_ino() == ino)) {
+      goto safe_exit;  // open failed or the dentry no longer links to ino
+    }
+    remote_inode = mds->mdcache->get_inode(dnl->get_remote_ino());
+    // Build the link path once: the damage report and the enqueue both use it.
+    if (dir) {
+      dir->get_inode()->make_path_string(path);
+      path += "/";
+      path += dn->get_name();
+    }
+    if (!remote_inode) {
+      stack->add_remote_link_damage(path, ino);
+      header->inc_scrubbed_remote_link_count();
+      goto safe_exit;
+    }
+    stack->_enqueue(remote_inode, header, true,
+                    {std::make_pair(std::move(path), ino)});
+    stack->kick_off_scrubs();
+  safe_exit:
+    dn->put(MDSCacheObject::PIN_SCRUBQUEUE);
+    header->dec_num_pending();
+  }
+};
+
+CInode *ScrubStack::remote_link_checkup(CDentry *dn, ScrubHeaderRef &header) {
+
+  CDentry::linkage_t *dnl = dn->get_linkage();
+  CInode *remote_inode = mdcache->get_inode(dnl->get_remote_ino());
+  if (!remote_inode) {
+    if (mdcache->mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
+      dout(4) << "scrub: remote dentry points to damaged ino " << *dn << dendl;
+      std::string path;
+      dn->get_dir()->get_inode()->make_path_string(path);
+      path += "/";
+      path += dn->get_name();
+      // Go through the helper so a fatal damage-table result is acted on.
+      add_remote_link_damage(path, dnl->get_remote_ino());
+      return nullptr;
+    }
+    MDSContext *ctx =
+        (!header->get_repair() && g_conf()->mds_scrub_hard_link)
+            ? static_cast<MDSContext *>(new C_RemoteInodeOpenned(
+                  mdcache->mds, this, header, dn, dnl->get_remote_ino()))
+            : static_cast<MDSContext *>(new C_MDSInternalNoop());
+
+    mdcache->open_remote_dentry(dn, true, ctx);
+  }
+  return remote_inode;
+}
+
+void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children, bool *done)
 {
   ceph_assert(dir != NULL);
-
   dout(10) << __func__ << " " << *dir << dendl;
 
   if (!dir->is_complete()) {
@@ -455,9 +574,24 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done)
         continue;
       }
       if (dnl->is_primary()) {
-        _enqueue(dnl->get_inode(), header, false);
+        if (_enqueue(dnl->get_inode(), header, true) == 1) {
+          *added_children = true;
+        }
       } else if (dnl->is_remote()) {
-        // TODO: check remote linkage
+        auto remote_ino = dnl->get_remote_ino();
+        CInode *remote_inode = remote_link_checkup(dn, header);
+        if (remote_inode && !header->get_repair() &&
+            g_conf()->mds_scrub_hard_link) {
+          std::string remote_path;
+          dir->get_inode()->make_path_string(remote_path);
+          remote_path += "/";
+          remote_path += dn->get_name();
+          if (_enqueue(remote_inode, header, true,
+                       {std::make_pair(std::move(remote_path), remote_ino)}) ==
+              1) {
+            *added_children = true;
+          }
+        }
       }
     }
   }
@@ -479,6 +613,24 @@ void ScrubStack::scrub_dirfrag(CDir *dir, bool *done)
 
 void ScrubStack::scrub_file_inode(CInode *in)
 {
+  ScrubHeaderRef header = in->scrub_info()->header;
+  if (!in->scrub_info()->forward_scrub &&
+      !in->scrub_info()->remote_links.empty()) {
+    auto parent = in->get_projected_parent_dn();
+    if (mdcache->mds->damage_table.is_remote_damaged(in->ino()) ||
+        (parent && mdcache->mds->damage_table.is_dentry_damaged(
+                       parent->get_dir(), parent->get_name(), parent->last))) {
+      for (auto &[remote_link_path, remote_ino] :
+           in->scrub_info()->remote_links) {
+        add_remote_link_damage(remote_link_path, remote_ino);
+        header->inc_scrubbed_remote_link_count();
+      }
+      in->scrub_reset_remote_links();
+      in->scrub_finished();
+      return;
+    }
+  }
+
   C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in);
   // At this stage the DN is already past scrub_initialize, so
   // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
@@ -489,7 +641,7 @@ void ScrubStack::_validate_inode_done(CInode *in, int r,
                                       const CInode::validated_data &result)
 {
   LogChannelRef clog = mdcache->mds->clog;
-  const ScrubHeaderRefConst header = in->scrub_info()->header;
+  ScrubHeaderRef header = in->scrub_info()->header;
 
   std::string path;
   if (!result.passed_validation) {
@@ -537,7 +689,32 @@ void ScrubStack::_validate_inode_done(CInode *in, int r,
     dout(10) << __func__ << " scrub passed on inode " << *in << dendl;
   }
 
+  if (!in->scrub_info()->remote_links.empty()) {
+    if (!result.passed_validation) {
+      for (auto &[remote_link_path, remote_ino] :
+           in->scrub_info()->remote_links) {
+        add_remote_link_damage(remote_link_path, remote_ino);
+        header->inc_scrubbed_remote_link_count();
+      }
+    } else {
+      CDentry *pdn = in->get_parent_dn();
+      if (pdn) {
+        CInode *diri = pdn->get_dir()->get_inode();
+        _enqueue(diri, header, true, std::move(in->scrub_move_remote_links()));
+      } else {
+        header->inc_scrubbed_remote_link_count(
+            in->scrub_info()->remote_links.size());
+      }
+    }
+  }
+  in->scrub_reset_remote_links();
+
+  if (in->scrub_info()->forward_scrub) {
+    _enqueue(in, header, true);
+  }
+
   in->scrub_finished();
+  header->inc_scrubbed_inode_count();
 }
 
 void ScrubStack::complete_control_contexts(int r) {
@@ -637,7 +814,8 @@ void ScrubStack::scrub_status(Formatter *f) {
       if (scrubbing_map.empty())
         *css << "no active scrubs running";
       else
-        *css << state << " (waiting for more scrubs)";
+        *css << state << " (waiting for more scrubs, " << stack_size
+             << " inodes in the stack)";
     } else if (state == STATE_RUNNING) {
       if (clear_stack) {
         *css << "ABORTING";
@@ -676,6 +854,10 @@ void ScrubStack::scrub_status(Formatter *f) {
 
     f->dump_stream("path") << "#" << header->get_origin();
     f->dump_string("tag", header->get_tag());
+    f->dump_unsigned("scrubbed_inode_count",
+                     header->get_scrubbed_inode_count());
+    f->dump_unsigned("scrubbed_remote_link_count",
+                     header->get_scrubbed_remote_link_count());
 
     CachedStackStringStream optcss;
     if (header->get_recursive()) {
diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h
index 756ebd9cb0e95..789fe38a70695 100644
--- a/src/mds/ScrubStack.h
+++ b/src/mds/ScrubStack.h
@@ -154,8 +154,12 @@ class ScrubStack {
   friend std::ostream &operator<<(std::ostream &os, const State &state);
 
   friend class C_InodeValidated;
+  friend class C_RemoteInodeOpenned;
+  friend class C_RemoteLinkCheckFinished;
 
-  int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top);
+  int _enqueue(
+      MDSCacheObject *obj, ScrubHeaderRef &header, bool top,
+      std::vector<std::pair<std::string, inodeno_t>> &&remote_links = {});
   /**
    * Remove the inode/dirfrag from the stack.
    */
@@ -188,6 +192,12 @@ class ScrubStack {
    */
   void scrub_file_inode(CInode *in);
 
+  /**
+   * Resolve a remote dentry's inode; returns nullptr if it is not cached.
+   * @param dn The remote dentry to identify
+   */
+  CInode *remote_link_checkup(CDentry *dn, ScrubHeaderRef &header);
+
   /**
    * Callback from completion of CInode::validate_disk_state
    * @param in The inode we were validating
@@ -211,9 +221,10 @@ class ScrubStack {
    * scrub of the dirfrag.
    *
    * @param dir The dirfrag to scrub (must be auth)
+   * @param added_children set to true if we pushed some of our children
    * @param done set to true if we started to do final scrub
    */
-  void scrub_dirfrag(CDir *dir, bool *done);
+  void scrub_dirfrag(CDir *dir, bool *added_children, bool *done);
   /**
    * Scrub a directory-representing dentry.
    *
@@ -267,6 +278,7 @@ class ScrubStack {
   void handle_scrub(const cref_t &m);
   void handle_scrub_stats(const cref_t &m);
 
+  void add_remote_link_damage(const std::string &path, inodeno_t ino);
   State state = STATE_IDLE;
   bool clear_stack = false;
 