Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions doc/cephfs/mantle.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,22 +76,23 @@ Mantle with `vstart.sh`

::

bin/ceph fs set cephfs allow_multimds true --yes-i-really-mean-it
bin/ceph fs set cephfs max_mds 5
bin/ceph fs set cephfs_a allow_multimds true --yes-i-really-mean-it
bin/ceph fs set cephfs_a max_mds 5
bin/ceph fs set cephfs_a balancer greedyspill.lua


4. Mount CephFS in another window:

::

bin/ceph-fuse /cephfs -o allow_other &
tail -f out/mds.a.log
mkdir /cephfs
bin/ceph-fuse /cephfs -o allow_other &
tail -f out/mds.a.log


Note that if you look at the last MDS (which could be a, b, or c -- it's
random), you will see an an attempt to index a nil value. This is because the
last MDS tries to check the load of its neighbor, which does not exist.
Note that if you look at the last MDS (which could be a, b, or c -- it's
random), you will see an an attempt to index a nil value. This is because the
last MDS tries to check the load of its neighbor, which does not exist.

5. Run a simple benchmark. In our case, we use the Docker mdtest image to
create load:
Expand Down Expand Up @@ -161,10 +162,10 @@ Implementation Details
Most of the implementation is in MDBalancer. Metrics are passed to the balancer
policies via the Lua stack and a list of loads is returned back to MDBalancer.
It sits alongside the current balancer implementation and it's enabled with a
Ceph CLI command ("ceph fs set cephfs balancer mybalancer.lua"). If the Lua policy
fails (for whatever reason), we fall back to the original metadata load
balancer. The balancer is stored in the RADOS metadata pool and a string in the
MDSMap tells the MDSs which balancer to use.
Ceph CLI command ("ceph fs set cephfs balancer mybalancer.lua"). If the Lua
policy fails (for whatever reason), we do not migrate any load. The balancer
is stored in the RADOS metadata pool and a string in the MDSMap tells the MDSs
which balancer to use.

Exposing Metrics to Lua
~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
139 changes: 77 additions & 62 deletions qa/tasks/cephfs/test_mantle.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

log = logging.getLogger(__name__)
failure = "using old balancer; mantle failed for balancer="
failure = "no load migrated; mantle failed for balancer="
success = "mantle balancer version changed: "

class TestMantle(CephFSTestCase):
Expand All @@ -23,88 +23,103 @@ def start_mantle(self):
def push_balancer(self, obj, lua_code, expect):
self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
self.fs.rados(["put", obj, "-"], stdin_data=lua_code)
with self.assert_cluster_log(failure + obj + " " + expect):
with self.assert_cluster_log(failure + obj + " " + expect, timeout=60):
log.info("run a " + obj + " balancer that expects=" + expect)

def test_version_empty(self):
def test_invalid_neighbor(self):
self.start_mantle()
expect = " : (2) No such file or directory"
expect = ": (22) Invalid argument"

ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
assert(ret == 22) # EINVAL
lua_code = "dict = {}\ntest = dict[0]"
self.push_balancer("invalid_neighbor.lua", lua_code, expect)

self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
with self.assert_cluster_log(failure + " " + expect): pass
lua_code = "dict = {}\nif dict[0] == nil then return {3, 4} end\ntest = dict[0]"
self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid_neighbor.lua")
self.fs.rados(["put", "valid_neighbor.lua", "-"], stdin_data=lua_code)
with self.assert_cluster_log(success + "valid_neighbor.lua"):
log.info("run a valid_neighbor.lua balancer")

def test_version_not_in_rados(self):
self.start_mantle()
expect = failure + "ghost.lua : (2) No such file or directory"
self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
with self.assert_cluster_log(expect): pass

def test_balancer_invalid(self):
self.start_mantle()
expect = ": (22) Invalid argument"

#def test_version_empty(self):
# self.start_mantle()
# expect = " : (2) No such file or directory"

lua_code = "this is invalid lua code!"
self.push_balancer("invalid.lua", lua_code, expect)
# ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
# assert(ret == 22) # EINVAL

lua_code = "BAL_LOG()"
self.push_balancer("invalid_log.lua", lua_code, expect)
# self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
# with self.assert_cluster_log(failure + " " + expect): pass

lua_code = "BAL_LOG(0)"
self.push_balancer("invalid_log_again.lua", lua_code, expect)
#def test_version_not_in_rados(self):
# self.start_mantle()
# expect = failure + "ghost.lua : (2) No such file or directory"
# self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
# with self.assert_cluster_log(expect): pass

def test_balancer_valid(self):
self.start_mantle()
lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
with self.assert_cluster_log(success + "valid.lua"):
log.info("run a valid.lua balancer")
#def test_balancer_invalid(self):
# self.start_mantle()
# expect = ": (22) Invalid argument"

def test_return_invalid(self):
self.start_mantle()
expect = ": (22) Invalid argument"
# lua_code = "this is invalid lua code!"
# self.push_balancer("invalid.lua", lua_code, expect)

lua_code = "return \"hello\""
self.push_balancer("string.lua", lua_code, expect)
# lua_code = "BAL_LOG()"
# self.push_balancer("invalid_log.lua", lua_code, expect)

lua_code = "return 3"
self.push_balancer("number.lua", lua_code, expect)
# lua_code = "BAL_LOG(0)"
# self.push_balancer("invalid_log_again.lua", lua_code, expect)

lua_code = "return {}"
self.push_balancer("dict_empty.lua", lua_code, expect)
#def test_balancer_valid(self):
# self.start_mantle()
# lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
# self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
# self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
# with self.assert_cluster_log(success + "valid.lua"):
# log.info("run a valid.lua balancer")

lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
self.push_balancer("dict_of_strings.lua", lua_code, expect)
#def test_return_invalid(self):
# self.start_mantle()
# expect = ": (22) Invalid argument"

lua_code = "return {3, \"test\"}"
self.push_balancer("dict_of_mixed.lua", lua_code, expect)
# lua_code = "return \"hello\""
# self.push_balancer("string.lua", lua_code, expect)

lua_code = "return {3}"
self.push_balancer("not_enough_numbers.lua", lua_code, expect)
# lua_code = "return 3"
# self.push_balancer("number.lua", lua_code, expect)

lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
self.push_balancer("too_many_numbers.lua", lua_code, expect)
# lua_code = "return {}"
# self.push_balancer("dict_empty.lua", lua_code, expect)

def test_dead_osd(self):
self.start_mantle()
expect = " : (110) Connection timed out"
# lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
# self.push_balancer("dict_of_strings.lua", lua_code, expect)

# lua_code = "return {3, \"test\"}"
# self.push_balancer("dict_of_mixed.lua", lua_code, expect)

# lua_code = "return {3}"
# self.push_balancer("not_enough_numbers.lua", lua_code, expect)

# lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
# self.push_balancer("too_many_numbers.lua", lua_code, expect)

#def test_dead_osd(self):
# self.start_mantle()
# expect = " : (110) Connection timed out"

# kill the OSDs so that the balancer pull from RADOS times out
osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
for i in range(0, len(osd_map['osds'])):
self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))
# # kill the OSDs so that the balancer pull from RADOS times out
# osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
# for i in range(0, len(osd_map['osds'])):
# self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i))
# self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i))

# trigger a pull from RADOS
self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
# # trigger a pull from RADOS
# self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")

# make the timeout a little longer since dead OSDs spam ceph -w
with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
log.info("run a balancer that should timeout")
# # make the timeout a little longer since dead OSDs spam ceph -w
# with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
# log.info("run a balancer that should timeout")

# cleanup
for i in range(0, len(osd_map['osds'])):
self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
# # cleanup
# for i in range(0, len(osd_map['osds'])):
# self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i))
37 changes: 23 additions & 14 deletions src/mds/MDBalancer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -318,13 +318,10 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m)

/* avoid spamming ceph -w if user does not turn mantle on */
if (mds->mdsmap->get_balancer() != "") {
int r = mantle_prep_rebalance();
if (!r) return;
mds->clog->warn() << "using old balancer; mantle failed for "
<< "balancer=" << mds->mdsmap->get_balancer()
<< " : " << cpp_strerror(r);
mantle_prep_rebalance();
} else {
prep_rebalance(m->get_beat());
}
prep_rebalance(m->get_beat());
}
}

Expand Down Expand Up @@ -670,13 +667,17 @@ void MDBalancer::prep_rebalance(int beat)



int MDBalancer::mantle_prep_rebalance()
void MDBalancer::mantle_prep_rebalance()
{
/* refresh balancer if it has changed */
if (bal_version != mds->mdsmap->get_balancer()) {
bal_version.assign("");
int r = localize_balancer();
if (r) return r;
if (r) {
mds->clog->warn() << "no load migrated; mds=" << mds->get_nodeid()
<< " cannot localize balancer : " << cpp_strerror(r);
return;
};

/* only spam the cluster log from 1 mds on version changes */
if (mds->get_nodeid() == 0)
Expand Down Expand Up @@ -710,16 +711,24 @@ int MDBalancer::mantle_prep_rebalance()
/* execute the balancer */
Mantle mantle;
int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, my_targets);
dout(2) << " mantle decided that new targets=" << my_targets << dendl;
if (ret) {
mds->clog->warn() << "no load migrated; mantle failed for "
<< "balancer=" << mds->mdsmap->get_balancer()
<< " : " << cpp_strerror(ret);
return;
}

/* mantle doesn't know about cluster size, so check target len here */
if ((int) my_targets.size() != cluster_size)
return -EINVAL;
else if (ret)
return ret;
int ntargets = (int) my_targets.size();
if (ntargets != cluster_size) {
mds->clog->warn() << "no load migrated; mantle failed for "
<< "balancer=" << mds->mdsmap->get_balancer()
<< " : " << cpp_strerror(-EINVAL);
return;
}
dout(2) << " mantle decided that new targets=" << my_targets << dendl;

try_rebalance();
return 0;
}


Expand Down
2 changes: 1 addition & 1 deletion src/mds/MDBalancer.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class MDBalancer {
//set up the rebalancing targets for export and do one if the
//MDSMap is up to date
void prep_rebalance(int beat);
int mantle_prep_rebalance();
void mantle_prep_rebalance();
/*check if the monitor has recorded the current export targets;
if it has then do the actual export. Otherwise send off our
export targets message again*/
Expand Down
4 changes: 4 additions & 0 deletions src/mds/balancers/greedyspill.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ end

-- Shed load when you have load and your neighbor doesn't
function when()
if mds[whoami+1] == nil then
BAL_LOG(2, "when: neighbor does not exist; I'm probably the last mds")
return false
end
my_load = mds[whoami]["load"]
his_load = mds[whoami+1]["load"]
if my_load > 0.01 and his_load < 0.01 then
Expand Down
Loading