From 1aa0eb90e5e1282a3214a120b624d8ea657d8fb8 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 6 Oct 2021 17:51:49 -0700 Subject: [PATCH 01/26] anyof --- wikipedia/config.yml | 1 + wikipedia/preprocessing/wiki_api_data.py | 2 +- wikipedia/simulate.py | 81 +++++++++++++++--------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/wikipedia/config.yml b/wikipedia/config.yml index 74128ff..9d082b7 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -6,6 +6,7 @@ parsed_doc_dir = %(data_dir)s/doc_pkl/ parsed_tmp_dir = %(data_dir)s/parsed_tmp/ diff_dir = %(data_dir)s/diffs/ embedding_dir = %(data_dir)s/embeddings/ +exp_dir = %(data_dir)s/simulation_output/ [files] data_dir = /data/wooders/wikipedia diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index 67235c5..8a3efe1 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -15,7 +15,6 @@ from multiprocessing import Pool # from concurrent.futures import ProcessPoolExecutor -from bs4 import BeautifulSoup # from generate diffs file (originally from DPR repo... 
sorry kevin) from generate_diffs import generate_sentence_level_diffs @@ -23,6 +22,7 @@ def query_recentchanges(start_time, end_time, revision_file): + from bs4 import BeautifulSoup pass diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 7b11339..2a0500c 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -45,15 +45,16 @@ class WeightedLoadBalancer(CrossKeyLoadBalancer): # self.key_weights = key_weights def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: - chosen_key = None max_len = 0 + total_len = 0 for key in per_key_queues.keys(): - if per_key_queues[key].size() > max_len: + size = per_key_queues[key].size() + if size > max_len: chosen_key = key - max_len = per_key_queues[key].size() - # print("choose", chosen_key, max_len) - return chosen_key + max_len = size + total_len += size + return chosen_key, total_len class WikiMapper(RalfMapper): @@ -68,6 +69,7 @@ def __init__( super().__init__(env, source_queues, key_selection_policy_cls, model_run_time_s) self.keys = keys + self.source_queues = source_queues # self.env = env # self.source_queues = source_queues @@ -75,34 +77,50 @@ def __init__( # self.model_runtime_s = model_run_time_s # self.env.process(self.run()) - # self.ready_time_to_batch: Dict[float, List[Tuple[int, float]]] = {} + self.ready_time_to_batch: Dict[float, List[Tuple[int, float]]] = {} + + def run(self, replica_id: int): - def run(self): + #this_shard_source_queues = { + # key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] + #} + while True: if self.env.now > 387: break - # windows = yield self.source_queue.get() - chosen_key = self.key_selection_policy.choose(self.source_queues) - - if chosen_key is not None: - - # for chosen_key in self.keys: - windows = yield self.source_queues[chosen_key].get() - print( - f"at time {self.env.now:.2f}, RalfMapper should work on {windows} (last timestamp)" - ) - edits = [(val, chosen_key) for val in windows.window[0].value] - 
print("edits", edits) - - if self.env.now in self.ready_time_to_batch: - self.ready_time_to_batch[self.env.now] += edits - else: - self.ready_time_to_batch[self.env.now] = edits - - yield self.env.timeout(self.model_runtime_s) - else: # nothing to do - yield self.env.timeout(0.01) + _, total_size_orig = self.key_selection_policy.choose(self.source_queues) + trigger = yield simpy.AnyOf(self.env, [q.get() for q in self.source_queues.values()]) + print("keys", list(trigger.keys())) + print("values", list(trigger.values())) + + + # Add back items to queue (since trigger gets them) + for event in trigger.keys(): + print("add back", event.value.key, event.value) + yield self.source_queues[event.value.key].put(event.value) + + # choose key + chosen_key, total_size = self.key_selection_policy.choose(self.source_queues) + assert chosen_key is not None + + # make sure queue size OK - jk doesn't work with dropping + #assert total_size_orig == 0 or total_size == total_size_orig, f"Bad queue size {total_size_orig} -> {total_size}" + + # get chosen key + windows = yield self.source_queues[chosen_key].get() + print( + f"at time {self.env.now:.2f}, RalfMapper should work on {windows} (last timestamp), queue size {total_size}, wait time {self.model_runtime_s}" + ) + edits = [(val, windows.key) for val in windows.window[0].value] + print("edits", edits) + + if self.env.now in self.ready_time_to_batch: + self.ready_time_to_batch[self.env.now] += edits + else: + self.ready_time_to_batch[self.env.now] = edits + + yield self.env.timeout(self.model_runtime_s) policies = { @@ -188,9 +206,9 @@ def run_once( # policies prioritization_policies = ["fifo", "lifo"] - load_shedding_policies = ["always_process"] - model_runtimes = [0.000001, 0.00001, 0.0000001, 0.000000001, 0] - records_per_second = [100] + load_shedding_policies = ["always_process", "sample_half"] + model_runtimes = [1, 10, 0.01, 0, 0.1, 0.001] + records_per_second = [10] output_files = [] @@ -215,3 +233,4 @@ def run_once( 
print("DONE", out_path) for f in output_files: print(f) + open("plans.txt", "w").write('\n'.join(output_files)) From ce03916eda685a382952adf35494a907fe4847c8 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 7 Oct 2021 01:08:57 +0000 Subject: [PATCH 02/26] fix --- wikipedia/config.yml | 12 +++++----- wikipedia/simulate.py | 51 +++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/wikipedia/config.yml b/wikipedia/config.yml index 9d082b7..b13c221 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -1,5 +1,5 @@ [directory] -data_dir = /data/wooders/wikipedia +data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia revisions_dir = %(data_dir)s/recentchanges raw_doc_dir = %(data_dir)s/doc_xml/ parsed_doc_dir = %(data_dir)s/doc_pkl/ @@ -9,19 +9,19 @@ embedding_dir = %(data_dir)s/embeddings/ exp_dir = %(data_dir)s/simulation_output/ [files] -data_dir = /data/wooders/wikipedia -raw_questions_file = %(data_dir)s/10052021_questions_revid.csv +data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +raw_questions_file = %(data_dir)s/10052021_questions_revid.csv model_file = %(data_dir)s/bert-base-encoder.cp changes_file = %(data_dir)s/changes.csv titles_file = %(data_dir)s/top_titles.csv revisions_file = %(data_dir)s/title_revisions_timestamps.json -edits_file = %(data_dir)s/edits.csv +edits_file = %(data_dir)s/edits.csv questions_file = %(data_dir)s/questions.csv pageview_file = %(data_dir)s/top_title_views.csv [simulation] -data_dir = /data/wooders/wikipedia -plan_dir = /data/wooders/wiki-plans +data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +plan_dir = /home/ubuntu/experiments/wikipedia/result/wiki-plans init_data_file = %(data_dir)s/init_data.json stream_edits_file = %(data_dir)s/edit_stream.json stream_questions_file = %(data_dir)s/question_stream.json diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 2a0500c..05c75cd 100644 --- a/wikipedia/simulate.py +++ 
b/wikipedia/simulate.py @@ -81,39 +81,28 @@ def __init__( def run(self, replica_id: int): - #this_shard_source_queues = { - # key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] - #} - + self.source_queues = { + key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] + } + while True: - if self.env.now > 387: - break - - _, total_size_orig = self.key_selection_policy.choose(self.source_queues) - trigger = yield simpy.AnyOf(self.env, [q.get() for q in self.source_queues.values()]) - print("keys", list(trigger.keys())) - print("values", list(trigger.values())) - - - # Add back items to queue (since trigger gets them) - for event in trigger.keys(): - print("add back", event.value.key, event.value) - yield self.source_queues[event.value.key].put(event.value) - + yield simpy.AnyOf(self.env, [q.wait() for q in self.source_queues.values()]) + # choose key - chosen_key, total_size = self.key_selection_policy.choose(self.source_queues) + chosen_key, total_size = self.key_selection_policy.choose( + self.source_queues + ) assert chosen_key is not None # make sure queue size OK - jk doesn't work with dropping - #assert total_size_orig == 0 or total_size == total_size_orig, f"Bad queue size {total_size_orig} -> {total_size}" - - # get chosen key + # assert total_size_orig == 0 or total_size == total_size_orig, f"Bad queue size {total_size_orig} -> {total_size}" + + # get chosen key windows = yield self.source_queues[chosen_key].get() - print( - f"at time {self.env.now:.2f}, RalfMapper should work on {windows} (last timestamp), queue size {total_size}, wait time {self.model_runtime_s}" - ) + # print( + # f"at time {self.env.now:.2f}, RalfMapper should work on {windows} (last timestamp), queue size {total_size}, wait time {self.model_runtime_s}" + # ) edits = [(val, windows.key) for val in windows.window[0].value] - print("edits", edits) if self.env.now in self.ready_time_to_batch: self.ready_time_to_batch[self.env.now] += edits @@ 
-205,10 +194,10 @@ def run_once( keys = list(init_data.keys()) # policies - prioritization_policies = ["fifo", "lifo"] - load_shedding_policies = ["always_process", "sample_half"] - model_runtimes = [1, 10, 0.01, 0, 0.1, 0.001] - records_per_second = [10] + prioritization_policies = ["fifo"] # ["fifo", "lifo"] + load_shedding_policies = ["always_process"] + model_runtimes = [0] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + records_per_second = [100] output_files = [] @@ -233,4 +222,4 @@ def run_once( print("DONE", out_path) for f in output_files: print(f) - open("plans.txt", "w").write('\n'.join(output_files)) + open("plans.txt", "w").write("\n".join(output_files)) From cf0ece14abc38307c16270b75b0dd5212c223030 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 7 Oct 2021 01:09:30 +0000 Subject: [PATCH 03/26] add aws confg --- wikipedia/config_aws.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 wikipedia/config_aws.yml diff --git a/wikipedia/config_aws.yml b/wikipedia/config_aws.yml new file mode 100644 index 0000000..b13c221 --- /dev/null +++ b/wikipedia/config_aws.yml @@ -0,0 +1,29 @@ +[directory] +data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +revisions_dir = %(data_dir)s/recentchanges +raw_doc_dir = %(data_dir)s/doc_xml/ +parsed_doc_dir = %(data_dir)s/doc_pkl/ +parsed_tmp_dir = %(data_dir)s/parsed_tmp/ +diff_dir = %(data_dir)s/diffs/ +embedding_dir = %(data_dir)s/embeddings/ +exp_dir = %(data_dir)s/simulation_output/ + +[files] +data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +raw_questions_file = %(data_dir)s/10052021_questions_revid.csv +model_file = %(data_dir)s/bert-base-encoder.cp +changes_file = %(data_dir)s/changes.csv +titles_file = %(data_dir)s/top_titles.csv +revisions_file = %(data_dir)s/title_revisions_timestamps.json +edits_file = %(data_dir)s/edits.csv +questions_file = %(data_dir)s/questions.csv +pageview_file = %(data_dir)s/top_title_views.csv + +[simulation] 
+data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +plan_dir = /home/ubuntu/experiments/wikipedia/result/wiki-plans +init_data_file = %(data_dir)s/init_data.json +stream_edits_file = %(data_dir)s/edit_stream.json +stream_questions_file = %(data_dir)s/question_stream.json + + From 25046b00f8629b9ea96ed39e9d53ae2ddbe4b55f Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 7 Oct 2021 01:10:10 +0000 Subject: [PATCH 04/26] revert --- wikipedia/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wikipedia/config.yml b/wikipedia/config.yml index b13c221..fb35c24 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -1,5 +1,5 @@ [directory] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +data_dir = /data/wooders/wikipedia revisions_dir = %(data_dir)s/recentchanges raw_doc_dir = %(data_dir)s/doc_xml/ parsed_doc_dir = %(data_dir)s/doc_pkl/ @@ -9,7 +9,7 @@ embedding_dir = %(data_dir)s/embeddings/ exp_dir = %(data_dir)s/simulation_output/ [files] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia +data_dir = /data/wooders/wikipedia raw_questions_file = %(data_dir)s/10052021_questions_revid.csv model_file = %(data_dir)s/bert-base-encoder.cp changes_file = %(data_dir)s/changes.csv @@ -20,8 +20,8 @@ questions_file = %(data_dir)s/questions.csv pageview_file = %(data_dir)s/top_title_views.csv [simulation] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia -plan_dir = /home/ubuntu/experiments/wikipedia/result/wiki-plans +data_dir = /data/wooders/wikipedia +plan_dir = /data/wooders/wiki-plans init_data_file = %(data_dir)s/init_data.json stream_edits_file = %(data_dir)s/edit_stream.json stream_questions_file = %(data_dir)s/question_stream.json From 2b1b2c0d7edb6335afdc057d6df0d9a0136887b4 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 6 Oct 2021 22:15:45 -0700 Subject: [PATCH 05/26] add initial key policies --- wikipedia/config.yml | 3 +- wikipedia/preprocessing/wiki_api_data.py 
| 24 ++++++ wikipedia/simulate.py | 99 ++++++++++++++++++------ 3 files changed, 101 insertions(+), 25 deletions(-) diff --git a/wikipedia/config.yml b/wikipedia/config.yml index fb35c24..26fbd66 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -17,7 +17,8 @@ titles_file = %(data_dir)s/top_titles.csv revisions_file = %(data_dir)s/title_revisions_timestamps.json edits_file = %(data_dir)s/edits.csv questions_file = %(data_dir)s/questions.csv -pageview_file = %(data_dir)s/top_title_views.csv +raw_pageview_file = %(data_dir)s/top_title_views.csv +pageview_file = %(data_dir)s/pageviews.csv [simulation] data_dir = /data/wooders/wikipedia diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index 8a3efe1..fa1ce90 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -116,6 +116,24 @@ def get_questions(raw_questions_file, questions_file): questions_df.to_csv(questions_file) return questions_df +def get_pageviews(raw_pageview_file, pageview_file, edits_file): + + edits_df = pd.read_csv(edits_file) + pageview_df = pd.read_csv(raw_pageview_file) + + # map title -> id + title_to_id = edits_df.set_index("title")["pageid"].to_dict() + open("title_to_id.json", "w").write(json.dumps(title_to_id)) + + # calculate page weights + total_views = pageview_df.iloc[:, 2:].sum(axis=1).sum() + weights = pageview_df.iloc[:, 2:].sum(axis=1) / total_views + pageview_df['weights'] = weights + pageview_df['doc_id'] = pageview_df['title'].apply(lambda x: title_to_id[x]) + pageview_df.to_csv(pageview_file) + + return pageview_df + # create diff JSON file from valid list of revision pairs, doc pkl def create_diff_json(doc_pkl, rev_pairs, diff_dir): @@ -578,6 +596,7 @@ def check_dataset( "--run_parse_docs", action="store_true", default=False ) # re-parse document versions parser.add_argument("--run_get_questions", action="store_true", default=False) + 
parser.add_argument("--run_get_pageviews", action="store_true", default=False) parser.add_argument( "--run_generate_diffs", action="store_true", default=False ) # re-process generating diffs @@ -605,6 +624,7 @@ def check_dataset( edits_file = config["files"]["edits_file"] raw_questions_file = config["files"]["raw_questions_file"] questions_file = config["files"]["questions_file"] + raw_pageview_file = config["files"]["raw_pageview_file"] pageview_file = config["files"]["pageview_file"] # simulation data @@ -647,6 +667,10 @@ def check_dataset( questions_df = get_questions(raw_questions_file, questions_file) print("Generated questions file", questions_file) + # generate pageviews / compute page weights + if args.run_get_pageviews: + get_pageviews(raw_pageview_file, pageview_file, edits_file) + # generate diffs between document versions if args.run_generate_diffs: # if not os.path.isdir(diff_dir): diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 05c75cd..fc5193f 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -3,6 +3,7 @@ from collections import defaultdict from dataclasses import dataclass from functools import cmp_to_key +import random import configparser @@ -40,9 +41,53 @@ class WeightedLoadBalancer(CrossKeyLoadBalancer): - # def __init__(self, keys, key_weights): - # self.keys = keys - # self.key_weights = key_weights + def __init__(self, pageview_file): + pageview_df = pd.read_csv(pageview_file) + self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() + print(self.weights) + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + chosen_key = None + max_len = 0 + total_len = 0 + keys = [] + weights = [] + for key in per_key_queues.keys(): + size = per_key_queues[key].size() + if size >= 1 and int(key) in self.weights: + keys.append(key) + weights.append(self.weights[int(key)]) + total_len += size + + chosen_key = random.choices(keys, weights, k=1)[0] + #print("choose", chosen_key, keys, weights) 
+ return chosen_key, total_len + +class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): + + def __init__(self, pageview_file): + pageview_df = pd.read_csv(pageview_file) + self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() + print(self.weights) + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + chosen_key = None + max_len = 0 + total_len = 0 + for key in per_key_queues.keys(): + size = per_key_queues[key].size() + if int(key) not in self.weights: + continue + weighted_size = self.weights[int(key)] + if weighted_size > max_len: + chosen_key = key + max_len = size + total_len += size + print(chosen_key, max_len, self.weights[int(chosen_key)]) + return chosen_key, total_len + + +class LongestQueueLoadBalancer(CrossKeyLoadBalancer): def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = None @@ -129,6 +174,7 @@ def run_once( total_runtime_s: float, model_runtime_constant: float, data_file: str = None, + weights_file: str = None ): env = simpy.Environment() @@ -164,7 +210,7 @@ def run_once( env, source_queues=windows_to_mapper_queue, model_run_time_s=model_runtime_constant, - key_selection_policy_cls=WeightedLoadBalancer, + key_selection_policy_cls=WeightedLongestQueueLoadBalancer(weights_file), keys=keys, ) env.run(until=total_runtime_s) @@ -187,6 +233,7 @@ def run_once( init_data_file = config["simulation"]["init_data_file"] stream_edits_file = config["simulation"]["stream_edits_file"] stream_questions_file = config["simulation"]["stream_questions_file"] + pageview_file = config["files"]["pageview_file"] # load simulation data edits = json.load(open(stream_edits_file)) @@ -195,31 +242,35 @@ def run_once( # policies prioritization_policies = ["fifo"] # ["fifo", "lifo"] + key_selection_policies = ["pageview"] load_shedding_policies = ["always_process"] - model_runtimes = [0] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + model_runtimes = [0.1] # [0.000001, 0.00001, 0.0000001, 
0.000000001, 0] records_per_second = [100] output_files = [] - for prio_policy in prioritization_policies: - for load_shed_policy in load_shedding_policies: - for runtime in model_runtimes: - for rate in records_per_second: - - out_path = f"{plan_dir}/plan-{prio_policy}-{load_shed_policy}-{runtime}-{rate}.json" - print("running", out_path, runtime) - run_once( - out_path, - prio_policy, - load_shed_policy, - keys, - per_key_records_per_second=rate, - total_runtime_s=len(edits), - model_runtime_constant=runtime, - data_file=stream_edits_file, - ) - output_files.append(out_path) - print("DONE", out_path) + for key_selection in key_selection_policies: + for prio_policy in prioritization_policies: + for load_shed_policy in load_shedding_policies: + for runtime in model_runtimes: + for rate in records_per_second: + + out_path = f"{plan_dir}/plan-{key_selection}_{prio_policy}-{load_shed_policy}-{runtime}-{rate}.json" + print("running", out_path, runtime) + run_once( + out_path, + prio_policy, + load_shed_policy, + keys, + per_key_records_per_second=rate, + total_runtime_s=len(edits), + model_runtime_constant=runtime, + data_file=stream_edits_file, + weights_file=pageview_file, + ) + + output_files.append(out_path) + print("DONE", out_path) for f in output_files: print(f) open("plans.txt", "w").write("\n".join(output_files)) From 63c52239bc2be4c7513f2765f9f6b38071c127fd Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 7 Oct 2021 13:30:57 -0700 Subject: [PATCH 06/26] add key selection policy options --- stl/scratch.ipynb | 32 ++++---- wikipedia/config.yml | 2 +- wikipedia/preprocessing/wiki_api_data.py | 2 +- wikipedia/simulate.py | 96 +++++++++++++++++------- 4 files changed, 87 insertions(+), 45 deletions(-) diff --git a/stl/scratch.ipynb b/stl/scratch.ipynb index 962bf16..2334fd9 100644 --- a/stl/scratch.ipynb +++ b/stl/scratch.ipynb @@ -3,33 +3,33 @@ { "cell_type": "code", "execution_count": null, - "source": [], + "metadata": {}, "outputs": [], - "metadata": {} 
+ "source": [] } ], "metadata": { - "orig_nbformat": 4, + "interpreter": { + "hash": "a10b01f403a1542ddbe951c0fc128eb6a019580013b1191ba1a82a0d150f03e0" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python", - "version": "3.7.10", - "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "pygments_lexer": "ipython3", + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.10 64-bit ('ralf': conda)" - }, - "interpreter": { - "hash": "a10b01f403a1542ddbe951c0fc128eb6a019580013b1191ba1a82a0d150f03e0" + "pygments_lexer": "ipython3", + "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/wikipedia/config.yml b/wikipedia/config.yml index 26fbd66..78d42d8 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -10,7 +10,7 @@ exp_dir = %(data_dir)s/simulation_output/ [files] data_dir = /data/wooders/wikipedia -raw_questions_file = %(data_dir)s/10052021_questions_revid.csv +raw_questions_file = %(data_dir)s/10062021_filtered_questions.csv model_file = %(data_dir)s/bert-base-encoder.cp changes_file = %(data_dir)s/changes.csv titles_file = %(data_dir)s/top_titles.csv diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index fa1ce90..2a1a718 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -665,7 +665,7 @@ def check_dataset( # get questions if args.run_get_questions: questions_df = get_questions(raw_questions_file, questions_file) - print("Generated questions file", questions_file) + print("Generated questions file", raw_questions_file, questions_file) # generate pageviews / compute page weights if args.run_get_pageviews: diff --git 
a/wikipedia/simulate.py b/wikipedia/simulate.py index fc5193f..11e477f 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -44,7 +44,7 @@ class WeightedLoadBalancer(CrossKeyLoadBalancer): def __init__(self, pageview_file): pageview_df = pd.read_csv(pageview_file) self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - print(self.weights) + #print(self.weights) def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = None @@ -63,12 +63,29 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: #print("choose", chosen_key, keys, weights) return chosen_key, total_len +class RandomLoadBalancer(CrossKeyLoadBalancer): + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + chosen_key = None + max_len = 0 + total_len = 0 + keys = [] + for key in per_key_queues.keys(): + size = per_key_queues[key].size() + if size >= 1: + keys.append(key) + total_len += size + + chosen_key = random.choices(keys, k=1)[0] + return chosen_key, total_len + + class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): def __init__(self, pageview_file): pageview_df = pd.read_csv(pageview_file) self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - print(self.weights) + #print(self.weights) def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = None @@ -83,9 +100,32 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = key max_len = size total_len += size - print(chosen_key, max_len, self.weights[int(chosen_key)]) + #print(chosen_key, max_len, self.weights[int(chosen_key)]) return chosen_key, total_len +class WeightedLoadBalancer(CrossKeyLoadBalancer): + + def __init__(self, pageview_file): + pageview_df = pd.read_csv(pageview_file) + self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() + #print(self.weights) + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + chosen_key = 
None + max_len = 0 + total_len = 0 + keys = [] + weights = [] + for key in per_key_queues.keys(): + size = per_key_queues[key].size() + if size >= 1 and int(key) in self.weights: + keys.append(key) + weights.append(self.weights[int(key)]) + total_len += size + + chosen_key = random.choices(keys, weights, k=1)[0] + #print("choose", chosen_key, keys, weights) + return chosen_key, total_len class LongestQueueLoadBalancer(CrossKeyLoadBalancer): @@ -157,11 +197,30 @@ def run(self, replica_id: int): yield self.env.timeout(self.model_runtime_s) +# configuration file +config = configparser.ConfigParser() +config.read("config.yml") +plan_dir = config["simulation"]["plan_dir"] +init_data_file = config["simulation"]["init_data_file"] +stream_edits_file = config["simulation"]["stream_edits_file"] +stream_questions_file = config["simulation"]["stream_questions_file"] +pageview_file = config["files"]["pageview_file"] + +# load simulation data +edits = json.load(open(stream_edits_file)) +init_data = json.load(open(init_data_file)) +keys = list(init_data.keys()) + policies = { "fifo": fifo, "lifo": lifo, "always_process": always_process, "sample_half": make_sampling_policy(0.5), + "weighted_random": WeightedLoadBalancer(pageview_file), + "weighted_longest_queue": WeightedLongestQueueLoadBalancer(pageview_file), + "longest_queue": LongestQueueLoadBalancer(), + "random": RandomLoadBalancer(), + "round_robin": RoundRobinLoadBalancer() } @@ -173,8 +232,7 @@ def run_once( per_key_records_per_second: int, total_runtime_s: float, model_runtime_constant: float, - data_file: str = None, - weights_file: str = None + key_selection_policy: str ): env = simpy.Environment() @@ -195,7 +253,7 @@ def run_once( num_keys=len(keys), next_queue=source_to_window_queue, total_run_time=total_runtime_s, - data_file=data_file, + data_file=stream_edits_file, ) WindowOperator( @@ -210,7 +268,7 @@ def run_once( env, source_queues=windows_to_mapper_queue, model_run_time_s=model_runtime_constant, - 
key_selection_policy_cls=WeightedLongestQueueLoadBalancer(weights_file), + key_selection_policy_cls=policies[key_selection_policy], keys=keys, ) env.run(until=total_runtime_s) @@ -225,26 +283,11 @@ def run_once( # load sheding: random, drop short edits # prioritization: prioritize most recent version # cross-key prioritzation: historical page views, - - # configuration file - config = configparser.ConfigParser() - config.read("config.yml") - plan_dir = config["simulation"]["plan_dir"] - init_data_file = config["simulation"]["init_data_file"] - stream_edits_file = config["simulation"]["stream_edits_file"] - stream_questions_file = config["simulation"]["stream_questions_file"] - pageview_file = config["files"]["pageview_file"] - - # load simulation data - edits = json.load(open(stream_edits_file)) - init_data = json.load(open(init_data_file)) - keys = list(init_data.keys()) - # policies - prioritization_policies = ["fifo"] # ["fifo", "lifo"] - key_selection_policies = ["pageview"] + prioritization_policies = ["lifo", "fifo"] # ["fifo", "lifo"] + key_selection_policies = ["weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin"] load_shedding_policies = ["always_process"] - model_runtimes = [0.1] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + model_runtimes = [0.01, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] records_per_second = [100] output_files = [] @@ -265,8 +308,7 @@ def run_once( per_key_records_per_second=rate, total_runtime_s=len(edits), model_runtime_constant=runtime, - data_file=stream_edits_file, - weights_file=pageview_file, + key_selection_policy=key_selection, ) output_files.append(out_path) From 60c75f51c2ed8fd73ebf0e3ccaba3880fb98ff60 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 7 Oct 2021 13:37:43 -0700 Subject: [PATCH 07/26] only return key --- wikipedia/simulate.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wikipedia/simulate.py 
b/wikipedia/simulate.py index 11e477f..d7361ce 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -61,7 +61,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = random.choices(keys, weights, k=1)[0] #print("choose", chosen_key, keys, weights) - return chosen_key, total_len + return chosen_key class RandomLoadBalancer(CrossKeyLoadBalancer): @@ -77,7 +77,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: total_len += size chosen_key = random.choices(keys, k=1)[0] - return chosen_key, total_len + return chosen_key class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): @@ -101,7 +101,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: max_len = size total_len += size #print(chosen_key, max_len, self.weights[int(chosen_key)]) - return chosen_key, total_len + return chosen_key class WeightedLoadBalancer(CrossKeyLoadBalancer): @@ -125,7 +125,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = random.choices(keys, weights, k=1)[0] #print("choose", chosen_key, keys, weights) - return chosen_key, total_len + return chosen_key class LongestQueueLoadBalancer(CrossKeyLoadBalancer): @@ -139,7 +139,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = key max_len = size total_len += size - return chosen_key, total_len + return chosen_key class WikiMapper(RalfMapper): @@ -174,7 +174,7 @@ def run(self, replica_id: int): yield simpy.AnyOf(self.env, [q.wait() for q in self.source_queues.values()]) # choose key - chosen_key, total_size = self.key_selection_policy.choose( + chosen_key = self.key_selection_policy.choose( self.source_queues ) assert chosen_key is not None From 1220cdd2a38d5b313e97279bc8050bb8937b4393 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 7 Oct 2021 22:45:12 -0700 Subject: [PATCH 08/26] start implementing time-specific weights --- stl/offline/evaluation.py | 34 
++++++++-- stl/offline/run_2_eval_yahoo_keys.sh | 11 ++- stl/offline/simulation.py | 24 +++++-- wikipedia/config.yml | 1 + wikipedia/preprocessing/wiki_api_data.py | 19 +++++- wikipedia/simulate.py | 85 ++++++++++++++++++++++-- wikipedia/wiki_eval.py | 17 +++-- 7 files changed, 158 insertions(+), 33 deletions(-) diff --git a/stl/offline/evaluation.py b/stl/offline/evaluation.py index 92ff30b..067003e 100644 --- a/stl/offline/evaluation.py +++ b/stl/offline/evaluation.py @@ -37,14 +37,10 @@ def predict(event, model): SEASONALITY = 24 * 7 -def offline_eval(yahoo_csv_path, plan_json_path): +def offline_eval(yahoo_csv_path, plan_df): df = pd.read_csv(yahoo_csv_path) df["timestamp"] = list(range(len(df))) - # Headers - # processing_time window_start_seq_id window_end_seq_id key - plan_df = pd.read_json(plan_json_path) - # Given our model versions from offline plan, run training on corresponding # events. offline_stl = {} @@ -98,6 +94,24 @@ def find_freshest_model_version(event_time, model_versions): df[new_col] = add_df[new_col] return df +def offline_eval_all(yahoo_path): + + policy_plan_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" + policy_params = json.load(open(policy_plan_path)) + plan_df = pd.read_csv(plan_json_path) + + # loop through each key + for key in policy_params.keys(): + output_file = "output_{key}.csv" + print(key, output_file) + plan_df_key = plan_df[plan_df["key"] == key] + csv_path = f"{key}.csv" + df = offline_eval(csv_path, plan_df_key) + df.to_csv(output_file) + + return + + def offline_oracle(yahoo_csv_path): df = pd.read_csv(yahoo_csv_path) @@ -117,8 +131,16 @@ def offline_oracle(yahoo_csv_path): def run_exp(csv_path, plan_path, output_path, run_oracle=False): if run_oracle: df = offline_oracle(csv_path) + elif run_policy: + offline_eval_all(csv_path) else: - df = offline_eval(csv_path, plan_path) + + # Headers + # processing_time window_start_seq_id window_end_seq_id key + plan_df = 
pd.read_json(plan_json_path) + + df = offline_eval(csv_path, plan_df) + df.to_csv(output_path, index=None) diff --git a/stl/offline/run_2_eval_yahoo_keys.sh b/stl/offline/run_2_eval_yahoo_keys.sh index 1d9865e..444da55 100644 --- a/stl/offline/run_2_eval_yahoo_keys.sh +++ b/stl/offline/run_2_eval_yahoo_keys.sh @@ -1,17 +1,14 @@ set -ex -data_dir="/home/ubuntu/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/" +data_dir="/data/wooders/stl/yahoo" -tmp_script=`mktemp` -for data in `ls $data_dir/A4Benchmark-TS*` +for data in `ls $data_dir/A4/*` do key=`basename $data` for slide in 6 12 18 24 48 96 168 192 336 672 do - echo python evaluation.py --offline-yahoo-csv-path $data \ + python evaluation.py --offline-yahoo-csv-path $data \ --offline-plan-path ./result/offline_1_slide/plan/slide_${slide}_plan.json \ - --output-path ./result/offline_1_slide/plan_eval/slide_${slide}_key_${key} >> $tmp_script + --output-path ./result/offline_1_slide/plan_eval/slide_${slide}_key_${key} done done - -cat $tmp_script | parallel --bar bash -l -c \ No newline at end of file diff --git a/stl/offline/simulation.py b/stl/offline/simulation.py index 41461bb..2039983 100644 --- a/stl/offline/simulation.py +++ b/stl/offline/simulation.py @@ -63,7 +63,7 @@ None, "path to generated per key's window slide size config.", ) -flags.DEFINE_integer("num_mapper_replicas", None, "number of replicas for mapper") +flags.DEFINE_integer("num_mapper_replicas", 1, "number of replicas for mapper") def _get_config() -> Dict: @@ -74,14 +74,22 @@ def _get_config() -> Dict: def main(argv): env = simpy.Environment() # source --source_to_window_queue--> window --windows_to_mapper_queue--> mapper + + policy_plan_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" + policy_params = json.load(open(policy_plan_path)) + + keys = policy_params.keys() + print(keys) + source_to_window_queue = simpy.Store(env) windows_to_mapper_queue = { - i: PerKeyPriorityQueue( + key: 
PerKeyPriorityQueue( env, processing_policy=prio_policies[FLAGS.key_prio_policy], load_shedding_policy=load_shed_policies[FLAGS.key_load_shed_policy], ) - for i in range(FLAGS.num_keys) + #for i in range(FLAGS.num_keys) + for key in keys } Source( env, @@ -89,13 +97,15 @@ def main(argv): num_keys=FLAGS.num_keys, next_queue=source_to_window_queue, total_run_time=FLAGS.total_runtime_s, - data_file=FLAGS.source_data_path, + keys=keys, + data_dir="/data/wooders/stl/yahoo/A4", + #data_file=FLAGS.source_data_path, ) WindowOperator( env, window_size=FLAGS.window_size, slide_size=FLAGS.slide_size, - per_key_slide_size_path=FLAGS.per_key_slide_size_plan, + per_key_slide_size_path=policy_plan_path, source_queue=source_to_window_queue, next_queues=windows_to_mapper_queue, ) @@ -104,7 +114,7 @@ def main(argv): source_queues=windows_to_mapper_queue, model_run_time_s=FLAGS.model_runtime_s, # TODO(simon): customize this once we want different key selection policy - key_selection_policy_cls=RoundRobinLoadBalancer, + key_selection_policy_cls=RoundRobinLoadBalancer(), num_replicas=FLAGS.num_mapper_replicas, ) env.run(until=FLAGS.total_runtime_s) @@ -112,7 +122,7 @@ def main(argv): plan = m.plan config = _get_config() if FLAGS.output_path: - os.makedirs(os.path.split(FLAGS.output_path)[0], exist_ok=True) + #os.makedirs(os.path.split(FLAGS.output_path)[0], exist_ok=True) with open(FLAGS.output_path, "w") as f: json.dump(plan, f, indent=2) with open(FLAGS.output_path + ".config.json", "w") as f: diff --git a/wikipedia/config.yml b/wikipedia/config.yml index 78d42d8..c6bb8a1 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -19,6 +19,7 @@ edits_file = %(data_dir)s/edits.csv questions_file = %(data_dir)s/questions.csv raw_pageview_file = %(data_dir)s/top_title_views.csv pageview_file = %(data_dir)s/pageviews.csv +timestamp_weights_file = %(data_dir)s/timestamp_weights_file.json [simulation] data_dir = /data/wooders/wikipedia diff --git 
a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index 2a1a718..4928566 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -4,6 +4,7 @@ import json from tqdm import tqdm from collections import defaultdict +from datetime import datetime import subprocess import configparser @@ -116,7 +117,7 @@ def get_questions(raw_questions_file, questions_file): questions_df.to_csv(questions_file) return questions_df -def get_pageviews(raw_pageview_file, pageview_file, edits_file): +def get_pageviews(raw_pageview_file, pageview_file, edits_file, timestamp_weights_file): edits_df = pd.read_csv(edits_file) pageview_df = pd.read_csv(raw_pageview_file) @@ -132,6 +133,19 @@ def get_pageviews(raw_pageview_file, pageview_file, edits_file): pageview_df['doc_id'] = pageview_df['title'].apply(lambda x: title_to_id[x]) pageview_df.to_csv(pageview_file) + # page weights per timestamp + ts_to_weights = {} + dates = pageview_df.columns[2:-2] + for date in dates: + print(date) + dt = datetime.strptime(date[:-2], '%Y%m%d') + ts = dt.timestamp() * 1000000000 + ts_min = assign_timestamps_min(ts) + view_counts = pageview_df[date].tolist() + id_to_count = pageview_df.set_index("doc_id")[date].to_dict() + ts_to_weights[ts_min] = id_to_count + open(timestamp_weights_file, "w").write(json.dumps(ts_to_weights)) + print("Generated ts weights file", timestamp_weights_file) return pageview_df @@ -626,6 +640,7 @@ def check_dataset( questions_file = config["files"]["questions_file"] raw_pageview_file = config["files"]["raw_pageview_file"] pageview_file = config["files"]["pageview_file"] + timestamp_weights_file = config["files"]["timestamp_weights_file"] # simulation data init_data_file = config["simulation"]["init_data_file"] @@ -669,7 +684,7 @@ def check_dataset( # generate pageviews / compute page weights if args.run_get_pageviews: - get_pageviews(raw_pageview_file, pageview_file, edits_file) + 
get_pageviews(raw_pageview_file, pageview_file, edits_file, timestamp_weights_file) # generate diffs between document versions if args.run_generate_diffs: diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index d7361ce..92e71fc 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -1,4 +1,5 @@ import json +import itertools from typing import DefaultDict, Dict, List, Optional, Tuple from collections import defaultdict from dataclasses import dataclass @@ -38,6 +39,62 @@ from typing import Dict, List, Tuple, Type +class RoundRobinLoadBalancerFix(CrossKeyLoadBalancer): + """Simple policy that cycle through all the keys fairly""" + + def __init__(self): + self.cur_key_set = set() + self.cur_key_iter = None + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + key_set = set(per_key_queues.keys()) + if key_set != self.cur_key_set: + self.cur_key_set = key_set + self.cur_key_iter = itertools.cycle(key_set) + + key = next(self.cur_key_iter) + while per_key_queues[key].size() == 0: + key = next(self.cur_key_iter) + # TODO(simon): maybe do a "peak" here to trigger eviction policies + return key + + +class WeightedRoundRobin(CrossKeyLoadBalancer): + """Simple policy that cycle through all the keys fairly""" + + def __init__(self, pageview_file, all_keys): + self.cur_key_set = [] + self.cur_key_iter = None + pageview_df = pd.read_csv(pageview_file) + self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() + self.weights = {} + for key in self.raw_weights.keys(): + if str(key) not in all_keys: + continue + + self.weights[key] = int(self.raw_weights[key]*1000) + assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + print(self.weights) + + + for key in self.weights.keys(): + for i in range(self.weights[key]): + self.cur_key_set.append(str(key)) + random.shuffle(self.cur_key_set) + print(self.cur_key_set) + print("size", len(self.cur_key_set)) + self.cur_key_iter = itertools.cycle(self.cur_key_set) 
+ + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + + key = next(self.cur_key_iter) + while per_key_queues[key].size() == 0: + key = next(self.cur_key_iter) + # TODO(simon): maybe do a "peak" here to trigger eviction policies + return key + + class WeightedLoadBalancer(CrossKeyLoadBalancer): @@ -95,12 +152,14 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: size = per_key_queues[key].size() if int(key) not in self.weights: continue - weighted_size = self.weights[int(key)] + weighted_size = self.weights[int(key)]*self.weights[int(key)] if weighted_size > max_len: chosen_key = key max_len = size total_len += size #print(chosen_key, max_len, self.weights[int(chosen_key)]) + per_key_queues[chosen_key].clear() + print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) return chosen_key class WeightedLoadBalancer(CrossKeyLoadBalancer): @@ -139,6 +198,9 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: chosen_key = key max_len = size total_len += size + per_key_queues[chosen_key].clear() + print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) + return chosen_key @@ -210,6 +272,7 @@ def run(self, replica_id: int): edits = json.load(open(stream_edits_file)) init_data = json.load(open(init_data_file)) keys = list(init_data.keys()) +ts_to_weights = json.load(open(config["files"]["weights"])) policies = { "fifo": fifo, @@ -220,10 +283,22 @@ def run(self, replica_id: int): "weighted_longest_queue": WeightedLongestQueueLoadBalancer(pageview_file), "longest_queue": LongestQueueLoadBalancer(), "random": RandomLoadBalancer(), - "round_robin": RoundRobinLoadBalancer() + "round_robin": RoundRobinLoadBalancer(), + "round_robin_fix": RoundRobinLoadBalancerFix(), + "weighted_round_robin": WeightedRoundRobin(pageview_file, keys) } +def current_weights(ts): + min_dist = max(list(ts_to_weights.keys())) + weights = None + + for key in ts_to_weights.keys(): + if abs(ts - key) 
<= min_dist: + min_dist = abs(ts - key) + weights = ts_to_weights[key] + return weights + def run_once( out_path: str, prioritization_policy: str, @@ -284,10 +359,10 @@ def run_once( # prioritization: prioritize most recent version # cross-key prioritzation: historical page views, # policies - prioritization_policies = ["lifo", "fifo"] # ["fifo", "lifo"] - key_selection_policies = ["weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin"] + prioritization_policies = ["lifo"] # ["fifo", "lifo"] + key_selection_policies = ["weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin", "round_robin_fix"] load_shedding_policies = ["always_process"] - model_runtimes = [0.01, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + model_runtimes = [5] #, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] records_per_second = [100] output_files = [] diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py index c47d2ff..cdb587d 100644 --- a/wikipedia/wiki_eval.py +++ b/wikipedia/wiki_eval.py @@ -170,6 +170,7 @@ def offline_eval(plan_json_path, exp_id, compute_embeddings=True): # compute initial passage embeddings for each document init_data = json.load(open(init_data_file)) init_state = {} + staleness = [] for key in tqdm(init_data.keys()): if filter_keys and key not in keys: @@ -210,7 +211,7 @@ def offline_eval(plan_json_path, exp_id, compute_embeddings=True): for version in tqdm(embed_version_keys): state = {} for task in plan[version]: - print("task", task, version) + #print("task", task, version) rev_file = task[0] doc_id = task[1] # doc_id = task[2] @@ -266,7 +267,7 @@ def get_latest_embedding(timestep, doc_id): and doc_id in embed_versions[str(version)] ): latest = version - print(doc_id, "latest", timestep, latest, timestep - latest) + #print(doc_id, "latest", timestep, latest, timestep - latest) assert ( doc_id in embed_versions[str(latest)] ), f"Missing doc id 
{doc_id} {latest} {doc_id in init_data}" @@ -305,6 +306,11 @@ def get_latest_embedding(timestep, doc_id): # print(init_data.keys()) continue + # get current embedding and write + passage_texts, passage_embeddings, version, latest = get_latest_embedding( + timestep, doc_id + ) + # loop through questions doc_questions = questions[ts][doc_id] queries = [] @@ -319,10 +325,8 @@ def get_latest_embedding(timestep, doc_id): ), f"time mismatch {q['ts_min']}, {timestep}, {ts}" queries.append([question, [answer], doc_id]) - # get current embedding and write - passage_texts, passage_embeddings, version, latest = get_latest_embedding( - timestep, doc_id - ) + # append per query + staleness.append(timestep - latest) # dump CTX/question script contex_file = f"{directory}/dpr_ctx_after_{int(ts)}_{doc_id}" @@ -349,6 +353,7 @@ def get_latest_embedding(timestep, doc_id): assert len(passage_embeddings) == len(passage_texts) print("done processing queries!", len(questions)) + print("staleness", np.array(staleness).mean()) return directory From 754ccc82f2633a03c1642ee89028d424a2b31a5e Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 8 Oct 2021 14:16:22 -0700 Subject: [PATCH 09/26] plan-level simulation --- stl/offline/config_gen.py | 2 +- stl/offline/evaluation.py | 46 +++++++++++----- stl/offline/simulation.py | 4 +- wikipedia/simulate.py | 112 ++++++++++++++++++++++++++++++-------- 4 files changed, 124 insertions(+), 40 deletions(-) diff --git a/stl/offline/config_gen.py b/stl/offline/config_gen.py index 6e52b11..0248638 100644 --- a/stl/offline/config_gen.py +++ b/stl/offline/config_gen.py @@ -95,7 +95,7 @@ def run_lp(df: pd.DataFrame, max_n_fits=None, max_loss=None, objective="min_loss return plan_output, optimal_loss -def get_loss_per_key(key: int, csv_dir): +def get_loss_per_key(key: int, csv_dir) key_one = glob(f"{csv_dir}/slide_*_key_A4Benchmark-TS{key}.csv") assert len(key_one) > 0 diff --git a/stl/offline/evaluation.py b/stl/offline/evaluation.py index 067003e..a0cbde2 
100644 --- a/stl/offline/evaluation.py +++ b/stl/offline/evaluation.py @@ -1,7 +1,8 @@ import argparse +import json import os import bisect - +from tqdm import tqdm import numpy as np import pandas as pd from statsmodels.tsa.seasonal import STL @@ -38,14 +39,16 @@ def predict(event, model): def offline_eval(yahoo_csv_path, plan_df): + print(yahoo_csv_path) df = pd.read_csv(yahoo_csv_path) df["timestamp"] = list(range(len(df))) # Given our model versions from offline plan, run training on corresponding # events. offline_stl = {} - for _, row in plan_df.iterrows(): - records = df.iloc[row.window_start_seq_id : row.window_end_seq_id + 1].to_dict( + print(plan_df) + for _, row in tqdm(plan_df.iterrows()): # note: doesn't preserve types + records = df.iloc[int(row.window_start_seq_id) : int(row.window_end_seq_id) + 1].to_dict( orient="records" ) @@ -54,6 +57,8 @@ def offline_eval(yahoo_csv_path, plan_df): trained = train(records, window_size=len(records), seasonality=SEASONALITY) offline_stl[row.processing_time] = trained + print(offline_stl) + # Assign the trained model with every events in the source file. def find_freshest_model_version(event_time, model_versions): model_loc = bisect.bisect_left(model_versions, event_time) - 1 @@ -66,6 +71,7 @@ def find_freshest_model_version(event_time, model_versions): for et in df["timestamp"] ] + # Run prediction! 
predicted = [] for _, row in df.iterrows(): @@ -94,19 +100,28 @@ def find_freshest_model_version(event_time, model_versions): df[new_col] = add_df[new_col] return df -def offline_eval_all(yahoo_path): +def offline_eval_all(yahoo_path, plan_json_path): - policy_plan_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" - policy_params = json.load(open(policy_plan_path)) - plan_df = pd.read_csv(plan_json_path) + param_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" + print(param_path) + policy_params = json.load(open(param_path)) + plan_df = pd.read_json(plan_json_path) + plan_df.to_csv("plan.csv") + print("plan index", plan_df.index) # loop through each key for key in policy_params.keys(): - output_file = "output_{key}.csv" + output_file = f"output_{key}.csv" print(key, output_file) - plan_df_key = plan_df[plan_df["key"] == key] - csv_path = f"{key}.csv" + plan_df_key = plan_df[plan_df["key"] == int(key)] + print("key index", plan_df_key.index) + plan_df_key.index = pd.RangeIndex(start=0, stop=len(plan_df_key.index)) + print(plan_df_key) + print("key index", plan_df_key.index) + #plan_df_key = plan_df + csv_path = f"{yahoo_path}/{key}.csv" df = offline_eval(csv_path, plan_df_key) + plan_df_key.to_csv(f"{key}_plan.csv") df.to_csv(output_file) return @@ -128,20 +143,20 @@ def offline_oracle(yahoo_csv_path): return df -def run_exp(csv_path, plan_path, output_path, run_oracle=False): +def run_exp(csv_path, plan_path, output_path, run_policy=False, run_oracle=False): if run_oracle: df = offline_oracle(csv_path) elif run_policy: - offline_eval_all(csv_path) + offline_eval_all(csv_path, plan_path) else: # Headers # processing_time window_start_seq_id window_end_seq_id key - plan_df = pd.read_json(plan_json_path) + plan_df = pd.read_json(plan_path) df = offline_eval(csv_path, plan_df) - df.to_csv(output_path, index=None) + df.to_csv(output_path, index=None) def _ensure_dir(path): 
@@ -160,13 +175,14 @@ def main(): assert args.offline_yahoo_csv_path if not args.offline_run_oracle: assert args.offline_plan_path - _ensure_dir(args.output_path) + #_ensure_dir(args.output_path) run_exp( csv_path=args.offline_yahoo_csv_path, plan_path=args.offline_plan_path, output_path=args.output_path, run_oracle=args.offline_run_oracle, + run_policy=True ) diff --git a/stl/offline/simulation.py b/stl/offline/simulation.py index 2039983..17df84f 100644 --- a/stl/offline/simulation.py +++ b/stl/offline/simulation.py @@ -45,7 +45,7 @@ flags.DEFINE_float("total_runtime_s", 14, "When to end the simulation.") flags.DEFINE_float( "model_runtime_s", - 0.2, + 0.01, "The latency for the map function (when processing a single record).", ) flags.DEFINE_integer("window_size", 24 * 7, "The sliding window size.") @@ -63,7 +63,7 @@ None, "path to generated per key's window slide size config.", ) -flags.DEFINE_integer("num_mapper_replicas", 1, "number of replicas for mapper") +flags.DEFINE_integer("num_mapper_replicas", 10, "number of replicas for mapper") def _get_config() -> Dict: diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 92e71fc..1ff24a7 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -39,6 +39,18 @@ from typing import Dict, List, Tuple, Type +def current_weights(ts, ts_to_weights): + ts = int(ts) + min_dist = max(list(ts_to_weights.keys())) + + index = 0 + for key in ts_to_weights.keys(): + if key >= ts: + break + index = key + + return ts_to_weights[key] + class RoundRobinLoadBalancerFix(CrossKeyLoadBalancer): """Simple policy that cycle through all the keys fairly""" @@ -46,7 +58,7 @@ def __init__(self): self.cur_key_set = set() self.cur_key_iter = None - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: key_set = set(per_key_queues.keys()) if key_set != self.cur_key_set: self.cur_key_set = key_set @@ -86,7 +98,7 @@ def 
__init__(self, pageview_file, all_keys): self.cur_key_iter = itertools.cycle(self.cur_key_set) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: key = next(self.cur_key_iter) while per_key_queues[key].size() == 0: @@ -94,6 +106,70 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: # TODO(simon): maybe do a "peak" here to trigger eviction policies return key +class AdaptiveWeightedRoundRobin(CrossKeyLoadBalancer): + """Simple policy that cycle through all the keys fairly""" + + def __init__(self, timestamp_weights_file): + self.cur_key_set = [] + self.cur_key_iter = None + + pageview_df = pd.read_csv(pageview_file) + self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() + self.weights = {} + for key in self.raw_weights.keys(): + if str(key) not in all_keys: + continue + + self.weights[key] = int(self.raw_weights[key]*1000) + assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + print(self.weights) + + + for key in self.weights.keys(): + for i in range(self.weights[key]): + self.cur_key_set.append(str(key)) + random.shuffle(self.cur_key_set) + print(self.cur_key_set) + print("size", len(self.cur_key_set)) + self.cur_key_iter = itertools.cycle(self.cur_key_set) + + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: + + key = next(self.cur_key_iter) + while per_key_queues[key].size() == 0: + key = next(self.cur_key_iter) + # TODO(simon): maybe do a "peak" here to trigger eviction policies + return key + + +class AdaptiveWeightedLoadBalancer(CrossKeyLoadBalancer): + + def __init__(self, timestamp_weights_file): + data = json.load(open(timestamp_weights_file)) + self.timestamp_weights = {} + for key in data.keys(): + self.timestamp_weights[int(key)] = data[key] + + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], timestamp: int) -> str: + weights_map = 
current_weights(timestamp, self.timestamp_weights) + + chosen_key = None + max_len = 0 + total_len = 0 + keys = [] + weights = [] + for key in per_key_queues.keys(): + size = per_key_queues[key].size() + if size >= 1 and key in weights_map: + keys.append(key) + weights.append(weights_map[key]) + total_len += size + print(weights) + print(keys) + chosen_key = random.choices(keys, weights, k=1)[0] + print("choose", chosen_key, keys, weights) + return chosen_key class WeightedLoadBalancer(CrossKeyLoadBalancer): @@ -103,7 +179,7 @@ def __init__(self, pageview_file): self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() #print(self.weights) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None max_len = 0 total_len = 0 @@ -122,7 +198,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: class RandomLoadBalancer(CrossKeyLoadBalancer): - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None max_len = 0 total_len = 0 @@ -144,7 +220,7 @@ def __init__(self, pageview_file): self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() #print(self.weights) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None max_len = 0 total_len = 0 @@ -169,7 +245,7 @@ def __init__(self, pageview_file): self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() #print(self.weights) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None max_len = 0 total_len = 0 @@ -188,7 +264,7 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: class 
LongestQueueLoadBalancer(CrossKeyLoadBalancer): - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue]) -> str: + def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None max_len = 0 total_len = 0 @@ -236,8 +312,10 @@ def run(self, replica_id: int): yield simpy.AnyOf(self.env, [q.wait() for q in self.source_queues.values()]) # choose key + print("env time", self.env.now) chosen_key = self.key_selection_policy.choose( - self.source_queues + self.source_queues, + self.env.now*100 ) assert chosen_key is not None @@ -267,12 +345,12 @@ def run(self, replica_id: int): stream_edits_file = config["simulation"]["stream_edits_file"] stream_questions_file = config["simulation"]["stream_questions_file"] pageview_file = config["files"]["pageview_file"] +timestamp_weights_file = config["files"]["timestamp_weights_file"] # load simulation data edits = json.load(open(stream_edits_file)) init_data = json.load(open(init_data_file)) keys = list(init_data.keys()) -ts_to_weights = json.load(open(config["files"]["weights"])) policies = { "fifo": fifo, @@ -280,6 +358,7 @@ def run(self, replica_id: int): "always_process": always_process, "sample_half": make_sampling_policy(0.5), "weighted_random": WeightedLoadBalancer(pageview_file), + "adaptive_weighted_random": AdaptiveWeightedLoadBalancer(timestamp_weights_file), "weighted_longest_queue": WeightedLongestQueueLoadBalancer(pageview_file), "longest_queue": LongestQueueLoadBalancer(), "random": RandomLoadBalancer(), @@ -288,17 +367,6 @@ def run(self, replica_id: int): "weighted_round_robin": WeightedRoundRobin(pageview_file, keys) } - -def current_weights(ts): - min_dist = max(list(ts_to_weights.keys())) - weights = None - - for key in ts_to_weights.keys(): - if abs(ts - key) <= min_dist: - min_dist = abs(ts - key) - weights = ts_to_weights[key] - return weights - def run_once( out_path: str, prioritization_policy: str, @@ -360,9 +428,9 @@ def run_once( # cross-key prioritzation: historical 
page views, # policies prioritization_policies = ["lifo"] # ["fifo", "lifo"] - key_selection_policies = ["weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin", "round_robin_fix"] + key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin", "round_robin_fix"] load_shedding_policies = ["always_process"] - model_runtimes = [5] #, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] records_per_second = [100] output_files = [] From 9de4f9a64872e8975f219f44da43ed01a99ba098 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 8 Oct 2021 21:42:56 -0700 Subject: [PATCH 10/26] add policy evaluation script --- stl/offline/evaluation.py | 73 ++++++++++++++------------- stl/offline/run_4_generate_plan.sh | 2 +- stl/offline/run_5_simulate_lp_plan.sh | 24 +++++++-- stl/offline/simulation.py | 16 +++--- wikipedia/simulate.py | 18 ++++--- wikipedia/wiki_eval.py | 1 + 6 files changed, 80 insertions(+), 54 deletions(-) diff --git a/stl/offline/evaluation.py b/stl/offline/evaluation.py index a0cbde2..3305a31 100644 --- a/stl/offline/evaluation.py +++ b/stl/offline/evaluation.py @@ -1,10 +1,12 @@ import argparse +from multiprocessing import Pool import json import os import bisect from tqdm import tqdm import numpy as np import pandas as pd +import time from statsmodels.tsa.seasonal import STL @@ -38,7 +40,16 @@ def predict(event, model): SEASONALITY = 24 * 7 -def offline_eval(yahoo_csv_path, plan_df): +def offline_eval(yahoo_csv_path, plan_json_path, key, output_path): + + # get plan DF for key + param_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" + policy_params = json.load(open(param_path)) + plan_df = pd.read_json(plan_json_path) + plan_df_key = 
plan_df[plan_df["key"] == int(key)] + plan_df_key.index = pd.RangeIndex(start=0, stop=len(plan_df_key.index)) + + # get original data print(yahoo_csv_path) df = pd.read_csv(yahoo_csv_path) df["timestamp"] = list(range(len(df))) @@ -46,15 +57,19 @@ def offline_eval(yahoo_csv_path, plan_df): # Given our model versions from offline plan, run training on corresponding # events. offline_stl = {} - print(plan_df) - for _, row in tqdm(plan_df.iterrows()): # note: doesn't preserve types + print(plan_df_key) + for _, row in tqdm(plan_df_key.iterrows()): # note: doesn't preserve types + st = time.time() records = df.iloc[int(row.window_start_seq_id) : int(row.window_end_seq_id) + 1].to_dict( orient="records" ) + #print("find time", time.time() - st) # The yahoo dataset seasonaly can be 12hr, daily, and weekly. # Each record is an hourly record. Here we chose weekly seasonality. + st = time.time() trained = train(records, window_size=len(records), seasonality=SEASONALITY) + #print("fit time", time.time() - st) offline_stl[row.processing_time] = trained print(offline_stl) @@ -67,11 +82,10 @@ def find_freshest_model_version(event_time, model_versions): return model_versions[model_loc] df["model_version"] = [ - find_freshest_model_version(et, plan_df["processing_time"]) + find_freshest_model_version(et, plan_df_key["processing_time"]) for et in df["timestamp"] ] - # Run prediction! 
predicted = [] for _, row in df.iterrows(): @@ -98,37 +112,27 @@ def find_freshest_model_version(event_time, model_versions): add_df = pd.DataFrame(predicted) for new_col in add_df.columns: df[new_col] = add_df[new_col] - return df + df.to_csv(output_file) + return -def offline_eval_all(yahoo_path, plan_json_path): +def offline_eval_all(yahoo_path, plan_json_path, output_path, param_path): - param_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" - print(param_path) policy_params = json.load(open(param_path)) - plan_df = pd.read_json(plan_json_path) - plan_df.to_csv("plan.csv") - print("plan index", plan_df.index) # loop through each key + inputs = [] for key in policy_params.keys(): - output_file = f"output_{key}.csv" - print(key, output_file) - plan_df_key = plan_df[plan_df["key"] == int(key)] - print("key index", plan_df_key.index) - plan_df_key.index = pd.RangeIndex(start=0, stop=len(plan_df_key.index)) - print(plan_df_key) - print("key index", plan_df_key.index) - #plan_df_key = plan_df - csv_path = f"{yahoo_path}/{key}.csv" - df = offline_eval(csv_path, plan_df_key) - plan_df_key.to_csv(f"{key}_plan.csv") - df.to_csv(output_file) + key_output_path = f"{output_path}/{key}.csv" + inputs.append((f"{yahoo_path}/{key}.csv", plan_json_path, key, key_output_path)) + p = Pool(100) + p.starmap(offline_eval, inputs) + p.close() return -def offline_oracle(yahoo_csv_path): +def offline_oracle(yahoo_csv_path, output_path): df = pd.read_csv(yahoo_csv_path) df["timestamp"] = list(range(len(df))) df["model_version"] = "oracle" @@ -140,22 +144,20 @@ def offline_oracle(yahoo_csv_path): df["pred_seasonality"] = oracle_model["stl_result"].seasonal df["pred_staleness"] = 0 - return df + df.to_csv(output_path) -def run_exp(csv_path, plan_path, output_path, run_policy=False, run_oracle=False): +def run_exp(csv_path, plan_path, output_path, run_policy=False, run_oracle=False, param_path=None): if run_oracle: - df = 
offline_oracle(csv_path) + df = offline_oracle(csv_path, output_path) elif run_policy: - offline_eval_all(csv_path, plan_path) + offline_eval_all(csv_path, plan_path, output_path, param_path) else: # Headers # processing_time window_start_seq_id window_end_seq_id key plan_df = pd.read_json(plan_path) - - df = offline_eval(csv_path, plan_df) - + offline_eval(csv_path, plan_df, output_path) df.to_csv(output_path, index=None) @@ -169,7 +171,9 @@ def main(): parser.add_argument("--offline-yahoo-csv-path", type=str) parser.add_argument("--offline-plan-path", type=str) parser.add_argument("--output-path", type=str) - parser.add_argument("--offline-run-oracle", type=bool, default=False) + parser.add_argument("--offline-run-oracle", default=False, action='store_true') + parser.add_argument("--run-policy", default=False, action='store_true') + parser.add_argument("--param-path", type=str, default=None) args = parser.parse_args() assert args.offline_yahoo_csv_path @@ -182,7 +186,8 @@ def main(): plan_path=args.offline_plan_path, output_path=args.output_path, run_oracle=args.offline_run_oracle, - run_policy=True + run_policy=args.run_policy, + param_path=args.param_path, ) diff --git a/stl/offline/run_4_generate_plan.sh b/stl/offline/run_4_generate_plan.sh index 0d06e8d..442cf98 100644 --- a/stl/offline/run_4_generate_plan.sh +++ b/stl/offline/run_4_generate_plan.sh @@ -5,4 +5,4 @@ set -ex python config_gen.py \ --csv_dir "./result/offline_1_slide/plan_eval" \ - --output_path "./result/offline_1_slide/min_loss_plan.json" \ No newline at end of file + --output_path "./result/offline_1_slide/min_loss_plan.json" diff --git a/stl/offline/run_5_simulate_lp_plan.sh b/stl/offline/run_5_simulate_lp_plan.sh index 7e353f0..bc67ae1 100644 --- a/stl/offline/run_5_simulate_lp_plan.sh +++ b/stl/offline/run_5_simulate_lp_plan.sh @@ -1,8 +1,26 @@ set -ex +PARAM_PATH=result/offline_1_slide/min_loss_plan.json +PLAN_PATH=result/offline_1_slide/lp_eval/varying_slide_size_trace.json 
+SOURCE_PATH=/data/wooders/stl/yahoo/A4 +OUTPUT_CSV_PATH=result/offline_1_slide/ +# re-run simulation with lp-generated weights python simulation.py --model_runtime_s 0.02 --total_runtime_s 150 \ --per_key_records_per_second 100 \ - --num_mapper_replicas 2 --num_keys 100 \ + --num_mapper_replicas 2 \ --window_size 672 --slide_size 0 \ - --per_key_slide_size_plan result/offline_1_slide/min_loss_plan.json \ - --output_path result/offline_1_slide/lp_eval/varying_slide_size_trace.json \ No newline at end of file + --per_key_slide_size_plan $PARAM_PATH \ + --output_path $PLAN_PATH \ + --source_data_path $SOURCE_PATH + +# run evaluation with simulation results +python evaluation.py --offline-yahoo-csv-path $SOURCE_PATH \ + --offline-plan-path $PLAN_PATH \ + --output-path $OUTPUT_CSV_PATH \ + --param-path $PARAM_PATH \ + --run-policy + +# get final results +python evaluate_loss.py --offline-yahoo-csv-path $SOURCE_PATH --predicted-csv-path $OUTPUT_CSV_PATH --output-path + + diff --git a/stl/offline/simulation.py b/stl/offline/simulation.py index 17df84f..33a16b0 100644 --- a/stl/offline/simulation.py +++ b/stl/offline/simulation.py @@ -75,11 +75,11 @@ def main(argv): env = simpy.Environment() # source --source_to_window_queue--> window --windows_to_mapper_queue--> mapper - policy_plan_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" - policy_params = json.load(open(policy_plan_path)) - - keys = policy_params.keys() - print(keys) + if FLAGS.per_key_slide_size_plan is not None: + policy_params = json.load(open(FLAGS.per_key_slide_size_plan)) + keys = policy_params.keys() + else: + keys = [i in range(FLAGS.num_keys)] source_to_window_queue = simpy.Store(env) windows_to_mapper_queue = { @@ -88,7 +88,6 @@ def main(argv): processing_policy=prio_policies[FLAGS.key_prio_policy], load_shedding_policy=load_shed_policies[FLAGS.key_load_shed_policy], ) - #for i in range(FLAGS.num_keys) for key in keys } Source( @@ -98,14 +97,13 @@ def 
main(argv): next_queue=source_to_window_queue, total_run_time=FLAGS.total_runtime_s, keys=keys, - data_dir="/data/wooders/stl/yahoo/A4", - #data_file=FLAGS.source_data_path, + data_dir=FLAGS.source_data_path, ) WindowOperator( env, window_size=FLAGS.window_size, slide_size=FLAGS.slide_size, - per_key_slide_size_path=policy_plan_path, + per_key_slide_size_path=FLAGS.per_key_slide_size_plan, source_queue=source_to_window_queue, next_queues=windows_to_mapper_queue, ) diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 1ff24a7..d636647 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -1,4 +1,4 @@ -import json +import json import itertools from typing import DefaultDict, Dict, List, Optional, Tuple from collections import defaultdict @@ -78,14 +78,17 @@ def __init__(self, pageview_file, all_keys): self.cur_key_set = [] self.cur_key_iter = None pageview_df = pd.read_csv(pageview_file) - self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() + #self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() + self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() self.weights = {} for key in self.raw_weights.keys(): if str(key) not in all_keys: continue self.weights[key] = int(self.raw_weights[key]*1000) - assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + if self.weights[key] == 0: + self.weights[key] = 1 print(self.weights) @@ -362,8 +365,7 @@ def run(self, replica_id: int): "weighted_longest_queue": WeightedLongestQueueLoadBalancer(pageview_file), "longest_queue": LongestQueueLoadBalancer(), "random": RandomLoadBalancer(), - "round_robin": RoundRobinLoadBalancer(), - "round_robin_fix": RoundRobinLoadBalancerFix(), + "round_robin": RoundRobinLoadBalancerFix(), "weighted_round_robin": WeightedRoundRobin(pageview_file, keys) } @@ -428,9 +430,11 @@ def run_once( # cross-key prioritzation: historical 
page views, # policies prioritization_policies = ["lifo"] # ["fifo", "lifo"] - key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin", "round_robin_fix"] + #key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin"] + key_selection_policies = ["round_robin"] load_shedding_policies = ["always_process"] - model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + #model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + model_runtimes = [0.02, 0.05, 0.07] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] records_per_second = [100] output_files = [] diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py index cdb587d..1214cfb 100644 --- a/wikipedia/wiki_eval.py +++ b/wikipedia/wiki_eval.py @@ -326,6 +326,7 @@ def get_latest_embedding(timestep, doc_id): queries.append([question, [answer], doc_id]) # append per query + print("staleness", timestep - latest) staleness.append(timestep - latest) # dump CTX/question script From b897fa4e9a474639a89d6f263f93d28f73825486 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Sat, 9 Oct 2021 15:11:27 -0700 Subject: [PATCH 11/26] add wandb logging + wikipedia plot notebook --- stl/offline/config_gen.py | 2 +- wikipedia/config.yml | 1 + wikipedia/log_data.py | 51 +++++ wikipedia/notebooks/Wikipedia Plots.ipynb | 263 ++++++++++++++++++++++ 4 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 wikipedia/log_data.py create mode 100644 wikipedia/notebooks/Wikipedia Plots.ipynb diff --git a/stl/offline/config_gen.py b/stl/offline/config_gen.py index 0248638..6e52b11 100644 --- a/stl/offline/config_gen.py +++ b/stl/offline/config_gen.py @@ -95,7 +95,7 @@ def run_lp(df: pd.DataFrame, max_n_fits=None, max_loss=None, objective="min_loss 
return plan_output, optimal_loss -def get_loss_per_key(key: int, csv_dir) +def get_loss_per_key(key: int, csv_dir): key_one = glob(f"{csv_dir}/slide_*_key_A4Benchmark-TS{key}.csv") assert len(key_one) > 0 diff --git a/wikipedia/config.yml b/wikipedia/config.yml index c6bb8a1..ea0ee7a 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -7,6 +7,7 @@ parsed_tmp_dir = %(data_dir)s/parsed_tmp/ diff_dir = %(data_dir)s/diffs/ embedding_dir = %(data_dir)s/embeddings/ exp_dir = %(data_dir)s/simulation_output/ +dpr_dir = /home/eecs/wooders/DPR [files] data_dir = /data/wooders/wikipedia diff --git a/wikipedia/log_data.py b/wikipedia/log_data.py new file mode 100644 index 0000000..1d8969f --- /dev/null +++ b/wikipedia/log_data.py @@ -0,0 +1,51 @@ +import wandb +import configparser +import os + + + + +if __name__ == "__main__": + + print("Running wandb logging on data") + run = wandb.init(job_type="dataset-creation", project="wiki-workload") + + # configuration file + config = configparser.ConfigParser() + config.read("config.yml") + + # log files + artifact = wandb.Artifact("files", type='dataset') + artifact.add_file(config["files"]["changes_file"]) + artifact.add_file(config["files"]["titles_file"]) + artifact.add_file(config["files"]["edits_file"]) + run.log_artifact(artifact) + + # log pageview + artifact = wandb.Artifact("pageviews", type='dataset') + artifact.add_file(config["files"]["raw_pageview_file"]) + artifact.add_file(config["files"]["pageview_file"]) + artifact.add_file(config["files"]["timestamp_weights_file"]) + run.log_artifact(artifact) + + # log questions file + artifact = wandb.Artifact("questions", type='dataset') + artifact.add_file(config["files"]["raw_questions_file"]) + artifact.add_file(config["files"]["questions_file"]) + run.log_artifact(artifact) + + # log simulation data + artifact = wandb.Artifact("simulation", type='dataset') + artifact.add_file(config["simulation"]["stream_edits_file"]) + 
artifact.add_file(config["simulation"]["stream_questions_file"]) + artifact.add_file(config["simulation"]["init_data_file"]) + run.log_artifact(artifact) + + + # log experiment output + artifact = wandb.Artifact("prediction_results", type='dataset') + files = os.listdir(config["directory"]["dpr_dir"]) + for filename in files: + if "plan-" in filename and '.json' in filename: + artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + run.log_artifact(artifact) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb new file mode 100644 index 0000000..ca4eab8 --- /dev/null +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e0030940", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import wandb\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "1e071d8f", + "metadata": {}, + "source": [ + "# Plot Wikipedia Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95457b57", + "metadata": {}, + "outputs": [], + "source": [ + "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", + "pageview_dir = run.use_artifact('pageviews:latest').download()\n", + "questions_dir = run.use_artifact('questions:latest').download()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f5bc6ae", + "metadata": {}, + "outputs": [], + "source": [ + "pageview_df = pd.read_csv(f\"{pageview_dir}/pageviews.csv\")\n", + "pageview_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38782d70", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " \"edit_frequency\": pageview_df.edit_count / pageview_df.edit_count.sum(),\n", + " \"query_frequency\": pageview_df[\"2021080600\"] / pageview_df[\"2021080600\"].sum()\n", + "})\n", + "\n", 
+ "df.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "e2e430fe", + "metadata": {}, + "source": [ + "# Plot DPR Model Accuracy Results " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c15f88fd", + "metadata": {}, + "outputs": [], + "source": [ + "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", + "artifact = run.use_artifact('prediction_results:latest')\n", + "artifact_dir = artifact.download()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43f6adb3", + "metadata": {}, + "outputs": [], + "source": [ + "artifact_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaf30e01", + "metadata": {}, + "outputs": [], + "source": [ + "constants = [0.01, 0.05, 0.1, 1, 5, 10]\n", + "policies = [\"lifo\"]\n", + "key_policies = [\"weighted_round_robin\", \"weighted_random\", \"random\", \"round_robin\"]\n", + "d = artifact_dir\n", + "metric = 'top5'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b31501", + "metadata": {}, + "outputs": [], + "source": [ + "all_results = {}\n", + "for policy in policies: \n", + " for key_policy in key_policies: \n", + " scores = []\n", + " name = f\"plan-{key_policy}_{policy}-always_process\"\n", + " for constant in constants: \n", + " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " scores.append(results[metric])\n", + " all_results[name] = scores\n", + "all_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b479a2bc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "all_results.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "332c0ff6", + "metadata": {}, + "outputs": [], + "source": [ + "plan_weighted_random_lifo = []\n", + "for constant in constants:\n", + " with open(f'{d}/plan-weighted_random_lifo-always_process-{constant}-100.json') as results_file:\n", + " 
results = json.load(results_file)\n", + " plan_weighted_random_lifo.append(results[metric])\n", + "print(plan_weighted_random_lifo)\n", + " \n", + "plan_weighted_longest_queue_lifo = []\n", + "for constant in constants:\n", + " with open(f'{d}/plan-weighted_longest_queue_lifo-always_process-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " plan_weighted_longest_queue_lifo.append(results[metric])\n", + "print(plan_weighted_longest_queue_lifo)\n", + "\n", + "plan_longest_queue_lifo = []\n", + "for constant in constants:\n", + " with open(f'{d}/plan-longest_queue_lifo-always_process-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " plan_longest_queue_lifo.append(results[metric])\n", + "print(plan_longest_queue_lifo)\n", + "\n", + "plan_random_lifo = []\n", + "for constant in constants:\n", + " with open(f'{d}/plan-random_lifo-always_process-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " plan_random_lifo.append(results[metric])\n", + "print(plan_random_lifo)\n", + "\n", + "plan_round_robin_lifo = []\n", + "for constant in constants:\n", + " with open(f'{d}/plan-round_robin_lifo-always_process-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " plan_round_robin_lifo.append(results[metric])\n", + "print(plan_round_robin_lifo)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d536763", + "metadata": {}, + "outputs": [], + "source": [ + "from pylab import rcParams\n", + "rcParams['figure.figsize'] = 12, 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e07c3e9", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "\n", + "df = pd.DataFrame({\n", + " 'Factor': resources, \n", + " **all_results\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = 
df.melt(id_vars='Factor').rename(columns=str.title)\n", + "seaborn.barplot(x='Factor', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "seaborn.despine(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61d517d0", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.set_xscale('log')\n", + "#resources = [10/c for c in constants]\n", + "resources = constants \n", + "print(resources)\n", + "ax.plot(resources, plan_weighted_longest_queue_lifo, label=\"LIFO Weighted Queue\", c='coral', marker='.')\n", + "ax.plot(resources, plan_longest_queue_lifo, label=\"LIFO Queue\", c='coral', marker='.', linestyle='dashed')\n", + "\n", + "ax.plot(resources, plan_weighted_random_lifo, label=\"LIFO Weighted Random\", c='red', marker='.')\n", + "ax.plot(resources, plan_random_lifo, label=\"LIFO Random\", c='red', marker='.', linestyle='dashed')\n", + "\n", + "#ax.plot(resources, plan_lifo_sample_half, label=\"LIFO Sample Half\", c='dodgerblue', marker='.', linestyle='dashed')\n", + "#ax.plot(resources, plan_lifo_always_process, label=\"LIFO Always\", c='dodgerblue', marker='.')\n", + "\n", + "#ax.plot(resources, plan_round_robin_lifo, label=\"LIFO Round Robin\", c='blue', marker='.', linestyle='dashed')\n", + "\n", + "ax.grid()\n", + "ax.set(xlabel='resource constraint', ylabel=f'{metric} accuracy', title='Passage Retriever')\n", + "plt.legend()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 01dc41b4204f878b1aa5e14ee02df1ae77daadbb Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 11 Oct 2021 22:14:01 -0700 Subject: [PATCH 12/26] change 
to scripts --- stl/offline/evaluation.py | 4 - wikipedia/config.yml | 3 +- wikipedia/log_data.py | 51 - wikipedia/notebooks/Wikipedia Plots.ipynb | 1737 +++++++++++++++++++- wikipedia/preprocessing/wiki_api_data.py | 10 +- wikipedia/run_0_generate_data.sh | 1 + wikipedia/run_1_generate_plan.sh | 16 + wikipedia/run_2_prepare_data.sh | 18 + wikipedia/run_3_run_predictions.sh | 26 + wikipedia/run_4_run_optimal_predictions.sh | 8 + wikipedia/simulate.py | 114 +- 11 files changed, 1850 insertions(+), 138 deletions(-) delete mode 100644 wikipedia/log_data.py create mode 100644 wikipedia/run_0_generate_data.sh create mode 100644 wikipedia/run_1_generate_plan.sh create mode 100644 wikipedia/run_2_prepare_data.sh create mode 100644 wikipedia/run_3_run_predictions.sh create mode 100644 wikipedia/run_4_run_optimal_predictions.sh diff --git a/stl/offline/evaluation.py b/stl/offline/evaluation.py index 3305a31..a32aab5 100644 --- a/stl/offline/evaluation.py +++ b/stl/offline/evaluation.py @@ -43,14 +43,11 @@ def predict(event, model): def offline_eval(yahoo_csv_path, plan_json_path, key, output_path): # get plan DF for key - param_path = "/data/wooders/eurosys-results/10-05/stl-offline/result/offline_1_slide/min_loss_plan.json" - policy_params = json.load(open(param_path)) plan_df = pd.read_json(plan_json_path) plan_df_key = plan_df[plan_df["key"] == int(key)] plan_df_key.index = pd.RangeIndex(start=0, stop=len(plan_df_key.index)) # get original data - print(yahoo_csv_path) df = pd.read_csv(yahoo_csv_path) df["timestamp"] = list(range(len(df))) @@ -72,7 +69,6 @@ def offline_eval(yahoo_csv_path, plan_json_path, key, output_path): #print("fit time", time.time() - st) offline_stl[row.processing_time] = trained - print(offline_stl) # Assign the trained model with every events in the source file. 
def find_freshest_model_version(event_time, model_versions): diff --git a/wikipedia/config.yml b/wikipedia/config.yml index ea0ee7a..ad09cc0 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -11,7 +11,7 @@ dpr_dir = /home/eecs/wooders/DPR [files] data_dir = /data/wooders/wikipedia -raw_questions_file = %(data_dir)s/10062021_filtered_questions.csv +raw_questions_file = %(data_dir)s/10042021_questions_revid_filtered.csv model_file = %(data_dir)s/bert-base-encoder.cp changes_file = %(data_dir)s/changes.csv titles_file = %(data_dir)s/top_titles.csv @@ -26,6 +26,7 @@ timestamp_weights_file = %(data_dir)s/timestamp_weights_file.json data_dir = /data/wooders/wikipedia plan_dir = /data/wooders/wiki-plans init_data_file = %(data_dir)s/init_data.json +optimal_plan_file = %(data_dir)s/optimal_plan.json stream_edits_file = %(data_dir)s/edit_stream.json stream_questions_file = %(data_dir)s/question_stream.json diff --git a/wikipedia/log_data.py b/wikipedia/log_data.py deleted file mode 100644 index 1d8969f..0000000 --- a/wikipedia/log_data.py +++ /dev/null @@ -1,51 +0,0 @@ -import wandb -import configparser -import os - - - - -if __name__ == "__main__": - - print("Running wandb logging on data") - run = wandb.init(job_type="dataset-creation", project="wiki-workload") - - # configuration file - config = configparser.ConfigParser() - config.read("config.yml") - - # log files - artifact = wandb.Artifact("files", type='dataset') - artifact.add_file(config["files"]["changes_file"]) - artifact.add_file(config["files"]["titles_file"]) - artifact.add_file(config["files"]["edits_file"]) - run.log_artifact(artifact) - - # log pageview - artifact = wandb.Artifact("pageviews", type='dataset') - artifact.add_file(config["files"]["raw_pageview_file"]) - artifact.add_file(config["files"]["pageview_file"]) - artifact.add_file(config["files"]["timestamp_weights_file"]) - run.log_artifact(artifact) - - # log questions file - artifact = wandb.Artifact("questions", type='dataset') - 
artifact.add_file(config["files"]["raw_questions_file"]) - artifact.add_file(config["files"]["questions_file"]) - run.log_artifact(artifact) - - # log simulation data - artifact = wandb.Artifact("simulation", type='dataset') - artifact.add_file(config["simulation"]["stream_edits_file"]) - artifact.add_file(config["simulation"]["stream_questions_file"]) - artifact.add_file(config["simulation"]["init_data_file"]) - run.log_artifact(artifact) - - - # log experiment output - artifact = wandb.Artifact("prediction_results", type='dataset') - files = os.listdir(config["directory"]["dpr_dir"]) - for filename in files: - if "plan-" in filename and '.json' in filename: - artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) - run.log_artifact(artifact) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index ca4eab8..29bf8cb 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 237, "id": "e0030940", "metadata": {}, "outputs": [], @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "1e071d8f", + "id": "594d6d4e", "metadata": {}, "source": [ "# Plot Wikipedia Dataset" @@ -24,10 +24,58 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "95457b57", + "execution_count": 238, + "id": "016e13bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:1i504fr1) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:1i504fr1). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Syncing run resilient-planet-72 to Weights & Biases (docs).
\n", + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "pageview_dir = run.use_artifact('pageviews:latest').download()\n", @@ -36,10 +84,385 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "3f5bc6ae", + "execution_count": 239, + "id": "7690f6d7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titleedit_count2021080500202108060020210807002021080800202108090020210810002021081100...20210828002021082900202108300020210831002021090100202109020020210903002021090400weightsdoc_id
00Deaths in 20211877383536313496656...69506368505239460.02851165984422
112021 Atlantic hurricane season14381151689714...820285121150.00380557798785
22Neeraj Chopra11563732434...560492130.00217051150040
33Fall of Kabul (2021)10091891212161012...11169920155100.00487668481047
44Great Britain at the 2020 Summer Paralympics989135641689...3868107470.00339760043578
..................................................................
211211List of fungi of South Africa203897132149...10761135560.00346768354495
212212Mister Supranational 2021203897132149...10761135560.00346767918135
2132132021–22 FC Barcelona season20219292927282723...21262916272043180.01269867089631
214214Hamid Karzai International Airport20114261517261417...1910251326142270.007258487602
215215Characters of the Marvel Cinematic Universe20114261517261417...1910251326142270.00725862372638
\n", + "

216 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title edit_count \\\n", + "0 0 Deaths in 2021 1877 \n", + "1 1 2021 Atlantic hurricane season 1438 \n", + "2 2 Neeraj Chopra 1156 \n", + "3 3 Fall of Kabul (2021) 1009 \n", + "4 4 Great Britain at the 2020 Summer Paralympics 989 \n", + ".. ... ... ... \n", + "211 211 List of fungi of South Africa 203 \n", + "212 212 Mister Supranational 2021 203 \n", + "213 213 2021–22 FC Barcelona season 202 \n", + "214 214 Hamid Karzai International Airport 201 \n", + "215 215 Characters of the Marvel Cinematic Universe 201 \n", + "\n", + " 2021080500 2021080600 2021080700 2021080800 2021080900 2021081000 \\\n", + "0 38 35 36 31 349 66 \n", + "1 11 5 16 8 9 7 \n", + "2 3 7 3 2 4 3 \n", + "3 18 9 12 12 16 10 \n", + "4 13 5 6 4 16 8 \n", + ".. ... ... ... ... ... ... \n", + "211 8 9 7 13 21 4 \n", + "212 8 9 7 13 21 4 \n", + "213 19 29 29 27 28 27 \n", + "214 14 26 15 17 26 14 \n", + "215 14 26 15 17 26 14 \n", + "\n", + " 2021081100 ... 2021082800 2021082900 2021083000 2021083100 \\\n", + "0 56 ... 69 50 63 68 \n", + "1 14 ... 8 20 2 8 \n", + "2 4 ... 5 6 0 4 \n", + "3 12 ... 11 16 9 9 \n", + "4 9 ... 3 8 6 8 \n", + ".. ... ... ... ... ... ... \n", + "211 9 ... 10 7 6 1 \n", + "212 9 ... 10 7 6 1 \n", + "213 23 ... 21 26 29 16 \n", + "214 17 ... 19 10 25 13 \n", + "215 17 ... 19 10 25 13 \n", + "\n", + " 2021090100 2021090200 2021090300 2021090400 weights doc_id \n", + "0 50 52 39 46 0.028511 65984422 \n", + "1 5 12 11 5 0.003805 57798785 \n", + "2 9 2 1 3 0.002170 51150040 \n", + "3 20 15 5 10 0.004876 68481047 \n", + "4 10 7 4 7 0.003397 60043578 \n", + ".. ... ... ... ... ... ... 
\n", + "211 13 5 5 6 0.003467 68354495 \n", + "212 13 5 5 6 0.003467 67918135 \n", + "213 27 20 43 18 0.012698 67089631 \n", + "214 26 14 22 7 0.007258 487602 \n", + "215 26 14 22 7 0.007258 62372638 \n", + "\n", + "[216 rows x 36 columns]" + ] + }, + "execution_count": 239, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pageview_df = pd.read_csv(f\"{pageview_dir}/pageviews.csv\")\n", "pageview_df" @@ -47,10 +470,31 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "38782d70", + "execution_count": 240, + "id": "5b5d1edc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 240, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df = pd.DataFrame({\n", " \"edit_frequency\": pageview_df.edit_count / pageview_df.edit_count.sum(),\n", @@ -62,7 +506,7 @@ }, { "cell_type": "markdown", - "id": "e2e430fe", + "id": "1ca13ffa", "metadata": {}, "source": [ "# Plot DPR Model Accuracy Results " @@ -70,10 +514,106 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c15f88fd", + "execution_count": 241, + "id": "39b1975e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:h5pqozf8) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Waiting for W&B process to finish, PID 38831... (success)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value=' 0.68MB of 0.68MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced resilient-planet-72: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/h5pqozf8
\n", + "Find logs at: ./wandb/run-20211011_174530-h5pqozf8/logs
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:h5pqozf8). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Syncing run stoic-blaze-73 to Weights & Biases (docs).
\n", + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "artifact = run.use_artifact('prediction_results:latest')\n", @@ -82,34 +622,146 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "43f6adb3", + "execution_count": 242, + "id": "101571e2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'./artifacts/prediction_results:v1'" + ] + }, + "execution_count": 242, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "artifact_dir" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 289, + "id": "c106b186", + "metadata": {}, + "outputs": [], + "source": [ + "artifact_dir = \"/home/eecs/wooders/DPR\"" + ] + }, + { + "cell_type": "code", + "execution_count": 290, "id": "eaf30e01", "metadata": {}, "outputs": [], "source": [ - "constants = [0.01, 0.05, 0.1, 1, 5, 10]\n", + "constants = [0.01, 0.05, 0.1, 1, 5]\n", "policies = [\"lifo\"]\n", - "key_policies = [\"weighted_round_robin\", \"weighted_random\", \"random\", \"round_robin\"]\n", + "key_policies = [\"random\", \"weighted_random\"]\n", "d = artifact_dir\n", "metric = 'top5'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 291, + "id": "25eb4c0d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.1-100.json\n", + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-1-100.json\n", + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-5-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.01-100.json\n", + 
"/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.1-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-1-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-5-100.json\n" + ] + }, + { + "data": { + "text/plain": [ + "{'plan-random_lifo-always_process': [0.41722204591135087,\n", + " 0.41605839416058393,\n", + " 0.3628477731936951,\n", + " 0.19216121866074262,\n", + " 0.20681265206812652],\n", + " 'plan-weighted_random_lifo-always_process': [0.4166931132973659,\n", + " 0.4052681688352904,\n", + " 0.3899820162911245,\n", + " 0.256373637998519,\n", + " 0.17237913889770443]}" + ] + }, + "execution_count": 291, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_results = {}\n", + "for policy in policies: \n", + " for key_policy in key_policies: \n", + " scores = []\n", + " name = f\"plan-{key_policy}_{policy}-always_process\"\n", + " for constant in constants: \n", + " print(f'{d}/{name}-{constant}-100.json')\n", + " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " scores.append(results[metric])\n", + " all_results[name] = scores\n", + "all_results" + ] + }, + { + "cell_type": "code", + "execution_count": 228, "id": "d3b31501", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'plan-weighted_round_robin_lifo-always_process': [0.7134603189515466,\n", + " 0.14824122570088702,\n", + " 0.13923792971165966,\n", + " 0.14143003656121067,\n", + " 0.1144984381238697,\n", + " 0.12701693402541278],\n", + " 'plan-weighted_random_lifo-always_process': [0.7100077506635037,\n", + " 0.1506368853293249,\n", + " 0.15045681940954037,\n", + " 0.13972332479977453,\n", + " 0.11063093532501898,\n", + " 0.12261706242024253],\n", + " 'plan-random_lifo-always_process': [0.699352545584079,\n", + " 
0.14759142259905583,\n", + " 0.15119274099474678,\n", + " 0.12415936616796236,\n", + " 0.11849120417126617,\n", + " 0.11131988319202073],\n", + " 'plan-round_robin_lifo-always_process': [0.6862547071580117,\n", + " 0.14121082587625558,\n", + " 0.14572030282390336,\n", + " 0.12318857599173262,\n", + " 0.11423225372070993,\n", + " 0.11082665915087175]}" + ] + }, + "execution_count": 228, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "all_results = {}\n", "for policy in policies: \n", @@ -126,22 +778,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 229, "id": "b479a2bc", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['plan-weighted_round_robin_lifo-always_process', 'plan-weighted_random_lifo-always_process', 'plan-random_lifo-always_process', 'plan-round_robin_lifo-always_process'])" + ] + }, + "execution_count": 229, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "all_results.keys()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 230, "id": "332c0ff6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.7100077506635037, 0.1506368853293249, 0.15045681940954037, 0.13972332479977453, 0.11063093532501898, 0.12261706242024253]\n", + "[0.11061527741895076, 0.11378600339776562, 0.11060744846591665, 0.11156258073607817, 0.11068573799625776, 0.11357462166584463]\n", + "[0.11424008267374404, 0.11315185820200264, 0.127032591931481, 0.11396606931755017, 0.11010639547173356, 0.11089711972817876]\n", + "[0.699352545584079, 0.14759142259905583, 0.15119274099474678, 0.12415936616796236, 0.11849120417126617, 0.11131988319202073]\n", + "[0.6862547071580117, 0.14121082587625558, 0.14572030282390336, 0.12318857599173262, 0.11423225372070993, 0.11082665915087175]\n" + ] + } + ], "source": [ "plan_weighted_random_lifo = []\n", "for constant in 
constants:\n", @@ -181,25 +856,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 231, "id": "6d536763", "metadata": {}, "outputs": [], "source": [ "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 12, 10" + "rcParams['figure.figsize'] = 12, 6" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 232, "id": "1e07c3e9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn\n", - "\n", + "resources = constants \n", "df = pd.DataFrame({\n", " 'Factor': resources, \n", " **all_results\n", @@ -212,10 +898,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 233, "id": "61d517d0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.01, 0.05, 0.1, 1, 5, 10]\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 233, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fig, ax = plt.subplots()\n", "ax.set_xscale('log')\n", @@ -237,6 +951,961 @@ "ax.set(xlabel='resource constraint', ylabel=f'{metric} accuracy', title='Passage Retriever')\n", "plt.legend()" ] + }, + { + "cell_type": "markdown", + "id": "cdf98fa5", + "metadata": {}, + "source": [ + "## Observe how often each key was updated " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "91736ad0", + "metadata": {}, + "outputs": [], + "source": [ + "plan_dir = '/data/wooders/wiki-plans'\n", + "diff_dir = '/data/wooders/wikipedia/diffs'" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "2cdeefee", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict \n", + "\n", + "def evaluate_plan(plan_file, optimal_file, start_ts=0, end_ts=37000): \n", + " plan = json.load(open(plan_file))\n", + " optimal_plan = json.load(open(optimal_file))\n", + " \n", + "\n", + " title_counts = defaultdict(lambda: 0)\n", + " title_counts_opt = defaultdict(lambda: 0)\n", + "\n", + " for ts in plan.keys(): \n", + " if float(ts) < start_ts or float(ts) > end_ts: continue \n", + " for edit in plan[ts]: \n", + " edit_file = edit[0]\n", + " edit_data = json.load(open(f\"{diff_dir}/{edit_file}\"))\n", + " title = edit_data['title']\n", + " title_counts[title] += 1\n", + " \n", + " for ts in optimal_plan.keys(): \n", + " if float(ts) < start_ts or float(ts) > end_ts: continue \n", + " for edit in optimal_plan[ts]: \n", + " edit_file = edit[0]\n", + " edit_data = json.load(open(f\"{diff_dir}/{edit_file}\"))\n", + " title = edit_data['title']\n", + " title_counts_opt[title] += 1\n", + " \n", + " #assert title_counts_opt != title_counts\n", + " \n", + " title_counts_df = pd.DataFrame({\"title\": title_counts.keys(), \"updates\": title_counts.values()})\n", + " title_counts_opt_df = pd.DataFrame({\"title\": title_counts_opt.keys(), \"optimal_updates\": 
title_counts_opt.values()})\n", + " \n", + " plan_data_df = title_counts_df.merge(pageview_df, on=\"title\")\n", + " plan_data_df = plan_data_df.merge(title_counts_opt_df, on=\"title\")\n", + " plan_data_df[\"pageviews\"] = plan_data_df[\"2021090300\"]\n", + " return plan_data_df" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "d6440018", + "metadata": {}, + "outputs": [], + "source": [ + "plan_names = [\n", + " 'plan-round_robin_lifo-always_process-5-100',\n", + " 'plan-weighted_round_robin_lifo-always_process-5-100',\n", + " 'plan-random_lifo-always_process-5-100',\n", + " 'plan-weighted_random_lifo-always_process-5-100' \n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "523bf657", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "plan-round_robin_lifo-always_process-5-100\n", + "plan-weighted_round_robin_lifo-always_process-5-100\n", + "plan-random_lifo-always_process-5-100\n", + "plan-weighted_random_lifo-always_process-5-100\n" + ] + } + ], + "source": [ + "results = {}\n", + "end_ts = 1000\n", + "for plan_name in plan_names:\n", + " print(plan_name)\n", + " plan_file = f'{plan_dir}/{plan_name}.json'\n", + " plan_data_df = evaluate_plan(plan_file, f'/home/eecs/wooders/experiments/wikipedia/optimal_plan.json', end_ts=end_ts)\n", + " results[plan_name] = plan_data_df\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "d1958139", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results['plan-round_robin_lifo-always_process-5-100'].set_index(\"title\").sort_values(by=\"pageviews\").tail(10)[[\"updates\", \"optimal_updates\", \"pageviews\"]].plot(kind=\"bar\", title=\"Updates for Top Documents (Weighted Round Robin)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "3ea99ccb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results['plan-round_robin_lifo-always_process-5-100'].set_index(\"title\").sort_values(by=\"pageviews\").head(10)[[\"updates\", \"optimal_updates\", \"pageviews\"]].plot(kind=\"bar\", title=\"Updates for Least Queried Documents (Round Robin)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "999fc591", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"updates\", kind=\"hist\")" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "00d43d3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"updates\", kind=\"hist\")" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "7af47144", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"optimal_updates\", kind=\"hist\")" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "4d55378e", + "metadata": {}, + "outputs": [], + "source": [ + "optimal_plan_df = evaluate_plan(f'/home/eecs/wooders/experiments/wikipedia/optimal_plan.json', f'/home/eecs/wooders/experiments/wikipedia/optimal_plan.json', end_ts=end_ts)" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "id": "739fdc68", + "metadata": {}, + "outputs": [], + "source": [ + "n_fits = np.array(range(0, 250, 1)) #optimal_plan_df[\"updates\"].unique()\n", + "n_fits.sort()\n", + "n_fits_map = {v: i for i, v in enumerate(n_fits)}\n", + "n_fits_ticks = {i: v for i, v in enumerate(n_fits)}" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "c34cc7c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0,\n", + " 1: 1,\n", + " 2: 2,\n", + " 3: 3,\n", + " 4: 4,\n", + " 5: 5,\n", + " 6: 6,\n", + " 7: 7,\n", + " 8: 8,\n", + " 9: 9,\n", + " 10: 10,\n", + " 11: 11,\n", + " 12: 12,\n", + " 13: 13,\n", + " 14: 14,\n", + " 15: 15,\n", + " 16: 16,\n", + " 17: 17,\n", + " 18: 18,\n", + " 19: 19,\n", + " 20: 20,\n", + " 21: 21,\n", + " 22: 22,\n", + " 23: 23,\n", + " 24: 24,\n", + " 25: 25,\n", + " 26: 26,\n", + " 27: 27,\n", + " 28: 28,\n", + " 29: 29,\n", + " 30: 30,\n", + " 31: 31,\n", + " 32: 32,\n", + " 33: 33,\n", + " 34: 34,\n", + " 35: 35,\n", + " 36: 36,\n", + " 37: 37,\n", + " 38: 38,\n", + " 39: 39,\n", + " 40: 40,\n", + " 41: 41,\n", + " 42: 42,\n", + " 43: 43,\n", + " 44: 44,\n", + " 45: 45,\n", + " 46: 46,\n", + " 47: 47,\n", + " 48: 48,\n", + " 49: 49,\n", + " 50: 50,\n", + " 51: 51,\n", + " 52: 52,\n", + " 53: 53,\n", + " 54: 54,\n", + " 55: 55,\n", + " 56: 56,\n", + " 57: 57,\n", + " 58: 58,\n", + " 59: 59,\n", + " 60: 60,\n", + " 61: 61,\n", + " 62: 62,\n", + " 
63: 63,\n", + " 64: 64,\n", + " 65: 65,\n", + " 66: 66,\n", + " 67: 67,\n", + " 68: 68,\n", + " 69: 69,\n", + " 70: 70,\n", + " 71: 71,\n", + " 72: 72,\n", + " 73: 73,\n", + " 74: 74,\n", + " 75: 75,\n", + " 76: 76,\n", + " 77: 77,\n", + " 78: 78,\n", + " 79: 79,\n", + " 80: 80,\n", + " 81: 81,\n", + " 82: 82,\n", + " 83: 83,\n", + " 84: 84,\n", + " 85: 85,\n", + " 86: 86,\n", + " 87: 87,\n", + " 88: 88,\n", + " 89: 89,\n", + " 90: 90,\n", + " 91: 91,\n", + " 92: 92,\n", + " 93: 93,\n", + " 94: 94,\n", + " 95: 95,\n", + " 96: 96,\n", + " 97: 97,\n", + " 98: 98,\n", + " 99: 99,\n", + " 100: 100,\n", + " 101: 101,\n", + " 102: 102,\n", + " 103: 103,\n", + " 104: 104,\n", + " 105: 105,\n", + " 106: 106,\n", + " 107: 107,\n", + " 108: 108,\n", + " 109: 109,\n", + " 110: 110,\n", + " 111: 111,\n", + " 112: 112,\n", + " 113: 113,\n", + " 114: 114,\n", + " 115: 115,\n", + " 116: 116,\n", + " 117: 117,\n", + " 118: 118,\n", + " 119: 119,\n", + " 120: 120,\n", + " 121: 121,\n", + " 122: 122,\n", + " 123: 123,\n", + " 124: 124,\n", + " 125: 125,\n", + " 126: 126,\n", + " 127: 127,\n", + " 128: 128,\n", + " 129: 129,\n", + " 130: 130,\n", + " 131: 131,\n", + " 132: 132,\n", + " 133: 133,\n", + " 134: 134,\n", + " 135: 135,\n", + " 136: 136,\n", + " 137: 137,\n", + " 138: 138,\n", + " 139: 139,\n", + " 140: 140,\n", + " 141: 141,\n", + " 142: 142,\n", + " 143: 143,\n", + " 144: 144,\n", + " 145: 145,\n", + " 146: 146,\n", + " 147: 147,\n", + " 148: 148,\n", + " 149: 149,\n", + " 150: 150,\n", + " 151: 151,\n", + " 152: 152,\n", + " 153: 153,\n", + " 154: 154,\n", + " 155: 155,\n", + " 156: 156,\n", + " 157: 157,\n", + " 158: 158,\n", + " 159: 159,\n", + " 160: 160,\n", + " 161: 161,\n", + " 162: 162,\n", + " 163: 163,\n", + " 164: 164,\n", + " 165: 165,\n", + " 166: 166,\n", + " 167: 167,\n", + " 168: 168,\n", + " 169: 169,\n", + " 170: 170,\n", + " 171: 171,\n", + " 172: 172,\n", + " 173: 173,\n", + " 174: 174,\n", + " 175: 175,\n", + " 176: 176,\n", + " 177: 177,\n", + " 
178: 178,\n", + " 179: 179,\n", + " 180: 180,\n", + " 181: 181,\n", + " 182: 182,\n", + " 183: 183,\n", + " 184: 184,\n", + " 185: 185,\n", + " 186: 186,\n", + " 187: 187,\n", + " 188: 188,\n", + " 189: 189,\n", + " 190: 190,\n", + " 191: 191,\n", + " 192: 192,\n", + " 193: 193,\n", + " 194: 194,\n", + " 195: 195,\n", + " 196: 196,\n", + " 197: 197,\n", + " 198: 198,\n", + " 199: 199,\n", + " 200: 200,\n", + " 201: 201,\n", + " 202: 202,\n", + " 203: 203,\n", + " 204: 204,\n", + " 205: 205,\n", + " 206: 206,\n", + " 207: 207,\n", + " 208: 208,\n", + " 209: 209,\n", + " 210: 210,\n", + " 211: 211,\n", + " 212: 212,\n", + " 213: 213,\n", + " 214: 214,\n", + " 215: 215,\n", + " 216: 216,\n", + " 217: 217,\n", + " 218: 218,\n", + " 219: 219,\n", + " 220: 220,\n", + " 221: 221,\n", + " 222: 222,\n", + " 223: 223,\n", + " 224: 224,\n", + " 225: 225,\n", + " 226: 226,\n", + " 227: 227,\n", + " 228: 228,\n", + " 229: 229,\n", + " 230: 230,\n", + " 231: 231,\n", + " 232: 232,\n", + " 233: 233,\n", + " 234: 234,\n", + " 235: 235,\n", + " 236: 236,\n", + " 237: 237,\n", + " 238: 238,\n", + " 239: 239,\n", + " 240: 240,\n", + " 241: 241,\n", + " 242: 242,\n", + " 243: 243,\n", + " 244: 244,\n", + " 245: 245,\n", + " 246: 246,\n", + " 247: 247,\n", + " 248: 248,\n", + " 249: 249}" + ] + }, + "execution_count": 209, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_fits_map" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "c768b43d", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "sns.set(style=\"whitegrid\", palette=\"muted\")" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "fc803fd4", + "metadata": {}, + "outputs": [], + "source": [ + "max_fits = 60 " + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "cfe761f5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(12, 12))\n", + "for i, plan_name in enumerate(results.keys()):\n", + " plan_file = f'{plan_dir}/{plan_name}.json'\n", + " plan_data_df = results[plan_name]\n", + " plt.subplot(4, 2, i + 1)\n", + " #plan, loss = run_lp(df, max_n_fits=max_n_fits)\n", + " #arr = np.array([(key, n_fits_map[fits]) for (key, fits) in plan.items()])\n", + " vals = plan_data_df[\"updates\"].tolist()\n", + " arr = np.array([(i, vals[i]) for i in range(len(vals))])\n", + " plt.scatter(arr[:, 0], arr[:, 1], label=max_n_fits)\n", + " plt.yticks(ticks=list(n_fits_ticks.keys()), labels=list(n_fits_ticks.values()))\n", + " plt.xlabel(\"key\")\n", + " plt.ylabel(\"n_fits\")\n", + " plt.legend()\n", + " plt.title(plan_name)\n", + "plt.suptitle(\"Sample plan selection\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "id": "ac3582ce", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"/data/wooders/wikipedia/10042021_questions_revid_filtered.csv\", sep=\"\\t\")\n", + "df.columns = [\"question\", \"answer\", \"doc_id\", \"timestamp\", \"revid\", \"oldrevid\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "id": "e7485904", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionanswerdoc_idtimestamprevidoldrevid
0what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:16:27.42857210372125321037212489
1what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:32:54.85714410372125321037212489
2what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:49:22.28571610372125321037212489
3what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:05:49.71428810372125321037212489
4what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:22:17.14286010372125321037212489
.....................
127727who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 20:46:09.23070010416509361041650818
127728who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 21:30:27.69223610416509361041650818
127729who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:14:46.15377210416509361041650818
127730who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:59:04.61530810416509361041650818
127731who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 23:43:23.07684410416509361041650818
\n", + "

127732 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 what is the most common death in 2021??????? \n", + "1 what is the most common death in 2021??????? \n", + "2 what is the most common death in 2021??????? \n", + "3 what is the most common death in 2021??????? \n", + "4 what is the most common death in 2021??????? \n", + "... ... \n", + "127727 who is the ayo?????? \n", + "127728 who is the ayo?????? \n", + "127729 who is the ayo?????? \n", + "127730 who is the ayo?????? \n", + "127731 who is the ayo?????? \n", + "\n", + " answer doc_id \\\n", + "0 A typical entry reports information in the fol... 65984422 \n", + "1 A typical entry reports information in the fol... 65984422 \n", + "2 A typical entry reports information in the fol... 65984422 \n", + "3 A typical entry reports information in the fol... 65984422 \n", + "4 A typical entry reports information in the fol... 65984422 \n", + "... ... ... \n", + "127727 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127728 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127729 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127730 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127731 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "\n", + " timestamp revid oldrevid \n", + "0 2021-08-06 00:16:27.428572 1037212532 1037212489 \n", + "1 2021-08-06 00:32:54.857144 1037212532 1037212489 \n", + "2 2021-08-06 00:49:22.285716 1037212532 1037212489 \n", + "3 2021-08-06 01:05:49.714288 1037212532 1037212489 \n", + "4 2021-08-06 01:22:17.142860 1037212532 1037212489 \n", + "... ... ... ... 
\n", + "127727 2021-09-01 20:46:09.230700 1041650936 1041650818 \n", + "127728 2021-09-01 21:30:27.692236 1041650936 1041650818 \n", + "127729 2021-09-01 22:14:46.153772 1041650936 1041650818 \n", + "127730 2021-09-01 22:59:04.615308 1041650936 1041650818 \n", + "127731 2021-09-01 23:43:23.076844 1041650936 1041650818 \n", + "\n", + "[127732 rows x 6 columns]" + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "id": "2d5a778a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "332667 32284\n", + "1305297 10610\n", + "66304621 5330\n", + "17888363 3900\n", + "67089631 3621\n", + "Name: doc_id, dtype: int64" + ] + }, + "execution_count": 320, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.doc_id.value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "id": "f7546f77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "60043578 590\n", + "51150040 510\n", + "68187748 470\n", + "66187257 450\n", + "64783122 370\n", + "Name: doc_id, dtype: int64" + ] + }, + "execution_count": 308, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.doc_id.value_counts().tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 309, + "id": "5823be22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "332667 5090\n", + "65984422 3769\n", + "68553225 1740\n", + "57798785 1410\n", + "56185392 1360\n", + "68294454 1200\n", + "66293350 1040\n", + "57817558 930\n", + "60043578 590\n", + "51150040 510\n", + "68187748 470\n", + "66187257 450\n", + "64783122 370\n", + "Name: doc_id, dtype: int64" + ] + }, + "execution_count": 309, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.doc_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 310, + "id": 
"a8154736", + "metadata": {}, + "outputs": [], + "source": [ + "weights = df.doc_id.value_counts().to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 311, + "id": "48960f20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{332667: 5090,\n", + " 65984422: 3769,\n", + " 68553225: 1740,\n", + " 57798785: 1410,\n", + " 56185392: 1360,\n", + " 68294454: 1200,\n", + " 66293350: 1040,\n", + " 57817558: 930,\n", + " 60043578: 590,\n", + " 51150040: 510,\n", + " 68187748: 470,\n", + " 66187257: 450,\n", + " 64783122: 370}" + ] + }, + "execution_count": 311, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ade5579", + "metadata": {}, + "outputs": [], + "source": [ + "we" + ] } ], "metadata": { diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index 4928566..fc4b01d 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -15,12 +15,16 @@ from multiprocessing import Pool +import wandb + # from concurrent.futures import ProcessPoolExecutor # from generate diffs file (originally from DPR repo... 
sorry kevin) from generate_diffs import generate_sentence_level_diffs from embedding import generate_embeddings +from log_data import log_files, log_pageview, log_simulation, log_questions + def query_recentchanges(start_time, end_time, revision_file): from bs4 import BeautifulSoup @@ -589,7 +593,7 @@ def check_dataset( if __name__ == "__main__": - print("starting script") + run = wandb.init(job_type="dataset-creation", project="wiki-workload") # configuration file config = configparser.ConfigParser() @@ -662,6 +666,7 @@ def check_dataset( print("Generated titles file", titles_file) edits_df = get_edits(edits_file, changes_file, titles_file) print("Generated edits file", edits_file) + log_files(run, config) # query document versions for list of titles if args.run_query_doc_versions: @@ -681,10 +686,12 @@ def check_dataset( if args.run_get_questions: questions_df = get_questions(raw_questions_file, questions_file) print("Generated questions file", raw_questions_file, questions_file) + log_questions(run, config) # generate pageviews / compute page weights if args.run_get_pageviews: get_pageviews(raw_pageview_file, pageview_file, edits_file, timestamp_weights_file) + log_pageview(run, config) # generate diffs between document versions if args.run_generate_diffs: @@ -704,6 +711,7 @@ def check_dataset( stream_edits_file, stream_questions_file, ) + log_simulation(run, config) # run tests to validate simulation data if args.run_check_dataset: diff --git a/wikipedia/run_0_generate_data.sh b/wikipedia/run_0_generate_data.sh new file mode 100644 index 0000000..5d92ff6 --- /dev/null +++ b/wikipedia/run_0_generate_data.sh @@ -0,0 +1 @@ +python preprocessing/wiki_api_data.py --run_generate_simulation_data --run_get_questions diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh new file mode 100644 index 0000000..debb1e6 --- /dev/null +++ b/wikipedia/run_1_generate_plan.sh @@ -0,0 +1,16 @@ +set -xe + +for key_policy in "random" "weighted_random" 
"round_robin" "weighted_round_robin" +do + for event_policy in "fifo" "lifo" + do + for load_shedding_policy in "always_process" + do + for model_runtime in 0.01 0.05 0.1 1 5 10 + do + python simulate.py --model_runtime $model_runtime --send_rate 100 \ + --event_policy $event_policy --key_policy $key_policy --load_shedding_policy $load_shedding_policy + done + done + done +done diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh new file mode 100644 index 0000000..ff2b74e --- /dev/null +++ b/wikipedia/run_2_prepare_data.sh @@ -0,0 +1,18 @@ +set -xe + +plan_dir=/data/wooders/wiki-plans + +for key_policy in "round_robin" "weighted_round_robin" #"random" "weighted_random" +do + for event_policy in "lifo" "fifo" + do + for load_shedding_policy in "always_process" + do + for model_runtime in 0.01 0.05 0.1 1 5 10 + do + python wiki_eval.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json + done + done + done +done +p diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh new file mode 100644 index 0000000..6655996 --- /dev/null +++ b/wikipedia/run_3_run_predictions.sh @@ -0,0 +1,26 @@ +set -xe + +plan_dir=/data/wooders/wiki-plans +dpr_dir=~/DPR + +cd $dpr_dir + +for key_policy in "round_robin" "weighted_round_robin" +#for key_policy in "random" "weighted_random" +do + for event_policy in "lifo" + do + for load_shedding_policy in "always_process" + do + for model_runtime in 0.01 0.05 0.1 1 5 + do + plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 + echo $plan_file + CUDA_VISIBLE_DEVICES=1,2,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + pid=$! 
+ done + #wait $pid + done + done +done +p diff --git a/wikipedia/run_4_run_optimal_predictions.sh b/wikipedia/run_4_run_optimal_predictions.sh new file mode 100644 index 0000000..542d828 --- /dev/null +++ b/wikipedia/run_4_run_optimal_predictions.sh @@ -0,0 +1,8 @@ +set -xe + +plan_dir=/data/wooders/wiki-plans +dpr_dir=~/DPR +cd $dpr_dir +plan_file="optimal_plan" +echo $plan_file +CUDA_VISIBLE_DEVICES=5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index d636647..5162e99 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -7,9 +7,12 @@ import random import configparser +import argparse import pandas as pd +import wandb + import simpy from ralf.state import Record from ralf.policies.load_shedding_policy import ( @@ -39,6 +42,8 @@ from typing import Dict, List, Tuple, Type +from preprocessing.log_data import log_plans + def current_weights(ts, ts_to_weights): ts = int(ts) min_dist = max(list(ts_to_weights.keys())) @@ -89,15 +94,12 @@ def __init__(self, pageview_file, all_keys): #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" if self.weights[key] == 0: self.weights[key] = 1 - print(self.weights) for key in self.weights.keys(): for i in range(self.weights[key]): self.cur_key_set.append(str(key)) random.shuffle(self.cur_key_set) - print(self.cur_key_set) - print("size", len(self.cur_key_set)) self.cur_key_iter = itertools.cycle(self.cur_key_set) @@ -125,15 +127,12 @@ def __init__(self, timestamp_weights_file): self.weights[key] = int(self.raw_weights[key]*1000) assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" - print(self.weights) for key in self.weights.keys(): for i in range(self.weights[key]): self.cur_key_set.append(str(key)) random.shuffle(self.cur_key_set) - print(self.cur_key_set) - print("size", len(self.cur_key_set)) self.cur_key_iter = itertools.cycle(self.cur_key_set) @@ -168,10 +167,7 @@ def choose(self, 
per_key_queues: Dict[str, PerKeyPriorityQueue], timestamp: int) keys.append(key) weights.append(weights_map[key]) total_len += size - print(weights) - print(keys) chosen_key = random.choices(keys, weights, k=1)[0] - print("choose", chosen_key, keys, weights) return chosen_key @@ -180,7 +176,6 @@ class WeightedLoadBalancer(CrossKeyLoadBalancer): def __init__(self, pageview_file): pageview_df = pd.read_csv(pageview_file) self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - #print(self.weights) def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None @@ -278,7 +273,6 @@ def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: max_len = size total_len += size per_key_queues[chosen_key].clear() - print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) return chosen_key @@ -372,7 +366,7 @@ def run(self, replica_id: int): def run_once( out_path: str, prioritization_policy: str, - load_sheeding_policy: str, + load_shedding_policy: str, keys: List[str], per_key_records_per_second: int, total_runtime_s: float, @@ -387,7 +381,7 @@ def run_once( key: PerKeyPriorityQueue( env, processing_policy=policies[prioritization_policy], - load_shedding_policy=policies[load_sheeding_policy], + load_shedding_policy=policies[load_shedding_policy], ) for key in keys } @@ -425,41 +419,67 @@ def run_once( if __name__ == "__main__": + # argument flags + parser = argparse.ArgumentParser() + parser.add_argument("--send_rate", type=int) + parser.add_argument("--model_runtime", type=float) + parser.add_argument("--total_runtime", type=float, default=len(edits)) + parser.add_argument("--event_policy", type=str) + parser.add_argument("--key_policy", type=str) + parser.add_argument("--load_shedding_policy", type=str) + args = parser.parse_args() + + out_path = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}.json" + print(out_path) + 
run_once( + out_path=out_path, + prioritization_policy=args.event_policy, + load_shedding_policy=args.load_shedding_policy, + keys=keys, + per_key_records_per_second=args.send_rate, + total_runtime_s=args.total_runtime, + model_runtime_constant=args.model_runtime, + key_selection_policy=args.key_policy, + ) + run = wandb.init(job_type="dataset-creation", project="wiki-workload") + log_plans(run, config, out_path) + + # load sheding: random, drop short edits # prioritization: prioritize most recent version # cross-key prioritzation: historical page views, # policies - prioritization_policies = ["lifo"] # ["fifo", "lifo"] - #key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin"] - key_selection_policies = ["round_robin"] - load_shedding_policies = ["always_process"] - #model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] - model_runtimes = [0.02, 0.05, 0.07] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] - records_per_second = [100] - - output_files = [] - - for key_selection in key_selection_policies: - for prio_policy in prioritization_policies: - for load_shed_policy in load_shedding_policies: - for runtime in model_runtimes: - for rate in records_per_second: - - out_path = f"{plan_dir}/plan-{key_selection}_{prio_policy}-{load_shed_policy}-{runtime}-{rate}.json" - print("running", out_path, runtime) - run_once( - out_path, - prio_policy, - load_shed_policy, - keys, - per_key_records_per_second=rate, - total_runtime_s=len(edits), - model_runtime_constant=runtime, - key_selection_policy=key_selection, - ) - - output_files.append(out_path) - print("DONE", out_path) - for f in output_files: - print(f) - open("plans.txt", "w").write("\n".join(output_files)) + #prioritization_policies = ["lifo"] # ["fifo", "lifo"] + ##key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", 
"weighted_longest_queue", "longest_queue", "random", "round_robin"] + #key_selection_policies = ["round_robin"] + #load_shedding_policies = ["always_process"] + ##model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + #model_runtimes = [0.02, 0.05, 0.07] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] + #records_per_second = [100] + + #output_files = [] + + #for key_selection in key_selection_policies: + # for prio_policy in prioritization_policies: + # for load_shed_policy in load_shedding_policies: + # for runtime in model_runtimes: + # for rate in records_per_second: + + # out_path = f"{plan_dir}/plan-{key_selection}_{prio_policy}-{load_shed_policy}-{runtime}-{rate}.json" + # print("running", out_path, runtime) + # run_once( + # out_path, + # prio_policy, + # load_shed_policy, + # keys, + # per_key_records_per_second=rate, + # total_runtime_s=len(edits), + # model_runtime_constant=runtime, + # key_selection_policy=key_selection, + # ) + + # output_files.append(out_path) + # print("DONE", out_path) + #for f in output_files: + # print(f) + #open("plans.txt", "w").write("\n".join(output_files)) From 7728c353930d300ca67c31ce14d75a40f31a99ed Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 12 Oct 2021 15:58:39 -0700 Subject: [PATCH 13/26] parallelize preprocessing --- wikipedia/notebooks/Wikipedia Plots.ipynb | 1540 ++------------------ wikipedia/preprocessing/log_data.py | 72 + wikipedia/run_1_generate_plan.sh | 4 +- wikipedia/run_2_prepare_data.sh | 2 +- wikipedia/run_3_run_predictions.sh | 6 +- wikipedia/run_4_run_optimal_predictions.sh | 4 +- wikipedia/run_5_pipeline_predict.sh | 30 + wikipedia/simulate.py | 56 +- wikipedia/wiki_eval.py | 213 +-- 9 files changed, 353 insertions(+), 1574 deletions(-) create mode 100644 wikipedia/preprocessing/log_data.py create mode 100644 wikipedia/run_5_pipeline_predict.sh diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 
29bf8cb..08f0f5a 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 237, + "execution_count": null, "id": "e0030940", "metadata": {}, "outputs": [], @@ -24,58 +24,10 @@ }, { "cell_type": "code", - "execution_count": 238, + "execution_count": null, "id": "016e13bb", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Finishing last run (ID:1i504fr1) before initializing another..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully finished last run (ID:1i504fr1). Initializing new run:
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Syncing run resilient-planet-72 to Weights & Biases (docs).
\n", - "\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "pageview_dir = run.use_artifact('pageviews:latest').download()\n", @@ -84,385 +36,10 @@ }, { "cell_type": "code", - "execution_count": 239, + "execution_count": null, "id": "7690f6d7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0titleedit_count2021080500202108060020210807002021080800202108090020210810002021081100...20210828002021082900202108300020210831002021090100202109020020210903002021090400weightsdoc_id
00Deaths in 20211877383536313496656...69506368505239460.02851165984422
112021 Atlantic hurricane season14381151689714...820285121150.00380557798785
22Neeraj Chopra11563732434...560492130.00217051150040
33Fall of Kabul (2021)10091891212161012...11169920155100.00487668481047
44Great Britain at the 2020 Summer Paralympics989135641689...3868107470.00339760043578
..................................................................
211211List of fungi of South Africa203897132149...10761135560.00346768354495
212212Mister Supranational 2021203897132149...10761135560.00346767918135
2132132021–22 FC Barcelona season20219292927282723...21262916272043180.01269867089631
214214Hamid Karzai International Airport20114261517261417...1910251326142270.007258487602
215215Characters of the Marvel Cinematic Universe20114261517261417...1910251326142270.00725862372638
\n", - "

216 rows × 36 columns

\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 title edit_count \\\n", - "0 0 Deaths in 2021 1877 \n", - "1 1 2021 Atlantic hurricane season 1438 \n", - "2 2 Neeraj Chopra 1156 \n", - "3 3 Fall of Kabul (2021) 1009 \n", - "4 4 Great Britain at the 2020 Summer Paralympics 989 \n", - ".. ... ... ... \n", - "211 211 List of fungi of South Africa 203 \n", - "212 212 Mister Supranational 2021 203 \n", - "213 213 2021–22 FC Barcelona season 202 \n", - "214 214 Hamid Karzai International Airport 201 \n", - "215 215 Characters of the Marvel Cinematic Universe 201 \n", - "\n", - " 2021080500 2021080600 2021080700 2021080800 2021080900 2021081000 \\\n", - "0 38 35 36 31 349 66 \n", - "1 11 5 16 8 9 7 \n", - "2 3 7 3 2 4 3 \n", - "3 18 9 12 12 16 10 \n", - "4 13 5 6 4 16 8 \n", - ".. ... ... ... ... ... ... \n", - "211 8 9 7 13 21 4 \n", - "212 8 9 7 13 21 4 \n", - "213 19 29 29 27 28 27 \n", - "214 14 26 15 17 26 14 \n", - "215 14 26 15 17 26 14 \n", - "\n", - " 2021081100 ... 2021082800 2021082900 2021083000 2021083100 \\\n", - "0 56 ... 69 50 63 68 \n", - "1 14 ... 8 20 2 8 \n", - "2 4 ... 5 6 0 4 \n", - "3 12 ... 11 16 9 9 \n", - "4 9 ... 3 8 6 8 \n", - ".. ... ... ... ... ... ... \n", - "211 9 ... 10 7 6 1 \n", - "212 9 ... 10 7 6 1 \n", - "213 23 ... 21 26 29 16 \n", - "214 17 ... 19 10 25 13 \n", - "215 17 ... 19 10 25 13 \n", - "\n", - " 2021090100 2021090200 2021090300 2021090400 weights doc_id \n", - "0 50 52 39 46 0.028511 65984422 \n", - "1 5 12 11 5 0.003805 57798785 \n", - "2 9 2 1 3 0.002170 51150040 \n", - "3 20 15 5 10 0.004876 68481047 \n", - "4 10 7 4 7 0.003397 60043578 \n", - ".. ... ... ... ... ... ... 
\n", - "211 13 5 5 6 0.003467 68354495 \n", - "212 13 5 5 6 0.003467 67918135 \n", - "213 27 20 43 18 0.012698 67089631 \n", - "214 26 14 22 7 0.007258 487602 \n", - "215 26 14 22 7 0.007258 62372638 \n", - "\n", - "[216 rows x 36 columns]" - ] - }, - "execution_count": 239, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pageview_df = pd.read_csv(f\"{pageview_dir}/pageviews.csv\")\n", "pageview_df" @@ -470,31 +47,10 @@ }, { "cell_type": "code", - "execution_count": 240, + "execution_count": null, "id": "5b5d1edc", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 240, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "df = pd.DataFrame({\n", " \"edit_frequency\": pageview_df.edit_count / pageview_df.edit_count.sum(),\n", @@ -514,106 +70,10 @@ }, { "cell_type": "code", - "execution_count": 241, + "execution_count": null, "id": "39b1975e", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Finishing last run (ID:h5pqozf8) before initializing another..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Waiting for W&B process to finish, PID 38831... (success)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Label(value=' 0.68MB of 0.68MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", - "
Synced resilient-planet-72: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/h5pqozf8
\n", - "Find logs at: ./wandb/run-20211011_174530-h5pqozf8/logs
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully finished last run (ID:h5pqozf8). Initializing new run:
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Syncing run stoic-blaze-73 to Weights & Biases (docs).
\n", - "\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "artifact = run.use_artifact('prediction_results:latest')\n", @@ -622,29 +82,18 @@ }, { "cell_type": "code", - "execution_count": 242, + "execution_count": null, "id": "101571e2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'./artifacts/prediction_results:v1'" - ] - }, - "execution_count": 242, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "artifact_dir" ] }, { "cell_type": "code", - "execution_count": 289, - "id": "c106b186", + "execution_count": null, + "id": "03e14929", "metadata": {}, "outputs": [], "source": [ @@ -653,60 +102,24 @@ }, { "cell_type": "code", - "execution_count": 290, + "execution_count": null, "id": "eaf30e01", "metadata": {}, "outputs": [], "source": [ "constants = [0.01, 0.05, 0.1, 1, 5]\n", "policies = [\"lifo\"]\n", - "key_policies = [\"random\", \"weighted_random\"]\n", + "key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", - "metric = 'top5'" + "metric = 'top10'" ] }, { "cell_type": "code", - "execution_count": 291, - "id": "25eb4c0d", + "execution_count": null, + "id": "96209574", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.01-100.json\n", - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.05-100.json\n", - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.1-100.json\n", - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-1-100.json\n", - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-5-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.01-100.json\n", - 
"/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.05-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.1-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-1-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-5-100.json\n" - ] - }, - { - "data": { - "text/plain": [ - "{'plan-random_lifo-always_process': [0.41722204591135087,\n", - " 0.41605839416058393,\n", - " 0.3628477731936951,\n", - " 0.19216121866074262,\n", - " 0.20681265206812652],\n", - " 'plan-weighted_random_lifo-always_process': [0.4166931132973659,\n", - " 0.4052681688352904,\n", - " 0.3899820162911245,\n", - " 0.256373637998519,\n", - " 0.17237913889770443]}" - ] - }, - "execution_count": 291, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "all_results = {}\n", "for policy in policies: \n", @@ -724,44 +137,10 @@ }, { "cell_type": "code", - "execution_count": 228, + "execution_count": null, "id": "d3b31501", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'plan-weighted_round_robin_lifo-always_process': [0.7134603189515466,\n", - " 0.14824122570088702,\n", - " 0.13923792971165966,\n", - " 0.14143003656121067,\n", - " 0.1144984381238697,\n", - " 0.12701693402541278],\n", - " 'plan-weighted_random_lifo-always_process': [0.7100077506635037,\n", - " 0.1506368853293249,\n", - " 0.15045681940954037,\n", - " 0.13972332479977453,\n", - " 0.11063093532501898,\n", - " 0.12261706242024253],\n", - " 'plan-random_lifo-always_process': [0.699352545584079,\n", - " 0.14759142259905583,\n", - " 0.15119274099474678,\n", - " 0.12415936616796236,\n", - " 0.11849120417126617,\n", - " 0.11131988319202073],\n", - " 'plan-round_robin_lifo-always_process': [0.6862547071580117,\n", - " 0.14121082587625558,\n", - " 0.14572030282390336,\n", - " 0.12318857599173262,\n", - " 0.11423225372070993,\n", - " 0.11082665915087175]}" - ] - }, - "execution_count": 228, - 
"metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "all_results = {}\n", "for policy in policies: \n", @@ -770,6 +149,7 @@ " name = f\"plan-{key_policy}_{policy}-always_process\"\n", " for constant in constants: \n", " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", + " print(f'{d}/{name}-{constant}-100.json')\n", " results = json.load(results_file)\n", " scores.append(results[metric])\n", " all_results[name] = scores\n", @@ -778,45 +158,22 @@ }, { "cell_type": "code", - "execution_count": 229, + "execution_count": null, "id": "b479a2bc", "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['plan-weighted_round_robin_lifo-always_process', 'plan-weighted_random_lifo-always_process', 'plan-random_lifo-always_process', 'plan-round_robin_lifo-always_process'])" - ] - }, - "execution_count": 229, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "all_results.keys()" ] }, { "cell_type": "code", - "execution_count": 230, + "execution_count": null, "id": "332c0ff6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0.7100077506635037, 0.1506368853293249, 0.15045681940954037, 0.13972332479977453, 0.11063093532501898, 0.12261706242024253]\n", - "[0.11061527741895076, 0.11378600339776562, 0.11060744846591665, 0.11156258073607817, 0.11068573799625776, 0.11357462166584463]\n", - "[0.11424008267374404, 0.11315185820200264, 0.127032591931481, 0.11396606931755017, 0.11010639547173356, 0.11089711972817876]\n", - "[0.699352545584079, 0.14759142259905583, 0.15119274099474678, 0.12415936616796236, 0.11849120417126617, 0.11131988319202073]\n", - "[0.6862547071580117, 0.14121082587625558, 0.14572030282390336, 0.12318857599173262, 0.11423225372070993, 0.11082665915087175]\n" - ] - } - ], + "outputs": [], "source": [ "plan_weighted_random_lifo = []\n", "for constant in constants:\n", @@ -856,7 +213,7 @@ 
}, { "cell_type": "code", - "execution_count": 231, + "execution_count": null, "id": "6d536763", "metadata": {}, "outputs": [], @@ -867,21 +224,10 @@ }, { "cell_type": "code", - "execution_count": 232, + "execution_count": null, "id": "1e07c3e9", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn\n", @@ -898,38 +244,10 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": null, "id": "61d517d0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0.01, 0.05, 0.1, 1, 5, 10]\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 233, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtcAAAGMCAYAAAARL470AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABw/0lEQVR4nO3de3zO9f/H8ce1MztgY0zmWM7G5Jgop0hjvlF8famQToqSkNOcIkoqilRIy0/fUhbKITlUXzlUihxzSow5jpmdr98f7+3aLtsYbbt2eN5vt+t2Xdfn/fl8rtd1bdf2vN7X+/P+WKxWqxUREREREfnHnBxdgIiIiIhIUaFwLSIiIiKSSxSuRURERERyicK1iIiIiEguUbgWEREREcklCtciIiIiIrlE4VpERPLc448/zpdffunoMkRE8pxF81yLiORcu3btOHv2LM7OzpQoUYJ77rmHsWPH4unp6ejSbtmoUaNYuXIlrq6uuLq6Uq9ePcaOHUuNGjVuuO3s2bM5duwYr7/+ej5UKiJS8KnnWkTkJs2bN49ff/2VL7/8kl27djF37lxHl/SPDRw4kF9//ZXNmzdTvnx5xowZkyv7tVqtpKSk5Mq+blZSUpJDHldEijeFaxGRW1S+fHlat27NwYMHiY6O5sknn6RFixY0bdqUJ598klOnTtnW/eKLL2jfvj3BwcG0a9eOr776CoBjx47Rt29f7rzzTpo3b87zzz9v22bKlCncc889NG7cmAcffJAdO3bY2uLi4hg5ciRNmzbl/vvv5/3336dNmza29tOnT/Pcc8/RokUL2rVrx+LFi3P0nDw8PLj//vvZt2/fDfe1efNm3nvvPb755huCg4Pp1q0bAP369WPWrFn07t2bhg0bcvz4cfr168dnn31m2+fnn3/O/fffT9OmTRk4cCAnTpwAYPz48UyfPt2upqeffpqFCxfe8HnNnj2bIUOGMHz4cBo3bqxhKCLiEArXIiK3KDIyks2bN1OnTh1SUlJ48MEH2bBhAxs2bMDd3Z1JkyYBEBsby5QpU3j//ff59ddfWbp0KXXq1AHgrbfeolWrVmzfvp3NmzfTt29f2/4bNGjA8uXL2bZtGyEhIQwdOpT4+HgA5syZw4kTJ/j2229ZuHChLawDpKSk8PTTT1OrVi02b97MRx99xEcffcT3339/w+cUGxvLypUrqVy58g331aZNG5588knuv/9+fv31V7saIiIimDx5Mr/88gsVK1a0e4xvv/2W9957jzlz5rBlyxbuvPNOXnzxRQC6du3K119/TdqIxejoaH788Ue6dOmSo+e1fv16OnfuzI4dO+jatWvOf5giIrlE4VpE5CYNHjyYJk2a0KdPH5o2bcpTTz1FmTJl6NSpEyVKlMDLy4unn36a7du327ZxcnLi4MGDxMXF4e/vzx133
AGAi4sLJ0+eJCoqCnd3d5o0aWLbJjQ0lDJlyuDi4sKAAQNISEjgyJEjAHzzzTc8+eSTlCpVigoVKvDII4/Yttu1axfnz5/n2Wefxc3NjcDAQB5++GG+/vrrbJ/TggULaNKkCY0bN+bnn39mxowZt7wvgH/961/ccccduLi44Orqate2dOlSnnjiCWrUqIGLiwtPPfUUe/fu5cSJEzRp0gSLxWLrpV+zZg2NGjWifPnyOaqlUaNGdOjQAScnJzw8PK5bo4hIXnBxdAEiIoXNO++8w1133WW37OrVq0ybNo3vv/+e6OhoAK5cuUJycjIlS5Zk1qxZLFiwgDFjxtC4cWNGjhxJjRo1eOmll3jrrbfo2bMnpUqVon///vTs2RMwgfezzz4jKioKi8VCTEwMFy5cACAqKoqAgADb41eoUMF2+8SJE0RFRdkF9eTkZLv71xowYAAvvPACJ0+e5PHHH+fIkSPUrl37lvYF2NV2rZMnTzJ16lS74R9Wq5XTp09z22230aVLF1auXEnTpk1ZsWKFbbhJTmrJ+DqIiDiCwrWISC5YsGABR44c4b///S/lypVj7969dO/e3Ta8oXXr1rRu3Zq4uDjefPNNxo0bx5IlSyhXrhxTpkwBYMeOHfTv35+mTZty5swZ3n//fRYtWsQdd9yBk5MTTZs2te2vXLlynDp1ittvvx3Abnx3QEAAlSpVYu3atTf9PCpWrMiYMWMYOXIkbdu2veG+LBbLTS1Pq++pp56yheZrhYSEMGDAAJ544gl+//133nnnHdt2N3pe13tcEZH8oGEhIiK54MqVK7i7u+Pj48PFixeZM2eOre3s2bOsX7+e2NhY3NzcKFmyJM7OzoAZ3pEWjEuVKoXFYsHJyYkrV67g7OyMr68vSUlJzJkzh5iYGNs+77//ft577z2io6M5ffo04eHhtragoCC8vLyYP38+cXFxJCcnc+DAAX7//fccPZdWrVrh7+/Pp59+esN9+fn5ceLEiZuaEaR3797Mnz+fgwcPAnD58mW++eYbW3vdunXx9fVl7Nix3H333fj4+OTK8xIRyQ8K1yIiueDRRx8lPj6eFi1a0KtXL1q3bm1rS0lJYeHChbRu3ZpmzZqxfft2wsLCADOm+aGHHiI4OJinn36aMWPGEBgYyN13302bNm3o1KkT7dq1w93d3W6oxeDBg6lQoQLt27fnscceo1OnTri5uQHg7OzM3Llz2bdvH+3bt6dFixaMHTvWLpzfyOOPP84HH3xAcnLydffVuXNnAJo3b86//vWvHO27Y8eOPP744wwbNozGjRsTEhLC5s2b7dZ54IEH+N///kdISIhtWW48LxGRvKaTyIiIFAFLlizh66+/tuvBFhGR/KeeaxGRQigqKoqff/6ZlJQUDh8+zMKFC+nQoYOjyxIRKfZ0QKOISCGUmJhIWFgYf//9N97e3jzwwAP06dPH0WWJiBR7GhYiIiIiIpJLikzPdUpKCleuXMHV1VVTMYmIiIhInrFarSQmJuLp6YmTk/0o6yITrq9cucKBAwccXYaIiIiIFBM1a9bE29vbblmRCddpp9etWbOmbToqEREREZHclpCQwIEDB2z5M6MiE67ThoK4ubnh7u7u4GpEREREpKjLaiiypuITEREREcklCtciIiIiIrlE4VpEREREJJcUmTHXIiIiUvgkJiby999/ExcX5+hSRDLx8PCgUqVKWR64mB2FaxEREXGYtLOMVq1aVeepkALFarVy7tw5/v77b6pVq5bj7TQsRERERBwmLi4OPz8/BWspcCwWC35+fjf9rYrCtYiIiDiUgrUUVLfyu6lwLSIiIiKSSxSuRURERFK1a9eOAwcOZFrer18/NmzYAMDs2bNp2bIloaGhtktMTAwAkZGRDBkyhPbt29OxY0cGDhyY5f4Ahg8fzvz58233w8PDqVu3rm1fACEhIWzZsuW6NYeGhuZo6EJ2zw1g0aJFnDt37ob7yMqoUaMIDw/Psi06OprRo
0fTvn17OnXqRK9evfjpp59u6XEKC4VrERERKXyO74Pvl5lrB+jevTsRERG2i5eXF4mJiQwYMIDg4GDWr1/PunXreOihh+jfvz/R0dGZ9tG8eXO2bdtmu79t2zbq16/Pjh07ADh//jzHjh0jODj4urVERETg4eHxj57P4sWLbzlcX8/QoUPx8vJi7dq1rFmzhhdffJGhQ4dy6NChXH+sgkKzheSGLVtg40a4915o2dLR1YiIiBROOzfAr+tvvF58LJw+ClYrWCxQviq4l7z+NsHtoVHb3KgyW6tWrcLb25v+/fvblnXu3JnVq1cTHh7O4MGD7dZv3rw5U6dOJSkpCRcXF/bs2cOwYcPYunUr9957L9u2bSMoKAgPDw8OHz7M1KlTuXDhAomJiTz66KP06NEDgFq1avHLL7/g6enJjh07mDhxom3/69ev57333qNmzZoAfPPNN4wbN44zZ84wYMAA+vbty9y5c4mKimLIkCG4u7szc+ZMKleuzKxZs9i+fTuJiYnUrFmTCRMm4OnpyenTpxkxYgQXLlygUqVKJCcnZ/l6bN++nSNHjvD+++/j7OwMQLNmzejZsyfvvfceM2bMYNSoUdSvX5++ffsC2N2PiYlh2rRp7N+/n/j4eJo3b87LL7+Ms7Mz7dq1Y968ebbnlfH+9V6r/KCe639q40a45x4YNw7atzdBW0RERPJO3BUTrMFcx13J9xKWL19uGxKSFmb3799Pw4YNM63bqFEj9u/fn2l55cqVKVWqFH/88Qd//vknVapUoUWLFmzfvh0wPdnNmzcnKSmJ4cOH8/LLL7Ns2TKWLFnC/PnzM/X+JiQkMGzYMMLCwlixYgXNmzfn5MmTduvExcXx6aefsnjxYmbOnMmVK1d4+umn8ff35+233yYiIoLbb7+dDz74AG9vbz7//HMiIiLw9/e3DWGZMmUKTZs25auvvuLll1+2633PaP/+/dSrVy/THNGNGjXKdnhKRtOmTaNp06a2Gs6fP8+yZcuuu01OX6u8pJ7rf2rtWkhMNLcTEkzYVu+1iIjIzWvUNme9y8f3wUdhkJwEzi7Q4wUIrJ339WXQvXt3Ro4cabfMmhb4b0KzZs3YunUrXl5eNGvWDF9fX+Lj44mJiWHbtm2MHTuWo0ePcujQIYYNG2bbLjExkcOHD1OjRg3bssOHD+Ph4UGTJk0A6NixIz4+PnaP16VLFwAqVaqEj48Pp06dsttHmu+++46YmBjWrFkDmOBeu7Z5jbdu3crYsWMBCAwMpGU2ued6r0dOZuH47rvv+P3331m4cCFgPhiUL1/+utvk9LXKSwrX/1TXrjBjBiQng6urGRoiIiIieSewNjw6EY7+AVXr5Xuwzk7t2rVZsmRJpuU7d+60DV+4VrNmzVi9ejXe3t7069cPgODgYNatW8dff/1FcHAwf/31F2XKlCEiIuKGNdwotLq7u9tuOzs7Zzukw2q1EhYWlm1wzonatWvzwQcfkJiYaNd7vXPnTts4cmdnZ1JSUmxt8fHxdjW8++67BAYGZtp3dttZrdYcv1Z5Jd+GhRw5coRevXrZjhQ9evRopnVGjBhhd+Rt7dq1Wb8+B2OvHKllSwgPN2O+undXr7WIiEh+CKwNrXsUmGANplc4Ojra1tMKsHr1arZt22YbU3yt5s2b88svv7Br1y4aNGgAQNOmTZk3bx4NGzbE3d2datWq4eHhwfLly23bHTp0yG5WEYDq1asTGxvLzz//DMC3337LpUuXclS7p6cnly9ftt1v164dixYtss1CEhMTYxta0aJFC9vwjOPHj2c7m0nTpk2pUqUKr732mi3Eb9++nXXr1vHEE08AZmjMrl27AIiKimLr1q12NcyfP9+27fnz5zl+/Him7bZs2cLZs2cBcvxa5aV867kOCwujT58+hIaGEhERwfjx41m8eLHdOjNmzLDd3rdvH48++iitW7fOrxJvXZUqEBwMX3wBJ09CxYqOrkhERERuUf/+/W0H4AGsW
LEiR9u5ubmxYMECXn31VT7++GOcnJwIDAxkwYIFlC5dOsttAgMDKV26NIGBgbbe3WbNmnH06FFCQkIAcHFxYd68eUydOpUPP/yQlJQU/Pz8ePPNNzM9/syZM5kwYQIeHh60aNGCsmXL4u3tfcPaH3nkEUaPHo2HhwczZ87kiSeeYM6cOfTs2ROLxYLFYuHZZ5+lRo0ajBkzhhEjRrB69WqqVatGq1atst3v22+/zfTp0+nYsSNWq5XY2FgiIiKoUKECAA8//DBDhgyhW7duVK1alaCgINu2o0eP5rXXXiM0NBSLxYKrqyujR48mMDCQoUOHMmrUKD777DMaN25MxdTsldPXKi9ZrLcyQOgmnTt3jk6dOrF161bbVxDNmzdn7dq1+Pr6ZrnNlClTAGxjem4kPj6e3bt3U79+fbuvPPLcli3mQMb4eEhJgR494PPP8+/xRURECrG9e/dSp04dR5dRZMTExODl5QXATz/9xKhRo/juu+9wcnL8HBYxMTG88MIL+Pr6Mm3atAJRU05k9Tt6vdyZLz3XkZGRlC9f3vYp0NnZGX9/fyIjI7MM1wkJCaxYsYJFixblR3n/zLffQlxc+nRAX34Jx46Z3mwRERGRfLR27VoWLVqE1Wq19WQXlBDr5eXF+++/7+gy8lyBPKDx22+/pWLFioXjk2z79jBhggnX7u6QlARTpkAx+OURERGRguXBBx/kwQcfdHQZxVq+fJQJCAjg9OnTtgHpycnJREVFERAQkOX6y5Yty9fJvv+Ru+6CtHE8Tz1lLgsXQhE+85CIiIiIZC1fwrWfnx916tRh5cqVAKxcuZI6depkOSTk1KlT/Pzzz7ZB/IXCc89B27awZAk8/7yZkm/SJEdXJSIiIiL5LN8G4UyYMIHw8HA6depEeHi47WxGgwYNsk2lAvDll1/Stm3bbI+qLbBeeQWiouC//4XBg830fPv2OboqEREREclH+TJbSH5w2GwhGb32GnTrBr6+UK0ahITA0qWOqUVERKQQ0GwhUtDd7GwhBePw0aLipZegVi0oVw6GDoVPP4UMvfIiIiIiUrQpXOe2Q4fg3/+GRx8FHx8IC3N0RSIiIpJD7dq148CBA5mW9+vXjw0bNgAwe/ZsWrZsaXdW6bQzAEZGRjJkyBDat29Px44dGThwYJb7SxMdHc3o0aNp37697SzWP/30U948OckXBXIqvkItMdGMu65YEYYNM9P0/fwz3HmnoysTEREpOo7vg6N/QNV6DjkFevfu3Rk5cqTdssTERAYMGMDDDz/M22+/DZjTn/fv35+vv/6aUqVKZdrP0KFDqVmzJmvXrsXZ2Zlt27bx3HPPsWTJEmrUqJEvz0Vyl8J1bqtdGx55BN55B375Bd5+G8aPh1WrHF2ZiIhIwbcwizMz12sFze6HhHj4ZDLEx8Lpo+kncLu7B7T/D1y5BP+dkXn7pp2h/t15XvqqVavw9vamf//+tmWdO3dm9erVhIeHM3jwYLv1t2/fzpEjR3j//fdtJ9pr1qwZPXv25L333mPGjBmMGjWK+vXr07dvXwC7+zExMUybNo39+/cTHx9P8+bNefnll3F2dqZdu3bMmzePmjVrAtjdP3z4MFOnTuXChQskJiby6KOPFp4pkAsBDQvJC2Fh5lTob79txmF//bU5TbqIiIj8c3FXTLAGc332RL6XsHz5ctuQkLQZ0Pbv30/Dhg0zrduoUSP279+fafn+/fupV68erq6umda/3lCSNNOmTaNp06Z8/vnnREREcP78eZYtW3bdbZKSkhg+fDgvv/wyy5YtY8mSJcyfP59DOj9HrlHPdV6oWhUGDYL5803v9RtvwLhx5lTpIiIikr3+U7Jvc3M37cf3wUdhkJwEzi5wVzfT7ulz/e1zUVbDQm52ArbrrW+xWG64/Xfffcfvv//OwoULAYiLi6N8+fLX3ebo0aMcOnSIYcOG2ZYlJiZy+PBhDUPJJQrXeWXsWHNAY6VK8PLLZvz1pk1wzz2Or
kxERKRwC6wNj0506JjrrNSuXZslS5ZkWr5z507b8Ixr1//ggw9ITEy0673euXMnwcHBADg7O5OSkmJri4+Pt922Wq28++67BAYGZtp3dttZrVbKlClDRETELTxDyQkNC8krAQEwbRqUKWNOiV6xoum9LhrTiouIiDhWYG1o3aPABGuALl26EB0dbetJBnNA47Zt22xjpjNq2rQpVapU4bXXXiM5ORkw47DXrVvHE088AUDlypVtJ9uLiopi69attu3btWvH/PnzbdueP3+e48ePZ9puy5YtnD17FoBq1arh4eHB8uXLbfs5dOiQbbYT+efUc53X1q0zPdajR8Ozz5r7993n6KpEREQkG/3797cdYAiwYsWKHG3n5ubGggULePXVV/n4449xcnIiMDCQBQsWZHvm6bfffpvp06fTsWNHrFYrsbGxREREUKFCBQAefvhhhgwZQrdu3ahatSpBQUG2bUePHs1rr71GaGgoFosFV1dXRo8eTWBgIEOHDmXUqFF89tlnNG7cmIoVKwLg4uLCvHnzmDp1Kh9++CEpKSn4+fnx5ptv3tqLJZnoDI15bcIEmDjRHNDYqxdUqAA//WSObhYRESnmdIbGdDExMbzwwgv4+voybdo0nJw0wKAg0BkaC5phw8zp0CdNMsNCtm3TtHwiIiKSiZeXF++//z7Tp09XsC7E9JPLaz4+MHIkfPMN3H47VK9uQnaGgwxEREREpGhQuM4Pzz5rhoOEhZnLzp3w5ZeOrkpEREREcpnCdX4oWRJmzDBjrv/9b3MWx7AwSD26V0RERESKBoXr/NKvHzzzDLi6moMc//gDPv3U0VWJiIiISC5SuM5PSUnw3ntmHHaDBiZkJyU5uioRERERySUK1/lt1iwYMcIMCzl4EMLDHV2RiIiIpGrXrh0HDhzItLxfv35s2LABgNmzZ9OyZUtCQ0Ntl7STsERGRjJkyBDat29Px44dGThwYJb7Axg+fDjz58+33Q8PD6du3bp2J3QJCQlhy5Yt1605NDSUuLi4W35uAIsWLeLcuXM33EdWRo0aRXg2eaZWrVp07dqVbt260bVrV9avX39Lj3E9tWrV4sqVK7m+31ulcJ2fXFzMnNe7d0NcHNx5p7mfkODoykRERAqXLVvMmZBvEDzzSvfu3YmIiLBdvLy8SExMZMCAAQQHB7N+/XrWrVvHQw89RP/+/YmOjs60j+bNm7Nt2zbb/W3btlG/fn127NgBmDMuHjt2zHYq9OxERETg4eHxj57P4sWLbzlc38jSpUv56quvGD58OMOHDyepiH9rrzM05reHHjJ/DMLCTC92t26wcCE8+aSjKxMREXGsxYthwYIbrxcdDb//bqa1dXKCoCAoVer62wwYAI88kjt1ZmPVqlV4e3vTv39/27LOnTuzevVqwsPDGTx4sN36zZs3Z+rUqSQlJeHi4sKePXsYNmwYW7du5d5772Xbtm0EBQXh4eHB4cOHmTp1KhcuXCAxMZFHH32UHj16AKbn9pdffsHT05MdO3YwceJE2/7Xr1/Pe++9R82aNQH45ptvGDduHGfOnGHAgAH07duXuXPnEhUVxZAhQ3B3d2fmzJlUrlyZWbNmsX37dhITE6lZsyYTJkzA09OT06dPM2LECC5cuEClSpVsp1+/kebNmxMbG8ulS5fw9fVlwYIFrFq1iuTkZNzd3ZkwYYLtZC21atXihRdeYN26dVy8eJERI0bQqVMnANauXcsbb7xB6dKladOmjd1jbN68mTfeeIPk5GR8fX2ZNGkSVapUYevWrbzyyisEBQXx22+/4eLiwowZM5gzZw4HDx4kICCA2bNnU7JkyVv4ydtTz3V+c3KCyZPh0CGIjISWLWHKFNOTLSIiIjcWHZ1+voiUFHM/ny1fvtw2JCQtzO7fv5+GDRtmWrdRo0bs378/0/LKlStTqlQp/vjjD/7880+qVKlCixYt2L59O2B6sps3b05SUhLDhw/n5ZdfZtmyZSxZsoT58+dz6
NAhu/0lJCQwbNgwwsLCWLFiBc2bN+fkyZN268TFxfHpp5+yePFiZs6cyZUrV3j66afx9/fn7bffJiIigttvv50PPvgAb29vPv/8cyIiIvD397cNYZkyZQpNmzblq6++4uWXX7brfb+edevW0aJFC3x9fQHT+79s2TKWL1/O0KFDCQsLs1vfy8uLZcuWMWPGDKZMmQLAuXPnGDduHO+++y5Lly7F1dXVtv65c+cYMWIEr7/+OitWrCAkJIThw4fb2g8dOsR//vMfVqxYQaNGjRg4cCAvv/wyX3/9NU5OTqzKpZP8qefaEUJCzCfoGjVM0O7QAebPhyFDHF2ZiIiI4zzySM56l7dsgfbtzbBKNzf45BPTWZWPunfvzsiRI+2WWa3Wm95Ps2bN2Lp1K15eXjRr1gxfX1/i4+OJiYlh27ZtjB07lqNHj3Lo0CGGDRtm2y4xMZHDhw9To0YN27LDhw/j4eFBkyZNAOjYsSM+Pj52j9elSxcAKlWqhI+PD6dOnbLbR5rvvvuOmJgY1qxZA5jgXrt2bQC2bt3K2LFjAQgMDKTlDV773r17c+XKFc6dO2c3Nnv37t289957REdHY7FYOHr0aJa1NmrUiKioKOLj49m5cyd169alevXqAPTq1YvXX38dgN9++43atWtz++23A9CjRw8mTpxoG8NerVo1W8943bp1OXnyJBUqVACgXr16HDt27LrPI6cUrh3BYoEPPzS3rVa4916YOhUef9zMiS0iIiLZa9kS1q+HjRvN/9B8DtbZqV27NkuWLMm0fOfOnbZhGddq1qwZq1evxtvbm379+gEQHBzMunXr+OuvvwgODuavv/6iTJkyRERE3LAGi8Vy3XZ3d3fbbWdn52yHdFitVsLCwm4YnHNi6dKleHp68uGHHzJkyBBWr16NxWJh6NChhIeHU69ePU6fPp1piEdarc7OzgAkJSVd9wOM1Wq97vN3c3Oz3XZ2ds70WsTHx9/S87uWhoU40oULMGkSvPwynD4N777r6IpEREQKh5Ytzf/PAhKswfS0RkdHs3DhQtuy1atXs23bNvr27ZvlNs2bN+eXX35h165dNGjQAICmTZsyb948GjZsiLu7O9WqVcPDw4Ply5fbtjt06JDdrCIA1atXJzY2lp9//hmAb7/9lkuXLuWodk9PTy5fvmy7365dOxYtWmSbhSQmJsY2DKVFixYsW7YMgOPHj99wNpM0AwYMwM/Pj6VLl5KQkEBSUhIBAQEAWX4oyUpwcDB79uyx9XJ/9tlndm179+611fnll19St25dvLy8crTv3KKea0fav9/MdT11Ktx3H7z6qjmw0dvb0ZWJiIgUW/3797f1lgKsWLEiR9u5ubmxYMECXn31VT7++GOcnJwIDAxkwYIFlC5dOsttAgMDKV26NIGBgbbxw82aNePo0aOEhIQA4OLiwrx585g6dSoffvghKSkp+Pn58eabb2Z6/JkzZzJhwgQ8PDxo0aIFZcuWxTsHueKRRx5h9OjReHh4MHPmTJ544gnmzJlDz549sVgsWCwWnn32WWrUqMGYMWMYMWIEq1evplq1arRq1SpHr4/FYmHkyJG88MIL9O7dmyFDhtCzZ08CAgIy9Vpnx8/Pj8mTJ/PUU09RunRpOnfubGvz9fVlxowZthlJfH19ee2113K039xksd7KAKECKD4+nt27d1O/fn27bv4Cr2tX+OEH+PxzM/Z6yhQYM8bRVYmIiOSLvXv32sbByj8XExNj66n96aefGDVqFN999x1OThqscKuy+h29Xu5Uz7WjTZ4MwcFm3FjXrvD66zB4MGTzCVdEREQkO2vXrmXRokVYrVZbT7aCdf5SuHa0Ro3g4YfNnNdffQUrVsAbb5ix2CIiIiI34cEHH+TBBx90dBnFmj7KFASTJpkhIdWrQ48e8OabkEdnSRIRERGRvKNwXRDUqgXLl0PVquZ06DEx4IAB+CIiIiLyzyhcFyR//gnbt8O//w2zZ5vp+URERESk0
FC4LkjeegsGDTJnb4yLM1PziYiIiEihoXBdkIweDa6u8NFH8OijMHcunDjh6KpEREREJIcUrguSgAB49lkID4fevSE52ZxgRkRERPJFu3btOHDgQKbl/fr1Y8OGDQDMnj2bli1bEhoaaruknS0xMjKSIUOG0L59ezp27MjAgQOz3F+aWrVq0bVrV7p160bXrl1Zv359rj+nWrVqceXKlVzfr2RN4bqgGTkSvLxg/nwYOBDefx+OHXN0VSIiIgXLli0wbZq5doDu3bsTERFhu3h5eZGYmMiAAQMIDg5m/fr1rFu3joceeoj+/fsTHR2d7b6WLl3KV199xfDhw21nF5TCS/NcFzR+fjBqFERGwksvwaJF5kQzH3zg6MpERETy3r33Zl728MPwzDMQGwtdukB0NPz+O6SkgJMTvPyyOcPx2bPQs2fm7Z9+Gnr1yvPSV61ahbe3N/3797ct69y5M6tXryY8PJzBgwdfd/vmzZsTGxvLpUuX8PX1ZcGCBaxatYrk5GTc3d2ZMGGC7UyBtWrV4oUXXmDdunVcvHiRESNG0KlTJ8CcSOaNN96gdOnSmU4rvnnzZt544w2Sk5Px9fVl0qRJVKlSha1bt/LKK68QFBTEb7/9houLCzNmzGDOnDkcPHiQgIAAZs+eTcmSJXP5VSt61HNdEI0ebWYLqVwZnnzSBOw//3R0VSIiIgVDdLQJ1mCu9+3L9xKWL19uGxIyceJEAPbv30/Dhg0zrduoUSP2799/w32uW7eOFi1a4OvrC5je8WXLlrF8+XKGDh1KWFiY3fpeXl4sW7aMGTNmMGXKFADOnTvHuHHjePfdd1m6dCmurq629c+dO8eIESN4/fXXWbFiBSEhIQwfPtzWfujQIf7zn/+wYsUKGjVqxMCBA3n55Zf5+uuvcXJyYtWqVTf/QhVD6rkuyDZvhgceMENDJk6Ejz92dEUiIiJ5a+PG7NtKljTtW7ZA+/aQkABubvDii6a9bNnrb5+LunfvzsiRI+2WWa3WW9pX7969uXLlCufOnSM8PNy2fPfu3bz33ntER0djsVg4evSo3XZdunQBTHiPiooiPj6enTt3UrduXapXrw5Ar169eP311wH47bffqF27NrfffjsAPXr0YOLEibbx4tWqVbP1jNetW5eTJ09SoUIFAOrVq8cxDVPNEYXrgio+3hzUeMcdMHgwzJxperRTf+lFRESKrZYtYf16E6TvvdfcLwBq167NkiVLMi3fuXMnNWvWzHa7pUuX4unpyYcffsiQIUNYvXo1FouFoUOHEh4eTr169Th9+nSmIR7u7u4AODs7A5CUlHTdgG+1WrFYLNm2u7m52W47Ozvb9p92Pz4+PtttJZ2GhRRU7u4wZozpvW7WDDw9YcIER1clIiJSMLRsacZaF5BgDaYnOTo6moULF9qWrV69mm3bttG3b98bbj9gwAD8/PxYunQpCQkJJCUlERAQAJBlaM9KcHAwe/bssfVyf/bZZ3Zte/fu5dChQwB8+eWX1K1bFy8vr5w+RckB9VwXZIMGmdOgz5gBQ4aYaflGj4YsxnOJiIhI7ujfv7+tNxhgxYoVOdrOzc2NBQsW8Oqrr/Lxxx/j5OREYGAgCxYsoHTp0jfc3mKxMHLkSF544QV69+7NkCFD6NmzJwEBAZl6rbPj5+fH5MmTeeqppyhdujSdO3e2tfn6+jJjxgzbjCS+vr689tprOdqv5JzFeqsDhG7SkSNHGDVqFBcvXqR06dJMnz6dqlWrZlrv66+/Zu7cubavLhYuXEjZsmVvuP/4+Hh2795N/fr17b7GKPQWLjRnbPz4YzMH9r33wvLljq5KREQkV+zdu9c2zlekIMrqd/R6uTPfhoWEhYXRp08f1qxZQ58+fRg/fnymdXbt2sWcOXNYsGABK1euZMmSJXh7e+dXiQVTv37QqhVYreaAjYgI2LHD0VWJiIiISBbyJVyfO3eOPXv2EBISAkBISAh79uzh/PnzdustWrSIAQMGU
K5cOQC8vb2LVi/0rXBxge+/NyF76FDw9YVx4xxdlYiIiIhkIV/CdWRkJOXLl7eNX3J2dsbf35/IyEi79Q4dOsTx48f5z3/+w7/+9S/efffdW57WpkixWMyp0FevhuHDzfX//ufoqkRERHKF/tdLQXUrv5sFaraQ5ORk9u/fz8KFC/n444/ZvHkzERERji6rYPjuO3N2KW9v8PdX77WIiBQJHh4enDt3TgFbChyr1cq5c+fw8PC4qe3yZbaQgIAATp8+TXJyMs7OziQnJxMVFWWbXiZNxYoV6dy5M25ubri5udG+fXt+//13unfvnh9lFmwdOkDz5jB9uum9HjECNmyAtm0dXZmIiMgtq1SpEn///TdnzpxxdCkimXh4eFCpUqWb2iZfwrWfnx916tRh5cqVhIaGsnLlSurUqWM7vWeakJAQNm3aRGhoKElJSfz000906tQpP0os+CwWeOUVE7ItFrjtNtN7/f335r6IiEgh5OrqSrVq1RxdhkiuybdhIRMmTCA8PJxOnToRHh7OxIkTARg0aBC7du0C4IEHHsDPz48uXbrQvXt3br/9dnr27JlfJRZ87dtDu3Zm7usXX4Qff4S1ax1dlYiIiIikyrd5rvNakZ3n+lo//QSPPQaffgqhoVCuHGzbpt5rERERkXxSIOa5llzSogXs2WPO0jh+vJnzOodnjhIRERGRvKVwXRg5OcHly1ClCtx+uwnZKSmOrkpERESk2FO4LqxeeMEMC3n+efjtN1i2zNEViYiIiBR7CteF1fDhcPUqHDwIdepAWJg50YyIiIiIOIzCdWFVuzY88gjMmwdDhsDevbB0qaOrEhERESnWFK4Ls7AwM9b6118hKAgmTICkJEdXJSIiIlJsKVwXZlWrwqBB8NdfJlj/+ScsXuzoqkRERESKrXw5Q6PkoVmzwM0NrFZo2hQmTYK+fc0yEREREclX6rku7NJC9N9/w+DBcOwYfPihY2sSERERKaYUrouC+Hi4805zMpm77oJXXoG4OEdXJSIiIlLsKFwXBe7u8MwzZq7rRx6BEyfgvfccXZWIiIhIsaNwXVQMGwa+vrB8ObRtC1OnwpUrjq5KREREpFhRuC4qfHxg1ChYvRp69oSoKHjnHUdXJSIiIlKsKFwXJYMHQ6VK5syNnTvDjBlw6ZKjqxIREREpNjQVX1FSsiTs2weentCmDTRrBm+9BePGOboyERERkWJBPddFjaenuXZ3h65dYeZMuHDBsTWJiIiIFBMK10XRt99Cw4bmwMboaHjjDUdXJCIiIlIsKFwXRffeC7VqmZPJ9OwJb74JZ886uioRERGRIk/huihycYGJE+GPP8wp0a9cMQc3ioiIiEieUrguqh56yAwNmT8f/v1vmDMHTp1ydFUiIiIiRZrCdVHl5ASTJ5v5rh9+GBIS4NVXHV2ViIiISJGmcF2UhYTAsWMQGgqPPgpz58Lffzu6KhEREZEiS+G6KLNYoEwZsFrhscfM9SuvOLoqERERkSJL4bo4GDIEevQwvdcffghHjji6IhEREZEiSeG6OPjPf+DMGfDzSx+LLSIiIiK5TuG6OGjRwoy/fu896N8fFi+GgwcdXZWIiIhIkaNwXVxMngwXL4KHhzk1+sSJjq5IREREpMhRuC4uGjUyU/KtWwfPPANLlpiTzIiIiIhIrlG4Lk7eeQd+/hlGjQIvL5gwwdEViYiIiBQpCtfFSdmyZkiIpyc8+SR8/jns3OnoqkRERESKDIXr4iYhAerVM7OHlC4N48c7uiIRERGRIkPhurhxc4P774dPPjEzh6xYAdu2OboqERERkSJB4bo4GjMGXF3h5Ekz97V6r0VERERyhcJ1cRQQAM8+C//9rzlr45o18MMPjq5KREREpNBTuC6uRo40M4ZYLFC+PIwb5+iKRERERAo9F0cXIA7i5we//AI1akDlyjB0KHz3HbRr5+jKRERERAot9VwXZ7ffbnquH34YKlWCsWPBanV0VSIiIiKFlsJ1c
ffDDyZk9+oFW7bA6tWOrkhERESk0FK4Lu4aNwZvb9i6FapUMWOv1XstIiIicksUrou7kiXNcJAffoAePczp0SMiHF2ViIiISKGkcC0waJDptd60yQwRGT8eUlIcXZWIiIhIoZNv4frIkSP06tWLTp060atXL44ePZppndmzZ9OyZUtCQ0MJDQ1l4sSJ+VVe8ebmBmFhpte6Xz/YtQs+/9zRVYmIiIgUOharNX8G2D7yyCP06NGD0NBQIiIiWLZsGYsXL7ZbZ/bs2cTGxjJy5Mib3n98fDy7d++mfv36uLu751bZxUdSEuzZA/XqQVCQ6bnevRucnR1dmYiIiEiBcr3cmS891+fOnWPPnj2EhIQAEBISwp49ezh//nx+PLzkhIuLCdXOzub06Pv2wZIljq5KREREpFDJl3AdGRlJ+fLlcU7tBXV2dsbf35/IyMhM665atYquXbsyYMAAfv311/woTzKaNAkmTICGDWHiREhMdHRFIiIiIoVGgTqgsXfv3qxfv54VK1YwcOBAnnnmGS5cuODosoqXxo3h4EFo1QoOHYKPPnJ0RSIiIiKFRr6E64CAAE6fPk1ycjIAycnJREVFERAQYLdeuXLlcHV1BaBVq1YEBARw8ODB/ChR0jzwALRoAV99BU2awOTJEB/v6KpERERECoV8Cdd+fn7UqVOHlStXArBy5Urq1KmDr6+v3XqnT5+23d67dy8nTpygWrVq+VGipLFY4JVX4O+/4c474a+/4MMPHV2ViIiISKGQo9lCunfvTvfu3QkJCaFs2bK39ECHDh1i1KhRXLp0CR8fH6ZPn0716tUZNGgQQ4YMoUGDBowcOZI//vgDJycnXF1dGTJkCPfcc0+O9q/ZQnJZ+/ZmWEilSnD4sLldooSjqxIRERFxuOvlzhyF6zVr1rBixQp++OEHmjRpQmhoKB07dsTDwyPPir5ZCte57OBB8PKC/fuhbVt44w144QVHVyUiIiLicP84XKe5ePEi33zzDV999RUHDx6kY8eOdOvWjZYtW+Z60TdL4TqPWK2mF/uPP0wPtqenoysSERERcahcm+e6dOnSdO/end69exMQEMDatWsZP348nTp14n//+1+uFi0FQEKC6bUODISoKJgzx9EViYiIiBRoLjlZKSUlhR9//JGIiAg2btxIo0aNeOKJJ2xDQ9asWcNLL73Ejz/+mNf1Sn5yc4MKFWDZMmjXDmbMgKefBh8fR1cmIiIiUiDlqOe6devWTJ8+nVq1arFq1So++OADunbtahtz3alTJ6pXr56nhYqDTJoEcXFQvjycPw9vvunoikREREQKrByNud61axcNGjTIj3pumcZc56GBA+GTT+Dee2HLFjhyBK6ZRlFERESkuPjHY64PHTrEvn377Jbt27eP5cuX51qRUoCNHw8pKVCuHFy6BDNnOroiERERkQIpR+H6rbfeynQ2xQoVKvDWW2/lSVFSwFSpAl9/De+9B716wVtvwZkzjq5KREREpMDJUbiOiYnBy8vLbpm3tzeXLl3Kk6KkAOrQAUqWNL3YV6+agxtFRERExE6OwnWNGjVYs2aN3bJ169ZRo0aNPClKCqhffoF//Qu6dDHT8kVGOroiERERkQIlR1PxDR8+nCeeeIJvvvmGwMBA/vrrL7Zs2cL8+fPzuj4pSKpUgVOnoGpVSEyEadPg7bcdXZWIiIhIgZGjnusmTZqwcuVKGjRowNWrVwkKCmLlypXceeedeV2fFCR+fjBsGKxdCyEhZgz28eOOrkpERESkwLip058XZJqKL59cugTVq0O9emZavv79TcgWERERKSaulztzNCwEYP369Wzfvp0LFy6QMY/P0IFtxYuPD4waBS+9BD16wIIFMHKkCdwiIiIixVyOhoXMmTOHsLAwUlJSWL16NaVLl+aHH37AR6fBLp4GD4b//tecrdHFBSZPdnRFIiIiIgVCjsL1smXLWLBgAaNHj8bV1ZXRo0czb948/v7777yuTwqiEiXgo
YegUiV46ilYvBj273d0VSIiIiIOl6NwfenSJWrWrAmAq6sriYmJBAUFsX379jwtTgq499+HX38Fd3eYONHR1YiIiIg4XI7CdeXKlTl48CAAd9xxB//3f//H8uXLKVWqVJ4WJwWcqyts2gSdO8PSpbB7t6MrEhEREXGoHIXr559/nosXLwJmzuuPP/6Y1157jVGjRuVlbVLQ9e0LtWrBnj3g6QlhYY6uSERERMShbjhbSEpKCm5ubjRs2BCAoKAg1q1bl+eFSSGQdjDjww+bMzd+8YUZJhIc7OjKRERERBzihj3XTk5OPPPMM7i5ueVHPVLY9OgBjRrBzp1QujSMH+/ggkREREQcJ0fDQpo2bcrOnTvzuBQplJyc4J134JNPzNzXK1fCTz85uioRERERh8jRSWQqVqzIoEGDaN++PRUqVMBisdjahg4dmmfFSSFx113mukEDmDXL9F6vXevYmkREREQcIEfhOj4+ng4dOgBw+vTpPC1ICqmEBHjxRbj7bli+HDZvhjZtHF2ViIiISL7KUbieNm1aXtchhZ2bGxw6BL/9Bv7+MG4cbNwIGb7lEBERESnqchSujx8/nm1bYGBgrhUjhdwrr0CLFhASYsZer18Pqd94iIiIiBQHOQrXHTt2xGKxYLVabcvSxl3v3bs3byqTwqd5c+ja1QwJue0203vdvr16r0VERKTYyFG43rdvn939M2fOMGfOHJo0aZInRUkhNmUKNGxoDnL85hv4+mt44AFHVyUiIiKSL3I0Fd+1ypUrx5gxY3jjjTdyux4p7IKCYO5cePNNqF7dzByS4RsPERERkaLslsI1wOHDh7l69Wpu1iJFxVNPQc2aJlj/8ouZPURERESkGMjRsJA+ffrYzW199epV/vzzTwYPHpxnhUkhd/AgLFsG1aqZkB0aak44IyIiIlKE5ShcP/TQQ3b3S5QoQe3atalatWpe1CRFgZsbrF5t5r3esAH++1/o3dvRVYmIiIjkKYvVWjQGxMbHx7N7927q16+Pu7u7o8sRgOeeM+Ovq1c3vda7d4NLjj7PiYiIiBRY18udOfqe/tlnn2XHjh12y3bs2MGQIUNyr0opesaMMT3YAQGwfz988omjKxIRERHJUzkK19u3byc4ONhuWaNGjdi6dWueFCVFRIUKpvf6+++hdm2YNAkSEx1dlYiIiEieydF39G5ubly9ehUvLy/bstjYWFz0Fb/cyIgRULIk1KkDvXrBokUwaJCjqxIRERHJEznqub777rsZP348MTExAMTExDBp0iRat26dp8VJEeDnB2Fh8NBD5gyOkydDfLyjqxIRERHJEzkK16NGjSImJoZmzZrRsmVLmjVrRkxMDKNHj87r+qSoWL3aDBM5fhzef9/R1YiIiIjkiRyN6yhVqhTz58/nzJkzREZGEhAQQLly5fK6NilK/vgDIiLMGRynToWBA6FECUdXJSIiIpKrctRz/cMPP3DkyBHKlStHUFAQ5cqV4/Dhw/z44495XZ8UFYMHm1lDrFaIjDRT9ImIiIgUMTkK15MmTcLT09NumaenJ5MmTcqToqQIKlECxo6FXbugUSOYNg1Sx/CLiIiIFBU5Ctfnzp3D39/fbpm/vz9nzpzJ8QMdOXKEXr160alTJ3r16sXRo0ezXffw4cM0bNiQ6dOn53j/Ugg8/jhUrQpxcXD2LMye7eiKRERERHJVjsJ1YGAgW7ZssVu2detWKlWqlOMHCgsLo0+fPqxZs4Y+ffowfvz4LNdLTk4mLCyMDh065HjfUki4ucHMmTB8OHTpAq+9BtHRjq5KREREJNfk6IDGZ599lueee46ePXsSGBjI8ePH+eKLL5g6dWqOHuTcuXPs2bOHhQsXAhASEsLkyZM5f/48vr6+duvOnz+fe++9l9jYWGJjY2/y6UiB9+CD5jo4GO68E2bNggkTHFqSiIiISG7JUc91hw4dWLBgAbGxsWzatInY2Fg++OCDHPcuR0ZGUr58eZydnQFwdnbG3
9+fyMhIu/X27dvHDz/8wGOPPXZzz0IKl6Qkc9bG5s1NuD5/3tEViYiIiOSKHJ9iMSgoiKCgoDwrJDExkXHjxjFt2jRbCJciysnJnKnx7Fm4dAlef91MzyciIiJSyOU4XO/du5cdO3Zw4cIFrFarbfnQoUNvuG1AQACnT58mOTkZZ2dnkpOTiYqKIiAgwLbOmTNn+Ouvv3jiiScAuHTpElarlZiYGCZPnnwzz0kKOicnmDIFQkKgSRN46y14/nm45qBZERERkcImR+H6008/Zdq0abRq1YrNmzfTpk0bfvzxR9q3b5+jB/Hz86NOnTqsXLmS0NBQVq5cSZ06dezGW1esWJGtW7fa7s+ePZvY2FhGjhx5k09JCoUuXaBlSzhyBK5ehenTzcGOIiIiIoVYjsZcf/DBB3zwwQe88847eHh48M477/DWW2/h4pLjjm8mTJhAeHg4nTp1Ijw8nIkTJwIwaNAgdu3adWvVS+FlscArr8CpU6b3+t134eRJR1clIiIi8o9YrBnHeGSjcePG/PLLLwA0b96cLVu24OTkRLNmzdi2bVueF5kT8fHx7N69m/r16+Pu7u7ociSnXngBmjaFRx+FJ5+EOXMcXZGIiIjIdV0vd+ao67lChQr8/fffVKpUiapVq7J+/XrKlCmDq6trnhQsxcisWeZ60yZ4/30YMQIqV3ZsTSIiIiK3KEfDQh5//HEOHToEwDPPPMNLL73Eo48+yuDBg/O0OCkmzp4FZ2ewWs2BjiIiIiKFVI6GhVwrISGBxMREPD0986KmW6JhIYXY779Dw4bmpDI7d8L+/VCjhqOrEhEREcnS9XJnjnqur+Xm5laggrUUckFB0Ls37N0LLi4waZKjKxIRERG5JbcUrkVy3cSJEB8P9epBeDjs2+foikRERERumsK1FAw1a5oZQ3bvBnd3mDDB0RWJiIiI3LScT1QtktfGj4fYWPDzg3fegTFjoEEDR1clIiIikmM33XN95coVYmJi8qIWKe6qVIH/+z8z5trHB8LCHF2RiIiIyE25brieO3eu7faFCxcYOHAgd955J02bNuWxxx7j3LlzeV6gFEOnTkGrVvDll/Dzz46uRkRERCTHrhuu33//fdvtGTNm4OnpyQ8//MD3339PmTJleO211/K8QCmGFi+G1atN7/X48Y6uRkRERCTHrhuuM06BvWXLFiZMmEDZsmUpW7Ys48eP58cff8zzAqUYGjECvL0hMBC+/hq2bHF0RSIiIiI5ct1wbbFYsFqtJCcnY7VaKV26tK2tdOnSGnstecPXF158Ef74A0qXhnHjHF2RiIiISI5cN1zHxsZSt25d6tWrR1RUFHv37rW1HT16FF9f3zwvUIqp5583s4aULw/r18OmTY6uSEREROSGrjsV3/r16+3ulylTxnb78uXLDBs2LG+qEvHxMXNd//knXLpkeq83bQKLxdGViYiIiGTruuH6tttuy7YtKCiIoKCgXC9IxObZZ831HXeY2+vWwX33ObYmERERkevI0TzXCQkJvPXWW9x33300atSI++67jzfffJP4+Pi8rk+KO6vVzH9dvrzpvc5wkK2IiIhIQZOjMzROmDCBI0eOMGbMGG677TZOnDjB/PnzOX36NNOmTcvrGqU4S0qC554DDw/Ytg1WrYKQEEdXJSIiIpKlHIXr9evXs27dOnx8fAC4/fbbadiwIffpK3rJa66uZuz1Y4+l91536QJON31yUREREZE8l6OEUrZsWa5evWq3LD4+nnLlyuVJUSJ2+vaF2rXB2Rl27jRnbhQREREpgHLUcx0aGsrjjz9Ov379KF++PKdOneKTTz4hNDSULRlO8NGyZcs8K1SKMWdnmDQJHn4YAgIgLAy6dzfLRURERAoQi9V64yPE2rVrd+MdWSyZpu7LT/Hx8ezevZv69evj7u7usDokj6SkwAMPwO23w5w5sGQJ/Pvfjq5KREREiqHr5c4chevCQOG6mEhJgUaNIC4O9uwBlxx9+SIiI
iKSa66XO3N8VFhSUhLbt29n5cqV7Nixg6SkpFwvVOSGkpPh3nvh4EEID3d0NSIiIiJ2ctTtd+jQIZ5++mni4uIICAggMjISd3d35s2bR40aNfK6RpF0P/0Es2dDpUpmHHafPuDm5uiqRERERIAc9lxPnDiRhx9+mE2bNvHpp5+yefNmevfuzYQJE/K4PJFrtG4N7dvD5ctw5AgsXOjoikRERERschSu9+3bR//+/bFYLLZljz76KPv27cuzwkSy9corEB0NgYEwZYoZfy0iIiJSAOQoXPv7+7Nt2za7ZTt27MDf3z9PihK5rubNoVs3OH8e/v4b3n/f0RWJiIiIADkcc/3CCy/wzDPPcO+991KxYkVOnjzJxo0bee211/K6PpGsTZ4Mx46ZMzi+8goMHAglSzq6KhERESnmctRz3b59e7744gvuuOMOrly5wh133MEXX3xBhw4d8ro+kawFBcGvv8Ibb8Dp0/Duu46uSERERCRn81x/+OGHDBw4MNPyhQsX0r9//zwp7GZpnuti6uJFaNcOjh+Hw4fB29vRFYmIiEgR94/nuX7nnXeyXD537tx/Xp3IPzFmDOzeDWfPwttvO7oaERERKeauO+Z6y5YtAKSkpPDTTz+RsZP777//xtPTM2+rE7mRESPggw+gcmV4/XUYPBhKl3Z0VSIiIlJMXTdcjxkzBjBd36NHj7Ytt1gslCtXjrFjx+ZtdSI3UqUKPPmkGXOdnAyzZsHEiY6uSkRERIqpHI25HjFiBDNmzMiPem6ZxlwXY6dOQfXqULasGYN95Aj4+Tm6KhERESmi/vGY64IerKWYq1ABhgyBatXMmRs1RaSIiIg4SI7CtUiB98orsGkT9OkDs2eb6flERERE8pnCtRQNzs7mesAAuHoVXn3VsfWIiIhIsaRwLUVHYiL06wcBATB3Lpw44eiKREREpJhRuJaiw9UVhg2DkychKQmmTnV0RSIiIlLMKFxL0TJ4MFSsCOXKwfz5cOyYoysSERGRYkThWoqWEiVg7FgzPZ/FApMnO7oiERERKUbyLVwfOXKEXr160alTJ3r16sXRo0czrbNs2TK6du1KaGgoXbt2ZfHixflVnhQlAwdCzZrQvDksWgR//unoikRERKSYyNFJZHLDI488Qo8ePQgNDSUiIoJly5ZlCs8xMTF4enpisViIiYmha9euzJ07l9q1a99w/zqJjNhJSIDz583JZXr0gI8/dnRFIiIiUkT845PI/FPnzp1jz549hISEABASEsKePXs4f/683XpeXl5YLBYA4uLiSExMtN0XuSlububkMj16QHg47N3r6IpERESkGMiXcB0ZGUn58uVxTp2L2NnZGX9/fyIjIzOtu379eh544AHatm3L448/Tq1atfKjRCmKNm0ywdrdHSZMcHQ1IiIiUgwUuAMa27dvz6pVq1izZg0REREcPnzY0SVJYdW6NTRqZA5y/O9/4bffHF2RiIiIFHH5Eq4DAgI4ffo0ycnJACQnJxMVFUVAQEC221SsWJEGDRqwcePG/ChRiiInJ5gyBS5eBA8PCAtzdEUiIiJSxOVLuPbz86NOnTqsXLkSgJUrV1KnTh18fX3t1jt06JDt9vnz59m6dSs1a9bMjxKlqOrSBe66y5xgJiICduxwdEUiIiJShOXbsJAJEyYQHh5Op06dCA8PZ+LEiQAMGjSIXbt2AfDpp5/ywAMPEBoaymOPPUbfvn25++6786tEKYosFnjlFdOL7eMD48Y5uiIREREpwvJtKr68pqn45LpiYuDdd2HkSPjxR9ObLSIiInILHD4Vn4jDeXnB00+Dr696r0VERCTPKFxL8TF2LMTGwnffwYYNjq5GREREiiCFayk+Bg6E+HjTiz1uHBSNEVEiIiJSgChcS/ERFAS9eplTo//4I6xd6+iKREREpIhRuJbiZeJESE4Gb2/1XouIiEiuU7iW4qVmTXjsMROut2+HFSscXZGIiIgUIQrXUvzMnAkHD8Ltt8P48
ZCS4uiKREREpIhQuJbip1QpKFnSzHn922/wxReOrkhERESKCIVrKZ4SE2HaNHPWxvHjzThsERERkX9I4VqKJ1dXeOghuHwZ9u6FpUsdXZGIiIgUAQrXUnyNGGEObPTxgQkTICnJ0RWJiIhIIadwLcWXry8MHw6XLsGff8LixY6uSERERAo5hWsp3p5/HsqWhfLlYdIkc4IZERERkVukcC3Fm7c3/PorLFwIx47BggWOrkhEREQKMYVrkUqVoHNnaNIEJk+GuDhHVyQiIiKFlMK1CMBPP8Hvv8PJk/Dee46uRkRERAophWsRgDvvhIoVwcsLpk6FK1ccXZGIiIgUQgrXIgBubmY6vpgYiIqCd95xdEUiIiJSCClci6Tp2xfq1AFPT5g+3ZxgRkREROQmKFyLpHF2NtPxxcbC+fPw1luOrkhEREQKGYVrkYwefBD27YNu3eD11+HCBUdXJCIiIoWIwrVIRk5OULOm6cGOjoY33nB0RSIiIlKIKFyLZCUiAkqUgFmz4OxZR1cjIiIihYTCtUhW2rSBq1fNlHwzZji6GhERESkkFK5FsnLvvdChA7i7w+zZcOqUoysSERGRQkDhWiQ7r7wC8fHm8uqrjq5GRERECgGFa5HsNGtmZg0pUQLefRf+/tvRFYmIiEgBp3Atcj1vvw2bNpnbr7zi2FpERESkwFO4FrmeKlWgSRMYOBA++ACOHHF0RSIiIlKAKVyL3EhSEvz8M1itMHmyo6sRERGRAkzhWuRGXFwgKMjc/ugjOHjQsfWIiIhIgaVwLZIT48eDszNYLDBxoqOrERERkQJK4VokJypXhiefhJQU+OQT+OMPR1ckIiIiBZDCtUhOjR5tpuVzdYUJExxdjYiIiBRACtciOVWhAqxbB8OHw+efw86djq5IREREChiFa5GbcdddMGIElCplxmGLiIiIZKBwLXKz0ua6XrECtm1zbC0iIiJSoChci9ys22830/O5uqr3WkREROwoXIvcLG9vePllSEyENWvghx8cXZGIiIgUEArXIrfimWcgIMD0Xo8d6+hqREREpIDIt3B95MgRevXqRadOnejVqxdHjx7NtM4777zDAw88QLdu3XjwwQf5/vvv86s8kZtTogSMG2d6rzdtgu++c3RFIiIiUgDkW7gOCwujT58+rFmzhj59+jA+i7GqQUFBfP7553z11VdMnTqVF154gbi4uPwqUeTmDBxopuS77TYTtK1WR1ckIiIiDpYv4frcuXPs2bOHkJAQAEJCQtizZw/nz5+3W69169aUKFECgFq1amG1Wrl48WJ+lChy89zcoEcPMyzkf/+D1asdXZGIiIg4WL6E68jISMqXL4+zszMAzs7O+Pv7ExkZme02y5cvp3LlylSoUCE/ShS5dcnJJmir91pERKTYK5AHNG7bto233nqLmTNnOroUkRsrXx4SEuDnn+GrrxxdjYiIiDhQvoTrgIAATp8+TXJyMgDJyclERUUREBCQad1ff/2Vl156iXfeeYfq1avnR3ki/8yDD0KjRmbu6zFjICXF0RWJiIiIg+RLuPbz86NOnTqsXLkSgJUrV1KnTh18fX3t1vv999954YUXePvtt6lXr15+lCbyzzk5wSuvQFIS/PGHOchRREREiiWL1Zo/g0QPHTrEqFGjuHTpEj4+PkyfPp3q1aszaNAghgwZQoMGDejRowcnTpygfPnytu1mzJhBrVq1brj/+Ph4du/eTf369XF3d8/LpyKSmdUKd99tTodeo4YJ2anHGIiIiEjRcr3cmW/hOq8pXIvD/forrF0Lo0bB4sXQr5+jKxIREZE8cL3cWSAPaBQplIKD4aWXzPjriRPNCWZERESkWFG4FslNKSng5weHDpneaxERESlWFK5FcpOLC3h6mvHWYWEQH+/oikRERCQfKVyL5LbJk00P9okT8OGHjq5GRERE8pHCtUhuCwqC3r3NFH2TJsHVq46uSERERPKJwrVIXpg40VyfPg3z5jm2FhERE
ck3CtcieeGOO+Cjj6BVK3j1VbhyxdEViYiISD5QuBbJK337wmuvQVQUzJnj6GpEREQkHyhci+SlsmXN1HzTpsGlS46uRkRERPKYwrVIXvLxMUNCoqPhzTcdXY2IiIjkMYVrkbxUvjw8/7y5PWMGnD/v0HJEREQkbylci+S1l14CLy/Tgz1zpqOrERERkTykcC2S13x9YcQIc/uNN+DMGcfWIyIiInlG4VokPzz/PAwfbk6HPmOGo6sRERGRPKJwLZIfvL3NtHx9+5pp+SIjHV2RiIiI5AGFa5H81Lq16b2eOtXRlYiIiEgeULgWyU/JyWC1mlOiHz/u6Gry15YtZr7vLVscXYmIiEiecXF0ASLFyoABptf6+HHo3h3Gj4c77zRtFkv6emm3c7osr9uvt018PFy4AAkJEBeXfl23rhkOc+wYLF0KkydDUhK4usKsWfDYY+DpaZY5O9vvV0REpJCyWK1Wq6OLyA3x8fHs3r2b+vXr4+7u7uhyRLI3frwJmsWdpyeUKAGJieYkO87O5uLqai4NGphwfvmyaS9Z0mzj6WmWV6oEHh7g7p5+cXO7udsZ77u5KeCLiEiOXC93qudaJL+dPm1/v00buHjRhMykpPTr5s2hY0czP/a4cenL0z4Ph4TA/febE9OMG5f5cf71L2jb1hw8OW2afZvFAr16mcf4+29YuBBcXOwv990Ht99utt+wwSxzdk5vv/NOKFcOzp6Fgwczt1eubMLzlSsQEQG7d6c/fvXq0LmzeS5//QVHj5re7rRLQoLpEY+NNe1ZTV/o7GyG2eQmV9ebC+S3EuJv5baTRvCJiBQWCtci+e3aoFSnDmzfbnpmPTzSe2O7doVHHzUh8+hR+15aDw9o1QpatoSrV6FmTfs2d3eoVg0qVDChfPDg9OUeHib8ZvT663n7nB94AO6919Ti6grh4ab2nLBaTUC/eNEMP0m7dOtm2j77DNavNx8y0tZJSICPPzav3ejR8N139vv08TGztyQkwIIFcORIeqB1czPtd95ptj982AR+q9VcX70KKSnmEh+f/kEg4+2EhNx9/Vxc8ifQ3+w21/4eiYiIhoWI5LstW0yPckKCCSkbNuQ8aBZmW7bAxo0mZOfn842Lsw/lFy6YHu/QUNM+Zw7s2GHfftttsHq1aW/WzHz4yahlS/jf/8ztjh1N736ZMumXZs1g0CATtL/4wgTzkiVNT76Hh7mG7MN5Vvdzcjsn6+Xmn3wnp/zryb+ZbVxdNcRHRPLU9XKnwrWIIzgqaMrNO3sWzp1LD94XL5ox3yEhpv2ll+DQIftwfs89sHixaS9b1myfUZ8+8Mkn5vadd5pQmBbMS5eGDh3MAa9WK3z1lVmWMbx7et5aeLRazdCi/AjxN7vv3B7i46jhPDdaLyc/N/19ECnwNOZapKBp2VL/NAuLsmXNJTuvvXb97bduNUNWMobv6tVNW0oK1Khhlp0+Dfv2mdueniZcx8SY62uNGweTJpnQ3qWLfTAvU8YMmWnRwgyn2brVPpz7+KQfGFqQJCfnfdi/dpurV82Hpeutl5SUu88zbVx/doE8Ph7++MP8bjg7m6FhzZtDxYoQEGCu/f1Nm4gUSArXIiJ5qUYNc8mKkxP897/Zb1uiBPz8c3qPeVo4T/tglpBggvP586b3PG2dSpVMuP7zT2jfPvNjfvSROVvo7t3wwgv2wbxMGejZ0xzMeuGC2W/a8lKl8i7UOTuboTMlS+bN/m9VSkr+hf2EBNi/3zwmmA8cCxaYS0bOzlC+vAna17v4+elgWBEHULgWESmoXFygcePs2wMCYM0a+2VWa3o4q1HDDC+4dsx5gwamPT7e9G7//Xd6W0ICNGxowvWmTWbWmYx8fMx49JYtzYGic+bYD1kpU8bMROPnZ2Z5OX8+fbmra669NPnGySn9QOP8sGWL+UCUdkzGmjXm4OSTJ7O+HDkCP/6YeegRmNc7rbf7epfSpTVGXSQXKVyLiBQlFkt677KXlxn/n
Z0770w/MBPSZ0RJmwWkRQsz5vvacF6xommPjoYDB9KXX71qlnfoYML14sUwfHj6/j09TZDbvt2Evs8/h5Ur7YN56dImnLu5mXCelGSW51e4dbSWLc3sN9eOua5U6frbxcXBqVPpoTsy0j6E79tnPgxdvJh5Ww+PnIVwb2+FcJEc0AGNIiKSO9LO1lm2rAno+/dnnonlwgXT2+3pCTNnwltvmWUxMen7SUgwva7PPgvvvGOWpR30Wa4c/PabCXkffmhuZwzn/v5m/ncw+3VzM0NNFAqN2NjMwfvay4kT5huNa3l63jiABwQUvPH8InlAs4WIiOMd3wdH/4Cq9SCwtqOrkYImKcn0ql68aIakgBkisXOn/ZjzxERYtMi0P/00LF1qetDT/pVVqgTHj5vbXbrAN9+YoJ42prxhw/Rx7m+9ZXrHM445DwyEpk1N+5UrxTeYX76cOXRfG8pPnDA95tcqVco+bGcXwovLtxFSJClci4hjJCfB5fNw8BdY/aE5QMvZBR6bBOUqw5Fd4OIKLm7p16XLQQkvs21ivFnm7FI8A47kTHIyXLpkwndcHNSta5Z/9VX6DCxpF39/mD3btLdtC5s3p49RB2jd2iwDc4KnAwfsD/bs0CH9jKfTp5vrjENaqlVL/3CQklK0Dyi0Ws0Hm+v1gqddEhMzb+/re+Oe8AoVCudYfSnyNBWfiOQ+qxUunYNLZyE69frSOahaH2o3gwtR8NZTwDWf35OTTA+2qwd8+mrm/XZ/Dhq1gxN/woKXUxda0sP3v4ZArabw1z74+v3U5a7g6m6u73kYKlSDyCPw+8bM4b3uXeBdBi5Gwelj6dulreMbYK6TEk04cnEBJ017VqA5O6cH3Iy6dTOX7GzYYH6PL19OD98ZZ0MZOtT+YM9r219/3cyDntGjj5qedavVDI/IOId5mTLw8MPw5JPmA8GMGZnnMK9a1czQsmGDGXN9113/7LXJSxaLqb906fQPNFmxWs0Bl1mF7rTe8D17zO2s5jv398+65zvjfX9/nTG0OCqgc8LrN1FEsvf3AROYo8+mh+fAWtCiqwnJswbZr+/qASW9Tbj2LgP3PAQ+ZSHhKqwPT++5rloP/ALgyTcgKcEE2aR4cx2QOgd06XLQqX9qWwIkpq5Xqpxpd3YBH7/07a9Ep98GuHAKdqw1y6wZeiYr1TK1HdoJK+Zmfs6DZ0O5SrD9G1iz0CxzckkP8U/NAh9fs+9f1plQ7pohvIc+B27usH87/LXXPti7uEKTTiaUnDoKMRcytKXuxy/1YMHEBNPr6eSsXvu8ZLGYGVB8fKBKFfu2p566/rZRUWYMc8bw7etr2lJSYMSIzNMopg2jiI6G0aMz7/Pxx80JhtJOrOPklH7WSVdX02v+xBNmmsWQkPTlrq5mvZdfNsNhDhyAkSPt211dzXO68044eNBM8Xdte48eJuAfO2ZCy7Xtd91lPgScOQOHD9s/tqurGZbj5pY+tWBae9p88UFB2b+eycnmw8r1esB/+cXMCX/tl+5OTjmbnrBs2aL9bUJxkZwMX38NDz1khpS5uZkDgQtIwFa4FinO/vwVzp20D9D+VSDkSdP+f1NNaAUT/nz8oOxtqfddTZD0LAWlypo2jwxnDnRxhbb/Tn+sSjUzj7kOqJZ9bT5+0PI6vY633Q59sggnaeq2NBcwf4jTgrdH6jzKtVuYHu608J527eNn2ivXhQ6P2LclJYBb6jhRN3fzQSIpEeJj4UrqOmnP/6+98NNKSM7wdbjFCZp2Nre3roRf19vX7F4SXk49c+Pyt+GPH802acG+tD88OdO0f/MhnPzTPriXKQ/3PWbat682Q3Iytnv7QZ3m6fUlJdqHfw9P88EBTJuTs4LI9Vgs6SfkuXY2D2dnmDgx+219fc3sKhnP/HnhggkIacHaYoE2bcwY8MREc6lZ02zv5gaNGpl109oSE9N//65eNT3gGdsSE9NPSnT0qDmg9NrhGg0amHC9fTs89ljmun/80
QTslSthwIDM7b//bvYxfz4MGWL/Wrm6moNcq1aFt982w2quDe+bNkFwsDkQdulS+7bbb4effjL7ev99M/tJXJy5XL1qLuXKwV9/maE90dGZ63NxMb3epUqZi7+/uZQvb8bbp52wJynJXK798OLmlv3PVP65tFlvIiPtL9cui4qyH86VkGA+DCpci0ius1pN0PNIPVp/zxYTwDL2PHv7woCppn39JxB5KL0X2KesCctpHnrJBL5SflAii2m4gtvlvLbA2o47kNHZGZxLgHuJ9GWePuaSndtuN5fsBN1jLtnp+Ii5pKSYgJ0W0NO0eQiCO9iH94watIbyVdNDfVKiGcKSxs3DBOKkBIiLte+1B9j9Axzbg92wnEq10sP1irlw5rj9Y1ZvCI9MMLdnD4boM+Dsmh6+azeDrk+b9o8nmcfMGM6rNYAm95n275aYcJ6xZ758FfPNR0oKHPk9c69+SR8z3t5qhZTkot9rnzYFXkBA+jJfX5g3L32e66lTsw4MlSub8Jmdhg1N0M1Ox47mMaxWE+TTwneJ1PfI/fdnHc7r1EnfftWqzOE+MNC03323GTaTsS3tpEdggvL992fef9r46rQzdsbE2Lc7OZn387FjZuhMxsd3dTUnXQJzkqRPPrF/zl5e8Nxzpgf866/NSZSyY7Fk7h2vUMEM5alY0XwrsH+/fa99ixbw2Wdm3Z49TcjPGM5btIDJk037c8+Z8J+xvUkTM6wIzOMkJ9u316uXPrXm55+b1yHjtxpnz5pvNO65x7y+Pj4F46BRq9UcE3FtYM4qNGc1VWTatxJpQ4HuvNP8LK5eNcdPJCeb1+Hee/P7mWVLBzSKFBZWK1yNMUMJ/CubZbu+hz9/SQ3P58zFvSS8lDqc4f+mmYMJfXzTw7N/ZWjT07SfP2VCmmepoh1iiqu0kJoW3q1W8Cpt2k4dyRDKU4N5SR+onvq1/dav4eplc1Bp2vYVqqX3vC991fw+ZuzVr9MCOvQzjzPlYTN0KKMWXaHzAEiIg6n/JpM2D0G7PhBzEV7vn9prnyGA3/OQGVYTfRaWvZGhLbU9uL0J+JfOw47VmcN71frgW8HUfepIaluGMfdepcwHmJQU835w1HuigI4jLfCs1vSf2ZUrZshOxmButcIdd5j2ffvM0JbERDPmPirKXFeqZML3Dz+YWWcuXDAhODrahP1rubiYGWVKlDA94B07mgD4zTdmf86p3/6kpJhe8VmzzHZ3321mW7n2W4W0s3F6e9tPTwlmyND775vnkdU3Si4ups3NLX3OeTe39F7655+HwYPNcxk6NH152qVVK/PhKT7eDPlJW57djDkpKeY1vFFgPnUqvZ6MMn64rFAh/XbGS4UK5tuI7M4M68D3ig5ozGuHdsKh302PTMUaqb+EFvNPzMkJ4q+af1CQ3mYhvScwMT71n1CGP+YWS/rXz8lJ6Z+gM26vg6wKrx1rYO9PJow06WSWxcWm9y5Xa2B6k3/fBDs3pPc8J8YDFhj7qQkEJw7Ckd1mWEZAdajVzNxO+yfz4PMmLGT31b5vhfx6xuIIFov5PXJ2se+1BxOUr6d5l+u39x51/ccd91lqsE9KD+8uqb2Szq7m25OMQ3IS49M/NLq4Qds+1wzJyTDe3mo1zykx3nwASFvnjjtN+6WzsPmzzHX1fNH8zkcehsVhmdv/PdocLHtgByydlt5rn3bQa88XzfCmP3fCD8vse+VdXOHeXmbozomDZsjVtQfT1m5ufg4Xz5hvBTK2ubql/s9whtvKwN01oVKZzDVK9jIGwLThOtmpXdtcsvP885mXxcTceI7wuXOzDpLe3uZspb/9ZsL3XXdlfZBmmujozL36GYek7Npl37ZokQnmKSmmN79rVxM20z4YREeboAom9H/3nelNvnQpPV+8/bYJ13/8YXqH0zg5mQNzW7QwQfvwYXNJ++bjWj4+6XPOBwXBAw+Yb1puu80+SJfKhU6dSgXzvaKe63/q+D5YMMb+gKk0o8LN1
/NrF8H/IjK3j//c/CFdOc+ErYxc3EyAAlg2C3Zttm/3LAUvLTK3/28a7N+GXTj3DYDn5pjbH0+Co7sz9MRYoEJVGJg6ndSiceafTcbgXqkW/GesaV8wGs5F2gf/ag3MeFtrimm/ctFsm6Z6Q2jb27zRl0wxPVVp+wbTO9aks2n/YlZqT1HaxhaoVt+MiU1KgHWLM792letC1brmg8vWVZnbb7vDPMerMfDH/1IXZvhVr1Dd/BO8etm8NmntaauUvc28xldj4PTR1OYM25cqZ/5JXr0C0VGp21nT1/MsZf75J1w1+7BmaEtOtO/RszilPnaG/VepCx5eJlRHnzGP5VYCSnia5eUCzZhfZxcTAJxTD7hLu5/xtl2bS9Zt+fH1u+a5lvxktZr3WcaAXsI79X0bYw4ovTa8Vw8yH07PHIfdP9r36iclQOue5m/DwV/g+2WZt+833rw3t66Cbz7IXNPQ96CMP2z+HL77JHP7Sx/B+ZP2/1PS3qdOTjB8oXm/frcEftuY4YDX1B7+p1LH42/6zHyjZXFKX8fDEx5+ybT/uNwMF0vb1snJ/M3q+Ihp377aHIuRNubeyRm8ykCz1JPz/L7JfLvg5ASW1HW8y5gPDwAHfjZ/851S921xMh8cbkvtNf77gPngZXFKf4wSXuZvMpiZhsB+e1f39A+ISYnpdRe0b9zShkDkZHrChITM25cpk7PpCa8d+71li+m9TRsec+3447RZca7tUT550gxfOXHCDCs5c8Z8CMiKr6856LdECdPznxbk4+LM7TlvwYOhsH4D9Otvv63FAh/PgzsbwuYfYe5C8CwJniXMpWQJ6HE/lPGB4yfg8F9Qwg083MDDFdxdzbU12byvk5Mg9hJs/xmOnoXq5WH8u/n6v0XzXOelFfPg5wzB2K+iCaZWK9xWE5wsJpheOpP6FW2KubammHas5o9YzIXU5RlCVoWqZv0LpyD2curyFEixmv36VjT7uXjGhMy0NqzmD4+Pn2m/dD79K2FrWrvF/DFLSTEBMzkZW8BL6/V0djXbJ8anb1c0fl3MH3Tn1C9uMo5TTftD7eFpwmxKUvoBfekrmZksPLxMeD5/yu5zBWDCu6eP+QeUFs7TZPwmAkwd3n7p/0DT/pnZxuqm/iFJC+VJifYHyeUKSzbBO6fhPZv10u5Hn4X/LU+dLcTZTJfnmzorRsYPJWlst7P4nUv7XcywaZ6sl5+Pdd31snldbFc5Xe9Wa7qVx8rr9az2teZ7Tdn9rmao6dq/99YU856wYAKQbRabDOuV9DEfpmMvpe/fxS09VJYqZ/62x14yx1ZkrNtiST3Y2GJ67uOuZKjPav6mVEidiefcSfN331az1dQWWNvs5+Sf5n9Oxu3dS0DVBqb96O70/acp4W06XSwWE+zjr+m99SqTOuTIAnu3pH+bm6ZUufT23zdmHlLkV9HsH0tqZ1SGn4HFyYznr1LPhPZfv039W+6U3qkUUA0q3mH+du763myT1mZxMu3+VUxdB7ZnaE+9rlDdfOsRH2uOZci4rcVijo8oVda8LicPm//RFidTr5PFfOjyLGU6ZM6dNL8HV+LgwmW4eBniLRB9GU5FwanTcP4inI82l6ymJyztA2XLgJ8vlPOFhHhY90Pq7wLQMtj87T13wezjwkWIyyLMu7pAKU+zv1Ke4F0CvD3MtZcHeLmbi6cHWKxmFqPkJPM6p30AtFohMRmcnczlSjycioa4RIhPTL1OgsZVoFRJOBQFPx6AuCT79ifbQjlv+OkQrNmVudawfuDjDj/8AT/uBVLgTOp7xdkJpr0Iw2dk3i6PKFznpax6lW+F7VN4hussl1kyL0vrdcy0viVD+432kZuP6ZzDfeT0MZ2zeE63WHdB6OXYscZ8W5Em5Kn0oSE5lXEsbVrwTsoQwG0hPCn9YLqsAnpyUjbbJV5nHznYv+SyDN8a2RZZ0tssubye7Son6+VTTTdVewGr6dq/O2nfEmbY1Lbe6WMmHKfxLgvlK
2MXhDOF+2tCf6YPTNlsk9WHqUwfFlLvp41Dt1rBmmwWZwxXYP7OWjF/B6wp6fu3Ws3zdErr0IhP7wiytad26GDN0KFzzb7ThkImpJ0VMsNzSBsCZbXaHxyc9hzS/g+kfaNhJzWRWixk+S20I1mtEJsAl6/C5bjsLzHXnCnT2QlKlzRBOS0kl/I0t0u6gre7ue3hiu3Dgatb+rE712rUzoT1v/ZC1F/2bU7OZtpUZxfY+Z35ZiIjD08z5MrZxXwTfewP+/bS/maKUydn8033np/hYmx6+HYvDXd2gUGDIDwMvt0Ef5yAyGi4kPohzwL0C4WPlt/6a32TNOY6LzW7H/b8CMkp5pf5wReg4u03Hzql+EgL0teOub4ZGcfSFjRp/7wyhv6/95vhP2k9112fSZ3POpuAkdH1gkjGhTkOgzezXn4+1jXrFYQPgpL/ju8zQ/XS3isPv6hhVI6U8dvkaz9wZPVhJiX1m4rkJHOdkmw+mLi6mQ8XiQkQG526PLUtJcXMyOTqbr4xuHAqdXlquzXZ/L10Sx2vf/po+vK09b77FqbMT88iL/4bQh9O/UbUGRq0Md8+nDpqtrcNA0291G5uwvPpY6nfxmZowwJ3NDa3zxyHyxfst3dygsqps8hUCzLfimT8ZsDJ2XyzAOY4oISrph3MtbNL+hSpPV80r92125f0Nu2PToJHUkz7is/h3/3T3yv/6pUHvwC3Rj3XuUFjSUVuTO8TkZzRe0Vu1vF9MGkwHD4F1SvA+HeKx+/O8v+Db1bC/SHQPYsZiPJQgRgWcuTIEUaNGsXFixcpXbo006dPp2rVqnbr/PDDD7zxxhscOHCAfv36MXLkyBzvX1PxiYiISLGlD2X56nq5M9/GI4SFhdGnTx/WrFlDnz59GD9+fKZ1AgMDmTJlCgMHDsyvskREREQKv8Da0LqHgnUBkC/h+ty5c+zZs4eQkBAAQkJC2LNnD+evme6lSpUq1K1bFxeXAjiOVERERETkBvIlXEdGRlK+fHmcU8+w4+zsjL+/P5GRkfnx8CIiIiIi+ULTVIiIiIiI5JJ8CdcBAQGcPn2a5NSJ0JOTk4mKiiIg46k+RUREREQKuXwJ135+ftSpU4eVK1cCsHLlSurUqYOvr29+PLyIiIiISL7It2EhEyZMIDw8nE6dOhEeHs7EiRMBGDRoELt2mdNc7tixgzZt2rBw4UKWLl1KmzZt+P777/OrRBERERGRf0QnkRERERERuQkFYp5rEREREZGiTuFaRERERCSXKFyLiIiIiOSSInMqxLSh4wkJCQ6uRERERESKsrS8mdWhi0UmXCcmJgJw4MABB1ciIiIiIsVBYmIiHh4edsuKzGwhKSkpXLlyBVdXVywWi6PLEREREZEiymq1kpiYiKenJ05O9qOsi0y4FhERERFxNB3QKCIiIiKSSxSuRURERERyicK1iIiIiEguUbgWEREREcklCtciIiIiIrlE4VpEREREJJcoXIuIiIiI5BKFaxERERGRXKJwLSIiIiKSS1wcXUBxsGPHDr744gsSEhLw8fFh/Pjxji5JpEC6fPkyU6dO5X//+x+bNm1ydDkiBUp8fDxhYWF4eXlhsVgYM2aMo0sSKZAc/b9EPdfZmD59Ou3ataNWrVocOHDAtvzIkSP06tWLTp060atXL44ePXrDfTVp0oSpU6fy+uuvExkZyZUrV/KwcpH8lZvvFW9vb6ZNm0a1atXysGIRx7uV983atWtp2rQpY8eOpUSJEuzatcsBlYvkr1t5rzj6f4nCdTbat2/PJ598wm233Wa3PCwsjD59+rBmzRr69Olj1wv9119/8dhjj9ldPvjgA1v7xo0bqVGjBp6envn2PETyWl68V0SKult535w8edK2fqVKlThx4kS+1iziCLfyXnE0DQvJRpMmTTItO3fuHHv27GHhwoUAhISEMHnyZM6fP4+vry+VK1dm0aJFWe7viy++4MSJEwwfPjwvyxbJd7n9XhEpDm7lfRMQE
MDJkycBOHHiBLVr187XmkUc4VbeK46mnuubEBkZSfny5XF2dgbA2dkZf39/IiMjr7vdhg0bePPNNzlz5gzjx4/n/Pnz+VGuiMPc6nsFYOLEiRw+fJjx48dz/PjxvC5VpMC40fvmvvvuY9u2bUybNo0rV64QFBTkyHJFHCYn/2Mc+b9EPdf5oG3btrRt29bRZYgUCmFhYYSFhTm6DJECx8PDg1dffdXRZYgUCo78X6Ke65sQEBDA6dOnSU5OBiA5OZmoqCgCAgIcXJlIwaL3isjN0/tGJGcK+ntF4fom+Pn5UadOHVauXAnAypUrqVOnToEY3yNSkOi9InLz9L4RyZmC/l6xWK1Wq6OLKIimTJnC2rVrOXv2LGXKlKF06dKsWrWKQ4cOMWrUKC5duoSPjw/Tp0+nevXqji5XxGH0XhG5eXrfiORMYXyvKFyLiIiIiOQSDQsREREREcklCtciIiIiIrlE4VpEREREJJcoXIuIiIiI5BKFaxERERGRXKJwLSIiIiKSSxSuRUSkUAkODub48eOOLkNEJEsK1yIiki+2bt1KmzZt/vF+fv31VwIDA3O0bq1atTh27Ng/fkwRkZxSuBYRyaGkpCRHlwAUnDryQlF+biJSPChci4hcR7t27Zg/fz5du3alUaNGJCUlsXPnTnr37k2TJk3o1q0bW7duta3/xRdf0L59e4KDg2nXrh1fffUVACkpKbz77ru0bduWli1bMmLECC5fvgxk3aPbrl07/ve//wEwe/ZshgwZwvDhw2ncuDFffvklFy9e5OWXX+buu++madOmPPPMM7ZtN2zYQGhoKE2aNKF3797s27cv2+d38OBB+vfvT7NmzbjrrruYN28eAAkJCbzyyivcfffd3H333bzyyiskJCTY1btgwQJatmzJ3XffzbJly2z73LRpE126dCE4OJjWrVvz4YcfEhsby6BBg4iKiiI4OJjg4GBOnz6d5XP7/fff6dWrF02aNOHuu+9m0qRJtscG+97oUaNGMXHiRJ544gmCg4N56KGH+OuvvwD4z3/+A0BoaCjBwcF8/fXXN/OjFxG5NVYREclW27Ztrd26dbOePHnSevXqVeupU6eszZo1s27cuNGanJxs/eGHH6zNmjWznjt3znrlyhVrcHCw9dChQ1ar1Wo9ffq09cCBA1ar1Wr97LPPrB06dLD+9ddf1piYGOvgwYOtw4cPt1qtVutPP/1kbd26dabH/fHHH61Wq9X69ttvW+vWrWtdt26dNTk52Xr16lXroEGDrEOHDrVevHjRmpCQYN26davVarVad+/ebW3RooV1586d1qSkJOsXX3xhbdu2rTU+Pj7Tc7t8+bK1VatW1g8//NAaFxdnvXz5snXnzp1Wq9VqffPNN60PPfSQ9ezZs9Zz585Ze/XqZZ01a5at3jp16ljffPNNa0JCgnXjxo3WoKAg68WLF61Wq9XaqlUr6/bt261Wq9V68eJF6+7du7N9nlk9t127dll//fVXa2JiovX48ePWzp07WxcuXGjbpmbNmtajR49arVardeTIkdamTZtaf/vtN2tiYqJ12LBh1ueffz7LdUVE8oN6rkVEbqBfv34EBATg4eFBREQEbdq04Z577sHJyYlWrVpRv359Nm3aBICTkxMHDx4kLi4Of39/7rjjDgBWrFjBY489RmBgIJ6engwbNoyvv/46x8MgGjVqRIcOHXBycuLSpUts3ryZiRMnUqpUKVxdXWnWrBkA//3vf+nVqxcNGzbE2dmZf/3rX7i6urJz585M+9y4cSNly5ZlwIABuLu74+XlRcOGDW31Dh48GD8/P3x9fRk8eLCtFx7AxcWFwYMH4+rqyj333EPJkiU5cuSIre3PP/8kJiaGUqVKUa9evRw/Nw8PD+rXr0+jRo1wcXGhUqVK9OrVi+3bt2e7fceOHQkKCsLFxYVu3bqxd+/eHL2mIiJ5wcXRBYiIFHQBAQG22ydPnmT16tVs2LDBtiwpKYnmzZtTsmRJZ
s2axYIFCxgzZgyNGzdm5MiR1KhRg6ioKG677TbbNrfddhtJSUmcO3cuRzVUqFDBdvvUqVOUKlWKUqVKZVrv5MmTLF++nPDwcNuyxMREoqKiMq0bGRlJ5cqVs3y8qKgoKlasaLtfsWJFu32ULl0aF5f0fyElSpQgNjYWgLfffpu5c+cyc+ZMatWqxYsvvkhwcHCOnhvAkSNHePXVV9m9ezdXr14lOTn5ugG9bNmyttseHh62OkREHEHhWkTkBiwWi+12QEAAoaGhTJkyJct1W7duTevWrYmLi+PNN99k3LhxLFmyBH9/f06cOGFb7+TJk7i4uODn58fp06eJi4uztSUnJ3P+/Plsa6hQoQLR0dFcunQJHx8fu/UCAgJ46qmnePrpp2/4vAICAli1alWWbf7+/pw8edLW8x4ZGYm/v/8N9wkQFBTE3LlzSUxM5JNPPuH5559n06ZNds8hu+cGMGHCBOrWrcvMmTPx8vJi0aJFrFmzJkePLSLiaBoWIiJyE7p168aGDRv4/vvvSU5OJj4+nq1bt3Lq1CnOnj3L+vXriY2Nxc3NjZIlS+Ls7AxASEgIH330EcePH+fKlSvMmjWL+++/HxcXF6pVq0Z8fDwbN24kMTGRuXPn2h3Ady1/f3/atGnDxIkTiY6OJjEx0TZs4qGHHmLp0qX89ttvWK1WYmNj2bhxIzExMZn2c++993L27FkWLVpEQkICMTEx/PbbbwA88MADzJ07l/Pnz3P+/HneeecdunbtesPXJyEhga+++orLly/j6uqKp6en7TXw8/Pj4sWLtgM5s3PlyhU8PT3x9PTk0KFD/N///d8NHzc7ZcuW1ZzYIpKvFK5FRG5CQEAA7777Lu+99x4tW7bknnvu4cMPPyQlJYWUlBQWLlxI69atadasGdu3bycsLAyAHj160K1bN/r27Uv79u1xc3Nj3LhxAHh7exMWFsbYsWNp06YNJUqUyDRU4lozZszAxcWF+++/n7vuuouPPvoIgAYNGjB58mQmTZpE06ZNue+++/jiiy+y3IeXlxcLFixgw4YNtGrVik6dOtlmPnnmmWeoX78+3bp1o1u3btSrV89uRpLriYiIoF27djRu3JilS5cyY8YMAGrUqMEDDzxAhw4daNKkCadPn85y+5EjR7Jy5UoaN27MuHHj6NKlS44eNyvPPvsso0aNokmTJpotRETyhcVqtVodXYSIiIiISFGgnmsRERERkVyicC0iIiIikksUrkVEREREconCtYiIiIhILlG4FhERERHJJQrXIiIiIiK5ROFaRERERCSXKFyLiIiIiOSS/wemq0hnq0BtUAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "ax.set_xscale('log')\n", @@ -962,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "91736ad0", "metadata": {}, "outputs": [], @@ -973,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": null, "id": "2cdeefee", "metadata": {}, "outputs": [], @@ -1017,39 +335,26 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": null, "id": "d6440018", "metadata": {}, "outputs": [], "source": [ "plan_names = [\n", - " 'plan-round_robin_lifo-always_process-5-100',\n", - " 'plan-weighted_round_robin_lifo-always_process-5-100',\n", - " 'plan-random_lifo-always_process-5-100',\n", - " 'plan-weighted_random_lifo-always_process-5-100' \n", + " 'plan-weighted_random_lifo-always_process-0.01-100',\n", + " 'plan-weighted_random_lifo-always_process-0.1-100' \n", "]" ] }, { "cell_type": "code", - "execution_count": 154, + "execution_count": null, "id": "523bf657", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "plan-round_robin_lifo-always_process-5-100\n", - "plan-weighted_round_robin_lifo-always_process-5-100\n", - "plan-random_lifo-always_process-5-100\n", - "plan-weighted_random_lifo-always_process-5-100\n" - ] - } - ], + "outputs": [], "source": [ "results = {}\n", - "end_ts = 1000\n", + "end_ts = 37000\n", "for plan_name in plan_names:\n", " print(plan_name)\n", " plan_file = f'{plan_dir}/{plan_name}.json'\n", @@ -1060,162 +365,78 @@ }, { "cell_type": "code", - "execution_count": 155, - "id": "d1958139", + "execution_count": null, + "id": "592a8b2e", + "metadata": {}, + "outputs": [], + "source": [ + "results[\"plan-weighted_random_lifo-always_process-0.1-100\"].sort_values(by=\"updates\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98907ae3", "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 155, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "results['plan-round_robin_lifo-always_process-5-100'].set_index(\"title\").sort_values(by=\"pageviews\").tail(10)[[\"updates\", \"optimal_updates\", \"pageviews\"]].plot(kind=\"bar\", title=\"Updates for Top Documents (Weighted Round Robin)\")" + "\n", + "df1 = results[\"plan-weighted_random_lifo-always_process-0.01-100\"]\n", + "df2 = results[\"plan-weighted_random_lifo-always_process-0.1-100\"]\n", + "for title in results[\"plan-weighted_random_lifo-always_process-0.01-100\"].title.tolist(): \n", + " u1 = df1[df1[\"title\"] == title].updates.tolist()\n", + " u2 = df2[df1[\"title\"] == title].updates.tolist()\n", + " if u1 != u2:\n", + " print(title)\n", + " print(u1)\n", + " print(u2)\n", + " \n", + " " ] }, { "cell_type": "code", - "execution_count": 156, + "execution_count": null, "id": "3ea99ccb", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 156, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results['plan-round_robin_lifo-always_process-5-100'].set_index(\"title\").sort_values(by=\"pageviews\").head(10)[[\"updates\", \"optimal_updates\", \"pageviews\"]].plot(kind=\"bar\", title=\"Updates for Least Queried Documents (Round Robin)\")" ] }, { "cell_type": "code", - "execution_count": 157, + "execution_count": null, "id": "999fc591", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 157, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"updates\", kind=\"hist\")" ] }, { "cell_type": "code", - "execution_count": 158, + "execution_count": null, "id": "00d43d3f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 158, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"updates\", kind=\"hist\")" ] }, { "cell_type": "code", - "execution_count": 159, + "execution_count": null, "id": "7af47144", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 159, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results['plan-round_robin_lifo-always_process-5-100'].plot(x=\"pageviews\", y=\"optimal_updates\", kind=\"hist\")" ] }, { "cell_type": "code", - "execution_count": 163, + "execution_count": null, "id": "4d55378e", "metadata": {}, "outputs": [], @@ -1225,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 208, + "execution_count": null, "id": "739fdc68", "metadata": {}, "outputs": [], @@ -1238,277 +459,17 @@ }, { "cell_type": "code", - "execution_count": 209, + "execution_count": null, "id": "c34cc7c0", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: 0,\n", - " 1: 1,\n", - " 2: 2,\n", - " 3: 3,\n", - " 4: 4,\n", - " 5: 5,\n", - " 6: 6,\n", - " 7: 7,\n", - " 8: 8,\n", - " 9: 9,\n", - " 10: 10,\n", - " 11: 11,\n", - " 12: 12,\n", - " 13: 13,\n", - " 14: 14,\n", - " 15: 15,\n", - " 16: 16,\n", - " 17: 17,\n", - " 18: 18,\n", - " 19: 19,\n", - " 20: 20,\n", - " 21: 21,\n", - " 22: 22,\n", - " 23: 23,\n", - " 24: 24,\n", - " 25: 25,\n", - " 26: 26,\n", - " 27: 27,\n", - " 28: 28,\n", - " 29: 29,\n", - " 30: 30,\n", - " 31: 31,\n", - " 32: 32,\n", - " 33: 33,\n", - " 34: 34,\n", - " 35: 35,\n", - " 36: 36,\n", - " 37: 37,\n", - " 38: 38,\n", - " 39: 39,\n", - " 40: 40,\n", - " 41: 41,\n", - " 42: 42,\n", - " 43: 43,\n", - " 44: 44,\n", - " 45: 45,\n", - " 46: 46,\n", - " 47: 47,\n", - " 48: 48,\n", - " 49: 49,\n", - " 50: 50,\n", - " 51: 51,\n", - " 52: 52,\n", - " 53: 53,\n", - " 54: 54,\n", - " 55: 55,\n", - " 56: 56,\n", - " 57: 57,\n", - " 58: 58,\n", - " 59: 59,\n", - " 60: 60,\n", - " 61: 61,\n", - " 62: 62,\n", - " 63: 63,\n", - " 64: 64,\n", - " 65: 65,\n", - " 66: 66,\n", - " 67: 67,\n", - " 68: 68,\n", - " 69: 69,\n", - " 70: 70,\n", - " 71: 71,\n", - " 72: 72,\n", - " 73: 73,\n", - " 74: 74,\n", - " 75: 75,\n", - " 76: 76,\n", - " 77: 77,\n", - " 78: 78,\n", - " 79: 79,\n", - " 80: 80,\n", - " 81: 81,\n", - " 82: 82,\n", - " 83: 
83,\n", - " 84: 84,\n", - " 85: 85,\n", - " 86: 86,\n", - " 87: 87,\n", - " 88: 88,\n", - " 89: 89,\n", - " 90: 90,\n", - " 91: 91,\n", - " 92: 92,\n", - " 93: 93,\n", - " 94: 94,\n", - " 95: 95,\n", - " 96: 96,\n", - " 97: 97,\n", - " 98: 98,\n", - " 99: 99,\n", - " 100: 100,\n", - " 101: 101,\n", - " 102: 102,\n", - " 103: 103,\n", - " 104: 104,\n", - " 105: 105,\n", - " 106: 106,\n", - " 107: 107,\n", - " 108: 108,\n", - " 109: 109,\n", - " 110: 110,\n", - " 111: 111,\n", - " 112: 112,\n", - " 113: 113,\n", - " 114: 114,\n", - " 115: 115,\n", - " 116: 116,\n", - " 117: 117,\n", - " 118: 118,\n", - " 119: 119,\n", - " 120: 120,\n", - " 121: 121,\n", - " 122: 122,\n", - " 123: 123,\n", - " 124: 124,\n", - " 125: 125,\n", - " 126: 126,\n", - " 127: 127,\n", - " 128: 128,\n", - " 129: 129,\n", - " 130: 130,\n", - " 131: 131,\n", - " 132: 132,\n", - " 133: 133,\n", - " 134: 134,\n", - " 135: 135,\n", - " 136: 136,\n", - " 137: 137,\n", - " 138: 138,\n", - " 139: 139,\n", - " 140: 140,\n", - " 141: 141,\n", - " 142: 142,\n", - " 143: 143,\n", - " 144: 144,\n", - " 145: 145,\n", - " 146: 146,\n", - " 147: 147,\n", - " 148: 148,\n", - " 149: 149,\n", - " 150: 150,\n", - " 151: 151,\n", - " 152: 152,\n", - " 153: 153,\n", - " 154: 154,\n", - " 155: 155,\n", - " 156: 156,\n", - " 157: 157,\n", - " 158: 158,\n", - " 159: 159,\n", - " 160: 160,\n", - " 161: 161,\n", - " 162: 162,\n", - " 163: 163,\n", - " 164: 164,\n", - " 165: 165,\n", - " 166: 166,\n", - " 167: 167,\n", - " 168: 168,\n", - " 169: 169,\n", - " 170: 170,\n", - " 171: 171,\n", - " 172: 172,\n", - " 173: 173,\n", - " 174: 174,\n", - " 175: 175,\n", - " 176: 176,\n", - " 177: 177,\n", - " 178: 178,\n", - " 179: 179,\n", - " 180: 180,\n", - " 181: 181,\n", - " 182: 182,\n", - " 183: 183,\n", - " 184: 184,\n", - " 185: 185,\n", - " 186: 186,\n", - " 187: 187,\n", - " 188: 188,\n", - " 189: 189,\n", - " 190: 190,\n", - " 191: 191,\n", - " 192: 192,\n", - " 193: 193,\n", - " 194: 194,\n", - " 195: 195,\n", - " 
196: 196,\n", - " 197: 197,\n", - " 198: 198,\n", - " 199: 199,\n", - " 200: 200,\n", - " 201: 201,\n", - " 202: 202,\n", - " 203: 203,\n", - " 204: 204,\n", - " 205: 205,\n", - " 206: 206,\n", - " 207: 207,\n", - " 208: 208,\n", - " 209: 209,\n", - " 210: 210,\n", - " 211: 211,\n", - " 212: 212,\n", - " 213: 213,\n", - " 214: 214,\n", - " 215: 215,\n", - " 216: 216,\n", - " 217: 217,\n", - " 218: 218,\n", - " 219: 219,\n", - " 220: 220,\n", - " 221: 221,\n", - " 222: 222,\n", - " 223: 223,\n", - " 224: 224,\n", - " 225: 225,\n", - " 226: 226,\n", - " 227: 227,\n", - " 228: 228,\n", - " 229: 229,\n", - " 230: 230,\n", - " 231: 231,\n", - " 232: 232,\n", - " 233: 233,\n", - " 234: 234,\n", - " 235: 235,\n", - " 236: 236,\n", - " 237: 237,\n", - " 238: 238,\n", - " 239: 239,\n", - " 240: 240,\n", - " 241: 241,\n", - " 242: 242,\n", - " 243: 243,\n", - " 244: 244,\n", - " 245: 245,\n", - " 246: 246,\n", - " 247: 247,\n", - " 248: 248,\n", - " 249: 249}" - ] - }, - "execution_count": 209, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "n_fits_map" ] }, { "cell_type": "code", - "execution_count": 195, + "execution_count": null, "id": "c768b43d", "metadata": {}, "outputs": [], @@ -1520,7 +481,7 @@ }, { "cell_type": "code", - "execution_count": 200, + "execution_count": null, "id": "fc803fd4", "metadata": {}, "outputs": [], @@ -1530,21 +491,10 @@ }, { "cell_type": "code", - "execution_count": 211, + "execution_count": null, "id": "cfe761f5", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "fig = plt.figure(figsize=(12, 12))\n", "for i, plan_name in enumerate(results.keys()):\n", @@ -1567,297 +517,59 @@ }, { "cell_type": "code", - "execution_count": 318, + "execution_count": null, "id": "ac3582ce", "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(\"/data/wooders/wikipedia/10042021_questions_revid_filtered.csv\", sep=\"\\t\")\n", - "df.columns = [\"question\", \"answer\", \"doc_id\", \"timestamp\", \"revid\", \"oldrevid\"]" + "df = pd.read_csv(\"/data/wooders/wikipedia/questions.csv\")\n", + "#df.columns = [\"question\", \"answer\", \"doc_id\", \"timestamp\", \"revid\", \"oldrevid\"]" ] }, { "cell_type": "code", - "execution_count": 319, - "id": "e7485904", + "execution_count": null, + "id": "ce16ddda", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionanswerdoc_idtimestamprevidoldrevid
0what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:16:27.42857210372125321037212489
1what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:32:54.85714410372125321037212489
2what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:49:22.28571610372125321037212489
3what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:05:49.71428810372125321037212489
4what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:22:17.14286010372125321037212489
.....................
127727who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 20:46:09.23070010416509361041650818
127728who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 21:30:27.69223610416509361041650818
127729who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:14:46.15377210416509361041650818
127730who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:59:04.61530810416509361041650818
127731who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 23:43:23.07684410416509361041650818
\n", - "

127732 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 what is the most common death in 2021??????? \n", - "1 what is the most common death in 2021??????? \n", - "2 what is the most common death in 2021??????? \n", - "3 what is the most common death in 2021??????? \n", - "4 what is the most common death in 2021??????? \n", - "... ... \n", - "127727 who is the ayo?????? \n", - "127728 who is the ayo?????? \n", - "127729 who is the ayo?????? \n", - "127730 who is the ayo?????? \n", - "127731 who is the ayo?????? \n", - "\n", - " answer doc_id \\\n", - "0 A typical entry reports information in the fol... 65984422 \n", - "1 A typical entry reports information in the fol... 65984422 \n", - "2 A typical entry reports information in the fol... 65984422 \n", - "3 A typical entry reports information in the fol... 65984422 \n", - "4 A typical entry reports information in the fol... 65984422 \n", - "... ... ... \n", - "127727 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", - "127728 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", - "127729 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", - "127730 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", - "127731 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", - "\n", - " timestamp revid oldrevid \n", - "0 2021-08-06 00:16:27.428572 1037212532 1037212489 \n", - "1 2021-08-06 00:32:54.857144 1037212532 1037212489 \n", - "2 2021-08-06 00:49:22.285716 1037212532 1037212489 \n", - "3 2021-08-06 01:05:49.714288 1037212532 1037212489 \n", - "4 2021-08-06 01:22:17.142860 1037212532 1037212489 \n", - "... ... ... ... 
\n", - "127727 2021-09-01 20:46:09.230700 1041650936 1041650818 \n", - "127728 2021-09-01 21:30:27.692236 1041650936 1041650818 \n", - "127729 2021-09-01 22:14:46.153772 1041650936 1041650818 \n", - "127730 2021-09-01 22:59:04.615308 1041650936 1041650818 \n", - "127731 2021-09-01 23:43:23.076844 1041650936 1041650818 \n", - "\n", - "[127732 rows x 6 columns]" - ] - }, - "execution_count": 319, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": 320, - "id": "2d5a778a", + "execution_count": null, + "id": "07d5672a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "332667 32284\n", - "1305297 10610\n", - "66304621 5330\n", - "17888363 3900\n", - "67089631 3621\n", - "Name: doc_id, dtype: int64" - ] - }, - "execution_count": 320, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df.doc_id.value_counts().head()" + "df.doc_id.value_counts()" ] }, { "cell_type": "code", - "execution_count": 308, - "id": "f7546f77", + "execution_count": null, + "id": "d9dab1e5", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "60043578 590\n", - "51150040 510\n", - "68187748 470\n", - "66187257 450\n", - "64783122 370\n", - "Name: doc_id, dtype: int64" - ] - }, - "execution_count": 308, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df.doc_id.value_counts().tail()" + "df.question.value_counts()" ] }, { "cell_type": "code", - "execution_count": 309, - "id": "5823be22", + "execution_count": null, + "id": "ea220f3d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "332667 5090\n", - "65984422 3769\n", - "68553225 1740\n", - "57798785 1410\n", - "56185392 1360\n", - "68294454 1200\n", - "66293350 1040\n", - "57817558 930\n", - "60043578 590\n", - "51150040 510\n", - "68187748 470\n", - "66187257 450\n", - "64783122 370\n", - "Name: doc_id, dtype: int64" - ] 
- }, - "execution_count": 309, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.doc_id.value_counts()" ] }, { "cell_type": "code", - "execution_count": 310, - "id": "a8154736", + "execution_count": null, + "id": "86c3ebf2", "metadata": {}, "outputs": [], "source": [ @@ -1866,33 +578,23 @@ }, { "cell_type": "code", - "execution_count": 311, - "id": "48960f20", + "execution_count": null, + "id": "af72ae1a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{332667: 5090,\n", - " 65984422: 3769,\n", - " 68553225: 1740,\n", - " 57798785: 1410,\n", - " 56185392: 1360,\n", - " 68294454: 1200,\n", - " 66293350: 1040,\n", - " 57817558: 930,\n", - " 60043578: 590,\n", - " 51150040: 510,\n", - " 68187748: 470,\n", - " 66187257: 450,\n", - " 64783122: 370}" - ] - }, - "execution_count": 311, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [ + "for key in weights: \n", + " weights[key] = int(weights[key]/10)\n", + " if weights[key] == 0: \n", + " weights[key] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "500bd6ec", + "metadata": {}, + "outputs": [], "source": [ "weights" ] @@ -1900,12 +602,20 @@ { "cell_type": "code", "execution_count": null, - "id": "2ade5579", + "id": "db8ae3c9", "metadata": {}, "outputs": [], "source": [ - "we" + "open(\"/home/eecs/wooders/experiments/wikipedia/weights.json\", \"w\").write(json.dumps(weights))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0901f729", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/wikipedia/preprocessing/log_data.py b/wikipedia/preprocessing/log_data.py new file mode 100644 index 0000000..2e8cca4 --- /dev/null +++ b/wikipedia/preprocessing/log_data.py @@ -0,0 +1,72 @@ +import wandb +import configparser +import os + + +def log_questions(run, config): + # log questions file + artifact = wandb.Artifact("questions", type='dataset') + 
artifact.add_file(config["files"]["raw_questions_file"]) + artifact.add_file(config["files"]["questions_file"]) + run.log_artifact(artifact) + +def log_files(run, config): + # log files + artifact = wandb.Artifact("files", type='dataset') + artifact.add_file(config["files"]["changes_file"]) + artifact.add_file(config["files"]["titles_file"]) + artifact.add_file(config["files"]["edits_file"]) + run.log_artifact(artifact) + +def log_pageview(run, config): + # log pageview + artifact = wandb.Artifact("pageviews", type='dataset') + artifact.add_file(config["files"]["raw_pageview_file"]) + artifact.add_file(config["files"]["pageview_file"]) + artifact.add_file(config["files"]["timestamp_weights_file"]) + run.log_artifact(artifact) + +def log_simulation(run, config): + # log simulation data + artifact = wandb.Artifact("simulation", type='dataset') + artifact.add_file(config["simulation"]["stream_edits_file"]) + artifact.add_file(config["simulation"]["stream_questions_file"]) + artifact.add_file(config["simulation"]["init_data_file"]) + run.log_artifact(artifact) + +def log_plans(run, config, plan_dir): + artifact = wandb.Artifact("plans", type='dataset') + artifact.add_file(config["simulation"]["optimal_plan_file"]) + artifact.add_dir(plan_dir) + run.log_artifact(artifact) + +def log_plan_data(run, config, plan_name, plan_path): + artifact = wandb.Artifact(plan_name, type='dataset') + artifact.add_folder(plan_path) + run.log_artifact + + +def log_experiment(run, config): + # log experiment output + artifact = wandb.Artifact("prediction_results", type='dataset') + files = os.listdir(config["directory"]["dpr_dir"]) + for filename in files: + if "plan-" in filename and '.json' in filename: + artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + run.log_artifact(artifact) + +if __name__ == "__main__": + + print("Running wandb logging on data") + run = wandb.init(job_type="dataset-creation", project="wiki-workload") + + # configuration file + config = 
configparser.ConfigParser() + config.read("config.yml") + + log_questions(run, config) + log_files(run, config) + log_pageview(run, config) + log_simulation(run, config) + log_experiment(run, config) + diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index debb1e6..0334ade 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,8 +1,8 @@ set -xe -for key_policy in "random" "weighted_random" "round_robin" "weighted_round_robin" +for key_policy in "weighted_random" "weighted_round_robin" do - for event_policy in "fifo" "lifo" + for event_policy in "lifo" do for load_shedding_policy in "always_process" do diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh index ff2b74e..ceaf249 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -10,7 +10,7 @@ do do for model_runtime in 0.01 0.05 0.1 1 5 10 do - python wiki_eval.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json + python wiki_eval.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json --wandb done done done diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh index 6655996..3f0713b 100644 --- a/wikipedia/run_3_run_predictions.sh +++ b/wikipedia/run_3_run_predictions.sh @@ -5,7 +5,7 @@ dpr_dir=~/DPR cd $dpr_dir -for key_policy in "round_robin" "weighted_round_robin" +for key_policy in "weighted_round_robin" #"round_robin" #for key_policy in "random" "weighted_random" do for event_policy in "lifo" @@ -16,10 +16,10 @@ do do plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 echo $plan_file - CUDA_VISIBLE_DEVICES=1,2,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + CUDA_VISIBLE_DEVICES=3,4,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & pid=$! 
done - #wait $pid + wait $pid done done done diff --git a/wikipedia/run_4_run_optimal_predictions.sh b/wikipedia/run_4_run_optimal_predictions.sh index 542d828..14abe91 100644 --- a/wikipedia/run_4_run_optimal_predictions.sh +++ b/wikipedia/run_4_run_optimal_predictions.sh @@ -2,7 +2,7 @@ set -xe plan_dir=/data/wooders/wiki-plans dpr_dir=~/DPR +python wiki_eval.py --offline-plan-path optimal_plan.json cd $dpr_dir -plan_file="optimal_plan" echo $plan_file -CUDA_VISIBLE_DEVICES=5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file +CUDA_VISIBLE_DEVICES=5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh optimal_plan diff --git a/wikipedia/run_5_pipeline_predict.sh b/wikipedia/run_5_pipeline_predict.sh new file mode 100644 index 0000000..f6aee15 --- /dev/null +++ b/wikipedia/run_5_pipeline_predict.sh @@ -0,0 +1,30 @@ +set -xe + +plan_dir=/data/wooders/wiki-plans +dpr_dir=/home/eecs/wooders/DPR +wiki_dir=/home/eecs/wooders/experiments/wikipedia + + +#for key_policy in "weighted_random" "weighted_round_robin" +#for key_policy in "random" "weighted_random" +for key_policy in "round_robin" "weighted_round_robin" +do + for event_policy in "lifo" + do + for load_shedding_policy in "always_process" + do + for model_runtime in 0.01 0.05 0.1 1 5 + do + cd $wiki_dir + plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 + echo $plan_file + python wiki_eval.py --offline-plan-path ${plan_dir}/${plan_file}.json + cd $dpr_dir + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + pid=$! 
+ done + #wait $pid + done + done +done +p diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 5162e99..5dba7d1 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -83,16 +83,24 @@ def __init__(self, pageview_file, all_keys): self.cur_key_set = [] self.cur_key_iter = None pageview_df = pd.read_csv(pageview_file) - #self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() - self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() - self.weights = {} - for key in self.raw_weights.keys(): - if str(key) not in all_keys: - continue - self.weights[key] = int(self.raw_weights[key]*1000) - #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" - if self.weights[key] == 0: + self.weights = json.load(open("weights.json")) + + ##self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() + #self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() + #self.weights = {} + #for key in self.raw_weights.keys(): + # if str(key) not in all_keys: + # continue + + # self.weights[key] = int(self.raw_weights[key]*1000) + # #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + # if self.weights[key] == 0: + # self.weights[key] = 1 + + + for key in all_keys: + if key not in self.weights: self.weights[key] = 1 @@ -175,7 +183,8 @@ class WeightedLoadBalancer(CrossKeyLoadBalancer): def __init__(self, pageview_file): pageview_df = pd.read_csv(pageview_file) - self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() + #self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() + self.weights = json.load(open("weights.json")) def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: chosen_key = None @@ -338,11 +347,22 @@ def run(self, replica_id: int): config = configparser.ConfigParser() config.read("config.yml") plan_dir = config["simulation"]["plan_dir"] -init_data_file = config["simulation"]["init_data_file"] 
-stream_edits_file = config["simulation"]["stream_edits_file"] -stream_questions_file = config["simulation"]["stream_questions_file"] -pageview_file = config["files"]["pageview_file"] -timestamp_weights_file = config["files"]["timestamp_weights_file"] +#init_data_file = config["simulation"]["init_data_file"] +#stream_edits_file = config["simulation"]["stream_edits_file"] +#stream_questions_file = config["simulation"]["stream_questions_file"] +#pageview_file = config["files"]["pageview_file"] +#timestamp_weights_file = config["files"]["timestamp_weights_file"] + +run = wandb.init(job_type="dataset-creation", project="wiki-workload") +question_dir = run.use_artifact('ucb-ralf/wiki-workload /questions:v2', type='dataset').download() +simulation_dir = run.use_artifact('ucb-ralf/wiki-workload /simulation:v2', type='dataset').download() +pageview_dir = run.use_artifact('ucb-ralf/wiki-workload /pageviews:v0', type='dataset').download() + +init_data_file = f"{simulation_dir}/init_data.json" +stream_edits_file = f"{simulation_dir}/edit_stream.json" +stream_questions_file = f"{simulation_dir}/question_stream.json" +pageview_file = f"{pageview_dir}/pageviews.csv" +timestamp_weights_file = f"{pageview_dir}/timestamp_weights_file.json" # load simulation data edits = json.load(open(stream_edits_file)) @@ -429,7 +449,8 @@ def run_once( parser.add_argument("--load_shedding_policy", type=str) args = parser.parse_args() - out_path = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}.json" + plan_name = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}" + out_path = f"{plan_name}.json" print(out_path) run_once( out_path=out_path, @@ -441,8 +462,7 @@ def run_once( model_runtime_constant=args.model_runtime, key_selection_policy=args.key_policy, ) - run = wandb.init(job_type="dataset-creation", project="wiki-workload") - log_plans(run, config, 
out_path) + log_plans(run, config, plan_dir) # load sheding: random, drop short edits diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py index 1214cfb..048a97b 100644 --- a/wikipedia/wiki_eval.py +++ b/wikipedia/wiki_eval.py @@ -31,6 +31,7 @@ ) from dpr.utils.data_utils import Tensorizer +from preprocessing.log_data import log_plan_data """ @@ -47,14 +48,21 @@ """ # simulation data +import wandb +run = wandb.init(project='wiki-workload', job_type="dataset-creation") +simulation_dir = run.use_artifact('ucb-ralf/wiki-workload /simulation:v2', type='dataset').download() +question_dir = run.use_artifact('ucb-ralf/wiki-workload /questions:v2', type='dataset').download() + +init_data_file = f"{simulation_dir}/init_data.json" +stream_edits_file = f"{simulation_dir}/edit_stream.json" +stream_questions_file = f"{simulation_dir}/question_stream.json" + config = configparser.ConfigParser() config.read("config.yml") -plan_dir = config["simulation"]["plan_dir"] -init_data_file = config["simulation"]["init_data_file"] -stream_edits_file = config["simulation"]["stream_edits_file"] -stream_questions_file = config["simulation"]["stream_questions_file"] - -data_dir = config['files']['data_dir'] +#plan_dir = config["simulation"]["plan_dir"] +#init_data_file = config["simulation"]["init_data_file"] +#stream_edits_file = config["simulation"]["stream_edits_file"] +#stream_questions_file = config["simulation"]["stream_questions_file"] rev_dir = config['directory']['diff_dir'] embedding_dir = config['directory']['embedding_dir'] exp_dir = config['directory']['exp_dir'] @@ -64,87 +72,10 @@ parser = argparse.ArgumentParser(description="Specify experiment config") parser.add_argument("--offline-plan-path", type=str) parser.add_argument("--embed", default=False, action="store_true") +parser.add_argument("--wandb", default=False, action="store_true") args = parser.parse_args() - -class Retriever: - def __init__(self): - - # parser = argparse.ArgumentParser(description="") - 
add_encoder_params(parser) - add_tokenizer_params(parser) - add_cuda_params(parser) - args = parser.parse_args() - - setup_args_gpu(args) - - saved_state = load_states_from_checkpoint(model_file) - set_encoder_params_from_state(saved_state.encoder_params, args) - - self.tensorizer, self.encoder, _ = init_biencoder_components( - args.encoder_model_type, args, inference_only=True - ) - - self.encoder = self.encoder.ctx_model - - self.encoder, _ = setup_for_distributed_mode( - self.encoder, - None, - args.device, - args.n_gpu, - args.local_rank, - args.fp16, - args.fp16_opt_level, - ) - self.encoder.eval() - - model_to_load = get_model_obj(self.encoder) - - prefix_len = len("ctx_model.") - ctx_state = { - key[prefix_len:]: value - for (key, value) in saved_state.model_dict.items() - if key.startswith("ctx_model.") - } - model_to_load.load_state_dict(ctx_state) - self.device = args.device - - def predict(self, text): - - st = time.time() - batch_token_tensors = [self.tensorizer.text_to_tensor(text)] - - ctx_ids_batch = move_to_device( - torch.stack(batch_token_tensors, dim=0), self.device - ) - ctx_seg_batch = move_to_device(torch.zeros_like(ctx_ids_batch), self.device) - ctx_attn_mask = move_to_device( - self.tensorizer.get_attn_mask(ctx_ids_batch), self.device - ) - with torch.no_grad(): - _, embedding, _ = self.encoder(ctx_ids_batch, ctx_seg_batch, ctx_attn_mask) - embedding = embedding.cpu().numpy() - return embedding - - -def assign_timestamps_min(ts): - # take in unix timestamp - covert to integer - start_ts = 1628131044000000000 # don't change - delta = ts - start_ts - if delta < 0: - return None - - return int(delta / (60 * 1000000000)) - - -def embed_passages(sents, retriever_model, num_sent_in_pass=10): - passages = [] - embeddings = [] - for i in range(0, len(sents), num_sent_in_pass): - passages.append(" ".join(sents[i : i + num_sent_in_pass])) - embeddings.append(retriever_model.predict(passages[-1])) - return passages, embeddings - 
+run.config.update(vars(args)) def sents_to_passages(sents, num_sent_in_pass=10): passages = [] @@ -164,13 +95,10 @@ def offline_eval(plan_json_path, exp_id, compute_embeddings=True): keys = ["51150040"] filter_keys = False - # retriever_model = Retriever() - # print("Created retriever") # compute initial passage embeddings for each document init_data = json.load(open(init_data_file)) init_state = {} - staleness = [] for key in tqdm(init_data.keys()): if filter_keys and key not in keys: @@ -255,31 +183,36 @@ def offline_eval(plan_json_path, exp_id, compute_embeddings=True): print("EMBED", embed_versions.keys()) print("Num refits", count, len(missing)) - # returns latest version of document embeddings for timestep/key - def get_latest_embedding(timestep, doc_id): - - latest = 0 - for version in embed_versions.keys(): - version = float(version) - if ( - float(timestep) >= version - and version > latest - and doc_id in embed_versions[str(version)] - ): - latest = version - #print(doc_id, "latest", timestep, latest, timestep - latest) - assert ( - doc_id in embed_versions[str(latest)] - ), f"Missing doc id {doc_id} {latest} {doc_id in init_data}" - doc_version = embed_versions[str(latest)][doc_id] - assert latest <= timestep - return ( - doc_version["passages"], - doc_version["embeddings"], - doc_version["rev"], - latest, - ) - + embed_filename = "embed_versions.pkl" + pickle.dump(embed_versions, open(embed_filename, "wb")) + return embed_filename + +# returns latest version of document embeddings for timestep/key +def get_latest_embedding(timestep, doc_id, embed_versions): + + latest = 0 + for version in embed_versions.keys(): + version = float(version) + if ( + float(timestep) >= version + and version > latest + and doc_id in embed_versions[str(version)] + ): + latest = version + #print(doc_id, "latest", timestep, latest, timestep - latest) + assert ( + doc_id in embed_versions[str(latest)] + ), f"Missing doc id {doc_id} {latest} {doc_id in init_data}" + doc_version 
= embed_versions[str(latest)][doc_id] + assert latest <= timestep + return ( + doc_version["passages"], + doc_version["embeddings"], + doc_version["rev"], + latest, + ) + +def generate_question_data_all(exp_id, embed_filename): # create experiment directory directory = os.path.join(exp_dir, exp_id) if os.path.isdir(directory): @@ -290,16 +223,35 @@ def get_latest_embedding(timestep, doc_id): # get simulation data questions questions = json.load(open(stream_questions_file)) + + for ts in range(len(questions)): + questions[ts]["ts"] = ts + print("processing questions", len(questions)) print("directory", directory) - # Get embedding version for each query, write outputs - for ts in tqdm(range(len(questions))): - timestep = ts / 100 # TODO: Watch out!! can change and mess up experiment - # print(ts, timestep) + chunk_size = 1000 + chunks = [(questions[i:i+chunk_size], embed_filename, directory) for i in range(0, len(questions), chunk_size)] + p = Pool(128) + staleness_all = p.starmap(generate_question_data, chunks) + p.close() + staleness_all = [item for sublist in staleness_all for item in sublist] + staleness = np.array(staleness_all).mean() + print("all staleness", staleness) + wandb.log({"staleness": staleness}) + return directory + - for doc_id in questions[ts].keys(): +def generate_question_data(questions, embed_filename, directory): + embed_versions = pickle.load(open(embed_filename, "rb")) + init_data = json.load(open(init_data_file)) + staleness = [] + for ts_questions in questions: + ts = ts_questions["ts"] + timestep = ts / 100 # TODO: Watch out!! 
can change and mess up experiment + for doc_id in ts_questions.keys(): + if doc_id == "ts": continue # not considered in edits if doc_id not in init_data: print("missing", doc_id) @@ -308,11 +260,11 @@ def get_latest_embedding(timestep, doc_id): # get current embedding and write passage_texts, passage_embeddings, version, latest = get_latest_embedding( - timestep, doc_id + timestep, doc_id, embed_versions ) # loop through questions - doc_questions = questions[ts][doc_id] + doc_questions = ts_questions[doc_id] queries = [] for q in doc_questions: question = q["question"] @@ -326,7 +278,6 @@ def get_latest_embedding(timestep, doc_id): queries.append([question, [answer], doc_id]) # append per query - print("staleness", timestep - latest) staleness.append(timestep - latest) # dump CTX/question script @@ -352,11 +303,7 @@ def get_latest_embedding(timestep, doc_id): assert len(passage_ctx) == len(passage_texts) assert len(passage_embeddings) == len(passage_texts) - - print("done processing queries!", len(questions)) - print("staleness", np.array(staleness).mean()) - return directory - + return staleness def main(): @@ -365,14 +312,14 @@ def main(): ) # "wiki-plans/plan-fifo-always_process-1-0.001-60.json" exp_id = os.path.basename(plan_file).replace(".json", "") - output_dir = offline_eval(plan_file, exp_id, compute_embeddings=args.embed) - log_wandb = False - if log_wandb: - import wandb + #embed_filename = offline_eval(plan_file, exp_id, compute_embeddings=args.embed) - run = wandb.init(job_type="create_simulation_output") - artifact = wandb.Artifact(exp_id, type="dataset") - artifact.add_folder(output_dir) + embed_filename = "embed_versions.pkl" + generate_question_data_all(exp_id, embed_filename) + if args.wandb: + import wandb + run = wandb.init(job_type="dataset-creation", project="wiki-workload") + log_plan_data(run, config, exp_id, output_dir) if __name__ == "__main__": From 7422f944e97410e918918f25e289402b3e107bf0 Mon Sep 17 00:00:00 2001 From: Sarah Wooders 
Date: Tue, 12 Oct 2021 18:01:08 -0700 Subject: [PATCH 14/26] add benchmark script to get realistic inference numbers --- wikipedia/benchmark_bert.py | 7 + wikipedia/notebooks/Wikipedia Plots.ipynb | 703 +++++++++++++++++++++- wikipedia/run_1_generate_plan.sh | 2 +- wikipedia/wiki_eval.py | 13 +- 4 files changed, 691 insertions(+), 34 deletions(-) create mode 100644 wikipedia/benchmark_bert.py diff --git a/wikipedia/benchmark_bert.py b/wikipedia/benchmark_bert.py new file mode 100644 index 0000000..fb9f602 --- /dev/null +++ b/wikipedia/benchmark_bert.py @@ -0,0 +1,7 @@ +from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments +from pprint import pprint + +args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[1], sequence_lengths=[100]) +benchmark = PyTorchBenchmark(args) +results = benchmark.run() +pprint(results) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 08f0f5a..0ef7d3e 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 410, "id": "e0030940", "metadata": {}, "outputs": [], @@ -24,10 +24,113 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 411, "id": "016e13bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:3mmrfkbb) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Waiting for W&B process to finish, PID 74125... (success)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value=' 0.41MB of 0.41MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced woven-sea-93: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/3mmrfkbb
\n", + "Find logs at: ./wandb/run-20211012_125219-3mmrfkbb/logs
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:3mmrfkbb). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Syncing run trim-microwave-115 to Weights & Biases (docs).
\n", + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact questions:latest, 60.97MB. 2 files... Done. 0:0:0\n" + ] + } + ], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "pageview_dir = run.use_artifact('pageviews:latest').download()\n", @@ -36,10 +139,385 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 412, "id": "7690f6d7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titleedit_count2021080500202108060020210807002021080800202108090020210810002021081100...20210828002021082900202108300020210831002021090100202109020020210903002021090400weightsdoc_id
00Deaths in 20211877383536313496656...69506368505239460.02851165984422
112021 Atlantic hurricane season14381151689714...820285121150.00380557798785
22Neeraj Chopra11563732434...560492130.00217051150040
33Fall of Kabul (2021)10091891212161012...11169920155100.00487668481047
44Great Britain at the 2020 Summer Paralympics989135641689...3868107470.00339760043578
..................................................................
211211List of fungi of South Africa203897132149...10761135560.00346768354495
212212Mister Supranational 2021203897132149...10761135560.00346767918135
2132132021–22 FC Barcelona season20219292927282723...21262916272043180.01269867089631
214214Hamid Karzai International Airport20114261517261417...1910251326142270.007258487602
215215Characters of the Marvel Cinematic Universe20114261517261417...1910251326142270.00725862372638
\n", + "

216 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title edit_count \\\n", + "0 0 Deaths in 2021 1877 \n", + "1 1 2021 Atlantic hurricane season 1438 \n", + "2 2 Neeraj Chopra 1156 \n", + "3 3 Fall of Kabul (2021) 1009 \n", + "4 4 Great Britain at the 2020 Summer Paralympics 989 \n", + ".. ... ... ... \n", + "211 211 List of fungi of South Africa 203 \n", + "212 212 Mister Supranational 2021 203 \n", + "213 213 2021–22 FC Barcelona season 202 \n", + "214 214 Hamid Karzai International Airport 201 \n", + "215 215 Characters of the Marvel Cinematic Universe 201 \n", + "\n", + " 2021080500 2021080600 2021080700 2021080800 2021080900 2021081000 \\\n", + "0 38 35 36 31 349 66 \n", + "1 11 5 16 8 9 7 \n", + "2 3 7 3 2 4 3 \n", + "3 18 9 12 12 16 10 \n", + "4 13 5 6 4 16 8 \n", + ".. ... ... ... ... ... ... \n", + "211 8 9 7 13 21 4 \n", + "212 8 9 7 13 21 4 \n", + "213 19 29 29 27 28 27 \n", + "214 14 26 15 17 26 14 \n", + "215 14 26 15 17 26 14 \n", + "\n", + " 2021081100 ... 2021082800 2021082900 2021083000 2021083100 \\\n", + "0 56 ... 69 50 63 68 \n", + "1 14 ... 8 20 2 8 \n", + "2 4 ... 5 6 0 4 \n", + "3 12 ... 11 16 9 9 \n", + "4 9 ... 3 8 6 8 \n", + ".. ... ... ... ... ... ... \n", + "211 9 ... 10 7 6 1 \n", + "212 9 ... 10 7 6 1 \n", + "213 23 ... 21 26 29 16 \n", + "214 17 ... 19 10 25 13 \n", + "215 17 ... 19 10 25 13 \n", + "\n", + " 2021090100 2021090200 2021090300 2021090400 weights doc_id \n", + "0 50 52 39 46 0.028511 65984422 \n", + "1 5 12 11 5 0.003805 57798785 \n", + "2 9 2 1 3 0.002170 51150040 \n", + "3 20 15 5 10 0.004876 68481047 \n", + "4 10 7 4 7 0.003397 60043578 \n", + ".. ... ... ... ... ... ... 
\n", + "211 13 5 5 6 0.003467 68354495 \n", + "212 13 5 5 6 0.003467 67918135 \n", + "213 27 20 43 18 0.012698 67089631 \n", + "214 26 14 22 7 0.007258 487602 \n", + "215 26 14 22 7 0.007258 62372638 \n", + "\n", + "[216 rows x 36 columns]" + ] + }, + "execution_count": 412, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pageview_df = pd.read_csv(f\"{pageview_dir}/pageviews.csv\")\n", "pageview_df" @@ -47,10 +525,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 413, "id": "5b5d1edc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 413, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df = pd.DataFrame({\n", " \"edit_frequency\": pageview_df.edit_count / pageview_df.edit_count.sum(),\n", @@ -70,10 +569,106 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 414, "id": "39b1975e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:1t9orwj8) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Waiting for W&B process to finish, PID 14934... (success)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced trim-microwave-115: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/1t9orwj8
\n", + "Find logs at: ./wandb/run-20211012_160227-1t9orwj8/logs
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:1t9orwj8). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Syncing run fluent-mountain-116 to Weights & Biases (docs).
\n", + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "artifact = run.use_artifact('prediction_results:latest')\n", @@ -82,17 +677,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 415, "id": "101571e2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'./artifacts/prediction_results:v1997'" + ] + }, + "execution_count": 415, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "artifact_dir" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 416, "id": "03e14929", "metadata": {}, "outputs": [], @@ -102,24 +708,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 453, "id": "eaf30e01", "metadata": {}, "outputs": [], "source": [ - "constants = [0.01, 0.05, 0.1, 1, 5]\n", + "constants = [0.01, 0.05]\n", "policies = [\"lifo\"]\n", "key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", - "metric = 'top10'" + "metric = 'top5'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 454, "id": "96209574", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.01-100.json\n", + 
"/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.05-100.json\n" + ] + }, + { + "data": { + "text/plain": [ + "{'plan-random_lifo-always_process': [0.41722204591135087, 0.41605839416058393],\n", + " 'plan-weighted_random_lifo-always_process': [0.508879315080318,\n", + " 0.44467986596668],\n", + " 'plan-round_robin_lifo-always_process': [0.5089891784573612,\n", + " 0.37384957156458265],\n", + " 'plan-weighted_round_robin_lifo-always_process': [0.5088165360077218,\n", + " 0.46732741640574116]}" + ] + }, + "execution_count": 454, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "all_results = {}\n", "for policy in policies: \n", @@ -213,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 421, "id": "6d536763", "metadata": {}, "outputs": [], @@ -224,21 +861,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 455, "id": "1e07c3e9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn\n", "resources = constants \n", "df = pd.DataFrame({\n", - " 'Factor': resources, \n", + " 'Model Runtime Const': resources, \n", " **all_results\n", "})\n", - "fig, ax1 = plt.subplots(figsize=(10, 5))\n", - "tidy = df.melt(id_vars='Factor').rename(columns=str.title)\n", - "seaborn.barplot(x='Factor', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "fig, ax1 = plt.subplots(figsize=(5, 5),)\n", + "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", + "seaborn.barplot(x='Model Runtime Const', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "ax1.set(xlabel='Model Runtime Const', ylabel=f'{metric} Accuracy')\n", + "ax1.legend_.remove()\n", + "plt.legend(loc='lower center')\n", "seaborn.despine(fig)" ] }, @@ -366,7 +1017,7 @@ { "cell_type": "code", "execution_count": null, - "id": "592a8b2e", + "id": "a958fee2", "metadata": {}, "outputs": [], "source": [ @@ -376,7 +1027,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98907ae3", + "id": "87453791", "metadata": {}, "outputs": [], "source": [ diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index 0334ade..a0689fe 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,6 +1,6 @@ set -xe -for key_policy in "weighted_random" "weighted_round_robin" +for key_policy in "random" "round_robin" do for event_policy in "lifo" do diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py index 048a97b..a556403 100644 --- a/wikipedia/wiki_eval.py +++ b/wikipedia/wiki_eval.py @@ -75,7 +75,9 @@ parser.add_argument("--wandb", default=False, action="store_true") args = parser.parse_args() +exp_id = os.path.basename(args.offline_plan_path).replace(".json", "") run.config.update(vars(args)) +run.config.update({"plan": exp_id}) def sents_to_passages(sents, num_sent_in_pass=10): passages = [] @@ 
-303,18 +305,15 @@ def generate_question_data(questions, embed_filename, directory): assert len(passage_ctx) == len(passage_texts) assert len(passage_embeddings) == len(passage_texts) + print("staleness", np.array(staleness).mean()) return staleness def main(): + - plan_file = ( - args.offline_plan_path - ) # "wiki-plans/plan-fifo-always_process-1-0.001-60.json" - exp_id = os.path.basename(plan_file).replace(".json", "") - - #embed_filename = offline_eval(plan_file, exp_id, compute_embeddings=args.embed) + embed_filename = offline_eval(args.offline_plan_path, exp_id, compute_embeddings=args.embed) - embed_filename = "embed_versions.pkl" + #embed_filename = "embed_versions.pkl" generate_question_data_all(exp_id, embed_filename) if args.wandb: import wandb From 16a5dca1c6fb63dc4b2585f6482bcc372e7a3e59 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 13 Oct 2021 21:44:52 -0700 Subject: [PATCH 15/26] semi wokring simulator - about to implement new policies and more replicas --- stl/offline/config_gen.py | 27 +- stl/offline/evaluation.py | 22 +- stl/offline/extend_data.py | 57 + stl/offline/run_1_simulate_windows.sh | 28 +- stl/offline/run_2_eval_yahoo_keys.sh | 32 +- stl/offline/run_3_eval_oracle.sh | 16 +- stl/offline/run_4_generate_plan.sh | 11 +- stl/offline/run_5_simulate_lp_plan.sh | 52 +- stl/offline/simulation.py | 10 +- wikipedia/benchmark_bert.py | 2 +- wikipedia/notebooks/Wikipedia Plots.ipynb | 1153 ++++++++++++++++++--- wikipedia/preprocessing/log_data.py | 3 + wikipedia/preprocessing/wiki_api_data.py | 2 + wikipedia/run_1_generate_plan.sh | 6 +- wikipedia/run_2_prepare_data.sh | 8 +- wikipedia/run_3_run_predictions.sh | 15 +- wikipedia/run_5_pipeline_predict.sh | 10 +- wikipedia/simulate.py | 20 +- wikipedia/wiki_eval.py | 10 +- 19 files changed, 1237 insertions(+), 247 deletions(-) create mode 100644 stl/offline/extend_data.py diff --git a/stl/offline/config_gen.py b/stl/offline/config_gen.py index 6e52b11..58ebca5 100644 --- 
a/stl/offline/config_gen.py +++ b/stl/offline/config_gen.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd from absl import app, flags -from ortools.linear_solver import pywraplp from sktime.performance_metrics.forecasting import mean_squared_scaled_error FLAGS = flags.FLAGS @@ -23,8 +22,30 @@ required=True, ) +# TODO(simon): add flags for lp solver constraint +flags.DEFINE_integer( + "max_n_fits", + default=None, + help="Max fits for LP", + required=False, +) + +flags.DEFINE_integer( + "max_loss", + default=None, + help="Max loss for LP", + required=False, +) + +flags.DEFINE_string( + "objective", + default="min_loss", + help="LP optimization goal", + required=False, +) -def run_lp(df: pd.DataFrame, max_n_fits=None, max_loss=None, objective="min_loss"): +def run_lp(df: pd.DataFrame, objective="min_loss"): + from ortools.linear_solver import pywraplp """Run through mixed integer program to generate the best plan. Input: @@ -35,6 +56,8 @@ def run_lp(df: pd.DataFrame, max_n_fits=None, max_loss=None, objective="min_loss Output: plan(Dict[str, int]): a dictionary mapping key -> optimal n_fits such that loss is minimal. 
""" + max_n_fits = FLAGS.max_n_fits + max_loss = FLAGS.max_loss assert all(df.columns == ["key", "n_fits", "loss"]) assert objective in {"min_loss", "min_fits"} diff --git a/stl/offline/evaluation.py b/stl/offline/evaluation.py index a32aab5..f3a47ac 100644 --- a/stl/offline/evaluation.py +++ b/stl/offline/evaluation.py @@ -1,4 +1,5 @@ import argparse +import time from multiprocessing import Pool import json import os @@ -13,7 +14,9 @@ def train(data, window_size, seasonality): window = data[-window_size:] values = [r["value"] for r in window] + st = time.time() stl_result = STL(values, period=seasonality, robust=True).fit() + print(time.time() - st) timestamp = data[-1]["timestamp"] return { "timestamp": timestamp, @@ -42,9 +45,14 @@ def predict(event, model): def offline_eval(yahoo_csv_path, plan_json_path, key, output_path): + print(output_path) + # get plan DF for key plan_df = pd.read_json(plan_json_path) - plan_df_key = plan_df[plan_df["key"] == int(key)] + if key is not None: + plan_df_key = plan_df[plan_df["key"] == int(key)] + else: + plan_df_key = plan_df plan_df_key.index = pd.RangeIndex(start=0, stop=len(plan_df_key.index)) # get original data @@ -69,6 +77,8 @@ def offline_eval(yahoo_csv_path, plan_json_path, key, output_path): #print("fit time", time.time() - st) offline_stl[row.processing_time] = trained + print(offline_stl.keys()) + # Assign the trained model with every events in the source file. def find_freshest_model_version(event_time, model_versions): @@ -84,6 +94,7 @@ def find_freshest_model_version(event_time, model_versions): # Run prediction! 
predicted = [] + print("running prediction") for _, row in df.iterrows(): model_version = row["model_version"] if np.isnan(model_version): @@ -108,8 +119,8 @@ def find_freshest_model_version(event_time, model_versions): add_df = pd.DataFrame(predicted) for new_col in add_df.columns: df[new_col] = add_df[new_col] - df.to_csv(output_file) - return + print("writing", output_path) + df.to_csv(output_path, index=None) def offline_eval_all(yahoo_path, plan_json_path, output_path, param_path): @@ -152,9 +163,8 @@ def run_exp(csv_path, plan_path, output_path, run_policy=False, run_oracle=False # Headers # processing_time window_start_seq_id window_end_seq_id key - plan_df = pd.read_json(plan_path) - offline_eval(csv_path, plan_df, output_path) - df.to_csv(output_path, index=None) + #plan_df = pd.read_json(plan_path) + offline_eval(csv_path, plan_path, None, output_path) def _ensure_dir(path): diff --git a/stl/offline/extend_data.py b/stl/offline/extend_data.py new file mode 100644 index 0000000..98ce1bc --- /dev/null +++ b/stl/offline/extend_data.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +import random +import statistics +import glob +import os + +max_length = 1680 # double length +noise = 2 +max_seasonality = 24*7 +# over_sampling_rate = 1 +path = "yahoo_train_data/" +output_path = "yahoo_eval_data/" +input_path = "yahoo_train_data/*" +files = glob.glob(input_path) +print(files) +for filename in files: + df = pd.read_csv(filename) + + max_outlier_value, min_outlier_value = max(df['noise']), min(df['noise']) + mean, stddev = statistics.mean(df['noise']), statistics.stdev(df['noise']) + + initial_trend = df['trend'][0] + last_trend = df['trend'].iloc[-1] + trend_subtracted_series = df['trend'] - initial_trend + # trend_subtracted_series = np.repeat(trend_subtracted_series, over_sampling_rate) + + seasonality = df['seasonality1'] + df['seasonality2'] + df['seasonality3'] + # seasonality = np.repeat(seasonality, over_sampling_rate) + + repeat_length = 
(len(trend_subtracted_series) // max_seasonality) * max_seasonality + + count = 0 + generated_trend = [last_trend] * max_length + generated_noise = [0] * max_length + generated_outlier = [0] * max_length + generated_seasonality = [0] * max_length + + for i in range(max_length): + if count >= repeat_length: + count = 0 + last_trend = generated_trend[i-1] + generated_trend[i] = last_trend + trend_subtracted_series[count] + generated_seasonality[i] = seasonality[count] + generated_noise[i] = random.gauss(mean, stddev) + generated_outlier[i] = 0 + if random.randint(0, 100) > 100 - noise: + if random.randint(0, 100) > 50: + generated_outlier[i] = max_outlier_value * random.randint(70,100) // 100 + else: + generated_outlier[i] = min_outlier_value * random.randint(70,100) // 100 + count += 1 + + new_df = pd.DataFrame({"trend": generated_trend, "noise": generated_noise, "outlier": generated_outlier, "seasonality": generated_seasonality }) + new_df['value'] = new_df['trend'] + new_df['noise'] + new_df['outlier'] + new_df['seasonality'] + print(os.path.basename(filename)) + new_df.to_csv(os.path.join(output_path, os.path.basename(filename))) diff --git a/stl/offline/run_1_simulate_windows.sh b/stl/offline/run_1_simulate_windows.sh index 70dc721..e195c39 100644 --- a/stl/offline/run_1_simulate_windows.sh +++ b/stl/offline/run_1_simulate_windows.sh @@ -1,7 +1,27 @@ -set -xe +set -ex -for slide in 1 6 12 18 24 48 96 168 192 336 672 +data_dir="./yahoo_train_data" +tmp_script=`mktemp` + +for key_prio in "lifo" "fifo" +do +for data in `ls $data_dir/*` do - python simulation.py --model_runtime_s 0 --total_runtime_s 2000 --per_key_records_per_second 1 \ - --window_size 672 --slide_size ${slide} --output_path result/offline_1_slide/plan/slide_${slide}_plan.json + key=`basename $data` + for slide in 6 12 18 24 48 96 168 192 336 672 + do + echo \" python simulation.py --model_runtime_s 1.5 --total_runtime_s 2000 --per_key_records_per_second 1 --key_prio_policy ${key_prio} --window_size 
672 --slide_size ${slide} --output_path offline_1_slide/plan/${key_prio}_slide_${slide}_plan.json --num_mapper_replicas 1\" >> $tmp_script + done +done done + +cat $tmp_script | xargs -n 1 -P 36 bash -l -c + +#set -xe +# +#for replicas in +#for slide in 1 6 12 18 24 48 96 168 192 336 672 +#do +# python simulation.py --model_runtime_s 0 --total_runtime_s 2000 --per_key_records_per_second 1 \ +# --window_size 672 --slide_size ${slide} --output_path result/offline_1_slide/plan/slide_${slide}_plan.json +#done diff --git a/stl/offline/run_2_eval_yahoo_keys.sh b/stl/offline/run_2_eval_yahoo_keys.sh index 444da55..c5f106d 100644 --- a/stl/offline/run_2_eval_yahoo_keys.sh +++ b/stl/offline/run_2_eval_yahoo_keys.sh @@ -1,14 +1,36 @@ set -ex -data_dir="/data/wooders/stl/yahoo" +data_dir="./yahoo_train_data" -for data in `ls $data_dir/A4/*` +tmp_script=`mktemp` +for key_prio in "lifo" "fifo" +do +for data in `ls $data_dir/*` do key=`basename $data` for slide in 6 12 18 24 48 96 168 192 336 672 do - python evaluation.py --offline-yahoo-csv-path $data \ - --offline-plan-path ./result/offline_1_slide/plan/slide_${slide}_plan.json \ - --output-path ./result/offline_1_slide/plan_eval/slide_${slide}_key_${key} + echo \" python evaluation.py --offline-yahoo-csv-path $data \ + --offline-plan-path ./offline_1_slide/plan/${key_prio}_slide_${slide}_plan.json \ + --output-path ./offline_1_slide/single_key/${key_prio}_slide_${slide}_key_${key} \" >> $tmp_script done done +done + +cat $tmp_script | xargs -n 1 -P 36 bash -l -c + + +#set -ex +# +#data_dir="/data/wooders/stl/yahoo" +# +#for data in `ls $data_dir/A4/*` +#do +# key=`basename $data` +# for slide in 6 12 18 24 48 96 168 192 336 672 +# do +# python evaluation.py --offline-yahoo-csv-path $data \ +# --offline-plan-path ./result/offline_1_slide/plan/slide_${slide}_plan.json \ +# --output-path ./result/offline_1_slide/plan_eval/slide_${slide}_key_${key} +# done +#done diff --git a/stl/offline/run_3_eval_oracle.sh 
b/stl/offline/run_3_eval_oracle.sh index 9262e2a..5fdbef5 100644 --- a/stl/offline/run_3_eval_oracle.sh +++ b/stl/offline/run_3_eval_oracle.sh @@ -1,14 +1,18 @@ set -ex -data_dir="/home/ubuntu/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/" +#data_dir="/home/ubuntu/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/" +data_dir="./yahoo_eval_data" +output_path="./oracle" tmp_script=`mktemp` -for data in `ls $data_dir/A4Benchmark-TS*` +#for data in `ls $data_dir/A4Benchmark-TS*` +for data in `ls $data_dir/*` do key=`basename $data` - echo python evaluation.py --offline-yahoo-csv-path $data \ - --offline-run-oracle true \ - --output-path ./result/offline_1_slide/plan_eval/oracle_key_${key} >> $tmp_script + echo \" python evaluation.py --offline-yahoo-csv-path $data \ + --offline-run-oracle \ + --output-path ${output_path}/${key} \" >> $tmp_script done -cat $tmp_script | parallel --bar bash -l -c \ No newline at end of file +cat $tmp_script | xargs -n 1 -P 36 bash -l -c +#cat $tmp_script | parallel --bar bash -l -c diff --git a/stl/offline/run_4_generate_plan.sh b/stl/offline/run_4_generate_plan.sh index 442cf98..0769b1f 100644 --- a/stl/offline/run_4_generate_plan.sh +++ b/stl/offline/run_4_generate_plan.sh @@ -3,6 +3,13 @@ set -ex # TODO(simon): use a workflow engine for step tracking # e.g. 
https://dagster.io/ +#python config_gen.py \ +# --csv_dir "./result/offline_1_slide/plan_eval" \ +# --output_path "./result/offline_1_slide/min_loss_plan.json" + +MAX_FITS=8400 python config_gen.py \ - --csv_dir "./result/offline_1_slide/plan_eval" \ - --output_path "./result/offline_1_slide/min_loss_plan.json" + --csv_dir "./offline_1_slide/plan_eval" \ + --output_path "./offline_1_slide/max_fits_${MAX_FITS}.json" \ + --max_n_fits ${MAX_FITS} + diff --git a/stl/offline/run_5_simulate_lp_plan.sh b/stl/offline/run_5_simulate_lp_plan.sh index bc67ae1..bf9daeb 100644 --- a/stl/offline/run_5_simulate_lp_plan.sh +++ b/stl/offline/run_5_simulate_lp_plan.sh @@ -1,26 +1,36 @@ set -ex -PARAM_PATH=result/offline_1_slide/min_loss_plan.json -PLAN_PATH=result/offline_1_slide/lp_eval/varying_slide_size_trace.json -SOURCE_PATH=/data/wooders/stl/yahoo/A4 -OUTPUT_CSV_PATH=result/offline_1_slide/ +PARAM_DIR="offline_1_slide" +PLAN_DIR="offline_1_slide" +OUTPUT_CSV_PATH="offline_1_slide/lp_plan_eval" +TRAIN_PATH="./yahoo_train_data" +EVAL_PATH="./yahoo_eval_data" -# re-run simulation with lp-generated weights -python simulation.py --model_runtime_s 0.02 --total_runtime_s 150 \ - --per_key_records_per_second 100 \ - --num_mapper_replicas 2 \ - --window_size 672 --slide_size 0 \ - --per_key_slide_size_plan $PARAM_PATH \ - --output_path $PLAN_PATH \ - --source_data_path $SOURCE_PATH -# run evaluation with simulation results -python evaluation.py --offline-yahoo-csv-path $SOURCE_PATH \ - --offline-plan-path $PLAN_PATH \ - --output-path $OUTPUT_CSV_PATH \ - --param-path $PARAM_PATH \ - --run-policy - -# get final results -python evaluate_loss.py --offline-yahoo-csv-path $SOURCE_PATH --predicted-csv-path $OUTPUT_CSV_PATH --output-path +for replicas in 8 +do +for plan in "max_fits_1100" "max_fits_2100" "max_fits_4200" "max_fits_8400" +do + mkdir -p ${PLAN_DIR}/replica_${replicas} + # re-run simulation with lp-generated weights + python simulation.py --model_runtime_s 1.5 --total_runtime_s 
2000 \ + --per_key_records_per_second 1 \ + --num_mapper_replicas ${replicas} \ + --window_size 672 --slide_size 0 \ + --per_key_slide_size_plan ${PARAM_DIR}/${plan}.json \ + --output_path ${PLAN_DIR}/replica_${replicas}/plan_${plan}.json \ + --source_data_path ${TRAIN_PATH} + + mkdir -p ${PLAN_DIR}/replica_${replicas}/${plan} + # run evaluation with simulation results + python evaluation.py --offline-yahoo-csv-path $EVAL_PATH \ + --offline-plan-path ${PLAN_DIR}/replica_${replicas}/plan_${plan}.json \ + --output-path ${PLAN_DIR}/replica_${replicas}/${plan} \ + --param-path ${PARAM_DIR}/${plan}.json \ + --run-policy + + # get final results + #python evaluate_loss.py --offline-yahoo-csv-path $SOURCE_PATH --predicted-csv-path $OUTPUT_CSV_PATH --output-path +done +done diff --git a/stl/offline/simulation.py b/stl/offline/simulation.py index 33a16b0..476167d 100644 --- a/stl/offline/simulation.py +++ b/stl/offline/simulation.py @@ -26,7 +26,7 @@ flags.DEFINE_enum( "key_prio_policy", - "fifo", + "lifo", list(prio_policies.keys()), "The prioritization policy for a given key.", ) @@ -63,7 +63,7 @@ None, "path to generated per key's window slide size config.", ) -flags.DEFINE_integer("num_mapper_replicas", 10, "number of replicas for mapper") +flags.DEFINE_integer("num_mapper_replicas", 1, "number of replicas for mapper") def _get_config() -> Dict: @@ -79,7 +79,9 @@ def main(argv): policy_params = json.load(open(FLAGS.per_key_slide_size_plan)) keys = policy_params.keys() else: - keys = [i in range(FLAGS.num_keys)] + keys = [i+1 for i in range(FLAGS.num_keys)] + + print("keys", keys) source_to_window_queue = simpy.Store(env) windows_to_mapper_queue = { @@ -112,7 +114,7 @@ def main(argv): source_queues=windows_to_mapper_queue, model_run_time_s=FLAGS.model_runtime_s, # TODO(simon): customize this once we want different key selection policy - key_selection_policy_cls=RoundRobinLoadBalancer(), + key_selection_policy_cls=RoundRobinLoadBalancer(FLAGS.num_mapper_replicas), 
num_replicas=FLAGS.num_mapper_replicas, ) env.run(until=FLAGS.total_runtime_s) diff --git a/wikipedia/benchmark_bert.py b/wikipedia/benchmark_bert.py index fb9f602..4a636c0 100644 --- a/wikipedia/benchmark_bert.py +++ b/wikipedia/benchmark_bert.py @@ -1,7 +1,7 @@ from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments from pprint import pprint -args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[1], sequence_lengths=[100]) +args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[1], sequence_lengths=[100], no_multi_process=True) benchmark = PyTorchBenchmark(args) results = benchmark.run() pprint(results) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 0ef7d3e..9213239 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 410, + "execution_count": 184, "id": "e0030940", "metadata": {}, "outputs": [], @@ -24,86 +24,15 @@ }, { "cell_type": "code", - "execution_count": 411, + "execution_count": 2, "id": "016e13bb", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Finishing last run (ID:3mmrfkbb) before initializing another..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Waiting for W&B process to finish, PID 74125... (success)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(Label(value=' 0.41MB of 0.41MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", - "
Synced woven-sea-93: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/3mmrfkbb
\n", - "Find logs at: ./wandb/run-20211012_125219-3mmrfkbb/logs
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully finished last run (ID:3mmrfkbb). Initializing new run:
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mucb-ralf\u001b[0m (use `wandb login --relogin` to force relogin)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" ] @@ -112,7 +41,7 @@ "data": { "text/html": [ "\n", - " Syncing run trim-microwave-115 to Weights & Biases (docs).
\n", + " Syncing run royal-planet-167 to Weights & Biases (docs).
\n", "\n", " " ], @@ -139,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 412, + "execution_count": 3, "id": "7690f6d7", "metadata": {}, "outputs": [ @@ -513,7 +442,7 @@ "[216 rows x 36 columns]" ] }, - "execution_count": 412, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -525,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 413, + "execution_count": 4, "id": "5b5d1edc", "metadata": {}, "outputs": [ @@ -535,18 +464,20 @@ "" ] }, - "execution_count": 413, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ - "
" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -569,14 +500,14 @@ }, { "cell_type": "code", - "execution_count": 414, + "execution_count": 24, "id": "39b1975e", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Finishing last run (ID:1t9orwj8) before initializing another..." + "Finishing last run (ID:2s3jbe1y) before initializing another..." ], "text/plain": [ "" @@ -588,7 +519,7 @@ { "data": { "text/html": [ - "
Waiting for W&B process to finish, PID 14934... (success)." + "
Waiting for W&B process to finish, PID 34365... (success)." ], "text/plain": [ "" @@ -605,7 +536,7 @@ "version_minor": 0 }, "text/plain": [ - "VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + "VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" ] }, "metadata": {}, @@ -622,9 +553,9 @@ "
\n", "
\n", "
\n", - "Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", - "
Synced trim-microwave-115: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/1t9orwj8
\n", - "Find logs at: ./wandb/run-20211012_160227-1t9orwj8/logs
\n" + "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced divine-shadow-168: https://wandb.ai/ucb-ralf/wiki-workload%20/runs/2s3jbe1y
\n", + "Find logs at: ./wandb/run-20211012_194624-2s3jbe1y/logs
\n" ], "text/plain": [ "" @@ -636,7 +567,7 @@ { "data": { "text/html": [ - "Successfully finished last run (ID:1t9orwj8). Initializing new run:
" + "Successfully finished last run (ID:2s3jbe1y). Initializing new run:
" ], "text/plain": [ "" @@ -657,7 +588,7 @@ "data": { "text/html": [ "\n", - " Syncing run fluent-mountain-116 to Weights & Biases (docs).
\n", + " Syncing run breezy-cloud-170 to Weights & Biases (docs).
\n", "\n", " " ], @@ -677,17 +608,17 @@ }, { "cell_type": "code", - "execution_count": 415, + "execution_count": 25, "id": "101571e2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'./artifacts/prediction_results:v1997'" + "'./artifacts/prediction_results:v3620'" ] }, - "execution_count": 415, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -698,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 416, + "execution_count": 180, "id": "03e14929", "metadata": {}, "outputs": [], @@ -708,21 +639,24 @@ }, { "cell_type": "code", - "execution_count": 453, + "execution_count": 181, "id": "eaf30e01", "metadata": {}, "outputs": [], "source": [ - "constants = [0.01, 0.05]\n", + "#constants = [0.01, 0.05, 1.0, 10.0]\n", + "constants = [0.25]\n", "policies = [\"lifo\"]\n", "key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", + "#key_policies = [\"random\", \"round_robin\"]\n", + "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", - "metric = 'top5'" + "metric = 'top10'" ] }, { "cell_type": "code", - "execution_count": 454, + "execution_count": 182, "id": "96209574", "metadata": {}, "outputs": [ @@ -730,29 +664,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.01-100.json\n", - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.05-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.01-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.05-100.json\n", - "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.01-100.json\n", - "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.05-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.01-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.05-100.json\n" + 
"/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.25-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.25-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.25-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.25-100.json\n" ] }, { "data": { "text/plain": [ - "{'plan-random_lifo-always_process': [0.41722204591135087, 0.41605839416058393],\n", - " 'plan-weighted_random_lifo-always_process': [0.508879315080318,\n", - " 0.44467986596668],\n", - " 'plan-round_robin_lifo-always_process': [0.5089891784573612,\n", - " 0.37384957156458265],\n", - " 'plan-weighted_round_robin_lifo-always_process': [0.5088165360077218,\n", - " 0.46732741640574116]}" + "{'plan-random_lifo-always_process': [0.7078732804419647],\n", + " 'plan-weighted_random_lifo-always_process': [0.6361795795371613],\n", + " 'plan-round_robin_lifo-always_process': [0.6167886934890254],\n", + " 'plan-weighted_round_robin_lifo-always_process': [0.5987554048857813]}" ] }, - "execution_count": 454, + "execution_count": 182, "metadata": {}, "output_type": "execute_result" } @@ -767,40 +694,30 @@ " print(f'{d}/{name}-{constant}-100.json')\n", " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", " results = json.load(results_file)\n", - " scores.append(results[metric])\n", + " scores.append(1-results[metric])\n", " all_results[name] = scores\n", "all_results" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d3b31501", - "metadata": {}, - "outputs": [], - "source": [ - "all_results = {}\n", - "for policy in policies: \n", - " for key_policy in key_policies: \n", - " scores = []\n", - " name = f\"plan-{key_policy}_{policy}-always_process\"\n", - " for constant in constants: \n", - " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", - " print(f'{d}/{name}-{constant}-100.json')\n", - " results = json.load(results_file)\n", - " scores.append(results[metric])\n", - " 
all_results[name] = scores\n", - "all_results" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 106, "id": "b479a2bc", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['plan-random_lifo-always_process', 'plan-weighted_random_lifo-always_process', 'plan-round_robin_lifo-always_process', 'plan-weighted_round_robin_lifo-always_process', 'plan-random_fifo-always_process', 'plan-weighted_random_fifo-always_process', 'plan-round_robin_fifo-always_process', 'plan-weighted_round_robin_fifo-always_process'])" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "all_results.keys()" ] @@ -850,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 421, + "execution_count": 12, "id": "6d536763", "metadata": {}, "outputs": [], @@ -861,15 +778,49 @@ }, { "cell_type": "code", - "execution_count": 455, + "execution_count": 183, + "id": "89abf373", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "resources = [int(10 / c) for c in constants] \n", + "df = pd.DataFrame({\n", + " 'Model Runtime Const': resources, \n", + " **all_results\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", + "seaborn.barplot(x='Model Runtime Const', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "ax1.set(xlabel='Resources', ylabel=f'{metric} Error')\n", + "ax1.legend_.remove()\n", + "plt.legend(loc='lower left')\n", + "seaborn.despine(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, "id": "1e07c3e9", "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmgAAAFCCAYAAABFMCGEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABWiklEQVR4nO3dd1jV5f/H8SccQIYTE8WtqIiigBvLkVtzVY40VyaZWpgjV+5V4kTD3BP3SsXZ17Lxc+bGVblR3HvAgQO/PzCKQESF41Fej+v6XhfnfO7zud/nnPt7fHV/xm0VGxsbi4iIiIhYDOuXXYCIiIiIJKSAJiIiImJhFNBERERELIwCmoiIiIiFeW0CWmxsLJGRkeiaBxEREXnVvTYBzWg0EhoaitFofNmliIiIiLyQ1yagiYiIiLwuFNBERERELIwCmoiIiIiFUUATERERsTAKaCIiIiIWRgFNRERExMIooImIiIhYGAU0EREREQujgCYiIiJiYRTQRERERCyMApqIiIiIhVFAExEREbEwCmiS6mKiol7r/kRERNKazcsuwBIYo0zY2RrM1l9MlBFrWzuz9RdlMmJrMF9/1ra27Ore3Wz9VQoMNFtfr7soUxS2BtvXtj8RkVeFAhpgZ2ug9eDtZutv8fDq/DGug9n6K9Z7Hn229zRbfwHVJ5itr5fBnIHe3P/xYGuw1VgREbEACmgiz8icgX7x8Opm6UdERCyLzkETsWAx0caXXUKa0vmKIiJJ0wyaiAWztrEz++Fwc9L5iiIiSdMMmoiIiIiFUUATERERsTAKaCIiIiIWRgFNRERExMIooImIiIhYGAU0EREREQujgCYiIiJiYRTQRERERCyMApqIiIiIhVFAExEREbEwCmgiIiIiFkYBTURERMTCKKCJiIiIWBgFNBERERELo4AmIiIiYmFszNXRmTNn6NevH7dv3yZr1qyMGTOGggULJmhz48YN+vfvT3h4OFFRUVSqVImBAwdiY2O2MkVEREReOrPNoA0ZMoTWrVuzZcsWWrduzeDBgxO1mTZtGm5ubqxfv57169dz9OhRtm7daq4SRURSlTHK9Fr3JyJpxyxTUzdu3ODYsWPMnTsXgIYNGzJixAhu3ryJs7NzfDsrKysePHhATEwMRqORqKgocubMaY4SRURSnZ2tgda
Dt5utv8XDq5utLxFJW2YJaOHh4eTMmRODwQCAwWDAxcWF8PDwBAGta9eufP7557z11ls8evSIDz/8kLJlyz5TX6Ghoc9c37P2IZZn3759ZutL4+XV9rqPFXO+PxF5Mcn9RljUyV2bN2/G3d2d+fPn8+DBA/z8/Ni8eTP16tVL8T48PT3JkCFDGlYplkihSVLqdR8rr/v7E0kvzHIOmqurK1euXMFkijs/wmQycfXqVVxdXRO0Cw4OpnHjxlhbW5MpUyZq1KjB7t27zVGiiIiIiMUwS0DLnj07Hh4ehISEABASEoKHh0eCw5sAefPm5ZdffgHAaDSyc+dOihYtao4SRURERCyG2a7iHDp0KMHBwdStW5fg4GCGDRsGgJ+fH0eOHAFgwIAB7Nu3j0aNGtG0aVMKFixIixYtzFWiiIiIiEUw2zlobm5urFixItHzM2fOjP87f/788Vd6ioiIiKRXWklARERExMIooImIyHOJMkW91v2JvEwWdZsNERF5ddgabOmzvafZ+guoPsFsfYm8bJpBExEREbEwCmgiIiIiFkYBTURERMTCKKCJiIiIWBgFNBERERELo4AmIiIiYmEU0EREREQsjAKaiIiIiIVRQBMRERGxMApoIiIiIhZGAU1ERETEwiigiYiIiFgYBTQRERERC6OAJiIiImJhFNBERERELIwCmoiIiIiFUUATERERsTAKaCIiIiIWRgFNRERExMIooImIiIhYGAU0EREREQujgCYiIiJiYRTQRERELIAxyvRa9iXPx+ZlFyAiIiJgZ2ug9eDtZulr8fDqZulHnp9m0EREREQsjAKaiIiIpKkoU9Rr3V9a0CFOERERSVO2Blv6bO9ptv4Cqk8wW19pRTNoIiIiIhZGAU1ERETEwiigiYiIiFgYBTQRERERC6OAJiIiImJhFNBERERELIwCmoiIiIiFUUATERERsTAKaCIiIiIWRgFNRERExMIooImIiIhYGAU0EREREQujgCYiIiJiYRTQRERERCyMApqIiLwSYqKiXuv+RP7N5mUXICIikhLWtrbs6t7dbP1VCgw0W18i/6UZNBERERELo4AmIiIiYmEU0EREREQsjNkC2pkzZ2jZsiV169alZcuWnD17Nsl2GzdupFGjRjRs2JBGjRpx/fp1c5UoIiIiYhHMdpHAkCFDaN26NU2aNGHt2rUMHjyYBQsWJGhz5MgRvv32W+bPn0+OHDm4d+8ednZ25ipRRERExCKYZQbtxo0bHDt2jIYNGwLQsGFDjh07xs2bNxO0mzdvHh07diRHjhwAZMqUiQwZMpijRBGRV15MtPFllyAiqcQsM2jh4eHkzJkTg8EAgMFgwMXFhfDwcJydnePbnTp1irx58/Lhhx/y8OFDateuTZcuXbCysjJHmSIirzRrGzv+GNfBbP0V6z3PbH2JpDcWdR80k8nEyZMnmTt3LkajkU6dOpE7d26aNm2a4n2EhoY+c79ly5Z95teIZdm3b5/Z+tJ4ebVprMizeJ3Hy+v83sC87+95Jfe5mCWgubq6cuXKFUwmEwaDAZPJxNWrV3F1dU3QLnfu3NSrVw87Ozvs7OyoWbMmhw8ffqaA5unpqcOi6ZD+IZSU0liRZ/E6j5fX+b3Bq//+zHIOWvbs2fHw8CAkJASAkJAQPDw8EhzehLhz03777TdiY2OJiopi165dFC9e3BwlioiIiFgMs91mY+jQoQQHB1O3bl2Cg4MZNmwYAH5+fhw5cgSAd955h+zZs9OgQQOaNm1KkSJFaNasmblKFBEREbEIZjsHzc3NjRUrViR6fubMmfF/W1tb079/f/r372+uskREREQsjlYSEBEREbEwCmgiIiIiFiZFAS0mJiat6xARERGRx54a0EwmE97e3hiNukO1iIiIiDk8NaAZDAYKFizIrVu3zFGPiIiISLqXoqs4GzVqxKeffkq7du3IlStXgm2+vr5pUpiIiIhIepWigLZkyRIApkyZkuB5Kysrtm3blvp
ViYiIiKRjKQpoP/74Y1rXISIiIiKPpfhGtdHR0Rw4cIArV66QK1cuvL29sbGxqLXWRURERF4LKUpYp06dokuXLkRERODq6kp4eDgZMmRg2rRpuLm5pXWNIiIiIulKigLasGHDaNGiBR9//DFWVlYAzJ49m6FDh7Jw4cI0LVBEREQkvUnRjWpPnDjBRx99FB/OANq3b8+JEyfSrDARERGR9CpFAc3FxYU9e/YkeO7333/HxcUlTYoSERGRtBMTrZvPW7oUHeLs0aMHXbt2pXr16uTOnZtLly6xfft2xo4dm9b1iYiISCqztrHjj3EdzNZfsd7zzNYXQExUFNa2tq90fykKaG+//TZr1qxh48aNXL16laJFi+Lv70+hQoVStRgRERGRF2Vta8uu7t3N1l+lwMBU3+dTA5rJZMLHx4fff/+drl27pnoBIiIiIpKQ1uIUERERsTBai1NERETEwmgtThEREREL89SAFhMTw6hRoyhbtix2dnbmqElEREQkXXvqOWjW1tZ07dpV4UxERETETFJ0o9ry5ctz8ODBNC5FRERERCCF56Dlzp0bPz8/atasSa5cuRIs+dTdjPcZEREREUkPUhTQIiMjqVWrFgBXrlxJ04JERERE0rsUBbSvv/46resQERERkceSPQdt06ZNCR6fPn06weN58+alekEiIiIi6V2yAe2rr75K8PiDDz5I8Hjy5MmpX5GIiIhIOpdsQIuNjX2mxyIiIiLy4pINaP++WjMlj0VERETkxT31IoHY2Nj4/yX1WERERERSV7IB7eHDh5QoUSL+cWxsbPzj2NhYzaCJiIiIpIFkA5oWQhcRERExv2QDWp48ecxVh4iIiIg8lqK1OEVERETEfBTQRERERCyMApqIiIiIhVFAExEREbEwT70P2rJly1izZg1//vknDx8+xNHRkaJFi/Lee+/RokULc9QoIiIikq4kG9DGjh3L9u3b+eijjyhevDiZMmXi/v37HD9+nHnz5nHhwgV69eplrlpFRERE0oVkA9qqVatYt24dLi4uCZ4vWbIkVapUoXHjxgpoIiIiIqnsmRZLFxEREZG0l+wMWrNmzWjfvj0dO3bE3d09/hDniRMnmDdvHs2bNzdXnSIiIiLpRrIB7csvvyRfvnysWrWKv/76K/4igSJFitC2bVs++OADc9UpIiIikm489SrODz74QEFMRERExIxe6D5oly5dSq06REREROSx5w5oRqORmjVrpmYtIiIiIsJTDnHu3bv3iduMRmOqFyMiIiIiTwlobdu2JUeOHFhba0UoEREREXNJNqDlzp2bcePGUaZMmUTbIiMj8fb2Tqu6RERERNKtZKfGPD09CQ0NTXKblZUVrq6uaVKUiIiISHqWbEAbP348rVq1SnKbnZ0dP/74Y4o7OnPmDC1btqRu3bq0bNmSs2fPPrHt6dOn8fLyYsyYMSnev4iIiMjrItmAZmtri62tbap0NGTIEFq3bs2WLVto3bo1gwcPTrKdyWRiyJAh1KpVK1X6FREREXnVpOjsf6PRSGBgIHXq1MHb25s6deowadIkIiMjU9TJjRs3OHbsGA0bNgSgYcOGHDt2jJs3byZqO2PGDKpXr07BggVT/i5EREREXiMpCmhDhw5l165dfPXVV6xcuZKvvvqKvXv3MnTo0BR1Eh4eTs6cOTEYDAAYDAZcXFwIDw9P0O7EiRP89ttvdOjQ4ZnehIiIiMjr5KlLPQFs27aNH374gcyZMwNQpEgRvLy8qFOnTqoVEhUVxaBBg/j666/jg9zzeNJFDckpW7bsc/cnlmHfvn1m60vj5dWmsSLPQuNFUup5xkpy33mKAtobb7zBo0eP4gMaxN1mI0eOHCkqwNXVlStXrmAymTAYDJhMJq5evZrgKtBr165x/vx5PvnkEwDu3r1LbGws9+/fZ8SIESnqB+KuPM2QIUOK28vrQT9sklIaK/IsNF4kpVJ7rKQooDVp0oROnTrRtm1bcubMyeXLl1m0aBF
NmjRh586d8e18fX2TfH327Nnx8PAgJCSEJk2aEBISgoeHB87OzvFtcufOze7du+MfT5kyhYcPH9K3b9/nfW8iIiIir6QUBbSlS5cCMG3atETP/73NysqKbdu2PXEfQ4cOpV+/fkydOpXMmTPH30LDz88Pf39/SpUq9VxvQEREROR1k6KA9iz3O3sSNzc3VqxYkej5mTNnJtn+888/f+E+RURERF5FKQpoANHR0Rw4cIArV66QK1cuvL29sbFJ8ctFREREJIVSlLBOnTpFly5diIiIwNXVlfDwcDJkyMC0adNwc3NL6xpFRERE0pUUBbRhw4bRokULPv74Y6ysrACYPXs2Q4cOZeHChWlaoIiIiEh6k6Ib1Z44cYKPPvooPpwBtG/fnhMnTqRZYSIiIiLpVYoCmouLC3v27Enw3O+//46Li0uaFCUiIiKSnqXoEGePHj3o2rUr1atXJ3fu3Fy6dInt27czduzYtK5PREREJN1J0QxazZo1Wb16NUWLFuXBgwcULVqU1atXU6tWrbSuT0RERCTdSdEM2uzZs/n444/p2rVrgufnzp3LRx99lCaFiYiIiKRXKZpBCwoKSvL57777LlWLEREREZGnzKD9vc5mTEwMu3btIjY2Nn5bWFgYTk5OaVudiIiISDqUbED76quvAIiMjGTAgAHxz1tZWZEjRw4GDhyYttWJiIiIpEPJBrS/1+Ds06cPAQEBZilIREREJL1L0TloCmciIiIi5pOigCYiIiIi5qOAJiIiImJhFNBERERELIwCmoiIiIiFUUATERERsTAKaCIiIiIWRgFNRERExMIooImIiIhYGAU0EREREQujgCYiIiJiYRTQRERERCyMApqIiIiIhVFAExEREbEwCmgiIiIiFkYBTURERMTCKKCJiIiIWBgFNBERERELo4AmIiIiYmEU0EREREQsjAKaiIiIiIVRQBMRERGxMApoIiIiIhZGAU1ERETEwiigiYiIiFgYBTQRERERC2Pzsgswh6ioKMLCwoiIiHhimy71s5itnuPHjxNV5WOz9tc0y/tm7c++RQuz9mdu5hovGiup35+5Pc9YiY2Fa3dNbPz9AY+MsWlQlYhYunQR0MLCwsiUKRMFCxbEysoqyTanL94zWz2F82Qi4vIZs/Vnn6sQYfcumK2/vJnycf/8ebP1lzF/frP19TdzjReNldT1qoyV2NhYsj+4A1xh1Y77qV+UiFi8dHGIMyIiguzZsz8xnImIWBIrKyvsnbKQI7PhZZciIi9JughogMKZiLxSrKys0M+WSPqVbgKaiIiIyKsiXZyD9l/GKBN2tgkPHRTOk+mF9/soMprw649eeD8iIiKSvqXLgGZna6D14O2pvt/Fw6u/8D68qtVj56Y1ODo6vHhBaaznJz1p3rYFvlUqpWk/58PC6DdyJABtmjfn94MHaVSnDj6lSnH77l06ffABjx49olGjRnTq1ClV+qxRowbTpk2jWLFiqbI/ERGRZ5EuA1p6Zoo2YbB5tU48/vG33yhdogT9/P0BaFCzZvy2Pfv3kzlzZpYuXfqyyrM4JpMJg+HV+o5FRCQhBbSXwN3dnU87fMjOvfu5ffcu/n4fUavaW4najZ86k30HDxMVHU3WLFkY1rcHuXPl5GL4ZVp39qdZowb8umsvEZERDO3TgzKlPRPt42L4ZVo3+YDGzRuxf89+atWvRZ78eZjz3VyiIo2YTCZad/yQGnXfBuJmxdxLunPs8DFuXL9BtVrV8PvcD4Czp88xdthYoqOjKVC4AEaj8Z9+Llxk4uhJ3L51Gwc7B7q0aUPl8uUBKFu7Nl07dGD7jh3cuXuXgT16sPvAAXbu3Uu0ycSYgQMpVKBAkp/Vxm3bWLx6NTGxsRw6epSAIUMYMX48bZo3x8HenkkzZvAwIoImTZowaNAgChYsyJAhQzj/+NYNH3/8MU2bNk1y3+vXr2fBggVERUUB0LdvX3x9fRO0OX36NJ9//jkbNmwgOjqaihUr0qV
LF2rUb8kv239g52/b6TtwFKuWB/PzT1sxmaKxs8vAZ1/0w62IOyuWzufalct07d4XgFs3b9DVrxVzF63jwL5dLJjzHdbWBkymaLr696G0d7kka9174BABU6ZRvGgR/jh1GoPBwIj+vXArWIC9Bw4x9tvp+JTy5OjJP/Br24rs2bIyZvJ3PIqIwMHenr7+XfD0cAfg5x27mTYvmOjoaKysrBk5oBfF3Apz+NgJpnw5mFt3bwHQ4dP2VHqrErdu3mL0wNHcuhH3fJkKZejaqytHDx1lcsAUYmNjiY6Opk3HD6lRr0aS9YuIyLNRQHtJrKysWTB1ImfPX6Bdt574lPYke7asCdp0bN2CXl3jwtHqkE1Mmj6HgCH9Abh95y6lS3rwuV8HNvzwI4HT5zA/aEKSfd2+fZv8hfLTvnN7AO7dvUfgrEkYDAZu3rhFl7ZdKO9bjkyZ487Du3r5KhNnTuThw4e0bdKO+k3qkzd/Xr4Z/A3vfvAudRvW4diRY3T/+Iv4PkYP/Jp33n2HBk3rE3ElktatWrFq9myyZY17T5kyZmRhUBA//PwzPYcM4euBA/n844+Zv2wZs5csYWS/fknW3qBmTS5cvMjDR4/o0blzgm3lvb35tH17dh05wuTJkwH44osvKFq0KEFBQVy9epX33nuPEiVKJHmo8q233qJhw4ZYWVlx+vRpOnTowC+//JKgTeHChbl//z5Xr17l4sWLFC1alJ07d1KjfksO7t+Dd5m4EFqzzju836INAAf27WbKxK+ZFDSPeu+8S+cOzfnok89xcHBk04Y1VK9RF3t7exbOnU637v3wLO2DyWQiIiL58xf/OHWGvv5dKOddmnWbf2Dg6HEsmTEFgD9Pn+WrHp/R/4uuREVF0bB1R4b17UGlcmXYve8AvQaPJGTxHC5evsKwsZOYO2UcBfLmwWg0EhUdzd179xk5fgqz5s7D6BDJjes36NquG7OXzWLbpm245MrJ2Klj48cPwNL5S2nW+n1qv1Ob2NhYHtx/kGz9IiKScgpoL8m779QFoGD+fHgULcKRY8ep/mbC2Zvfdu9l2fchPHz0CJPJlGCbo4MD1SpXBKB0ieKMnzrziX1lyJCB6rWrxz++fesOY4eP4+L5ixhsDNy7c48L5y5QolQJAKrWrIa1tTUZM2Ykf6H8XAq7RDbnbJw9dZbaDWoBUKJUCQoVKQTAwwcPOfXHKeo1jntPRYoUwd3NjSPHj1P18YxU7epx/RcvWhQrKyuqVIyr3aNoUX787bdn/vyeZOfOnfR7HPZcXFyoVq0au3fvTjKgXbhwgV69enHlyhVsbGy4fv06165dI0eOHAnaVaxYkZ07dxIWFkbLli2ZNWsWUVFRHNi3hxatOgDw1x/HWbZoLvfu3cHK2pqLF+Jm8DJlykzFylXZtnUj9Rs2ZfOGNYweOxUAL59yzPxuIlWq1aJcxcoULFQk2feWP09uynmXBqBhnZoMHzeZ+w/iQlH+vLnx8oz7/s6eD8PW1oZK5crE1V/WB1tbG86eD2Pf4SO8VbE8BfLmAcDOzg47Ozt+3bWHS5cv4+fnR1RM3IyilZUVFy9cxKNUCVYuXsX0wOmULuNFed+4WT7vct4snreEK+FXKFupLB6eHs/wTYmISHIU0CxALLFAwhseXbp8hXFBM1g0fTJ5XXNxMPQY/Ud8E7/dzs42/m9ra+v4ADdz4RJ+2P4rAL27dSaPa04cHBwS3Acu8JtAfKv6MmzsUKysrGj3XnuMkf8crrTLYBf/t8HaGpMpJu7BE+7JFBv7hKVo/tVnBju7+P3Z2v6rdoMhUfh8Uf+9552VlRW3bt2iQ4cOABQqVIhJkybRs2dP+vXrR61atYiJicHLy4vIyMhE+/P19WXXrl2EhYUxduxY9u7dy/YftwCQyzUPUVFRjBral7GTZlKkWHFuXL9Gmxb141/f5L2WjBk1kKzZspEvfyHy5os7nNu5Wy/OnP6LQwf2MnpYP95t9iH1G777XO/
Z0eGfi0piicUqqS/LyoonfVWxsbEULVyIJStWJbmSwPRF09i3ez//2/gDS+ctIXB2IO+3fh/fqr7s272fKQHfUq5SWTp27fhc9YuISEJmC2hnzpyhX79+3L59m6xZszJmzBgKFiyYoE1QUBAbN27EYDBgY2NDjx49qFKlSqrXYowypcoVl//1KDI6xW3XbtrKJ+1acy7sIif/Ok2pEsUTbH/w4CG2Nja84ZyNmJgYVqzdkKL9+rVthV/bVvGPL4ZfTtTm/r375HLNiZWVFb/v2selC5eeul+njE4UcivEts0/UrtBLU6EnuDMX2fit7kVc2NryFbqNa7HqVOn+OP0aUoVL/6UvaY+X19fli1bhr+/P9euXePnn3+mQ4cOZMuWjbVr1yZoe+/ePfLmzQvAypUrE5xT9999jh8/HmdnZ3LlykXlypUZEzCOMmXjZgGNxkhMJhNvuOQEIGTtigSvL1ioCJkzZ2F60AS6+feJfz7s/FkKFS5CocJFePToIX+cPJZsQDt/8RL7D4VSxsuTjf/7iaKFC5LRySlRu0L582GMimLP/kNUKOPFnv2HiI42UTBfHuzsbJm5cAnnwi4mOMTp7VmC82EX2bVrF3lLxs2unTh6AvcS7ly+dJkcOXNQo+7blPIpRbt32xETE8PFC5fIVyAvufPmxsHRga0hW5/29YiISAqZLaANGTKE1q1b06RJE9auXcvgwYNZsGBBgjalS5emY8eOODg4cOLECdq0acNvv/2Gvb19qtby33uggXnX4oyrwZb23Xpy684dBvXyT3T+WVG3QtSuXoX32nfGNWcOynqVZv/hI6nSd6fPOjF5zGSWzF9K4SKFKVy0cIpe13dYX8YOG8vKRSsp5lE0wSGtASP7M3H0JFYuXoWDnQMj+vaNP//MnAYOHMjgwYNp1KgRAL1796Zo0aJJtu3fvz9du3YlZ86cVKhQgaxPqDdXrlw4OTlRtmxZACpVqsS1q5fx8ok71OfklJG2H3Wme5d2uLjkolzFyon2UbdBU+bPCqJ8pX8uBpkz81suXTyPwWCDU8aMfNF7cLLvzb2IG5u2bSfg22lYW1szckDvJNvZ2toyfvjABBcJjBv2Fba2thTIm4fBvbvTZ+hoYmJi4vbTvzdF3QoROHoogUFBXL91neioaFzz5GLkxJEc2neIFcErMdgYiImJ4Yv+X2Btbc2apWs4uO8gtjY22NrZ8tmXnyVbv4iIpJxV7BOPT6WeGzduULduXXbv3o3h8SGtihUrsnXrVpydnZN8TWxsLOXKlWPDhg3kypXrqX1ERkYSGhqKp6cnGTJkSLDt+PHjeHgkf36MOQNa/RrlzHqvMy2AnfqedbxMGjeCvHkL0OyDds/0ur8XS9974BATvpsVf1FAWtFYSX0v8tty8fxffLfpTorbLx5enT/GdXju/p5Vsd7z6LO9p9n6C6g+gV3du5utv0qBgWbr629pcY/OpGispK60GCtmmUELDw8nZ86c8fdmMhgMuLi4EB4e/sSA9v3335M/f/4UhbN/Cw0NTfScjY0NDx48+QozpyQOE8mrJbnvN7U9y3i5cf0afXt2JpvzG3z62ZdpWJWklKWOFbFM+/btM1tff8/Sy6vpecZKct+5RV4ksGfPHgIDA5kzZ84zv/ZJM2iW9EN58uRJIi6fedllWJTfdu8mKInvu1vHjrz1+IrP5FjS9/tv2d/IwawFq1PcfuhXPbh29Ur8Yztba3I6Z2Hy18PSfPYsvbDUsSKWSaFJUiq1x4pZApqrqytXrlyJv8O5yWTi6tWruLq6Jmp74MABvvzyS6ZOnUrhwik7N0pefW9VrJiiIPa6GzpqYoLHfx/iFBGR9MXaHJ1kz54dDw8PQkJCAAgJCcHDwyPR4c3Dhw/To0cPJk+eTMmSJc1RmoiIiIjFMUtAAxg6dCjBwcHUrVuX4OBghg0bBoCfnx9HjsRdnThs2DAiIiIYPHgwTZo0oUmTJpw8edJcJYqIiIhYBLO
dg+bm5saKFSsSPT9z5j93wF+1apW5yhERERGxWBZ5kUBai4k2Ym1jl+C5wnkyvfB+oyIjuXA96ZudioiIiKRUugxo1jZ2aXL/l2K95wEvFtC8qtUz6z3SnubksZOsWryKASMHJNvu8qXLdGnblTXbEl+xeO/+fVZv2ED7li2fq4aGbdowacQIihQq9FyvfxHu7u7s378/za/82/HbdubN+hZbOzv6DxzN1yMGMOHbOUAmDoYeY/i4QGwMNvTu9gkVyni9cH8Xwy/TurM/P69b/uLFi4hIqkuXAU1Szr2E+1PD2dPcu3+f+cuXP3dAe1Z/Xy38KtkUspq2HT6lSvW4xeiDZi6O3xayZRuN69aiQ6vmL6s8ixITE4OVlVWiNVdFRF4nCmgvgbu7O592+JCde/dz++5d/P0+ola1txK1Gz91JvsOHiYqOpqsWbIwrG8PcufKGT/70axRA37dtZeIyAiG9ulBmdKeifYROGMO2XMXoF7Lumz/YTsjB4xixZblZHPORn//Abzf+j3KVSrH7t92s2jOYoxGIza2NnTt2YUSpUpw8PeDTA+cwXcLpwLw/bLvWb10DRkzOVHhzYqsXb42wazZ7KA5HNh1gIf37jGoVy98PD35ZsoU7t+/T6vOnbG3t2duYCDXbtxgbFAQl69eJSIyknpvv03H1q0BOHDkCF9Pnox9hgx4eng8eTH2x1avXs2GDRtwdnbm1KlTjBo1ip07d7JhwwZMJhMZMmRg6NCh8atJuLu706NHD3744Qdu375Nnz59qFu3LgBbt25lwoQJZM2alapVqybo55dffmHChAmYTCYcnLLg33MAufPk4/DB35n27Xjci5fkxPEjGAw2fNl/OIsWzOTs2VPkyJGTQcPGYu+Q9Kzo9KDxhB4+QNiFc4SsW8GYCdOpX6Mcqzf8wqxZy9jy08/Y29uz8X8/sWDqRP46fTbBMk59/bvg6eGe5L6fNIb+bcW6Dfx56gwDenzG4cOHad68OUHzv6V4yeIEfhOIWzE3Gr7XkNEDR3PhXBhRRiO58+Xhy8G9yZQ5E/39B1CvcT2q1Yr7vH798VfWrwohIGgMC2Ys4MctP2GXwQ4rYPz08WTMlDHpz2HBAk6fO8ejR48Iv3qVgvnyMbh3bzI5OTF9wQIuXLrEw0ePCLt0iVkTJvDLrl0sWL4cKysr8ubOzVfdu+OcLRsAc5YsYfOPP2JtbY2DvT2zJ06MW55qzRoWL16MyWQiY8aMDB06lMKFC7N//35GjBhBTEwM0dHRdOnShYYNG7Js2TLmzZuHnZ0dMTExTJo0CTc3t2THo4hIalBAe0msrKxZMHUiZ89foF23nviU9ky0HmfH1i3o1dUPgNUhm5g0fQ4BQ/oDcPvOXUqX9OBzvw5s+OFHAqfPYX7QhET9VCzjzcI1G6nXsi779xzAo5QHB/YepGrNKhw/ehxPb08uhV1i4exgxkz5BqeMTpw9dZb+/v1ZsmFJgn2d+vM0i+ctYcbi6WTNlpWg8VMTbL975y4lSpdgSL8hLJ87lykzZzInMJB+n39Om27dWDJ9enzbIQEBdPrwQ8qULk1UVBSf9ulDCXd3ypQqRf9RoxjZvz/lvLzY+vPPLPv++6d+nvv372ft2rXkf7yUT86cOenYsSMAO3bsYMiQISxf/s/hvIwZM7Jq1Sr27dvHF198Qd26dblx4waDBg1iyZIlFC5cOMEFLDdu3KBPnz4EBwdTpEgRvpu5kIBRA5k0dT4A58+dple/oXTvPZCgwDF81fdzJgbNjQtn/fzZ/uMW6r3TNMnaO3frxam/TvJ+i7ZU9K2SYFunTp04eeQAJdyL0eq9xkRFRdFr8EiG9e1BpXJl2L3vAL0GjyRk8RxsbW0T7Tu5MfS3imV8CF6xBoCdO3dSonQJDuw9QPGSxdm/5wDN28TN3HXr3Y0sWbMAMGfqHJbOX4rf5368+0FTls5fFh/Q1q5Yx7stm3Lv7j2WB69g1daVZLDPwMMHDxP
dRPq/DoSGsmTaNLJny8awceOYFRxMj86d477jw4dZ9N13ZMuShb/OnGHKrFkET51KjuzZmTpvHgFBQXwzcCDrt27ll507mTNpEhmdnLh99y7W1tYcOHKETZs2sWjRIuzs7Pj5558ZMGAAS5cuZebMmbRv356mTZsSGxvLvXtxyzMFBAQQEhKCq6srRqMRk8mUbP0iIqlFAe0lefeduBmbgvnz4VG0CEeOHaf6m74J2vy2ey/Lvg/h4aNHif5hcHRwoFrluBu7li5RnPFTZ5IU71Il6TN8DFFRURw9dJTOX3zCL9t+5Q2XNyjsVgh7e3v27txLeFg4PT75Z500kymGmzduJdjXoX2HqPhmBbI+DpL1GtXlfxv/F7/dwdEB3yqVACjl4cGkfwWyf3v06BG/HzrErdu345978OgRZ86fJ3u2bNhnyEA5r7jzrOpUq8aoiROT3M+/lSlTJj6cQdySX9OnT+fOnTtYWVlx9uzZBO0bNGgQ9/l4e3P16lUiIyM5ePAgJUqUiL9BcsuWLRk3blzcez90iOLFi1OkSBEAatdrzLeBY3j4MG7ZoLz5CuBWJG4Wq0hRd65eCSdHjriZqqLFPLh0MXXWtzx7PgxbWxsqlSsDQMWyPtja2nD2fBhF3RKfo5fcGPpb/ry5iYw0cuXqNXbu3Emnbh8TPHsRNevVJCoqitx5cwOwNeQHtm3eRnRUFBEREeTNnxeA8r7l+W7CNM6dOYeVlRWXwi5R6fE4yFcgH18P+prylctTqUolHJ0ck31/VSpWJPvjWbAm9eoREBQUv+2tChXIliUuIP5+6BBvVqhAjuzZAXj/nXdo9TjI/bprF80aNSLj4/MGs2bODMAvu3Zx4sQJmjePC5yxsbHcvXs37nOsWJEZM2Zw6dIl3nzzTbwej79KlSrRv39/atasSfXq1cmXL1+y9YuIpBYFNAsQSyyQ8HyaS5evMC5oBoumTyavay4Ohh6j/4hv4rfb2f0zW2JtbR3/j+/MhUv4YfuvAPTu1pkKZbxwd3fnxy0/4fyGM97lvJk2aTo5XN7Au7xPXP+xUN63HP2G90tU2/kz5/6pMzY22fN+/j2DY7C2JvoJgSDm8X4WBAVha5NwCP5x6tQT95+cf5/EbzQa6d69O8HBwZQsWZIrV64kOlz590zO3+eqRUdHJ3so9Wnv3c7un5kha2sDtnZ2/3psnWozL7HEYkUSdVhZ8X97fidwetxyWQ1qvU2dt6smO4b+rbyPF7/s2sONGzfwKuvF5DFT2P3bbnzKeQNw+MAR1q9az+Q5gWTNlpVtm7exYfWGx11b0aR5Y9atWAdAw/feif9cv507hdBDoRz4/SBd2nTl6ylf41Y0ZSuExD7e998c/nWIOMnv4ynnpMXGxvL+++/TPYkFlDt06ECNGjXYsWMHI0aM4M0336RHjx58++23HDlyhF27dtGuXTuGDh1KtWrVUlS/iMiLMNuNaiWhtZu2AnAu7CIn/zpNqRLFE2x/8OAhtjY2vOGcjZiYGFas3ZCi/fq1bcXy2VNZPntq/NV+vr6+zJ8+nzIVfLCzsyOHyxtsCdlKmccBrVylsuzd+TtnT52N38+JoycS7du7rBe7/28Pd27fAWBLyNYU1eTk6EhEZGR8YHNydMTH05N5S5fGt7l89SrXb96kYL58RBqN7D98GID//fIL959xcWuj0Uh0dHT8UmKLFy9+yivi+Pj4cOzYsfjZtn/ft8/Hx4fjx49z6nGA/N+WENyKuOPoaN51HQvlz4cxKoo9+w8BsGf/IaKjTRTMl4c3K5SL/+47tGr+TGOoYllv5ixaho9P3Jgo6VWSJfOX4lMhbqbu/r37OGV0InOWzBiNRjav25zg9XUa1uH/ft7B9h9+pkHTuNnJhw8ecvv2bbzKetGhc3sKuhXk7Knkl636bffu+JnV9Vu2xM+k/lcFHx/+b88ert+8CcCajRupWCau1iqVKrFy/XoePHwIwO3Hs2RVK1Vi7dq1XL5
8GYi7mCQ0NBSAM2fOkD9/fj744APatWvHkSNHiI6O5sKFC5QuXZpPPvmEN998k+PHjydbv4hIakmXM2gx0cbHt8RIXVGRkSlua2drS/tuPbl15w6DevknOv+sqFshalevwnvtO+OaMwdlvUqz//CR56rL19eXwMBAfB4HMp/yZQg9dJTinnGhMG/+vPQf0Y9xI8YRGWkkOiqakl4lKV4yYWh0K+ZGy3Yt+fwjf7Jlz0bZCmVwyvj0gJIlc2bq16hBSz8/MmfKxNzAQEb278+E776jhV/c+VFOjo4M7tWLN5ydGT1gQPxFAuW8vcnl4vJM7zdjxoz4+/vTrFkzXF1dE82ePUn27NkZMWIEn376KVmzZqVevXrx25ydnQkICKB3795ER0fj4JSFPgNGPFNdqcHW1pbxwwcmuEhg3LCvkjz/7FnGUIUy3nw1aiy+vnGH2ctU8GHDmg34lPcGoGLlCmzb9D86NPuIHC45KOZRjJP/CvGOTo6U9y1HZKQx/hD4g/sPGNpnGJGRkcTGxFK0eBGqvF3lv10nrMPHh2Hjx3MxPJwCefPS49NPk2znVrAgn338MV379sXKyoo8rq589cUXADSsXZtr16/Twd8fg8GAo4MDsyZMoEzp0nzxxRd06dIFk8lEVFQU9erVw9PTk4ULF7J7925sbW2xs7Nj4MCBxMTE0K9fP+7du4eVlRWurq706tUr2fpFRFKLVezTLpF7RURGRhIaGoqnp2eiE5GPHz8efwXfk5y+eC8ty0ugfo1yZr3XmX2uQoTdS51zoB4+eBh/HtH86fO5GHaJASMSnnSeN1M+7p8/nyr9pUTGf517Zi7mGi/mXiz9eceKKdpEp1Z+9B3aJ1GwT86/x8r0BQt4+OhR/EUBaeFVGysXz//Fd5vupLj94uHV0+Qej09SrPc8+mzv+fSGqSSg+gR2JXGIOq1UCgw0W19/az14u1n60VhJXWkxVtLlDJo8v5nfzuLooaNERUXhmseVnl/1eNklyUu24+cdTBn7LW+9/dYzhTMREXkyBbSX4OTJk2adFUlN3fv6v7S+23Ttmuhk+1IeHoyekPj2IpZo0YKZ7Pj1p0TPjwr4lqzZnF9CRamjcrXKVK5WOUVtb928Rd/P/rkYxdbalhijkbffeovO7dqlVYkiIq8cBTR5ZQRPnfr0Rhbsw3Z+fNjO72WX8VJlc87GjMX/3H7F3IfDRUReFbqKU0RERMTCKKCJiIiIWBgFNBERERELky7PQYsyRWFrSHjfqMJ5Mr3wfiOijFy6mvJ7oYmIiIgkJV0GNFuDbZrcjyWg+gRAAU1ERERejA5xWhivavV4+PDRyy7jhYSFhVHj/fdTbX/TFyxg4hMWXl+5fj3z5s177n3v37+fhg0b0rRpU3bt2oWfnx/nH19VePbsWZo2bUrTpk1Zt27dc/fxX/VrlOPRo4eptj8REXn9pMsZtPTMFG3CYGN42WUkEG0yYWN4vpqaNWr0QneHX7t2LU2bNqVTp04AVKpUKX7b1q1b8fHxYciQIc+9/9eNyWSKXwhdRETSjgLaS+Du7s6nHT5k59793L57F3+/j6hV7a1E7cZPncm+g4eJio4ma5YsDOvbg9y5cnIx/DKtO/vTrFEDft21l4jICIb26UGZ0p6J9nEx/DKtm3xA4+aN2L9nP7Xq16JMxTJMHD2J27duYzAY+LhbRypUrsDlS5fp0rYra7atBkjw+O+/G77XkD3/t5uIiEh6D+5FKe9SAHy/fC2rFq/C+Q1nqvgmv94iwJCAAJwcHTl/8SK37txh0dSpzFu6lI3btgFQolgx+nz2GY4OccthXb56Ff8BAwi/epWC+fIxuHdvMjk5MX3BAqJtbenbty+rV68mJCSEzJkz8+eff5IpUyamTJlCjhw5kqxh1qxZbNq0CXt7e9avX8+yZcto0KAB06ZN48SJE8yfP5+YmBj279/PlClTiI2NZfDgwdy8eRNTjBUdOnWjXIWkb9C6ankwP/+
0FZMpGju7DHz2RT/cirgnaLNv707Wrl7K8K8DuX3rJq3er8OAwd9QpXotViydz4P79xk+pN8Tx8GoCd+SN3cu2n/QDIDjf/xF3+Ffs3bhLFat30TwijXY2dkSExPD2KFfUahAviRrXbtpKxt/+AknJ0fCrlzHIZMD/Yb3I4fLG2xev4XtW38iS9asnDtzjt6DenHzxk1mfzsbU0wMWbNlpceAL8iTLw8Am9ZuYvXSNQDY2towcuIonLNnY/dvu1k0ZzFGoxEbWxu69uxCiVIlOH36NH169iQiMpKYmBga1qlDu+bN2b5jB1PnzsVgbY0pJoY+n332xIXTRUReRwpoL4mVlTULpk7k7PkLtOvWE5/SnokWTO/YugW9usbd2HR1yCYmTZ9DwJC4dS9v37lL6ZIefO7XgQ0//Ejg9DnMD0r6jvq3b98mf6H8tO/cHoBu7T/jnXffoUHT+pw9fY4efj2Yu3LOU2u+e+cuJUqX4ONuHfnfpm3MnDyLyXMCOfXnaRbPWcy0RdNwzp6NWeNnp+gzOHzsGDPHj8fBwYH/27OHjdu2MWfSJJwcHRkSEMCs4GD8Hy+mfiA0lCXTppE9WzaGjRvHrODgJNdsPHLkCOvWrcPV1ZWBAwcSHBxMjx5JL0fVqVMn/vrrLzw9PWnTpk2CbY0bN+bcuXM8fPiQvn37AtC8eXNatGhB8+bN+en/DtHnCz+mz1tJ1qzZEu27Zp13eL9F3D4P7NvNlIlfMyloXoI2JUv5MGbUQKKjozm4fw8eJUpz8MAeqlSvxcH9e2n2Qdyd9Z80Dlq91xj/AUNo1/J9rKysWLpmHS2bNsLKyoqJ02axat50crnkwGg0YoqJSfa7OHDkKMtnB1G8QlVGjx9F0LgghgbEzRweORjKzCUzyJ03N7du3qJPt75MmDGBgoULsPH7TYwe+DVB87/l4O8HWTx3CYGzJuH8hjOPHj7CYDBwKewSC2cHM2bKNzhldOLsqbP09+/Pkg1LWLx4MW9WqIDf48//7r24dSunzZ9Pf39/fEqVwmQy8SgiItn6RUReNwpoL8m779QFoGD+fHgULcKRY8ep/qZvgja/7d7Lsu9DePjoUaIljhwdHKhWuSIApUsUZ/zUmU/sK0OGDFSvXR2IW+z81B+nqNf4cf+FC1DE3Y1jR45TuEihZGt2cHTAt0rcIcASpTyYNmkaAIf2HaLiWxVwzh4XVFq2bMnGDRue+hnUrFoVh8czZLv376dO9epkdHIC4N133mHcv1YOqFKxItmzxe2/Sb16BAQFJbnPMmXK4OrqCoCXlxc7dux4ah0pcf/+fY4fP877j8+tK1CwMIWLuHPi2BEqVa6aqP1ffxxn2aK53Lt3Bytray5eSHy3fHt7ewoUKMyJ46Ec2L+H1u06MXt6IFFRUfz5x3FKesbNGD1pHBQumJ88rq783+7fKV2yOD//3256d4sLreV9vBn8zXiqv+lLVd8K5M3tmuz78ylVkoL542bY6jdtgN8H/6x4UMrbk9x5cwNwPPQEbsUKU7BwAQDqNa7L5DGTefjgIbv/bze136mN8xtxy1Y5OMZ9t3t37iU8LJwen/xzYY7JFMPNG7coX74834weTVR0NOW8vCjv7R1Xv7c3E6dPp1bVqlQuX54ihZIfmyIirxsFNAsQSyxgleC5S5evMC5oBoumTyavay4Ohh6j/4hv4rfb2f1zmxBra+v4f7hnLlzCD9t/BaB3t87kcc2Jg4MDVlZx+4+NjU2yBisrKwwGQ4LtRqMxQRtb2//0GW1Kdp9P42hvn7CG/9b0hNfFPq43KRkyZIj/22AwJAq2qc3Kyoozp/9i3NeDASjtXZaOn/gzamhfxk6aSZFixblx/RptWtRP8vXeZcpzaP8eThwL5bMv+pM1W3a2b9tM4cJFsbPLwMWLF5MdB63fb8Ky70M4de48NapWJlPGuIA7ceQgQk/8wZ79B+n0RV8G9vyctyqVT9mbio1N8OHbPw7R8due8M0
8aRjExkJ533L0G94v0bbSdUtTLEcOdu7bx7ylS1m3ZQsj+/WjV5cu/HnmDHsPHKDviBF82KwZ7zVokLL6RUReA+kyoEWZoh7fEiN1RUQZn97osbWbtvJJu9acC7vIyb9OU6pE8QTbHzx4iK2NDW84ZyMmJoYVa58+IwXg17YVfm1bxT++GH45wXanjE64FXNja8hW6jWux/mz5zn1x2k8PIuTKVMmoqOjuXjhInny5WHb5h9T1Kd3OW+WLVjGrZu3yOacjZUrV6bodf9WsUwZAmfO5IN338XRwYHvN26kQpky8dt/272bW7dvky1rVtZv2WL285EyZsyIh4cHa9as4f333+fC+bOcPvUH7h6eZM2ajaCZi+PbPnhwH5PJxBsuOQEIWbviifv1KlOBsaMHkS9/QWxtbfEuU57g+TOo905TIG7mLrlxUKVSecYFzeD4n38RNGYEANHRJsKvXKGUhzulPNwJuxjOiT9PJRvQDoYe41zYRdxzFWLL+i14l/VOsl2J0iUYN2I858+eJ3/B/GwN2UoRdzccnRzxrVKJcSPG0/C9hjhnzxZ3iNPGQLlKZVk4cyFnT52loFtBAE4cPUHxksU5d+4c2Z2daVy3Lvnz5GHYuHEAnL1wgaKFClG0UCEePXrEsZMnFdBEJF1JlwHtvzepBTh98Z5Za7CztaV9t57cunOHQb38E51/VtStELWrV+G99p1xzZmDsl6l2X/4SKr0PWBkfyaOnsTKxaswGAz0G96XrI/779arK3269SVnLhe8y3mnaH9uRQvT+qPWdP/4C7Jlz0bdmnWfuaY3K1Tgz9On+ah7dwA8ihal04cfxm+v4OPDsPHjuRgeToG8eenx6afP3MeLGjduHIMHD2bevHmYYqz4sv/wJM8/c3LKSNuPOtO9SztcXHJRrmLSFxIAFPfw5O6d23j7xIUnb5/yzJsVFP/Y3d092XFgbW1N43q1+G33XtyLFAYgJsbEoK/Hc+/+A6ytrcjpkoPunTsm+97KepXiuzkLOT08IP4igaRkzZaVfsP7Muqr0ZhMJrJmy0r/EXHnRXqV9aJVh1b06folVtbW2NraMnLiSPLmz0v/Ef0YN2IckZFGoqOiKelVkuIli7Np0ybWrl6NrY0NWFnRu2tXAKbMns2FixcxWFuTKWNGBvXqlWz9IiKvG6vY5z0+ZWEiIyMJDQ3F09MzwWEugOPHj+Ph4ZHs680Z0OrXKMfOTWtwdHR4euNUYJ+rEGH3LpilL4C8mfJx/3zic67SyovcZuN5mWu8FM6TiYjLZ5Jt07lnf95vVJ86byc+Fy4l1m7ayi879zB++ECNlTTwImPl4vm/+G7TnRS3Xzy8On+M6/Dc/T2rYr3npclNv58koPoEdj3+jzhzqBQYaLa+/tZ68Haz9KOxkrrSYqzoRrUir6ijJ/7gnVYfkTGjU5K3aRERkVdXujzE+bKdPHnyqbMir4OTf/3F0LFjEz3fokkT3jXj+UQrVqwgODg40fPffPPNU2dWLVnJ4sXYsGRuitu3+uTzRBdNlCpRnEG9/GlSv05qlyciIi9AAU3SjHuRIix5whJN5tS8eXOaN2/+sst46ZbMmPKySxARkRTSIU4RERERC6OAJiIiImJhFNBERERELEy6PActJioKa9uE90IrnCfTC+83KsLIhRuRL7wfERERSd/SZUCztrVNk/ujxN0H5cUCmle1ema9R9rTnDx2klWLVzFg5IBk212+dJkubbuyZtvqRNvu3b/P6g0baN+y5XPV0LBNGyaNGGFR6zH+sHk9u3f9ysChAamyvz49PuH9Fm2p6Fsl0bahARNpXLc2Zbw8n2vfy78PYdGq78lgl4FJowYz6OvxzA6Mq/vHX3cwecZc7OzsCBjSj+K5XvwzPvj7QaYHzuC7hVOf3lhERJKULgOapJx7CfenhrOnuXf/PvOXL3/ugPasTCYTBoPBLH2llMkUjcHwfP93G9qnxwv1vXjVWkYN+BJPD3eA+HA
GsHLdRrp2bPvcN7l93URHR2Njo59FEXn59Ev0Eri7u/Nphw/ZuXc/t+/exd/voyRvNDp+6kz2HTxMVHQ0WbNkYVjfHuTOlZOL4Zdp3dmfZo0a8OuuvURERjC0Tw/KlE48wxI4Yw7ZcxegXsu6bP9hOyMHjGLFluVkc85Gf/8BvN/6PcpVKsfu33azaM5ijEYjNrY2dO3ZhRKlSiSaDfl+2fesXrqGjJmcqPBmRdYuX5tg1mx20BwO7DrAw3v3GNSrFz6ennwzZQr379+nVefO2NvbMzcwkGs3bjA2KIjLV68SERlJvbffpmPr1gAcOHKErydPxj5DBjw9PJ66GPvq1avZsGEDzs7OnDp1ilGjRnHt2jUmTJiAyWTC2dmZ4cOHU6BAAVavXs327duZPHly/Gv/frx69WpCQkLInDkzf/75J5kyZWLKlCnkyJEDo9HIyJEj2b17Nzlz5uSNnPme+j23b9WIug2acOjAXnK55uHTz3rz3eSx/HHyKAA1ajegRasO8e0P7NvDquULuX7tKlWr16ZDp24AfNz9S9q1bEa1yhUZ9PU47OzsOHfhIleuXqN0SQ9GDuj9xMXjvxw6mguXwvlq9FhKFCvKZ53a07qzPz+vW87Yb6ez/0goZy+Esez7EGYHBvDLL78wZuw3mGJiyJotKz0GfEGefHmS3PfogaO5cC6MKKOR3Pny8OXg3mTKnPBUgVnfziJT5ky0bNcyyfHX+ePOeObJQ/evvuLO3btEGo2UdHfnqy++wNbWlhZ+fgzp3ZuS7nHhMnjlSs5euMCA7t0Z8+237D14EDtbWxzt7ZmTzJ28hwQEYGNjw5XbtwkPD6d8+fIMHjwYOzs7+vXrh5OTE2fPnuXWrVusXr2aGTNmsG7dOgBKlSrFwIEDcXJywmg0MnHiRH799Vesra3Jly8fQUFBAMycOZMtW7ZgMpnImTMnI0aMIEeOHOz8v+0smPMd1tYGTKZouvr3obR3ORbNn8H2H7dgZ5cBrGDMhOlkzPjip1qIyOtBAe0lsbKyZsHUiZw9f4F23XriU9oz0XqcHVu3oFdXPwBWh2xi0vQ5BAyJW/fw9p27lC7pwed+Hdjww48ETp/D/KDEC8BXLOPNwjUbqdeyLvv3HMCjlAcH9h6kas0qHD96HE9vTy6FXWLh7GDGTPkGp4xOnD11lv7+/VmyYUmCfZ368zSL5y1hxuLpZM2WlaDxCQ9h3b1zlxKlSzCk3xCWz53LlJkzmRMYSL/PP6dNt24J7ok2JCCATh9+SJnSpYmKiuLTPn0o4e5OmVKl6D9qFCP796eclxdbf/6ZZd9//9TPc//+/axdu5b8+fNz48YNPvroI4KDgylSpAgrVqygd+/erFjx5EXL/3bkyBHWrVuHq6srAwcOJDg4mB49erBs2TLCwsIICQkhOjqa5i1a4ZLL9an7u3njOmMmxL3v2TMmExMbw3ezl/Hw4QN6ftaRQoWLUr7imwCcP3ear8dNxWg00vOzj/AoWZrCzRLf0PevM2eZMf4brK2taNGpG7t+P4Bv+TKJ2gGMHTqA+i3bMW7YQIoWLsjF8Mvx2778rDMn/vwrPvzduHWbPn36MG76OAoWLsDG7zcxeuDXBM3/Nsl9d+vdjSxZswAwZ+ocls5fit/nfgna+JT3YUXwClq2a5nk+CtbtizRV68yasAAsmbOTGxsLEMCAli7eTPNGjWiRePGrFi3jpJffklsbCwr168nYPBg/jh9mj3797Nqzhysra25e+/pyymFnjjB8lWryJAhA5988gnLly+nTZs2ABw4cIDg4GAcHR35+eefWbduHUuXLsXJyYm+ffsydepUvvzyS2bMmMGFCxdYvXo1dnZ23Lx5E4C1a9dy/vx5li9fjrW1NYsXL+abb75h/PjxLJw7nW7d++FZ2geTyURExCPu3bvLquXBLFm9lQwZ7Hn48EGiJepEJH1TQHtJ3n0nbkHxgvnz4VG0CEeOHaf6m74J2vy2ey/Lvg/
h4aNHie4A7+jgQLXKFQEoXaI446fOTLIf71Il6TN8DFFRURw9dJTOX3zCL9t+5Q2XNyjsVgh7e3v27txLeFg4PT75Z500kymGmzduJdjXoX2HqPhmhfiF1es1qsv/Nv4vfruDowO+VSoBUMrDg0lPuEnto0eP+P3QIW7dvh3/3INHjzhz/jzZs2XDPkMGynl5AVCnWjVGTZyY5H7+rUyZMuR/vM7ioUOHKF68OEWKFAHg/fffZ9iwYdy/fz9F+3F1jQteXl5e7NixA4Ddu3fTtGlTbG1tsbW15e1a9TkaevCp+6tZ5534vw/u20Pnz+Jmu5ycMlK9Rh0O7t8TH9Bq1W2IwWCDg4MNVd+uw6EDe2mVRECr8VZlMmSwA8CjaBEuXArHN1GrZ3fk2AmKFy9OwcIFAKjXuC6Tx0zm4YOHODo5Jmq/NeQHtm3eRnRUFBEREeTNnzdRG08vT0b0H/nE8efg4MCdmBgWrljBjj17MMXEcO/+fewfh5V3atdmZnAwd+7e5ejJkzhny0YxNzfuPXiAKSaG4ePHU97bmyqVKj31/dWpVg0nJycAmjZtytatW+MDWr169XB0jHuPO3fupEGDBmTMmBGAFi1aMHr0aAB++ukn+vXrh51d3Ofv7OwMwI8//khoaCjvvvsuEHeY/e/Xe/mUY+Z3E6lSrRblKlamYKEimEwm8uYrwNjRgyhbvjIVfavg6OiUkq9JRNIJBTQLEEsskPAQ1aXLVxgXNINF0yeT1zUXB0OP0X/EN/Hb7ez+uQrV2to6PsDNXLiEH7b/CkDvbp2pUMYLd3d3ftzyE85vOONdzptpk6aTw+UNvMv7xPUfC+V9y9FveL9EtZ0/c+6fOmNjn3goDcD2X1fGGqytif5PqPxbzOP9LAgKwvY/5/v8cerUE/efnL//4X1anQaDgZiYmPjHkZEJL+r49yyGwWCI/1yfdpj1SRwc/rnYI66u/7Z4wueZzHv4OxzE1WidKLw/vyf3uXndZlYvXQNAi7YtcMnlwvpV65k8J5Cs2bKybfM2NqzekOh1GewzULho4WTH3+Yff+RgaCizJk7EydGROYsXcy4sDAAHe3vq1ajBui1b2Hf4MC0aNwYgk5MTK2bNYt+hQ+w5cIDJs2ax6LvveONxYHrqO/3P5/t3OEtq239f96Tnu3TpQrNmzRJt69ytF2dO/8WhA3sZPawf7zb7kPoN32Vi0FyOhh7i0IHf+fzTNoz8ZgqF3IqmqH4Ref2ly4AWExWVJivPR0UYU9x27aatfNKuNefCLnLyr9OUKlE8wfYHDx5ia2PDG87ZiImJYcXaxP/4JcWvbSv82rZK8Jyvry/zp8+n0fsNsbOzI4fLG2wJ2cqAEXGHS8tVKsvCmQs5e+osBd0KAnDi6AmKl0xYk3dZL5YvXM6d23fIkjULW0K2pqgmJ0dHIiIjiTaZsDEYcHJ0xMfTk3lLl+L3eAbj8tWr2NjYUDBfPiKNRvYfPkyZ0qX53y+/cP/BgxT18zcfHx+++uorTp06hZubG2vWrKFEiRJkzJiR/Pnzc/LkSYzGuO9qy5YtZM6c+an79PX1Ze3atTRo0IDo6Gi2b9tMjpy5nq2uchXZsmEtJUp68ejRQ37+aSudPv0ifvuPP2yk2tu1iTJG8evP22j/cddn2v+LKl3Sg6FjJ3P+7HnyF8zP1pCtFHF3w9HJkXqN61Gvcb34tjt+2YlTRicyZ8mM0Whk87rNT9xvmfI+yY6/ew8ekDVLFpwcHbn34AGbf/oJj6L/BJUWjRvj16sXJpOJsYMHA3Dr9m0MBgOVy5enYpky/LprFxfDw5MNaP/75Rf8unfHzs6OdevW8fbbbyfZrnLlyowbN462bdvi5OTEypUrqVy5MgA1atRg/vz5eHl5xR/idHZ2pkaNGixYsIDatWuTJUsWjEYjp0+fpnjx4oSdP0uhwkUoVLgIjx495I+Tx6hWow4Rjx5R2qsspb3KcvzoYc6ePaWAJiLx0mVA++8
90ABOX3z6OSypyc7WlvbdenLrzh0G9fJPdP5ZUbdC1K5ehffad8Y1Zw7KepVm/+Ejz9WXr68vgYGB+DyesfApX4bQQ0cp7hkXwPLmz0v/Ef0YN2IckZFGoqOiKelVMlFAcyvmRst2Lfn8I3+yZc9G2QplcMr49MMyWTJnpn6NGrT08yNzpkzMDQxkZP/+TPjuO1r4xZ2z5OToyOBevXjD2ZnRAwbEXyRQztubXC4uz/R+nZ2dCQgIoHfv3kRHR+Ps7MzYx4u2+/j44OvrS8OGDcmbNy9ubm5cu3btqfts0aIFJ0+e5J133iFXrlyU8irL5csXn6mu1m07MXVyAF0+jruatUbtBpSrUDl+e5Gixenfuys3rl+jSrVaSd5yIy05Z81KQEAAo74ajclkImu2rPR/HKL+q2LlCmzb9D86NPuIHC45KOZRjJNHTyTZ1qdCGeZOm/fE8fdO7dr8vGMHzTt1Ikf27Hh7eiaY2czj6krBfPnwLF48fpb28rVrjJw4EZPJhMlk4s0KFSj1lIXvfUqXplu3bly6dIny5cvTokWLJNtVq1aNkydP8sEHHwDg6elJly5dAPjkk08YP358/OHuAgUKMHnyZJo2bcrt27fjD5nGxsbSqlUrihcvzpyZ33Lp4nkMBhucMmbki96DefDgPqOG9CHSGElsTAxFihbnzSpJB0YRSZ+sYp/32I2FiYyMJDQ0FE9Pz0Qn2x4/fhyPp/x4mzOg1a9Rzqz3OrPPVYiwexdSZV//Ph9p/vT5XAy7FD8T8re8mfJx//z5VOkvJTI+PvfMnMw1XgrnyUTE5TNm6QtSd6ykRErGyv0HD3i/Y0cWfPstOXPkeK5+hgQEUKJYMT7293+u17+IFxkrF8//xXeb7qS4/eLh1fljXIfn7u9ZFes9jz7bez69YSoJqD4hTe5h+SRpcaTlaVoP3m6WfjRWUldajJV0OYMmz2/mt7M4eugoUVFRuOZxpedXL3aPLpHkrFy/ntmLF9OmWbPnDmciIq8iBbSX4OTJk2adFUlN3fuafwbib226dk10QnwpDw9GT0h8exFz2bzhe9Z/vzzR8z37DsGtiLvZ6hgxfjJHjiU8xGgwGFgyY4rZakgLzRo1olmjRilqe/Kvvxj6+FD2v7Vo0oRhffqkdmkiImlKAU1eGcFTLW/poHrvNKXeO01fdhkM6vXygrOlcC9SJMG99kREXmXWL7sAc3lNTrUTkXQiNjYW/WyJpF/pIqDZ29tz48YNhTQReSXExsYS8eAO1+6m1j3uRORVky4OcebNm5ewsLBkb6dw/XaE2eqJvGtP1N3rZuvP9lYEtyJumq2/e/b3ibxpvv4yPON90lKDucaLxkrqelXGSmwsXLtrYuPv5q9XRCxDughotra2FCpUKNk25rq0GWDxcJ/X+/JmH/Ne3uz9Wl8Kr7GSml7nsSIirxezHeI8c+YMLVu2pG7durRs2ZKzZ88mamMymRg2bBi1atWidu3aKVrcWkREROR1Y7aANmTIEFq3bs2WLVto3bo1gx8v2fJv69ev5/z582zdupVly5YxZcoUwh6vySciIiKSXpjlEOeNGzc4duwYc+fOBaBhw4aMGDEifh27v23cuJHmzZtjbW2Ns7MztWrVYvPmzXTq1Ompffx9AcDfayw+q8wOT14EPLVFRkYSY5/JrP05Wj19SabU7A8nM/dnZuYaLxoradCfmem3JXX703hJHRoradDfc7Kzs8PKKvH3bpalnkJDQ+nbty8bNvyz4HeDBg0YO3YsJUuWjH+uUaNGjBo1itKlSwMwc+ZMrly5wsCBA5/ax7179/jjjz9Sv3gRERGRNJLUEpXwGl0k4OTkRLFixbC1tU0yiYqIiIhYGjs7uySfN0tAc3V15cqVK5hMJgwGAyaTiatXr+Lq6pqo3aVLl+Jn0MLDw8mdO3eK+rC2tiZTJvNN14qIiIikFbNcJJA9e3Y8PDwICQkBICQkBA8PjwTnnwHUq1ePFStWEBM
Tw82bN/nf//5H3bp1zVGiiIiIiMUwyzloAKdOnaJfv37cvXuXzJkzM2bMGAoXLoyfnx/+/v6UKlUKk8nE8OHD+b//+z8A/Pz8aNmypTnKExEREbEYZgtoIiIiIpIy6WItThEREZFXiQKaiIiIiIVRQBMRERGxMApoIiIiIhZGAe01lpIF6kUAxowZQ40aNXB3d9eKHJLIk8aHfmPkv27duoWfnx9169alUaNGfPbZZ9y8eRPQeHlWCmivsZQsUC8CULNmTRYtWkSePHlediligZ40PvQbI/9lZWVFp06d2LJlC+vXrydfvnyMGzcO0Hh5Vgpor6m/F6hv2LAhELdA/bFjx+L/S0bk38qVK5doZQ+RvyU1PvQbI0nJmjUrFStWjH/s7e3NpUuXNF6egwLaayo8PJycOXNiMBgAMBgMuLi4EB4e/pIrE5HXgX5j5GliYmJYsmQJNWrU0Hh5DgpoIiIikupGjBiBo6Mjbdq0edmlvJIU0F5T/16gHnjiAvUiIs9DvzGSnDFjxnDu3DkmTZqEtbW1xstzUEB7TaV0gXoRkeeh3xh5kokTJxIaGkpQUBB2dnaAxsvz0Fqcr7EnLVAv8l8jR45k69atXL9+nWzZspE1a1Y2bNjwsssSC/Gk8aHfGPmvP//8k4YNG1KwYEHs7e0ByJs3L0FBQRovz0gBTURERMTC6BCniIiIiIVRQBMRERGxMApoIiIiIhZGAU1ERETEwiigiYiIiFgYBTQRERERC2PzsgsQEXkWNWrU4Pr16xgMBhwdHalSpQqDBg3CycnpZZcmIpJqNIMmIq+cadOmceDAAb7//nuOHTvGjBkzXnZJAERHR7/sEkTkNaGAJiKvrBw5cvDWW29x/PhxAA4ePMgHH3xAuXLlaNy4Mbt3745vu3r1amrWrImPjw81atRg3bp1AMTExDB16lTefvttfH196dOnD/fu3QNg9+7dVK1aNUGfNWrUYMeOHQBMmTIFf39/evfuTZkyZVizZg23b9+mf//+vPXWW5QvX56uXbvGv/ann36iSZMmlCtXjg8++IATJ07Eb5sxYwZVqlTBx8eHunXrsnPnzrT50ETklaBDnCLyyrp8+TK//vorFStW5MqVK3Tu3JmAgACqVKnCzp078ff3Z9OmTdjb2zNy5EhWrlxJ4cKFuXr1Knfu3AHigtuaNWtYsGABzs7O9O3bl+HDhzN27NgU1bBt2zYCAwMJCAjAaDTi7++Po6MjGzZswNHRkQMHDgBw9OhRBgwYwLRp0/D09GTdunV07dqVzZs3ExYWxqJFi1i5ciU5c+YkLCyMmJiYNPvcRMTyaQZNRF453bp1w8fHh2rVquHs7Iy/vz9r166latWqVKtWDWtra9588008PT35+eefAbC2tubPP/8kIiICFxcXihYtCsD69evp0KED+fLlw8nJiZ49e7Jx48YUH6709vamVq1aWFtbc/fuXX755ReGDRtGlixZsLW1pUKFCgAsX76cli1b4uXlhcFg4N1338XW1paDBw9iMBgwGo2cOnWKqKgo8ubNS/78+dPmwxORV4ICmoi8coKCgjhw4AALFy7k9OnT3Lp1i0uXLrF582bKlSsX/799+/Zx7do1HB0dmThxIkuXLuWtt97ik08+4dSpUwBcvXqVPHnyxO87T548REdHc+PGjRTVkitXrvi/L1++TJYsWciSJUuidpcuXWLu3LkJ6rt8+TJXr16lQIECDBgwgClTplC5cmV69OjBlStXXvBTEpFXmQ5xisgrq0KFCrz33nuMGTMGLy8vmjRpwsiRI5NsW6VKFapUqUJERASTJk1i0KBBLF68GBcXFy5evBjf7tKlS9jY2JA9e3auXLlCRERE/DaTycTNmzcT7NfKyir+71y5cnHnzh3u3r1L5syZE7RzdXXl008/pUuXLknW16hRIxo1asT9+/cZPHgw48aNS/FhVhF5/WgGTUReae3bt2fHjh2ULVuWn376iV9//RWTyURkZCS
7d+/m8uXLXL9+nW3btvHw4UPs7OxwdHTEYDAA0LBhQ+bPn8+FCxd48OABEydOpH79+tjY2FCoUCEiIyPZvn07UVFRfPfddxiNxifW4uLiQtWqVRk2bBh37twhKiqKvXv3AtC8eXOWLl3KoUOHiI2N5eHDh2zfvp379+9z+vRpdu7cidFoxM7OjgwZMsTXJyLpkwKaiLzSnJ2dadKkCfPnz2fq1KlMnz4dX19fqlWrxuzZs4mJiSEmJoa5c+dSpUoVKlSowN69exkyZAgA77//Po0bN6ZNmzbUrFkTOzs7Bg0aBECmTJkYMmQIAwcOpGrVqjg4OCQ4pJmUgIAAbGxsqF+/PpUrV2b+/PkAlCpVihEjRjB8+HDKly9PnTp1WL16NQBGo5Hx48dTsWJF3nrrLW7evEmPHj3S8FMTEUtnFRsbG/uyixARERGRf2gGTURERMTCKKCJiIiIWBgFNBERERELo4AmIiIiYmEU0EREREQsjAKaiIiIiIVRQBMRERGxMApoIiIiIhZGAU1ERETEwvw/xJB1rQzyBVgAAAAASUVORK5CYII=\n", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -879,17 +830,17 @@ "source": [ "import matplotlib.pyplot as plt\n", "import seaborn\n", - "resources = constants \n", + "resources = [int(1 / c) for c in constants] \n", "df = pd.DataFrame({\n", " 'Model Runtime Const': resources, \n", " **all_results\n", "})\n", - "fig, ax1 = plt.subplots(figsize=(5, 5),)\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", "seaborn.barplot(x='Model Runtime Const', y='Value', hue='Variable', data=tidy, ax=ax1)\n", - "ax1.set(xlabel='Model Runtime Const', ylabel=f'{metric} Accuracy')\n", + "ax1.set(xlabel='Resources', ylabel=f'{metric} Error')\n", "ax1.legend_.remove()\n", - "plt.legend(loc='lower center')\n", + "plt.legend(loc='lower left')\n", "seaborn.despine(fig)" ] }, @@ -1017,7 +968,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a958fee2", + "id": "d2775834", "metadata": {}, "outputs": [], "source": [ @@ -1027,7 +978,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87453791", + "id": "41a032ee", "metadata": {}, "outputs": [], "source": [ @@ -1120,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "c768b43d", "metadata": {}, "outputs": [], @@ -1168,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 185, "id": "ac3582ce", "metadata": {}, "outputs": [], @@ -1179,57 +1130,782 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 186, "id": "ce16ddda", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0questionanswerdoc_iddatetimerevidoldrevidts_min
00what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:16:27.428572103721253210372124891299.0
11what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:32:54.857144103721253210372124891315.0
22what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:49:22.285716103721253210372124891331.0
33what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:05:49.714288103721253210372124891348.0
44what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:22:17.142860103721253210372124891364.0
...........................
127727127727who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 20:46:09.2307001041650936104165081839968.0
127728127728who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 21:30:27.6922361041650936104165081840013.0
127729127729who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:14:46.1537721041650936104165081840057.0
127730127730who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 22:59:04.6153081041650936104165081840101.0
127731127731who is the ayo??????Hunter B-15 (portrayed by Wunmi Mosaku) is an ...623726382021-09-01 23:43:23.0768441041650936104165081840145.0
\n", + "

127732 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 question \\\n", + "0 0 what is the most common death in 2021??????? \n", + "1 1 what is the most common death in 2021??????? \n", + "2 2 what is the most common death in 2021??????? \n", + "3 3 what is the most common death in 2021??????? \n", + "4 4 what is the most common death in 2021??????? \n", + "... ... ... \n", + "127727 127727 who is the ayo?????? \n", + "127728 127728 who is the ayo?????? \n", + "127729 127729 who is the ayo?????? \n", + "127730 127730 who is the ayo?????? \n", + "127731 127731 who is the ayo?????? \n", + "\n", + " answer doc_id \\\n", + "0 A typical entry reports information in the fol... 65984422 \n", + "1 A typical entry reports information in the fol... 65984422 \n", + "2 A typical entry reports information in the fol... 65984422 \n", + "3 A typical entry reports information in the fol... 65984422 \n", + "4 A typical entry reports information in the fol... 65984422 \n", + "... ... ... \n", + "127727 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127728 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127729 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127730 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "127731 Hunter B-15 (portrayed by Wunmi Mosaku) is an ... 62372638 \n", + "\n", + " datetime revid oldrevid ts_min \n", + "0 2021-08-06 00:16:27.428572 1037212532 1037212489 1299.0 \n", + "1 2021-08-06 00:32:54.857144 1037212532 1037212489 1315.0 \n", + "2 2021-08-06 00:49:22.285716 1037212532 1037212489 1331.0 \n", + "3 2021-08-06 01:05:49.714288 1037212532 1037212489 1348.0 \n", + "4 2021-08-06 01:22:17.142860 1037212532 1037212489 1364.0 \n", + "... ... ... ... ... 
\n", + "127727 2021-09-01 20:46:09.230700 1041650936 1041650818 39968.0 \n", + "127728 2021-09-01 21:30:27.692236 1041650936 1041650818 40013.0 \n", + "127729 2021-09-01 22:14:46.153772 1041650936 1041650818 40057.0 \n", + "127730 2021-09-01 22:59:04.615308 1041650936 1041650818 40101.0 \n", + "127731 2021-09-01 23:43:23.076844 1041650936 1041650818 40145.0 \n", + "\n", + "[127732 rows x 8 columns]" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df" ] }, { "cell_type": "code", - "execution_count": null, - "id": "07d5672a", + "execution_count": 191, + "id": "76fa4e4f", "metadata": {}, "outputs": [], "source": [ - "df.doc_id.value_counts()" + "thresh = 20000" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d9dab1e5", + "execution_count": 192, + "id": "fe3f87bc", "metadata": {}, "outputs": [], "source": [ - "df.question.value_counts()" + "train_df = df[df.ts_min < thresh]\n", + "test_df = df[df.ts_min < thresh]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 193, + "id": "63a1c934", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0questionanswerdoc_iddatetimerevidoldrevidts_min
00what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:16:27.428572103721253210372124891299.0
11what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:32:54.857144103721253210372124891315.0
22what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:49:22.285716103721253210372124891331.0
33what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:05:49.714288103721253210372124891348.0
44what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:22:17.142860103721253210372124891364.0
...........................
127210127210What is the story of Soren??He first appears in the film \"Iron Man 3\" late...623726382021-08-18 23:18:001039252299103925226619960.0
127211127211What is the story of Soren??He first appears in the film \"Iron Man 3\" and ...623726382021-08-18 23:24:001039252266103923801419966.0
127212127212What is the story of Soren??He first appears in the film \"Iron Man 3.\" He ...623726382021-08-18 23:30:001039264480103925805919972.0
127213127213What is the story of Soren??He first appears in the film \"Iron Man 3\" late...623726382021-08-18 23:48:001039252299103925226619990.0
127214127214What is the story of Soren??He first appears in the film \"Iron Man 3\" and ...623726382021-08-18 23:54:001039252266103923801419996.0
\n", + "

48653 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 question \\\n", + "0 0 what is the most common death in 2021??????? \n", + "1 1 what is the most common death in 2021??????? \n", + "2 2 what is the most common death in 2021??????? \n", + "3 3 what is the most common death in 2021??????? \n", + "4 4 what is the most common death in 2021??????? \n", + "... ... ... \n", + "127210 127210 What is the story of Soren?? \n", + "127211 127211 What is the story of Soren?? \n", + "127212 127212 What is the story of Soren?? \n", + "127213 127213 What is the story of Soren?? \n", + "127214 127214 What is the story of Soren?? \n", + "\n", + " answer doc_id \\\n", + "0 A typical entry reports information in the fol... 65984422 \n", + "1 A typical entry reports information in the fol... 65984422 \n", + "2 A typical entry reports information in the fol... 65984422 \n", + "3 A typical entry reports information in the fol... 65984422 \n", + "4 A typical entry reports information in the fol... 65984422 \n", + "... ... ... \n", + "127210 He first appears in the film \"Iron Man 3\" late... 62372638 \n", + "127211 He first appears in the film \"Iron Man 3\" and ... 62372638 \n", + "127212 He first appears in the film \"Iron Man 3.\" He ... 62372638 \n", + "127213 He first appears in the film \"Iron Man 3\" late... 62372638 \n", + "127214 He first appears in the film \"Iron Man 3\" and ... 62372638 \n", + "\n", + " datetime revid oldrevid ts_min \n", + "0 2021-08-06 00:16:27.428572 1037212532 1037212489 1299.0 \n", + "1 2021-08-06 00:32:54.857144 1037212532 1037212489 1315.0 \n", + "2 2021-08-06 00:49:22.285716 1037212532 1037212489 1331.0 \n", + "3 2021-08-06 01:05:49.714288 1037212532 1037212489 1348.0 \n", + "4 2021-08-06 01:22:17.142860 1037212532 1037212489 1364.0 \n", + "... ... ... ... ... 
\n", + "127210 2021-08-18 23:18:00 1039252299 1039252266 19960.0 \n", + "127211 2021-08-18 23:24:00 1039252266 1039238014 19966.0 \n", + "127212 2021-08-18 23:30:00 1039264480 1039258059 19972.0 \n", + "127213 2021-08-18 23:48:00 1039252299 1039252266 19990.0 \n", + "127214 2021-08-18 23:54:00 1039252266 1039238014 19996.0 \n", + "\n", + "[48653 rows x 8 columns]" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "d3343738", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0questionanswerdoc_iddatetimerevidoldrevidts_min
00what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:16:27.428572103721253210372124891299.0
11what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:32:54.857144103721253210372124891315.0
22what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 00:49:22.285716103721253210372124891331.0
33what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:05:49.714288103721253210372124891348.0
44what is the most common death in 2021???????A typical entry reports information in the fol...659844222021-08-06 01:22:17.142860103721253210372124891364.0
...........................
127210127210What is the story of Soren??He first appears in the film \"Iron Man 3\" late...623726382021-08-18 23:18:001039252299103925226619960.0
127211127211What is the story of Soren??He first appears in the film \"Iron Man 3\" and ...623726382021-08-18 23:24:001039252266103923801419966.0
127212127212What is the story of Soren??He first appears in the film \"Iron Man 3.\" He ...623726382021-08-18 23:30:001039264480103925805919972.0
127213127213What is the story of Soren??He first appears in the film \"Iron Man 3\" late...623726382021-08-18 23:48:001039252299103925226619990.0
127214127214What is the story of Soren??He first appears in the film \"Iron Man 3\" and ...623726382021-08-18 23:54:001039252266103923801419996.0
\n", + "

48653 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 question \\\n", + "0 0 what is the most common death in 2021??????? \n", + "1 1 what is the most common death in 2021??????? \n", + "2 2 what is the most common death in 2021??????? \n", + "3 3 what is the most common death in 2021??????? \n", + "4 4 what is the most common death in 2021??????? \n", + "... ... ... \n", + "127210 127210 What is the story of Soren?? \n", + "127211 127211 What is the story of Soren?? \n", + "127212 127212 What is the story of Soren?? \n", + "127213 127213 What is the story of Soren?? \n", + "127214 127214 What is the story of Soren?? \n", + "\n", + " answer doc_id \\\n", + "0 A typical entry reports information in the fol... 65984422 \n", + "1 A typical entry reports information in the fol... 65984422 \n", + "2 A typical entry reports information in the fol... 65984422 \n", + "3 A typical entry reports information in the fol... 65984422 \n", + "4 A typical entry reports information in the fol... 65984422 \n", + "... ... ... \n", + "127210 He first appears in the film \"Iron Man 3\" late... 62372638 \n", + "127211 He first appears in the film \"Iron Man 3\" and ... 62372638 \n", + "127212 He first appears in the film \"Iron Man 3.\" He ... 62372638 \n", + "127213 He first appears in the film \"Iron Man 3\" late... 62372638 \n", + "127214 He first appears in the film \"Iron Man 3\" and ... 62372638 \n", + "\n", + " datetime revid oldrevid ts_min \n", + "0 2021-08-06 00:16:27.428572 1037212532 1037212489 1299.0 \n", + "1 2021-08-06 00:32:54.857144 1037212532 1037212489 1315.0 \n", + "2 2021-08-06 00:49:22.285716 1037212532 1037212489 1331.0 \n", + "3 2021-08-06 01:05:49.714288 1037212532 1037212489 1348.0 \n", + "4 2021-08-06 01:22:17.142860 1037212532 1037212489 1364.0 \n", + "... ... ... ... ... 
\n", + "127210 2021-08-18 23:18:00 1039252299 1039252266 19960.0 \n", + "127211 2021-08-18 23:24:00 1039252266 1039238014 19966.0 \n", + "127212 2021-08-18 23:30:00 1039264480 1039258059 19972.0 \n", + "127213 2021-08-18 23:48:00 1039252299 1039252266 19990.0 \n", + "127214 2021-08-18 23:54:00 1039252266 1039238014 19996.0 \n", + "\n", + "[48653 rows x 8 columns]" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_df" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "07d5672a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1305297 8677\n", + "332667 4300\n", + "66304621 3720\n", + "17888363 3569\n", + "3259011 1581\n", + " ... \n", + "67959451 15\n", + "49474213 12\n", + "66135952 12\n", + "66074428 8\n", + "40713040 2\n", + "Name: doc_id, Length: 118, dtype: int64" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.doc_id.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "d9dab1e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "What is the name of the company?? 4339\n", + "what is the name of the company?????? 4018\n", + "what is the u.s. open???????? 1201\n", + "What is the best tennis tournament in the world?? 1090\n", + "what is the std?????? 1035\n", + " ... \n", + "What are the main factors that affect the success of the high speed railway system?? 1\n", + "What did the government do?? 1\n", + "What is the purpose of the evacuation?? 1\n", + "what is a christian name?? 1\n", + "How many corridors are under construction?? 
1\n", + "Name: question, Length: 3219, dtype: int64" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.question.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 198, "id": "ea220f3d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1305297 8677\n", + "332667 4300\n", + "66304621 3720\n", + "17888363 3569\n", + "3259011 1581\n", + " ... \n", + "67959451 15\n", + "49474213 12\n", + "66135952 12\n", + "66074428 8\n", + "40713040 2\n", + "Name: doc_id, Length: 118, dtype: int64" + ] + }, + "execution_count": 198, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.doc_id.value_counts()" + "train_df.doc_id.value_counts()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 199, "id": "86c3ebf2", "metadata": {}, "outputs": [], "source": [ - "weights = df.doc_id.value_counts().to_dict()" + "weights = train_df.doc_id.value_counts().to_dict()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 200, "id": "af72ae1a", "metadata": {}, "outputs": [], @@ -1242,20 +1918,159 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 201, "id": "500bd6ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{1305297: 867,\n", + " 332667: 430,\n", + " 66304621: 372,\n", + " 17888363: 356,\n", + " 3259011: 158,\n", + " 58112801: 125,\n", + " 68507348: 120,\n", + " 62808792: 109,\n", + " 66052432: 98,\n", + " 62372638: 87,\n", + " 67089631: 86,\n", + " 67569553: 82,\n", + " 60476189: 82,\n", + " 68553225: 80,\n", + " 1425939: 77,\n", + " 66187257: 68,\n", + " 65521767: 58,\n", + " 67946554: 55,\n", + " 68498551: 45,\n", + " 442785: 44,\n", + " 734845: 40,\n", + " 487602: 38,\n", + " 66883576: 37,\n", + " 68294454: 36,\n", + " 61250187: 36,\n", + " 50170924: 36,\n", + " 61236755: 34,\n", + " 12936708: 30,\n", + " 1027173: 30,\n", 
+ " 46754025: 27,\n", + " 58542318: 25,\n", + " 61258486: 24,\n", + " 66753136: 24,\n", + " 5575754: 24,\n", + " 65871303: 23,\n", + " 12202928: 22,\n", + " 60600284: 22,\n", + " 51150040: 22,\n", + " 34075129: 22,\n", + " 58385279: 21,\n", + " 61049392: 21,\n", + " 60203476: 21,\n", + " 66341639: 21,\n", + " 63129286: 21,\n", + " 26833: 21,\n", + " 24689651: 20,\n", + " 66629866: 20,\n", + " 33385984: 20,\n", + " 58113491: 20,\n", + " 63170193: 20,\n", + " 63395714: 19,\n", + " 55055575: 19,\n", + " 67918135: 19,\n", + " 68284887: 18,\n", + " 65984422: 18,\n", + " 66040815: 18,\n", + " 57798785: 16,\n", + " 67131229: 16,\n", + " 53943680: 16,\n", + " 20304678: 16,\n", + " 64783122: 16,\n", + " 51345275: 15,\n", + " 39734558: 15,\n", + " 65666080: 14,\n", + " 31243078: 14,\n", + " 36567599: 14,\n", + " 67742925: 14,\n", + " 67674654: 14,\n", + " 6063379: 14,\n", + " 66293350: 13,\n", + " 912080: 13,\n", + " 2514174: 13,\n", + " 67843993: 13,\n", + " 26000816: 12,\n", + " 67037597: 12,\n", + " 67903070: 11,\n", + " 65760352: 11,\n", + " 67711917: 11,\n", + " 404323: 11,\n", + " 68107833: 11,\n", + " 57817558: 10,\n", + " 49632909: 10,\n", + " 65770543: 10,\n", + " 67928132: 10,\n", + " 68207325: 10,\n", + " 25743896: 10,\n", + " 68475822: 10,\n", + " 21537193: 9,\n", + " 67334964: 9,\n", + " 68187748: 9,\n", + " 66461741: 9,\n", + " 56185392: 9,\n", + " 68315181: 8,\n", + " 61293820: 7,\n", + " 2656208: 7,\n", + " 60070859: 7,\n", + " 68076456: 7,\n", + " 65708437: 6,\n", + " 68420852: 6,\n", + " 61243245: 6,\n", + " 68463873: 4,\n", + " 60043578: 4,\n", + " 49588: 3,\n", + " 68229696: 3,\n", + " 737: 3,\n", + " 18097883: 3,\n", + " 66459202: 3,\n", + " 66128424: 3,\n", + " 58542328: 3,\n", + " 66916437: 3,\n", + " 67688633: 2,\n", + " 63417935: 2,\n", + " 20306953: 2,\n", + " 67959451: 1,\n", + " 49474213: 1,\n", + " 66135952: 1,\n", + " 66074428: 1,\n", + " 40713040: 1}" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + 
], "source": [ "weights" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 202, "id": "db8ae3c9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1834" + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "open(\"/home/eecs/wooders/experiments/wikipedia/weights.json\", \"w\").write(json.dumps(weights))" ] diff --git a/wikipedia/preprocessing/log_data.py b/wikipedia/preprocessing/log_data.py index 2e8cca4..23522a0 100644 --- a/wikipedia/preprocessing/log_data.py +++ b/wikipedia/preprocessing/log_data.py @@ -53,6 +53,9 @@ def log_experiment(run, config): for filename in files: if "plan-" in filename and '.json' in filename: artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + if "plan-" in filename and '.pkl' in filename: + artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + run.log_artifact(artifact) if __name__ == "__main__": diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py index fc4b01d..06b9b7b 100644 --- a/wikipedia/preprocessing/wiki_api_data.py +++ b/wikipedia/preprocessing/wiki_api_data.py @@ -424,6 +424,8 @@ def generate_simulation_data( row["old_revid"] ), f"Invalid id {filename}, id {data['orig_id']} row {row['revid']}" + # get length of passage + if key not in init_data: diffs = data["diffs"][0] init_data[key] = { diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index a0689fe..36ac40b 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,12 +1,12 @@ set -xe -for key_policy in "random" "round_robin" +for key_policy in "random" "weighted_random" "round_robin" "weighted_round_robin" do - for event_policy in "lifo" + for event_policy in "lifo" "fifo" do for load_shedding_policy in "always_process" do - for model_runtime in 0.01 0.05 0.1 1 5 10 + for model_runtime in 0.25 0.005 do 
python simulate.py --model_runtime $model_runtime --send_rate 100 \ --event_policy $event_policy --key_policy $key_policy --load_shedding_policy $load_shedding_policy diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh index ceaf249..e863f05 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -2,15 +2,15 @@ set -xe plan_dir=/data/wooders/wiki-plans -for key_policy in "round_robin" "weighted_round_robin" #"random" "weighted_random" +for model_runtime in 0.005 do - for event_policy in "lifo" "fifo" + for event_policy in "lifo" do for load_shedding_policy in "always_process" do - for model_runtime in 0.01 0.05 0.1 1 5 10 + for key_policy in "round_robin" "weighted_round_robin" "random" "weighted_random" do - python wiki_eval.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json --wandb + python wiki_eval_tmp.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json --workers 32 done done done diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh index 3f0713b..e0e8f13 100644 --- a/wikipedia/run_3_run_predictions.sh +++ b/wikipedia/run_3_run_predictions.sh @@ -5,21 +5,22 @@ dpr_dir=~/DPR cd $dpr_dir -for key_policy in "weighted_round_robin" #"round_robin" -#for key_policy in "random" "weighted_random" +for event_policy in "lifo" do - for event_policy in "lifo" + for model_runtime in 0.25 0.005 + #for model_runtime in 0.01 0.05 0.1 1.0 10.0 0.25 0.005 do for load_shedding_policy in "always_process" do - for model_runtime in 0.01 0.05 0.1 1 5 + for key_policy in "weighted_round_robin" "round_robin" "random" "weighted_random" do plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 echo $plan_file - CUDA_VISIBLE_DEVICES=3,4,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & - pid=$! 
+ CUDA_VISIBLE_DEVICES=0,1,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + + #pid=$! done - wait $pid + #wait $pid done done done diff --git a/wikipedia/run_5_pipeline_predict.sh b/wikipedia/run_5_pipeline_predict.sh index f6aee15..ea36102 100644 --- a/wikipedia/run_5_pipeline_predict.sh +++ b/wikipedia/run_5_pipeline_predict.sh @@ -7,23 +7,23 @@ wiki_dir=/home/eecs/wooders/experiments/wikipedia #for key_policy in "weighted_random" "weighted_round_robin" #for key_policy in "random" "weighted_random" -for key_policy in "round_robin" "weighted_round_robin" +for model_runtime in 0.01 0.05 0.1 1 10 0.25 0.005 do - for event_policy in "lifo" + for event_policy in "lifo" "fifo" do for load_shedding_policy in "always_process" do - for model_runtime in 0.01 0.05 0.1 1 5 + for key_policy in "round_robin" "weighted_round_robin" "random" "weighted_random" do cd $wiki_dir plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 echo $plan_file python wiki_eval.py --offline-plan-path ${plan_dir}/${plan_file}.json cd $dpr_dir - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + CUDA_VISIBLE_DEVICES=3 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file pid=$! 
done - #wait $pid + wait $pid done done done diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 5dba7d1..fd2dc11 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -294,9 +294,10 @@ def __init__( key_selection_policy_cls: Type[CrossKeyLoadBalancer], model_run_time_s: float, keys: List[str], + num_replicas: int = 1, ) -> None: - super().__init__(env, source_queues, key_selection_policy_cls, model_run_time_s) + super().__init__(env, source_queues, key_selection_policy_cls, model_run_time_s, num_replicas) self.keys = keys self.source_queues = source_queues @@ -340,6 +341,14 @@ def run(self, replica_id: int): else: self.ready_time_to_batch[self.env.now] = edits + # TODO: Add variable runtime + + filename = f"{diff_dir}/{edits[0][0]}" + data = json.load(open(filename)) + num_passages = int(len(data["diffs"][0]) / 10) + runtime = self.model_runtime_s * num_passages + #print(runtime, num_passages) + yield self.env.timeout(self.model_runtime_s) @@ -347,6 +356,7 @@ def run(self, replica_id: int): config = configparser.ConfigParser() config.read("config.yml") plan_dir = config["simulation"]["plan_dir"] +diff_dir = config["directory"]["diff_dir"] #init_data_file = config["simulation"]["init_data_file"] #stream_edits_file = config["simulation"]["stream_edits_file"] #stream_questions_file = config["simulation"]["stream_questions_file"] @@ -391,7 +401,8 @@ def run_once( per_key_records_per_second: int, total_runtime_s: float, model_runtime_constant: float, - key_selection_policy: str + key_selection_policy: str, + num_replicas: int, ): env = simpy.Environment() @@ -429,6 +440,7 @@ def run_once( model_run_time_s=model_runtime_constant, key_selection_policy_cls=policies[key_selection_policy], keys=keys, + num_replicas=num_replicas, ) env.run(until=total_runtime_s) @@ -447,9 +459,10 @@ def run_once( parser.add_argument("--event_policy", type=str) parser.add_argument("--key_policy", type=str) parser.add_argument("--load_shedding_policy", type=str) + 
parser.add_argument("--num_replicas", type=int) args = parser.parse_args() - plan_name = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}" + plan_name = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}_replicas_{args.num_replicas}" out_path = f"{plan_name}.json" print(out_path) run_once( @@ -461,6 +474,7 @@ def run_once( total_runtime_s=args.total_runtime, model_runtime_constant=args.model_runtime, key_selection_policy=args.key_policy, + num_replicas=args.num_replicas, ) log_plans(run, config, plan_dir) diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py index a556403..ddc6604 100644 --- a/wikipedia/wiki_eval.py +++ b/wikipedia/wiki_eval.py @@ -234,7 +234,7 @@ def generate_question_data_all(exp_id, embed_filename): chunk_size = 1000 chunks = [(questions[i:i+chunk_size], embed_filename, directory) for i in range(0, len(questions), chunk_size)] - p = Pool(128) + p = Pool(64) staleness_all = p.starmap(generate_question_data, chunks) p.close() staleness_all = [item for sublist in staleness_all for item in sublist] @@ -315,10 +315,10 @@ def main(): #embed_filename = "embed_versions.pkl" generate_question_data_all(exp_id, embed_filename) - if args.wandb: - import wandb - run = wandb.init(job_type="dataset-creation", project="wiki-workload") - log_plan_data(run, config, exp_id, output_dir) + #if args.wandb: + # import wandb + # run = wandb.init(job_type="dataset-creation", project="wiki-workload") + # log_plan_data(run, config, exp_id, output_dir) if __name__ == "__main__": From b342494a728583a4aa212b7127ca888ed3234a77 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 14 Oct 2021 02:54:43 -0700 Subject: [PATCH 16/26] add notbeook --- stl/notebooks/STL Offline Plots.ipynb | 948 ++++++++++++++++++++++ stl/offline/run_6_simulate_baseline.sh | 37 + wikipedia/config.yml | 2 + wikipedia/notebooks/Wikipedia Plots.ipynb | 
250 +++++- wikipedia/preprocessing/log_data.py | 2 + wikipedia/run_1_generate_plan.sh | 19 +- wikipedia/run_2_prepare_data.sh | 10 +- wikipedia/simulate.py | 546 ++++++++----- 8 files changed, 1549 insertions(+), 265 deletions(-) create mode 100644 stl/notebooks/STL Offline Plots.ipynb create mode 100644 stl/offline/run_6_simulate_baseline.sh diff --git a/stl/notebooks/STL Offline Plots.ipynb b/stl/notebooks/STL Offline Plots.ipynb new file mode 100644 index 0000000..4b561e5 --- /dev/null +++ b/stl/notebooks/STL Offline Plots.ipynb @@ -0,0 +1,948 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 232, + "id": "642f67ca", + "metadata": {}, + "outputs": [], + "source": [ + "import wandb\n", + "import os\n", + "import pandas as pd\n", + "import sys\n", + "import json\n", + "\n", + "import seaborn as sns\n", + "import numpy as np\n", + "sns.set(style=\"whitegrid\", palette=\"muted\")\n", + "\n", + "sys.path.insert(1, \"/home/eecs/wooders/experiments/stl/offline\")" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "id": "0df714c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:2od4u8d0) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Waiting for W&B process to finish, PID 80915... (success)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value=' 0.26MB of 0.26MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced restful-butterfly-20: https://wandb.ai/ucb-ralf/experiments-stl_notebooks/runs/2od4u8d0
\n", + "Find logs at: ./wandb/run-20211014_021333-2od4u8d0/logs
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:2od4u8d0). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Syncing run avid-flower-21 to Weights & Biases (docs).
\n", + "\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact results:v11, 1446.02MB. 6579 files... Done. 0:0:0\n" + ] + } + ], + "source": [ + "run = wandb.init()\n", + "results_dir = run.use_artifact('ucb-ralf/stl/results:v11', type='dataset').download()\n", + "yahoo_train_dir = run.use_artifact('ucb-ralf/stl/yahoo_train_data:v0', type='dataset').download()\n", + "yahoo_eval_dir = run.use_artifact('ucb-ralf/stl/yahoo_eval_data:v0', type='dataset').download()\n", + "oracle_dir = run.use_artifact('ucb-ralf/stl/oracle:v0', type='dataset').download()" + ] + }, + { + "cell_type": "markdown", + "id": "b5a8aea6", + "metadata": {}, + "source": [ + "# Check Train / Eval Data" + ] + }, + { + "cell_type": "code", + "execution_count": 253, + "id": "0eedb687", + "metadata": {}, + "outputs": [], + "source": [ + "key = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 254, + "id": "975d3b68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 254, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_train = pd.read_csv(f\"{yahoo_train_dir}/{key}.csv\")\n", + "plt.plot(np.arange(len(df_train)), df_train[\"value\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "id": "c37f7834", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 256, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_eval = pd.read_csv(f\"{yahoo_eval_dir}/{key}.csv\")\n", + "plt.plot(np.arange(len(df_eval)), df_eval[\"value\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "id": "b3c30d2e", + "metadata": {}, + "outputs": [], + "source": [ + "df_all = pd.concat([df_train[\"value\"], df_eval[\"value\"]], axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "id": "93a15c67", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(np.arange(len(df_train) + len(df_eval)), df_all)" + ] + }, + { + "cell_type": "markdown", + "id": "9fef8177", + "metadata": {}, + "source": [ + "# Cost Evaluation " + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "id": "5fecde25", + "metadata": {}, + "outputs": [], + "source": [ + "from sktime.performance_metrics.forecasting import mean_squared_scaled_error\n", + "def get_loss_per_key(key: int, path, oracle_filename):\n", + "\n", + " oracle_residual = pd.read_csv(oracle_filename)[\n", + " \"pred_residual\"\n", + " ]\n", + "\n", + " df = pd.read_csv(path)\n", + " residual = df[\"pred_residual\"]\n", + " mask = ~np.isnan(residual)\n", + " loss = mean_squared_scaled_error(\n", + " y_true=oracle_residual[mask], y_pred=residual[mask], y_train=df[\"value\"]\n", + " )\n", + " loss = {\n", + " \"loss\": loss,\n", + " \"n_fits\": df[\"model_version\"].dropna().nunique(),\n", + " }\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "21099224", + "metadata": {}, + "outputs": [], + "source": [ + "replica = 1\n", + "baseline_results = {}\n", + "for key in range(1, 101, 1):\n", + " losses = get_loss_per_key(key, f\"{artifact_dir}/plan_eval\")\n", + " baseline_results[key] = losses" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "d82f9ca7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8400 78.42078584418643\n" + ] + } + ], + "source": [ + "slide_size = 12\n", + "baseline_total_cost = 0\n", + "baseline_total_loss = 0\n", + "for key in baseline_results.keys(): \n", + " for loss in baseline_results[key]:\n", + " if loss['slide_size'] == slide_size:\n", + " baseline_total_cost += loss['n_fits']\n", + " baseline_total_loss += loss['loss']\n", + "print(baseline_total_cost, baseline_total_loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + 
"id": "da85dcf1", + "metadata": {}, + "outputs": [], + "source": [ + "lp_results = {}\n", + "\n", + "for key in range(1, 101, 1):\n", + " oracle_filename = f\"{artifact_dir}/plan_eval/oracle_key_A4Benchmark-TS{key}.csv\"\n", + " filename = f\"{artifact_dir}/lp_plan_eval/{plan}/{key}.csv\"\n", + " lp_results[key] = get_loss_per_key(key, filename, oracle_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "303fcfcf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1100 95.26955983050661\n" + ] + } + ], + "source": [ + "lp_total_cost = 0\n", + "lp_total_loss = 0\n", + "for key in lp_results.keys(): \n", + " lp_total_cost += lp_results[key]['n_fits']\n", + " lp_total_loss += lp_results[key]['loss']\n", + "print(lp_total_cost, lp_total_loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 260, + "id": "656758ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "max_fits_1100\n" + ] + }, + { + "ename": "TypeError", + "evalue": "string indices must be integers", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m 
\u001b[0;32mif\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'slide_size'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mslide_size\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mbaseline_total_cost\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'n_fits'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mbaseline_total_loss\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'loss'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: string indices must be integers" + ] + } + ], + "source": [ + "experiments = [(\"max_fits_1100\", 96), (\"max_fits_2100\", 48), (\"max_fits_4200\", 24), (\"max_fits_8400\", 12)]\n", + "\n", + "graph_results = {\"baseline\": [], \"optimized\": [], \"cost\": []}\n", + "\n", + "replica = 1\n", + "for plan, slide_size in experiments:\n", + " print(plan)\n", + " \n", + " baseline_total_cost = 0\n", + " baseline_total_loss = 0\n", + " for key in baseline_results.keys(): \n", + " for loss in baseline_results[key]:\n", + " if loss['slide_size'] == slide_size:\n", + " baseline_total_cost += loss['n_fits']\n", + " baseline_total_loss += loss['loss']\n", + " print(baseline_total_cost, baseline_total_loss)\n", + " \n", + " for key in range(1, 101, 1):\n", + " oracle_filename = f\"{artifact_dir}/plan_eval/oracle_key_A4Benchmark-TS{key}.csv\"\n", + " filename = f\"{artifact_dir}/lp_plan_eval/{plan}/{key}.csv\"\n", + " lp_results[key] = get_loss_per_key(key, filename, oracle_filename)\n", + " \n", + " lp_total_cost = 0\n", + " lp_total_loss = 0\n", + " for key in lp_results.keys(): \n", + " lp_total_cost += lp_results[key]['n_fits']\n", + " lp_total_loss += lp_results[key]['loss']\n", + " print(lp_total_cost, lp_total_loss)\n", 
+ " \n", + " assert lp_total_cost <= baseline_total_cost\n", + " \n", + " graph_results[\"baseline\"].append(baseline_total_loss)\n", + " graph_results[\"optimized\"].append(lp_total_loss)\n", + " graph_results[\"cost\"].append(baseline_total_cost)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "959d7dc6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 0, 'Cost Budget'),\n", + " Text(0, 0.5, 'MASE Loss'),\n", + " Text(0.5, 1.0, 'Residual Estimate Loss for Time-Series Decomposition')]" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Network error resolved after 0:00:38.544998, resuming normal operation.\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "\n", + "x = 'Factor'\n", + "\n", + "df = pd.DataFrame({\n", + " x: graph_results[\"cost\"], \n", + " 'baseline': graph_results[\"baseline\"], \n", + " \"optimized\": graph_results[\"optimized\"],\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars=x).rename(columns=str.title)\n", + "seaborn.barplot(x=x, y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "seaborn.despine(fig)\n", + "\n", + "ax1.set(xlabel=\"Cost Budget\", ylabel=f'MASE Loss', title='Residual Estimate Loss for Time-Series Decomposition')\n", + "#ax1.legend_.remove()\n", + "#plt.legend(loc='lower center')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb5185be", + "metadata": {}, + "outputs": [], + "source": [ + "baseline_results[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "526e9b31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'baseline': [113.47347746568275,\n", + " 89.57722060605525,\n", + " 81.12720697469311,\n", + " 78.42078584418643],\n", + " 'optimized': [95.26955983050661,\n", + " 81.53832641325205,\n", + " 76.2934822322845,\n", + " 74.68576266515468],\n", + " 'cost': [1100, 2099, 4197, 8375]}" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{**graph_results}" + ] + }, + { + "cell_type": "markdown", + "id": "a1e429ec", + "metadata": {}, + "source": [ + "# Plot different numbers of replicas" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "id": "38d0548e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"plan_baseline_1_lifo 172.98378216386888\n", + "plan_baseline_1_lifo 112.67935494700063\n", + "plan_baseline_6_lifo 180.18115087379635\n", + "plan_baseline_6_lifo 115.77955410145175\n", + "plan_baseline_12_lifo 184.2670899767375\n", + "plan_baseline_12_lifo 120.07497015444703\n", + "plan_baseline_18_lifo 190.24309735984917\n", + "plan_baseline_18_lifo 122.9708625384093\n", + "plan_baseline_24_lifo 196.18648185699215\n", + "plan_baseline_24_lifo 128.18398403462876\n", + "plan_baseline_48_lifo 225.58043199493562\n", + "plan_baseline_48_lifo 144.9251630795228\n", + "plan_baseline_96_lifo 320.8503011206009\n", + "plan_baseline_96_lifo 189.32209967308512\n", + "plan_baseline_168_lifo 431.41793348258363\n", + "plan_baseline_168_lifo 293.96569319942313\n", + "plan_baseline_192_lifo 544.0725854282231\n", + "plan_baseline_192_lifo 339.90741345037276\n", + "plan_baseline_336_lifo 949.2024557323098\n", + "plan_baseline_336_lifo 705.3804094682863\n", + "plan_baseline_672_lifo 1917.3666698011584\n", + "plan_baseline_672_lifo 1555.133039236265\n" + ] + }, + { + "data": { + "text/plain": [ + "{1: [172.98378216386888, 112.67935494700063],\n", + " 6: [180.18115087379635, 115.77955410145175],\n", + " 12: [184.2670899767375, 120.07497015444703],\n", + " 18: [190.24309735984917, 122.9708625384093],\n", + " 24: [196.18648185699215, 128.18398403462876],\n", + " 48: [225.58043199493562, 144.9251630795228],\n", + " 96: [320.8503011206009, 189.32209967308512],\n", + " 168: [431.41793348258363, 293.96569319942313],\n", + " 192: [544.0725854282231, 339.90741345037276],\n", + " 336: [949.2024557323098, 705.3804094682863],\n", + " 672: [1917.3666698011584, 1555.133039236265]}" + ] + }, + "execution_count": 269, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiments = [(\"max_fits_1100\", 96), (\"max_fits_2100\", 48), (\"max_fits_4200\", 24), (\"max_fits_8400\", 12)]\n", + "replicas = [1, 2]\n", + "slides = [1, 6, 12, 18, 24, 48, 96, 168, 192, 336, 672]\n", + 
"graph_results = {\"baseline\": [], \"optimized\": [], \"cost\": []}\n", + "prio = \"lifo\"\n", + "replica_results = {}\n", + "\n", + "for slide in slides: \n", + " replica_results[slide] = []\n", + " for replica in replicas: \n", + " baseline_plan = f\"plan_baseline_{slide}_{prio}\"\n", + " \n", + " total_loss = 0\n", + " for key in range(1, 101, 1):\n", + " oracle_filename = f\"{oracle_dir}/{key}.csv\"\n", + " \n", + " lp_filename = f\"{results_dir}/replica_{replica}/{baseline_plan}/{key}.csv\"\n", + " \n", + " baseline_filename = f\"{results_dir}/replica_{replica}/{baseline_plan}/{key}.csv\"\n", + " results = get_loss_per_key(key, baseline_filename, oracle_filename)\n", + " #print(results)\n", + " total_loss += results[\"loss\"]\n", + " \n", + " replica_results[slide].append(total_loss)\n", + " print(baseline_plan, total_loss)\n", + " \n", + "replica_results" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "id": "2b9c7c38", + "metadata": {}, + "outputs": [], + "source": [ + "del replica_results[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 270, + "id": "8678b39b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 0, 'Num Replicas'),\n", + " Text(0, 0.5, 'MASE Loss'),\n", + " Text(0.5, 1.0, 'Residual Estimate Loss for Time-Series Decomposition')]" + ] + }, + "execution_count": 270, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "\n", + "x = 'Factor'\n", + "\n", + "df = pd.DataFrame({\n", + " x: replicas, \n", + " **replica_results,\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars=x).rename(columns=str.title)\n", + "seaborn.barplot(x=x, y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "seaborn.despine(fig)\n", + "\n", + "ax1.set(xlabel=\"Num Replicas\", ylabel=f'MASE Loss', title='Residual Estimate Loss for Time-Series Decomposition')" + ] + }, + { + "cell_type": "code", + "execution_count": 272, + "id": "dd593565", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "max_fits_1100 1893.9663657607266\n", + "max_fits_1100 133.45026146609104\n", + "max_fits_1100 117.59550187337948\n", + "max_fits_2100 2875.0872626231935\n", + "max_fits_2100 1447.6250864288265\n", + "max_fits_2100 99.89538248542472\n", + "max_fits_4200 3430.718917641645\n", + "max_fits_4200 2591.967671743391\n", + "max_fits_4200 95.01342291496671\n", + "max_fits_8400 3586.4346343021443\n", + "max_fits_8400 3006.052341175111\n", + "max_fits_8400 93.57666588051953\n" + ] + }, + { + "data": { + "text/plain": [ + "{'max_fits_1100': [1893.9663657607266, 133.45026146609104, 117.59550187337948],\n", + " 'max_fits_2100': [2875.0872626231935, 1447.6250864288265, 99.89538248542472],\n", + " 'max_fits_4200': [3430.718917641645, 2591.967671743391, 95.01342291496671],\n", + " 'max_fits_8400': [3586.4346343021443, 3006.052341175111, 93.57666588051953]}" + ] + }, + "execution_count": 272, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiments = [(\"max_fits_1100\", 96), (\"max_fits_2100\", 48), (\"max_fits_4200\", 24), (\"max_fits_8400\", 12)]\n", + "replicas = [1, 2, 4]\n", + "slides = [1, 6, 12, 18, 24, 48, 96, 168, 192, 336, 672]\n", + "graph_results = {\"baseline\": [], 
\"optimized\": [], \"cost\": []}\n", + "\n", + "replica_results = {}\n", + "\n", + "\n", + " \n", + "for plan, slide in experiments: \n", + " replica_results[plan] = []\n", + " \n", + " for replica in replicas:\n", + " total_loss = 0\n", + " for key in range(1, 101, 1):\n", + " oracle_filename = f\"{oracle_dir}/{key}.csv\"\n", + " lp_filename = f\"{results_dir}/replica_{replica}/{plan}/{key}.csv\"\n", + " results = get_loss_per_key(key, lp_filename, oracle_filename)\n", + " total_loss += results[\"loss\"]\n", + " \n", + " replica_results[plan].append(total_loss)\n", + " print(plan, total_loss)\n", + " \n", + "replica_results" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "id": "fa644e00", + "metadata": {}, + "outputs": [], + "source": [ + "static_results = {1: [213.2200178532724, 150.7256273025718, 133.3349003094353],\n", + " 6: [218.75801939773078, 156.0948132080653, 135.81574629721845],\n", + " 12: [220.90611494937247, 159.05690147515068, 138.24448802117672],\n", + " 18: [229.63341620779627, 163.1813146667572, 140.3471142793597],\n", + " 24: [233.91725185369373, 164.71255155633278, 144.22561887879107],\n", + " 48: [268.3977607679711, 184.14616983478135, 158.3310894771346],\n", + " 96: [348.50466291276604, 229.46322062727472, 189.37872271737845],\n", + " 168: [474.50432909190295, 319.6199513026192, 281.5650612571233],\n", + " 192: [609.2301332698065, 399.7143084337964, 333.28670707646245],\n", + " 336: [908.5841349053487, 728.1723528503993, 650.7431132687982],\n", + " 672: [1848.5207568587812, 1612.2188363608043, 1489.8733845259228]}" + ] + }, + { + "cell_type": "code", + "execution_count": 246, + "id": "2a2bdeb1", + "metadata": {}, + "outputs": [], + "source": [ + "policy_results = {'max_fits_1100': [1893.9663657607266, 133.45026146609104, 117.59550187337948],\n", + " 'max_fits_2100': [2875.0872626231935, 1447.6250864288265, 99.89538248542472],\n", + " 'max_fits_4200': [3430.718917641645, 2591.967671743391, 95.01342291496671],\n", + " 
'max_fits_8400': [3586.4346343021443, 3006.052341175111, 93.57666588051953]}" + ] + }, + { + "cell_type": "code", + "execution_count": 273, + "id": "239a5274", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'baseline': [213.2200178532724, 150.7256273025718],\n", + " 'policy': [1893.9663657607266, 133.45026146609104]}" + ] + }, + "execution_count": 273, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "replicas = [1, 2]\n", + "results = {\"baseline\": [], \"policy\": []}\n", + "for i in range(len(replicas)): \n", + " \n", + " best_baseline = None\n", + " for key in static_results.keys(): \n", + " if best_baseline is None or static_results[key][i] <= best_baseline: \n", + " best_baseline = static_results[key][i]\n", + " results[\"baseline\"].append(best_baseline)\n", + " \n", + " best_baseline = None\n", + " for key in policy_results.keys(): \n", + " if best_baseline is None or policy_results[key][i] <= best_baseline: \n", + " best_baseline = policy_results[key][i]\n", + " results[\"policy\"].append(best_baseline)\n", + " \n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 274, + "id": "ee33ec46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 0, 'Num Replicas'),\n", + " Text(0, 0.5, 'MASE Loss'),\n", + " Text(0.5, 1.0, 'Residual Estimate Loss for Time-Series Decomposition')]" + ] + }, + "execution_count": 274, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "\n", + "x = 'Factor'\n", + "\n", + "df = pd.DataFrame({\n", + " x: replicas, \n", + " **results,\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars=x).rename(columns=str.title)\n", + "seaborn.barplot(x=x, y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "seaborn.despine(fig)\n", + "\n", + "ax1.set(xlabel=\"Num Replicas\", ylabel=f'MASE Loss', title='Residual Estimate Loss for Time-Series Decomposition')" + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "id": "9a4dba29", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "min() arg is an empty sequence", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0max1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msubplots\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mtidy\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmelt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_vars\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mseaborn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbarplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Value'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'Variable'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtidy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0max\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0max1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdespine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 45\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mbarplot\u001b[0;34m(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, seed, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge, ax, **kwargs)\u001b[0m\n\u001b[1;32m 3177\u001b[0m ):\n\u001b[1;32m 3178\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3179\u001b[0;31m plotter = _BarPlotter(x, y, hue, data, order, hue_order,\n\u001b[0m\u001b[1;32m 3180\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3181\u001b[0m \u001b[0morient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, seed, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge)\u001b[0m\n\u001b[1;32m 1584\u001b[0m self.establish_variables(x, y, hue, 
data, orient,\n\u001b[1;32m 1585\u001b[0m order, hue_order, units)\n\u001b[0;32m-> 1586\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestablish_colors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1587\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimate_statistic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mestablish_colors\u001b[0;34m(self, color, palette, saturation)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;31m# Determine the gray color to use for the lines framing the plot\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0mlight_vals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mcolorsys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrgb_to_hls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrgb_colors\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m \u001b[0mlum\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlight_vals\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m.6\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
320\u001b[0m \u001b[0mgray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmpl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrgb2hex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlum\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlum\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlum\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: min() arg is an empty sequence" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl8AAAE1CAYAAADZOIW8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAU7ElEQVR4nO3dUWyV9f3H8Q8tgkZZHESwjBkiU2xkeoHJdiGLQ7RsFnGbSlI1c8SazMUlLjHqpkCnydYlu1DGskgy1NUL1yzDUAkS4wVjUdwaE2CdmCgGnRWkhLiB0no4/4v9JTLUHmr51YOvV2LSNr/Wb/MN8e1zHs4zrlqtVgMAQBENYz0AAMDnifgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgoaNr87OzsyfPz+zZ8/Oyy+//JFnKpVKOjo6smDBglxxxRXp7u4e9UEBAE4Gw8bX5Zdfnscffzxf+tKXPvbMunXrsmvXrmzcuDFPPPFEVq5cmTfeeGNUBwUAOBkMG1+XXHJJmpqaPvHM+vXrc91116WhoSGTJ0/OggULsmHDhlEbEgDgZDF+NH5If39/pk+ffuTzpqamvPXWWzV//+HDh3PgwIGccsopGTdu3GiMBABwQlSr1QwNDeX0009PQ8Px3z4/KvH1aR04cOBj7ycDAPgsOv/88zNp0qTj/r5Ria+mpqa8+eabueiii5IceyVsOKecckqS//4SEyZMGI2RKGz79u2ZM2fOWI/BCNhdfbO/+mV39WtwcDAvv/zykX45XqMSXwsXLkx3d3euvPLK7N+/P88880wef/zxmr//g5caJ0yYkIkTJ47GSIwBu6tfdlff7K9+2V19G+mtUsO+UPnAAw/kG9/4Rt5666384Ac/yFVXXZUkaW9vz7Zt25IkixcvzowZM3LllVfm+uuvz49+9KN8+ctfHtFAAAAns2GvfN1777259957j/n66tWrj3zc2NiYjo6O0Z0MAOAk5B3uAQAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQ0PhaDu3cuTN333139u/fnzPPPDOdnZ2ZOXPmUWcGBgZyzz33pL+/P0NDQ/n617+ee++9N+PH1/SvAAD4XKjpytfy5cvT1taWp59+Om1tbVm2bNkxZ373u99l1qxZWbd
uXdatW5d//OMf2bhx46gPDABQz4aNr4GBgfT19aW1tTVJ0tramr6+vuzbt++oc+PGjcuBAwdy+PDhDA4OZmhoKNOmTTsxUwMA1KlhXxPs7+/PtGnT0tjYmCRpbGzM1KlT09/fn8mTJx85d9ttt+X222/PpZdemnfffTc33HBD5s6de1zDbN++/TjH57Okt7d3rEdghOyuvtlf/bK7z6dRuyFrw4YNmT17dh599NEcOHAg7e3t2bBhQxYuXFjzz5gzZ04mTpw4WiNRUG9v73HHNp8Ndlff7K9+2V39OnTo0Ke6YDTsy45NTU3ZvXt3KpVKkqRSqWTPnj1pamo66lxXV1euvvrqNDQ0ZNKkSZk/f362bNky4sEAAE5Gw8bXlClT0tzcnJ6eniRJT09Pmpubj3rJMUlmzJiRTZs2JUkGBwfz3HPP5bzzzjsBIwMA1K+a/rbjihUr0tXVlZaWlnR1daWjoyNJ0t7enm3btiVJfvrTn6a3tzeLFi3KNddck5kzZ+b6668/cZMDANShmu75mjVrVrq7u4/5+urVq498fM4552TNmjWjNxkAwEnIO9wDABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQTXF186dO7NkyZK0tLRkyZIlee211z7y3Pr167No0aK0trZm0aJF2bt372jOCgBQ98bXcmj58uVpa2vL4sWL8+STT2bZsmV57LHHjjqzbdu2/OY3v8mjjz6as846K//+978zYcKEEzI0AEC9GvbK18DAQPr6+tLa2pokaW1tTV9fX/bt23fUuUceeSRLly7NWWedlSSZNGlSJk6ceAJGBgCoX8Ne+erv78+0adPS2NiYJGlsbMzUqVPT39+fyZMnHzn3yiuvZMaMGbnhhhty8ODBXHHFFfnhD3+YcePG1TzM9u3bR/Ar8FnR29s71iMwQnZX3+yvftnd51NNLzvWolKpZMeOHVmzZk0GBwdzyy23ZPr06bnmmmtq/hlz5sxxtaxO9fb2Zu7cuWM9BiNgd/XN/uqX3dWvQ4cOfaoLRsO+7NjU1JTdu3enUqkk+W9k7dmzJ01NTUedmz59ehYuXJgJEybkjDPOyOWXX56tW7eOeDAAgJPRsPE1ZcqUNDc3p6enJ0nS09OT5ubmo15yTP57L9jmzZtTrVYzNDSU559/PhdccMGJmRoAoE7V9FYTK1asSFdXV1paWtLV1ZWOjo4kSXt7e7Zt25YkueqqqzJlypR8+9vfzjXXXJOvfOUrufbaa0/c5AAAdaime75mzZqV7u7uY76+evXqIx83NDTknnvuyT333DN60wEAnGS8wz0AQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICCxBcAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICCxBcAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICCxBcAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICCxBcAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICCxBcAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUVFN87dy5M0uWLElLS0uWLFmS11577WPPvvrqq7n44ovT2dk5WjMCAJw0aoqv5cuXp62tLU8//XTa2tqybNmyjzxXqVSyfPnyLFiwYFSHBAA4WQwbXwMDA+nr60tra2uSpLW1NX19fdm
3b98xZx9++OFcdtllmTlz5qgPCgBwMhg2vvr7+zNt2rQ0NjYmSRobGzN16tT09/cfde6ll17K5s2bc/PNN5+QQQEATgbjR+OHDA0N5b777ssvfvGLI5E2Etu3bx+NcRgjvb29Yz0CI2R39c3+6pfdfT4NG19NTU3ZvXt3KpVKGhsbU6lUsmfPnjQ1NR058/bbb2fXrl259dZbkyTvvPNOqtVq/vOf/+T++++veZg5c+Zk4sSJI/g1GGu9vb2ZO3fuWI/BCNhdfbO/+mV39evQoUOf6oLRsPE1ZcqUNDc3p6enJ4sXL05PT0+am5szefLkI2emT5+eLVu2HPl85cqVOXjwYO66664RDwYAcDKq6W87rlixIl1dXWlpaUlXV1c6OjqSJO3t7dm2bdsJHRAA4GRS0z1fs2bNSnd39zFfX7169Ueev/322z/dVAAAJynvcA8AUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILEFwBAQeILAKAg8QUAUJD4AgAoSHwBABQkvgAAChJfAAAFiS8AgILG13Jo586dufvuu7N///6ceeaZ6ezszMyZM486s2rVqqxfvz6NjY0ZP3587rjjjsybN+9EzAwAULdqiq/ly5enra0tixcvzpNPPplly5blscceO+rMRRddlKVLl+a0007LSy+9lBtvvDGbN2/OqaeeekIGBwCoR8O+7DgwMJC+vr60trYmSVpbW9PX15d9+/YddW7evHk57bTTkiSzZ89OtVrN/v37R39iAIA6NuyVr/7+/kybNi2NjY1JksbGxkydOjX9/f2ZPHnyR37P2rVrc8455+Tss88+rmG2b99+XOf5bOnt7R3rERghu6tv9le/7O7zqaaXHY/HCy+8kAcffDC///3vj/t758yZk4kTJ472SBTQ29ubuXPnjvUYjIDd1Tf7q192V78OHTr0qS4YDfuyY1NTU3bv3p1KpZIkqVQq2bNnT5qamo45++KLL+bOO+/MqlWrcu655454KACAk9Ww8TVlypQ0Nzenp6cnSdLT05Pm5uZjXnLcunVr7rjjjjz00EO58MILT8y0AAB1rqb3+VqxYkW6urrS0tKSrq6udHR0JEna29uzbdu2JElHR0fee++9LFu2LIsXL87ixYuzY8eOEzc5AEAdqumer1mzZqW7u/uYr69evfrIx3/6059GbyoAgJOUd7gHAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgsQXAEBB4gsAoCDxBQBQkPgCAChIfAEAFCS+AAAKEl8AAAWJLwCAgmqKr507d2bJkiVpaWnJkiVL8tprrx1zplKppKOjIwsWLMgVV1yR7u7u0Z4VAKDu1RRfy5cvT1tbW55++um0tbVl2bJlx5xZt25ddu3alY0bN+aJJ57IypUr88Ybb4z6wAAA9Wz8cAcGBgbS19eXNWvWJElaW1tz//33Z9++fZk8efKRc+vXr891112XhoaGTJ48OQsWLMiGDRtyyy23DDtEtVpNkgwODo709+Az4NChQ2M9AiNkd/XN/uqX3dWnD3rlg345XsPGV39/f6ZNm5bGxsYkSWNjY6ZOnZr+/v6j4qu/vz/
Tp08/8nlTU1PeeuutmoYYGhpKkrz88svHNTyfLdu3bx/rERghu6tv9le/7K6+DQ0N5dRTTz3u7xs2vko4/fTTc/755+eUU07JuHHjxnocAICPVa1WMzQ0lNNPP31E3z9sfDU1NWX37t2pVCppbGxMpVLJnj170tTUdMy5N998MxdddFGSY6+EfZKGhoZMmjRpBOMDAJQ3kiteHxj2hvspU6akubk5PT09SZKenp40Nzcf9ZJjkixcuDDd3d05fPhw9u3bl2eeeSYtLS0jHgwA4GQ0rlrD3WKvvPJK7r777rzzzjv5whe+kM7Ozpx77rlpb2/Pj3/843z1q19NpVLJz3/+8/z1r39NkrS3t2fJkiUn/BcAAKgnNcUXAACjwzvcAwAUJL4AAAoSXwAABYkvAICCisaXB3TXt1r2t2rVqlx11VW5+uqr893vfjd/+ctfyg/KMWrZ3QdeffXVXHzxxens7Cw3IJ+o1v2tX78+ixYtSmtraxYtWpS9e/eWHZSPVMv+BgYGcuutt2bRokVZuHBhVqxYkffff7/8sByls7Mz8+fPz+zZsz/2KTwj6pZqQTfddFN17dq11Wq1Wl27dm31pptuOubMn//85+rSpUurlUqlOjAwUJ03b1719ddfLzkmH6OW/W3atKl68ODBarVarf7zn/+szp07t/ruu+8WnZNj1bK7arVaff/996s33nhj9Sc/+Un1l7/8ZckR+QS17G/r1q3Vb33rW9U9e/ZUq9Vq9Z133qm+9957Refko9WyvwceeODIn7nBwcHqtddeW33qqaeKzsmx/va3v1XffPPN6je/+c3qjh07PvLMSLql2JWvDx7Q3dramuS/D+ju6+vLvn37jjr3cQ/oZmzVur958+bltNNOS5LMnj071Wo1+/fvLz0uH1Lr7pLk4YcfzmWXXZaZM2cWnpKPU+v+HnnkkSxdujRnnXVWkmTSpEmZOHFi8Xk5Wq37GzduXA4cOJDDhw9ncHAwQ0NDmTZt2liMzIdccsklxzzR53+NpFuKxdcnPaD7f8+N9AHdnDi17u/D1q5dm3POOSdnn312qTH5CLXu7qWXXsrmzZtz8803j8GUfJxa9/fKK6/k9ddfzw033JDvfOc7+e1vf5uqt3Ecc7Xu77bbbsvOnTtz6aWXHvln7ty5YzEyx2kk3eKGe06IF154IQ8++GB+/etfj/Uo1GBoaCj33XdfOjo6jvxHgvpSqVSyY8eOrFmzJn/4wx+yadOmPPnkk2M9FjXasGFDZs+enc2bN2fTpk35+9//7lWfk1ix+PrwA7qTDPuA7g/09/e7cvIZUOv+kuTFF1/MnXfemVWrVuXcc88tPSr/o5bdvf3229m1a1duvfXWzJ8/P48++mj++Mc/5r777hursfl/tf7Zmz59ehYuXJgJEybkjDPOyOWXX56tW7eOxch8SK376+rqytVXX52GhoZMmjQp8+fPz5YtW8ZiZI7TSLqlWHx5QHd9q3V/W7duzR133JGHHnooF1544ViMyv+oZXfTp0/Pli1b8uyzz+bZZ5/N97///Vx//fW5//77x2ps/l+tf/ZaW1uzefPmVKvVDA0N5fnnn88FF1wwFiPzIbXub8aMGdm0aVOSZHBwMM8991zOO++84vNy/EbSLUWf7egB3fWtlv1973vfy7/+9a+jbhT91a9+ldmzZ4/h5NSyuw9buXJlDh48mLvuumuMJubDatnf4cOH09nZmU2bNqWhoSGXXnpp7rrrrjQ0uLtkrNWyv127dmX58uXZu3dvKpVKvva1r+VnP/tZxo8fP9bjf6498MAD2bhxY/bu3ZsvfvGLOfPMM/PUU0996m7xYG0AgIL8LxEAQEHiCwCgIPEFAFCQ+AIAKEh8AQAUJL4AAAoSXwAABYkvAICC/g95qFmz3s7qbQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "\n", + "x = 'Factor'\n", + "\n", + "df = pd.DataFrame({\n", + " x: graph_results[\"cost\"], \n", + " 'baseline': graph_results[\"baseline\"], \n", + " \"optimized\": graph_results[\"optimized\"],\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars=x).rename(columns=str.title)\n", + "seaborn.barplot(x=x, y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "seaborn.despine(fig)\n", + "\n", + "ax1.set(xlabel=\"Cost Budget\", ylabel=f'MASE Loss', title='Residual Estimate Loss for Time-Series Decomposition')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9bccc31", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/stl/offline/run_6_simulate_baseline.sh b/stl/offline/run_6_simulate_baseline.sh new file mode 100644 index 0000000..f70fabf --- /dev/null +++ b/stl/offline/run_6_simulate_baseline.sh @@ -0,0 +1,37 @@ +set -xe +PARAM_DIR="/data/wooders/stl/results" +PLAN_DIR="/data/wooders/stl/results" +TRAIN_PATH="./yahoo_train_data" +EVAL_PATH="./yahoo_eval_data" + +for key_policy in "lifo" "fifo" +do +for replicas in 1 2 4 8 +do + for slide in 672 1 6 12 18 24 48 96 168 192 336 + do + plan="plan_baseline_${slide}_${key_policy}" + param="plan_baseline_${slide}" + mkdir -p ${PLAN_DIR}/replica_${replicas} + python simulation.py \ + --model_runtime_s 1.5 \ + --total_runtime_s 2000 \ + --per_key_records_per_second 1 \ + --window_size 672 \ + 
--slide_size ${slide} \ + --per_key_slide_size_plan ${PARAM_DIR}/${param}.json \ + --output_path ${PLAN_DIR}/replica_${replicas}/${plan}.json \ + --source_data_path $TRAIN_PATH \ + --num_mapper_replicas ${replicas} \ + --key_prio_policy ${key_policy} + + mkdir -p ${PLAN_DIR}/replica_${replicas}/${plan} + python evaluation.py --offline-yahoo-csv-path $EVAL_PATH \ + --offline-plan-path ${PLAN_DIR}/replica_${replicas}/${plan}.json \ + --output-path ${PLAN_DIR}/replica_${replicas}/${plan} \ + --param-path ${PARAM_DIR}/${param}.json \ + --run-policy + + done +done +done diff --git a/wikipedia/config.yml b/wikipedia/config.yml index ad09cc0..2ae74c2 100644 --- a/wikipedia/config.yml +++ b/wikipedia/config.yml @@ -18,6 +18,8 @@ titles_file = %(data_dir)s/top_titles.csv revisions_file = %(data_dir)s/title_revisions_timestamps.json edits_file = %(data_dir)s/edits.csv questions_file = %(data_dir)s/questions.csv +train_questions_file = %(data_dir)s/train_questions.csv +test_questions_file = %(data_dir)s/test_questions.csv raw_pageview_file = %(data_dir)s/top_title_views.csv pageview_file = %(data_dir)s/pageviews.csv timestamp_weights_file = %(data_dir)s/timestamp_weights_file.json diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 9213239..e417120 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -779,7 +779,7 @@ { "cell_type": "code", "execution_count": 183, - "id": "89abf373", + "id": "511f1c65", "metadata": {}, "outputs": [ { @@ -1347,7 +1347,7 @@ { "cell_type": "code", "execution_count": 191, - "id": "76fa4e4f", + "id": "d699a5a5", "metadata": {}, "outputs": [], "source": [ @@ -1357,7 +1357,7 @@ { "cell_type": "code", "execution_count": 192, - "id": "fe3f87bc", + "id": "c5b6cbeb", "metadata": {}, "outputs": [], "source": [ @@ -1368,7 +1368,7 @@ { "cell_type": "code", "execution_count": 193, - "id": "63a1c934", + "id": "d0c039a3", "metadata": {}, "outputs": [ { @@ 
-1584,7 +1584,7 @@ { "cell_type": "code", "execution_count": 195, - "id": "d3343738", + "id": "954b1ea8", "metadata": {}, "outputs": [ { @@ -1831,39 +1831,39 @@ }, { "cell_type": "code", - "execution_count": 197, + "execution_count": 215, "id": "d9dab1e5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "What is the name of the company?? 4339\n", - "what is the name of the company?????? 4018\n", - "what is the u.s. open???????? 1201\n", - "What is the best tennis tournament in the world?? 1090\n", - "what is the std?????? 1035\n", - " ... \n", - "What are the main factors that affect the success of the high speed railway system?? 1\n", - "What did the government do?? 1\n", - "What is the purpose of the evacuation?? 1\n", - "what is a christian name?? 1\n", - "How many corridors are under construction?? 1\n", - "Name: question, Length: 3219, dtype: int64" + "1305297 8677\n", + "332667 4300\n", + "66304621 3720\n", + "17888363 3569\n", + "3259011 1581\n", + " ... \n", + "67959451 15\n", + "49474213 12\n", + "66135952 12\n", + "66074428 8\n", + "40713040 2\n", + "Name: doc_id, Length: 118, dtype: int64" ] }, - "execution_count": 197, + "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "train_df.question.value_counts()" + "test_df.doc_id.value_counts()" ] }, { "cell_type": "code", - "execution_count": 198, + "execution_count": 216, "id": "ea220f3d", "metadata": {}, "outputs": [ @@ -1884,7 +1884,7 @@ "Name: doc_id, Length: 118, dtype: int64" ] }, - "execution_count": 198, + "execution_count": 216, "metadata": {}, "output_type": "execute_result" } @@ -1895,7 +1895,26 @@ }, { "cell_type": "code", - "execution_count": 199, + "execution_count": 204, + "id": "bfdea858", + "metadata": {}, + "outputs": [], + "source": [ + "test_df.to_csv(\"/data/wooders/wikipedia/test_questions.csv\")\n", + "train_df.to_csv(\"/data/wooders/wikipedia/train_questions.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"b01259cc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 205, "id": "86c3ebf2", "metadata": {}, "outputs": [], @@ -1905,15 +1924,169 @@ }, { "cell_type": "code", - "execution_count": 200, + "execution_count": 206, + "id": "f0941e02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1305297: 8677,\n", + " 332667: 4300,\n", + " 66304621: 3720,\n", + " 17888363: 3569,\n", + " 3259011: 1581,\n", + " 58112801: 1253,\n", + " 68507348: 1201,\n", + " 62808792: 1099,\n", + " 66052432: 987,\n", + " 62372638: 879,\n", + " 67089631: 860,\n", + " 67569553: 822,\n", + " 60476189: 822,\n", + " 68553225: 805,\n", + " 1425939: 774,\n", + " 66187257: 687,\n", + " 65521767: 585,\n", + " 67946554: 551,\n", + " 68498551: 453,\n", + " 442785: 446,\n", + " 734845: 401,\n", + " 487602: 381,\n", + " 66883576: 379,\n", + " 68294454: 369,\n", + " 61250187: 366,\n", + " 50170924: 360,\n", + " 61236755: 347,\n", + " 12936708: 309,\n", + " 1027173: 308,\n", + " 46754025: 276,\n", + " 58542318: 258,\n", + " 61258486: 242,\n", + " 66753136: 240,\n", + " 5575754: 240,\n", + " 65871303: 236,\n", + " 12202928: 228,\n", + " 60600284: 224,\n", + " 51150040: 222,\n", + " 34075129: 220,\n", + " 58385279: 217,\n", + " 61049392: 215,\n", + " 60203476: 212,\n", + " 66341639: 212,\n", + " 63129286: 210,\n", + " 26833: 210,\n", + " 24689651: 208,\n", + " 66629866: 207,\n", + " 33385984: 204,\n", + " 58113491: 200,\n", + " 63170193: 200,\n", + " 63395714: 193,\n", + " 55055575: 191,\n", + " 67918135: 190,\n", + " 68284887: 188,\n", + " 65984422: 187,\n", + " 66040815: 181,\n", + " 57798785: 168,\n", + " 67131229: 167,\n", + " 53943680: 166,\n", + " 20304678: 164,\n", + " 64783122: 160,\n", + " 51345275: 155,\n", + " 39734558: 150,\n", + " 65666080: 148,\n", + " 31243078: 147,\n", + " 36567599: 145,\n", + " 67742925: 144,\n", + " 67674654: 142,\n", + " 6063379: 140,\n", + " 66293350: 139,\n", + " 912080: 135,\n", + " 2514174: 
134,\n", + " 67843993: 130,\n", + " 26000816: 127,\n", + " 67037597: 120,\n", + " 67903070: 117,\n", + " 65760352: 114,\n", + " 67711917: 112,\n", + " 404323: 111,\n", + " 68107833: 110,\n", + " 57817558: 108,\n", + " 49632909: 105,\n", + " 65770543: 103,\n", + " 67928132: 100,\n", + " 68207325: 100,\n", + " 25743896: 100,\n", + " 68475822: 100,\n", + " 21537193: 98,\n", + " 67334964: 97,\n", + " 68187748: 96,\n", + " 66461741: 93,\n", + " 56185392: 90,\n", + " 68315181: 80,\n", + " 61293820: 75,\n", + " 2656208: 71,\n", + " 60070859: 70,\n", + " 68076456: 70,\n", + " 65708437: 64,\n", + " 68420852: 60,\n", + " 61243245: 60,\n", + " 68463873: 40,\n", + " 60043578: 40,\n", + " 49588: 39,\n", + " 68229696: 36,\n", + " 737: 36,\n", + " 18097883: 34,\n", + " 66459202: 33,\n", + " 66128424: 30,\n", + " 58542328: 30,\n", + " 66916437: 30,\n", + " 67688633: 25,\n", + " 63417935: 24,\n", + " 20306953: 20,\n", + " 67959451: 15,\n", + " 49474213: 12,\n", + " 66135952: 12,\n", + " 66074428: 8,\n", + " 40713040: 2}" + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weights" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "2c255c62", + "metadata": {}, + "outputs": [], + "source": [ + "buckets = [1, 2, 4, 8, 16, 32, 64]" + ] + }, + { + "cell_type": "code", + "execution_count": 213, "id": "af72ae1a", "metadata": {}, "outputs": [], "source": [ "for key in weights: \n", - " weights[key] = int(weights[key]/10)\n", - " if weights[key] == 0: \n", - " weights[key] = 1" + " w = int(weights[key]/10)\n", + " if w == 0: \n", + " w = 1\n", + " for b in buckets: \n", + " if w <= b:\n", + " w = b\n", + " break\n", + " weights[key] = b\n", + " #print(weights[key], b)" ] }, { @@ -2077,9 +2250,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 214, "id": "0901f729", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1831" + ] + }, + "execution_count": 214, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "open(\"/home/eecs/wooders/experiments/wikipedia/bucket_weights.json\", \"w\").write(json.dumps(weights))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a356582", + "metadata": {}, "outputs": [], "source": [] } diff --git a/wikipedia/preprocessing/log_data.py b/wikipedia/preprocessing/log_data.py index 23522a0..9cbaf06 100644 --- a/wikipedia/preprocessing/log_data.py +++ b/wikipedia/preprocessing/log_data.py @@ -8,6 +8,8 @@ def log_questions(run, config): artifact = wandb.Artifact("questions", type='dataset') artifact.add_file(config["files"]["raw_questions_file"]) artifact.add_file(config["files"]["questions_file"]) + artifact.add_file(config["files"]["test_questions_file"]) + artifact.add_file(config["files"]["train_questions_file"]) run.log_artifact(artifact) def log_files(run, config): diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index 36ac40b..5123d42 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,16 +1,19 @@ set -xe -for key_policy in "random" "weighted_random" "round_robin" "weighted_round_robin" +for replicas in 2 4 8 1 do - for event_policy in "lifo" "fifo" + for model_runtime in 0.25 do - for load_shedding_policy in "always_process" + for event_policy in "lifo" #"fifo" do - for model_runtime in 0.25 0.005 + for load_shedding_policy in "always_process" do - python simulate.py --model_runtime $model_runtime --send_rate 100 \ - --event_policy $event_policy --key_policy $key_policy --load_shedding_policy $load_shedding_policy + for key_policy in "round_robin" "weighted_round_robin" + do + python simulate.py --model_runtime $model_runtime --send_rate 100 \ + --event_policy $event_policy --key_policy $key_policy --load_shedding_policy $load_shedding_policy --num_replicas ${replicas} + done done done - done -done + done +done diff --git a/wikipedia/run_2_prepare_data.sh 
b/wikipedia/run_2_prepare_data.sh index e863f05..d170ebd 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -2,17 +2,19 @@ set -xe plan_dir=/data/wooders/wiki-plans -for model_runtime in 0.005 +for replicas in 2 +do +for model_runtime in 1.0 do for event_policy in "lifo" do for load_shedding_policy in "always_process" do - for key_policy in "round_robin" "weighted_round_robin" "random" "weighted_random" + for key_policy in "round_robin" "weighted_round_robin" do - python wiki_eval_tmp.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100.json --workers 32 + python wiki_eval_tmp.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas}.json --workers 32 done done done done -p +done diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index fd2dc11..78ed8a1 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -1,6 +1,7 @@ -import json import itertools +import json from typing import DefaultDict, Dict, List, Optional, Tuple +from more_itertools.more import divide from collections import defaultdict from dataclasses import dataclass from functools import cmp_to_key @@ -56,234 +57,317 @@ def current_weights(ts, ts_to_weights): return ts_to_weights[key] -class RoundRobinLoadBalancerFix(CrossKeyLoadBalancer): +class KeyFIFO(CrossKeyLoadBalancer): """Simple policy that cycle through all the keys fairly""" - def __init__(self): - self.cur_key_set = set() - self.cur_key_iter = None + def __init__(self, num_replicas=1): + self.cur_key_set = {} + for replica_id in range(num_replicas): + self.cur_key_set[replica_id] = set() + print(num_replicas, self.cur_key_set) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: + def choose( + self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int + ) -> str: key_set = set(per_key_queues.keys()) - if key_set != 
self.cur_key_set: - self.cur_key_set = key_set - self.cur_key_iter = itertools.cycle(key_set) + if key_set != self.cur_key_set[replica_id]: + #print("reset keys", replica_id, len(key_set), key_set, self.cur_key_set[replica_id]) + self.cur_key_set[replica_id] = key_set + self.cur_key_iter[replica_id] = itertools.cycle(key_set) - key = next(self.cur_key_iter) + seen = set([]) while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter) + key = next(self.cur_key_iter[replica_id]) + #print(replica_id, key, per_key_queues[key].size(), per_key_queues[key].size() == 0) + if key in seen: + raise ValueError(f"Did full loop - livelock {replica_id}") + #return None + seen.add(key) # TODO(simon): maybe do a "peak" here to trigger eviction policies + #print("choose", replica_id, key) return key -class WeightedRoundRobin(CrossKeyLoadBalancer): +class RoundRobinLoadBalancer(CrossKeyLoadBalancer): """Simple policy that cycle through all the keys fairly""" - def __init__(self, pageview_file, all_keys): - self.cur_key_set = [] - self.cur_key_iter = None - pageview_df = pd.read_csv(pageview_file) - - self.weights = json.load(open("weights.json")) - - ##self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() - #self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() - #self.weights = {} - #for key in self.raw_weights.keys(): - # if str(key) not in all_keys: - # continue - - # self.weights[key] = int(self.raw_weights[key]*1000) - # #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" - # if self.weights[key] == 0: - # self.weights[key] = 1 - - - for key in all_keys: - if key not in self.weights: - self.weights[key] = 1 - - - for key in self.weights.keys(): - for i in range(self.weights[key]): - self.cur_key_set.append(str(key)) - random.shuffle(self.cur_key_set) - self.cur_key_iter = itertools.cycle(self.cur_key_set) - + def __init__(self, num_replicas=1): + self.cur_key_iter = {} + self.cur_key_set = {} + for 
replica_id in range(num_replicas): + self.cur_key_set[replica_id] = set() + self.cur_key_iter[replica_id] = None + print(num_replicas, self.cur_key_set) + print(num_replicas, "replicas", self.cur_key_iter) + + def choose( + self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int + ) -> str: + key_set = set(per_key_queues.keys()) + if key_set != self.cur_key_set[replica_id]: + #print("reset keys", replica_id, len(key_set), key_set, self.cur_key_set[replica_id]) + self.cur_key_set[replica_id] = key_set + self.cur_key_iter[replica_id] = itertools.cycle(key_set) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: + key = next(self.cur_key_iter[replica_id]) - key = next(self.cur_key_iter) + seen = set([]) while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter) + key = next(self.cur_key_iter[replica_id]) + #print(replica_id, key, per_key_queues[key].size(), per_key_queues[key].size() == 0) + if key in seen: + raise ValueError(f"Did full loop - livelock {replica_id}") + #return None + seen.add(key) # TODO(simon): maybe do a "peak" here to trigger eviction policies + #print("choose", replica_id, key) return key -class AdaptiveWeightedRoundRobin(CrossKeyLoadBalancer): - """Simple policy that cycle through all the keys fairly""" - - def __init__(self, timestamp_weights_file): - self.cur_key_set = [] - self.cur_key_iter = None - - pageview_df = pd.read_csv(pageview_file) - self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() - self.weights = {} - for key in self.raw_weights.keys(): - if str(key) not in all_keys: - continue +class WeightedRoundRobinLoadBalancer(CrossKeyLoadBalancer): - self.weights[key] = int(self.raw_weights[key]*1000) - assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" + def __init__(self, all_keys, num_replicas=1): + + self.weights = json.load(open("bucket_weights.json")) + # set default weight + for key in all_keys: + if key not in self.weights: + 
self.weights[key] = 1 - for key in self.weights.keys(): - for i in range(self.weights[key]): - self.cur_key_set.append(str(key)) - random.shuffle(self.cur_key_set) - self.cur_key_iter = itertools.cycle(self.cur_key_set) + self.cur_key_iter = {} + self.cur_key_set = {} + for replica_id in range(num_replicas): + self.cur_key_set[replica_id] = set() + self.cur_key_iter[replica_id] = None + def choose( + self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int + ) -> str: + key_set = set(per_key_queues.keys()) - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: + # initialize keys + if key_set != self.cur_key_set[replica_id]: + self.cur_key_set[replica_id] = [] + for key in key_set: + for i in range(self.weights[key]): + self.cur_key_set[replica_id].append(key) + random.shuffle(self.cur_key_set[replica_id]) + self.cur_key_iter[replica_id] = itertools.cycle(self.cur_key_set[replica_id]) - key = next(self.cur_key_iter) + key = next(self.cur_key_iter[replica_id]) while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter) - # TODO(simon): maybe do a "peak" here to trigger eviction policies + key = next(self.cur_key_iter[replica_id]) return key -class AdaptiveWeightedLoadBalancer(CrossKeyLoadBalancer): - - def __init__(self, timestamp_weights_file): - data = json.load(open(timestamp_weights_file)) - self.timestamp_weights = {} - for key in data.keys(): - self.timestamp_weights[int(key)] = data[key] - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], timestamp: int) -> str: - weights_map = current_weights(timestamp, self.timestamp_weights) - - chosen_key = None - max_len = 0 - total_len = 0 - keys = [] - weights = [] - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if size >= 1 and key in weights_map: - keys.append(key) - weights.append(weights_map[key]) - total_len += size - chosen_key = random.choices(keys, weights, k=1)[0] - return chosen_key - - -class 
WeightedLoadBalancer(CrossKeyLoadBalancer): - - def __init__(self, pageview_file): - pageview_df = pd.read_csv(pageview_file) - #self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - self.weights = json.load(open("weights.json")) - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: - chosen_key = None - max_len = 0 - total_len = 0 - keys = [] - weights = [] - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if size >= 1 and int(key) in self.weights: - keys.append(key) - weights.append(self.weights[int(key)]) - total_len += size - - chosen_key = random.choices(keys, weights, k=1)[0] - #print("choose", chosen_key, keys, weights) - return chosen_key - -class RandomLoadBalancer(CrossKeyLoadBalancer): - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: - chosen_key = None - max_len = 0 - total_len = 0 - keys = [] - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if size >= 1: - keys.append(key) - total_len += size - - chosen_key = random.choices(keys, k=1)[0] - return chosen_key - - -class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): - - def __init__(self, pageview_file): - pageview_df = pd.read_csv(pageview_file) - self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - #print(self.weights) - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: - chosen_key = None - max_len = 0 - total_len = 0 - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if int(key) not in self.weights: - continue - weighted_size = self.weights[int(key)]*self.weights[int(key)] - if weighted_size > max_len: - chosen_key = key - max_len = size - total_len += size - #print(chosen_key, max_len, self.weights[int(chosen_key)]) - per_key_queues[chosen_key].clear() - print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) - return chosen_key - -class WeightedLoadBalancer(CrossKeyLoadBalancer): - - 
def __init__(self, pageview_file): - pageview_df = pd.read_csv(pageview_file) - self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() - #print(self.weights) - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: - chosen_key = None - max_len = 0 - total_len = 0 - keys = [] - weights = [] - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if size >= 1 and int(key) in self.weights: - keys.append(key) - weights.append(self.weights[int(key)]) - total_len += size - - chosen_key = random.choices(keys, weights, k=1)[0] - #print("choose", chosen_key, keys, weights) - return chosen_key - -class LongestQueueLoadBalancer(CrossKeyLoadBalancer): - - def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: - chosen_key = None - max_len = 0 - total_len = 0 - for key in per_key_queues.keys(): - size = per_key_queues[key].size() - if size > max_len: - chosen_key = key - max_len = size - total_len += size - per_key_queues[chosen_key].clear() - - return chosen_key +#class WeightedRoundRobin(CrossKeyLoadBalancer): +# """Simple policy that cycle through all the keys fairly""" +# +# def __init__(self, pageview_file, all_keys): +# self.cur_key_set = [] +# self.cur_key_iter = None +# pageview_df = pd.read_csv(pageview_file) +# +# self.weights = json.load(open("weights.json")) +# +# ##self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() +# #self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() +# #self.weights = {} +# #for key in self.raw_weights.keys(): +# # if str(key) not in all_keys: +# # continue +# +# # self.weights[key] = int(self.raw_weights[key]*1000) +# # #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" +# # if self.weights[key] == 0: +# # self.weights[key] = 1 +# +# +# for key in all_keys: +# if key not in self.weights: +# self.weights[key] = 1 +# +# +# for key in self.weights.keys(): +# for i in range(self.weights[key]): +# 
self.cur_key_set.append(str(key)) +# random.shuffle(self.cur_key_set) +# self.cur_key_iter = itertools.cycle(self.cur_key_set) +# +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# +# key = next(self.cur_key_iter) +# while per_key_queues[key].size() == 0: +# key = next(self.cur_key_iter) +# # TODO(simon): maybe do a "peak" here to trigger eviction policies +# return key +# +#class AdaptiveWeightedRoundRobin(CrossKeyLoadBalancer): +# """Simple policy that cycle through all the keys fairly""" +# +# def __init__(self, timestamp_weights_file): +# self.cur_key_set = [] +# self.cur_key_iter = None +# +# pageview_df = pd.read_csv(pageview_file) +# self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() +# self.weights = {} +# for key in self.raw_weights.keys(): +# if str(key) not in all_keys: +# continue +# +# self.weights[key] = int(self.raw_weights[key]*1000) +# assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" +# +# +# for key in self.weights.keys(): +# for i in range(self.weights[key]): +# self.cur_key_set.append(str(key)) +# random.shuffle(self.cur_key_set) +# self.cur_key_iter = itertools.cycle(self.cur_key_set) +# +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# +# key = next(self.cur_key_iter) +# while per_key_queues[key].size() == 0: +# key = next(self.cur_key_iter) +# # TODO(simon): maybe do a "peak" here to trigger eviction policies +# return key +# +# +#class AdaptiveWeightedLoadBalancer(CrossKeyLoadBalancer): +# +# def __init__(self, timestamp_weights_file): +# data = json.load(open(timestamp_weights_file)) +# self.timestamp_weights = {} +# for key in data.keys(): +# self.timestamp_weights[int(key)] = data[key] +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], timestamp: int) -> str: +# weights_map = current_weights(timestamp, self.timestamp_weights) +# +# chosen_key = None +# max_len = 0 +# total_len = 0 +# keys = [] +# weights 
= [] +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if size >= 1 and key in weights_map: +# keys.append(key) +# weights.append(weights_map[key]) +# total_len += size +# chosen_key = random.choices(keys, weights, k=1)[0] +# return chosen_key +# +# +#class WeightedLoadBalancer(CrossKeyLoadBalancer): +# +# def __init__(self, pageview_file): +# pageview_df = pd.read_csv(pageview_file) +# #self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() +# self.weights = json.load(open("weights.json")) +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# chosen_key = None +# max_len = 0 +# total_len = 0 +# keys = [] +# weights = [] +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if size >= 1 and int(key) in self.weights: +# keys.append(key) +# weights.append(self.weights[int(key)]) +# total_len += size +# +# chosen_key = random.choices(keys, weights, k=1)[0] +# #print("choose", chosen_key, keys, weights) +# return chosen_key +# +#class RandomLoadBalancer(CrossKeyLoadBalancer): +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# chosen_key = None +# max_len = 0 +# total_len = 0 +# keys = [] +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if size >= 1: +# keys.append(key) +# total_len += size +# +# chosen_key = random.choices(keys, k=1)[0] +# return chosen_key +# +# +#class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): +# +# def __init__(self, pageview_file): +# pageview_df = pd.read_csv(pageview_file) +# self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() +# #print(self.weights) +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# chosen_key = None +# max_len = 0 +# total_len = 0 +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if int(key) not in self.weights: +# continue +# weighted_size = 
self.weights[int(key)]*self.weights[int(key)] +# if weighted_size > max_len: +# chosen_key = key +# max_len = size +# total_len += size +# #print(chosen_key, max_len, self.weights[int(chosen_key)]) +# per_key_queues[chosen_key].clear() +# print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) +# return chosen_key +# +#class WeightedLoadBalancer(CrossKeyLoadBalancer): +# +# def __init__(self, pageview_file): +# pageview_df = pd.read_csv(pageview_file) +# self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() +# #print(self.weights) +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# chosen_key = None +# max_len = 0 +# total_len = 0 +# keys = [] +# weights = [] +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if size >= 1 and int(key) in self.weights: +# keys.append(key) +# weights.append(self.weights[int(key)]) +# total_len += size +# +# chosen_key = random.choices(keys, weights, k=1)[0] +# #print("choose", chosen_key, keys, weights) +# return chosen_key +# +#class LongestQueueLoadBalancer(CrossKeyLoadBalancer): +# +# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: +# chosen_key = None +# max_len = 0 +# total_len = 0 +# for key in per_key_queues.keys(): +# size = per_key_queues[key].size() +# if size > max_len: +# chosen_key = key +# max_len = size +# total_len += size +# per_key_queues[chosen_key].clear() +# +# return chosen_key class WikiMapper(RalfMapper): @@ -299,7 +383,7 @@ def __init__( super().__init__(env, source_queues, key_selection_policy_cls, model_run_time_s, num_replicas) self.keys = keys - self.source_queues = source_queues + #self.source_queues = source_queues # self.env = env # self.source_queues = source_queues @@ -308,21 +392,38 @@ def __init__( # self.env.process(self.run()) self.ready_time_to_batch: Dict[float, List[Tuple[int, float]]] = {} + + ## Shard source queues into each replica's id. 
+ #source_keys = list(source_queues.keys()) + #random.shuffle(source_keys) + #self.sharded_keys = dict( + # enumerate(map(list, divide(num_replicas, source_keys))) + #) + #self.key_selection_policy = key_selection_policy_cls + #self.model_runtime_s = model_run_time_s + #for i in range(num_replicas): + # print("Run replica", i) + # self.env.process(self.run(replica_id=i)) - def run(self, replica_id: int): - self.source_queues = { + def run(self, replica_id: int): + this_shard_source_queues = { key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] } + #print("keys", replica_id, self.sharded_keys[replica_id]) + + #self.source_queues = { + # key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] + #} while True: - yield simpy.AnyOf(self.env, [q.wait() for q in self.source_queues.values()]) + x = yield simpy.AnyOf(self.env, [q.wait() for q in this_shard_source_queues.values()]) + #print("YIELD", replica_id, x) # choose key - print("env time", self.env.now) chosen_key = self.key_selection_policy.choose( - self.source_queues, - self.env.now*100 + this_shard_source_queues, + replica_id, ) assert chosen_key is not None @@ -330,10 +431,10 @@ def run(self, replica_id: int): # assert total_size_orig == 0 or total_size == total_size_orig, f"Bad queue size {total_size_orig} -> {total_size}" # get chosen key - windows = yield self.source_queues[chosen_key].get() - # print( - # f"at time {self.env.now:.2f}, RalfMapper should work on {windows} (last timestamp), queue size {total_size}, wait time {self.model_runtime_s}" - # ) + windows = yield this_shard_source_queues[chosen_key].get() + print( + f"at time {self.env.now:.2f}, RalfMapper replica {replica_id} should work on {windows} (last timestamp), wait time {self.model_runtime_s}" + ) edits = [(val, windows.key) for val in windows.window[0].value] if self.env.now in self.ready_time_to_batch: @@ -379,19 +480,6 @@ def run(self, replica_id: int): init_data = json.load(open(init_data_file)) 
keys = list(init_data.keys()) -policies = { - "fifo": fifo, - "lifo": lifo, - "always_process": always_process, - "sample_half": make_sampling_policy(0.5), - "weighted_random": WeightedLoadBalancer(pageview_file), - "adaptive_weighted_random": AdaptiveWeightedLoadBalancer(timestamp_weights_file), - "weighted_longest_queue": WeightedLongestQueueLoadBalancer(pageview_file), - "longest_queue": LongestQueueLoadBalancer(), - "random": RandomLoadBalancer(), - "round_robin": RoundRobinLoadBalancerFix(), - "weighted_round_robin": WeightedRoundRobin(pageview_file, keys) -} def run_once( out_path: str, @@ -405,6 +493,14 @@ def run_once( num_replicas: int, ): + policies = { + "fifo": fifo, + "lifo": lifo, + "always_process": always_process, + "round_robin": RoundRobinLoadBalancer(num_replicas=num_replicas), + "weighted_round_robin": WeightedRoundRobinLoadBalancer(keys, num_replicas=num_replicas) + } + env = simpy.Environment() source_to_window_queue = simpy.Store(env) From f28a429dee552c759caeb3a1c2a6c4b62cfc67cd Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 14 Oct 2021 23:09:05 -0700 Subject: [PATCH 17/26] stash --- stl/offline/run_1_simulate_windows.sh | 3 ++- stl/offline/run_2_eval_yahoo_keys.sh | 2 +- stl/offline/simulation.py | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/offline/run_1_simulate_windows.sh b/stl/offline/run_1_simulate_windows.sh index e195c39..a25823e 100644 --- a/stl/offline/run_1_simulate_windows.sh +++ b/stl/offline/run_1_simulate_windows.sh @@ -1,6 +1,7 @@ set -ex data_dir="./yahoo_train_data" +result_dir="/data/wooders/stl/results" tmp_script=`mktemp` for key_prio in "lifo" "fifo" @@ -10,7 +11,7 @@ do key=`basename $data` for slide in 6 12 18 24 48 96 168 192 336 672 do - echo \" python simulation.py --model_runtime_s 1.5 --total_runtime_s 2000 --per_key_records_per_second 1 --key_prio_policy ${key_prio} --window_size 672 --slide_size ${slide} --output_path offline_1_slide/plan/${key_prio}_slide_${slide}_plan.json 
--num_mapper_replicas 1\" >> $tmp_script + echo \" python simulation.py --num_keys 100 --model_runtime_s 1.5 --total_runtime_s 2000 --per_key_records_per_second 1 --key_prio_policy ${key_prio} --window_size 672 --slide_size ${slide} --output_path ${result_dir}/plan/${key_prio}_slide_${slide}_plan.json --num_mapper_replicas 1\" >> $tmp_script done done done diff --git a/stl/offline/run_2_eval_yahoo_keys.sh b/stl/offline/run_2_eval_yahoo_keys.sh index c5f106d..ed2eab8 100644 --- a/stl/offline/run_2_eval_yahoo_keys.sh +++ b/stl/offline/run_2_eval_yahoo_keys.sh @@ -17,7 +17,7 @@ do done done -cat $tmp_script | xargs -n 1 -P 36 bash -l -c +cat $tmp_script | xargs -n 1 -P 144 bash -l -c #set -ex diff --git a/stl/offline/simulation.py b/stl/offline/simulation.py index 476167d..6431293 100644 --- a/stl/offline/simulation.py +++ b/stl/offline/simulation.py @@ -81,8 +81,7 @@ def main(argv): else: keys = [i+1 for i in range(FLAGS.num_keys)] - print("keys", keys) - + print(FLAGS.key_prio_policy) source_to_window_queue = simpy.Store(env) windows_to_mapper_queue = { key: PerKeyPriorityQueue( From 3042b0c379229c8d8afe36b54acb0a6b71f92136 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 14 Oct 2021 15:19:33 -0700 Subject: [PATCH 18/26] try lp 500 --- stl/offline/config_gen.py | 6 ++-- stl/offline/default_plans.py | 8 +++++ stl/offline/evaluate_loss.py | 48 ++++++++++++++++++++++++++ stl/offline/log_data.py | 45 ++++++++++++++++++++++++ stl/offline/run_4_generate_plan.sh | 6 ++-- stl/offline/run_6_simulate_baseline.sh | 2 +- 6 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 stl/offline/default_plans.py create mode 100644 stl/offline/evaluate_loss.py create mode 100644 stl/offline/log_data.py diff --git a/stl/offline/config_gen.py b/stl/offline/config_gen.py index 58ebca5..91f9bf4 100644 --- a/stl/offline/config_gen.py +++ b/stl/offline/config_gen.py @@ -119,17 +119,17 @@ def run_lp(df: pd.DataFrame, objective="min_loss"): def get_loss_per_key(key: int, 
csv_dir): - key_one = glob(f"{csv_dir}/slide_*_key_A4Benchmark-TS{key}.csv") + key_one = glob(f"{csv_dir}/fifo_slide_*_key_{key}.csv") assert len(key_one) > 0 - oracle_residual = pd.read_csv(f"{csv_dir}/oracle_key_A4Benchmark-TS{key}.csv")[ + oracle_residual = pd.read_csv(f"./oracle/{key}.csv")[ "pred_residual" ] losses = [] for path in key_one: slide_size = int( - os.path.basename(path).split("_key_A4")[0].replace("slide_", "") + os.path.basename(path).split("_key_")[0].replace("fifo_slide_", "") ) df = pd.read_csv(path) residual = df["pred_residual"] diff --git a/stl/offline/default_plans.py b/stl/offline/default_plans.py new file mode 100644 index 0000000..8bcd66f --- /dev/null +++ b/stl/offline/default_plans.py @@ -0,0 +1,8 @@ +import json + +plan_dir = "/data/wooders/stl/results" +slides = [1, 6, 12, 18, 24, 48, 96, 168, 192, 336, 672] + +for slide in slides: + weights = {i: slide for i in range(1, 101, 1)} + open(f"{plan_dir}/plan_baseline_{slide}.json", "w").write(json.dumps(weights)) diff --git a/stl/offline/evaluate_loss.py b/stl/offline/evaluate_loss.py new file mode 100644 index 0000000..29d0242 --- /dev/null +++ b/stl/offline/evaluate_loss.py @@ -0,0 +1,48 @@ +from sktime.performance_metrics.forecasting import mean_squared_scaled_error +import numpy as np +import pandas as pd +from tqdm import tqdm +import argparse + +def get_loss_per_key(key: int, csv_dir, oracle_dir): + path = f"{csv_dir}/{key}.csv" + + oracle_residual = pd.read_csv(f"{oracle_dir}/oracle_key_A4Benchmark-TS{key}.csv")[ + "pred_residual" + ] + + df = pd.read_csv(path) + print(path) + residual = df["pred_residual"] + print("residual", len(residual.tolist())) + mask = ~np.isnan(residual) + print("residual", len(residual[mask].tolist())) + loss = mean_squared_scaled_error( + y_true=oracle_residual[mask], y_pred=residual[mask], y_train=df["value"] + ) + loss = { + "loss": loss, + "n_fits": df["model_version"].dropna().nunique(), + } + return loss + + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser(description="Specify experiment config") + parser.add_argument("--csv-path", type=str) + parser.add_argument("--oracle-path", type=str) + args = parser.parse_args() + + raw_data = [] + for key in tqdm(range(1, 101)): + entry = get_loss_per_key(key, csv_dir=args.csv_path, oracle_dir=args.oracle_path) + raw_data.append({"key": key, **entry}) + + df = pd.DataFrame(raw_data) + print("loss per n_fits") + print(df.groupby("n_fits")["loss"].describe()) + print(f"loss per key (sample of 10 out of {len(df)})") + print(df.groupby("key")["loss"].describe().sample(10)) + df.to_csv("final_results.csv") + diff --git a/stl/offline/log_data.py b/stl/offline/log_data.py new file mode 100644 index 0000000..b8eb566 --- /dev/null +++ b/stl/offline/log_data.py @@ -0,0 +1,45 @@ +import wandb +import configparser +import os + + +def log_experiment(run, config): + # log experiment output + artifact = wandb.Artifact("results", type='dataset') + artifact.add_dir("/data/wooders/stl/results") + run.log_artifact(artifact) + +def log_train(run, config): + # log experiment output + artifact = wandb.Artifact("yahoo_train_data", type='dataset') + artifact.add_dir("yahoo_train_data") + run.log_artifact(artifact) + +def log_eval(run, config): + # log experiment output + artifact = wandb.Artifact("yahoo_eval_data", type='dataset') + artifact.add_dir("yahoo_eval_data") + run.log_artifact(artifact) + +def log_oracle(run, config): + # log experiment output + artifact = wandb.Artifact("oracle", type='dataset') + artifact.add_dir("oracle") + run.log_artifact(artifact) + + + +if __name__ == "__main__": + + print("Running wandb logging on data") + run = wandb.init(job_type="dataset-creation", project="stl") + + # configuration file + config = configparser.ConfigParser() + config.read("config.yml") + + log_experiment(run, config) + log_train(run, config) + log_eval(run, config) + log_oracle(run, config) + diff --git a/stl/offline/run_4_generate_plan.sh 
b/stl/offline/run_4_generate_plan.sh index 0769b1f..ef514b5 100644 --- a/stl/offline/run_4_generate_plan.sh +++ b/stl/offline/run_4_generate_plan.sh @@ -7,9 +7,9 @@ set -ex # --csv_dir "./result/offline_1_slide/plan_eval" \ # --output_path "./result/offline_1_slide/min_loss_plan.json" -MAX_FITS=8400 +MAX_FITS=500 python config_gen.py \ - --csv_dir "./offline_1_slide/plan_eval" \ - --output_path "./offline_1_slide/max_fits_${MAX_FITS}.json" \ + --csv_dir "/data/wooders/stl/results/single_key" \ + --output_path "/data/wooders/stl/results/max_fits_${MAX_FITS}.json" \ --max_n_fits ${MAX_FITS} diff --git a/stl/offline/run_6_simulate_baseline.sh b/stl/offline/run_6_simulate_baseline.sh index f70fabf..c0cb4ed 100644 --- a/stl/offline/run_6_simulate_baseline.sh +++ b/stl/offline/run_6_simulate_baseline.sh @@ -4,7 +4,7 @@ PLAN_DIR="/data/wooders/stl/results" TRAIN_PATH="./yahoo_train_data" EVAL_PATH="./yahoo_eval_data" -for key_policy in "lifo" "fifo" +for key_policy in "fifo" do for replicas in 1 2 4 8 do From 6fed8980016cb838d4c27803764cd49db8e4c747 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 14 Oct 2021 21:47:40 -0700 Subject: [PATCH 19/26] notebook --- stl/notebooks/STL Offline Plots.ipynb | 70 ++++---- wikipedia/notebooks/Wikipedia Plots.ipynb | 200 +++++++++++++--------- wikipedia/run_1_generate_plan.sh | 4 +- wikipedia/run_2_prepare_data.sh | 2 +- wikipedia/run_3_run_predictions.sh | 13 +- wikipedia/simulate.py | 5 +- 6 files changed, 169 insertions(+), 125 deletions(-) diff --git a/stl/notebooks/STL Offline Plots.ipynb b/stl/notebooks/STL Offline Plots.ipynb index 4b561e5..428ec9f 100644 --- a/stl/notebooks/STL Offline Plots.ipynb +++ b/stl/notebooks/STL Offline Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 232, + "execution_count": 351, "id": "642f67ca", "metadata": {}, "outputs": [], @@ -22,14 +22,14 @@ }, { "cell_type": "code", - "execution_count": 252, + "execution_count": 352, "id": "0df714c8", "metadata": {}, 
"outputs": [ { "data": { "text/html": [ - "Finishing last run (ID:2od4u8d0) before initializing another..." + "Finishing last run (ID:29jne90e) before initializing another..." ], "text/plain": [ "" @@ -41,7 +41,7 @@ { "data": { "text/html": [ - "
Waiting for W&B process to finish, PID 80915... (success)." + "
Waiting for W&B process to finish, PID 56004... (success)." ], "text/plain": [ "" @@ -58,7 +58,7 @@ "version_minor": 0 }, "text/plain": [ - "VBox(children=(Label(value=' 0.26MB of 0.26MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + "VBox(children=(Label(value=' 0.51MB of 0.51MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" ] }, "metadata": {}, @@ -76,8 +76,8 @@ "
\n", "
\n", "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", - "
Synced restful-butterfly-20: https://wandb.ai/ucb-ralf/experiments-stl_notebooks/runs/2od4u8d0
\n", - "Find logs at: ./wandb/run-20211014_021333-2od4u8d0/logs
\n" + "
Synced drawn-morning-23: https://wandb.ai/ucb-ralf/experiments-stl_notebooks/runs/29jne90e
\n", + "Find logs at: ./wandb/run-20211014_214005-29jne90e/logs
\n" ], "text/plain": [ "" @@ -89,7 +89,7 @@ { "data": { "text/html": [ - "Successfully finished last run (ID:2od4u8d0). Initializing new run:
" + "Successfully finished last run (ID:29jne90e). Initializing new run:
" ], "text/plain": [ "" @@ -110,7 +110,7 @@ "data": { "text/html": [ "\n", - " Syncing run avid-flower-21 to Weights & Biases (docs).
\n", + " Syncing run vocal-terrain-24 to Weights & Biases (docs).
\n", "\n", " " ], @@ -147,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 253, + "execution_count": 353, "id": "0eedb687", "metadata": {}, "outputs": [], @@ -157,17 +157,17 @@ }, { "cell_type": "code", - "execution_count": 254, + "execution_count": 354, "id": "975d3b68", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[]" ] }, - "execution_count": 254, + "execution_count": 354, "metadata": {}, "output_type": "execute_result" }, @@ -189,17 +189,17 @@ }, { "cell_type": "code", - "execution_count": 256, + "execution_count": 355, "id": "c37f7834", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[]" ] }, - "execution_count": 256, + "execution_count": 355, "metadata": {}, "output_type": "execute_result" }, @@ -221,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 356, "id": "b3c30d2e", "metadata": {}, "outputs": [], @@ -231,17 +231,17 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 357, "id": "93a15c67", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[]" ] }, - "execution_count": 258, + "execution_count": 357, "metadata": {}, "output_type": "execute_result" }, @@ -270,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 358, "id": "5fecde25", "metadata": {}, "outputs": [], @@ -297,10 +297,22 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 359, "id": "21099224", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "TypeError", + "evalue": "get_loss_per_key() missing 1 required positional argument: 'oracle_filename'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mbaseline_results\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m101\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_loss_per_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"{artifact_dir}/plan_eval\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlosses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: get_loss_per_key() missing 1 required positional argument: 'oracle_filename'" + ] + } + ], "source": [ "replica = 1\n", "baseline_results = {}\n", @@ -311,18 +323,10 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": null, "id": "d82f9ca7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8400 78.42078584418643\n" - ] - } - ], + "outputs": [], "source": [ "slide_size = 12\n", "baseline_total_cost = 0\n", diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index e417120..ad9d8b5 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -608,17 +608,17 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 217, "id": "101571e2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'./artifacts/prediction_results:v3620'" + "'/home/eecs/wooders/DPR'" ] }, - 
"execution_count": 25, + "execution_count": 217, "metadata": {}, "output_type": "execute_result" } @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 218, "id": "03e14929", "metadata": {}, "outputs": [], @@ -639,16 +639,16 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 219, "id": "eaf30e01", "metadata": {}, "outputs": [], "source": [ - "#constants = [0.01, 0.05, 1.0, 10.0]\n", - "constants = [0.25]\n", - "policies = [\"lifo\"]\n", - "key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", - "#key_policies = [\"random\", \"round_robin\"]\n", + "constants = [0.01, 0.05, 1.0, 10.0]\n", + "#constants = [0.25]\n", + "policies = [\"lifo\", \"fifo\"]\n", + "#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", + "key_policies = [\"round_robin\"]\n", "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", "metric = 'top10'" @@ -656,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 231, "id": "96209574", "metadata": {}, "outputs": [ @@ -664,28 +664,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "/home/eecs/wooders/DPR/plan-random_lifo-always_process-0.25-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_random_lifo-always_process-0.25-100.json\n", - "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.25-100.json\n", - "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.25-100.json\n" + "/home/eecs/wooders/DPR/plan-round_robin_fifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_fifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_fifo-always_process-1.0-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_fifo-always_process-10.0-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.01-100.json\n", + 
"/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-1.0-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-10.0-100.json\n" ] }, { "data": { "text/plain": [ - "{'plan-random_lifo-always_process': [0.7078732804419647],\n", - " 'plan-weighted_random_lifo-always_process': [0.6361795795371613],\n", - " 'plan-round_robin_lifo-always_process': [0.6167886934890254],\n", - " 'plan-weighted_round_robin_lifo-always_process': [0.5987554048857813]}" + "{'plan-round_robin_fifo-always_process': [0.39603393208873827,\n", + " 0.6827773461716538,\n", + " 0.8776121979738054,\n", + " 0.8791895221727837],\n", + " 'plan-round_robin_lifo-always_process': [0.39388374885232,\n", + " 0.46513799624895036,\n", + " 0.8024342585399157,\n", + " 0.8759956368544546]}" ] }, - "execution_count": 182, + "execution_count": 231, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_results = {}\n", + "constants = [0.01, 0.05, 1.0, 10.0]\n", + "#constants = [0.25]\n", + "policies = [\"fifo\", \"lifo\"]\n", + "#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", + "key_policies = [\"round_robin\"]\n", + "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", + "d = artifact_dir\n", + "metric = 'top10'\n", + "\n", + "event_results = {}\n", "for policy in policies: \n", " for key_policy in key_policies: \n", " scores = []\n", @@ -695,96 +712,115 @@ " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", " results = json.load(results_file)\n", " scores.append(1-results[metric])\n", - " all_results[name] = scores\n", - "all_results" + " event_results[name] = scores\n", + "event_results" ] }, { "cell_type": "code", - "execution_count": 106, - "id": "b479a2bc", - "metadata": { - "scrolled": true - }, + "execution_count": 232, + "id": "332c0ff6", + "metadata": {}, "outputs": [ { "data": { + "image/png": "\n", 
"text/plain": [ - "dict_keys(['plan-random_lifo-always_process', 'plan-weighted_random_lifo-always_process', 'plan-round_robin_lifo-always_process', 'plan-weighted_round_robin_lifo-always_process', 'plan-random_fifo-always_process', 'plan-weighted_random_fifo-always_process', 'plan-round_robin_fifo-always_process', 'plan-weighted_round_robin_fifo-always_process'])" + "
" ] }, - "execution_count": 106, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "all_results.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "332c0ff6", - "metadata": {}, - "outputs": [], - "source": [ - "plan_weighted_random_lifo = []\n", - "for constant in constants:\n", - " with open(f'{d}/plan-weighted_random_lifo-always_process-{constant}-100.json') as results_file:\n", - " results = json.load(results_file)\n", - " plan_weighted_random_lifo.append(results[metric])\n", - "print(plan_weighted_random_lifo)\n", - " \n", - "plan_weighted_longest_queue_lifo = []\n", - "for constant in constants:\n", - " with open(f'{d}/plan-weighted_longest_queue_lifo-always_process-{constant}-100.json') as results_file:\n", - " results = json.load(results_file)\n", - " plan_weighted_longest_queue_lifo.append(results[metric])\n", - "print(plan_weighted_longest_queue_lifo)\n", - "\n", - "plan_longest_queue_lifo = []\n", - "for constant in constants:\n", - " with open(f'{d}/plan-longest_queue_lifo-always_process-{constant}-100.json') as results_file:\n", - " results = json.load(results_file)\n", - " plan_longest_queue_lifo.append(results[metric])\n", - "print(plan_longest_queue_lifo)\n", - "\n", - "plan_random_lifo = []\n", - "for constant in constants:\n", - " with open(f'{d}/plan-random_lifo-always_process-{constant}-100.json') as results_file:\n", - " results = json.load(results_file)\n", - " plan_random_lifo.append(results[metric])\n", - "print(plan_random_lifo)\n", - "\n", - "plan_round_robin_lifo = []\n", - "for constant in constants:\n", - " with open(f'{d}/plan-round_robin_lifo-always_process-{constant}-100.json') as results_file:\n", - " results = json.load(results_file)\n", - " plan_round_robin_lifo.append(results[metric])\n", - "print(plan_round_robin_lifo)\n" + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "resources = [int(10 / c) for c in constants] \n", + "df = 
pd.DataFrame({\n", + " 'Model Runtime Const': resources, \n", + " **event_results\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", + "seaborn.barplot(x='Model Runtime Const', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "ax1.set(xlabel='Resources', ylabel=f'{metric} Error')\n", + "ax1.legend_.remove()\n", + "plt.legend(loc='lower left')\n", + "seaborn.despine(fig)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 229, "id": "6d536763", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-1.0-100.json\n", + "/home/eecs/wooders/DPR/plan-round_robin_lifo-always_process-10.0-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.01-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-0.05-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-1.0-100.json\n", + "/home/eecs/wooders/DPR/plan-weighted_round_robin_lifo-always_process-10.0-100.json\n" + ] + }, + { + "data": { + "text/plain": [ + "{'plan-round_robin_lifo-always_process': [0.39388374885232,\n", + " 0.46513799624895036,\n", + " 0.8024342585399157,\n", + " 0.8759956368544546],\n", + " 'plan-weighted_round_robin_lifo-always_process': [0.39394652792491625,\n", + " 0.44209022922208885,\n", + " 0.6753066365327118,\n", + " 0.7929938554982696]}" + ] + }, + "execution_count": 229, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from pylab import rcParams\n", - "rcParams['figure.figsize'] = 12, 6" + "constants = [0.01, 0.05, 1.0, 10.0]\n", + "#constants = [0.25]\n", + "policies = [\"lifo\"]\n", + 
"#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", + "key_policies = [\"round_robin\", \"weighted_round_robin\"]\n", + "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", + "d = artifact_dir\n", + "metric = 'top10'\n", + "\n", + "key_results = {}\n", + "for policy in policies: \n", + " for key_policy in key_policies: \n", + " scores = []\n", + " name = f\"plan-{key_policy}_{policy}-always_process\"\n", + " for constant in constants: \n", + " print(f'{d}/{name}-{constant}-100.json')\n", + " with open(f'{d}/{name}-{constant}-100.json') as results_file:\n", + " results = json.load(results_file)\n", + " scores.append(1-results[metric])\n", + " key_results[name] = scores\n", + "key_results" ] }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 230, "id": "511f1c65", "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -799,7 +835,7 @@ "resources = [int(10 / c) for c in constants] \n", "df = pd.DataFrame({\n", " 'Model Runtime Const': resources, \n", - " **all_results\n", + " **key_results\n", "})\n", "fig, ax1 = plt.subplots(figsize=(10, 5))\n", "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index 5123d42..6027c32 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,8 +1,8 @@ set -xe -for replicas in 2 4 8 1 +for replicas in 1 2 4 do - for model_runtime in 0.25 + for model_runtime in 0.001 0.05 0.01 0.1 1.0 5.0 10.0 do for event_policy in "lifo" #"fifo" do diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh index d170ebd..a7dcd74 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -4,7 +4,7 @@ plan_dir=/data/wooders/wiki-plans for replicas in 2 do -for model_runtime in 1.0 +for model_runtime in 0.25 do for event_policy in "lifo" do diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh index e0e8f13..abbc646 100644 --- a/wikipedia/run_3_run_predictions.sh +++ b/wikipedia/run_3_run_predictions.sh @@ -5,18 +5,21 @@ dpr_dir=~/DPR cd $dpr_dir +for replicas in 1 2 4 +do for event_policy in "lifo" do - for model_runtime in 0.25 0.005 + for model_runtime in 0.25 #for model_runtime in 0.01 0.05 0.1 1.0 10.0 0.25 0.005 do for load_shedding_policy in "always_process" do - for key_policy in "weighted_round_robin" "round_robin" "random" "weighted_random" + for key_policy in "weighted_round_robin" "round_robin" do - plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 + #plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 + plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas} echo $plan_file - 
CUDA_VISIBLE_DEVICES=0,1,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + CUDA_VISIBLE_DEVICES=0,2,3,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & #pid=$! done @@ -24,4 +27,4 @@ do done done done -p +done diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py index 78ed8a1..27db63a 100644 --- a/wikipedia/simulate.py +++ b/wikipedia/simulate.py @@ -450,7 +450,8 @@ def run(self, replica_id: int): runtime = self.model_runtime_s * num_passages #print(runtime, num_passages) - yield self.env.timeout(self.model_runtime_s) + yield self.env.timeout(runtime) + #yield self.env.timeout(self.model_runtime_s) # configuration file @@ -612,4 +613,4 @@ def run_once( # print("DONE", out_path) #for f in output_files: # print(f) - #open("plans.txt", "w").write("\n".join(output_files)) + #slide_#open("plans.txt", "w").write("\n".join(output_files)) From 3a9669b51542c354ffe6e89e4bfab92705b91e4f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 15 Oct 2021 00:33:46 -0700 Subject: [PATCH 20/26] stash --- stl/offline/run_2_eval_yahoo_keys.sh | 5 +- wikipedia/notebooks/Wikipedia Plots.ipynb | 101 ++++++++++++++++++++++ wikipedia/run_1_generate_plan.sh | 4 +- wikipedia/run_2_prepare_data.sh | 2 +- wikipedia/run_3_run_predictions.sh | 4 +- 5 files changed, 109 insertions(+), 7 deletions(-) diff --git a/stl/offline/run_2_eval_yahoo_keys.sh b/stl/offline/run_2_eval_yahoo_keys.sh index ed2eab8..a3cb4bc 100644 --- a/stl/offline/run_2_eval_yahoo_keys.sh +++ b/stl/offline/run_2_eval_yahoo_keys.sh @@ -1,6 +1,7 @@ set -ex data_dir="./yahoo_train_data" +results_dir="/data/wooders/stl/results" tmp_script=`mktemp` for key_prio in "lifo" "fifo" @@ -11,8 +12,8 @@ do for slide in 6 12 18 24 48 96 168 192 336 672 do echo \" python evaluation.py --offline-yahoo-csv-path $data \ - --offline-plan-path ./offline_1_slide/plan/${key_prio}_slide_${slide}_plan.json \ - --output-path ./offline_1_slide/single_key/${key_prio}_slide_${slide}_key_${key} \" >> 
$tmp_script + --offline-plan-path ${results_dir}/plan/${key_prio}_slide_${slide}_plan.json \ + --output-path ${results_dir}/single_key/${key_prio}_slide_${slide}_key_${key} \" >> $tmp_script done done done diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index ad9d8b5..6dc27f1 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -908,6 +908,107 @@ "plt.legend()" ] }, + { + "cell_type": "code", + "execution_count": 247, + "id": "aece6567", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.06871169495648626, 'top5': 0.12280371338214406, 'top10': 0.13392345661573715, 'top100': 0.13392345661573715}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.07980004865378126, 'top5': 0.1408997810579843, 'top10': 0.15672010735221414, 'top100': 0.15672010735221414}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.08357464039362479, 'top5': 0.16605849440089146, 'top10': 0.1989547284412741, 'top100': 0.1989547284412741}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "plan-weighted_round_robin_lifo-always_process {'top1': 0.06215912925426309, 'top5': 0.1438268553177798, 'top10': 0.16673336943130007, 'top100': 0.16673336943130007}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "plan-weighted_round_robin_lifo-always_process {'top1': 0.07800299770071646, 'top5': 0.1567436495044377, 'top10': 0.18105484536729682, 'top100': 
0.18105484536729682}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "plan-weighted_round_robin_lifo-always_process {'top1': 0.12434964804482426, 'top5': 0.2397768203969207, 'top10': 0.26929867928526025, 'top100': 0.26929867928526025}\n" + ] + }, + { + "data": { + "text/plain": [ + "{'round_robin': [0.8660765433842629, 0.8432798926477858, 0.801045271558726],\n", + " 'weighted_round_robin': [0.8332666305687,\n", + " 0.8189451546327031,\n", + " 0.7307013207147397]}" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "constants = [0.25]\n", + "policies = [\"lifo\"]\n", + "#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", + "key_policies = [\"round_robin\", \"weighted_round_robin\"]\n", + "replicas = [1, 2, 4]\n", + "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", + "d = artifact_dir\n", + "metric = 'top10'\n", + "d = \"/data/wooders/wikipedia/predictions\"\n", + "\n", + "replica_results = {}\n", + "\n", + "\n", + "for key_policy in key_policies:\n", + " scores = []\n", + " for replica in replicas:\n", + " for policy in policies: \n", + " \n", + " name = f\"plan-{key_policy}_{policy}-always_process\"\n", + " for constant in constants: \n", + " print(f'{d}/{name}-{constant}-100_replicas_{replica}.json')\n", + " with open(f'{d}/{name}-{constant}-100_replicas_{replica}.json') as results_file:\n", + " results = json.load(results_file)\n", + " print(name, results)\n", + " scores.append(1-results[metric])\n", + " replica_results[key_policy] = scores\n", + "replica_results" + ] + }, + { + "cell_type": "code", + "execution_count": 248, + "id": "e1822437", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn\n", + "df = pd.DataFrame({\n", + " 'Model Runtime Const': replicas, \n", + " **replica_results\n", + "})\n", + "fig, ax1 = plt.subplots(figsize=(10, 5))\n", + "tidy = df.melt(id_vars='Model Runtime Const').rename(columns=str.title)\n", + "seaborn.barplot(x='Model Runtime Const', y='Value', hue='Variable', data=tidy, ax=ax1)\n", + "ax1.set(xlabel='Resources', ylabel=f'{metric} Error')\n", + "ax1.legend_.remove()\n", + "plt.legend(loc='lower left')\n", + "seaborn.despine(fig)" + ] + }, { "cell_type": "markdown", "id": "cdf98fa5", diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index 6027c32..9d723a2 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,8 +1,8 @@ set -xe -for replicas in 1 2 4 +for replicas in 1 2 4 6 8 do - for model_runtime in 0.001 0.05 0.01 0.1 1.0 5.0 10.0 + for model_runtime in 0.25 #0.001 0.05 0.01 0.1 1.0 5.0 10.0 do for event_policy in "lifo" #"fifo" do diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh index a7dcd74..a25969f 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -2,7 +2,7 @@ set -xe plan_dir=/data/wooders/wiki-plans -for replicas in 2 +for replicas in 6 8 do for model_runtime in 0.25 do diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh index abbc646..78b92c7 100644 --- a/wikipedia/run_3_run_predictions.sh +++ b/wikipedia/run_3_run_predictions.sh @@ -5,7 +5,7 @@ dpr_dir=~/DPR cd $dpr_dir -for replicas in 1 2 4 +for replicas in 6 8 do for event_policy in "lifo" do @@ -19,7 +19,7 @@ do #plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas} echo $plan_file - 
CUDA_VISIBLE_DEVICES=0,2,3,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + CUDA_VISIBLE_DEVICES=0,1,2,3,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & #pid=$! done From a9a65ca27846e6f9a30e5b6b58cf060aa4595179 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 15 Oct 2021 04:37:37 -0700 Subject: [PATCH 21/26] stash --- wikipedia/notebooks/Wikipedia Plots.ipynb | 42 +++++++++++++++++------ wikipedia/preprocessing/log_data.py | 16 +++++---- wikipedia/run_1_generate_plan.sh | 2 +- wikipedia/run_2_prepare_data.sh | 2 +- wikipedia/run_3_run_predictions.sh | 4 +-- 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 6dc27f1..1d8013b 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -910,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 247, + "execution_count": 257, "id": "aece6567", "metadata": {}, "outputs": [ @@ -924,24 +924,44 @@ "plan-round_robin_lifo-always_process {'top1': 0.07980004865378126, 'top5': 0.1408997810579843, 'top10': 0.15672010735221414, 'top100': 0.15672010735221414}\n", "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", "plan-round_robin_lifo-always_process {'top1': 0.08357464039362479, 'top5': 0.16605849440089146, 'top10': 0.1989547284412741, 'top100': 0.1989547284412741}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.12422408989963196, 'top5': 0.25539311470521303, 'top10': 0.2912321177735402, 'top100': 0.2912321177735402}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.19127213943232024, 'top5': 0.38433348243363075, 'top10': 0.4554621716850688, 'top100': 
0.4554621716850688}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.18883945036921942, 'top5': 0.40024797733675477, 'top10': 0.46685657336127, 'top100': 0.46685657336127}\n", "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_1.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.06215912925426309, 'top5': 0.1438268553177798, 'top10': 0.16673336943130007, 'top100': 0.16673336943130007}\n", "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_2.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.07800299770071646, 'top5': 0.1567436495044377, 'top10': 0.18105484536729682, 'top100': 0.18105484536729682}\n", "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_4.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.12434964804482426, 'top5': 0.2397768203969207, 'top10': 0.26929867928526025, 'top100': 0.26929867928526025}\n" + "plan-weighted_round_robin_lifo-always_process {'top1': 0.12434964804482426, 'top5': 0.2397768203969207, 'top10': 0.26929867928526025, 'top100': 0.26929867928526025}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "plan-weighted_round_robin_lifo-always_process {'top1': 0.14424276667372932, 'top5': 0.29131059161428535, 'top10': 0.34781175695082045, 'top100': 0.34781175695082045}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "plan-weighted_round_robin_lifo-always_process {'top1': 0.18262432218220057, 'top5': 0.3638204204628387, 'top10': 0.4380959107281588, 'top100': 0.4380959107281588}\n", + "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + 
"plan-weighted_round_robin_lifo-always_process {'top1': 0.20202305561441095, 'top5': 0.41511092277389333, 'top10': 0.489213770589574, 'top100': 0.489213770589574}\n" ] }, { "data": { "text/plain": [ - "{'round_robin': [0.8660765433842629, 0.8432798926477858, 0.801045271558726],\n", - " 'weighted_round_robin': [0.8332666305687,\n", - " 0.8189451546327031,\n", - " 0.7307013207147397]}" + "{'round_robin': [0.877196286617856,\n", + " 0.8591002189420157,\n", + " 0.8339415055991085,\n", + " 0.7446068852947869,\n", + " 0.6156665175663693,\n", + " 0.5997520226632452],\n", + " 'weighted_round_robin': [0.8561731446822202,\n", + " 0.8432563504955624,\n", + " 0.7602231796030793,\n", + " 0.7086894083857147,\n", + " 0.6361795795371613,\n", + " 0.5848890772261066]}" ] }, - "execution_count": 247, + "execution_count": 257, "metadata": {}, "output_type": "execute_result" } @@ -951,10 +971,10 @@ "policies = [\"lifo\"]\n", "#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", "key_policies = [\"round_robin\", \"weighted_round_robin\"]\n", - "replicas = [1, 2, 4]\n", + "replicas = [1, 2, 4, 8, 16, 32]\n", "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", - "metric = 'top10'\n", + "metric = 'top5'\n", "d = \"/data/wooders/wikipedia/predictions\"\n", "\n", "replica_results = {}\n", @@ -978,13 +998,13 @@ }, { "cell_type": "code", - "execution_count": 248, + "execution_count": 258, "id": "e1822437", "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAmgAAAFCCAYAAABFMCGEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAApFklEQVR4nO3deVjU5eL+8XsYFsVIDygKqLmkhuG+ZYkaWmShtKpRHjXFUsuOrS4JolY/1DJTE+WYS7RaloppdSxPWaZmWCJi5lIhCAqaicLAML8/LL5xMBkMZj4479d1dV3MzDPP546e8u6zmmw2m00AAAAwDDdnBwAAAEBZFDQAAACDoaABAAAYDAUNAADAYChoAAAABnPZFDSbzabCwkJxUSoAAKjpLpuCZrFYlJqaKovF4uwoAAAAf8tlU9AAAAAuFxQ0AAAAg6GgAQAAGAwFDQAAwGAoaAAAAAZDQQMAADAYChoAAIDBUNAAAAAMhoIGAABgMBQ0AAAAg6GgAQAAGAwF7S9YiqzOjlAj8HsCAKDquTs7gFF5epgVFbPF2TEM740ZfZ0dAQCAyw570AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChogAvjNin24fcEwNG4zQbgwridjH24nQwAR2MPGgAAgMFQ0PC3lBRbnB2hRuD3BACoDA5x4m9xc/fUD3NHODuG4bV+YoWzIwAAahD2oAEAABgMBQ0AAMBgKGgAAAAGQ0EDgApwkYd9+D0BVYeLBACgAlwMYx8uhgGqjsMK2uHDhzVp0iSdOnVK9erVU3x8vJo1a1ZmTG5uriZPnqysrCwVFRXpuuuu0zPPPCN3d3okAABwHQ47xBkbG6uoqCh99NFHioqKUkxMTLkxCQkJatmypdavX6/169dr7969+vjjjx0VEQAAwBAcUtByc3OVlpamiIgISVJERITS0tKUl5dXZpzJZFJ+fr5KSkpksVhUVFSkhg0bOiIiAACAYTjk2GFWVpYaNmwos9ksSTKbzfL391dWVpZ8fX1Lx40bN06PPPKIevXqpXPnzum+++5Tly5dKrWt1NTUKslc2e0CFdm1a5ezI5TDOkdVM+I6D24bIu/aXs6OYXhnzxVqX1rV/BkK+1zsv8GGOrlr06ZNatOmjVauXKn8/HxFR0dr06ZNuuWWW+yeIyQkRF5e/IsI46EMwRUYdZ1HxWxxdgTDe2NGX8P+83NFDjnEGRAQoOzsbFmtVkmS1WpVTk6OAgICyoxLSkrSoEGD5ObmJh8fH4WFhWn79u2OiAgAAGAYDilofn5+Cg4OVnJysiQpOTlZwcHBZQ5vSlLjxo31+eefS5IsFou2bdumVq1aOSIiAACAYTjsKs7p06crKSlJ4eHhSkpKUlxcnCQpOjpae/bskSRNmTJFu3bt0sCBA3X77berWbNmGjx4sKMiAgAAGILDzkFr2bKlVq9eXe79xMTE0p+bNm2q5cuXOyoSAACAIfGoJwAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAYDAUNAAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAVFJscXaEGsFRvyd3h2wFAAAYmpu7p36YO8LZMQyv9RMrHLId9qABAAAYDAUNAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYBxW0A4fPqwhQ4YoPDxcQ4YM0ZEjRy447sMPP9TAgQM
VERGhgQMH6sSJE46KCAAAYAjujtpQbGysoqKiFBkZqbVr1yomJkarVq0qM2bPnj1auHChVq5cqQYNGui3336Tp6enoyICAAAYgkP2oOXm5iotLU0RERGSpIiICKWlpSkvL6/MuBUrVuiBBx5QgwYNJEk+Pj7y8vJyREQAAADDcMgetKysLDVs2FBms1mSZDab5e/vr6ysLPn6+paOO3jwoBo3bqz77rtPZ8+e1U033aSxY8fKZDLZva3U1NQqydylS5cqmQf4w65du5wdoRzWOaoa6xyuoKrW+cXWpsMOcdrDarVq//79Wr58uSwWi0aPHq3AwEDdfvvtds8REhLCXjcYEn9IwBWwzuEKHLHOHXKIMyAgQNnZ2bJarZLOF7GcnBwFBASUGRcYGKhbbrlFnp6euuKKK9SvXz99//33jogIAABgGA4paH5+fgoODlZycrIkKTk5WcHBwWUOb0rnz03bunWrbDabioqK9PXXX+uaa65xREQAAADDcNhtNqZPn66kpCSFh4crKSlJcXFxkqTo6Gjt2bNHknTbbbfJz89Pt956q26//XZdffXVuvvuux0VEQAAwBAcdg5ay5YttXr16nLvJyYmlv7s5uamyZMna/LkyY6KBQAAYDg8SQAAAMBgKGgAAAAGQ0EDAAAwGAoaAACAwdhV0EpKSqo7BwAAAH5XYUGzWq3q2LGjLBaLI/IAAAC4vAoLmtlsVrNmzXTy5ElH5AEAAHB5dt0HbeDAgXrooYf0z3/+U40aNSrzWc+ePaslGAAAgKuyq6C9+eabkqQFCxaUed9kMmnz5s1VnwoAAMCF2VXQPv300+rOAQAAgN/Z/ain4uJipaSkKDs7W40aNVLHjh3l7u6wJ0UBAAC4DLsa1sGDBzV27FgVFBQoICBAWVlZ8vLyUkJCglq2bFndGQEAAFyKXQUtLi5OgwcP1qhRo2QymSRJy5Yt0/Tp0/Xaa69Va0AAAABXY9eNatPT0zVy5MjSciZJw4cPV3p6erUFAwAAcFV2FTR/f3/t2LGjzHvffPON/P39qyUUAACAK7PrEOfEiRM1btw49e3bV4GBgcrMzNSWLVs0Z86c6s4HAADgcuzag3bjjTfq/fffV6tWrZSfn69WrVppzZo16t+/f3XnAwAAcDkV7kGzWq3q1KmTvvnmG40bN84RmQAAAFwaz+IEAAAwGJ7FCQAAYDA8ixMAAMBgKixoJSUlevbZZ9WlSxd5eno6IhMAAIBLq/AcNDc3N40bN45yBgAA4CB23WajW7du2r17dzVHAQAAgGTnOWiBgYGKjo5Wv3791KhRozKPfHr00UerLRwAAIArsqugFRYWlt6UNjs7u1oDAQAAuDq7Ctrzzz9f3TkAAADwu4ueg7Zx48Yyrw8dOlTm9YoVK6o8EAAAgKu7aEGbOnVqmddDhw4t8/rll1+u+kQAAAAu7qIFzWazVeo1AAAA/r6LFrQ/X61pz2sAAAD8fRVeJGCz2Ur/utBrAAAAVK2LFrSzZ8+qbdu2pa9tNlvpa5vNxh40AACAanDRgsaD0AEAABzvogUtKCjIUTkAAADwO7uexQkAAADHoaABAAAYDAUNAADAYChoAAAABlPhfdDefvttvf/++zpw4IDOnj0rb29vtWrVSnfeeacGDx7siIwAAAAu5aIFbc6cOdqyZYtGjhypa665Rj4+Pjpz5oz27dunFStW6JdfftHjjz/uqKwAAAAu4aIF7b333tO6devk7+9f5v1rr71WoaGhGjRoEAUNAACgilXqYekAAACofhfdg3b33Xdr+PDheuCBB9SmTZvSQ5zp6elasWKF7rnnHkflBAAAcBkXLWhPPvmkmjRpovfee08//vhj6UUCV199tYYNG6ahQ4c6KicAAIDLqPAqzqFDh1LEAAAAHOhv3QctMzOzqnIAAADgd5dc0CwWi/r161eVWQAAAKAKDnHu3LnzLz+zWCxVHgYAAAAVFLRhw4apQYMGcnPjiVAAAACOctG
CFhgYqLlz56pz587lPissLFTHjh3t3tDhw4c1adIknTp1SvXq1VN8fLyaNWt2wbGHDh3SHXfcoaioKD399NN2bwMAAOBycNFdYyEhIUpNTb3gZyaTSQEBAXZvKDY2VlFRUfroo48UFRWlmJiYC46zWq2KjY1V//797Z4bAADgcnLRgvbCCy/o3nvvveBnnp6e+vTTT+3aSG5urtLS0hQRESFJioiIUFpamvLy8sqNXbp0qfr27fuXe9cAAAAudxctaB4eHvLw8PjbG8nKylLDhg1lNpslSWazWf7+/srKyiozLj09XVu3btWIESP+9jYBAABqqgpvVCudv2Jz8eLF2rBhg3JycuTv769bb71VY8eOlZeXV5UEKSoq0rRp0/T888+XFrlL8VeHZCurS5cuVTIP8Iddu3Y5O0I5rHNUNdY5XEFVrfOLrU27Ctr06dN1+PBhTZ06VUFBQTp69KiWLl2q7OxsPf/88xV+PyAgQNnZ2bJarTKbzbJarcrJySlzDtvx48f1888/a8yYMZKk06dPy2az6cyZM5o5c6Y9MSWdP2+uqkojUJX4QwKugHUOV+CIdW5XQdu8ebM++eQTXXnllZKkq6++Wh06dNDNN99s10b8/PwUHBys5ORkRUZGKjk5WcHBwfL19S0dExgYqO3bt5e+XrBggc6ePctVnAAAwOXYdYOz+vXr69y5c2XeKywsVIMGDeze0PTp05WUlKTw8HAlJSUpLi5OkhQdHa09e/ZUIjIAAMDlza49aJGRkRo9erSGDRumhg0b6tixY3r99dcVGRmpbdu2lY7r2bPnX87RsmVLrV69utz7iYmJFxz/yCOP2BMNAADgsmNXQXvrrbckSQkJCeXe/+Mzk8mkzZs3V3E8AAAA12NXQbP3fmcAAAD4++wqaJJUXFyslJQUZWdnq1GjRurYsaPc3e3+OgAAAOxkV8M6ePCgxo4dq4KCAgUEBCgrK0teXl5KSEhQy5YtqzsjAACAS7GroMXFxWnw4MEaNWqUTCaTJGnZsmWaPn26XnvttWoNCAAA4Grsus1Genq6Ro4cWVrOJGn48OFKT0+vtmAAAACuyq6C5u/vrx07dpR575tvvpG/v3+1hAIAAHBldh3inDhxosaNG6e+ffsqMDBQmZmZ2rJli+bMmVPd+QAAAFyOXXvQ+vXrpzVr1qhVq1bKz89Xq1attGbNGvXv37+68wEAALgcu/agLVu2TKNGjdK4cePKvL98+XKNHDmyWoIBAAC4Krv2oC1atOiC7y9evLhKwwAAAKCCPWh/PGezpKREX3/9tWw2W+lnGRkZqlOnTvWmAwAAcEEXLWhTp06VJBUWFmrKlCml75tMJjVo0EDPPPNM9aYDAABwQRctaH88g/Opp57S7NmzHRIIAADA1dl1DhrlDAAAwHHsKmgAAABwHAoaAACAwVDQAAAADIaCBgAAYDAUNAAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAYDAUNAAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAYDAUNAAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAYDAUNAAAAIOhoAEAABgMBQ0AAMBgKGgAAAAGQ0EDAAAwGAoaAACAwVDQAAAADIaCBgAAYDDujtrQ4cOHNWnSJJ06dUr16tVTfHy8mjVrVmbMokWL9OGHH8psNsvd3V0TJ05UaGiooyICAAAYgsMKWmxsrKKiohQZGam1a9cqJiZGq1atKjOmffv2euCBB1S7dm2lp6fr/vvv19atW1WrVi1HxQQAAHA6hxzizM3NVVpamiIiIiRJERERSktLU15eXplxoaGhql27tiSpTZs2stlsOnXqlCMiAgAAGIZDClpWVpYaNmwos9ksSTKbzfL391dWVtZffueDDz5Q06ZN1ahRI0d
EBAAAMAyHHeKsjB07dmj+/Pl69dVXK/3d1NTUKsnQpUuXKpkH+MOuXbucHaEc1jmqGuscrqCq1vnF1qZDClpAQICys7NltVplNptltVqVk5OjgICAcmNTUlL05JNP6pVXXlGLFi0qva2QkBB5eXlVRWygSvGHBFwB6xyuwBHr3CGHOP38/BQcHKzk5GRJUnJysoKDg+Xr61tm3Pfff6+JEyfq5Zdf1rXXXuuIaAAAAIbjsPugTZ8+XUlJSQoPD1dSUpLi4uIkSdHR0dqzZ48kKS4uTgUFBYqJiVFkZKQiIyO1f/9+R0UEAAAwBIedg9ayZUutXr263PuJiYmlP7/33nuOigMAAGBYPEkAAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMBgKGgAAgMFQ0AAAAAzG3dkBHKGoqEgZGRkqKCio1PfGDqhbTYkuH/v27VNR6ChnxzAum2SyFionJ0f169eXmxv/TwQAqJhLFLSMjAz5+PioWbNmMplMdn/v0NHfqjHV5aFFkI8Kjh12dgzDstlsKi6x6VRhoTIyMtS0aVNnRwIA1AAu8b/zBQUF8vPzq1Q5A6qCyWSSh9lNQUFBys/Pd3YcAEAN4RIFTRLlDE7FoU0AQGW4xCHO/2UpssrTw1zhuBZBPpWa91xhsbJOnLvUWAAAAJJctKB5epgVFbOlyud9Y0bfKp8TAAC4Ho67uIgBYV117tzZKpnrk03rNWv6Uxf8bMuX2/Ti4sQq2Q4AAK7KJfegGYnVWiyz2Vj/GEpKSmQymS7pvL2+N/RU3xt6VkMqAABch7GagYsYENZVo8ZM0I7tWxXSrpMG3j5YC+Y9r6ysDMlm011Dhqn/zRGlY9ds+Fy1a3uXez0grKuGjxqnr7Zu0W+nf9WoByeoV+9+kqQvP/9UK5Ytko9PXXXrcX2FmZJWLFFmZoYKzp1VVmaGZr+UqO3bPtd7b78mmUwKCGisCY9NUb1/+EqSzuaf0azYJ3UiJ1NXetfSs1OfVMMG9bV248f6fNsOvTDjGe1M+U5zFi5RSHAbfb93n0wmk+JjJqtFM241AQDAxXCI00lKbCWaPW+p/vnAWCUsnKtmzVtq8b/f0rOzF+nVpQt05PCPds3j7V1HLy9epScmxylhwVxJ0qmTeZr/4rOKnfmCXlz4qtw9POyaK/X7b/XoE9O0eNnbyj2Ro+WJC/Xs7EVa/O+3zudbMKd07N4932nEqPFat26dunZsp9kLEi4458HDP+meQbfp3eUJuvnG3kp87U27sgAA4MooaE7SPzyi9OeUb3dowMA7JUm+fvXV/bpe+i7lG7vm6RMWLkm6JridcnOPy2IpVHraHl3dqo0aN20mSRoQcaddc3XrcYPq1q0nSfou5Rt163GDfP3qn59j4J1K+XZH6dhr23Uonf+O227Rjm93X3DOq5o2VnDrqyVJ7dteo18ys+zKAgCAK6OgOckfhyz/YFLZ873+OP/Lzc2skpISSZLFUlhuHk9PT0mS2Xz+tiFWq1U22S4pU61aFWT6qy/abH95vprX7/mk8/cCs1qtl5QNAABX4pLnoFmKrNVyS4xzhcWX9L1Onbtr44b3NWzEg8rLO6Gd27/UHXdFSZICAoP0w/40dercXZ9t3mTXfMFt2+ulOTN1NONnBTVuqk0bPqh0po6dumn1myuVl3dCvr71tWnDB+rYpXvp52mp3+loxs9qEXSt1m76RN06daj0NgAAwIW5ZEGz5ya1kuOexfnQw09owbznNHb0UMlm08joh3VV85aSpDHjHtOCF58rPfRpj3r/8NWEx6Zq+tSJ8vGpq9C+/Sud6armLTUierymPjn+94sEgvTIxCmln7fr0EVJK5ZozrNHSi8SAAAAVcNks9ku7XiYwRQWFio1NVUhISHy8vIq89m+ffsUHBxc6Tl5WHrFeFi6fWo1an7J67C6VcdNmy8
3b8zoqx/mjnB2DMNr/cQKZ0f4S6zzirHO7eOodc45aAAAAAbjkoc4XdWpk3ma+tTD5d6/PvRG3ffPaCckAgAAF0JBcyH1/uGrRYlvODsGAACoAIc4AQAADIaCBgAAYDAUNAAAAINxyXPQSootcnP3rHBciyCfSs1bVFioX05YLjUWAACAJBctaG7untVyr5fz90apvoKWe+K4Zj/3jOJfXFLh2AFhXbVmw+flHiklSUkrlmjIfQ/Iw86HqP/ZUxPH6K7Bw9SjZ2ilv1tddqZ8pxcX/1tvLl1QJfNNe36u2rZprXvvHFTus0XLVqll86t0S1ifKtkWAAAXwiHOGsSvfgO7yllFXl+VqOLioipIVDEjPnuzuPjSM40f9U/KGQCg2rnkHjRn+nD9ezp86EeNf/Rp7d+Xqn+NH6GXXlmpNtdcq4Uv/T+1uLq1WrRsreWJC3Q2P1+SNGzkQ+p+XS9lH8vUhIeG6e0PNkuStn6+WSuXvSJPLy+F9umvlcteKbPXbO2at/TV1i367fSvGvXgBPXq3U+L5sdLkh575AG5mdwUP2+JTCaTEhfP0+GDB2QpsqhDx66KHjtRZrNZPx05pHmz41RcXKymzVrIYrn4HsKdKd9pzsIl6tQuRHv3/6DoYffK7x/1FP/yYp0rKFDtWrX09ISxCgluU27P159f/zFPSHAbfb93n0wmk+JjJqtFs6aSpIX/XqFNn/5X/vXrKyS4dYW/91GPPqkO17bVnn375eXpofnPTddLS17Vlzu+kSTd0L2r/vXgA6UPnf/hx0Ma89gkHcs5ri7t22nKxPHy8PAos3dt8fLXdOTnDJ3JP6uMrCw1CQzQnLipql2rVmWXBQAAZbAHzcE6du6u3d/ukCTtTtmp4Lbt9d23O8+//naHWrUO1oJ5z+npqc9qwZIkxT33kl5+8TmdOVP2sVOnTubp5Ref0/Rn52nR0jfk5elVblve3nX08uJVemJynBIWzJUkjX/0aUnSiwte1aLEN3TFFT5KXDxP7dp31vzFq7Ro6Rs6dTJPH29cJ0ma+3yMbou8RwuXvq6Btw/Wgf1pFf49Hjh0RLf276ukxS/p+m6d9XjMLI0f9U+9uzxBD48ersdjZqmoqOI9eAcP/6R7Bt2md5cn6OYbeyvxtTclSVu+/Fpbvvxa7/z7FSXO+386/FNGhXNJ0o+Hj2jxnGe1MH6m3lu/Uft/PKi3Exfq7cSFSj9wUO+t31g6ds++dL00K1ZrVixVZnaO3v3TZ3+Wtv+Anp/2tD5YlaiiYqs+/OQzu7IAAHAxFDQHCwxqIkthoY4fz9bub3dqRPR47U7ZoeM5x1RUVKSTebnKzsrUtEkTND46StMmTZDJZFLm0V/KzJOetkdXt2qjoMbn9yjdPCCy3Lb6hIVLkq4Jbqfc3OOyWAovmOnrrz7Xu++8pvHRUXrkwfv144F0Hc34Wfn5Z3TkyEH1u+lWSVJw23Zq1vzqCv8emzYOVIeQtpKkIz9nyMPDXdd17SxJ6tGlkzw83HXk54pL1VVNGyu49fnttW97jX7JzJJ0fk9beFgfeXvXltls1h23hVc4lyQN6H+j3N3P7yH7eleKBt1ykzw8POTh4aHIATfp610ppWP/mN/d3axBt/TXjm93X3DOnt276EqfK2QymdSubZvSjAAA/B0c4nSCDp26aefXW3XqZK7ad+iiV+bHa8fXW9WhU1dJUvMWrTRnfmK572Ufyyz92SabJNNFt+Ppef5K1T8O2/3V+WA2m00xM+YqILBxmffz88/IVME2LsS7du0yOS84h8kks9mskpKS0rcslrJ71bw8/+9KWzc3t799PluZXDabTKayuf739cXGXiij2c1NhVau4gUA/H0uWdBKii3V8jT6osIL76H6Xx07d9PKVxera/eekqS2IR30zpsrNXzUOAVf215Hj/6s71K+KS1s+9P3qnWbtmXmuCa4nebNmanMo78oMKiJPvlovd05a3vXUf6ZM6Xnql13fW+
98+ZKPfyvSTKbzfr111M6dzZfjQKC1Kx5S23ZvElhN92q/ftSdeTwj3ZvR5KaN20iS1GRdnz7nbp37qAd336n4mKrmjUJUt6pX3U065hO//abfK64Qhs3b7Frzu6dO2rhv1fovrvvkJenh9Zu/LhSmSSpZ9fOWrfxE918Y29J0vpN/1G/Pr1KP/9kyxe67+475OnhoQ2ffKrePXtUehsAAFwqlyxo9twDTZIOHf2t4kGXoEOnbsrJjlHHTt0lnT8vbWPy++rYqZt8fK5U7KwXtWzJfC1Z9IKKi4vUKCBI05+dV2aOf/j66ZF/TVbM5EdVt2499ejZW+7u7vLyqvgE9bvuuU+THn9IXl61FD9viR4c/7iWLXlZ46PvlclkkoeHp8aMf1yNAoL0+KQ4zZsdpzWrX1er1sG6JjikUn+vHh4eemHGM2UuEpgbN1UeHh5q2KC+hg2+U0OjH1FQQCNde01rHTzyU4Vz9rm+h77fu09DRo1Tg/p+6tapvXJO5FYq110DB+jno5kaMnq8JOn6bl10V8QtpZ93bh+iiVPjlJWToy7t2+nugQMqNT8AAH+HyWaz2ZwdoioUFhYqNTVVISEh8vIqe8L8vn37FBwcXOk5q6ugVZWzZ/Pl7V1HkvTxxnX6aONavfDyModmaBHko4Jjhx26zZqoVqPml7wOq1tUzBZnRzC8N2b0rZZ7J15uquPIRFVhnVeMdW4fR61zl9yDdrlYu+Ytbf3vZlmtxfLxqatHH3/G2ZEAAEAVoKDVYPfeP0r33j/KKduePnWijudky9PDTbbi8yfGN/JvoJefj3NKHkn64usdWpC4otz7j0SPUOh13R0fCACAS0RBwyX545w4Ix3iDL2uO0UMAHBZcJn7oF0mp9qhhvrz7UQAAKiISxS0WrVqKTc3l5IGh7PZbCqylujo0aOqU6eOs+MAAGoIlzjE2bhxY2VkZOj48eOV+t6JUwXVlOjyUXi6lopOn3B2DOOySabiQjVs3V7169d3dhoAQA3hEgXNw8NDzZs3r/T3uCy7Ym/M6MRl2Xbw77XC2REAADWIww5xHj58WEOGDFF4eLiGDBmiI0eOlBtjtVoVFxen/v3766abbtLq1asdFQ8AAMAwHFbQYmNjFRUVpY8++khRUVGKiYkpN2b9+vX6+eef9fHHH+vtt9/WggULlJFR8UO1AQAALicOOcSZm5urtLQ0LV++XJIUERGhmTNnKi8vT76+vqXjPvzwQ91zzz1yc3OTr6+v+vfvr02bNmn06NEVbuOPCwAslqp7WPWVtSv/oHBXU1hYqJJaPs6OYXiFdj6n1RlY5xVjnduHdV6zsc7tU9Xr3NPTUyZT+fXpkIKWlZWlhg0bymw2S5LMZrP8/f2VlZVVpqBlZWUpMDCw9HVAQICOHTtm1zaKiookST/88EOV5Y6++Yoqm+tylZqaKvUc4ewYhpeamursCH+JdV4x1rl9WOc1G+vcPlW9zi/0iErpMrpIoE6dOmrdurU8PDwu2EQBAACMxtPT84LvO6SgBQQEKDs7W1arVWazWVarVTk5OQoICCg3LjMzU+3bt5dUfo/axbi5ucnHh12zAACg5nPIRQJ+fn4KDg5WcnKyJCk5OVnBwcFlDm9K0i233KLVq1erpKREeXl5+s9//qPw8HBHRAQAADAMk81Bt9c/ePCgJk2apNOnT+vKK69UfHy8WrRooejoaE2YMEHt2rWT1WrVjBkz9OWXX0qSoqOjNWTIEEfEAwAAMAyHFTQAAADYxyWexQkAAFCTUNAAAAAMhoIGAABgMBQ0AAAAg6Gg4ZLEx8crLCxMbdq0qdKnNwBGcfLkSUVHRys8PFwDBw7Uww8/rLy8PGfHAqrNwoUL+W+6gVDQcEn69eun119/XUFBQc6OAlQLk8mk0aNH66OPPtL69evVpEkTzZ0719mxgGqxd+9e7d692+6bw6P6UdBwSbp27VruSRDA5aRevXrq0aNH6euOHTsqMzPTiYmA6mG
xWDRjxgzFxsbyqEQDoaABQAVKSkr05ptvKiwszNlRgCo3f/58DRo0SE2aNHF2FPwJBQ0AKjBz5kx5e3vr/vvvd3YUoEqlpKRoz549ioqKcnYU/A8KGgBcRHx8vH766Se99NJLcnPjP5m4vOzcuVOHDh1Sv379FBYWpmPHjmnUqFHaunWrs6O5PHdnBwAAo5o3b55SU1O1dOlSeXp6OjsOUOXGjBmjMWPGlL4OCwtTQkKCWrdu7cRUkChouESzZs3Sxx9/rBMnTmjkyJGqV6+eNmzY4OxYQJU5cOCAEhIS1KxZMw0dOlSS1LhxYy1atMjJyQC4Ah6WDgAAYDCcUAEAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMBjugwagRgkLC9OJEydkNpvl7e2t0NBQTZs2TXXq1HF2NACoMuxBA1DjJCQkKCUlRR988IHS0tK0dOlSZ0eSJBUXFzs7AoDLBAUNQI3VoEED9erVS/v27ZMk7d69W0OHDlXXrl01aNAgbd++vXTsmjVr1K9fP3Xq1ElhYWFat26dJKmkpESvvPKKbrzxRvXs2VNPPfWUfvvtN0nS9u3b1bt37zLbDAsL01dffSVJWrBggSZMmKAnnnhCnTt31vvvv69Tp05p8uTJ6tWrl7p166Zx48aVfvezzz5TZGSkunbtqqFDhyo9Pb30s6VLlyo0NFSdOnVSeHi4tm3bVj2/NAA1Aoc4AdRYx44d0xdffKEePXooOztbDz74oGbPnq3Q0FBt27ZNEyZM0MaNG1WrVi3NmjVL7777rlq0aKGcnBz9+uuvks4Xt/fff1+rVq2Sr6+vnn76ac2YMUNz5syxK8PmzZs1f/58zZ49WxaLRRMmTJC3t7c2bNggb29vpaSkSJL27t2rKVOmKCEhQSEhIVq3bp3GjRunTZs2KSMjQ6+//rreffddNWzYUBkZGSopKam23xsA42MPGoAaZ/z48erUqZP69OkjX19fTZgwQWvXrlXv3r3Vp08fubm56YYbblBISIj++9//SpLc3Nx04MABFRQUyN/fX61atZIkrV+/XiNGjFCTJk1Up04dPfbYY/rwww/tPlzZsWNH9e/fX25ubjp9+rQ+//xzxcXFqW7duvLw8FD37t0lSe+8846GDBmiDh06yGw264477pCHh4d2794ts9ksi8WigwcPqqioSI0bN1bTpk2r55cHoEagoAGocRYtWqSUlBS99tprOnTokE6ePKnMzExt2rRJXbt2Lf1r165dOn78uLy9vTVv3jy99dZb6tWrl8aMGaODBw9KknJychQUFFQ6d1BQkIqLi5Wbm2tXlkaNGpX+fOzYMdWtW1d169YtNy4zM1PLly8vk+/YsWPKycnRVVddpSlTpmjBggW6/vrrNXHiRGVnZ//N3xKAmoxDnABqrO7du+vOO+9UfHy8OnTooMjISM2aNeuCY0NDQxUaGqqCggK99NJLmjZtmt544w35+/vr6NGjpeMyMzPl7u4uPz8/ZWdnq6CgoPQzq9WqvLy8MvOaTKbSnxs1aqRff/1Vp0+f1pVXXllmXEBAgB566CGNHTv2gvkGDhyogQMH6syZM4qJidHcuXPtPswK4PLDHjQANdrw4cP11VdfqUuXLvrss8/0xRdfyGq1qrCwUNu3b9exY8d04sQJbd68WWfPnpWnp6e8vb1lNpslSREREVq5cqV++eUX5efna968eRowYIDc3d3VvHlzFRYWasuWLSoqKtLixYtlsVj+Mou/v7969+6tuLg4/frrryoqKtLOnTslSffcc4/eeustfffdd7LZbDp79qy2bNmiM2fO6NChQ9q2bZssFos8PT3l5eVVmg+Aa6KgAajRfH19FRkZqZUrV+qVV17RkiVL1LNnT/Xp00fLli1TSUmJSkpKtHz5coWGhqp79+7auXOnYmNjJUl33XWXBg0apPvvv1/9+vWTp6enpk2bJkny8fFRbGysnnnmGfXu3Vu1a9cuc0j
zQmbPni13d3cNGDBA119/vVauXClJateunWbOnKkZM2aoW7duuvnmm7VmzRpJksVi0QsvvKAePXqoV69eysvL08SJE6vxtwbA6Ew2m83m7BAAAAD4P+xBAwAAMBgKGgAAgMFQ0AAAAAyGggYAAGAwFDQAAACDoaABAAAYDAUNAADAYChoAAAABkNBAwAAMJj/D0/Uwj6x7PAwAAAAAElFTkSuQmCC\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmgAAAFCCAYAAABFMCGEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAr/klEQVR4nO3de1xUdf7H8fcwXBTXRFAENa+p4Q0JxSxRU0vrh2J3l2pzNd3UoqW1vJQgmhVoa97vixq1peUlyeyiWdkqmVlpYJZSiSIoihdQLsP8/rBmYzEcDGaOzOv5eOzjMXPOme/5zHd1fPf9nnO+JqvVahUAAAAMw83ZBQAAAKAsAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAg6kxAc1qtaqwsFDclAoAAK52NSagFRUVad++fSoqKnJ2KQAAAH9IjQloAAAANQUBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAbjcgGtqNji7BIuyah1AQAAx3N3dgGO5ulhVlTsNmeXUc5rU/s4uwQAAGAQLjeCBgAAYHQENAAAAIMhoAEAABgMAQ0AAMBgCGiodka9Q9WodQEA4HJ3cRpVaUmR3Nw9nV1GGVVVE3fOAgBQOQQ0g3Bz99SBmcOcXUYZbcetcHYJAAC4JKY4AQAADIaABpdVWlLk7BLKMWJNAADHY4oTLotpZQCAUTGCBgAAYDAENKAGMuojRIxaFwAYDVOcQA3Eo00A4OrGCBoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGIzD7uLMyMjQhAkTlJeXJx8fHyUkJKhFixZljsnNzdXEiROVlZWl4uJi3XjjjXr22Wfl7s7NpgAAwHU4bAQtLi5OUVFReu+99xQVFaXY2NhyxyxatEitW7fWxo0btXHjRn377bd6//33HVUiAACAITgkoOXm5iotLU0RERGSpIiICKWlpenkyZNljjOZTMrPz1dpaamKiopUXFysRo0aOaJEAAAAw3DI3GFWVpYaNWoks9ksSTKbzfL391dWVpZ8fX1tx40ZM0aPP/64evbsqfPnz+uBBx5QaGhopc61b9++CvdXtj1Xt3v37j/cBn1eOTW9z6vi+wFATVDRb7WhLu7avHmz2rVrp5UrVyo/P18jR47U5s2bNXDgQLvb6Nixo7y8vKqxStdi5H/oa6qa3uc1/fsBQFVwyBRnYGCgsrOzZbFcXIfPYrEoJydHgYGBZY5LTk7W4MGD5ebmprp166pv375KTU11RIkAAACG4ZCA5ufnp6CgIKWkpEiSUlJSFBQUVGZ6U5KaNm2qTz75RJJUVFSkHTt2qE2bNo4oEQAAwDAcdhfnlClTlJycrAEDBig5OVnx8fGSpJEjR2rv3r2SpEmTJmn37t0aNGiQhgwZohYtWui+++5zVIkAAACG4LBr0Fq3bq01a9aU27506VLb62bNmikpKclRJQEAABgSKwkAAAAYDAENAADAYAhoAAAABkNAAwAAMBgCGgAAgMEQ0AAAAAyGgAbAYUpLipxdQjlGrAkADLUWJ4Cazc3dUwdmDnN2GWW0HbfC2SUAQDmMoAEAABgMAQ0AAMBgCGgAAAAGQ0ADgCpQVGxxdgmXZNS6AFSMmwQAoAp4epgVFbvN2WWU89rUPs4uAcAVYAQNAADAYAhoAAAABkNAAwAAMBgCGgAAgMEQ0AAAAAyGgAYAAGAwBDQAAACDIa
ABAAAYDAENAGqw0pIiZ5dQjhFrAoyGlQQAoAZzc/fUgZnDnF1GGW3HrXB2CYDhMYIGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAVKHSkiJnl1COEWtCxdydXQAAADWJm7unDswc5uwyymg7boWzS0AlMYIGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaACAq1JRscXZJQDVxmF3cWZkZGjChAnKy8uTj4+PEhIS1KJFi3LHbdq0SQsXLpTVapXJZFJSUpIaNGjgqDIBAFcJTw+zomK3ObuMcl6b2sfZJaAGcFhAi4uLU1RUlCIjI7VhwwbFxsZq1apVZY7Zu3ev5s2bp5UrV6phw4Y6e/asPD09HVUiAACAIThkijM3N1dpaWmKiIiQJEVERCgtLU0nT54sc9yKFSs0fPhwNWzYUJJUt25deXl5OaJEAAAAw3DICFpWVpYaNWoks9ksSTKbzfL391dWVpZ8fX1txx08eFBNmzbVAw88oIKCAt16660aPXq0TCaT3efat29fhftDQ0Ov7Eu4qN27d//hNujzyqHPHY8+dzz63PGqos+D2neUd23jDZwUnC9UelrF//4bUUV/hg21koDFYtF3332npKQkFRUV6ZFHHlHjxo01ZMgQu9vo2LEjo25ViB9Ax6PPHY8+dzz63PGqqs+Net1fTfsz5ZApzsDAQGVnZ8tiuXjHjcViUU5OjgIDA8sc17hxYw0cOFCenp7605/+pH79+umbb75xRIkAAACG4ZCA5ufnp6CgIKWkpEiSUlJSFBQUVGZ6U7p4bdr27dtltVpVXFysnTt36vrrr3dEiQAAAIbhsOegTZkyRcnJyRowYICSk5MVHx8vSRo5cqT27t0rSfq///s/+fn56Y477tCQIUN03XXX6Z577nFUiQAAAIbgsGvQWrdurTVr1pTbvnTpUttrNzc3TZw4URMnTnRUWQAA4CpXWlIkN3djPZbrj9ZkqJsEAAAAKsvN3VMHZg5zdhlltB234g99nqWeAAAADIaABgAAYDAENAAAAIMhoAEAABgMAQ0AAMBgLhvQLBaL3nzzTRUVFTmiHgAAAJd32YBmNpv14osvytPTWM8XAQAAqKnsmuK85ZZbtHXr1uquBQAAALLzQbWFhYWKjo5WSEiIAgICZDKZbPsSExOrrTgAAABXZFdAa9u2rdq2bVvdtQAAAEB2BrTHHnusuusAAADAL+xei3Pnzp3asGGDcnJy5O/vr8GDB6tHjx7VWRsAAIBLsusmgTVr1igmJkYNGzbUrbfeKn9/f40bN06rV6+u7voAAABcjl0jaMuWLVNSUpKuv/5627bbb79d0dHRuu+++6qtOAAAAFdk1whaXl6eWrduXWZbq1atdPr06WopCgAAwJXZFdBuuOEGvfjiizp//rwkqaCgQImJiQoJCanW4gAAAFyRXVOc8fHx+sc//qGuXbuqXr16On36tEJCQvTSSy9Vd30AAAAu57IBzWKx6JNPPtGyZct06tQp212cAQEBjqgPAADA5di9FqeXl5cCAgLUuXNnwhkAAEA1Yi1OAAAAg2EtTgAAAINhLU4AAACDsesmgcOHD2vatGny9PR0RE0AAAAuza6bBD777LMy05oAAACoPnbdJPDwww9r7ty5Ki4uru56AAAAXJ5d16AlJyfrxIkTSkpKkq+vb5nRtG3btlVXbQAAAC7JroA2Y8aM6q4DAAAAv7AroIWFhVV3HQAAAPhFhdegjR49usz7OXPmlHl/9913V31FAAAALq7CgJaamlrmfXJycpn3hw4dqvqKAAAAXJxdd3H+ymq1lnnPozcAAACqXqUCGoEMAACg+lV4k0BJSYneeust28hZUVGR3n
zzTdt+i8VSvdUBAAC4oAoDWnBwsNavX29736lTJ23YsMH2vnPnztVWGAAAgKuqMKC98sorjqoDAAAAv6jUNWgAAACofgQ0AAAAgyGgAQAAGAwBDQAAwGAqHdDy8/N17ty56qgFAAAAukxAW7hwoe31qVOnNGLECIWGhqpbt24aNmyYcnNzq71AAAAAV1NhQFu6dKntdWJiourUqaPt27fr008/Vf369TVjxoxqLxAAAMDVVPgctN+uvbljxw6tXbtWvr6+kqTY2FgNHjy4eqsDAABwQRUGNJPJJKvVqtLSUlmtVvn4+Nj2+fj4cC0aAABANagwoBUUFKh9+/ayWq0ymUxKT09Xhw4dJEk//vijbTQNAAAAVafCgLZly5Yy7+vXr297ffbsWT355JPVUxUAAIALqzCgNWnS5Hf3de7cmcXSAQAAqkGFAe1XRUVFWrhwod555x3l5OTI399fd9xxh0aPHi0vL6/qrhEAAMCl2BXQpkyZooyMDD3zzDNq0qSJjhw5oiVLlig7O1svvPBCddcIAADgUuwKaFu2bNEHH3yga665RpJ03XXXKTg4WLfddpvdJ8rIyNCECROUl5cnHx8fJSQkqEWLFpc89tChQ7rzzjsVFRWl8ePH230OAACAmsCupZ4aNGig8+fPl9lWWFiohg0b2n2iuLg4RUVF6b333lNUVJRiY2MveZzFYlFcXJz69+9vd9sAAAA1iV0jaJGRkXrkkUf00EMPqVGjRjp27JheffVVRUZGaseOHbbjevToccnP5+bmKi0tTUlJSZKkiIgITZs2TSdPniz3qI4lS5aoT58+KigoUEFBwZV+LwAAgKuWXQHt9ddflyQtWrSo3PZf95lMpnKP5fhVVlaWGjVqJLPZLEkym83y9/dXVlZWmYC2f/9+bd++XatWrdKCBQsq/20k7du3r8L9oaGhV9Suq9q9e/cfboM+rxz63PHoc8ejzx2PPne8y/V5Rf1pV0DbunVr5Sq6AsXFxZo8ebJeeOEFW5C7Eh07duTO0irEX0bHo88djz53PPrc8ehzx/sjfW5XQJOkkpIS7dmzR9nZ2QoICFCXLl3k7m7fxwMDA5WdnS2LxSKz2SyLxaKcnBwFBgbajjl+/Lh+/vlnjRo1SpJ05swZWa1WnTt3TtOmTavk1wIAALh62ZWwDh48qNGjR+vChQsKDAxUVlaWvLy8tGjRIrVu3fqyn/fz81NQUJBSUlIUGRmplJQUBQUFlZnebNy4sVJTU23v586dq4KCAu7iBAAALseuuzjj4+N133336eOPP9Ybb7yhTz75REOHDtWUKVPsPtGUKVOUnJysAQMGKDk5WfHx8ZKkkSNHau/evVdUPAAAQE1k1wja/v37lZSUJJPJZNv28MMPl7tpoCKtW7fWmjVrym1funTpJY9//PHH7W4bAACgJrFrBM3f31+ff/55mW1ffPGF/P39q6UoAAAAV2bXCFpMTIzGjBmjPn36qHHjxjp69Ki2bdumGTNmVHd9AAAALseuEbR+/fpp7dq1atOmjfLz89WmTRutXbuWp/0DAABUA7tG0JYvX64RI0ZozJgxZbYnJSXpr3/9a7UUBgAA4KrsGkGbP3/+JbcvXLiwSosBAADAZUbQfl1ns7S0VDt37pTVarXty8zMVJ06daq3OgAAABdUYUB75plnJEmFhYWaNGmSbbvJZFLDhg317LPPVm91AAAALqjCgPbrGpxPP/20EhMTHVIQAACAq7PrGjTCGQAAgOPYFdAAAADgOAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaA
AAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMO6OOlFGRoYmTJigvLw8+fj4KCEhQS1atChzzPz587Vp0yaZzWa5u7srJiZG4eHhjioRAADAEBwW0OLi4hQVFaXIyEht2LBBsbGxWrVqVZljOnfurOHDh6t27drav3+/HnzwQW3fvl21atVyVJkAAABO55ApztzcXKWlpSkiIkKSFBERobS0NJ08ebLMceHh4apdu7YkqV27drJarcrLy3NEiQAAAIbhkBG0rKwsNWrUSGazWZJkNpvl7++vrKws+fr6XvIz69evV7NmzRQQEFCpc+3bt6/C/aGhoZVqz9Xt3r37D7dBn1cOfe549Lnj0eeOR5873uX6vKL+dNgUZ2V8/vnnmj17tv71r39V+rMdO3aUl5dXNVTlmvjL6Hj0uePR545Hnzsefe54f6TPHTLFGRgYqOzsbFksFkmSxWJRTk6OAgMDyx27Z88ePfXUU5o/f75atWrliPIAAAAMxSEBzc/PT0FBQUpJSZEkpaSkKCgoqNz05jfffKOYmBjNmTNHHTp0cERpAAAAhuOw56BNmTJFycnJGjBggJKTkxUfHy9JGjlypPbu3StJio+P14ULFxQbG6vIyEhFRkbqu+++c1SJAAAAhuCwa9Bat26tNWvWlNu+dOlS2+u33nrLUeUAAAAYFisJAAAAGAwBDQAAwGAIaAAAAAZDQAMAADAYAhoAAIDBENAAAAAMhoAGAABgMAQ0AAAAgyGgAQAAGAwBDQAAwGAIaAAAAAZDQAMAADAYhy2W7kzFxcXKzMzUhQsXJEmjb6/n5IrKS09PV3H4CGeXUUZ6enqVteWyfW6VTJZCuWXuk3vmNzJZS6v3fACAGsElAlpmZqbq1q2rFi1ayGQy6dCRs84uqZxWTerqwrEMZ5dRRq2AllXWlqv2udVqVUmpVcevqa+CaxrJ89v3qvV8AICawSWmOC9cuCA/Pz+ZTCZnlwIXYzKZ5GF2U0CD+iqt38TZ5QAArhIuEdAkEc7gVG4mk8SfQQCAnVxiivN/BTaordpeVf/VzxeWKOvE+SpvFwAAuBaXDGi1vdwVFbutytt9bWqfKm8TAAC4HpeZ4nR1wb0HqqCgakb31q5dq+jo6Evu27JlixISEqrkPAAAuCqXHEEzEoulRGazsf5vKC0tveJr9vr166d+/fpVcUUAALgWYyUDF3F7364aMSpan6duV8dOIRo05D6NfX6Cfjr0g6xWadjQezRoYH9JF0e+dry7Tt7etcu9D+49UI8/MkxbP/1MeWfO6snRj6h/756SpA8/2a65S1eoXt266nljt8vWtDDpFf185KjOn7+gw0eylDR3ht5bv17Lly+XJDVr1kxTp06Vn5+fJOns2bN6/PHH9dNPP8nHx0czZsxQo0aNtHbtWm3btk1z5sxRamqqnn/+eQUHByt1126ZZNKEyc+rWfOqe3wHAAA1EVOcTlJqLVXirCX6y/DRWjRvptq0aaM3kxZp0UvT9fLi5fr+0I92tVOnjrdeWzJX0595Si/OWShJyj2Vp6kzZmv29DitWjBLHu4edrX15df7FPfU3/XWikXKPp6rmTNnavny5dq4caPatGmjadOm2Y7dvXu3YmJi9PbbbyssLEzTp0+/ZJs//PCDhg4dqoXLXld4n/76d/Jyu2oBAMCVEdCcpP+ACNvrPV9+rqFDh0qSGvr5KbxHmHbt+dqudgb27S1J6tz+eh0/kavCwiLtTUtXUJvr1KLZtZKkuwfdbldbPW/spvo+F5/4v2vP1+rdu7f8/f0lSUOHDtWOHTtsx4aGhqpVq1aSpHvvvVc7d+68ZJstW7ZU+/btJUnXt++krKOZdtUCAIArI6A5Se3a3mXe/+81X7++N5vdVPrL8kCFhUXl2vHy9PzlOLMkyWKxyGq9spq8a9e2vb
ZarXZfh1bRsZ6/1CdJZjc3lVosV1YcAAAuxCWvQTtfWFItj8Q4X1hyRZ8LuSFMb7zxhkbdN0gnck9q+85devDeOyVJTRsH6tv9B9Q9NESbPvzIrvY6dwjSlMRZ+inziJo3baJ172yudE3dQ7toxdOxOn78uBo2bKjVq1frpptusu3/8ssv9eOPP6pFixZau3atunfvXulzAACAS3PJgGa0h8k++tg4LV+YqHv++o6sVumJUcN1XcsWkqSnHvubpr00Rw18fdXrJvtCkF99H00eF63oiXGqV7eubrulV6Vruq5lC/3jH//Q8OHDJUnXXnutpk6datvfrVs3zZ07V99//73tJgEAAFA1XDKgOdu7W78o876+r58WLFhwyYW7w28MU/iNYbb3w6Pus73++uOyI2O/fd+/V0/179XT9v7XEbnfM/qvD5XbNmTIEA0ZMqTc9rvuukt33XXXJdv57b7u3btr7dq1tn2du3TVnEWvVFgHAADgGjQAAADDYQTNheSeytPocZPKbe8bfrMeHfaAEyoCAACXQkBzIX71fbR6+QJnlwEAAC6DKU4AAACDIaABAAAYDAENAADAYFzyGrRrG3jKw8urytstLizU4RPln/YPAABQGS4Z0Dy8vHRg5rAqb7ftuBWSqi+g5ZzI1cRpCVo+O/Gyxwb3Hqgd766Tt3ftcvsWJr2iRx4cKg8P+xZR/62HHnpIw4cP1y233FLpz1aXb776QssWza6yZ6xNfmGm2rdrqz/fNbjcvvnLV6l1y+a2NVABAKgOTHFeRfwb+NkVzi5n0YpXVVx8ZctSVZbFgGtvWixX/t3HjvgL4QwAUO1ccgTNmTZtfEsZh37Q2CfG67v0ffr72GF6ecFKtWpyo6b/c57aXddKba9rpdmL/6X8ggJJ0pjhD6lXj+46knVMUX+L1sdvr5Ykffjxds1dtkJenl66rU+45i5bUWbU7LW3Nmjrp58p78xZPTn6EfXv3VPPz5onSfrL2Bi5ublp2cuJcnMzaeb8JTpwMENFRUXqFhKscWNHSZJ++OEHTZw4USUlJWrdurUKCwsr/H6pqal6/vnn1bVrV+3du1ejR4+Wn5+fJsdN1YUL51WrVm09+tg4tbu+Q7mRr9++/+arL7R4/j/VLqiD0tP2yiSTJkx+Xs2at5QkrVy+QB9/9L78Gvir3fUdLtvvT8eMUvsOwdqfvk+enp6Ke+6fSkhI0McfbZEk3RzWVX//23DbovMHfjikUU9O0LGc4wrt3EmTYsbKw8OjzOjawqRX9OPPmTqXX6DMrCxd2zhQM+KfUe1atSr1ZwIAgP/FCJqDdbkhTF99+bkk6as9uxTUvrO+/nKXJCn1yz1q366Nnntprl6MHa/Xl87T3BfjNe2luTpz9lyZdnJP5WnqzNma80K8Vi+fLy8vz3LnqlPHW68tmavpzzylF+cslCRNinlMkrRq/iytXr5A19T9k2bOX6LQ4E56bfEcrV6+QCdP5Wn9pvclSU8//bSioqK0bt06Pfjgg9q7d+9lv+OBAwcUERGh1atX6+abb1Z0dLT+8tdHtXDZ6/rL8NGaPuVpFRcXX7adn348qDsG3a2Fy15XeJ/++nfycknSzv98op07PtH8pa/pxZcW6vDPP162LUn6MeMHTU+cq6kvzNa7KeuUnp6uN5bO0xtL52n/9wf11sZ3bcfuTd+vl5+L09oVS3Q0O0dv/mbfb6V9971emDxe61ctVXGJRZs+sG9BewAAKkJAc7DGTa5VUWGhjh/P1ldf7tKwkWP11Z7PlZWVpeLiYuWeOqWjx45p7NOTdd+IMRr79GSZJB0+crRMO3vT0hXU5jo1b9pEkjTkjgHlzvXrVFzn9tfr+IlcFRZe+vq4bZ/t1MrX39R9I8Zo6MjHlHbgB/2Umalz587pwIEDioyMlCR16dJFbdu2vex3bN68uUJCQiRJGRkZ8vDwUEjoxYXeQ24Ik7u7hzIP/3TZdppe21zXtbleknR9+07KOp
op6eJIW68+t6p2bW+ZzWYNuCPysm1JUp9+A2U2Xxw0/urLVN15553y8PCQh4eHIm+/VTt377EdO6Bvb3l715a7u1mDB/bX519+dck2e4SF6pq6f5LJZFKn9u10+GiWXbUAAFARpjidIDikm3bt3K68U7nqHByqBbMTtG3bNnUL6SKr1ao2rVoqae7Mcp87knXM9tpqlWQyVXgeL8+Lo2q/Ttv97vVgVunl6XFq2jiwzOYSSabLnONSvL29f1On9ZJtmEySm9ldpaWltm1FRWUDpKfnf++0Nbu5qfSX+q1Wa6VrkqTatX9bV/nv9nvf9fe+g/TfPv61xkILd/ECAP44lwxoxYWFv9xxWfXt2qPLDd208l8L1TWshySpfcdgLV26VGP/+oC6dGyvnzOP6PMvv1bYDcGSpH3p36nD9WVHrjq3v15xCf/Uz5lH1axpY729+QO766zj7a2z+fm2a9V639xd/3p1tZ558jGZzWadyjut/PPndV1IS7Vp00YbN25UZGSkvvnmGx04cMDu80hSq1atVFRUpK/3fKHgkK76es8XspSUqEnT5jqdd0rHso7o7Nkz+tOf6mrb1vfsarPLDWFauXyB7rw7Sh6envpg89uVqkmSQkK7a926dbrlhiBJ0sbNH6pf7562/R9s+1QP3HOnPD089M4HW9WrR/dKnwMAgCvlkgHt4rPKnDfSERzSTTnZseoSEibpYuB4N2Wdwm4I1jV162r281M0a9EyzZi3SMXFJWraOEBzXogv04afb309++TjemzCZNWvV0+9buoud3d31ap1+ee7/eX+uzQyZrxqeXlp2cuJevrxRzVr4XLdO2KMTDLJ09NDTz32N10nKTExURMnTtSKFSvUoUMHBQcHV+q7enp6as6cOWVuEpg0JUEeHh5q0NBfd933oKIffUiNAhqrbbv2+vnHQ5dts3uPcKWnfaOxo6Lk69dQwV1ClXvieKXquj3iThWcydb9j4yVJN3ULVR3Rwy07b+hc0fFPBOvrJwchXbupHsG3V6p9gEA+CNM1iudLzKYwsJC7du3Tx07dpTX/zyENj09XUFBQbb3h46cdXR5l9WqSV1dOJZRqc/kFxSozi/Ties3va91mzZr5bx/VllNtQJaVllbNaXP/4gDP2Wq1qfLKzymKkd2o2K3VVlbVeW1qX2q5RmEfwR97nj0uePR5473R/vcJUfQaorX3tqgD7Z9qhKLRfXq1lXcuL87uyQAAFAFCGhXsZEP/VkjH/qzU8796KOPKiur7B2LgYGBWrRokVPqkaTPd27XyuULym1/eMQYhd3Y8xKfAADAmAhouCLODGK/J+zGngQxAECN4DLPQashl9rhKlVqtf7ybBQAAC7PJQJarVq1lJubS0iDw1mtVhVbSnXsxCm5nTri7HIAAFcJl5jibNq0qTIzM3X8+MVHMZzIu+DkisorPFNLxWdOOLuMMjxOVV0/uWyfWyVTSaHcjuyTR+Y31XsuAECN4RIBzcPDQy1b/veREca8RTikxt0i/Fv0OQAA9nPYFGdGRobuv/9+DRgwQPfff79+/PHHcsdYLBbFx8erf//+uvXWW7VmzRpHlQcAAGAYDgtocXFxioqK0nvvvaeoqCjFxsaWO2bjxo36+eef9f777+uNN97Q3LlzlZmZ6agSAQAADMEhU5y5ublKS0tTUlKSJCkiIkLTpk3TyZMn5evraztu06ZNuvfee+Xm5iZfX1/1799fmzdv1iOPPHLZc/x6A8D/Lrh9KdfUrvwC4NWtsLBQpbXqOruMMgrtXFvUHvS5fehzx6PPHY8+dzz63PHs7XNPT0+ZTOX71CFLPe3bt0/jx4/XO++8Y9t2xx13aMaMGerQoYNt26BBgzR9+nR17txZkrR06VJlZ2fr2Wefvew5zp49W+mFvAEAAJzpUktUSjXoJoE6deqobdu28vDwuGQSBQAAMBpPT89LbndIQAsMDFR2drYsFovMZrMsFotycnIUGBhY7rijR4
/aRtCysrLUuHFju87h5uamunWNNbwJAABwJRxyk4Cfn5+CgoKUkpIiSUpJSVFQUFCZ688kaeDAgVqzZo1KS0t18uRJffjhhxowYIAjSgQAADAMh1yDJkkHDx7UhAkTdObMGV1zzTVKSEhQq1atNHLkSEVHR6tTp06yWCyaOnWqPvvsM0nSyJEjdf/99zuiPAAAAMNwWEADAACAfVxiLU4AAICrCQENAADAYAhoAAAABkNAAwAAMBgCmhMlJCSob9++ateuHasgOMipU6c0cuRIDRgwQIMGDdJjjz2mkydPOrsslzFv3jz+vDvIRx99pCFDhigyMlKDBg3S+++/7+ySapzf+w0vLCxUXFycbrvtNg0aNEiTJ092YpU1y5gxYzR48GANGTJEUVFRSk9Pr7G/69zF6URffPGFmjRpogceeECLFi1S27ZtnV1SjZeXl6fvvvtO3bt3l3TxB/b06dN6/vnnnVxZzfftt99q1qxZOnjwoBYvXsyf92pktVoVFhamV199VW3bttX+/fv15z//Wbt375abG/9dXlV+7zf8ueeek5ubmyZOnCiTyaQTJ06oQYMGTq62Zjh79qztofQffvih5s+fr6SkpBr5u87fVCfq2rVrudUUUL18fHxsf4klqUuXLjp69KgTK3INRUVFmjp1quLi4liKzUHc3Nx09uxZSRf/UfP39yecVbFL/Ybn5+dr/fr1euKJJ2x/1glnVee3KwadO3dOJpOpxv6u15i1OIHKKi0t1b///W/17dvX2aXUeLNnz9bgwYN17bXXOrsUl2AymfTyyy9rzJgx8vb2Vn5+vhYvXuzsslzC4cOH5ePjo3nz5ik1NVV16tTRE088oa5duzq7tBrjmWee0WeffSar1aply5aV2VeTftf5zym4rGnTpsnb21sPPvigs0up0fbs2aO9e/cqKirK2aW4jJKSEi1evFgLFizQRx99pIULFyomJkb5+fnOLq3GKykp0eHDh9W+fXutXbtW48aN0+OPP65z5845u7QaY/r06dq2bZtiYmKUmJhYZl9N+l0noMElJSQk6KefftLLL7/MtE8127Vrlw4dOqR+/fqpb9++OnbsmEaMGKHt27c7u7QaKz09XTk5OQoNDZUkhYaGqnbt2jp48KCTK6v5GjduLHd3d0VEREiSgoODVb9+fWVkZDi5sppnyJAhSk1N1alTpyTVvN/1q/8bAJU0a9Ys7du3T/Pnz5enp6ezy6nxRo0ape3bt2vr1q3aunWrAgICtHz5cvXs2dPZpdVYAQEBOnbsmA4dOiTp4lrIJ06cULNmzZxcWc3n6+ur7t2729aUzsjIUG5urpo3b+7kyq5++fn5ysrKsr3funWr6tWrJx8fnxr5u85dnE703HPP6f3339eJEydUv359+fj46J133nF2WTXa999/r4iICLVo0UK1atWSJDVt2lTz5893cmWuo2/fvty17ABvv/22li5dartQPTo6Wv3793dyVTXL7/2GHz58WJMmTVJeXp7c3d3197//Xb1793Z2uVe9EydOaMyYMTp//rzc3NxUr149jR8/Xp6enjXyd52ABgAAYDBMcQIAABgMAQ0AAMBgCGgAAAAGQ0ADAAAwGAIaAACAwRDQAAAADIa1OAFcVfr27asTJ07IbDbL29tb4eHhmjx5surUqePs0gCgyjCCBuCqs2jRIu3Zs0fr169XWlqalixZ4uySJF1chxEAqgIBDcBVq2HDhurZs6fS09MlSV999ZWGDh2qrl27avDgwUpNTbUdu3btWvXr108hISHq27ev3n77bUlSaWmpFixYoFtuuUU9evTQ008/rbNnz0qSUlNT1atXrzLn7Nu3r/7zn/9IkubOnavo6GiNGzdON9xwg9atW6e8vDxNnDhRPXv2VLdu3TRmzBjbZz/66CNFRkaqa9euGjp0qPbv32/bt2TJEoWHhyskJEQDBgzQjh07qqfTAFwVmOIEcNU6duyYPv30U3Xv3l3Z2dn629
/+psTERIWHh2vHjh2Kjo7Wu+++q1q1aum5557Tm2++qVatWiknJ0enT5+WdDG4rVu3TqtWrZKvr6/Gjx+vqVOnasaMGXbVsGXLFs2ePVuJiYkqKipSdHS0vL299c4778jb21t79uyRJH377beaNGmSFi1apI4dO+rtt9/WmDFjtHnzZmVmZurVV1/Vm2++qUaNGikzM1OlpaXV1m8AjI8RNABXnbFjxyokJES9e/eWr6+voqOjtWHDBvXq1Uu9e/eWm5ubbr75ZnXs2FEff/yxJMnNzU3ff/+9Lly4IH9/f7Vp00aStHHjRg0bNkzXXnut6tSpoyeffFKbNm2ye7qyS5cu6t+/v9zc3HTmzBl98sknio+PV7169eTh4aGwsDBJ0urVq3X//fcrODhYZrNZd955pzw8PPTVV1/JbDarqKhIBw8eVHFxsZo2bcrC5oCLI6ABuOrMnz9fe/bs0SuvvKJDhw7p1KlTOnr0qDZv3qyuXbva/rd7924dP35c3t7emjVrll5//XX17NlTo0aN0sGDByVJOTk5atKkia3tJk2aqKSkRLm5uXbVEhAQYHt97Ngx1atXT/Xq1St33NGjR5WUlFSmvmPHjiknJ0fNmzfXpEmTNHfuXN10002KiYlRdnb2H+wlAFczpjgBXLXCwsJ01113KSEhQcHBwYqMjNRzzz13yWPDw8MVHh6uCxcu6OWXX9bkyZP12muvyd/fX0eOHLEdd/ToUbm7u8vPz0/Z2dm6cOGCbZ/FYtHJkyfLtGsymWyvAwICdPr0aZ05c0bXXHNNmeMCAwP16KOPavTo0Zesb9CgQRo0aJDOnTun2NhYzZw50+5pVgA1DyNoAK5qDz/8sP7zn/8oNDRUH330kT799FNZLBYVFhYqNTVVx44d04kTJ7RlyxYVFBTI09NT3t7eMpvNkqSIiAitXLlShw8fVn5+vmbNmqXbb79d7u7uatmypQoLC7Vt2zYVFxdr4cKFKioq+t1a/P391atXL8XHx+v06dMqLi7Wrl27JEn33nuvXn/9dX399deyWq0qKCjQtm3bdO7cOR06dEg7duxQUVGRPD095eXlZasPgGsioAG4qvn6+ioyMlIrV67UggULtHjxYvXo0UO9e/fW8uXLVVpaqtLSUiUlJSk8PFxhYWHatWuX4uLiJEl33323Bg8erAcffFD9+vWTp6enJk+eLEmqW7eu4uLi9Oyzz6pXr16qXbt2mSnNS0lMTJS7u7tuv/123XTTTVq5cqUkqVOnTpo2bZqmTp2qbt266bbbbtPatWslSUVFRXrppZfUvXt39ezZUydPnlRMTEw19hoAozNZrVars4sAAADAfzGCBgAAYDAENAAAAIMhoAEAABgMAQ0AAMBgCGgAAAAGQ0ADAAAwGAIaAACAwRDQAAAADIaABgAAYDD/D2sNUDbKidtgAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] diff --git a/wikipedia/preprocessing/log_data.py b/wikipedia/preprocessing/log_data.py index 9cbaf06..c7fdb7a 100644 --- a/wikipedia/preprocessing/log_data.py +++ b/wikipedia/preprocessing/log_data.py @@ -44,19 +44,21 @@ def log_plans(run, config, plan_dir): def log_plan_data(run, config, plan_name, plan_path): artifact = wandb.Artifact(plan_name, type='dataset') - artifact.add_folder(plan_path) + artifact.add_dir(plan_path) run.log_artifact def log_experiment(run, config): # log experiment output artifact = wandb.Artifact("prediction_results", type='dataset') - files = os.listdir(config["directory"]["dpr_dir"]) - for filename in files: - if "plan-" in filename and '.json' in filename: - artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) - if "plan-" in filename and '.pkl' in filename: - artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + #files = os.listdir(config["directory"]["dpr_dir"]) + #files = os.listdir("/data/wooders/wikipedia/predictions") + artifact.add_dir("/data/wooders/wikipedia/predictions") + #for filename in files: + # if "plan-" in filename and '.json' in filename: + # artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) + # if "plan-" in filename and '.pkl' in filename: + # artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) run.log_artifact(artifact) diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh index 9d723a2..8bb1f25 100644 --- a/wikipedia/run_1_generate_plan.sh +++ b/wikipedia/run_1_generate_plan.sh @@ -1,6 +1,6 @@ set -xe -for replicas in 1 2 4 6 8 +for replicas in 16 32 do for model_runtime in 0.25 #0.001 0.05 0.01 0.1 1.0 5.0 10.0 do diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh index a25969f..89faacc 100644 --- a/wikipedia/run_2_prepare_data.sh +++ b/wikipedia/run_2_prepare_data.sh @@ -2,7 +2,7 @@ set -xe plan_dir=/data/wooders/wiki-plans -for replicas in 6 8 +for replicas in 16 
32 do for model_runtime in 0.25 do diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh index 78b92c7..9b2d73b 100644 --- a/wikipedia/run_3_run_predictions.sh +++ b/wikipedia/run_3_run_predictions.sh @@ -5,7 +5,7 @@ dpr_dir=~/DPR cd $dpr_dir -for replicas in 6 8 +for replicas in 16 32 do for event_policy in "lifo" do @@ -19,7 +19,7 @@ do #plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas} echo $plan_file - CUDA_VISIBLE_DEVICES=0,1,2,3,4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & + CUDA_VISIBLE_DEVICES=4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & #pid=$! done From 2b2193b9ec4367003d96a5bcec4d647b9d8b897d Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 15 Oct 2021 09:24:09 -0700 Subject: [PATCH 22/26] wiki graph --- stl/notebooks/STL Offline Plots.ipynb | 65 +++++++---------------- wikipedia/notebooks/Wikipedia Plots.ipynb | 19 +++---- 2 files changed, 29 insertions(+), 55 deletions(-) diff --git a/stl/notebooks/STL Offline Plots.ipynb b/stl/notebooks/STL Offline Plots.ipynb index 428ec9f..7ed30d7 100644 --- a/stl/notebooks/STL Offline Plots.ipynb +++ b/stl/notebooks/STL Offline Plots.ipynb @@ -22,14 +22,14 @@ }, { "cell_type": "code", - "execution_count": 352, + "execution_count": 412, "id": "0df714c8", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Finishing last run (ID:29jne90e) before initializing another..." + "Finishing last run (ID:1r9i8p8d) before initializing another..." ], "text/plain": [ "" @@ -41,7 +41,7 @@ { "data": { "text/html": [ - "
Waiting for W&B process to finish, PID 56004... (success)." + "
Waiting for W&B process to finish, PID 30648... (success)." ], "text/plain": [ "" @@ -58,7 +58,7 @@ "version_minor": 0 }, "text/plain": [ - "VBox(children=(Label(value=' 0.51MB of 0.51MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" + "VBox(children=(Label(value=' 0.66MB of 0.66MB uploaded (0.00MB deduped)\\r'), FloatProgress(value=1.0, max=1.0)…" ] }, "metadata": {}, @@ -75,9 +75,9 @@ "
\n", "
\n", "
\n", - "Synced 7 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", - "
Synced drawn-morning-23: https://wandb.ai/ucb-ralf/experiments-stl_notebooks/runs/29jne90e
\n", - "Find logs at: ./wandb/run-20211014_214005-29jne90e/logs
\n" + "Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)\n", + "
Synced daily-waterfall-26: https://wandb.ai/ucb-ralf/experiments-stl_notebooks/runs/1r9i8p8d
\n", + "Find logs at: ./wandb/run-20211015_043830-1r9i8p8d/logs
\n" ], "text/plain": [ "" @@ -89,7 +89,7 @@ { "data": { "text/html": [ - "Successfully finished last run (ID:29jne90e). Initializing new run:
" + "Successfully finished last run (ID:1r9i8p8d). Initializing new run:
" ], "text/plain": [ "" @@ -110,7 +110,7 @@ "data": { "text/html": [ "\n", - " Syncing run vocal-terrain-24 to Weights & Biases (docs).
\n", + " Syncing run rural-voice-27 to Weights & Biases (docs).
\n", "\n", " " ], @@ -125,13 +125,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact results:v11, 1446.02MB. 6579 files... Done. 0:0:0\n" + "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact results:v12, 2349.20MB. 11474 files... Done. 0:0:0\n" ] } ], "source": [ "run = wandb.init()\n", - "results_dir = run.use_artifact('ucb-ralf/stl/results:v11', type='dataset').download()\n", + "results_dir = run.use_artifact('ucb-ralf/stl/results:v12', type='dataset').download()\n", "yahoo_train_dir = run.use_artifact('ucb-ralf/stl/yahoo_train_data:v0', type='dataset').download()\n", "yahoo_eval_dir = run.use_artifact('ucb-ralf/stl/yahoo_eval_data:v0', type='dataset').download()\n", "oracle_dir = run.use_artifact('ucb-ralf/stl/oracle:v0', type='dataset').download()" @@ -270,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 358, + "execution_count": 413, "id": "5fecde25", "metadata": {}, "outputs": [], @@ -297,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 359, + "execution_count": 414, "id": "21099224", "metadata": {}, "outputs": [ @@ -308,7 +308,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mbaseline_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m101\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlosses\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mget_loss_per_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"{artifact_dir}/plan_eval\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlosses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mbaseline_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m101\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_loss_per_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"{artifact_dir}/plan_eval\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlosses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: get_loss_per_key() missing 1 required positional argument: 'oracle_filename'" ] } @@ -341,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": null, "id": "da85dcf1", "metadata": {}, "outputs": [], @@ -356,18 +356,10 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": null, "id": "303fcfcf", 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1100 95.26955983050661\n" - ] - } - ], + "outputs": [], "source": [ "lp_total_cost = 0\n", "lp_total_loss = 0\n", @@ -379,29 +371,10 @@ }, { "cell_type": "code", - "execution_count": 260, + "execution_count": null, "id": "656758ed", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "max_fits_1100\n" - ] - }, - { - "ename": "TypeError", - "evalue": "string indices must be integers", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mbaseline_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'slide_size'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mslide_size\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mbaseline_total_cost\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'n_fits'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mbaseline_total_loss\u001b[0m \u001b[0;34m+=\u001b[0m 
\u001b[0mloss\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'loss'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: string indices must be integers" - ] - } - ], + "outputs": [], "source": [ "experiments = [(\"max_fits_1100\", 96), (\"max_fits_2100\", 48), (\"max_fits_4200\", 24), (\"max_fits_8400\", 12)]\n", "\n", diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 1d8013b..d2f8647 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 184, + "execution_count": 266, "id": "e0030940", "metadata": {}, "outputs": [], @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 267, "id": "016e13bb", "metadata": {}, "outputs": [ @@ -32,7 +32,6 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mucb-ralf\u001b[0m (use `wandb login --relogin` to force relogin)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.12.4 is available! To upgrade, please run:\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n" ] @@ -41,7 +40,7 @@ "data": { "text/html": [ "\n", - " Syncing run royal-planet-167 to Weights & Biases (docs).
\n", + " Syncing run toasty-plasma-547 to Weights & Biases (docs).
\n", "\n", " " ], @@ -56,14 +55,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact questions:latest, 60.97MB. 2 files... Done. 0:0:0\n" + "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact questions:latest, 84.36MB. 4 files... Done. 0:0:0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact prediction_results:latest, 6400.51MB. 413 files... Done. 0:0:0\n" ] } ], "source": [ "run = wandb.init(job_type=\"evaluation\", project=\"wiki-workload\")\n", "pageview_dir = run.use_artifact('pageviews:latest').download()\n", - "questions_dir = run.use_artifact('questions:latest').download()" + "questions_dir = run.use_artifact('questions:latest').download()\n", + "predictions_dir = run.use_artifact('prediction_results:latest').download()" ] }, { @@ -910,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 268, "id": "aece6567", "metadata": {}, "outputs": [ @@ -961,7 +962,7 @@ " 0.5848890772261066]}" ] }, - "execution_count": 257, + "execution_count": 268, "metadata": {}, "output_type": "execute_result" } @@ -998,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 269, "id": "e1822437", "metadata": {}, "outputs": [ From 7f38ccbe0cfa871cab06691d92211fa4eba9298c Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 15 Oct 2021 09:25:37 -0700 Subject: [PATCH 23/26] add rght filepath --- wikipedia/notebooks/Wikipedia Plots.ipynb | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index d2f8647..369a44e 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -911,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 268, + "execution_count": 270, "id": "aece6567", "metadata": {}, "outputs": [ @@ -919,29 +919,29 @@ "name": "stdout", 
"output_type": "stream", "text": [ - "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", "plan-round_robin_lifo-always_process {'top1': 0.06871169495648626, 'top5': 0.12280371338214406, 'top10': 0.13392345661573715, 'top100': 0.13392345661573715}\n", - "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", "plan-round_robin_lifo-always_process {'top1': 0.07980004865378126, 'top5': 0.1408997810579843, 'top10': 0.15672010735221414, 'top100': 0.15672010735221414}\n", - "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", "plan-round_robin_lifo-always_process {'top1': 0.08357464039362479, 'top5': 0.16605849440089146, 'top10': 0.1989547284412741, 'top100': 0.1989547284412741}\n", - "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", "plan-round_robin_lifo-always_process {'top1': 0.12422408989963196, 'top5': 0.25539311470521303, 'top10': 0.2912321177735402, 'top100': 0.2912321177735402}\n", - "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", "plan-round_robin_lifo-always_process {'top1': 0.19127213943232024, 'top5': 0.38433348243363075, 'top10': 0.4554621716850688, 'top100': 0.4554621716850688}\n", - 
"/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", "plan-round_robin_lifo-always_process {'top1': 0.18883945036921942, 'top5': 0.40024797733675477, 'top10': 0.46685657336127, 'top100': 0.46685657336127}\n", - "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_1.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.06215912925426309, 'top5': 0.1438268553177798, 'top10': 0.16673336943130007, 'top100': 0.16673336943130007}\n", - "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_2.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.07800299770071646, 'top5': 0.1567436495044377, 'top10': 0.18105484536729682, 'top100': 0.18105484536729682}\n", - "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_4.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.12434964804482426, 'top5': 0.2397768203969207, 'top10': 0.26929867928526025, 'top100': 0.26929867928526025}\n", - "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_8.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.14424276667372932, 'top5': 0.29131059161428535, 'top10': 0.34781175695082045, 'top100': 0.34781175695082045}\n", - 
"/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_16.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.18262432218220057, 'top5': 0.3638204204628387, 'top10': 0.4380959107281588, 'top100': 0.4380959107281588}\n", - "/data/wooders/wikipedia/predictions/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_32.json\n", "plan-weighted_round_robin_lifo-always_process {'top1': 0.20202305561441095, 'top5': 0.41511092277389333, 'top10': 0.489213770589574, 'top100': 0.489213770589574}\n" ] }, @@ -962,7 +962,7 @@ " 0.5848890772261066]}" ] }, - "execution_count": 268, + "execution_count": 270, "metadata": {}, "output_type": "execute_result" } @@ -976,7 +976,7 @@ "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", "metric = 'top5'\n", - "d = \"/data/wooders/wikipedia/predictions\"\n", + "d = predictions_dir #\"/data/wooders/wikipedia/predictions\"\n", "\n", "replica_results = {}\n", "\n", @@ -999,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 269, + "execution_count": 271, "id": "e1822437", "metadata": {}, "outputs": [ From 5e0263e83145a1ff0b651a41a21144dca44f2ef3 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 19 Oct 2021 12:43:47 -0700 Subject: [PATCH 24/26] reorganize wiki dir --- wikipedia/benchmark_bert.py | 7 - wikipedia/config.yml | 35 - wikipedia/config_aws.yml | 29 - wikipedia/notebooks/Wikipedia Plots.ipynb | 148 +++-- wikipedia/preprocessing/embedding.py | 152 ----- wikipedia/preprocessing/generate_diffs.py | 442 ------------- wikipedia/preprocessing/log_data.py | 79 --- wikipedia/preprocessing/wiki_api_data.py | 731 --------------------- wikipedia/run_0_generate_data.sh | 1 - 
wikipedia/run_1_generate_plan.sh | 19 - wikipedia/run_2_prepare_data.sh | 20 - wikipedia/run_3_run_predictions.sh | 30 - wikipedia/run_4_run_optimal_predictions.sh | 8 - wikipedia/run_5_pipeline_predict.sh | 30 - wikipedia/run_wiki.sh | 10 - wikipedia/simulate.py | 616 ----------------- wikipedia/wiki_eval.py | 325 --------- 17 files changed, 95 insertions(+), 2587 deletions(-) delete mode 100644 wikipedia/benchmark_bert.py delete mode 100644 wikipedia/config.yml delete mode 100644 wikipedia/config_aws.yml delete mode 100644 wikipedia/preprocessing/embedding.py delete mode 100644 wikipedia/preprocessing/generate_diffs.py delete mode 100644 wikipedia/preprocessing/log_data.py delete mode 100644 wikipedia/preprocessing/wiki_api_data.py delete mode 100644 wikipedia/run_0_generate_data.sh delete mode 100644 wikipedia/run_1_generate_plan.sh delete mode 100644 wikipedia/run_2_prepare_data.sh delete mode 100644 wikipedia/run_3_run_predictions.sh delete mode 100644 wikipedia/run_4_run_optimal_predictions.sh delete mode 100644 wikipedia/run_5_pipeline_predict.sh delete mode 100644 wikipedia/run_wiki.sh delete mode 100644 wikipedia/simulate.py delete mode 100644 wikipedia/wiki_eval.py diff --git a/wikipedia/benchmark_bert.py b/wikipedia/benchmark_bert.py deleted file mode 100644 index 4a636c0..0000000 --- a/wikipedia/benchmark_bert.py +++ /dev/null @@ -1,7 +0,0 @@ -from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments -from pprint import pprint - -args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[1], sequence_lengths=[100], no_multi_process=True) -benchmark = PyTorchBenchmark(args) -results = benchmark.run() -pprint(results) diff --git a/wikipedia/config.yml b/wikipedia/config.yml deleted file mode 100644 index 2ae74c2..0000000 --- a/wikipedia/config.yml +++ /dev/null @@ -1,35 +0,0 @@ -[directory] -data_dir = /data/wooders/wikipedia -revisions_dir = %(data_dir)s/recentchanges -raw_doc_dir = %(data_dir)s/doc_xml/ -parsed_doc_dir = 
%(data_dir)s/doc_pkl/ -parsed_tmp_dir = %(data_dir)s/parsed_tmp/ -diff_dir = %(data_dir)s/diffs/ -embedding_dir = %(data_dir)s/embeddings/ -exp_dir = %(data_dir)s/simulation_output/ -dpr_dir = /home/eecs/wooders/DPR - -[files] -data_dir = /data/wooders/wikipedia -raw_questions_file = %(data_dir)s/10042021_questions_revid_filtered.csv -model_file = %(data_dir)s/bert-base-encoder.cp -changes_file = %(data_dir)s/changes.csv -titles_file = %(data_dir)s/top_titles.csv -revisions_file = %(data_dir)s/title_revisions_timestamps.json -edits_file = %(data_dir)s/edits.csv -questions_file = %(data_dir)s/questions.csv -train_questions_file = %(data_dir)s/train_questions.csv -test_questions_file = %(data_dir)s/test_questions.csv -raw_pageview_file = %(data_dir)s/top_title_views.csv -pageview_file = %(data_dir)s/pageviews.csv -timestamp_weights_file = %(data_dir)s/timestamp_weights_file.json - -[simulation] -data_dir = /data/wooders/wikipedia -plan_dir = /data/wooders/wiki-plans -init_data_file = %(data_dir)s/init_data.json -optimal_plan_file = %(data_dir)s/optimal_plan.json -stream_edits_file = %(data_dir)s/edit_stream.json -stream_questions_file = %(data_dir)s/question_stream.json - - diff --git a/wikipedia/config_aws.yml b/wikipedia/config_aws.yml deleted file mode 100644 index b13c221..0000000 --- a/wikipedia/config_aws.yml +++ /dev/null @@ -1,29 +0,0 @@ -[directory] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia -revisions_dir = %(data_dir)s/recentchanges -raw_doc_dir = %(data_dir)s/doc_xml/ -parsed_doc_dir = %(data_dir)s/doc_pkl/ -parsed_tmp_dir = %(data_dir)s/parsed_tmp/ -diff_dir = %(data_dir)s/diffs/ -embedding_dir = %(data_dir)s/embeddings/ -exp_dir = %(data_dir)s/simulation_output/ - -[files] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia -raw_questions_file = %(data_dir)s/10052021_questions_revid.csv -model_file = %(data_dir)s/bert-base-encoder.cp -changes_file = %(data_dir)s/changes.csv -titles_file = %(data_dir)s/top_titles.csv 
-revisions_file = %(data_dir)s/title_revisions_timestamps.json -edits_file = %(data_dir)s/edits.csv -questions_file = %(data_dir)s/questions.csv -pageview_file = %(data_dir)s/top_title_views.csv - -[simulation] -data_dir = /home/ubuntu/experiments/wikipedia/result/wikipedia -plan_dir = /home/ubuntu/experiments/wikipedia/result/wiki-plans -init_data_file = %(data_dir)s/init_data.json -stream_edits_file = %(data_dir)s/edit_stream.json -stream_questions_file = %(data_dir)s/question_stream.json - - diff --git a/wikipedia/notebooks/Wikipedia Plots.ipynb b/wikipedia/notebooks/Wikipedia Plots.ipynb index 369a44e..6b56acc 100644 --- a/wikipedia/notebooks/Wikipedia Plots.ipynb +++ b/wikipedia/notebooks/Wikipedia Plots.ipynb @@ -911,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 270, + "execution_count": 280, "id": "aece6567", "metadata": {}, "outputs": [ @@ -919,99 +919,141 @@ "name": "stdout", "output_type": "stream", "text": [ - "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", "plan-round_robin_lifo-always_process {'top1': 0.06871169495648626, 'top5': 0.12280371338214406, 'top10': 0.13392345661573715, 'top100': 0.13392345661573715}\n", - "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_1.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.0640895857365947, 'top5': 0.11222543964969277, 'top10': 0.12234071772174746, 'top100': 0.12234071772174746}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", "plan-round_robin_lifo-always_process {'top1': 0.07980004865378126, 'top5': 0.1408997810579843, 'top10': 0.15672010735221414, 'top100': 0.15672010735221414}\n", - 
"./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_2.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.06708728645306088, 'top5': 0.11490139761910367, 'top10': 0.1252442498293194, 'top100': 0.1252442498293194}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", "plan-round_robin_lifo-always_process {'top1': 0.08357464039362479, 'top5': 0.16605849440089146, 'top10': 0.1989547284412741, 'top100': 0.1989547284412741}\n", - "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_4.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.0692296223054045, 'top5': 0.1186367524385746, 'top10': 0.1285087616043192, 'top100': 0.1285087616043192}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", "plan-round_robin_lifo-always_process {'top1': 0.12422408989963196, 'top5': 0.25539311470521303, 'top10': 0.2912321177735402, 'top100': 0.2912321177735402}\n", - "./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_8.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.06077798965714779, 'top5': 0.12347074102847816, 'top10': 0.1354536965102683, 'top100': 0.1354536965102683}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", "plan-round_robin_lifo-always_process {'top1': 0.19127213943232024, 'top5': 0.38433348243363075, 'top10': 0.4554621716850688, 'top100': 0.4554621716850688}\n", - 
"./artifacts/prediction_results:v3622/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_16.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.07892114163743516, 'top5': 0.16019649849722595, 'top10': 0.17815916064379939, 'top100': 0.17815916064379939}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", "plan-round_robin_lifo-always_process {'top1': 0.18883945036921942, 'top5': 0.40024797733675477, 'top10': 0.46685657336127, 'top100': 0.46685657336127}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_1.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.06215912925426309, 'top5': 0.1438268553177798, 'top10': 0.16673336943130007, 'top100': 0.16673336943130007}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_2.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.07800299770071646, 'top5': 0.1567436495044377, 'top10': 0.18105484536729682, 'top100': 0.18105484536729682}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_4.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.12434964804482426, 'top5': 0.2397768203969207, 'top10': 0.26929867928526025, 'top100': 0.26929867928526025}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_8.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.14424276667372932, 'top5': 0.29131059161428535, 'top10': 0.34781175695082045, 'top100': 0.34781175695082045}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_16.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.18262432218220057, 'top5': 
0.3638204204628387, 'top10': 0.4380959107281588, 'top100': 0.4380959107281588}\n", - "./artifacts/prediction_results:v3622/plan-weighted_round_robin_lifo-always_process-0.25-100_replicas_32.json\n", - "plan-weighted_round_robin_lifo-always_process {'top1': 0.20202305561441095, 'top5': 0.41511092277389333, 'top10': 0.489213770589574, 'top100': 0.489213770589574}\n" + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_32.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.12397297360924736, 'top5': 0.21128296882234307, 'top10': 0.22860214547480598, 'top100': 0.22860214547480598}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_1.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.06871169495648626, 'top5': 0.12280371338214406, 'top10': 0.13392345661573715, 'top100': 0.13392345661573715}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_1.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.0640895857365947, 'top5': 0.11222543964969277, 'top10': 0.12234071772174746, 'top100': 0.12234071772174746}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_2.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.07980004865378126, 'top5': 0.1408997810579843, 'top10': 0.15672010735221414, 'top100': 0.15672010735221414}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_2.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.06708728645306088, 'top5': 0.11490139761910367, 'top10': 0.1252442498293194, 'top100': 0.1252442498293194}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_4.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.08357464039362479, 'top5': 0.16605849440089146, 'top10': 0.1989547284412741, 'top100': 0.1989547284412741}\n", + 
"/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_4.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.0692296223054045, 'top5': 0.1186367524385746, 'top10': 0.1285087616043192, 'top100': 0.1285087616043192}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_8.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.12422408989963196, 'top5': 0.25539311470521303, 'top10': 0.2912321177735402, 'top100': 0.2912321177735402}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_8.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.06077798965714779, 'top5': 0.12347074102847816, 'top10': 0.1354536965102683, 'top100': 0.1354536965102683}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_16.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.19127213943232024, 'top5': 0.38433348243363075, 'top10': 0.4554621716850688, 'top100': 0.4554621716850688}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_16.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.07892114163743516, 'top5': 0.16019649849722595, 'top10': 0.17815916064379939, 'top100': 0.17815916064379939}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_lifo-always_process-0.25-100_replicas_32.json\n", + "plan-round_robin_lifo-always_process {'top1': 0.18883945036921942, 'top5': 0.40024797733675477, 'top10': 0.46685657336127, 'top100': 0.46685657336127}\n", + "/data/wooders/wikipedia/predictions/plan-round_robin_fifo-always_process-0.25-100_replicas_32.json\n", + "plan-round_robin_fifo-always_process {'top1': 0.12397297360924736, 'top5': 0.21128296882234307, 'top10': 0.22860214547480598, 'top100': 0.22860214547480598}\n" ] }, { "data": { "text/plain": [ - "{'round_robin': [0.877196286617856,\n", + "{'lifo_round_robin': [0.877196286617856,\n", + " 
0.8877745603503072,\n", " 0.8591002189420157,\n", + " 0.8850986023808963,\n", " 0.8339415055991085,\n", + " 0.8813632475614254,\n", " 0.7446068852947869,\n", + " 0.8765292589715219,\n", " 0.6156665175663693,\n", - " 0.5997520226632452],\n", - " 'weighted_round_robin': [0.8561731446822202,\n", - " 0.8432563504955624,\n", - " 0.7602231796030793,\n", - " 0.7086894083857147,\n", - " 0.6361795795371613,\n", - " 0.5848890772261066]}" + " 0.839803501502774,\n", + " 0.5997520226632452,\n", + " 0.7887170311776569],\n", + " 'fifo_round_robin': [0.877196286617856,\n", + " 0.8877745603503072,\n", + " 0.8591002189420157,\n", + " 0.8850986023808963,\n", + " 0.8339415055991085,\n", + " 0.8813632475614254,\n", + " 0.7446068852947869,\n", + " 0.8765292589715219,\n", + " 0.6156665175663693,\n", + " 0.839803501502774,\n", + " 0.5997520226632452,\n", + " 0.7887170311776569]}" ] }, - "execution_count": 270, + "execution_count": 280, "metadata": {}, "output_type": "execute_result" } ], "source": [ "constants = [0.25]\n", - "policies = [\"lifo\"]\n", + "policies = [\"lifo\", \"fifo\"]\n", "#key_policies = [\"random\", \"weighted_random\", \"round_robin\", \"weighted_round_robin\"]\n", - "key_policies = [\"round_robin\", \"weighted_round_robin\"]\n", + "key_policies = [\"round_robin\"]\n", "replicas = [1, 2, 4, 8, 16, 32]\n", "#key_policies = [\"weighted_random\", \"weighted_round_robin\"]\n", "d = artifact_dir\n", "metric = 'top5'\n", - "d = predictions_dir #\"/data/wooders/wikipedia/predictions\"\n", + "d = \"/data/wooders/wikipedia/predictions\"\n", "\n", "replica_results = {}\n", "\n", + "for pol in policies: \n", + " \n", + " for key_policy in key_policies:\n", + " scores = []\n", + " for replica in replicas:\n", + " for policy in policies: \n", "\n", - "for key_policy in key_policies:\n", - " scores = []\n", - " for replica in replicas:\n", - " for policy in policies: \n", - " \n", - " name = f\"plan-{key_policy}_{policy}-always_process\"\n", - " for constant in constants: \n", - " 
print(f'{d}/{name}-{constant}-100_replicas_{replica}.json')\n", - " with open(f'{d}/{name}-{constant}-100_replicas_{replica}.json') as results_file:\n", - " results = json.load(results_file)\n", - " print(name, results)\n", - " scores.append(1-results[metric])\n", - " replica_results[key_policy] = scores\n", + " name = f\"plan-{key_policy}_{policy}-always_process\"\n", + " for constant in constants: \n", + " print(f'{d}/{name}-{constant}-100_replicas_{replica}.json')\n", + " with open(f'{d}/{name}-{constant}-100_replicas_{replica}.json') as results_file:\n", + " results = json.load(results_file)\n", + " print(name, results)\n", + " scores.append(1-results[metric])\n", + " replica_results[pol + \"_\" + key_policy] = scores\n", "replica_results" ] }, { "cell_type": "code", - "execution_count": 271, + "execution_count": 281, "id": "e1822437", "metadata": {}, "outputs": [ { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "ValueError", + "evalue": "arrays must all be same length", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m df = pd.DataFrame({\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m'Model Runtime Const'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mreplicas\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mreplica_results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 529\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minit_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 530\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmrecords\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmrecords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36minit_dict\u001b[0;34m(data, index, columns, dtype)\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0marr\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_datetime64tz_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m ]\n\u001b[0;32m--> 287\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marrays_to_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 288\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36marrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype, verify_integrity)\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;31m# figure out the index, if necessary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 80\u001b[0;31m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 81\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/data/wooders/anaconda3/lib/python3.8/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36mextract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 399\u001b[0m \u001b[0mlengths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_lengths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlengths\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 401\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"arrays must all be same length\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 402\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhave_dicts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: arrays must all be same length" + ] } ], "source": [ diff --git a/wikipedia/preprocessing/embedding.py b/wikipedia/preprocessing/embedding.py deleted file mode 100644 index 58b66e9..0000000 --- a/wikipedia/preprocessing/embedding.py +++ /dev/null @@ -1,152 +0,0 @@ -from typing import List -import pickle -from tqdm import tqdm -import time -import numpy as np -import json -import pandas as pd -import argparse -import json -import os -from collections import defaultdict -from multiprocessing import Pool -import torch -from dpr.models import init_biencoder_components -from dpr.options import ( - add_encoder_params, - setup_args_gpu, - print_args, - set_encoder_params_from_state, - add_tokenizer_params, - add_cuda_params, -) -from dpr.utils.model_utils import ( - setup_for_distributed_mode, - load_states_from_checkpoint, - get_model_obj, - move_to_device, -) -from dpr.utils.data_utils import Tensorizer - - -class Retriever: - def __init__(self, model_file): - - parser = argparse.ArgumentParser(description="") - add_encoder_params(parser) - add_tokenizer_params(parser) - add_cuda_params(parser) - args = parser.parse_args() - - setup_args_gpu(args) - - print(args) - - saved_state = load_states_from_checkpoint(model_file) - set_encoder_params_from_state(saved_state.encoder_params, args) - - self.tensorizer, self.encoder, _ = init_biencoder_components( - args.encoder_model_type, args, inference_only=True - ) - - self.encoder = self.encoder.ctx_model - - self.encoder, _ = setup_for_distributed_mode( - self.encoder, - None, - args.device, - 
args.n_gpu, - args.local_rank, - args.fp16, - args.fp16_opt_level, - ) - self.encoder.eval() - - model_to_load = get_model_obj(self.encoder) - - prefix_len = len("ctx_model.") - ctx_state = { - key[prefix_len:]: value - for (key, value) in saved_state.model_dict.items() - if key.startswith("ctx_model.") - } - model_to_load.load_state_dict(ctx_state) - self.device = args.device - - def predict(self, text): - - st = time.time() - batch_token_tensors = [self.tensorizer.text_to_tensor(text)] - - ctx_ids_batch = move_to_device( - torch.stack(batch_token_tensors, dim=0), self.device - ) - ctx_seg_batch = move_to_device(torch.zeros_like(ctx_ids_batch), self.device) - ctx_attn_mask = move_to_device( - self.tensorizer.get_attn_mask(ctx_ids_batch), self.device - ) - with torch.no_grad(): - _, embedding, _ = self.encoder(ctx_ids_batch, ctx_seg_batch, ctx_attn_mask) - embedding = embedding.cpu().numpy() - return embedding - - -def embed_passages(sents, retriever_model, num_sent_in_pass=10): - passages = [] - embeddings = [] - for i in range(0, len(sents), num_sent_in_pass): - passages.append(" ".join(sents[i : i + num_sent_in_pass])) - embeddings.append(retriever_model.predict(passages[-1])) - return passages, embeddings - - -def get_passages(sents, num_sent_in_pass=10): - passages = [] - for i in range(0, len(sents), num_sent_in_pass): - passages.append(" ".join(sents[i : i + num_sent_in_pass])) - return passages - - -def generate_embeddings(model_file, diff_dir, embedding_dir): - - # create retriever - retriever_model = Retriever(model_file) - - # loop through files - index = 0 - # gpu = 2 - for filename in tqdm(os.listdir(diff_dir)): - - # index += 1 - # if index % 5 != gpu: - # continue - - new_id = filename.replace(".json", "").split("_")[0] - old_id = filename.replace(".json", "").split("_")[1] - - for revid in [new_id, old_id]: - - data = json.load(open(os.path.join(diff_dir, filename))) - - if len(data["diffs"]) == 0: - continue - - if revid == data["orig_id"]: - sents 
= [d["sent_a"] for d in data["diffs"][0]] - filepath = os.path.join(embedding_dir, revid + "_orig.pkl") - elif revid == data["new_id"]: - sents = [d["sent_b"] for d in data["diffs"][0]] - filepath = os.path.join(embedding_dir, revid + "_new.pkl") - - if not os.path.exists(filepath): - passages, embeddings = embed_passages(sents, retriever_model) - - pickle.dump( - { - "timestamp": data["timestamp"], - "passages": passages, - "file": filepath, - "embeddings": embeddings, - }, - open(filepath, "wb"), - ) diff --git a/wikipedia/preprocessing/generate_diffs.py b/wikipedia/preprocessing/generate_diffs.py deleted file mode 100644 index 400ed3a..0000000 --- a/wikipedia/preprocessing/generate_diffs.py +++ /dev/null @@ -1,442 +0,0 @@ -from tqdm import tqdm -import re -from collections import defaultdict -import os -from bs4 import BeautifulSoup -import pickle -import difflib -import scipy - -# import spacy -# from benepar.spacy_plugin import BeneparComponent -from nltk.translate.bleu_score import sentence_bleu - - -def read_incr_dump(d): - edit_titles = {} - for folder in tqdm(os.listdir(d)): - for file in os.listdir(os.path.join(d, folder)): - f = os.path.join(d, folder, file) - data = open(f, "r").read() - - # parse - soup = BeautifulSoup(data, "html.parser") - - for doc in soup.find_all("doc"): - id = doc.get("id") - title = doc.get("title") - url = doc.get("url") - text = doc.get_text() - assert title not in edit_titles - edit_titles[title] = { - "id": id, - "title": title, - "url": url, - "text": text, - } - return edit_titles - - -def read_dump(d, edits): - documents = [] - for file in tqdm(os.listdir(d)): - data = pickle.loads(open(os.path.join(d, file), "rb").read()) - for doc in data: - title = doc["title"] - if title in edits and edits[title]["id"] == doc["id"]: - documents.append(doc) - return documents - - -def get_spans(sent_diffs): - spans = [] - start = None - for i in range(len(sent_diffs)): - r = sent_diffs[i] - if r[:2] == "+ " or r[:2] == "- ": - if start 
is None: - start = i - - if start is not None: - if (r[:2] != "+ " and r[:2] != "- ") or i == len(sent_diffs) - 1: - spans.append((start, i)) - start = None - return spans - - -def get_diffs(sent_a, sent_b): - d = difflib.Differ() - result = list(d.compare(sent_a, sent_b)) - sent_a_diffs = [r for r in result if "+ " not in r] - sent_b_diffs = [r for r in result if "- " not in r] - return sent_a_diffs, sent_b_diffs - - -def split_sentences(text): - rtext = text.replace(".\n", ".\n") - rtext = rtext.replace(". ", ". ") - sentences = rtext.split("") - assert len(text) == sum( - [len(s) for s in sentences] - ), f"Invalid length {len(text)}, {sum([len(s) for s in sentences])}" - - index_to_sent = [-1] * len(text) - sent_to_index = [-1] * len(sentences) - index = 0 - for i in range(len(sentences)): - sent_to_index[i] = index - for j in range(len(sentences[i])): - # map character index to sentence index - index_to_sent[index] = i - index += 1 - - assert index == len(text), f"changed text len {index}, {len(text)}" - for i in index_to_sent: - assert i >= 0 - - return sentences, index_to_sent, sent_to_index - - -def get_diff_spans(doc, doc_diffs_raw, nlp): - - # get spans - doc_spans = get_spans(doc_diffs_raw) - - # get sentences from - sentences, index_to_sent, sent_to_index = split_sentences(doc) - - sent_diffs = [] - - for span in doc_spans: - - # print('DIFF:', doc[span[0]:span[1]]) - - # get sentence indices - start_i = index_to_sent[span[0]] - end_i = index_to_sent[span[1]] - sent_ind = range(start_i, end_i + 1, 1) if end_i > start_i else [start_i] - - # offset char indices - offset = sent_to_index[start_i] - diff_span = (span[0] - offset, span[1] - offset) - - # parse sentence - sent_comb = " ".join([sentences[i] for i in sent_ind]) - if len(sent_comb) > 10000: - print("sentence too long", len(sent_comb)) - continue - - try: - - parsed = nlp(sent_comb) - csent_all = list(parsed.sents) - - # generate word spans - word_spans = [] - words = [] - for csent in csent_all: - 
for constituent in csent._.constituents: - - word_spans.append((constituent.start, constituent.end)) - if DEBUG: - print("C:", const_offset, constituent) - if constituent.start + 1 == constituent.end: - words.append((constituent.start, str(constituent))) - except Exception as e: - print(e) - continue - - # map word indices to character indices - index = 0 - word_to_char_index = {len(words): len(sent_comb)} - # TODO: make sure to sort words - for word_index, word in words: - csize = len(word) - while str(sent_comb[index : index + csize]) != str(word): - index += 1 - word_to_char_index[word_index] = index - - # convert word spans to char spans - char_spans = [ - (word_to_char_index[s[0]], word_to_char_index[s[1]]) for s in word_spans - ] - - # find minimal length span - min_i = None - min_length = len(sent_comb) - for i in range(len(char_spans)): - span_len = char_spans[i][1] - char_spans[i][0] - if ( - char_spans[i][0] <= diff_span[0] - and char_spans[i][1] >= diff_span[1] - and span_len <= min_length - ): - min_i = i - min_length = span_len - - if min_i is None: - span_text = sent_comb - # print("COULD NOT DETERMINE SPAN") - # print(doc[span[0]:span[1]]) - else: - span_text = sent_comb[char_spans[min_i][0] : char_spans[min_i][1]] - - # generate span text - diff_text = doc[span[0] : span[1]] - sent_diffs.append((diff_text, span_text)) - - if DEBUG: - print("WORD SPANS", word_spans) - print("CHAR SPANS", char_spans) - print("DIFF SPAN", diff_span) - print("DIFF", diff_text) - print(char_spans[min_i], span_text) - print() - - return sent_diffs - - -def get_diffs(sent_a, sent_b): - d = difflib.Differ() - result = list(d.compare(sent_a, sent_b)) - sent_a_diffs = [r for r in result if "+ " not in r] - sent_b_diffs = [r for r in result if "- " not in r] - return sent_a_diffs, sent_b_diffs - - -def get_diffs(sent_a, sent_b): - d = difflib.Differ() - result = list(d.compare(sent_a, sent_b)) - sent_a_diffs = [r for r in result if "+ " not in r] - sent_b_diffs = [r for r in 
result if "- " not in r] - return sent_a_diffs, sent_b_diffs - - -def get_diffs(sent_a, sent_b): - d = difflib.Differ() - result = list(d.compare(sent_a, sent_b)) - sent_a_diffs = [r for r in result if "+ " not in r] - sent_b_diffs = [r for r in result if "- " not in r] - return sent_a_diffs, sent_b_diffs - - -def get_diffs(sent_a, sent_b): - d = difflib.Differ() - result = list(d.compare(sent_a, sent_b)) - sent_a_diffs = [r for r in result if "+ " not in r] - sent_b_diffs = [r for r in result if "- " not in r] - return sent_a_diffs, sent_b_diffs - - -def get_sentence_diff(sent_a, sent_b, nlp=None): - - # get spans from differ - sent_a_diffs_raw, sent_b_diffs_raw = get_diffs(sent_a, sent_b) - - if nlp is None: - diffs_a = sent_a_diffs_raw - diffs_b = sent_b_diffs_raw - else: - diffs_a = list(set(get_diff_spans(sent_a, sent_a_diffs_raw, nlp))) - diffs_b = list(set(get_diff_spans(sent_b, sent_b_diffs_raw, nlp))) - - span_diffs_a = [d[1] for d in diffs_a] - span_diffs_b = [d[1] for d in diffs_b] - raw_diffs_a = [d[0] for d in diffs_a] - raw_diffs_b = [d[0] for d in diffs_b] - - return { - "sent_a": sent_a, - "sent_b": sent_b, - "sent_a_diffs": span_diffs_a, - "sent_b_diffs": span_diffs_b, - "sent_a_raw_diffs": raw_diffs_a, - "sent_b_raw_diffs": raw_diffs_b, - } - - -def generate_diffs(documents, edits): - all_diffs = [] - i = 0 - for article in tqdm(documents): - title = article["title"] - edit = edits[title] - - sent_a = article["text"] - sent_b = edit["text"] - - doc_id = article["id"] - assert ( - article["id"] == edit["id"] - ), f"Mismatch article - title: {title}, {edit['title']}, id: {article['id']}, {edit['id']}" - - DEBUG = False - # run: python -m spacy download en - # nlp = spacy.load("en") - ## nlp = spacy.load("en_core_web_sm") - # nlp.add_pipe(BeneparComponent("benepar_en3")) - - diff = get_sentence_diff(sent_a, sent_b, nlp) - diff["title"] = (title,) - diff["doc_id"] = doc_id - all_diffs.append(diff) - - if len(all_diffs) > 1000: - print("Writing", i) - 
pickle.dump(all_diffs, open(f"output/diffs_{i}.pkl", "wb")) - all_diffs = [] - - i += 1 - - -def check_alphanumeric(s): - return re.match("(?s).*[a-zA-Z0-9]+(?s).*$", s) is not None - - -def generate_sentence_level_diffs(documents, edits): - all_diffs = [] - count = 0 - - # for article in tqdm(documents): - for article in documents: - title = article["title"] - edit = edits[title] - sent_a = article["text"] - sent_b = edit["text"] - - splits_a, index_to_sent_a, sent_to_index_a = split_sentences(sent_a) - splits_b, index_to_sent_b, sent_to_index_b = split_sentences(sent_b) - - d = difflib.Differ( - linejunk=lambda x: x in " \n", charjunk=lambda x: x in " \n \t" - ) - diff = list(d.compare(splits_a, splits_b)) - - index = 0 - last_match = 0 - options = defaultdict(list) - for i in range(len(diff)): - - code = diff[i][:2] - if code == "? ": - continue - elif code == "+ ": - options[last_match].append(diff[i]) - elif code == "- ": - options[last_match].append(diff[i]) - else: - options[index] = diff[i][2:] - last_match = index + 1 - index += 1 - - diff_data = [] - - has_diff = False - for key, value in options.items(): - # print(key, value) - if not isinstance(value, list): - diff_data.append( - { - "sent_a": value, - "sent_b": value, - "sent_a_diffs": [], - "sent_b_diffs": [], - "diff_type": None, - } - ) - continue - - diff_a = [d[2:] for d in value if "- " in d] - diff_b = [d[2:] for d in value if "+ " in d] - - has_diff = True - - # nlp = spacy.load("en") - # nlp.add_pipe(BeneparComponent("benepar_en3")) - - for da in diff_a: - match = False - for i in range(len(diff_b) - 1, -1, -1): - db = diff_b[i] - score = sentence_bleu([da.split()], db.split()) - # print(score) - if score > 0.1: - # local_a, local_b = get_diffs(da, db) - diff = get_sentence_diff(da, db, nlp=None) - diff["diff_type"] = "EDIT" - diff["score"] = score - - # filter alphanumeric - # orig_a = list( diff["sent_a_diffs"]) - # orig_b = list( diff["sent_b_diffs"]) - diff["sent_a_diffs"] = [ - d for d in 
diff["sent_a_diffs"] if check_alphanumeric(d) - ] - diff["sent_b_diffs"] = [ - d for d in diff["sent_b_diffs"] if check_alphanumeric(d) - ] - if ( - len(diff["sent_a_diffs"]) == 0 - and len(diff["sent_b_diffs"]) == 0 - and da == db - ): - diff["diff_type"] = None - # print("CONVERT EDIT TO NONE") - # print(diff["sent_a_raw_diffs"]) - # print(diff["sent_b_raw_diffs"]) - # print(orig_a) - # print(orig_b) - - # pprint(diff) - diff_data.append(diff) - del diff_b[i] # avoid double counting - match = True - break - - if not match: - diff_data.append( - { - "sent_a": da, - "sent_b": "", - "sent_a_diffs": [da], - "sent_b_diffs": [], - "diff_type": "DELETE", - } - ) - - for db in diff_b: - diff_data.append( - { - "sent_a": "", - "sent_b": db, - "sent_a_diffs": [], - "sent_b_diffs": [db], - "diff_type": "INSERT", - } - ) - - # pprint([d for d in diff_data if d['diff_type'] is not None]) - all_diffs.append(diff_data) - count += 1 - - # if len(all_diffs) > 1000: - # print("Writing", count) - # pickle.dump(all_diffs, open(f"output/sent_diffs_{count}.pkl", "wb")) - # all_diffs = [] - - return all_diffs, has_diff - - -def main(): - edits = read_incr_dump("/home/ubuntu/incr-enwiki-20190206/text/") - print("finished reading edits", len(edits.keys())) - documents = read_dump("/home/ubuntu/enwiki-20190201/tmp/parsed", edits) - print("finished reading docs", len(documents)) - - print("generating diffs...") - # generate_diffs(documents, edits) - generate_sentence_level_diffs(documents, edits) - - -if __name__ == "__main__": - main() diff --git a/wikipedia/preprocessing/log_data.py b/wikipedia/preprocessing/log_data.py deleted file mode 100644 index c7fdb7a..0000000 --- a/wikipedia/preprocessing/log_data.py +++ /dev/null @@ -1,79 +0,0 @@ -import wandb -import configparser -import os - - -def log_questions(run, config): - # log questions file - artifact = wandb.Artifact("questions", type='dataset') - artifact.add_file(config["files"]["raw_questions_file"]) - 
artifact.add_file(config["files"]["questions_file"]) - artifact.add_file(config["files"]["test_questions_file"]) - artifact.add_file(config["files"]["train_questions_file"]) - run.log_artifact(artifact) - -def log_files(run, config): - # log files - artifact = wandb.Artifact("files", type='dataset') - artifact.add_file(config["files"]["changes_file"]) - artifact.add_file(config["files"]["titles_file"]) - artifact.add_file(config["files"]["edits_file"]) - run.log_artifact(artifact) - -def log_pageview(run, config): - # log pageview - artifact = wandb.Artifact("pageviews", type='dataset') - artifact.add_file(config["files"]["raw_pageview_file"]) - artifact.add_file(config["files"]["pageview_file"]) - artifact.add_file(config["files"]["timestamp_weights_file"]) - run.log_artifact(artifact) - -def log_simulation(run, config): - # log simulation data - artifact = wandb.Artifact("simulation", type='dataset') - artifact.add_file(config["simulation"]["stream_edits_file"]) - artifact.add_file(config["simulation"]["stream_questions_file"]) - artifact.add_file(config["simulation"]["init_data_file"]) - run.log_artifact(artifact) - -def log_plans(run, config, plan_dir): - artifact = wandb.Artifact("plans", type='dataset') - artifact.add_file(config["simulation"]["optimal_plan_file"]) - artifact.add_dir(plan_dir) - run.log_artifact(artifact) - -def log_plan_data(run, config, plan_name, plan_path): - artifact = wandb.Artifact(plan_name, type='dataset') - artifact.add_dir(plan_path) - run.log_artifact - - -def log_experiment(run, config): - # log experiment output - artifact = wandb.Artifact("prediction_results", type='dataset') - #files = os.listdir(config["directory"]["dpr_dir"]) - #files = os.listdir("/data/wooders/wikipedia/predictions") - artifact.add_dir("/data/wooders/wikipedia/predictions") - #for filename in files: - # if "plan-" in filename and '.json' in filename: - # artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) - # if "plan-" in filename 
and '.pkl' in filename: - # artifact.add_file(os.path.join(config["directory"]["dpr_dir"], filename)) - - run.log_artifact(artifact) - -if __name__ == "__main__": - - print("Running wandb logging on data") - run = wandb.init(job_type="dataset-creation", project="wiki-workload") - - # configuration file - config = configparser.ConfigParser() - config.read("config.yml") - - log_questions(run, config) - log_files(run, config) - log_pageview(run, config) - log_simulation(run, config) - log_experiment(run, config) - diff --git a/wikipedia/preprocessing/wiki_api_data.py b/wikipedia/preprocessing/wiki_api_data.py deleted file mode 100644 index 06b9b7b..0000000 --- a/wikipedia/preprocessing/wiki_api_data.py +++ /dev/null @@ -1,731 +0,0 @@ -import os -import time -import pickle -import json -from tqdm import tqdm -from collections import defaultdict -from datetime import datetime -import subprocess - -import configparser -import argparse - -import pandas as pd -import numpy as np - -from multiprocessing import Pool - -import wandb - -# from concurrent.futures import ProcessPoolExecutor - -# from generate diffs file (originally from DPR repo... 
sorry kevin) -from generate_diffs import generate_sentence_level_diffs -from embedding import generate_embeddings - -from log_data import log_files, log_pageview, log_simulation, log_questions - - -def query_recentchanges(start_time, end_time, revision_file): - from bs4 import BeautifulSoup - pass - - -def query_doc_versions(titles_file, start_time, end_time, raw_doc_dir): - # TODO: query doc versions - titles_df = pd.read_csv(titles_file) - titles = list(set(top_titles.index.tolist())) - pass - - -def get_recent_changes(revisions_dir, changes_file): - changes = [] - revids = set([]) - files = os.listdir(revisions_dir) - for i in range(len(files)): - f = files[i] - f_changes = json.loads(open(os.path.join(revisions_dir, f), "r").read()) - - for change in f_changes: - if change["revid"] in revids: - continue - - changes.append(change) - revids.add(change["revid"]) - - # if i % 100 == 0: - # print(f"Read {i}/{len(files)}, changes so far: {len(changes)}") - - # create dataframe - changes_df = pd.DataFrame(changes) - - # create time index - changes_df["datetime"] = pd.to_datetime(changes_df["timestamp"]) - changes_df = changes_df.set_index("datetime").sort_index() - - # save to CSV file - changes_df.to_csv(changes_file) - - return changes_df - - -def get_titles(changes_file, titles_file, n=200): - changes_df = pd.read_csv(changes_file) - title_ids = set(changes_df[["title", "pageid"]].apply(tuple, axis=1).tolist()) - - counts = changes_df.title.value_counts().to_frame() - top_titles = counts[counts["title"] > n] - top_titles.columns = ["count"] - top_titles["title"] = top_titles.index - top_titles.to_csv(titles_file) - return top_titles - - -def get_edits(edits_file, changes_file, titles_file): - changes_df = pd.read_csv(changes_file) - titles_df = get_titles(changes_file, titles_file) - titles = list(set(titles_df.index.tolist())) - edits_df = changes_df[changes_df.title.apply(lambda x: x in titles)] - - # assign timestamps - edits_df["ts_min"] = ( - 
pd.to_datetime(edits_df["datetime"]) - .astype(np.int64) - .apply(assign_timestamps_min) - ) - - # write CSV - edits_df.to_csv(edits_file) - return edits_df - - -def get_questions(raw_questions_file, questions_file): - questions_df = pd.read_csv(raw_questions_file, sep="\t") - questions_df.columns = [ - "question", - "answer", - "doc_id", - "datetime", - "revid", - "oldrevid", - ] - - # assign timestamps - questions_df["ts_min"] = ( - pd.to_datetime(questions_df["datetime"]) - .astype(np.int64) - .apply(assign_timestamps_min) - ) - - # write CSV - questions_df.to_csv(questions_file) - return questions_df - -def get_pageviews(raw_pageview_file, pageview_file, edits_file, timestamp_weights_file): - - edits_df = pd.read_csv(edits_file) - pageview_df = pd.read_csv(raw_pageview_file) - - # map title -> id - title_to_id = edits_df.set_index("title")["pageid"].to_dict() - open("title_to_id.json", "w").write(json.dumps(title_to_id)) - - # calculate page weights - total_views = pageview_df.iloc[:, 2:].sum(axis=1).sum() - weights = pageview_df.iloc[:, 2:].sum(axis=1) / total_views - pageview_df['weights'] = weights - pageview_df['doc_id'] = pageview_df['title'].apply(lambda x: title_to_id[x]) - pageview_df.to_csv(pageview_file) - - # page weights per timestamp - ts_to_weights = {} - dates = pageview_df.columns[2:-2] - for date in dates: - print(date) - dt = datetime.strptime(date[:-2], '%Y%m%d') - ts = dt.timestamp() * 1000000000 - ts_min = assign_timestamps_min(ts) - view_counts = pageview_df[date].tolist() - id_to_count = pageview_df.set_index("doc_id")[date].to_dict() - ts_to_weights[ts_min] = id_to_count - open(timestamp_weights_file, "w").write(json.dumps(ts_to_weights)) - print("Generated ts weights file", timestamp_weights_file) - return pageview_df - - -# create diff JSON file from valid list of revision pairs, doc pkl -def create_diff_json(doc_pkl, rev_pairs, diff_dir): - - # load data for file - data = pickle.loads(open(doc_pkl, "rb").read()) - title = 
os.path.basename(doc_pkl).replace(".pkl", "") - - for i in range(len(data)): - orig_doc = data[i] - - for j in range(0, len(data), 1): - new_doc = data[j] - - rev_pair = orig_doc["id"] + "_" + new_doc["id"] - - if rev_pair not in rev_pairs: - continue - - diff_file = os.path.join(diff_dir, rev_pair + ".json") - if os.path.exists(diff_file): - # skip - continue - - edits = {orig_doc["title"]: new_doc} - try: - all_diffs = generate_sentence_level_diffs([orig_doc], edits) - except Exception as e: - print(e) - raise ValueError(f"Failed to parse diffs {rev_pair}") - diff = { - "title": orig_doc["title"], - "timestamp": rev_pairs[rev_pair], - "orig_id": orig_doc["id"], - "new_id": new_doc["id"], - "diffs": all_diffs, - } - open(diff_file, "w").write(json.dumps(diff, indent=2)) - - -def generate_diffs_helper(filename, diff_dir, rev_pair, timestamp): - - data = pickle.loads(open(filename, "rb").read()) - - for i in range(len(data)): - for j in range(len(data)): - orig_doc = data[i] - new_doc = data[j] - - if new_doc["id"] + "_" + orig_doc["id"] != rev_pair: - continue - - # parse diffs - diff_file = os.path.join(diff_dir, rev_pair + ".json") - - if os.path.exists(diff_file): - continue - - edits = {orig_doc["title"]: new_doc} - st = time.time() - all_diffs, has_diff = generate_sentence_level_diffs([orig_doc], edits) - # print("runtime", time.time() - st) - diff = { - "title": orig_doc["title"], - "timestamp": timestamp, - "orig_id": orig_doc["id"], - "new_id": new_doc["id"], - "diffs": all_diffs, - } - if has_diff: - diff = { - "title": orig_doc["title"], - "timestamp": timestamp, - "orig_id": orig_doc["id"], - "new_id": new_doc["id"], - "diffs": all_diffs, - } - else: - diff = { - "title": orig_doc["title"], - "timestamp": timestamp, - "orig_id": orig_doc["id"], - "new_id": new_doc["id"], - "diffs": [], - } - # TODO: write to tmp file first (make sure we dont have messed up files) - open(diff_file, "w").write(json.dumps(diff, indent=2)) - return - - -def generate_diffs( - 
edits_file, titles_file, parsed_doc_dir, diff_dir, revision_file, workers=32 -): - - # make sure title is in titles df - titles_df = pd.read_csv(titles_file) - titles = list(set(titles_df.title.tolist())) - - # print(titles) - - # filter out revision pairs not in edits_file - edits_df = pd.read_csv(edits_file) - title_to_rev_pairs = defaultdict(dict) - for index, row in edits_df.iterrows(): - if row["title"] not in titles: - continue # skip if not top title - - # map title -> (revid, old_revid) -> timestamp of revision - rev_pair = str(row["revid"]) + "_" + str(row["old_revid"]) - title_to_rev_pairs[row["title"]][rev_pair] = row["timestamp"] - - open(revision_file, "w").write(json.dumps(title_to_rev_pairs)) - - num_keys = len(title_to_rev_pairs.keys()) - # print( - # f"Proceessing revisions for {num_keys} titles, writing to {diff_dir}" - # ) - - inputs = [] - for title in tqdm(titles): - filename = os.path.join(parsed_doc_dir, f"{title}.pkl") - if not os.path.exists(filename): - print("missing", filename) - continue - - for rev_pair in title_to_rev_pairs[title].keys(): - if os.path.exists(os.path.join(diff_dir, rev_pair + ".json")): - continue - inputs.append( - (filename, diff_dir, rev_pair, title_to_rev_pairs[title][rev_pair]) - ) - - print("processing revids", len(inputs), diff_dir) - chunk_size = 100000 - for i in range(0, len(inputs), chunk_size): - p = Pool(128) - print("created pool", i, i + chunk_size, len(inputs)) - p.starmap(generate_diffs_helper, inputs[i : i + chunk_size]) - p.close() - - return - - # diff remaining - inputs = [ - ( - os.path.join(parsed_doc_dir, f"{title}.pkl"), - title_to_rev_pairs[title], - diff_dir, - ) - for title in titles - ] - p = Pool(workers) - p.starmap(create_diff_json, inputs) - p.close() - - -# convert wikipedia dump into single pkl file per title -def dump_to_pickle_title(top_folder, target_dir, title): - total = 0 - docs = [] - for folder in os.listdir(top_folder): - for file in os.listdir(os.path.join(top_folder, 
folder)): - - filename = os.path.join(top_folder, folder, file) - data = open(filename, "r").read() - soup = BeautifulSoup(data, "html.parser") - - for doc in soup.find_all("doc"): - id = doc.get("id") - title = doc.get("title") - url = doc.get("url") - text = doc.get_text() - docs.append({"id": id, "url": url, "title": title, "text": text}) - total += len(docs) - pickle.dump(docs, open(os.path.join(target_dir, title + ".pkl"), "wb")) - return os.path.join(target_dir, title + ".pkl") - - -# call wikiextractor library on XML -def extract(title, raw_doc_dir, parsed_tmp_dir, parsed_doc_dir): - f = f"{raw_doc_dir}/{title}" - bashCommand = f"wikiextractor {f} -o {parsed_tmp_dir}/tmp_parsed{title}" - - process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) - output, error = process.communicate() - - pkl_file = dump_to_pickle_title( - f"{parsed_tmp_dir}/tmp_parsed{title}", parsed_doc_dir, title - ) - - -def parse_docs(raw_doc_dir, parsed_tmp_dir, parsed_doc_dir, workers=32): - # parse documents from raw XML - - # extract individual doc - files = os.listdir(raw_doc_dir) - # TODO: add assert to make sure titles correspond to filenames - files = [ - (f, raw_doc_dir, parsed_tmp_dir, parsed_doc_dir) - for f in files - if not os.path.isdir(f) - ] - - # create pool and run - p = Pool(workers) - p.starmap(extract, files) - p.close() - - -# assign timesteps -def assign_timestamps_min(ts): - # take in unix timestamp - covert to integer - start_ts = 1628131044000000000 # don't change - delta = ts - start_ts - if delta < 0: - return None - - return int(delta / (60 * 1000000000)) - - -def generate_simulation_data( - questions_file, - edits_file, - diff_dir, - init_data_file, - stream_edits_file, - stream_questions_file, -): - edits_df = pd.read_csv(edits_file) - questions_df = pd.read_csv(questions_file) - - # lists for questions/edits at each timestep - questions = [] - edits = [] - - # initialization data for embeddings/passages - init_data = {} - - # timestamp to 
stop - max_ts = int(questions_df.ts_min.max()) - - # loop through timestamps - for ts in range(max_ts + 1): - - ts_edits = defaultdict(list) - ts_queries = defaultdict(list) - for index, row in edits_df[edits_df["ts_min"] == ts].iterrows(): - filename = str(row["revid"]) + "_" + str(row["old_revid"]) + ".json" - key = row["pageid"] - - # make sure file is OK - file_path = os.path.join(diff_dir, filename) - if os.path.exists(file_path): - try: - data = json.load(open(file_path)) - if len(data["diffs"]) == 0: - continue - diffs = data["diffs"][0] - except Exception as e: - print(file_path) - print(e) - continue - diff_types = [ - d["diff_type"] for d in diffs if d["diff_type"] is not None - ] - if len(diff_types) == 0: - print(f"Invalid file {filename}") - continue - assert str(data["orig_id"]) == str( - row["old_revid"] - ), f"Invalid id {filename}, id {data['orig_id']} row {row['revid']}" - - # get length of passage - - if key not in init_data: - diffs = data["diffs"][0] - init_data[key] = { - "revid": data["orig_id"], - "sents": [d["sent_a"] for d in diffs], - "file": filename, - "ts_min": row["ts_min"], - } - ts_edits[key].append(filename) - - else: - # print("missing", file_path) - continue - - for index, row in questions_df[questions_df["ts_min"] == ts].iterrows(): - key = row["doc_id"] - ts_queries[key].append( - { - "question": row["question"], - "doc_id": key, - "answer": row["answer"], - "datetime": row["datetime"], - "ts_min": row["ts_min"], - "revid": row["revid"], - "old_revid": row["oldrevid"], - } - ) - - edits.append(ts_edits) - questions.append(ts_queries) - - if ts % 1000 == 0: - unique_files = set([]) - for e in edits: - for files in e.values(): - for f in files: - unique_files.add(f) - print(f"Num edits ts {ts}/{max_ts+1}: {len(unique_files)}") - - open(stream_edits_file, "w").write(json.dumps(edits)) - open(stream_questions_file, "w").write(json.dumps(questions)) - open(init_data_file, "w").write(json.dumps(init_data)) - - -def 
search_answer(rev_file, embedding_dir, question): - # read file and see if answer is contained - revid = rev_file.replace(".json", "").split("_")[0] - # assert str(revid) == str(question["revid"]), f"Invalid id {revid}, {question}" - embedding_filename = os.path.join(embedding_dir, f"{revid}_new.pkl") - try: - passages = pickle.load(open(embedding_filename, "rb"))["passages"] - except Exception as e: - print(e) - print("File error", embedding_filename) - return False - - found_answer = False - for passage in passages: - if question["answer"] in passage: - found_answer = True - return found_answer - - -def generate_key_weights(pageview_file, titles_file): - pass - - -def check_dataset( - titles_file, - edits_file, - init_data_file, - stream_edits_file, - stream_questions_file, - diff_dir, -): - # TODO: add checks (init data keys match stream keys, questions match keys, etc.) - - # load data - edits_df = pd.read_csv(edits_file) - titles_df = get_titles(changes_file, titles_file) - titles = list(set(titles_df.index.tolist())) - init_data = json.load(open(init_data_file)) - edits = json.load(open(stream_edits_file)) - questions = json.load(open(stream_questions_file)) - - # same length - assert len(questions) == len(edits) - - for ts in range(len(questions)): - for doc_id in questions[ts].keys(): - if not doc_id in init_data: - print("missing doc", doc_id) - continue - for question in questions[ts][doc_id]: - # print(question) - answer = question["answer"] - # import pdb; pdb.set_trace() - - # question = questions[ts][doc_id] - rev_file = ( - str(question["revid"]) + "_" + str(question["old_revid"]) + ".json" - ) - - if not os.path.exists(os.path.join(diff_dir, rev_file)): - print("Still missing diff", rev_file) - continue - - # question generated from document edit - assert it was created before - found = False - revision_file = None - found_index = 0 - for i in range(ts): - if doc_id in edits[ts - i]: - if rev_file in edits[ts - i][doc_id]: - found = True - 
revision_file = rev_file - found_index = ts - i - break - if not found: - # only option is that it was derived from original doc - assert str(init_data[doc_id]["revid"]) == str( - question["old_revid"] - ), f"Missing revision {ts}, {rev_file}, {doc_id}, init version {init_data[doc_id]['revid']}" - revision_file = init_data[doc_id]["file"] - - # search for answer in revision file - found_answer = search_answer(revision_file, embedding_dir, question) - if not found_answer: - print("NOT FOUND", found_answer, revision_file) - else: - print("FOUND", found_answer, revision_file) - - if ( - question["question"] - == "how far is hurricane ida from cuba?????????????????" - ): - print("DEBUG", question) - print(rev_file) - print("question ts", ts, "edit ts", found_index) - for i in range(found_index, ts + 1, 1): - if doc_id in edits[i]: - print( - i, - edits[i][doc_id], - search_answer( - edits[i][doc_id][-1], embedding_dir, question - ), - ) - print(found_answer) - - # docid_to_title = {} - # for index, row in edits_df.iterrows(): - # docid_to_title[row["pageid"]] = row["title"] - - # open("docid_to_title.json", "w").write(json.dumps(docid_to_title)) - - ## check matching keys - # last_doc = init_data - # for i in len(edits): - # # TODO: assert that question actually contained in this edit? 
- # continue - - # check each edit is contained - - # check raw edit timestamp is same as query timestamp - - -if __name__ == "__main__": - - run = wandb.init(job_type="dataset-creation", project="wiki-workload") - - # configuration file - config = configparser.ConfigParser() - config.read("config.yml") - - # argument flags - parser = argparse.ArgumentParser() - parser.add_argument( - "--run_query_recentchanges", action="store_true", default=False - ) # query wiki api for recentchanges - parser.add_argument( - "--run_query_doc_versions", action="store_true", default=False - ) # query wiki api for doc versions - parser.add_argument( - "--run_recent_changes", action="store_true", default=False - ) # re-processing api changes data - parser.add_argument( - "--run_parse_docs", action="store_true", default=False - ) # re-parse document versions - parser.add_argument("--run_get_questions", action="store_true", default=False) - parser.add_argument("--run_get_pageviews", action="store_true", default=False) - parser.add_argument( - "--run_generate_diffs", action="store_true", default=False - ) # re-process generating diffs - parser.add_argument( - "--run_generate_simulation_data", action="store_true", default=False - ) - parser.add_argument("--run_check_dataset", action="store_true", default=False) - parser.add_argument("--run_generate_embeddings", action="store_true", default=False) - args = parser.parse_args() - - # directories - data_dir = config["directory"]["data_dir"] - revisions_dir = config["directory"]["revisions_dir"] - raw_doc_dir = config["directory"]["raw_doc_dir"] - parsed_doc_dir = config["directory"]["parsed_doc_dir"] - parsed_tmp_dir = config["directory"]["parsed_tmp_dir"] - diff_dir = config["directory"]["diff_dir"] - embedding_dir = config["directory"]["embedding_dir"] - - # intermediate files - model_file = config["files"]["model_file"] - changes_file = config["files"]["changes_file"] - titles_file = config["files"]["titles_file"] - revisions_file = 
config["files"]["revisions_file"] - edits_file = config["files"]["edits_file"] - raw_questions_file = config["files"]["raw_questions_file"] - questions_file = config["files"]["questions_file"] - raw_pageview_file = config["files"]["raw_pageview_file"] - pageview_file = config["files"]["pageview_file"] - timestamp_weights_file = config["files"]["timestamp_weights_file"] - - # simulation data - init_data_file = config["simulation"]["init_data_file"] - stream_edits_file = config["simulation"]["stream_edits_file"] - stream_questions_file = config["simulation"]["stream_questions_file"] - - if args.run_query_recentchanges: - query_edit_stream(start_time, end_time, revisions_dir) - - if args.run_query_doc_versions: - query_doc_versions(titles_file, start_time, end_time, raw_doc_dir) - - if args.run_recent_changes: - print("Generating from revisions", revisions_dir) - changes_df = get_recent_changes(revisions_dir, changes_file) - - print("Generated changes file", changes_file) - titles_df = get_titles(changes_file, titles_file) - print("Generated titles file", titles_file) - edits_df = get_edits(edits_file, changes_file, titles_file) - print("Generated edits file", edits_file) - log_files(run, config) - - # query document versions for list of titles - if args.run_query_doc_versions: - if not os.path.exists(raw_doc_dir): - os.mkdir(raw_doc_dir) - query_doc_versions(titles_file, start_time, end_time, raw_doc_dir) - - # parse documents - if args.run_parse_docs: - if not os.path.exists(parsed_doc_dir): - os.mkdir(parsed_doc_dir) - if not os.path.exists(parsed_tmp_dir): - os.mkdir(parsed_tmp_dir) - parse_docs(raw_doc_dir, parsed_tmp_dir, parsed_doc_dir, workers=32) - - # get questions - if args.run_get_questions: - questions_df = get_questions(raw_questions_file, questions_file) - print("Generated questions file", raw_questions_file, questions_file) - log_questions(run, config) - - # generate pageviews / compute page weights - if args.run_get_pageviews: - 
get_pageviews(raw_pageview_file, pageview_file, edits_file, timestamp_weights_file) - log_pageview(run, config) - - # generate diffs between document versions - if args.run_generate_diffs: - # if not os.path.isdir(diff_dir): - # os.mkdir(diff_dir) - generate_diffs( - edits_file, titles_file, parsed_doc_dir, diff_dir, revisions_file - ) - - # generate simulation data - if args.run_generate_simulation_data: - generate_simulation_data( - questions_file, - edits_file, - diff_dir, - init_data_file, - stream_edits_file, - stream_questions_file, - ) - log_simulation(run, config) - - # run tests to validate simulation data - if args.run_check_dataset: - check_dataset( - titles_file, - edits_file, - init_data_file, - stream_edits_file, - stream_questions_file, - diff_dir, - ) - - # generate embeddings for revids from diffs (make passages) - if args.run_generate_embeddings: - generate_embeddings(model_file, diff_dir, embedding_dir) diff --git a/wikipedia/run_0_generate_data.sh b/wikipedia/run_0_generate_data.sh deleted file mode 100644 index 5d92ff6..0000000 --- a/wikipedia/run_0_generate_data.sh +++ /dev/null @@ -1 +0,0 @@ -python preprocessing/wiki_api_data.py --run_generate_simulation_data --run_get_questions diff --git a/wikipedia/run_1_generate_plan.sh b/wikipedia/run_1_generate_plan.sh deleted file mode 100644 index 8bb1f25..0000000 --- a/wikipedia/run_1_generate_plan.sh +++ /dev/null @@ -1,19 +0,0 @@ -set -xe - -for replicas in 16 32 -do - for model_runtime in 0.25 #0.001 0.05 0.01 0.1 1.0 5.0 10.0 - do - for event_policy in "lifo" #"fifo" - do - for load_shedding_policy in "always_process" - do - for key_policy in "round_robin" "weighted_round_robin" - do - python simulate.py --model_runtime $model_runtime --send_rate 100 \ - --event_policy $event_policy --key_policy $key_policy --load_shedding_policy $load_shedding_policy --num_replicas ${replicas} - done - done - done - done -done diff --git a/wikipedia/run_2_prepare_data.sh b/wikipedia/run_2_prepare_data.sh 
deleted file mode 100644 index 89faacc..0000000 --- a/wikipedia/run_2_prepare_data.sh +++ /dev/null @@ -1,20 +0,0 @@ -set -xe - -plan_dir=/data/wooders/wiki-plans - -for replicas in 16 32 -do -for model_runtime in 0.25 -do - for event_policy in "lifo" - do - for load_shedding_policy in "always_process" - do - for key_policy in "round_robin" "weighted_round_robin" - do - python wiki_eval_tmp.py --offline-plan-path ${plan_dir}/plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas}.json --workers 32 - done - done - done -done -done diff --git a/wikipedia/run_3_run_predictions.sh b/wikipedia/run_3_run_predictions.sh deleted file mode 100644 index 9b2d73b..0000000 --- a/wikipedia/run_3_run_predictions.sh +++ /dev/null @@ -1,30 +0,0 @@ -set -xe - -plan_dir=/data/wooders/wiki-plans -dpr_dir=~/DPR - -cd $dpr_dir - -for replicas in 16 32 -do -for event_policy in "lifo" -do - for model_runtime in 0.25 - #for model_runtime in 0.01 0.05 0.1 1.0 10.0 0.25 0.005 - do - for load_shedding_policy in "always_process" - do - for key_policy in "weighted_round_robin" "round_robin" - do - #plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 - plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100_replicas_${replicas} - echo $plan_file - CUDA_VISIBLE_DEVICES=4 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file & - - #pid=$! 
- done - #wait $pid - done - done -done -done diff --git a/wikipedia/run_4_run_optimal_predictions.sh b/wikipedia/run_4_run_optimal_predictions.sh deleted file mode 100644 index 14abe91..0000000 --- a/wikipedia/run_4_run_optimal_predictions.sh +++ /dev/null @@ -1,8 +0,0 @@ -set -xe - -plan_dir=/data/wooders/wiki-plans -dpr_dir=~/DPR -python wiki_eval.py --offline-plan-path optimal_plan.json -cd $dpr_dir -echo $plan_file -CUDA_VISIBLE_DEVICES=5 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh optimal_plan diff --git a/wikipedia/run_5_pipeline_predict.sh b/wikipedia/run_5_pipeline_predict.sh deleted file mode 100644 index ea36102..0000000 --- a/wikipedia/run_5_pipeline_predict.sh +++ /dev/null @@ -1,30 +0,0 @@ -set -xe - -plan_dir=/data/wooders/wiki-plans -dpr_dir=/home/eecs/wooders/DPR -wiki_dir=/home/eecs/wooders/experiments/wikipedia - - -#for key_policy in "weighted_random" "weighted_round_robin" -#for key_policy in "random" "weighted_random" -for model_runtime in 0.01 0.05 0.1 1 10 0.25 0.005 -do - for event_policy in "lifo" "fifo" - do - for load_shedding_policy in "always_process" - do - for key_policy in "round_robin" "weighted_round_robin" "random" "weighted_random" - do - cd $wiki_dir - plan_file=plan-${key_policy}_${event_policy}-${load_shedding_policy}-${model_runtime}-100 - echo $plan_file - python wiki_eval.py --offline-plan-path ${plan_dir}/${plan_file}.json - cd $dpr_dir - CUDA_VISIBLE_DEVICES=3 bash ${dpr_dir}/evaluate_retrieval_single_doc_stream.sh $plan_file - pid=$! 
- done - wait $pid - done - done -done -p diff --git a/wikipedia/run_wiki.sh b/wikipedia/run_wiki.sh deleted file mode 100644 index 55a3035..0000000 --- a/wikipedia/run_wiki.sh +++ /dev/null @@ -1,10 +0,0 @@ -FILE="passages_sent_diffs_10010.pkl" -MODEL_FILE="/home/ubuntu/DPR/checkpoint/retriever/single/nq/bert-base-encoder.cp" -SEND_RATE=100 -DATA_DIR="/home/ubuntu/flink-feature-flow/RayServer/data/" -EXP_DIR="/home/ubuntu/flink-feature-flow/RayServer/experiments/" -TIMESTAMP=$(date +%s) -EXP="experiment_$TIMESTAMP" -echo $EXP; -python wiki_server.py --data-dir $DATA_DIR --send-rate $SEND_RATE --exp-dir $EXP_DIR --exp $EXP --file $FILE --model_file $MODEL_FILE -#python wiki_client.py --exp $EXP_DIR diff --git a/wikipedia/simulate.py b/wikipedia/simulate.py deleted file mode 100644 index 27db63a..0000000 --- a/wikipedia/simulate.py +++ /dev/null @@ -1,616 +0,0 @@ -import itertools -import json -from typing import DefaultDict, Dict, List, Optional, Tuple -from more_itertools.more import divide -from collections import defaultdict -from dataclasses import dataclass -from functools import cmp_to_key -import random - -import configparser -import argparse - -import pandas as pd - -import wandb - -import simpy -from ralf.state import Record -from ralf.policies.load_shedding_policy import ( - always_process, - make_mean_policy, - make_sampling_policy, -) -from ralf.policies.processing_policy import fifo, lifo # , make_sorter_with_key_weights -from ralf.simulation.priority_queue import PerKeyPriorityQueue -from ralf.simulation.source import JSONSource -from ralf.simulation.window import WindowOperator -from ralf.simulation.mapper import ( - RalfMapper, - RoundRobinLoadBalancer, - CrossKeyLoadBalancer, -) - - -from ralf.policies.load_shedding_policy import ( - always_process, - newer_processing_time, - later_complete_time, - make_sampling_policy, - make_mean_policy, - make_cosine_policy, -) - -from typing import Dict, List, Tuple, Type - -from preprocessing.log_data import 
log_plans - -def current_weights(ts, ts_to_weights): - ts = int(ts) - min_dist = max(list(ts_to_weights.keys())) - - index = 0 - for key in ts_to_weights.keys(): - if key >= ts: - break - index = key - - return ts_to_weights[key] - -class KeyFIFO(CrossKeyLoadBalancer): - """Simple policy that cycle through all the keys fairly""" - - def __init__(self, num_replicas=1): - self.cur_key_set = {} - for replica_id in range(num_replicas): - self.cur_key_set[replica_id] = set() - print(num_replicas, self.cur_key_set) - - def choose( - self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int - ) -> str: - key_set = set(per_key_queues.keys()) - if key_set != self.cur_key_set[replica_id]: - #print("reset keys", replica_id, len(key_set), key_set, self.cur_key_set[replica_id]) - self.cur_key_set[replica_id] = key_set - self.cur_key_iter[replica_id] = itertools.cycle(key_set) - - seen = set([]) - while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter[replica_id]) - #print(replica_id, key, per_key_queues[key].size(), per_key_queues[key].size() == 0) - if key in seen: - raise ValueError(f"Did full loop - livelock {replica_id}") - #return None - seen.add(key) - # TODO(simon): maybe do a "peak" here to trigger eviction policies - #print("choose", replica_id, key) - return key - - -class RoundRobinLoadBalancer(CrossKeyLoadBalancer): - """Simple policy that cycle through all the keys fairly""" - - def __init__(self, num_replicas=1): - self.cur_key_iter = {} - self.cur_key_set = {} - for replica_id in range(num_replicas): - self.cur_key_set[replica_id] = set() - self.cur_key_iter[replica_id] = None - print(num_replicas, self.cur_key_set) - print(num_replicas, "replicas", self.cur_key_iter) - - def choose( - self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int - ) -> str: - key_set = set(per_key_queues.keys()) - if key_set != self.cur_key_set[replica_id]: - #print("reset keys", replica_id, len(key_set), key_set, self.cur_key_set[replica_id]) 
- self.cur_key_set[replica_id] = key_set - self.cur_key_iter[replica_id] = itertools.cycle(key_set) - - key = next(self.cur_key_iter[replica_id]) - - seen = set([]) - while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter[replica_id]) - #print(replica_id, key, per_key_queues[key].size(), per_key_queues[key].size() == 0) - if key in seen: - raise ValueError(f"Did full loop - livelock {replica_id}") - #return None - seen.add(key) - # TODO(simon): maybe do a "peak" here to trigger eviction policies - #print("choose", replica_id, key) - return key - -class WeightedRoundRobinLoadBalancer(CrossKeyLoadBalancer): - - def __init__(self, all_keys, num_replicas=1): - - self.weights = json.load(open("bucket_weights.json")) - - # set default weight - for key in all_keys: - if key not in self.weights: - self.weights[key] = 1 - - self.cur_key_iter = {} - self.cur_key_set = {} - for replica_id in range(num_replicas): - self.cur_key_set[replica_id] = set() - self.cur_key_iter[replica_id] = None - - def choose( - self, per_key_queues: Dict[str, PerKeyPriorityQueue], replica_id: int - ) -> str: - key_set = set(per_key_queues.keys()) - - # initialize keys - if key_set != self.cur_key_set[replica_id]: - self.cur_key_set[replica_id] = [] - for key in key_set: - for i in range(self.weights[key]): - self.cur_key_set[replica_id].append(key) - random.shuffle(self.cur_key_set[replica_id]) - self.cur_key_iter[replica_id] = itertools.cycle(self.cur_key_set[replica_id]) - - key = next(self.cur_key_iter[replica_id]) - while per_key_queues[key].size() == 0: - key = next(self.cur_key_iter[replica_id]) - return key - - -#class WeightedRoundRobin(CrossKeyLoadBalancer): -# """Simple policy that cycle through all the keys fairly""" -# -# def __init__(self, pageview_file, all_keys): -# self.cur_key_set = [] -# self.cur_key_iter = None -# pageview_df = pd.read_csv(pageview_file) -# -# self.weights = json.load(open("weights.json")) -# -# ##self.raw_weights = 
pageview_df.set_index("doc_id")["weights"].to_dict() -# #self.raw_weights = pageview_df.set_index("doc_id")["2021090300"].to_dict() -# #self.weights = {} -# #for key in self.raw_weights.keys(): -# # if str(key) not in all_keys: -# # continue -# -# # self.weights[key] = int(self.raw_weights[key]*1000) -# # #assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" -# # if self.weights[key] == 0: -# # self.weights[key] = 1 -# -# -# for key in all_keys: -# if key not in self.weights: -# self.weights[key] = 1 -# -# -# for key in self.weights.keys(): -# for i in range(self.weights[key]): -# self.cur_key_set.append(str(key)) -# random.shuffle(self.cur_key_set) -# self.cur_key_iter = itertools.cycle(self.cur_key_set) -# -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# -# key = next(self.cur_key_iter) -# while per_key_queues[key].size() == 0: -# key = next(self.cur_key_iter) -# # TODO(simon): maybe do a "peak" here to trigger eviction policies -# return key -# -#class AdaptiveWeightedRoundRobin(CrossKeyLoadBalancer): -# """Simple policy that cycle through all the keys fairly""" -# -# def __init__(self, timestamp_weights_file): -# self.cur_key_set = [] -# self.cur_key_iter = None -# -# pageview_df = pd.read_csv(pageview_file) -# self.raw_weights = pageview_df.set_index("doc_id")["weights"].to_dict() -# self.weights = {} -# for key in self.raw_weights.keys(): -# if str(key) not in all_keys: -# continue -# -# self.weights[key] = int(self.raw_weights[key]*1000) -# assert self.weights[key] > 0, f"Too small {key}, {self.raw_weights[key]}" -# -# -# for key in self.weights.keys(): -# for i in range(self.weights[key]): -# self.cur_key_set.append(str(key)) -# random.shuffle(self.cur_key_set) -# self.cur_key_iter = itertools.cycle(self.cur_key_set) -# -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# -# key = next(self.cur_key_iter) -# while per_key_queues[key].size() == 0: -# key = 
next(self.cur_key_iter) -# # TODO(simon): maybe do a "peak" here to trigger eviction policies -# return key -# -# -#class AdaptiveWeightedLoadBalancer(CrossKeyLoadBalancer): -# -# def __init__(self, timestamp_weights_file): -# data = json.load(open(timestamp_weights_file)) -# self.timestamp_weights = {} -# for key in data.keys(): -# self.timestamp_weights[int(key)] = data[key] -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], timestamp: int) -> str: -# weights_map = current_weights(timestamp, self.timestamp_weights) -# -# chosen_key = None -# max_len = 0 -# total_len = 0 -# keys = [] -# weights = [] -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if size >= 1 and key in weights_map: -# keys.append(key) -# weights.append(weights_map[key]) -# total_len += size -# chosen_key = random.choices(keys, weights, k=1)[0] -# return chosen_key -# -# -#class WeightedLoadBalancer(CrossKeyLoadBalancer): -# -# def __init__(self, pageview_file): -# pageview_df = pd.read_csv(pageview_file) -# #self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() -# self.weights = json.load(open("weights.json")) -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# chosen_key = None -# max_len = 0 -# total_len = 0 -# keys = [] -# weights = [] -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if size >= 1 and int(key) in self.weights: -# keys.append(key) -# weights.append(self.weights[int(key)]) -# total_len += size -# -# chosen_key = random.choices(keys, weights, k=1)[0] -# #print("choose", chosen_key, keys, weights) -# return chosen_key -# -#class RandomLoadBalancer(CrossKeyLoadBalancer): -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# chosen_key = None -# max_len = 0 -# total_len = 0 -# keys = [] -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if size >= 1: -# keys.append(key) -# total_len += size -# -# 
chosen_key = random.choices(keys, k=1)[0] -# return chosen_key -# -# -#class WeightedLongestQueueLoadBalancer(CrossKeyLoadBalancer): -# -# def __init__(self, pageview_file): -# pageview_df = pd.read_csv(pageview_file) -# self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() -# #print(self.weights) -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# chosen_key = None -# max_len = 0 -# total_len = 0 -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if int(key) not in self.weights: -# continue -# weighted_size = self.weights[int(key)]*self.weights[int(key)] -# if weighted_size > max_len: -# chosen_key = key -# max_len = size -# total_len += size -# #print(chosen_key, max_len, self.weights[int(chosen_key)]) -# per_key_queues[chosen_key].clear() -# print("clear", chosen_key, total_len, per_key_queues[chosen_key].size()) -# return chosen_key -# -#class WeightedLoadBalancer(CrossKeyLoadBalancer): -# -# def __init__(self, pageview_file): -# pageview_df = pd.read_csv(pageview_file) -# self.weights = pageview_df.set_index("doc_id")["weights"].to_dict() -# #print(self.weights) -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# chosen_key = None -# max_len = 0 -# total_len = 0 -# keys = [] -# weights = [] -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if size >= 1 and int(key) in self.weights: -# keys.append(key) -# weights.append(self.weights[int(key)]) -# total_len += size -# -# chosen_key = random.choices(keys, weights, k=1)[0] -# #print("choose", chosen_key, keys, weights) -# return chosen_key -# -#class LongestQueueLoadBalancer(CrossKeyLoadBalancer): -# -# def choose(self, per_key_queues: Dict[str, PerKeyPriorityQueue], ts) -> str: -# chosen_key = None -# max_len = 0 -# total_len = 0 -# for key in per_key_queues.keys(): -# size = per_key_queues[key].size() -# if size > max_len: -# chosen_key = key -# max_len = size -# total_len += 
size -# per_key_queues[chosen_key].clear() -# -# return chosen_key - - -class WikiMapper(RalfMapper): - def __init__( - self, - env: simpy.Environment, - source_queues: Dict[str, PerKeyPriorityQueue], - key_selection_policy_cls: Type[CrossKeyLoadBalancer], - model_run_time_s: float, - keys: List[str], - num_replicas: int = 1, - ) -> None: - - super().__init__(env, source_queues, key_selection_policy_cls, model_run_time_s, num_replicas) - self.keys = keys - #self.source_queues = source_queues - - # self.env = env - # self.source_queues = source_queues - # self.key_selection_policy = key_selection_policy_cls() - # self.model_runtime_s = model_run_time_s - # self.env.process(self.run()) - - self.ready_time_to_batch: Dict[float, List[Tuple[int, float]]] = {} - - ## Shard source queues into each replica's id. - #source_keys = list(source_queues.keys()) - #random.shuffle(source_keys) - #self.sharded_keys = dict( - # enumerate(map(list, divide(num_replicas, source_keys))) - #) - #self.key_selection_policy = key_selection_policy_cls - #self.model_runtime_s = model_run_time_s - #for i in range(num_replicas): - # print("Run replica", i) - # self.env.process(self.run(replica_id=i)) - - - def run(self, replica_id: int): - this_shard_source_queues = { - key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] - } - #print("keys", replica_id, self.sharded_keys[replica_id]) - - #self.source_queues = { - # key: self.total_source_queues[key] for key in self.sharded_keys[replica_id] - #} - - while True: - x = yield simpy.AnyOf(self.env, [q.wait() for q in this_shard_source_queues.values()]) - #print("YIELD", replica_id, x) - - # choose key - chosen_key = self.key_selection_policy.choose( - this_shard_source_queues, - replica_id, - ) - assert chosen_key is not None - - # make sure queue size OK - jk doesn't work with dropping - # assert total_size_orig == 0 or total_size == total_size_orig, f"Bad queue size {total_size_orig} -> {total_size}" - - # get chosen key - 
windows = yield this_shard_source_queues[chosen_key].get() - print( - f"at time {self.env.now:.2f}, RalfMapper replica {replica_id} should work on {windows} (last timestamp), wait time {self.model_runtime_s}" - ) - edits = [(val, windows.key) for val in windows.window[0].value] - - if self.env.now in self.ready_time_to_batch: - self.ready_time_to_batch[self.env.now] += edits - else: - self.ready_time_to_batch[self.env.now] = edits - - # TODO: Add variable runtime - - filename = f"{diff_dir}/{edits[0][0]}" - data = json.load(open(filename)) - num_passages = int(len(data["diffs"][0]) / 10) - runtime = self.model_runtime_s * num_passages - #print(runtime, num_passages) - - yield self.env.timeout(runtime) - #yield self.env.timeout(self.model_runtime_s) - - -# configuration file -config = configparser.ConfigParser() -config.read("config.yml") -plan_dir = config["simulation"]["plan_dir"] -diff_dir = config["directory"]["diff_dir"] -#init_data_file = config["simulation"]["init_data_file"] -#stream_edits_file = config["simulation"]["stream_edits_file"] -#stream_questions_file = config["simulation"]["stream_questions_file"] -#pageview_file = config["files"]["pageview_file"] -#timestamp_weights_file = config["files"]["timestamp_weights_file"] - -run = wandb.init(job_type="dataset-creation", project="wiki-workload") -question_dir = run.use_artifact('ucb-ralf/wiki-workload /questions:v2', type='dataset').download() -simulation_dir = run.use_artifact('ucb-ralf/wiki-workload /simulation:v2', type='dataset').download() -pageview_dir = run.use_artifact('ucb-ralf/wiki-workload /pageviews:v0', type='dataset').download() - -init_data_file = f"{simulation_dir}/init_data.json" -stream_edits_file = f"{simulation_dir}/edit_stream.json" -stream_questions_file = f"{simulation_dir}/question_stream.json" -pageview_file = f"{pageview_dir}/pageviews.csv" -timestamp_weights_file = f"{pageview_dir}/timestamp_weights_file.json" - -# load simulation data -edits = json.load(open(stream_edits_file)) 
-init_data = json.load(open(init_data_file)) -keys = list(init_data.keys()) - - -def run_once( - out_path: str, - prioritization_policy: str, - load_shedding_policy: str, - keys: List[str], - per_key_records_per_second: int, - total_runtime_s: float, - model_runtime_constant: float, - key_selection_policy: str, - num_replicas: int, -): - - policies = { - "fifo": fifo, - "lifo": lifo, - "always_process": always_process, - "round_robin": RoundRobinLoadBalancer(num_replicas=num_replicas), - "weighted_round_robin": WeightedRoundRobinLoadBalancer(keys, num_replicas=num_replicas) - } - - env = simpy.Environment() - - source_to_window_queue = simpy.Store(env) - windows_to_mapper_queue = { - key: PerKeyPriorityQueue( - env, - processing_policy=policies[prioritization_policy], - load_shedding_policy=policies[load_shedding_policy], - ) - for key in keys - } - - JSONSource( - env, - records_per_sec_per_key=per_key_records_per_second, - num_keys=len(keys), - next_queue=source_to_window_queue, - total_run_time=total_runtime_s, - data_file=stream_edits_file, - ) - - WindowOperator( - env, - window_size=1, - slide_size=1, - source_queue=source_to_window_queue, - next_queues=windows_to_mapper_queue, - ) - - m = WikiMapper( - env, - source_queues=windows_to_mapper_queue, - model_run_time_s=model_runtime_constant, - key_selection_policy_cls=policies[key_selection_policy], - keys=keys, - num_replicas=num_replicas, - ) - env.run(until=total_runtime_s) - - plan = m.ready_time_to_batch - with open(out_path, "w") as f: - json.dump(plan, f) - - -if __name__ == "__main__": - - # argument flags - parser = argparse.ArgumentParser() - parser.add_argument("--send_rate", type=int) - parser.add_argument("--model_runtime", type=float) - parser.add_argument("--total_runtime", type=float, default=len(edits)) - parser.add_argument("--event_policy", type=str) - parser.add_argument("--key_policy", type=str) - parser.add_argument("--load_shedding_policy", type=str) - 
parser.add_argument("--num_replicas", type=int) - args = parser.parse_args() - - plan_name = f"{plan_dir}/plan-{args.key_policy}_{args.event_policy}-{args.load_shedding_policy}-{args.model_runtime}-{args.send_rate}_replicas_{args.num_replicas}" - out_path = f"{plan_name}.json" - print(out_path) - run_once( - out_path=out_path, - prioritization_policy=args.event_policy, - load_shedding_policy=args.load_shedding_policy, - keys=keys, - per_key_records_per_second=args.send_rate, - total_runtime_s=args.total_runtime, - model_runtime_constant=args.model_runtime, - key_selection_policy=args.key_policy, - num_replicas=args.num_replicas, - ) - log_plans(run, config, plan_dir) - - - # load sheding: random, drop short edits - # prioritization: prioritize most recent version - # cross-key prioritzation: historical page views, - # policies - #prioritization_policies = ["lifo"] # ["fifo", "lifo"] - ##key_selection_policies = ["adaptive_weighted_random", "weighted_round_robin", "weighted_random", "weighted_longest_queue", "longest_queue", "random", "round_robin"] - #key_selection_policies = ["round_robin"] - #load_shedding_policies = ["always_process"] - ##model_runtimes = [0.01, 0.05, 0.1, 1, 5, 10] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] - #model_runtimes = [0.02, 0.05, 0.07] # [0.000001, 0.00001, 0.0000001, 0.000000001, 0] - #records_per_second = [100] - - #output_files = [] - - #for key_selection in key_selection_policies: - # for prio_policy in prioritization_policies: - # for load_shed_policy in load_shedding_policies: - # for runtime in model_runtimes: - # for rate in records_per_second: - - # out_path = f"{plan_dir}/plan-{key_selection}_{prio_policy}-{load_shed_policy}-{runtime}-{rate}.json" - # print("running", out_path, runtime) - # run_once( - # out_path, - # prio_policy, - # load_shed_policy, - # keys, - # per_key_records_per_second=rate, - # total_runtime_s=len(edits), - # model_runtime_constant=runtime, - # key_selection_policy=key_selection, - # ) - - # 
output_files.append(out_path) - # print("DONE", out_path) - #for f in output_files: - # print(f) - #slide_#open("plans.txt", "w").write("\n".join(output_files)) diff --git a/wikipedia/wiki_eval.py b/wikipedia/wiki_eval.py deleted file mode 100644 index ddc6604..0000000 --- a/wikipedia/wiki_eval.py +++ /dev/null @@ -1,325 +0,0 @@ -import configparser -from typing import List -import pickle -import shutil -from tqdm import tqdm -import time -import numpy as np -import json -import pandas as pd -import argparse -from statsmodels.tsa.seasonal import STL, DecomposeResult -import json -import os -from collections import defaultdict -from multiprocessing import Pool -import torch -from dpr.models import init_biencoder_components -from dpr.options import ( - add_encoder_params, - setup_args_gpu, - print_args, - set_encoder_params_from_state, - add_tokenizer_params, - add_cuda_params, -) -from dpr.utils.model_utils import ( - setup_for_distributed_mode, - load_states_from_checkpoint, - get_model_obj, - move_to_device, -) -from dpr.utils.data_utils import Tensorizer - -from preprocessing.log_data import log_plan_data - -""" - -Script for evaluating plans for the wikipedia edit stream dataset. Writes output files which need to be processed by DPR script. - -Download require data: - * rev_dir = s3://feature-store-datasets/wikipedia/edit_diffs/ - * init_data_file = s3://feature-store-datasets/wikipedia/simulation/init_data.json - * questions_file = ?? 
- * model_file = s3://feature-store-datasets/wikipedia/models/bert-base-encoder.cp - -Upload results: - * exp_dir = s3://feature-store-datasets/wikipedia/simulation_output/ -""" -# simulation data - -import wandb -run = wandb.init(project='wiki-workload', job_type="dataset-creation") -simulation_dir = run.use_artifact('ucb-ralf/wiki-workload /simulation:v2', type='dataset').download() -question_dir = run.use_artifact('ucb-ralf/wiki-workload /questions:v2', type='dataset').download() - -init_data_file = f"{simulation_dir}/init_data.json" -stream_edits_file = f"{simulation_dir}/edit_stream.json" -stream_questions_file = f"{simulation_dir}/question_stream.json" - -config = configparser.ConfigParser() -config.read("config.yml") -#plan_dir = config["simulation"]["plan_dir"] -#init_data_file = config["simulation"]["init_data_file"] -#stream_edits_file = config["simulation"]["stream_edits_file"] -#stream_questions_file = config["simulation"]["stream_questions_file"] -rev_dir = config['directory']['diff_dir'] -embedding_dir = config['directory']['embedding_dir'] -exp_dir = config['directory']['exp_dir'] -model_file = config['files']['model_file'] - -# Create parser -parser = argparse.ArgumentParser(description="Specify experiment config") -parser.add_argument("--offline-plan-path", type=str) -parser.add_argument("--embed", default=False, action="store_true") -parser.add_argument("--wandb", default=False, action="store_true") -args = parser.parse_args() - -exp_id = os.path.basename(args.offline_plan_path).replace(".json", "") -run.config.update(vars(args)) -run.config.update({"plan": exp_id}) - -def sents_to_passages(sents, num_sent_in_pass=10): - passages = [] - - for i in range(0, len(sents), num_sent_in_pass): - passages.append(" ".join(sents[i : i + num_sent_in_pass])) - return passages - - -def embedding_path(revid, version="_new"): - return os.path.join(embedding_dir, f"{revid}{version}.pkl") - - -def offline_eval(plan_json_path, exp_id, compute_embeddings=True): - - 
# only process subset of keys - keys = ["51150040"] - filter_keys = False - - - # compute initial passage embeddings for each document - init_data = json.load(open(init_data_file)) - init_state = {} - for key in tqdm(init_data.keys()): - - if filter_keys and key not in keys: - continue - sents = init_data[key]["sents"] - revid = init_data[key]["revid"] - - print(init_data_file) - print(init_data[key]["file"]) - embedding_data = pickle.load(open(embedding_path(revid, version="_orig"), "rb")) - embeddings = embedding_data["embeddings"] - passages = sents_to_passages(sents) - if not len(passages) == len(embeddings): - print(f"passage {len(passages)} embeddings {len(embeddings)}") - print(len(embedding_data["passages"])) - print(revid) - print("diff file", init_data[key]["file"]) - print("embedding file", embedding_data["file"]) - print(embedding_data["timestamp"]) - print(len(sents)) - return - - init_state[key] = { - "passages": passages, - "embeddings": embeddings, - "rev": "init", - } - - print(f"Created init state for {len(init_state.keys())} keys") - - # compute passage embeddings for each timestep (using plan) - embed_versions = {"0": init_state} - plan = json.load(open(plan_json_path)) - embed_version_keys: List[str] = list(plan.keys()) - count = 0 - missing = set([]) - print("looping keys", len(embed_version_keys)) - for version in tqdm(embed_version_keys): - state = {} - for task in plan[version]: - #print("task", task, version) - rev_file = task[0] - doc_id = task[1] - # doc_id = task[2] - # rev_file = task[3] - # if filter_keys and doc_id not in keys: - # continue - data = json.load(open(os.path.join(rev_dir, rev_file))) - timestamp = data["timestamp"] - title = data["title"] - sents = [d["sent_b"] for d in data["diffs"][0]] - revid = rev_file.replace(".json", "").split("_")[0] - embedding_filename = embedding_path(revid, version="_new") - assert os.path.exists( - embedding_filename - ), f"Missing revid {embedding_filename}" - if 
os.path.exists(embedding_filename): - embedding_data = pickle.load( - open(embedding_path(revid, version="_new"), "rb") - ) - # assert embedding_data["timestamp"] == timestamp - embeddings = embedding_data["embeddings"] - passages = embedding_data["passages"] - assert len(passages) == len( - sents_to_passages(sents) - ), f"Inconsistent passage len {len(passages)}, {len(sents_to_passages(sents))}" - # passages = sents_to_passages(sents) - assert len(passages) == len(embeddings) - else: - missing.add(doc_id) - continue - # print("fitting", timestamp, version, doc_id, rev_file) - count += 1 - state[doc_id] = { - "passages": passages, - "embeddings": embeddings, - "rev": rev_file, - } - - # save version - embed_versions[version] = state - print("EMBED", embed_versions.keys()) - print("Num refits", count, len(missing)) - - embed_filename = "embed_versions.pkl" - pickle.dump(embed_versions, open(embed_filename, "wb")) - return embed_filename - -# returns latest version of document embeddings for timestep/key -def get_latest_embedding(timestep, doc_id, embed_versions): - - latest = 0 - for version in embed_versions.keys(): - version = float(version) - if ( - float(timestep) >= version - and version > latest - and doc_id in embed_versions[str(version)] - ): - latest = version - #print(doc_id, "latest", timestep, latest, timestep - latest) - assert ( - doc_id in embed_versions[str(latest)] - ), f"Missing doc id {doc_id} {latest} {doc_id in init_data}" - doc_version = embed_versions[str(latest)][doc_id] - assert latest <= timestep - return ( - doc_version["passages"], - doc_version["embeddings"], - doc_version["rev"], - latest, - ) - -def generate_question_data_all(exp_id, embed_filename): - # create experiment directory - directory = os.path.join(exp_dir, exp_id) - if os.path.isdir(directory): - print("Removing", directory) - shutil.rmtree(directory) - print("Creating", directory) - os.mkdir(directory) - - # get simulation data questions - questions = 
json.load(open(stream_questions_file)) - - for ts in range(len(questions)): - questions[ts]["ts"] = ts - - print("processing questions", len(questions)) - print("directory", directory) - - chunk_size = 1000 - chunks = [(questions[i:i+chunk_size], embed_filename, directory) for i in range(0, len(questions), chunk_size)] - p = Pool(64) - staleness_all = p.starmap(generate_question_data, chunks) - p.close() - staleness_all = [item for sublist in staleness_all for item in sublist] - staleness = np.array(staleness_all).mean() - print("all staleness", staleness) - wandb.log({"staleness": staleness}) - return directory - - -def generate_question_data(questions, embed_filename, directory): - embed_versions = pickle.load(open(embed_filename, "rb")) - init_data = json.load(open(init_data_file)) - - staleness = [] - for ts_questions in questions: - ts = ts_questions["ts"] - timestep = ts / 100 # TODO: Watch out!! can change and mess up experiment - for doc_id in ts_questions.keys(): - if doc_id == "ts": continue - # not considered in edits - if doc_id not in init_data: - print("missing", doc_id) - # print(init_data.keys()) - continue - - # get current embedding and write - passage_texts, passage_embeddings, version, latest = get_latest_embedding( - timestep, doc_id, embed_versions - ) - - # loop through questions - doc_questions = ts_questions[doc_id] - queries = [] - for q in doc_questions: - question = q["question"] - answer = q["answer"] - assert ( - str(q["doc_id"]) == doc_id - ), f"doc id mismatch {q['doc_id']}, {doc_id}" - assert ( - q["ts_min"] == ts - ), f"time mismatch {q['ts_min']}, {timestep}, {ts}" - queries.append([question, [answer], doc_id]) - - # append per query - staleness.append(timestep - latest) - - # dump CTX/question script - contex_file = f"{directory}/dpr_ctx_after_{int(ts)}_{doc_id}" - text_file = f"{directory}/passages_{int(ts)}_{doc_id}.tsv" - doc_questions_file = ( - f"{directory}/qa_{int(ts)}_{doc_id}.tsv" # question, answer(s), doc id - ) - 
doc_questions_df = pd.DataFrame(queries) - doc_questions_df.to_csv( - doc_questions_file, sep="\t", index=False, header=False - ) - # write passage file - id, text, title, doc_id - text_df = pd.DataFrame( - [[i, passage_texts[i], "", doc_id] for i in range(len(passage_texts))] - ) - text_df.to_csv(text_file, sep="\t", index=False, header=False) - # write ctx file - passage_ctx = [] - for i in range(len(passage_embeddings)): - passage_ctx.append([i, passage_embeddings[i]]) - pickle.dump(np.array(passage_ctx, dtype=object), open(contex_file, "wb")) - - assert len(passage_ctx) == len(passage_texts) - assert len(passage_embeddings) == len(passage_texts) - print("staleness", np.array(staleness).mean()) - return staleness - -def main(): - - - embed_filename = offline_eval(args.offline_plan_path, exp_id, compute_embeddings=args.embed) - - #embed_filename = "embed_versions.pkl" - generate_question_data_all(exp_id, embed_filename) - #if args.wandb: - # import wandb - # run = wandb.init(job_type="dataset-creation", project="wiki-workload") - # log_plan_data(run, config, exp_id, output_dir) - - -if __name__ == "__main__": - main() From 1fdb9073ea5cd2fe79ada11a16f24ff47ef86892 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 19 Oct 2021 12:45:39 -0700 Subject: [PATCH 25/26] rename file --- wikipedia/offline/prepare_dpr_data.py | 326 ++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 wikipedia/offline/prepare_dpr_data.py diff --git a/wikipedia/offline/prepare_dpr_data.py b/wikipedia/offline/prepare_dpr_data.py new file mode 100644 index 0000000..5f454a1 --- /dev/null +++ b/wikipedia/offline/prepare_dpr_data.py @@ -0,0 +1,326 @@ +import configparser +from typing import List +import pickle +import shutil +from tqdm import tqdm +import time +import numpy as np +import json +import pandas as pd +import argparse +from statsmodels.tsa.seasonal import STL, DecomposeResult +import json +import os +from collections import defaultdict +from 
multiprocessing import Pool +import torch +from dpr.models import init_biencoder_components +from dpr.options import ( + add_encoder_params, + setup_args_gpu, + print_args, + set_encoder_params_from_state, + add_tokenizer_params, + add_cuda_params, +) +from dpr.utils.model_utils import ( + setup_for_distributed_mode, + load_states_from_checkpoint, + get_model_obj, + move_to_device, +) +from dpr.utils.data_utils import Tensorizer + +from preprocessing.log_data import log_plan_data + +""" + +Script for evaluating plans for the wikipedia edit stream dataset. Writes output files which need to be processed by DPR script. + +Download require data: + * rev_dir = s3://feature-store-datasets/wikipedia/edit_diffs/ + * init_data_file = s3://feature-store-datasets/wikipedia/simulation/init_data.json + * questions_file = ?? + * model_file = s3://feature-store-datasets/wikipedia/models/bert-base-encoder.cp + +Upload results: + * exp_dir = s3://feature-store-datasets/wikipedia/simulation_output/ +""" +# simulation data + +import wandb +run = wandb.init(project='wiki-workload', job_type="dataset-creation") +simulation_dir = run.use_artifact('ucb-ralf/wiki-workload /simulation:v2', type='dataset').download() +question_dir = run.use_artifact('ucb-ralf/wiki-workload /questions:v2', type='dataset').download() + +init_data_file = f"{simulation_dir}/init_data.json" +stream_edits_file = f"{simulation_dir}/edit_stream.json" +stream_questions_file = f"{simulation_dir}/question_stream.json" + +config = configparser.ConfigParser() +config.read("config.yml") +#plan_dir = config["simulation"]["plan_dir"] +#init_data_file = config["simulation"]["init_data_file"] +#stream_edits_file = config["simulation"]["stream_edits_file"] +#stream_questions_file = config["simulation"]["stream_questions_file"] +rev_dir = config['directory']['diff_dir'] +embedding_dir = config['directory']['embedding_dir'] +exp_dir = config['directory']['exp_dir'] +model_file = config['files']['model_file'] + +# Create parser 
def sents_to_passages(sents, num_sent_in_pass=10):
    """Group consecutive sentences into space-joined passages.

    Args:
        sents: Sequence of sentence strings, in document order.
        num_sent_in_pass: Maximum number of sentences per passage. The last
            passage is shorter when ``len(sents)`` is not a multiple of it.

    Returns:
        A list of passage strings; empty when ``sents`` is empty.

    Raises:
        ValueError: If ``num_sent_in_pass`` is not a positive integer.
    """
    # A negative step would make range() yield nothing and silently drop
    # every sentence; a zero step fails with an unrelated-looking message.
    if num_sent_in_pass <= 0:
        raise ValueError(
            f"num_sent_in_pass must be positive, got {num_sent_in_pass}"
        )
    return [
        " ".join(sents[start : start + num_sent_in_pass])
        for start in range(0, len(sents), num_sent_in_pass)
    ]
def get_latest_embedding(timestep, doc_id, embed_versions):
    """Return the newest embedding of ``doc_id`` available at ``timestep``.

    Args:
        timestep: Query time; anything ``float()`` accepts.
        doc_id: Document key to look up inside each version.
        embed_versions: Mapping of version key (a string parseable as a
            float, with ``"0"`` holding the initial state) to a mapping of
            doc id to a dict with ``"passages"``, ``"embeddings"`` and
            ``"rev"`` entries.

    Returns:
        A tuple ``(passages, embeddings, rev, latest)`` where ``latest`` is
        the numeric value of the selected version (``0`` when the initial
        state is used).

    Raises:
        KeyError: If no version at or before ``timestep`` contains
            ``doc_id`` (including the initial ``"0"`` version).
        ValueError: If ``timestep`` is earlier than the initial version.
    """
    now = float(timestep)

    # Version "0" (the initial state) is the fallback when no later version
    # at or before `now` contains the document.
    latest = 0
    latest_key = "0"
    for key, docs in embed_versions.items():
        version = float(key)
        if latest < version <= now and doc_id in docs:
            latest = version
            # Keep the ORIGINAL key string: str(float(key)) does not
            # round-trip for keys such as "1" or "2.50".
            latest_key = key

    docs = embed_versions.get(latest_key)
    if docs is None or doc_id not in docs:
        raise KeyError(f"Missing doc id {doc_id} at version {latest_key}")
    if latest > now:
        raise ValueError(
            f"timestep {timestep} precedes earliest version {latest_key}"
        )

    doc_version = docs[doc_id]
    return (
        doc_version["passages"],
        doc_version["embeddings"],
        doc_version["rev"],
        latest,
    )
def main():
    """Run the offline evaluation pipeline for the plan given on the CLI.

    First builds the per-version embedding table from the offline plan, then
    writes the per-question passage/context/QA files for the experiment.

    Raises:
        SystemExit: If ``offline_eval`` returns no embedding file, which it
            does when an initial document's passage and embedding counts
            disagree.
    """
    embed_filename = offline_eval(
        args.offline_plan_path, exp_id, compute_embeddings=args.embed
    )
    if embed_filename is None:
        # Fail here with a clear message instead of letting every pool
        # worker crash on open(None).
        raise SystemExit(
            "offline_eval did not produce an embedding file; "
            "see the mismatch diagnostics printed above"
        )

    generate_question_data_all(exp_id, embed_filename)
    # TODO: when --wandb is set, log the plan data for this experiment
    # (log_plan_data) once the output directory is wired through.


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
# Download the Wikipedia experiment inputs (diffs, model checkpoint,
# questions, embeddings) from S3 into ${data_dir}.
# Warning: the diffs and embeddings are 100s of GBs.
set -euo pipefail

data_dir=/data/wooders/wikipedia

# download diffs
# Sync into the directory just created under data_dir, not into ./diffs
# relative to wherever the script happens to be run from.
mkdir -p "${data_dir}/diffs"
aws s3 sync s3://feature-store-datasets/wikipedia/diffs "${data_dir}/diffs"

# download model
aws s3 cp s3://feature-store-datasets/wikipedia/models/bert-base-encoder.cp "${data_dir}"

# download questions
aws s3 cp s3://feature-store-datasets/wikipedia/10062021_filtered_questions.csv "${data_dir}"

# download embeddings
mkdir -p "${data_dir}/embeddings"
aws s3 sync s3://feature-store-datasets/wikipedia/embeddings "${data_dir}/embeddings"

## download raw api data
#mkdir -p "${data_dir}/recentchanges"
#mkdir -p "${data_dir}/doc_xml"
#aws s3 sync s3://feature-store-datasets/wikipedia/recentchanges "${data_dir}/recentchanges"
#aws s3 sync s3://feature-store-datasets/wikipedia/doc_xml "${data_dir}/doc_xml"