petuum · christy-yuan-li · Dec 15, 2020 · Dec 15, 2020 · Dec 15, 2020 · Dec 15, 2020
diff --git a/autodist/simulator/README.md b/autodist/simulator/README.md
@@ -0,0 +1,56 @@
+
+The ``simulator`` folder implements the predefined simulator of AutoSync, proposed in [AutoSync: Learning to Synchronize for Data-Parallel Distributed Deep Learning](https://papers.nips.cc/paper/2020/hash/0a2298a72858d90d5c4b4fee954b6896-Abstract.html).
+
+## Download Data 
+Download the data from https://drive.google.com/file/d/1CTtIVORxzF_wOmxrsusbAhNC3bwmxuD8/view?usp=sharing.
+
+The data folder is organized by ML model category. For each ML model, the simulation is conducted on two kinds of clusters (AWS and an in-house cluster). Each data sample comprises a `<resource specification, runtime, strategy>` triple. Within a cluster folder, the single resource specification file applies to every runtime and strategy in the ``runtime`` and ``strategies`` folders, respectively. The detailed data organization is:  
+
+    Model-1/ (e.g., BERT-large)
+        Cluster-1/ (e.g., AWS-4-g4)
+            resource_spec.yml
+            runtime/
+                <ID>.yml
+            strategies/
+                <ID>  
+        Cluster-2/ (e.g., In-house-11-nodes)
+            resource_spec.yml
+            runtime/
+                <ID>.yml
+            strategies/
+                <ID>  
+    Model-2 
+        ......
+
+    Model-3 
+        ...... 
+
+
+## Train a predefined simulator
+
+Work inside the ``autodist/simulator`` folder. 
+
+Define the configuration in ``config.py``, including the model to simulate and the data folders (samples) to use. 
+
+Run: ``python absolute_dir/train_predefined_simulator_clean.py`` 
+
+
+## Simulate (infer) a strategy 
+
+Work inside the ``autodist/simulator`` folder. 
+
+Define the strategy to simulate and the checkpoint to load in ``simulate.py``. 
+
+Run: ``python simulate.py``. 
+
+
+## Read a strategy 
+
+Use 
+
+``from autodist.strategy import base``
+
+``strategy = base.Strategy.deserialize(strategy_file)`` 
+
+to read a strategy from ``strategy_file``. 
+
diff --git a/autodist/simulator/__init__.py b/autodist/simulator/__init__.py
diff --git a/autodist/simulator/config.py b/autodist/simulator/config.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+
+
+GRAPH_ITEM_DIR = f'{str(Path.home())}/graph_items'  # <home>/graph_items: base dir of the 'original_graph_item_path' entries below
+SIMULATION_DATA_DIR = f'{str(Path.home())}/autosync_dataset_release'  # <home>/autosync_dataset_release: base dir of every 'data_dir' entry below
+CHECKPOINT_DIR =  f'{str(Path.home())}'  # the home directory itself: base dir of the 'save_dir' and 'checkpoint' entries below
+
+
+simulation_params = {  # per-model simulator settings, keyed by model name
+	'ncf_large_adam_dense': {
+		'model_batch_size': 256,
+		'model_seq_len': 1,
+		'data_dir': [  # sample folders, all rooted at SIMULATION_DATA_DIR
+			f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1',
+			f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2',
+			f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_2',
+			f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_g3.4.25.3_g3.4.25.4_3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9',
+			f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9',
+        ],
+		'original_graph_item_path': f'{GRAPH_ITEM_DIR}/ncf_original_graph_item',
+        'save_dir': f'{CHECKPOINT_DIR}/ncf_predefined_checkpoints',
+		'save_prefix': 'ckpV1_ncf_large_adam_dense',
+		'baseline': 0.15,
+		'scale': 0.5,
+		'learning_rate': 0.01,
+		'list_size': 2,
+		'batch_size': 100,
+		'ranking_loss_key': 'pairwise_logistic_loss',
+		'model_version': 'v1',
+		'do_train': False,
+		'do_test': True,
+		'checkpoint': f'{CHECKPOINT_DIR}/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_all_600_0.83249_0.84517',  # NOTE(review): 'ncf/predefined_checkpoints' differs from save_dir's 'ncf_predefined_checkpoints' - confirm intended
+	},
+	'bert': {
+		'model_batch_size': 32,
+		'model_seq_len': 128,
+		'data_dir': [  # sample folders, all rooted at SIMULATION_DATA_DIR
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_random_orca_11',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-13_trial-100-_expolre-3000_model-on-bert-orca_embedding_sim-weight-0.3_max-par-20_if-partition-lb-100000-zhijie',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_test_run',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-3.5_trial-50_expolre-1000_model-new-2_embedding_sim-1.0',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-8_trial-30_expolre-1000_model-new-3_embedding_sim-0.2_ps-only',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-8_trial-50_expolre-1000_model-new-3_embedding_sim-0.4_ps-only',
+			f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-13_trial-20-_expolre-100_model-on-bert-orca_embedding_sim-weight-0.3_max-par-20_if-partition-lb-100000',
+			f'{SIMULATION_DATA_DIR}/bert/bert-aws/bert-large-aws4g4',
+			f'{SIMULATION_DATA_DIR}/bert/bert-aws/bert_large_random_search_aws_4_ps_only',
+		],
+        'original_graph_item_path': f'{GRAPH_ITEM_DIR}/bert_original_graph_item_large',
+        'save_dir': f'{CHECKPOINT_DIR}/bert_predefined_checkpoints',
+		'save_prefix': 'ckpV1_bert_orca',
+		'baseline': 0.04,
+		'scale': 0.5,
+		'learning_rate': 0.01,
+		'list_size': 2,
+		'batch_size': 100,
+		'ranking_loss_key': 'pairwise_logistic_loss',
+		'do_train': True,
+		'do_test': True,
+		'model_version': 'v1',
+		'checkpoint': f'{CHECKPOINT_DIR}//bert_predefined_checkpoints/ckpV1_bert_orca_100_0.67000_0.50000',  # NOTE(review): '//' after CHECKPOINT_DIR looks like a typo for '/' - confirm
+	},
+	'vgg16': {
+		'model_batch_size': 32,
+		'model_seq_len': 1,
+		'data_dir': [  # sample folders, all rooted at SIMULATION_DATA_DIR
+			f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws4_from_vgg16-orca2aws-421_explore3000',
+			f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws-4_model-aws-new_rejection-4_explore-3000_sim-weight-0.75',
+			f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws-4_model-aws-only_rejection-8_explore-3000_sim-weight-0.3',
+			f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws_4_pure_random',
+		],
+		'original_graph_item_path': f'{GRAPH_ITEM_DIR}/vgg16_original_graph_item',
+		'save_dir': f'{CHECKPOINT_DIR}/vgg16_predefined_checkpoints',
+		'save_prefix': 'ckpV1_vgg_aws',
+		'baseline': 0.0,
+		'scale': 0.5,
+		'do_train': True,
+		'do_test': True,
+		'model_version': 'v1',
+		'learning_rate': 0.01,
+		'list_size': 2,
+		'batch_size': 100,
+		'ranking_loss_key': 'pairwise_logistic_loss',
+		'checkpoint': '',  # empty string: no checkpoint path is given for this model
+	},
+	'resnet101': {  # NOTE(review): omits original_graph_item_path, save_dir, save_prefix, do_train, do_test, model_version and checkpoint, which the other entries define
+		'model_batch_size': 32,
+		'model_seq_len': 1,
+		'baseline': 0.5,
+		'scale': 0.5,
+		'data_dir': '',  # NOTE(review): an empty string here, but a list of folders in every other entry - confirm consumers accept both
+		'learning_rate': 0.01,
+		'list_size': 2,
+		'batch_size': 100,
+		'ranking_loss_key': 'pairwise_logistic_loss',
+	},
+}
diff --git a/autodist/simulator/models/__init__.py b/autodist/simulator/models/__init__.py