Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions autodist/simulator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

The ``simulator`` folder implements the predefined simulator of AutoSync, proposed in: [AutoSync: Learning to Synchronize for Data-Parallel Distributed Deep Learning](https://papers.nips.cc/paper/2020/hash/0a2298a72858d90d5c4b4fee954b6896-Abstract.html).

## Download Data
Download the data from https://drive.google.com/file/d/1CTtIVORxzF_wOmxrsusbAhNC3bwmxuD8/view?usp=sharing.

The data folder is organized by ML model categories. For each ML model, the simulation is conducted on two kinds of clusters (AWS and an in-house cluster). Each data sample is a <resource specification, runtime, strategy> triple. Within a cluster folder, the single resource specification file applies to every runtime and strategy stored in that cluster's ``runtime`` and ``strategies`` folders. The detailed data organization is:

Model-1/ (e.g., BERT-large)
Cluster-1/ (e.g., AWS-4-g4)
resource_spec.yml
runtime/
<ID>.yml
strategies/
<ID>
Cluster-2 (e.g., In-house-11-nodes)
resource_spec.yml
runtime/
<ID>.yml
strategies/
<ID>
Model-2
......

Model-3
......


## Train a predefined simulator

Work inside the ``autodist/simulator`` folder.

Define the configuration in ``config.py``, including the model to simulate and the data folders (samples) to use.

Run: ``python absolute_dir/train_predefined_simulator_clean.py``


## Simulate (infer) a strategy

Work inside the ``autodist/simulator`` folder.

Define the strategy to simulate and the checkpoint to load in ``simulate.py``.

Run: ``python simulate.py``.


## Read a strategy

Use

``from autodist.strategy import base``

``strategy = base.Strategy.deserialize(strategy_file)``

to read a strategy from ``strategy_file``.

Empty file added autodist/simulator/__init__.py
Empty file.
96 changes: 96 additions & 0 deletions autodist/simulator/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from pathlib import Path


# Base locations, all rooted at the current user's home directory.
# They are kept as plain strings (not Path objects) because every path below
# is assembled with f-strings.
GRAPH_ITEM_DIR = f'{Path.home()}/graph_items'
SIMULATION_DATA_DIR = f'{Path.home()}/autosync_dataset_release'
CHECKPOINT_DIR = str(Path.home())


# Per-model simulator settings, keyed by model name.
#
# Keys used by the entries below:
#   model_batch_size / model_seq_len  - batch size and sequence length of the
#                                       simulated model
#   data_dir                          - list of sample folders to load
#   original_graph_item_path          - path of the model's original graph item
#   save_dir / save_prefix            - where checkpoints are written and how
#                                       they are named
#   checkpoint                        - checkpoint to load ('' when none is set)
#   do_train / do_test                - presumably toggle the train / test
#                                       phases; verify against the training
#                                       script
#   baseline, scale, learning_rate, list_size, batch_size, ranking_loss_key,
#   model_version                     - hyper-parameters consumed by the
#                                       training script (semantics are not
#                                       visible from this file)
#
# NOTE: the folder names under ``data_dir`` (including spellings such as
# "expolre") are left verbatim because they presumably name folders of the
# released dataset.
simulation_params = {
    'ncf_large_adam_dense': {
        'model_batch_size': 256,
        'model_seq_len': 1,
        'data_dir': [
            f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1',
            f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2',
            f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_2',
            f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_g3.4.25.3_g3.4.25.4_3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9',
            f'{SIMULATION_DATA_DIR}/cluster1/ncf_large_adam_dense_g3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9',
        ],
        'original_graph_item_path': f'{GRAPH_ITEM_DIR}/ncf_original_graph_item',
        'save_dir': f'{CHECKPOINT_DIR}/ncf_predefined_checkpoints',
        'save_prefix': 'ckpV1_ncf_large_adam_dense',
        'baseline': 0.15,
        'scale': 0.5,
        'learning_rate': 0.01,
        'list_size': 2,
        'batch_size': 100,
        'ranking_loss_key': 'pairwise_logistic_loss',
        'model_version': 'v1',
        'do_train': False,
        'do_test': True,
        # NOTE(review): this points at "ncf/predefined_checkpoints" while
        # save_dir above is "ncf_predefined_checkpoints" - confirm which
        # location is intended before relying on this checkpoint.
        'checkpoint': f'{CHECKPOINT_DIR}/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_all_600_0.83249_0.84517',
    },
    'bert': {
        'model_batch_size': 32,
        'model_seq_len': 128,
        'data_dir': [
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_random_orca_11',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-13_trial-100-_expolre-3000_model-on-bert-orca_embedding_sim-weight-0.3_max-par-20_if-partition-lb-100000-zhijie',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_test_run',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-3.5_trial-50_expolre-1000_model-new-2_embedding_sim-1.0',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-8_trial-30_expolre-1000_model-new-3_embedding_sim-0.2_ps-only',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-8_trial-50_expolre-1000_model-new-3_embedding_sim-0.4_ps-only',
            f'{SIMULATION_DATA_DIR}/cluster2/bert_large_orca_11_random_rej-13_trial-20-_expolre-100_model-on-bert-orca_embedding_sim-weight-0.3_max-par-20_if-partition-lb-100000',
            f'{SIMULATION_DATA_DIR}/bert/bert-aws/bert-large-aws4g4',
            f'{SIMULATION_DATA_DIR}/bert/bert-aws/bert_large_random_search_aws_4_ps_only',
        ],
        'original_graph_item_path': f'{GRAPH_ITEM_DIR}/bert_original_graph_item_large',
        'save_dir': f'{CHECKPOINT_DIR}/bert_predefined_checkpoints',
        'save_prefix': 'ckpV1_bert_orca',
        'baseline': 0.04,
        'scale': 0.5,
        'learning_rate': 0.01,
        'list_size': 2,
        'batch_size': 100,
        'ranking_loss_key': 'pairwise_logistic_loss',
        'do_train': True,
        'do_test': True,
        'model_version': 'v1',
        # Single separator after CHECKPOINT_DIR so the path lines up with
        # save_dir (the previous value contained a stray "//").
        'checkpoint': f'{CHECKPOINT_DIR}/bert_predefined_checkpoints/ckpV1_bert_orca_100_0.67000_0.50000',
    },
    'vgg16': {
        'model_batch_size': 32,
        'model_seq_len': 1,
        'data_dir': [
            f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws4_from_vgg16-orca2aws-421_explore3000',
            f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws-4_model-aws-new_rejection-4_explore-3000_sim-weight-0.75',
            f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws-4_model-aws-only_rejection-8_explore-3000_sim-weight-0.3',
            f'{SIMULATION_DATA_DIR}/cluster1/vgg16_aws_4_pure_random',
        ],
        'original_graph_item_path': f'{GRAPH_ITEM_DIR}/vgg16_original_graph_item',
        'save_dir': f'{CHECKPOINT_DIR}/vgg16_predefined_checkpoints',
        'save_prefix': 'ckpV1_vgg_aws',
        'baseline': 0.0,
        'scale': 0.5,
        'do_train': True,
        'do_test': True,
        'model_version': 'v1',
        'learning_rate': 0.01,
        'list_size': 2,
        'batch_size': 100,
        'ranking_loss_key': 'pairwise_logistic_loss',
        'checkpoint': '',
    },
    # NOTE(review): this entry has no sample folders and lacks the path,
    # checkpoint, do_train/do_test and model_version keys the other models
    # define - it looks like a placeholder; fill it in before selecting it.
    'resnet101': {
        'model_batch_size': 32,
        'model_seq_len': 1,
        'baseline': 0.5,
        'scale': 0.5,
        # Empty list (not '') so data_dir has the same type for every model.
        'data_dir': [],
        'learning_rate': 0.01,
        'list_size': 2,
        'batch_size': 100,
        'ranking_loss_key': 'pairwise_logistic_loss',
    },
}
Empty file.
Loading