diff --git a/CMakeLists.txt b/CMakeLists.txt index cc629b75..f51b9f51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,11 +26,12 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # 2 This line for cluster #SET (CMAKE_CXX_FLAGS "-std=gnu++17 -Wall -O3 -funroll-loops -msse3 -fno-omit-frame-pointer -D_GLIBCXX_DEBUG") -SET (CMAKE_CXX_FLAGS " -Wall -O3 -funroll-loops -msse3") +# SET (CMAKE_CXX_FLAGS " -Wall -O3 -funroll-loops -msse3 -fsanitize=address") +SET (CMAKE_CXX_FLAGS " -Wall -march=native -O3") #SET (CMAKE_CXX_FLAGS " -Wall -O1 -funroll-loops -msse3 -g -fsanitize=address -fno-omit-frame-pointer -shared-libasan -DGLIBCXX_DEBUG") #SET (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g") #SET (CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -Os -DNDEBUG") -SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O4 -funroll-loops -DNDEBUG") +SET (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG") #SET (CMAKE_SHARED_LINKER_FLAGS " -Wall -O1 -funroll-loops -msse3 -g -fsanitize=address -fno-omit-frame-pointer -shared-libasan -DGLIBCXX_DEBUG") diff --git a/README.md b/README.md index 7ecc5b2c..b297bbc6 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,6 @@ Update the code format before start a pull request with: ~/exact $ sh scripts/util/format.sh ``` - You may also want to have graphviz installed so you can generate images of the evolved neural networks. EXACT/EXALT/EXAMM will write out evolved genomes in a .gv (graphviz) format for this. 
For example, can generate a pdf from a gv file (assuming graphviz is installed with): ``` diff --git a/common/files.hxx b/common/files.hxx index ac23ff0d..8c4c8a43 100644 --- a/common/files.hxx +++ b/common/files.hxx @@ -1,6 +1,8 @@ #ifndef EXACT_BOINC_COMMON_HXX #define EXACT_BOINC_COMMON_HXX +#include + #include using std::runtime_error; diff --git a/common/log.cxx b/common/log.cxx index 623475e8..6f82e67f 100644 --- a/common/log.cxx +++ b/common/log.cxx @@ -79,11 +79,11 @@ int8_t Log::parse_level_from_string(string level) { void Log::initialize(const vector& arguments) { // TODO: should read these from the CommandLine (to be created) - string std_message_level_str, file_message_level_str; + string std_message_level_str = "INFO", file_message_level_str = "NONE"; - get_argument(arguments, "--std_message_level", true, std_message_level_str); - get_argument(arguments, "--file_message_level", true, file_message_level_str); - get_argument(arguments, "--output_directory", true, output_directory); + get_argument(arguments, "--std_message_level", false, std_message_level_str); + get_argument(arguments, "--file_message_level", false, file_message_level_str); + get_argument(arguments, "--output_directory", false, output_directory); std_message_level = parse_level_from_string(std_message_level_str); file_message_level = parse_level_from_string(file_message_level_str); diff --git a/common/process_arguments.cxx b/common/process_arguments.cxx index b4257708..fe2a49f0 100644 --- a/common/process_arguments.cxx +++ b/common/process_arguments.cxx @@ -1,3 +1,4 @@ +#include #include using std::string; @@ -120,12 +121,15 @@ IslandSpeciationStrategy* generate_island_speciation_strategy_from_arguments( get_argument(arguments, "--seed_stirs", false, seed_stirs); bool start_filled = argument_exists(arguments, "--start_filled"); bool tl_epigenetic_weights = argument_exists(arguments, "--tl_epigenetic_weights"); + unique_ptr annealing_policy = AnnealingPolicy::from_arguments(arguments); + 
string output_directory = ""; + get_argument(arguments, "--output_directory", false, output_directory); IslandSpeciationStrategy* island_strategy = new IslandSpeciationStrategy( - number_islands, island_size, mutation_rate, intra_island_co_rate, inter_island_co_rate, seed_genome, + number_islands, island_size, mutation_rate, intra_island_co_rate, inter_island_co_rate, output_directory, seed_genome, island_ranking_method, repopulation_method, extinction_event_generation_number, num_mutations, islands_to_exterminate, max_genomes, repeat_extinction, start_filled, transfer_learning, - transfer_learning_version, seed_stirs, tl_epigenetic_weights + transfer_learning_version, tl_epigenetic_weights, annealing_policy ); return island_strategy; @@ -189,10 +193,10 @@ void get_train_validation_data( time_series_sets->export_training_series(time_offset, train_inputs, train_outputs); time_series_sets->export_test_series(time_offset, validation_inputs, validation_outputs); - int32_t sequence_length = 0; - if (get_argument(arguments, "--train_sequence_length", false, sequence_length)) { - Log::info("Slicing input training data with time sequence length: %d\n", sequence_length); - slice_input_data(train_inputs, train_outputs, sequence_length); + int32_t train_sequence_length = 0; + if (get_argument(arguments, "--train_sequence_length", false, train_sequence_length)) { + Log::info("Slicing input training data with time sequence length: %d\n", train_sequence_length); + slice_input_data(train_inputs, train_outputs, train_sequence_length); } int32_t validation_sequence_length = 0; diff --git a/examm/CMakeLists.txt b/examm/CMakeLists.txt index d5c532f9..2f5942b7 100644 --- a/examm/CMakeLists.txt +++ b/examm/CMakeLists.txt @@ -1 +1 @@ -add_library(examm_strategy examm.cxx species.cxx island.cxx island_speciation_strategy.cxx species.cxx neat_speciation_strategy.cxx) +add_library(examm_strategy examm.cxx species.cxx island.cxx island_speciation_strategy.cxx species.cxx 
neat_speciation_strategy.cxx annealing.cxx) diff --git a/examm/annealing.cxx b/examm/annealing.cxx new file mode 100644 index 00000000..6d0b5868 --- /dev/null +++ b/examm/annealing.cxx @@ -0,0 +1,87 @@ +#include "annealing.hxx" + +#include +#include + +#include "common/arguments.hxx" +#include "common/log.hxx" + +unique_ptr AnnealingPolicy::from_arguments(const vector& arguments) { + string type; + get_argument(arguments, "--annealing_policy", false, type); + Log::info("Annealing policy = %s\n", type.c_str()); + if (type == "linear") { + return unique_ptr(new LinearAnnealingPolicy(arguments)); + } else if (type == "inv_exp") { + return unique_ptr(new InvExpAnnealingPolicy(arguments)); + } else if (type == "sin") { + return unique_ptr(new SinAnnealingPolicy(arguments)); + } else { + Log::info("Using default annealing policy\n"); + return make_unique(); + } +} + +double AnnealingPolicy::operator()(int32_t genome_number) { + return 0.0; +} + +LinearAnnealingPolicy::LinearAnnealingPolicy( + double start_value, double end_value, int32_t start_genomes, int32_t interp_genomes +) + : start_value(start_value), end_value(end_value), start_genomes(start_genomes), interp_genomes(interp_genomes) { +} + +LinearAnnealingPolicy::LinearAnnealingPolicy(const vector& arguments) { + get_argument(arguments, "--linear_start_value", true, start_value); + get_argument(arguments, "--linear_end_value", true, end_value); + get_argument(arguments, "--linear_start_genomes", true, start_genomes); + get_argument(arguments, "--linear_interp_genomes", true, interp_genomes); +} + +double LinearAnnealingPolicy::operator()(int32_t genome_number) { + if (genome_number <= start_genomes) { + return start_value; + } else if (genome_number <= interp_genomes + start_genomes) { + // weight is the fraction of the interpolation window already elapsed, so it runs from 0 to 1 + double weight = (double) (genome_number - start_genomes) / (double) interp_genomes; + return weight * end_value + (1 - weight) * start_value; + } else { + return end_value; + } +} + 
+InvExpAnnealingPolicy::InvExpAnnealingPolicy(double decay_factor) : decay_factor(decay_factor) { +} +InvExpAnnealingPolicy::InvExpAnnealingPolicy(const vector& arguments) { + get_argument(arguments, "--exp_decay_factor", true, decay_factor); +} + +double InvExpAnnealingPolicy::operator()(int32_t genome_number) { + return std::pow(1. + genome_number, -decay_factor); +} + +SinAnnealingPolicy::SinAnnealingPolicy(double period, double min_p, double max_p) + : period(period), min_p(min_p), max_p(max_p) { + if (min_p > max_p) { + std::swap(this->min_p, this->max_p); // swap the stored members; the bare names are the shadowing parameters + } + + if (min_p > 1.0 || min_p < 0.0) { + throw "Invalid min_p supplied to SinAnnealingPolicyConstructor"; + } + if (max_p > 1.0 || max_p < 0.0) { + throw "Invalid max_p supplied to SinAnnealingPolicyConstructor"; + } +} +SinAnnealingPolicy::SinAnnealingPolicy(const vector& arguments) { + get_argument(arguments, "--sin_min_p", true, min_p); + get_argument(arguments, "--sin_max_p", true, max_p); + get_argument(arguments, "--sin_period", true, period); +} + +double SinAnnealingPolicy::operator()(int32_t genome_number) { + double range = max_p - min_p; + + return (max_p + min_p) / 2. + range / 2. * std::sin(2. * M_PI * genome_number / period); +} diff --git a/examm/annealing.hxx b/examm/annealing.hxx new file mode 100644 index 00000000..4406610c --- /dev/null +++ b/examm/annealing.hxx @@ -0,0 +1,74 @@ +#ifndef EXAMM_ANNEALING_HXX +#define EXAMM_ANNEALING_HXX + +#include + +#include +using std::unique_ptr; + +#include +using std::string; + +#include +using std::vector; + +/** Base annealing policy; its own operator() always returns 0.0, derived policies override it. */ +struct AnnealingPolicy { + static unique_ptr from_arguments(const vector& arguments); + + // Policies are created as derived types but owned through a base-class pointer, so destruction must be virtual. + virtual ~AnnealingPolicy() = default; + + /** + * Compute the probability to be used during genome insertion. + * This represents the probability of inserting the genome, even if it + * has a fitness value that is worse than the worst member in the population. + */ + virtual double operator()(int32_t genome_number); +}; + +/** + * Interpolate between two values for a set number of genomes. 
+ * The `start_value` will be returned for `start_genomes`, + * then a linear interpolation of `start_value` and `end_value` for + * `interp_genomes`. Then, `end_value` is given indefinitely. + */ +class LinearAnnealingPolicy : public AnnealingPolicy { + double start_value, end_value; + int32_t start_genomes, interp_genomes; + + public: + LinearAnnealingPolicy(double start_value, double end_value, int32_t start_genomes, int32_t interp_genomes); + LinearAnnealingPolicy(const vector& arguments); + + virtual double operator()(int32_t genome_number); +}; + +/** + * Calculates p as `(1 + genome_number)^(-decay_factor)`. + **/ +class InvExpAnnealingPolicy : public AnnealingPolicy { + double decay_factor; + + public: + InvExpAnnealingPolicy(double decay_factor); + InvExpAnnealingPolicy(const vector& arguments); + + virtual double operator()(int32_t genome_number); +}; + +/** + * Computes `p` as a value falling on a sinusoidal curve with the supplied period. + * a `min_p` and a `max_p` specify the range of the curve. + **/ +class SinAnnealingPolicy : public AnnealingPolicy { + double period, min_p, max_p; + + public: + SinAnnealingPolicy(double period, double min_p, double max_p); + SinAnnealingPolicy(const vector& arguments); + + virtual double operator()(int32_t genome_number); +}; + +#endif diff --git a/examm/examm.cxx b/examm/examm.cxx index 5588c349..7d9e7cd4 100644 --- a/examm/examm.cxx +++ b/examm/examm.cxx @@ -96,8 +96,10 @@ void EXAMM::generate_log() { Log::info("Generating fitness log\n"); mkpath(output_directory.c_str(), 0777); log_file = new ofstream(output_directory + "/" + "fitness_log.csv"); - (*log_file) << "Inserted Genomes, Total BP Epochs, Time, Best Val. MAE, Best Val. MSE, Enabled Nodes, Enabled " - "Edges, Enabled Rec. Edges"; + (*log_file + ) << "Inserted Genomes,Total BP Epochs,Time,Best Val. MAE,Best Val. MSE,Trainable Parameters,Enabled " + "Nodes,Enabled " + "Edges,Enabled Rec. Edges,Val. 
MSE,Pre-Insert MSE,Genome Inserted,Genome Trainable Parameters,Island Id"; (*log_file) << speciation_strategy->get_strategy_information_headers(); (*log_file) << endl; @@ -152,7 +154,7 @@ void EXAMM::update_op_log_statistics(RNN_Genome* genome, int32_t insert_position } } -void EXAMM::update_log() { +void EXAMM::update_log(RNN_Genome* genome) { if (log_file != NULL) { // make sure the log file is still good if (!log_file->good()) { @@ -183,17 +185,27 @@ void EXAMM::update_log() { } (*op_log_file) << endl; } + RNN_Genome* best_genome = get_best_genome(); if (best_genome == NULL) { best_genome = speciation_strategy->get_global_best_genome(); } + std::chrono::time_point currentClock = std::chrono::system_clock::now(); long milliseconds = std::chrono::duration_cast(currentClock - startClock).count(); (*log_file) << speciation_strategy->get_evaluated_genomes() << "," << total_bp_epochs << "," << milliseconds << "," << best_genome->best_validation_mae << "," << best_genome->best_validation_mse << "," - << best_genome->get_enabled_node_count() << "," << best_genome->get_enabled_edge_count() << "," - << best_genome->get_enabled_recurrent_edge_count() - << speciation_strategy->get_strategy_information_values() << endl; + << best_genome->get_number_weights() << "," << best_genome->get_enabled_node_count() << "," + << best_genome->get_enabled_edge_count() << "," << best_genome->get_enabled_recurrent_edge_count() + << "," << genome->best_validation_mse << "," << pre_insert_best_mse << "," + << (int32_t) (last_genome_inserted ? 
1 : 0) << "," << genome->get_number_weights() << "," + << genome->get_generation_id() << speciation_strategy->get_strategy_information_values(genome) + << endl; + Log::info( + "mse: %f node count: %d edge count: %d rec edges: %d\n", best_genome->best_validation_mse, + best_genome->get_enabled_node_count(), best_genome->get_enabled_edge_count(), + best_genome->get_enabled_recurrent_edge_count() + ); } } @@ -246,15 +258,17 @@ bool EXAMM::insert_genome(RNN_Genome* genome) { // updates EXAMM's mapping of which genomes have been generated by what genome->update_generation_map(generated_from_map); + pre_insert_best_mse = this->get_best_fitness(); + int32_t insert_position = speciation_strategy->insert_genome(genome); Log::info("insert to speciation strategy complete, at position: %d\n", insert_position); // write this genome to disk if it was a new best found genome if (save_genome_option.compare("all_best_genomes") == 0) { - Log::info("save genome option compared, save genome option size: %d!\n", save_genome_option.size()); - for (int i = 0; i < 20 && i < save_genome_option.size(); i++) { - cout << "save_genome_option[" << i << "]: " << save_genome_option[i] << endl; - } + // Log::info("save genome option compared, save genome option size: %d!\n", save_genome_option.size()); + // for (int i = 0; i < 20 && i < save_genome_option.size(); i++) { + // cout << "save_genome_option[" << i << "]: " << save_genome_option[i] << endl; + // } if (insert_position == 0) { Log::info("saving genome!"); @@ -264,16 +278,26 @@ bool EXAMM::insert_genome(RNN_Genome* genome) { } Log::info("save genome complete\n"); + last_genome_inserted = insert_position >= 0; + speciation_strategy->print(); - Log::info("printed speciation strategy!\n\n"); update_op_log_statistics(genome, insert_position); - update_log(); - return insert_position >= 0; + update_log(genome); + + return last_genome_inserted; } // write function to save genomes to file void EXAMM::save_genome(RNN_Genome* genome, string 
genome_name = "rnn_genome") { + if (genome->get_fitness() != EXAMM_MAX_DOUBLE) { + // need to set the weights for non-initial genomes so we + // can generate a proper graphviz file + vector best_parameters = genome->get_best_parameters(); + genome->set_weights(best_parameters); + Log::info("set genome parameters to best\n"); + } + genome->write_graphviz(output_directory + "/" + genome_name + "_" + to_string(genome->get_generation_id()) + ".gv"); ofstream equations_filestream( output_directory + "/" + genome_name + "_" + to_string(genome->get_generation_id()) + ".txt" @@ -314,7 +338,7 @@ RNN_Genome* EXAMM::generate_genome() { return genome; } -int32_t EXAMM::get_random_node_type() { +node_t EXAMM::get_random_node_type() { return possible_node_types[rng_0_1(generator) * possible_node_types.size()]; } @@ -354,7 +378,8 @@ void EXAMM::mutate(int32_t max_mutations, RNN_Genome* g) { g->assign_reachability(); double rng = rng_0_1(generator) * total; - int32_t new_node_type = get_random_node_type(); + node_t new_node_type = get_random_node_type(); + Log::info("%d %d\n", new_node_type, NODE_TYPES.size()); string node_type_str = NODE_TYPES[new_node_type]; Log::debug("rng: %lf, total: %lf, new node type: %d (%s)\n", rng, total, new_node_type, node_type_str.c_str()); diff --git a/examm/examm.hxx b/examm/examm.hxx index 3a7288ca..3ec90bd9 100644 --- a/examm/examm.hxx +++ b/examm/examm.hxx @@ -63,21 +63,22 @@ class EXAMM { double split_node_rate; double merge_node_rate; - vector possible_node_types = {SIMPLE_NODE, JORDAN_NODE, ELMAN_NODE, UGRNN_NODE, - MGU_NODE, GRU_NODE, DELTA_NODE, LSTM_NODE}; + vector possible_node_types = {SIMPLE_NODE, JORDAN_NODE, ELMAN_NODE, UGRNN_NODE, + MGU_NODE, GRU_NODE, DELTA_NODE, LSTM_NODE}; vector op_log_ordering; map inserted_counts; map generated_counts; - string output_directory; + const string output_directory; ofstream* log_file; ofstream* op_log_file; + double pre_insert_best_mse = 1000000; + bool last_genome_inserted = false; 
std::chrono::time_point startClock; - string genome_file_name; - string save_genome_option; + const string save_genome_option; public: EXAMM( @@ -89,13 +90,13 @@ class EXAMM { ~EXAMM(); void print(); - void update_log(); + void update_log(RNN_Genome* genome); void set_possible_node_types(vector possible_node_type_strings); uniform_int_distribution get_recurrent_depth_dist(); - int32_t get_random_node_type(); + node_t get_random_node_type(); RNN_Genome* generate_genome(); bool insert_genome(RNN_Genome* genome); diff --git a/examm/island.cxx b/examm/island.cxx index 6d8b0b5f..eb39ed98 100644 --- a/examm/island.cxx +++ b/examm/island.cxx @@ -1,22 +1,12 @@ #include -using std::lower_bound; -using std::sort; +#include using std::upper_bound; #include -using std::setw; - -#include -using std::minstd_rand0; -using std::uniform_real_distribution; - #include using std::string; using std::to_string; -#include -using std::unordered_map; - #include using std::vector; @@ -24,17 +14,25 @@ using std::vector; #include "island.hxx" #include "rnn/rnn_genome.hxx" -Island::Island(int32_t _id, int32_t _max_size) - : id(_id), max_size(_max_size), status(Island::INITIALIZING), erase_again(0), erased(false) { +Island::Island( + int32_t id, int32_t max_size, vector genomes, int32_t status, AnnealingPolicy& annealing_policy +) + : id(id), max_size(max_size), genomes(genomes), annealing_policy(annealing_policy), status(status) { + using namespace std::chrono; + long long t = time_point_cast(system_clock::now()).time_since_epoch().count(); + generator = mt19937_64(t + 1123 * id + 12334 * max_size); + + for (int i = 0; i < 100; i++) { + generate_canonical(generator); + } +} + +Island::Island(int32_t id, int32_t max_size, AnnealingPolicy& annealing_policy) + : Island(id, max_size, vector(), Island::INITIALIZING, annealing_policy) { } -Island::Island(int32_t _id, vector _genomes) - : id(_id), - max_size((int32_t) _genomes.size()), - genomes(_genomes), - status(Island::FILLED), - erase_again(0), 
- erased(false) { +Island::Island(int32_t id, vector genomes, AnnealingPolicy& annealing_policy) + : Island(id, genomes.size(), genomes, Island::FILLED, annealing_policy) { } RNN_Genome* Island::get_best_genome() { @@ -62,6 +60,14 @@ double Island::get_best_fitness() { } } +double Island::get_best_all_time_fitness() { + if (all_time_local_best) { + return all_time_local_best->get_fitness(); + } else { + return EXAMM_MAX_DOUBLE; + } +} + double Island::get_worst_fitness() { RNN_Genome* worst_genome = get_worst_genome(); if (worst_genome == NULL) { @@ -147,6 +153,22 @@ int32_t Island::insert_genome(RNN_Genome* genome) { double new_fitness = genome->get_fitness(); Log::info("inserting genome with fitness: %s to island %d\n", parse_fitness(genome->get_fitness()).c_str(), id); + // Only do simulated annealing if the island is full + // This will with a probability prescribed by the annealing policy (a function of genome number) randomly accept + // genomes by deleting a random member of the population. 
+ double p = annealing_policy(genome->get_generation_id()); + Log::info("Annealing policy p = %f\n", p); + + if (is_full() && uniform_real_distribution<>(0.0, 1.0)(generator) < p) { + int32_t index = uniform_real_distribution<>(0., 1.)(generator) * genomes.size(); + + Log::info("Simulated annealing triggered - deleting a random genome %d\n", index); + RNN_Genome* victim = genomes[index]; + genomes.erase(genomes.begin() + index); + structure_set.erase(victim); + delete victim; // the island owns its genomes; dropping only the pointer would leak the evicted one + } + // discard the genome if the island is full and it's fitness is worse than the worst in thte population if (is_full() && new_fitness > get_worst_fitness()) { Log::debug( @@ -154,116 +176,38 @@ int32_t Island::insert_genome(RNN_Genome* genome) { genomes.back()->get_fitness() ); do_population_check(__LINE__, initial_size); - return false; + return -1; } // check and see if the structural hash of the genome is in the // set of hashes for this population Log::info("getting structural hash\n"); - string structural_hash = genome->get_structural_hash(); - if (structure_map.count(structural_hash) > 0) { - vector& potential_matches = structure_map.find(structural_hash)->second; - Log::debug( - "potential duplicate for hash '%s', had %d potential matches.\n", structural_hash.c_str(), - potential_matches.size() - ); - - for (auto potential_match = potential_matches.begin(); potential_match != potential_matches.end();) { - Log::debug( - "on potential match %d of %d\n", potential_match - potential_matches.begin(), potential_matches.size() - ); - if ((*potential_match)->equals(genome)) { - if ((*potential_match)->get_fitness() > new_fitness) { - Log::debug( - "REPLACING DUPLICATE GENOME, fitness of genome in search: %s, new fitness: %s\n", - parse_fitness((*potential_match)->get_fitness()).c_str(), - parse_fitness(genome->get_fitness()).c_str() - ); - // we have an exact match for this genome in the island and its fitness is worse - // than the genome we're trying to remove, so remove the duplicate it from the 
genomes - // as well from the potential matches vector - - auto duplicate_genome_iterator = - lower_bound(genomes.begin(), genomes.end(), *potential_match, sort_genomes_by_fitness()); - bool found = false; - for (; duplicate_genome_iterator != genomes.end(); duplicate_genome_iterator++) { - Log::debug( - "duplicate_genome_iterator: %p, (*potential_match): %p\n", (*duplicate_genome_iterator), - (*potential_match) - ); - if ((*duplicate_genome_iterator) == (*potential_match)) { - found = true; - break; - } - } - if (!found) { - Log::fatal( - "ERROR: could not find duplicate genome even though its structural hash was in the island, " - "this should never happen!\n" - ); - exit(1); - } - Log::debug( - "potential_match->get_fitness(): %lf, duplicate_genome_iterator->get_fitness(): %lf, " - "new_fitness: %lf\n", - (*potential_match)->get_fitness(), (*duplicate_genome_iterator)->get_fitness(), new_fitness - ); - int32_t duplicate_genome_index = duplicate_genome_iterator - genomes.begin(); - Log::debug("duplicate_genome_index: %d\n", duplicate_genome_index); - // int32_t test_index = contains(genome); - // Log::info("test_index: %d\n", test_index); - RNN_Genome* duplicate = genomes[duplicate_genome_index]; - // Log::info("duplicate.equals(potential_match)? 
%d\n", duplicate->equals(*potential_match)); - genomes.erase(genomes.begin() + duplicate_genome_index); - Log::debug("potential_matches.size() before erase: %d\n", potential_matches.size()); - - // erase the potential match from the structure map as well - // returns an iterator to next element after the deleted one so - // we don't need to increment it - potential_match = potential_matches.erase(potential_match); - delete duplicate; - - Log::debug("potential_matches.size() after erase: %d\n", potential_matches.size()); - Log::debug( - "structure_map[%s].size() after erase: %d\n", structural_hash.c_str(), - structure_map[structural_hash].size() - ); - if (potential_matches.size() == 0) { - Log::debug( - "deleting the potential_matches vector for hash '%s' because it was empty.\n", - structural_hash.c_str() - ); - structure_map.erase(structural_hash); - break; // break because this vector is now empty and deleted - } - } else { - Log::info( - "Island %d: island already contains a duplicate genome with a better fitness! 
not inserting.\n", - id - ); - do_population_check(__LINE__, initial_size); - return -1; - } - } else { - // increment potential match because we didn't delete an entry (or return from the method) - potential_match++; - } + auto duplicate_it = structure_set.find(genome); + + bool duplicate_exists = duplicate_it != structure_set.end(); + if (duplicate_exists) { + RNN_Genome* duplicate = *duplicate_it; + // TODO: Add annealment here + if (duplicate->get_fitness() > genome->get_fitness()) { + genomes.erase(std::find(genomes.begin(), genomes.end(), duplicate)); } } // inorder insert the new individual RNN_Genome* copy = genome->copy(); + copy->set_generation_id(genome->get_generation_id()); + vector best = copy->get_best_parameters(); if (best.size() != 0) { copy->set_weights(best); } - copy->set_generation_id(genome->get_generation_id()); - Log::debug("created copy to insert to island: %d\n", copy->get_group_id()); + auto index_iterator = upper_bound(genomes.begin(), genomes.end(), copy, sort_genomes_by_fitness()); int32_t insert_index = index_iterator - genomes.begin(); - Log::debug("inserting genome at index: %d\n", insert_index); + Log::info("inserting genome at index: %d\n", insert_index); if (insert_index >= max_size) { + // For simulated annealing: if this is true, then we should remove a random member of the population to insert. // if we're going to insert this at the back of the population // its just going to get removed anyways, so we can delete // it and report it was not inserted. 
@@ -274,24 +218,14 @@ int32_t Island::insert_genome(RNN_Genome* genome) { } genomes.insert(index_iterator, copy); - // calculate the index the genome was inseretd at from the iterator - - structural_hash = copy->get_structural_hash(); - // add the genome to the vector for this structural hash - structure_map[structural_hash].push_back(copy); - Log::debug("adding to structure_map[%s] : %p\n", structural_hash.c_str(), ©); + structure_set.insert(copy); if (insert_index == 0) { // this was a new best genome for this island - Log::info("Island %d: new best fitness found!\n", id); - if (genome->get_fitness() != EXAMM_MAX_DOUBLE) { - // need to set the weights for non-initial genomes so we - // can generate a proper graphviz file - vector best_parameters = genome->get_best_parameters(); - genome->set_weights(best_parameters); - Log::info("set genome parameters to best\n"); + if (!all_time_local_best || all_time_local_best->get_fitness() > genome->get_fitness()) { + all_time_local_best = unique_ptr(genome->copy()); } } @@ -309,51 +243,7 @@ int32_t Island::insert_genome(RNN_Genome* genome) { Log::debug("deleting worst genome\n"); RNN_Genome* worst = genomes.back(); genomes.pop_back(); - structural_hash = worst->get_structural_hash(); - - vector& potential_matches = structure_map.find(structural_hash)->second; - - bool found = false; - for (auto potential_match = potential_matches.begin(); potential_match != potential_matches.end();) { - // make sure the addresses of the pointers are the same - Log::debug( - "checking to remove worst from structure_map - &worst: %p, &(*potential_match): %p\n", worst, - (*potential_match) - ); - if ((*potential_match) == worst) { - found = true; - Log::debug("potential_matches.size() before erase: %d\n", potential_matches.size()); - - // erase the potential match from the structure map as well - potential_match = potential_matches.erase(potential_match); - - Log::debug("potential_matches.size() after erase: %d\n", potential_matches.size()); - 
Log::debug( - "structure_map[%s].size() after erase: %d\n", structural_hash.c_str(), - structure_map[structural_hash].size() - ); - - // clean up the structure_map if no genomes in the population have this hash - if (potential_matches.size() == 0) { - Log::debug( - "deleting the potential_matches vector for hash '%s' because it was empty.\n", - structural_hash.c_str() - ); - structure_map.erase(structural_hash); - break; - } - } else { - potential_match++; - } - } - - if (!found) { - Log::debug( - "could not erase from structure_map[%s], genome not found! This should never happen.\n", - structural_hash.c_str() - ); - exit(1); - } + structure_set.erase(worst); delete worst; } @@ -382,24 +272,19 @@ void Island::print(string indent) { } void Island::erase_island() { - erased_generation_id = latest_generation_id; + structure_set.clear(); + for (int32_t i = 0; i < (int32_t) genomes.size(); i++) { delete genomes[i]; } + genomes.clear(); + erased = true; erase_again = 5; - Log::debug("Worst island size after erased: %d\n", genomes.size()); - - if (genomes.size() != 0) { - Log::error("The worst island is not fully erased!\n"); - } -} + erased_generation_id = latest_generation_id; -void Island::erase_structure_map() { - Log::debug("Erasing the structure map in the worst performing island\n"); - structure_map.clear(); - Log::debug("after erase structure map size is %d\n", structure_map.size()); + Log::debug("Worst island size after erased: %d\n", genomes.size()); } int32_t Island::get_erased_generation_id() { diff --git a/examm/island.hxx b/examm/island.hxx index c75921aa..707bf9db 100644 --- a/examm/island.hxx +++ b/examm/island.hxx @@ -2,22 +2,23 @@ #define EXAMM_ISLAND_STRATEGY_HXX #include -using std::sort; -using std::upper_bound; - #include +#include using std::function; #include using std::minstd_rand0; +using std::mt19937_64; using std::uniform_real_distribution; #include using std::string; #include -using std::unordered_map; +#include +using std::unordered_set; 
+#include "annealing.hxx" #include "rnn/rnn_genome.hxx" class Island { @@ -35,30 +36,49 @@ class Island { */ vector genomes; - unordered_map> structure_map; + /** + * If we are using simulated annealing, then the genomes vector may not contain the best genome we have discovered. + * Keep an additional clone of the best genome here for logging. + **/ + unique_ptr all_time_local_best; + + /** + * A set of the genomes this island contains (one entry per genome in Island::genomes. + * These are hashed by their structure: the nodes, edges, and their innovation numbers. Weights are not considered. + **/ + unordered_set structure_set; + + mt19937_64 generator; + + AnnealingPolicy& annealing_policy; + int32_t status; /**> The status of this island (either Island:INITIALIZING, Island::FILLED or Island::REPOPULATING */ - int32_t erase_again; /**< a flag to track if this islands has been erased */ - bool erased; /**< a flag to track if this islands has been erased */ + int32_t erase_again = 0; /**< a flag to track if this islands has been erased */ + bool erased = false; /**< a flag to track if this islands has been erased */ public: const static int32_t INITIALIZING = 0; /**< status flag for if the island is initializing. */ const static int32_t FILLED = 1; /**< status flag for if the island is filled. */ const static int32_t REPOPULATING = 2; /**< status flag for if the island is repopulating. */ + Island( + int32_t id, int32_t max_size, vector genomes, int32_t status, AnnealingPolicy& annealing_policy + ); + /** * Initializes an island with a given max size. * * \param max_size is the maximum number of genomes in the island. */ - Island(int32_t id, int32_t max_size); + Island(int32_t id, int32_t max_size, AnnealingPolicy& annealing_policy); /** * Initializes an island filled the supplied genomes. The size of the island will be the size * of the supplied genome vector. The island status is set to filled. 
*/ - Island(int32_t id, vector genomes); + Island(int32_t id, vector genomes, AnnealingPolicy& annealing_policy); /** * Returns the fitness of the best genome in the island @@ -67,6 +87,11 @@ class Island { */ double get_best_fitness(); + /** + * Returns the best fitness ever obtains by any genome in this island - even if that genome has been removed. + **/ + double get_best_all_time_fitness(); + /** * Returns the fitness of the worst genome in the island * @@ -172,8 +197,6 @@ class Island { */ void erase_island(); - void erase_structure_map(); - /** * returns the get_erased_generation_id. */ diff --git a/examm/island_speciation_strategy.cxx b/examm/island_speciation_strategy.cxx index 9df6bd9c..e05b95d1 100644 --- a/examm/island_speciation_strategy.cxx +++ b/examm/island_speciation_strategy.cxx @@ -1,9 +1,12 @@ #include +#include using std::function; #include // #include +#include +using std::stringstream; #include @@ -23,10 +26,11 @@ using std::string; */ IslandSpeciationStrategy::IslandSpeciationStrategy( int32_t _number_of_islands, int32_t _max_island_size, double _mutation_rate, double _intra_island_crossover_rate, - double _inter_island_crossover_rate, RNN_Genome* _seed_genome, string _island_ranking_method, + double _inter_island_crossover_rate, string output_directory, RNN_Genome* _seed_genome, string _island_ranking_method, string _repopulation_method, int32_t _extinction_event_generation_number, int32_t _num_mutations, int32_t _islands_to_exterminate, int32_t _max_genomes, bool _repeat_extinction, bool _start_filled, - bool _transfer_learning, string _transfer_learning_version, int32_t _seed_stirs, bool _tl_epigenetic_weights + bool _transfer_learning, string _transfer_learning_version, bool _tl_epigenetic_weights, + unique_ptr& annealing_policy ) : generation_island(0), number_of_islands(_number_of_islands), @@ -34,6 +38,7 @@ IslandSpeciationStrategy::IslandSpeciationStrategy( mutation_rate(_mutation_rate), 
intra_island_crossover_rate(_intra_island_crossover_rate), inter_island_crossover_rate(_inter_island_crossover_rate), + output_directory(output_directory), generated_genomes(0), evaluated_genomes(0), seed_genome(_seed_genome), @@ -47,8 +52,8 @@ IslandSpeciationStrategy::IslandSpeciationStrategy( start_filled(_start_filled), transfer_learning(_transfer_learning), transfer_learning_version(_transfer_learning_version), - seed_stirs(_seed_stirs), - tl_epigenetic_weights(_tl_epigenetic_weights) { + tl_epigenetic_weights(_tl_epigenetic_weights), + annealing_policy(std::move(annealing_policy)) { double rate_sum = mutation_rate + intra_island_crossover_rate + inter_island_crossover_rate; if (rate_sum != 1.0) { mutation_rate = mutation_rate / rate_sum; @@ -78,15 +83,14 @@ IslandSpeciationStrategy::IslandSpeciationStrategy( if (transfer_learning) { Log::info("Transfer learning version is %s\n", transfer_learning_version.c_str()); - Log::info("Apply seed stirs: %d\n", seed_stirs); } } void IslandSpeciationStrategy::initialize_population(function& mutate) { for (int32_t i = 0; i < number_of_islands; i++) { - Island* new_island = new Island(i, max_island_size); + Island* new_island = new Island(i, max_island_size, *annealing_policy); if (start_filled) { - new_island->fill_with_mutated_genomes(seed_genome, seed_stirs, tl_epigenetic_weights, mutate); + new_island->fill_with_mutated_genomes(seed_genome, num_mutations, tl_epigenetic_weights, mutate); } islands.push_back(new_island); } @@ -100,12 +104,12 @@ int32_t IslandSpeciationStrategy::get_evaluated_genomes() const { return evaluated_genomes; } -RNN_Genome* IslandSpeciationStrategy::get_best_genome() { +RNN_Genome* IslandSpeciationStrategy::get_best_genome() const { // the global_best_genome is updated every time a genome is inserted return global_best_genome; } -RNN_Genome* IslandSpeciationStrategy::get_worst_genome() { +RNN_Genome* IslandSpeciationStrategy::get_worst_genome() const { int32_t worst_genome_island = -1; double 
worst_fitness = -EXAMM_MAX_DOUBLE; @@ -126,7 +130,7 @@ RNN_Genome* IslandSpeciationStrategy::get_worst_genome() { } } -double IslandSpeciationStrategy::get_best_fitness() { +double IslandSpeciationStrategy::get_best_fitness() const { RNN_Genome* best_genome = get_best_genome(); if (best_genome == NULL) { return EXAMM_MAX_DOUBLE; @@ -135,7 +139,7 @@ double IslandSpeciationStrategy::get_best_fitness() { } } -double IslandSpeciationStrategy::get_worst_fitness() { +double IslandSpeciationStrategy::get_worst_fitness() const { RNN_Genome* worst_genome = get_worst_genome(); if (worst_genome == NULL) { return EXAMM_MAX_DOUBLE; @@ -186,14 +190,14 @@ int32_t IslandSpeciationStrategy::insert_genome(RNN_Genome* genome) { Log::fatal("ERROR: island[%d] is null!\n", island); } int32_t insert_position = islands[island]->insert_genome(genome); - Log::info("Island %d: Insert position was: %d\n", insert_position); + Log::info("Island %d: Insert position was: %d\n", island, insert_position); if (insert_position == 0) { - if (new_global_best) { - return 0; - } else { - return 1; - } + stringstream ss; + ss << output_directory << "/island_" << island << "_best.bin"; + genome->write_to_file(ss.str()); + + return insert_position != 0; } else { return insert_position; // will be -1 if not inserted, or > 0 if not the global best } @@ -228,7 +232,6 @@ void IslandSpeciationStrategy::repopulate() { if (rank[i] >= 0) { Log::info("found island: %d is the worst island \n", rank[0]); islands[rank[i]]->erase_island(); - islands[rank[i]]->erase_structure_map(); islands[rank[i]]->set_status(Island::REPOPULATING); } else { Log::error("Didn't find the worst island!"); @@ -287,14 +290,6 @@ RNN_Genome* IslandSpeciationStrategy::generate_for_initializing_island( new_genome = seed_genome->copy(); new_genome->initialize_randomly(); - bool stir_seed_genome = false; - if (stir_seed_genome) { - Log::info("Stir the seed genome with %d mutations\n", seed_stirs); - mutate(seed_stirs, new_genome); - if 
(!tl_epigenetic_weights) { - new_genome->initialize_randomly(); - } - } } else { Log::info("Island %d: island is initializing but not empty, mutating a random genome\n", generation_island); while (new_genome == NULL) { @@ -376,11 +371,15 @@ RNN_Genome* IslandSpeciationStrategy::generate_genome( Log::info("Island %d: new genome is still null, regenerating\n", generation_island); new_genome = generate_genome(rng_0_1, generator, mutate, crossover); } + generated_genomes++; new_genome->set_generation_id(generated_genomes); islands[generation_island]->set_latest_generation_id(generated_genomes); new_genome->set_group_id(generation_island); + pair perf = {this->get_best_fitness(), this->get_worst_fitness()}; + genome_performance.emplace(new_genome->generation_id, perf); + if (current_island->is_initializing()) { RNN_Genome* genome_copy = new_genome->copy(); Log::debug("inserting genome copy!\n"); @@ -460,34 +459,38 @@ void IslandSpeciationStrategy::print(string indent) const { * Gets speciation strategy information headers for logs */ string IslandSpeciationStrategy::get_strategy_information_headers() const { + stringstream oss; + string info_header = ""; + oss << ",mse_min_pre,mse_max_pre,mse_min_post,mse_max_post"; for (int32_t i = 0; i < (int32_t) islands.size(); i++) { - info_header.append(","); - info_header.append("Island_"); - info_header.append(to_string(i)); - info_header.append("_best_fitness"); - info_header.append(","); - info_header.append("Island_"); - info_header.append(to_string(i)); - info_header.append("_worst_fitness"); - } - return info_header; + oss << ",Island_" << i << "_best_fitness" << ",Island_" << i << "_wort_fitness" << ",Island_" << i + << "_all_time_best"; + } + + return oss.str(); } /** * Gets speciation strategy information values for logs */ -string IslandSpeciationStrategy::get_strategy_information_values() const { - string info_value = ""; +string IslandSpeciationStrategy::get_strategy_information_values(RNN_Genome* genome) const { + 
stringstream oss; + auto& [min_mse_pre, max_mse_pre] = genome_performance.at(genome->generation_id); + oss << "," << min_mse_pre << "," << max_mse_pre; + + float min_mse_post = this->get_best_fitness(); + float max_mse_post = this->get_worst_fitness(); + oss << "," << min_mse_post << "," << max_mse_post; + for (int32_t i = 0; i < (int32_t) islands.size(); i++) { double best_fitness = islands[i]->get_best_fitness(); double worst_fitness = islands[i]->get_worst_fitness(); - info_value.append(","); - info_value.append(to_string(best_fitness)); - info_value.append(","); - info_value.append(to_string(worst_fitness)); + double all_time_best = islands[i]->get_best_all_time_fitness(); + oss << "," << best_fitness << "," << worst_fitness << "," << all_time_best; } - return info_value; + + return oss.str(); } RNN_Genome* IslandSpeciationStrategy::parents_repopulation( @@ -584,8 +587,8 @@ void IslandSpeciationStrategy::set_erased_islands_status() { RNN_Genome* IslandSpeciationStrategy::get_seed_genome() { return seed_genome; } -// write a save entire population function with an input saving function +// write a save entire population function with an input saving function void IslandSpeciationStrategy::save_entire_population(string output_path) { for (int32_t i = 0; i < (int32_t) islands.size(); i++) { islands[i]->save_population(output_path); diff --git a/examm/island_speciation_strategy.hxx b/examm/island_speciation_strategy.hxx index b3888621..bd32d507 100644 --- a/examm/island_speciation_strategy.hxx +++ b/examm/island_speciation_strategy.hxx @@ -37,6 +37,7 @@ class IslandSpeciationStrategy : public SpeciationStrategy { RNN_Genome* seed_genome; /**< keep a reference to the seed genome so we can re-use it across islands and not duplicate innovation numbers. 
*/ + string output_directory; string island_ranking_method; /**< The method used to find the worst island in population */ string repopulation_method; /**< The method used to repopulate the island after being erased */ @@ -62,13 +63,20 @@ class IslandSpeciationStrategy : public SpeciationStrategy { vector islands; RNN_Genome* global_best_genome; + ofstream* island_log_file; + + // Maps genome number to a pair representing (best fitness, worst fitness) over all islands at + // the time of genome generation. + unordered_map> genome_performance; + // Transfer learning class properties: bool transfer_learning; string transfer_learning_version; - int32_t seed_stirs; bool tl_epigenetic_weights; + unique_ptr annealing_policy; + public: // static void register_command_line_arguments(); // static IslandSpeciationStrategy* generate_from_command_line(); @@ -81,11 +89,11 @@ class IslandSpeciationStrategy : public SpeciationStrategy { */ IslandSpeciationStrategy( int32_t _number_of_islands, int32_t _max_island_size, double _mutation_rate, - double _intra_island_crossover_rate, double _inter_island_crossover_rate, RNN_Genome* _seed_genome, + double _intra_island_crossover_rate, double _inter_island_crossover_rate, string output_directory, RNN_Genome* _seed_genome, string _island_ranking_method, string _repopulation_method, int32_t _extinction_event_generation_number, int32_t _num_mutations, int32_t _islands_to_exterminate, int32_t _max_genomes, bool _repeat_extinction, - bool _start_filled, bool _transfer_learning, string _transfer_learning_version, int32_t _seed_stirs, - bool _tl_epigenetic_weights + bool _start_filled, bool _transfer_learning, string _transfer_learning_version, bool _tl_epigenetic_weights, + unique_ptr& annealing_policy ); // /** @@ -114,25 +122,25 @@ class IslandSpeciationStrategy : public SpeciationStrategy { * Gets the fitness of the best genome of all the islands * \return the best fitness over all islands */ - double get_best_fitness(); + double get_best_fitness()
const; /** * Gets the fitness of the worst genome of all the islands * \return the worst fitness over all islands */ - double get_worst_fitness(); + double get_worst_fitness() const; /** * Gets the best genome of all the islands * \return the best genome of all islands or NULL if no genomes have yet been inserted */ - RNN_Genome* get_best_genome(); + RNN_Genome* get_best_genome() const; /** * Gets the the worst genome of all the islands * \return the worst genome of all islands or NULL if no genomes have yet been inserted */ - RNN_Genome* get_worst_genome(); + RNN_Genome* get_worst_genome() const; /** * \return true if all the islands are full @@ -207,7 +215,7 @@ class IslandSpeciationStrategy : public SpeciationStrategy { /** * Gets speciation strategy information values for logs */ - string get_strategy_information_values() const; + string get_strategy_information_values(RNN_Genome* genome) const; /** * Island repopulation through two random parents from two seperate islands, diff --git a/examm/neat_speciation_strategy.cxx b/examm/neat_speciation_strategy.cxx index 4fdd3d94..e24f6fc1 100644 --- a/examm/neat_speciation_strategy.cxx +++ b/examm/neat_speciation_strategy.cxx @@ -74,7 +74,7 @@ int32_t NeatSpeciationStrategy::get_evaluated_genomes() const { return evaluated_genomes; } -RNN_Genome* NeatSpeciationStrategy::get_best_genome() { +RNN_Genome* NeatSpeciationStrategy::get_best_genome() const { int32_t best_genome_species = -1; double best_fitness = EXAMM_MAX_DOUBLE; @@ -95,7 +95,7 @@ RNN_Genome* NeatSpeciationStrategy::get_best_genome() { } } -RNN_Genome* NeatSpeciationStrategy::get_worst_genome() { +RNN_Genome* NeatSpeciationStrategy::get_worst_genome() const { int32_t worst_genome_species = -1; double worst_fitness = -EXAMM_MAX_DOUBLE; @@ -116,7 +116,7 @@ RNN_Genome* NeatSpeciationStrategy::get_worst_genome() { } } -double NeatSpeciationStrategy::get_best_fitness() { +double NeatSpeciationStrategy::get_best_fitness() const { RNN_Genome* best_genome = 
get_best_genome(); if (best_genome == NULL) { return EXAMM_MAX_DOUBLE; @@ -125,7 +125,7 @@ double NeatSpeciationStrategy::get_best_fitness() { } } -double NeatSpeciationStrategy::get_worst_fitness() { +double NeatSpeciationStrategy::get_worst_fitness() const { RNN_Genome* worst_genome = get_worst_genome(); if (worst_genome == NULL) { return EXAMM_MAX_DOUBLE; @@ -399,7 +399,7 @@ string NeatSpeciationStrategy::get_strategy_information_headers() const { /** * Gets speciation strategy information values for logs */ -string NeatSpeciationStrategy::get_strategy_information_values() const { +string NeatSpeciationStrategy::get_strategy_information_values(RNN_Genome* genome) const { string info_value = ""; for (int32_t i = 0; i < (int32_t) Neat_Species.size(); i++) { double best_fitness = Neat_Species[i]->get_best_fitness(); diff --git a/examm/neat_speciation_strategy.hxx b/examm/neat_speciation_strategy.hxx index 01dc38a2..1cc88ceb 100644 --- a/examm/neat_speciation_strategy.hxx +++ b/examm/neat_speciation_strategy.hxx @@ -64,25 +64,25 @@ class NeatSpeciationStrategy : public SpeciationStrategy { * Gets the fitness of the best genome of all the islands * \return the best fitness over all islands */ - double get_best_fitness(); + double get_best_fitness() const; /** * Gets the fitness of the worst genome of all the islands * \return the worst fitness over all islands */ - double get_worst_fitness(); + double get_worst_fitness() const; /** * Gets the best genome of all the islands * \return the best genome of all islands */ - RNN_Genome* get_best_genome(); + RNN_Genome* get_best_genome() const; /** * Gets the the worst genome of all the islands * \return the worst genome of all islands */ - RNN_Genome* get_worst_genome(); + RNN_Genome* get_worst_genome() const; /** * Inserts a copy of the genome into this speciation strategy. 
@@ -130,7 +130,7 @@ class NeatSpeciationStrategy : public SpeciationStrategy { /** * Gets speciation strategy information values for logs */ - string get_strategy_information_values() const; + string get_strategy_information_values(RNN_Genome* genome) const; RNN_Genome* get_global_best_genome(); diff --git a/examm/speciation_strategy.hxx b/examm/speciation_strategy.hxx index bf8a43d5..713bd216 100644 --- a/examm/speciation_strategy.hxx +++ b/examm/speciation_strategy.hxx @@ -9,6 +9,8 @@ using std::string; using std::minstd_rand0; using std::uniform_real_distribution; +#include "rnn/rnn_genome.hxx" + class SpeciationStrategy { public: /** @@ -25,25 +27,25 @@ class SpeciationStrategy { * Gets the fitness of the best genome of all the islands * \return the best fitness over all islands */ - virtual double get_best_fitness() = 0; + virtual double get_best_fitness() const = 0; /** * Gets the fitness of the worst genome of all the islands * \return the worst fitness over all islands */ - virtual double get_worst_fitness() = 0; + virtual double get_worst_fitness() const = 0; /** * Gets the best genome of all the islands * \return the best genome of all islands */ - virtual RNN_Genome* get_best_genome() = 0; + virtual RNN_Genome* get_best_genome() const = 0; /** * Gets the the worst genome of all the islands * \return the worst genome of all islands */ - virtual RNN_Genome* get_worst_genome() = 0; + virtual RNN_Genome* get_worst_genome() const = 0; /** * Inserts a copy of the genome into this speciation strategy. 
@@ -86,7 +88,7 @@ class SpeciationStrategy { /** * Gets speciation strategy information values for logs */ - virtual string get_strategy_information_values() const = 0; + virtual string get_strategy_information_values(RNN_Genome* genome) const = 0; virtual RNN_Genome* get_global_best_genome() = 0; virtual void initialize_population(function& mutate) = 0; diff --git a/examm/species.cxx b/examm/species.cxx index 1e650ec7..9081b203 100644 --- a/examm/species.cxx +++ b/examm/species.cxx @@ -130,12 +130,6 @@ int32_t Species::insert_genome(RNN_Genome* genome) { if (insert_index == 0) { // this was a new best genome for this island Log::info("new best fitness for island: %d!\n", id); - if (genome->get_fitness() != EXAMM_MAX_DOUBLE) { - // need to set the weights for non-initial genomes so we - // can generate a proper graphviz file - vector best_parameters = genome->get_best_parameters(); - genome->set_weights(best_parameters); - } species_not_improving_count = 0; } else { species_not_improving_count++; @@ -233,4 +227,4 @@ int32_t Species::get_species_not_improving_count() { void Species::set_species_not_improving_count(int32_t count) { species_not_improving_count = count; -} \ No newline at end of file +} diff --git a/ground_truth_experiments/cell_experiments.sh b/ground_truth_experiments/cell_experiments.sh new file mode 100755 index 00000000..9c0e29d4 --- /dev/null +++ b/ground_truth_experiments/cell_experiments.sh @@ -0,0 +1,40 @@ +#!/usr/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM' + +offset=1 +bp_epoch=1000 + +for SIZE in 1 2 4; do + for CELL_TYPE in dnas; do + for fold in 0 1 2 3 4 5 6 7 8 9; do + output_dir=ground_truth_experiments/results/$CELL_TYPE/$SIZE/$fold + mkdir -p $output_dir + 
Release/rnn_examples/train_rnn \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --stochastic \ + --rnn_type $CELL_TYPE \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --random_sequence_length \ + --sequence_length_lower_bound 50 \ + --sequence_length_upper_bound 100 \ + --max_recurrent_depth 1 \ + --weight_update adagrad \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level ERROR \ + --file_message_level INFO & + done + done + wait +done + diff --git a/ground_truth_experiments/source_genomes.sh b/ground_truth_experiments/source_genomes.sh new file mode 100755 index 00000000..1c251134 --- /dev/null +++ b/ground_truth_experiments/source_genomes.sh @@ -0,0 +1,33 @@ +#!/usr/bin/zsh +# This is an example of running EXAMM MPI version on c172 dataset +# +# The c172 dataset is not normalized +# To run datasets that's not normalized, make sure to add arguments: +# --normalize min_max for Min Max normalization, or +# --normalize avg_std_dev for Z-score normalization + +INPUT_PARAMETERS="AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd" +OUTPUT_PARAMETERS="Pitch" + +for i in 0 1 2 3 4 5 6 7 8 9; do + exp_name="ground_truth_experiments/results/source_genomes/$i" + mkdir -p $exp_name + echo $exp_name + mpirun -np 5 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + 
--output_parameter_names ${=OUTPUT_PARAMETERS} \ + --number_islands 8 \ + --island_size 8 \ + --max_genomes 10000 \ + --bp_iterations 5 \ + --num_mutations 2 \ + --normalize min_max \ + --output_directory $exp_name \ + --possible_node_types simple UGRNN MGU GRU delta LSTM \ + --std_message_level ERROR \ + --file_message_level INFO & +done +wait diff --git a/mpi/examm_mpi.cxx b/mpi/examm_mpi.cxx index 7886d91d..66ee6594 100644 --- a/mpi/examm_mpi.cxx +++ b/mpi/examm_mpi.cxx @@ -114,10 +114,62 @@ void receive_terminate_message(int32_t source) { MPI_Recv(terminate_message, 1, MPI_INT, source, TERMINATE_TAG, MPI_COMM_WORLD, &status); } -void master(int32_t max_rank) { - // the "main" id will have already been set by the main function so we do not need to re-set it here - Log::debug("MAX int32_t: %d\n", numeric_limits::max()); +void master_sync(int32_t max_rank) { + max_rank -= 1; + int32_t generation = 0; + while (true) { + // Wait for N work requests + int32_t nreqs = 0; + while (nreqs < max_rank) { + MPI_Status status; + MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + + int32_t source = status.MPI_SOURCE; + int32_t tag = status.MPI_TAG; + // Log::info("probe returned message from: %d with tag: %d\n", source, tag); + + if (tag == WORK_REQUEST_TAG) { + receive_work_request(source); + nreqs++; + } else if (tag == GENOME_LENGTH_TAG) { + Log::debug("received genome from: %d\n", source); + RNN_Genome* genome = receive_genome_from(source); + + examm->insert_genome(genome); + + // delete the genome as it won't be used again, a copy was inserted + delete genome; + } else { + Log::fatal("ERROR: received message from %d with unknown tag: %d", source, tag); + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + vector genomes(max_rank); + for (int32_t i = 1; i <= max_rank; i++) { + RNN_Genome* genome = examm->generate_genome(); + if (genome == NULL) { + break; + } + genomes[i - 1] = genome; + } + if (genomes.size() != max_rank) { + break; + } + + for (int i = 1; i <= 
max_rank; i++) { + send_genome_to(i, genomes[i - 1]); + delete genomes[i - 1]; + } + } + + for (int i = 1; i <= max_rank; i++) { + send_terminate_message(i); + } +} + +void master(int32_t max_rank) { int32_t terminates_sent = 0; while (true) { @@ -134,12 +186,7 @@ void master(int32_t max_rank) { if (tag == WORK_REQUEST_TAG) { receive_work_request(source); - // if (transfer_learning_version.compare("v3") == 0 || transfer_learning_version.compare("v1+v3") == 0) { - // seed_stirs = 3; - // } - examm_mutex.lock(); RNN_Genome* genome = examm->generate_genome(); - examm_mutex.unlock(); if (genome == NULL) { // search was completed if it returns NULL for an individual // send terminate message @@ -167,9 +214,7 @@ void master(int32_t max_rank) { Log::debug("received genome from: %d\n", source); RNN_Genome* genome = receive_genome_from(source); - examm_mutex.lock(); examm->insert_genome(genome); - examm_mutex.unlock(); // delete the genome as it won't be used again, a copy was inserted delete genome; @@ -207,9 +252,14 @@ void worker(int32_t rank) { // have each worker write the backproagation to a separate log file string log_id = "genome_" + to_string(genome->get_generation_id()) + "_worker_" + to_string(rank); Log::set_id(log_id); + + vector params; + genome->get_weights(params); + genome->backpropagate_stochastic( training_inputs, training_outputs, validation_inputs, validation_outputs, weight_update_method ); + Log::release_id(log_id); // go back to the worker's log for MPI communication @@ -259,12 +309,20 @@ int main(int argc, char** argv) { RNN_Genome* seed_genome = get_seed_genome(arguments, time_series_sets, weight_rules); + bool synchronous = argument_exists(arguments, "--synchronous"); + Log::warning("synchronous? 
%d\n", synchronous); + Log::clear_rank_restriction(); if (rank == 0) { write_time_series_to_file(arguments, time_series_sets); examm = generate_examm_from_arguments(arguments, time_series_sets, weight_rules, seed_genome); - master(max_rank); + + if (synchronous) { + master_sync(max_rank); + } else { + master(max_rank); + } } else { worker(rank); } diff --git a/rnn/dnas_node.cxx b/rnn/dnas_node.cxx index e51fa179..a54d2f28 100644 --- a/rnn/dnas_node.cxx +++ b/rnn/dnas_node.cxx @@ -1,6 +1,7 @@ #include using std::sort; +#include #include using std::pair; @@ -13,13 +14,16 @@ using std::max; #include "common/log.hxx" #include "dnas_node.hxx" +int32_t DNASNode::CRYSTALLIZATION_THRESHOLD = 1000; +int32_t DNASNode::k = -1; + DNASNode::DNASNode( vector&& _nodes, int32_t _innovation_number, int32_t _type, double _depth, int32_t counter ) : RNN_Node_Interface(_innovation_number, _type, _depth), nodes(_nodes), pi(vector(nodes.size(), 1.0)), - z(vector(nodes.size())), + z(vector(nodes.size(), 0.0)), x(vector(nodes.size())), g(vector(nodes.size())), d_pi(vector(nodes.size())), @@ -49,7 +53,6 @@ DNASNode::DNASNode(const DNASNode& src) : RNN_Node_Interface(src.innovation_numb g = src.g; x = src.x; xtotal = src.xtotal; - tao = src.tao; stochastic = src.stochastic; counter = src.counter; maxi = src.maxi; @@ -88,12 +91,19 @@ void DNASNode::sample_gumbel_softmax(Rng& rng) { x.assign(pi.size(), 0.0); gumbel_noise(rng, g); - calculate_z(); } +double DNASNode::calculate_pi_lr() { + return 0.1; +} + +double DNASNode::calculate_tao() { + return 6.0; +} + void DNASNode::calculate_z() { - tao = max(1.0 / 3.0, 1.0 / (1.0 + (double) counter * 0.05)); + tao = calculate_tao(); xtotal = 0.0; double emax = -10000000; @@ -125,29 +135,50 @@ void DNASNode::calculate_z() { ); double total = 0.0; - for (int32_t i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { total += ps_with_indices[i].second; } - for (int32_t i = 0; i < (int32_t) z.size(); i++) { + for (int i = 0; i < z.size(); i++) { z[i] = 
0.0; } - for (int32_t i = 0; i < k; i++) { + for (int i = 0; i < k; i++) { z[ps_with_indices[i].first] = ps_with_indices[i].second / total; } } } +void DNASNode::print_info() { + printf(" "); + int best_pi_idx = 0; + for (int i = 0; i < nodes.size(); i++) { + printf("%-10s & ", std::to_string(pi[i]).c_str()); + if (pi[i] > pi[best_pi_idx]) { + best_pi_idx = i; + } + } + printf("\n"); + Log::info("Node types: "); + for (auto node : nodes) { + Log::info_no_header("%d ", node->node_type); + } + Log::info_no_header("\n "); + Log::info("Best node: %i, node type: %d\n", best_pi_idx, nodes[best_pi_idx]->node_type); +} + void DNASNode::reset(int32_t series_length) { - d_pi = vector(pi.size(), 0.0); - d_input = vector(series_length, 0.0); - node_outputs = vector>(series_length, vector(pi.size(), 0.0)); - output_values = vector(series_length, 0.0); - error_values = vector(series_length, 0.0); - inputs_fired = vector(series_length, 0); - outputs_fired = vector(series_length, 0); - input_values = vector(series_length, 0.0); + d_pi.assign(pi.size(), 0.0); + d_input.assign(series_length, 0.0); + node_outputs.clear(); + for (int i = 0; i < series_length; i++) { + node_outputs.emplace_back(pi.size(), 0.0); + } + output_values.assign(series_length, 0.0); + error_values.assign(series_length, 0.0); + inputs_fired.assign(series_length, 0); + outputs_fired.assign(series_length, 0); + input_values.assign(series_length, 0.0); if (counter >= CRYSTALLIZATION_THRESHOLD) { nodes[maxi]->reset(series_length); @@ -178,8 +209,10 @@ void DNASNode::input_fired(int32_t time, double incoming_output) { } if (counter >= CRYSTALLIZATION_THRESHOLD) { + Log::info("%d hmm\n", maxi >= 0); assert(maxi >= 0); + Log::info("%d %d %p\n", maxi, time, nodes[maxi]); nodes[maxi]->input_fired(time, input_values[time]); node_outputs[time][maxi] = nodes[maxi]->output_values[time]; output_values[time] = nodes[maxi]->output_values[time]; @@ -286,6 +319,7 @@ void DNASNode::set_weights(const vector& parameters) { } void 
DNASNode::get_weights(int32_t& offset, vector& parameters) const { + // int start = offset; // Log::info("pi start %d; ", offset); for (int32_t i = 0; i < (int32_t) pi.size(); i++) { parameters[offset++] = pi[i]; @@ -300,17 +334,15 @@ void DNASNode::set_weights(int32_t& offset, const vector& parameters) { // int start = offset; for (int32_t i = 0; i < (int32_t) pi.size(); i++) { pi[i] = parameters[offset++]; + if (pi[i] < 0.01) { + pi[i] = 0.01; + } } - // Log::info("Pi indices: %d-%d\n", start, offset); + for (auto node : nodes) { node->set_weights(offset, parameters); } calculate_z(); - // string s = "Pi = { "; - // for (auto p : pi) { - // s += std::to_string(p) + ", "; - // } - // Log::info("%s }\n", s.c_str()); } void DNASNode::set_pi(const vector& new_pi) { @@ -360,6 +392,7 @@ void DNASNode::get_gradients(vector& gradients) { } else { gradients.assign(get_number_weights(), 0.0); int offset = 0; + for (int32_t i = 0; i < (int32_t) pi.size(); i++) { gradients[offset++] = d_pi[i] * 0.1; } diff --git a/rnn/dnas_node.hxx b/rnn/dnas_node.hxx index bd5a6b2d..bdafc88f 100644 --- a/rnn/dnas_node.hxx +++ b/rnn/dnas_node.hxx @@ -22,8 +22,6 @@ using std::unique_ptr; #include "rnn_node.hxx" #include "rnn_node_interface.hxx" -#define CRYSTALLIZATION_THRESHOLD 1000 - class DNASNode : public RNN_Node_Interface { private: template @@ -58,13 +56,9 @@ class DNASNode : public RNN_Node_Interface { // A vector to put gumbel noise into; just to avoid re-allocation vector noise; - // Temperature used when drawing samples from Gumbel-Softmax(pi) - double tao = 1.0; int32_t counter = 0; int32_t maxi = -1; - - // if > 0, then the samples will be forced to be K-hot (K non-zero values that sum to one) - int32_t k = 1; + double tao; // Whether to re-sample the gumbel softmax distribution when resetting the node. 
// Can be set externally using DNASNode::set_stochastic @@ -73,6 +67,9 @@ class DNASNode : public RNN_Node_Interface { vector> node_outputs; public: + static int32_t CRYSTALLIZATION_THRESHOLD; + static int32_t k; + DNASNode( vector&& nodes, int32_t _innovation_number, int32_t _type, double _depth, int32_t counter = -1 @@ -83,6 +80,8 @@ class DNASNode : public RNN_Node_Interface { template void sample_gumbel_softmax(Rng& rng); void calculate_z(); + double calculate_tao(); + double calculate_pi_lr(); virtual void initialize_lamarckian( minstd_rand0& generator, NormalDistribution& normal_distribution, double mu, double sigma @@ -110,6 +109,8 @@ class DNASNode : public RNN_Node_Interface { virtual void reset(int32_t _series_length); virtual void write_to_stream(ostream& out); + void print_info(); + virtual RNN_Node_Interface* copy() const; void set_stochastic(bool stochastic); diff --git a/rnn/generate_nn.cxx b/rnn/generate_nn.cxx index 91231a01..ba656a3e 100644 --- a/rnn/generate_nn.cxx +++ b/rnn/generate_nn.cxx @@ -9,11 +9,10 @@ using std::string; using std::vector; /* - * node_kind is the type of memory cell (e.g. LSTM, UGRNN) * innovation_counter - reference to an integer used to keep track if innovation numbers. it will be incremented once. 
*/ -RNN_Node_Interface* create_hidden_node(int32_t node_kind, int32_t& innovation_counter, double depth) { - switch (node_kind) { +RNN_Node_Interface* create_hidden_node(node_t node_type, int32_t& innovation_counter, double depth) { + switch (node_type) { case SIMPLE_NODE: return new RNN_Node(++innovation_counter, HIDDEN_LAYER, depth, SIMPLE_NODE); case JORDAN_NODE: @@ -36,9 +35,6 @@ RNN_Node_Interface* create_hidden_node(int32_t node_kind, int32_t& innovation_co return new ENAS_DAG_Node(++innovation_counter, HIDDEN_LAYER, depth); case RANDOM_DAG_NODE: return new RANDOM_DAG_Node(++innovation_counter, HIDDEN_LAYER, depth); - case DNAS_NODE: - Log::fatal("You shouldn't be creating DNAS nodes using generate_nn::create_hidden_node.\n"); - exit(1); case SIN_NODE: return new SIN_Node(++innovation_counter, HIDDEN_LAYER, depth); case SUM_NODE: @@ -55,7 +51,7 @@ RNN_Node_Interface* create_hidden_node(int32_t node_kind, int32_t& innovation_co return new MULTIPLY_Node(++innovation_counter, HIDDEN_LAYER, depth); default: Log::fatal( - "If you are seeing this, an invalid node_kind was used to create a node (node_kind = %d\n", node_kind + "If you are seeing this, an invalid node_type was used to create a node (node_type = %d\n", node_type ); exit(1); } @@ -64,7 +60,7 @@ RNN_Node_Interface* create_hidden_node(int32_t node_kind, int32_t& innovation_co return nullptr; } -DNASNode* create_dnas_node(int32_t& innovation_counter, double depth, const vector& node_types) { +DNASNode* create_dnas_node(int32_t& innovation_counter, double depth, const vector& node_types) { vector nodes(node_types.size()); if (node_types.size() == 0) { @@ -150,7 +146,7 @@ RNN_Genome* create_nn( RNN_Genome* create_dnas_nn( const vector& input_parameter_names, int32_t number_hidden_layers, int32_t number_hidden_nodes, - const vector& output_parameter_names, int32_t max_recurrent_depth, vector& node_types, + const vector& output_parameter_names, int32_t max_recurrent_depth, vector& node_types, WeightRules* 
weight_rules ) { auto f = [&](int32_t& innovation_counter, double depth) -> RNN_Node_Interface* { @@ -199,13 +195,21 @@ RNN_Genome* get_seed_genome( ); Log::info("Finished transfering seed genome\n"); } else { - if (seed_genome == NULL) { + bool use_dnas_seed = argument_exists(arguments, "--use_dnas_seed"); + + if (!use_dnas_seed) { seed_genome = create_ff( time_series_sets->get_input_parameter_names(), 0, 0, time_series_sets->get_output_parameter_names(), 0, weight_rules ); seed_genome->initialize_randomly(); Log::info("Generated seed genome, seed genome is minimal\n"); + } else { + vector node_types = {SIMPLE_NODE, UGRNN_NODE, MGU_NODE, GRU_NODE, DELTA_NODE, LSTM_NODE}; + seed_genome = create_dnas_nn( + time_series_sets->get_input_parameter_names(), 0, 0, time_series_sets->get_output_parameter_names(), 0, + node_types, weight_rules + ); } } diff --git a/rnn/generate_nn.hxx b/rnn/generate_nn.hxx index 3614497b..0ebd0d2a 100644 --- a/rnn/generate_nn.hxx +++ b/rnn/generate_nn.hxx @@ -5,6 +5,9 @@ #include using std::string; +#include +using std::unordered_map; + #include using std::vector; @@ -36,7 +39,8 @@ template NodeT* create_hidden_memory_cell(int32_t& innovation_counter, double depth) { return new NodeT(++innovation_counter, HIDDEN_LAYER, depth); } -RNN_Node_Interface* create_hidden_node(int32_t node_kind, int32_t& innovation_counter, double depth); + +RNN_Node_Interface* create_hidden_node(node_t node_type, int32_t& innovation_counter, double depth); RNN_Genome* create_nn( const vector& input_parameter_names, int32_t number_hidden_layers, int32_t number_hidden_nodes, @@ -44,7 +48,7 @@ RNN_Genome* create_nn( std::function make_node, WeightRules* weight_rules ); -template +template RNN_Genome* create_simple_nn( const vector& input_parameter_names, int32_t number_hidden_layers, int32_t number_hidden_nodes, const vector& output_parameter_names, int32_t max_recurrent_depth, WeightRules* weight_rules @@ -95,11 +99,11 @@ RNN_Genome* create_memory_cell_nn( #define 
create_inverse(...) create_memory_cell_nn(__VA_ARGS__) #define create_multiply(...) create_memory_cell_nn(__VA_ARGS__) -DNASNode* create_dnas_node(int32_t& innovation_counter, double depth, const vector& node_types); +DNASNode* create_dnas_node(int32_t& innovation_counter, double depth, const vector& node_types); RNN_Genome* create_dnas_nn( const vector& input_parameter_names, int32_t number_hidden_layers, int32_t number_hidden_nodes, - const vector& output_parameter_names, int32_t max_recurrent_depth, vector& node_types, + const vector& output_parameter_names, int32_t max_recurrent_depth, vector& node_types, WeightRules* weight_rules ); diff --git a/rnn/genome_property.cxx b/rnn/genome_property.cxx index 6bf061b9..b4766c48 100644 --- a/rnn/genome_property.cxx +++ b/rnn/genome_property.cxx @@ -10,6 +10,22 @@ GenomeProperty::GenomeProperty() { max_recurrent_depth = 10; } +int32_t GenomeProperty::compute_bp_iterations(RNN_Genome* genome) { + if (use_burn_in_bp_epoch) { + int32_t n = genome->generation_id / burn_in_period; + n = n > max_burn_in_cycles ? 
max_burn_in_cycles : n; + + float epochs = bp_epochs_start; + for (int i = 0; i < n; i++) { + epochs *= burn_in_ratio; + } + + return (int32_t) epochs; + } else { + return bp_iterations; + } +} + void GenomeProperty::generate_genome_property_from_arguments(const vector& arguments) { get_argument(arguments, "--bp_iterations", true, bp_iterations); use_dropout = get_argument(arguments, "--dropout_probability", false, dropout_probability); @@ -17,6 +33,12 @@ void GenomeProperty::generate_genome_property_from_arguments(const vectorset_bp_iterations(bp_iterations); + genome->set_bp_iterations(compute_bp_iterations(genome)); + if (use_dropout) { genome->enable_dropout(dropout_probability); } + genome->normalize_type = normalize_type; genome->set_parameter_names(input_parameter_names, output_parameter_names); genome->set_normalize_bounds(normalize_type, normalize_mins, normalize_maxs, normalize_avgs, normalize_std_devs); @@ -48,4 +72,4 @@ void GenomeProperty::get_time_series_parameters(TimeSeriesSets* time_series_sets uniform_int_distribution GenomeProperty::get_recurrent_depth_dist() { return uniform_int_distribution(this->min_recurrent_depth, this->max_recurrent_depth); -} \ No newline at end of file +} diff --git a/rnn/genome_property.hxx b/rnn/genome_property.hxx index 7d220ff6..b863fd47 100644 --- a/rnn/genome_property.hxx +++ b/rnn/genome_property.hxx @@ -18,6 +18,12 @@ class GenomeProperty { int32_t min_recurrent_depth; int32_t max_recurrent_depth; + bool use_burn_in_bp_epoch; + int32_t burn_in_period = 2048; + int32_t max_burn_in_cycles = 4; + double bp_epochs_start = 0.5; + double burn_in_ratio = 2.0; + // TimeSeriesSets *time_series_sets; int32_t number_inputs; int32_t number_outputs; @@ -30,12 +36,16 @@ class GenomeProperty { map normalize_avgs; map normalize_std_devs; + int32_t compute_bp_iterations(RNN_Genome* genome); + public: GenomeProperty(); + void generate_genome_property_from_arguments(const vector& arguments); void set_genome_properties(RNN_Genome* 
genome); void get_time_series_parameters(TimeSeriesSets* time_series_sets); + uniform_int_distribution get_recurrent_depth_dist(); }; -#endif \ No newline at end of file +#endif diff --git a/rnn/inverse_node.cxx b/rnn/inverse_node.cxx index 202dac29..fff9bcd4 100644 --- a/rnn/inverse_node.cxx +++ b/rnn/inverse_node.cxx @@ -19,7 +19,7 @@ double INVERSE_Node::activation_function(double input) { double INVERSE_Node::derivative_function(double input) { double gradient = -1.0 / ((input) * (input)); - if (isnan(gradient) || isinf(gradient)) { + if (std::isnan(gradient) || std::isinf(gradient)) { gradient = -1000.0; } return gradient; diff --git a/rnn/rnn_edge.cxx b/rnn/rnn_edge.cxx index 51b53254..ca63dc65 100644 --- a/rnn/rnn_edge.cxx +++ b/rnn/rnn_edge.cxx @@ -93,7 +93,8 @@ RNN_Edge* RNN_Edge::copy(const vector new_nodes) { } void RNN_Edge::propagate_forward(int32_t time) { - if (input_node->inputs_fired[time] != input_node->total_inputs) { + if (input_node->inputs_fired[time] != input_node->total_inputs || time < 0 + || time >= input_node->output_values.size()) { Log::fatal( "ERROR! 
propagate forward called on edge %d where input_node->inputs_fired[%d] (%d) != total_inputs (%d)\n", innovation_number, time, input_node->inputs_fired[time], input_node->total_inputs @@ -105,7 +106,6 @@ void RNN_Edge::propagate_forward(int32_t time) { exit(1); } - // Log::debug("input_node %p %d\n", input_node, input_node->output_values.size()); double output = input_node->output_values[time] * weight; // Log::debug("propagating forward at time %d from %d to %d, value: %lf, input: %lf, weight: %lf\n", time, diff --git a/rnn/rnn_genome.cxx b/rnn/rnn_genome.cxx index 0d9006f9..47bd934f 100644 --- a/rnn/rnn_genome.cxx +++ b/rnn/rnn_genome.cxx @@ -44,6 +44,9 @@ using std::to_string; #include using std::vector; +#include +using std::move; + #include using std::unordered_map; @@ -68,7 +71,7 @@ using std::map; #include "time_series/time_series.hxx" #include "ugrnn_node.hxx" -vector dnas_node_types = {SIMPLE_NODE, UGRNN_NODE, MGU_NODE, GRU_NODE, DELTA_NODE, LSTM_NODE}; +vector dnas_node_types = {SIMPLE_NODE, UGRNN_NODE, MGU_NODE, GRU_NODE, DELTA_NODE, LSTM_NODE}; string parse_fitness(double fitness) { if (fitness == EXAMM_MAX_DOUBLE) { @@ -257,8 +260,8 @@ string RNN_Genome::print_statistics() { << get_node_count_str(MGU_NODE) << setw(12) << get_node_count_str(GRU_NODE) << setw(12) << get_node_count_str(DELTA_NODE) << setw(12) << get_node_count_str(LSTM_NODE) << setw(12) << get_node_count_str(ENARC_NODE) << setw(12) << get_node_count_str(ENAS_DAG_NODE) << setw(12) - << get_node_count_str(RANDOM_DAG_NODE) << setw(12) << get_node_count_str(-1) //-1 does all nodes - << generated_by_string(); + << get_node_count_str(RANDOM_DAG_NODE) << setw(12) << get_enabled_node_count() << " (" << get_node_count() + << ")" << generated_by_string(); return oss.str(); } @@ -290,7 +293,7 @@ string RNN_Genome::get_edge_count_str(bool recurrent) { return oss.str(); } -string RNN_Genome::get_node_count_str(int32_t node_type) { +string RNN_Genome::get_node_count_str(node_t node_type) { ostringstream 
oss; if (node_type < 0) { oss << get_enabled_node_count() << " (" << get_node_count() << ")"; @@ -317,7 +320,7 @@ int32_t RNN_Genome::get_enabled_node_count() { return count; } -int32_t RNN_Genome::get_enabled_node_count(int32_t node_type) { +int32_t RNN_Genome::get_enabled_node_count(node_t node_type) { int32_t count = 0; for (int32_t i = 0; i < (int32_t) nodes.size(); i++) { @@ -333,7 +336,7 @@ int32_t RNN_Genome::get_node_count() { return (int32_t) nodes.size(); } -int32_t RNN_Genome::get_node_count(int32_t node_type) { +int32_t RNN_Genome::get_node_count(node_t node_type) { int32_t count = 0; for (int32_t i = 0; i < (int32_t) nodes.size(); i++) { @@ -1334,7 +1337,7 @@ bool RNN_Genome::has_node_with_innovation(int32_t innovation_number) const { return false; } -bool RNN_Genome::equals(RNN_Genome* other) { +bool RNN_Genome::equals(const RNN_Genome* other) const { if (nodes.size() != other->nodes.size()) { return false; } @@ -1366,6 +1369,19 @@ bool RNN_Genome::equals(RNN_Genome* other) { return true; } +bool RNN_Genome::operator==(const RNN_Genome& other) const { + return other.equals(this); +} + +size_t RNN_Genome::StructuralHash::operator()(const RNN_Genome* genome) const { + return this->operator()(*genome); +} + +size_t RNN_Genome::StructuralHash::operator()(const RNN_Genome& genome) const { + std::hash hasher; + return hasher(genome.get_structural_hash()); +} + void RNN_Genome::assign_reachability() { Log::trace("assigning reachability!\n"); Log::trace("%6d nodes, %6d edges, %6d recurrent edges\n", nodes.size(), edges.size(), recurrent_edges.size()); @@ -1654,7 +1670,7 @@ void RNN_Genome::get_mu_sigma(const vector& p, double& mu, double& sigma } RNN_Node_Interface* RNN_Genome::create_node( - double mu, double sigma, int32_t node_type, int32_t& node_innovation_count, double depth + double mu, double sigma, node_t node_type, int32_t& node_innovation_count, double depth ) { RNN_Node_Interface* n = NULL; WeightType mutated_component_weight = 
weight_rules->get_mutated_components_weight_method(); @@ -2004,7 +2020,7 @@ bool RNN_Genome::enable_edge() { } bool RNN_Genome::split_edge( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ) { Log::trace("\tattempting to split an edge!\n"); @@ -2382,7 +2398,7 @@ bool RNN_Genome::connect_node_to_hid_nodes( /* ################# ################# ################# */ bool RNN_Genome::add_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ) { Log::trace("\tattempting to add a node!\n"); @@ -2537,7 +2553,7 @@ bool RNN_Genome::disable_node() { } bool RNN_Genome::split_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ) { Log::trace("\tattempting to split a node!\n"); @@ -2759,7 +2775,7 @@ bool RNN_Genome::split_node( } bool RNN_Genome::merge_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ) { Log::trace("\tattempting to merge a node!\n"); @@ -3187,16 +3203,18 @@ void RNN_Genome::read_from_array(char* array, int32_t length) { } RNN_Node_Interface* RNN_Genome::read_node_from_stream(istream& bin_istream) { - int32_t innovation_number, layer_type, node_type; + int32_t innovation_number, layer_type, inode_type; double depth; bool enabled; bin_istream.read((char*) 
&innovation_number, sizeof(int32_t)); bin_istream.read((char*) &layer_type, sizeof(int32_t)); - bin_istream.read((char*) &node_type, sizeof(int32_t)); + bin_istream.read((char*) &inode_type, sizeof(int32_t)); bin_istream.read((char*) &depth, sizeof(double)); bin_istream.read((char*) &enabled, sizeof(bool)); + node_t node_type = (node_t) inode_type; + string parameter_name; read_binary_string(bin_istream, parameter_name, "parameter_name"); Log::debug( @@ -3204,77 +3222,81 @@ RNN_Node_Interface* RNN_Genome::read_node_from_stream(istream& bin_istream) { ); RNN_Node_Interface* node = nullptr; - if (node_type == LSTM_NODE) { - node = new LSTM_Node(innovation_number, layer_type, depth); - } else if (node_type == DELTA_NODE) { - node = new Delta_Node(innovation_number, layer_type, depth); - } else if (node_type == GRU_NODE) { - node = new GRU_Node(innovation_number, layer_type, depth); - } else if (node_type == ENARC_NODE) { - node = new ENARC_Node(innovation_number, layer_type, depth); - } else if (node_type == ENAS_DAG_NODE) { - node = new ENAS_DAG_Node(innovation_number, layer_type, depth); - } else if (node_type == RANDOM_DAG_NODE) { - node = new RANDOM_DAG_Node(innovation_number, layer_type, depth); - } else if (node_type == MGU_NODE) { - node = new MGU_Node(innovation_number, layer_type, depth); - } else if (node_type == UGRNN_NODE) { - node = new UGRNN_Node(innovation_number, layer_type, depth); - } else if (node_type == SIMPLE_NODE || node_type == JORDAN_NODE || node_type == ELMAN_NODE) { - if (layer_type == HIDDEN_LAYER) { - node = new RNN_Node(innovation_number, layer_type, depth, node_type); - } else { - node = new RNN_Node(innovation_number, layer_type, depth, node_type, parameter_name); - } - } else if (node_type == DNAS_NODE) { - int32_t n_nodes; - bin_istream.read((char*) &n_nodes, sizeof(int32_t)); - - int32_t counter; - bin_istream.read((char*) &counter, sizeof(int32_t)); - vector pi(n_nodes, 0.0); - bin_istream.read((char*) &pi[0], sizeof(double) * 
n_nodes); - - vector nodes(n_nodes, nullptr); - for (int i = 0; i < n_nodes; i++) { - nodes[i] = RNN_Genome::read_node_from_stream(bin_istream); - } - - DNASNode* dnas_node = new DNASNode(move(nodes), innovation_number, layer_type, depth, counter); - dnas_node->set_pi(pi); - node = (RNN_Node_Interface*) dnas_node; - } else if (node_type == SIN_NODE) { - node = new SIN_Node(innovation_number, layer_type, depth); - } else if (node_type == SUM_NODE) { - node = new SUM_Node(innovation_number, layer_type, depth); - } else if (node_type == COS_NODE) { - node = new COS_Node(innovation_number, layer_type, depth); - } else if (node_type == TANH_NODE) { - node = new TANH_Node(innovation_number, layer_type, depth); - } else if (node_type == SIGMOID_NODE) { - node = new SIGMOID_Node(innovation_number, layer_type, depth); - } else if (node_type == INVERSE_NODE) { - node = new INVERSE_Node(innovation_number, layer_type, depth); - } else if (node_type == MULTIPLY_NODE) { - node = new MULTIPLY_Node(innovation_number, layer_type, depth); - } else { - Log::fatal("Error reading node from stream, unknown node_type: %d\n", node_type); - exit(1); + switch (node_type) { + case SIMPLE_NODE: + case JORDAN_NODE: + case ELMAN_NODE: + if (layer_type == HIDDEN_LAYER) { + node = new RNN_Node(innovation_number, layer_type, depth, node_type); + } else { + node = new RNN_Node(innovation_number, layer_type, depth, node_type, parameter_name); + } + break; + + case DNAS_NODE: { + int32_t n_nodes; + bin_istream.read((char*) &n_nodes, sizeof(int32_t)); + + int32_t counter; + bin_istream.read((char*) &counter, sizeof(int32_t)); + vector pi(n_nodes, 0.0); + bin_istream.read((char*) &pi[0], sizeof(double) * n_nodes); + + vector nodes(n_nodes, nullptr); + for (int i = 0; i < n_nodes; i++) { + nodes[i] = RNN_Genome::read_node_from_stream(bin_istream); + } + + DNASNode* dnas_node = new DNASNode(std::move(nodes), innovation_number, layer_type, depth, counter); + dnas_node->set_pi(pi); + node = 
(RNN_Node_Interface*) dnas_node; + break; + } + + default: + int32_t dummy_counter = 0; + node = create_hidden_node(node_type, dummy_counter, depth); + node->innovation_number = innovation_number; } node->enabled = enabled; return node; } + +#define MAGIC 0xFA + +#define read_magic(place) \ + { \ + uint8_t boo = MAGIC; \ + bin_istream.read((char*) &boo, sizeof(uint8_t)); \ + if (boo != MAGIC) { \ + Log::error("ERROR IN SERIALIZING - FAILED TO READ MAGIC at %d; %x != %x\n", place, boo, MAGIC); \ + exit(-1); \ + } \ + } + +#define write_magic() \ + { \ + uint8_t xxmagic = MAGIC; \ + bin_ostream.write((char*) &xxmagic, sizeof(uint8_t)); \ + } + void RNN_Genome::read_from_stream(istream& bin_istream) { Log::debug("READING GENOME FROM STREAM\n"); + read_magic(__LINE__); + bin_istream.read((char*) &generation_id, sizeof(int32_t)); bin_istream.read((char*) &group_id, sizeof(int32_t)); bin_istream.read((char*) &bp_iterations, sizeof(int32_t)); + read_magic(__LINE__); + bin_istream.read((char*) &use_dropout, sizeof(bool)); bin_istream.read((char*) &dropout_probability, sizeof(double)); + read_magic(__LINE__); + WeightType weight_initialize = WeightType::NONE; WeightType weight_inheritance = WeightType::NONE; WeightType mutated_component_weight = WeightType::NONE; @@ -3283,6 +3305,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { bin_istream.read((char*) &weight_inheritance, sizeof(int32_t)); bin_istream.read((char*) &mutated_component_weight, sizeof(int32_t)); + read_magic(__LINE__); + weight_rules = new WeightRules(); weight_rules->set_weight_initialize_method(weight_initialize); weight_rules->set_weight_inheritance_method(weight_inheritance); @@ -3304,8 +3328,10 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { istringstream generator_iss(generator_str); generator_iss >> generator; - string rng_0_1_str; - read_binary_string(bin_istream, rng_0_1_str, "rng_0_1"); + read_magic(__LINE__); + + // string rng_0_1_str; + // read_binary_string(bin_istream, 
rng_0_1_str, "rng_0_1"); // So for some reason this was serialized incorrectly for some genomes, // but the value should always be the same so we really don't need to de-serialize it anways and can just // assign it a constant value @@ -3319,6 +3345,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { istringstream generated_by_map_iss(generated_by_map_str); read_map(generated_by_map_iss, generated_by_map); + read_magic(__LINE__); + bin_istream.read((char*) &best_validation_mse, sizeof(double)); bin_istream.read((char*) &best_validation_mae, sizeof(double)); @@ -3330,6 +3358,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { initial_parameters.assign(initial_parameters_v, initial_parameters_v + n_initial_parameters); delete[] initial_parameters_v; + read_magic(__LINE__); + int32_t n_best_parameters; bin_istream.read((char*) &n_best_parameters, sizeof(int32_t)); Log::debug("reading %d best parameters.\n", n_best_parameters); @@ -3338,6 +3368,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { best_parameters.assign(best_parameters_v, best_parameters_v + n_best_parameters); delete[] best_parameters_v; + read_magic(__LINE__); + input_parameter_names.clear(); int32_t n_input_parameter_names; bin_istream.read((char*) &n_input_parameter_names, sizeof(int32_t)); @@ -3348,6 +3380,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { input_parameter_names.push_back(input_parameter_name); } + read_magic(__LINE__); + output_parameter_names.clear(); int32_t n_output_parameter_names; bin_istream.read((char*) &n_output_parameter_names, sizeof(int32_t)); @@ -3358,6 +3392,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { output_parameter_names.push_back(output_parameter_name); } + read_magic(__LINE__); + int32_t n_nodes; bin_istream.read((char*) &n_nodes, sizeof(int32_t)); Log::debug("reading %d nodes.\n", n_nodes); @@ -3365,6 +3401,7 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { nodes.clear(); for (int32_t i = 
0; i < n_nodes; i++) { nodes.push_back(RNN_Genome::read_node_from_stream(bin_istream)); + read_magic(__LINE__); } int32_t n_edges; @@ -3391,6 +3428,7 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { // innovation_list.push_back(innovation_number); edge->enabled = enabled; edges.push_back(edge); + read_magic(__LINE__); } int32_t n_recurrent_edges; @@ -3422,6 +3460,7 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { // innovation_list.push_back(innovation_number); recurrent_edge->enabled = enabled; recurrent_edges.push_back(recurrent_edge); + read_magic(__LINE__); } read_binary_string(bin_istream, normalize_type, "normalize_type"); @@ -3446,6 +3485,8 @@ void RNN_Genome::read_from_stream(istream& bin_istream) { istringstream normalize_std_devs_iss(normalize_std_devs_str); read_map(normalize_std_devs_iss, normalize_std_devs); + read_magic(__LINE__); + assign_reachability(); } @@ -3469,13 +3510,20 @@ void RNN_Genome::write_to_file(string bin_filename) { void RNN_Genome::write_to_stream(ostream& bin_ostream) { Log::debug("WRITING GENOME TO STREAM\n"); + + write_magic(); + bin_ostream.write((char*) &generation_id, sizeof(int32_t)); bin_ostream.write((char*) &group_id, sizeof(int32_t)); bin_ostream.write((char*) &bp_iterations, sizeof(int32_t)); + write_magic(); + bin_ostream.write((char*) &use_dropout, sizeof(bool)); bin_ostream.write((char*) &dropout_probability, sizeof(double)); + write_magic(); + WeightType weight_initialize = weight_rules->get_weight_initialize_method(); WeightType weight_inheritance = weight_rules->get_weight_inheritance_method(); WeightType mutated_component_weight = weight_rules->get_mutated_components_weight_method(); @@ -3483,6 +3531,8 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { bin_ostream.write((char*) &weight_inheritance, sizeof(int32_t)); bin_ostream.write((char*) &mutated_component_weight, sizeof(int32_t)); + write_magic(); + Log::debug("generation_id: %d\n", generation_id); Log::debug("bp_iterations: 
%d\n", bp_iterations); @@ -3500,16 +3550,20 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { string generator_str = generator_oss.str(); write_binary_string(bin_ostream, generator_str, "generator"); - ostringstream rng_0_1_oss; - rng_0_1_oss << rng_0_1; - string rng_0_1_str = rng_0_1_oss.str(); - write_binary_string(bin_ostream, rng_0_1_str, "rng_0_1"); + write_magic(); + + // ostringstream rng_0_1_oss; + // rng_0_1_oss << rng_0_1; + // string rng_0_1_str = rng_0_1_oss.str(); + // write_binary_string(bin_ostream, rng_0_1_str, "rng_0_1"); ostringstream generated_by_map_oss; write_map(generated_by_map_oss, generated_by_map); string generated_by_map_str = generated_by_map_oss.str(); write_binary_string(bin_ostream, generated_by_map_str, "generated_by_map"); + write_magic(); + bin_ostream.write((char*) &best_validation_mse, sizeof(double)); bin_ostream.write((char*) &best_validation_mae, sizeof(double)); @@ -3518,18 +3572,24 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { bin_ostream.write((char*) &n_initial_parameters, sizeof(int32_t)); bin_ostream.write((char*) &initial_parameters[0], sizeof(double) * initial_parameters.size()); + write_magic(); + int32_t n_best_parameters = (int32_t) best_parameters.size(); bin_ostream.write((char*) &n_best_parameters, sizeof(int32_t)); if (n_best_parameters) { bin_ostream.write((char*) &best_parameters[0], sizeof(double) * best_parameters.size()); } + write_magic(); + int32_t n_input_parameter_names = (int32_t) input_parameter_names.size(); bin_ostream.write((char*) &n_input_parameter_names, sizeof(int32_t)); for (int32_t i = 0; i < (int32_t) input_parameter_names.size(); i++) { write_binary_string(bin_ostream, input_parameter_names[i], "input_parameter_names[" + std::to_string(i) + "]"); } + write_magic(); + int32_t n_output_parameter_names = (int32_t) output_parameter_names.size(); bin_ostream.write((char*) &n_output_parameter_names, sizeof(int32_t)); for (int32_t i = 0; i < (int32_t) 
output_parameter_names.size(); i++) { @@ -3538,6 +3598,8 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { ); } + write_magic(); + int32_t n_nodes = (int32_t) nodes.size(); bin_ostream.write((char*) &n_nodes, sizeof(int32_t)); Log::debug("writing %d nodes.\n", n_nodes); @@ -3548,6 +3610,7 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { nodes[i]->depth, nodes[i]->parameter_name.c_str() ); nodes[i]->write_to_stream(bin_ostream); + write_magic(); } int32_t n_edges = (int32_t) edges.size(); @@ -3560,6 +3623,7 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { edges[i]->output_innovation_number ); edges[i]->write_to_stream(bin_ostream); + write_magic(); } int32_t n_recurrent_edges = (int32_t) recurrent_edges.size(); @@ -3573,6 +3637,7 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { ); recurrent_edges[i]->write_to_stream(bin_ostream); + write_magic(); } write_binary_string(bin_ostream, normalize_type, "normalize_type"); @@ -3596,6 +3661,8 @@ void RNN_Genome::write_to_stream(ostream& bin_ostream) { write_map(normalize_std_devs_oss, normalize_std_devs); string normalize_std_devs_str = normalize_std_devs_oss.str(); write_binary_string(bin_ostream, normalize_std_devs_str, "normalize_std_devs"); + + write_magic(); } void RNN_Genome::update_innovation_counts(int32_t& node_innovation_count, int32_t& edge_innovation_count) { diff --git a/rnn/rnn_genome.hxx b/rnn/rnn_genome.hxx index 717b5257..c3584e51 100644 --- a/rnn/rnn_genome.hxx +++ b/rnn/rnn_genome.hxx @@ -32,7 +32,7 @@ using std::vector; // mysql can't handle the max float value for some reason #define EXAMM_MAX_DOUBLE 10000000 -extern vector dnas_node_types; +extern vector dnas_node_types; string parse_fitness(double fitness); @@ -111,7 +111,7 @@ class RNN_Genome { string generated_by_string(); string get_edge_count_str(bool recurrent); - string get_node_count_str(int32_t node_type); + string get_node_count_str(node_t node_type); const map* get_generated_by_map(); @@ -119,8 
+119,8 @@ class RNN_Genome { int32_t get_enabled_edge_count(); int32_t get_enabled_recurrent_edge_count(); - int32_t get_enabled_node_count(int32_t node_type); - int32_t get_node_count(int32_t node_type); + int32_t get_enabled_node_count(node_t node_type); + int32_t get_node_count(node_t node_type); int32_t get_enabled_node_count(); int32_t get_node_count(); @@ -241,7 +241,7 @@ class RNN_Genome { bool outputs_unreachable(); RNN_Node_Interface* create_node( - double mu, double sigma, int32_t node_type, int32_t& node_innovation_count, double depth + double mu, double sigma, node_t node_type, int32_t& node_innovation_count, double depth ); bool attempt_edge_insert( @@ -266,23 +266,23 @@ class RNN_Genome { bool disable_edge(); bool enable_edge(); bool split_edge( - double mu, double sigma, int32_t node_type, uniform_int_distribution rec_depth_dist, + double mu, double sigma, node_t node_type, uniform_int_distribution rec_depth_dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ); bool add_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ); bool enable_node(); bool disable_node(); bool split_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ); bool merge_node( - double mu, double sigma, int32_t node_type, uniform_int_distribution dist, + double mu, double sigma, node_t node_type, uniform_int_distribution dist, int32_t& edge_innovation_count, int32_t& node_innovation_count ); @@ -295,7 +295,17 @@ class RNN_Genome { */ bool has_node_with_innovation(int32_t innovation_number) const; - bool equals(RNN_Genome* other); + bool equals(const RNN_Genome* other) const; + bool operator==(const RNN_Genome& other) const; + + 
/** + * Hash function implementation. + * Based on the hash code of the structural hash. + * */ + struct StructuralHash { + size_t operator()(const RNN_Genome& other) const; + size_t operator()(const RNN_Genome* other) const; + }; string get_color(double weight, bool is_recurrent); void write_graphviz(string filename); @@ -327,6 +337,10 @@ class RNN_Genome { ); vector pick_possible_nodes(int32_t layer_type, bool not_all_hidden, string node_type); + const vector& get_nodes() { + return this->nodes; + } + void update_innovation_counts(int32_t& node_innovation_count, int32_t& edge_innovation_count); vector get_innovation_list(); diff --git a/rnn/rnn_node.cxx b/rnn/rnn_node.cxx index b6f34344..e0270e2a 100644 --- a/rnn/rnn_node.cxx +++ b/rnn/rnn_node.cxx @@ -5,7 +5,7 @@ using std::vector; #include "common/log.hxx" #include "rnn_node.hxx" -RNN_Node::RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _depth, int32_t _node_type) +RNN_Node::RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _depth, node_t _node_type) : RNN_Node_Interface(_innovation_number, _layer_type, _depth), bias(0) { // node type will be simple, jordan or elman node_type = _node_type; @@ -13,7 +13,7 @@ RNN_Node::RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _dept } RNN_Node::RNN_Node( - int32_t _innovation_number, int32_t _layer_type, double _depth, int32_t _node_type, string _parameter_name + int32_t _innovation_number, int32_t _layer_type, double _depth, node_t _node_type, string _parameter_name ) : RNN_Node_Interface(_innovation_number, _layer_type, _depth, _parameter_name), bias(0) { // node type will be simple, jordan or elman @@ -91,6 +91,8 @@ void RNN_Node::try_update_deltas(int32_t time) { outputs_fired[time], total_outputs ); exit(1); + } else if (time >= d_input.size() || time < 0) { + Log::fatal("invalid time %d\n", time); } d_input[time] *= ld_output[time]; diff --git a/rnn/rnn_node.hxx b/rnn/rnn_node.hxx index 0f6bf741..3bfefa1b 100644 --- 
a/rnn/rnn_node.hxx +++ b/rnn/rnn_node.hxx @@ -15,12 +15,10 @@ class RNN_Node : public RNN_Node_Interface { public: // constructor for hidden nodes - RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _depth, int32_t _node_type); + RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _depth, node_t _node_type); // constructor for input and output nodes - RNN_Node( - int32_t _innovation_number, int32_t _layer_type, double _depth, int32_t _node_type, string _parameter_name - ); + RNN_Node(int32_t _innovation_number, int32_t _layer_type, double _depth, node_t _node_type, string _parameter_name); ~RNN_Node(); void initialize_lamarckian( diff --git a/rnn/rnn_node_interface.cxx b/rnn/rnn_node_interface.cxx index 5030aa88..7f3d061b 100644 --- a/rnn/rnn_node_interface.cxx +++ b/rnn/rnn_node_interface.cxx @@ -8,14 +8,18 @@ using std::ostream; #include using std::string; +#include +using std::vector; + #include "common/log.hxx" #include "rnn/rnn_genome.hxx" #include "rnn_node_interface.hxx" -extern const string NODE_TYPES[] = {"simple", "jordan", "elman", "UGRNN", "MGU", "GRU", "delta", - "LSTM", "ENARC", "ENAS_DAG", "rdag", "dnas", "sin", "sum", - "cos", "tanh", "sigmoid", "inverse", "multiply"}; -extern const unordered_map string_to_node_type = { +const vector NODE_TYPES = {"simple", "jordan", "elman", "UGRNN", "MGU", "GRU", "delta", + "LSTM", "ENARC", "ENAS_DAG", "rdag", "dnas", "sin", "sum", + "cos", "tanh", "sigmoid", "inverse", "multiply"}; + +extern const unordered_map string_to_node_type = { { "simple", SIMPLE_NODE}, { "jordan", JORDAN_NODE}, { "elman", ELMAN_NODE}, @@ -37,9 +41,9 @@ extern const unordered_map string_to_node_type = { {"multiply", MULTIPLY_NODE}, }; -extern const int32_t NUMBER_NODE_TYPES = string_to_node_type.size(); +extern const int32_t NUMBER_NODE_TYPES = NODE_TYPES.size(); -int32_t node_type_from_string(string& node_type) { +node_t node_type_from_string(string& node_type) { std::transform(node_type.begin(), 
node_type.end(), node_type.begin(), [](unsigned char c) { return std::tolower(c); }); diff --git a/rnn/rnn_node_interface.hxx b/rnn/rnn_node_interface.hxx index 2ee56875..2782a87b 100644 --- a/rnn/rnn_node_interface.hxx +++ b/rnn/rnn_node_interface.hxx @@ -26,32 +26,33 @@ class RNN; #define HIDDEN_LAYER 1 #define OUTPUT_LAYER 2 -extern const string NODE_TYPES[]; -extern const unordered_map string_to_node_type; -extern const int32_t NUMBER_NODE_TYPES; -int32_t node_type_from_string(string& node_type); - -#define SIMPLE_NODE 0 -#define JORDAN_NODE 1 -#define ELMAN_NODE 2 -#define UGRNN_NODE 3 -#define MGU_NODE 4 -#define GRU_NODE 5 -#define DELTA_NODE 6 -#define LSTM_NODE 7 -#define ENARC_NODE 8 -#define ENAS_DAG_NODE 9 -#define RANDOM_DAG_NODE 10 -#define DNAS_NODE 11 -#define SIN_NODE 12 -#define SUM_NODE 13 -#define COS_NODE 14 -#define TANH_NODE 15 -#define SIGMOID_NODE 16 -#define INVERSE_NODE 17 -#define MULTIPLY_NODE 18 - -int32_t node_type_from_string(string& node_type); +extern const vector NODE_TYPES; + +enum node_t : int32_t { + SIMPLE_NODE = 0, + JORDAN_NODE = 1, + ELMAN_NODE = 2, + UGRNN_NODE = 3, + MGU_NODE = 4, + GRU_NODE = 5, + DELTA_NODE = 6, + LSTM_NODE = 7, + ENARC_NODE = 8, + ENAS_DAG_NODE = 9, + RANDOM_DAG_NODE = 10, + DNAS_NODE = 11, + SIN_NODE = 12, + SUM_NODE = 13, + COS_NODE = 14, + TANH_NODE = 15, + SIGMOID_NODE = 16, + INVERSE_NODE = 17, + MULTIPLY_NODE = 18, +}; + +node_t node_type_from_string(string& node_type); + +extern const unordered_map string_to_node_type; double sigmoid(double value); double sigmoid_derivative(double value); @@ -69,7 +70,7 @@ class RNN_Node_Interface { public: int32_t innovation_number; int32_t layer_type; - int32_t node_type; + node_t node_type; double depth; diff --git a/rnn_examples/CMakeLists.txt b/rnn_examples/CMakeLists.txt index 2bfda532..f5e294c6 100644 --- a/rnn_examples/CMakeLists.txt +++ b/rnn_examples/CMakeLists.txt @@ -16,3 +16,6 @@ target_link_libraries(evaluate_rnns_multi_offset examm_strategy 
exact_common exa add_executable(rnn_statistics rnn_statistics.cxx) target_link_libraries(rnn_statistics examm_strategy exact_common exact_time_series exact_weights examm_nn ${MPI_LIBRARIES} ${MPI_EXTRA} ${MYSQL_LIBRARIES} pthread) +add_executable(dnas_info dnas_info.cxx) +target_link_libraries(dnas_info examm_strategy exact_common exact_time_series exact_weights examm_nn ${MPI_LIBRARIES} ${MPI_EXTRA} ${MYSQL_LIBRARIES} pthread) + diff --git a/rnn_examples/dnas_info.cxx b/rnn_examples/dnas_info.cxx new file mode 100644 index 00000000..fac60c84 --- /dev/null +++ b/rnn_examples/dnas_info.cxx @@ -0,0 +1,96 @@ +#include +#include +using std::getline; +using std::ifstream; +using std::ofstream; + +#include +using std::minstd_rand0; +using std::uniform_real_distribution; + +#include +using std::string; + +#include +using std::vector; + +#include "common/arguments.hxx" +#include "common/files.hxx" +#include "common/log.hxx" +#include "rnn/generate_nn.hxx" +#include "rnn/gru_node.hxx" +#include "rnn/lstm_node.hxx" +#include "rnn/rnn_edge.hxx" +#include "rnn/rnn_genome.hxx" +#include "rnn/rnn_node.hxx" +#include "rnn/rnn_node_interface.hxx" +#include "time_series/time_series.hxx" +#include "weights/weight_rules.hxx" +#include "weights/weight_update.hxx" + +vector > > training_inputs; +vector > > training_outputs; +vector > > test_inputs; +vector > > test_outputs; + +bool random_sequence_length; +int32_t sequence_length_lower_bound = 30; +int32_t sequence_length_upper_bound = 100; + +RNN_Genome* genome; +RNN* rnn; +WeightUpdate* weight_update_method; +int32_t bp_iterations; +bool using_dropout; +double dropout_probability; + +ofstream* log_file; +string output_directory; + +double objective_function(const vector& parameters) { + rnn->set_weights(parameters); + + double error = 0.0; + + for (int32_t i = 0; i < (int32_t) training_inputs.size(); i++) { + error += rnn->prediction_mae(training_inputs[i], training_outputs[i], false, true, 0.0); + } + + return -error; +} + +double 
test_objective_function(const vector& parameters) { + rnn->set_weights(parameters); + + double total_error = 0.0; + + for (int32_t i = 0; i < (int32_t) test_inputs.size(); i++) { + double error = rnn->prediction_mse(test_inputs[i], test_outputs[i], false, true, 0.0); + total_error += error; + + Log::info("output for series[%d]: %lf\n", i, error); + } + + return -total_error; +} + +int main(int argc, char** argv) { + vector arguments = vector(argv, argv + argc); + + Log::initialize(arguments); + Log::set_id("main"); + + string filename; + get_argument(arguments, "--filename", true, filename); + + RNN_Genome genome(filename); + + for (auto node : genome.get_nodes()) { + if (DNASNode* d = dynamic_cast(node)) { + std::cout << "'" << filename << "': "; + d->print_info(); + } + } + + Log::release_id("main"); +} diff --git a/rnn_examples/train_rnn.cxx b/rnn_examples/train_rnn.cxx index ffdf8999..64727f07 100644 --- a/rnn_examples/train_rnn.cxx +++ b/rnn_examples/train_rnn.cxx @@ -17,6 +17,7 @@ using std::vector; #include "common/arguments.hxx" #include "common/files.hxx" #include "common/log.hxx" +#include "common/process_arguments.hxx" #include "rnn/generate_nn.hxx" #include "rnn/gru_node.hxx" #include "rnn/lstm_node.hxx" @@ -81,25 +82,24 @@ int main(int argc, char** argv) { Log::set_id("main"); TimeSeriesSets* time_series_sets = TimeSeriesSets::generate_from_arguments(arguments); + get_train_validation_data( + arguments, time_series_sets, training_inputs, training_outputs, test_inputs, test_outputs + ); + + int32_t crystallization_threshold = 1000; + get_argument(arguments, "--crystalize_iters", false, crystallization_threshold); + DNASNode::CRYSTALLIZATION_THRESHOLD = crystallization_threshold; - int32_t time_offset = 1; - get_argument(arguments, "--time_offset", true, time_offset); + int32_t k = -1; + get_argument(arguments, "--dnas_k", false, k); + DNASNode::k = k; - time_series_sets->export_training_series(time_offset, training_inputs, training_outputs); - 
time_series_sets->export_test_series(time_offset, test_inputs, test_outputs); + // time_series_sets->export_training_series(time_offset, training_inputs, training_outputs); + // time_series_sets->export_test_series(time_offset, test_inputs, test_outputs); int number_inputs = time_series_sets->get_number_inputs(); // int number_outputs = time_series_sets->get_number_outputs(); - string rnn_type; - get_argument(arguments, "--rnn_type", true, rnn_type); - - int32_t num_hidden_layers; - get_argument(arguments, "--num_hidden_layers", true, num_hidden_layers); - - int32_t max_recurrent_depth; - get_argument(arguments, "--max_recurrent_depth", true, max_recurrent_depth); - WeightRules* weight_rules = new WeightRules(arguments); weight_update_method = new WeightUpdate(); @@ -108,74 +108,98 @@ int main(int argc, char** argv) { vector input_parameter_names = time_series_sets->get_input_parameter_names(); vector output_parameter_names = time_series_sets->get_output_parameter_names(); - RNN_Genome* genome; - Log::info("RNN TYPE = %s\n", rnn_type.c_str()); - if (rnn_type == "lstm") { - genome = create_lstm( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - - } else if (rnn_type == "gru") { - genome = create_gru( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - - } else if (rnn_type == "delta") { - genome = create_delta( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); + string genome_file; + get_argument(arguments, "--genome_file", false, genome_file); + Log::info("RNN_GENOME = <%s> \n", genome_file.c_str()); - } else if (rnn_type == "mgu") { - genome = create_mgu( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - - } else if (rnn_type == "ugrnn") { - genome = create_ugrnn( - 
input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - - } else if (rnn_type == "ff") { - genome = create_ff( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - - } else if (rnn_type == "jordan") { - genome = create_jordan( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); + RNN_Genome* genome; - } else if (rnn_type == "elman") { - genome = create_elman( - input_parameter_names, num_hidden_layers, number_inputs, output_parameter_names, max_recurrent_depth, - weight_rules - ); - } else if (rnn_type == "dnas") { - vector node_types = {SIMPLE_NODE, LSTM_NODE, GRU_NODE, MGU_NODE, DELTA_NODE}; - genome = create_dnas_nn( - input_parameter_names, num_hidden_layers, 1, output_parameter_names, max_recurrent_depth, node_types, - weight_rules - ); + if (genome_file.size() != 0) { + genome = new RNN_Genome(genome_file); + genome->set_weights(genome->get_best_parameters()); + Log::info("Number of weights = %d\n", genome->get_number_weights()); } else { - Log::fatal("ERROR: incorrect rnn type\n"); - Log::fatal("Possibilities are:\n"); - Log::fatal(" lstm\n"); - Log::fatal(" gru\n"); - Log::fatal(" ff\n"); - Log::fatal(" jordan\n"); - Log::fatal(" elman\n"); - exit(1); + string rnn_type; + get_argument(arguments, "--rnn_type", true, rnn_type); + + Log::info("RNN TYPE = %s\n", rnn_type.c_str()); + + int32_t num_hidden_layers; + get_argument(arguments, "--num_hidden_layers", false, num_hidden_layers); + + int32_t max_recurrent_depth; + get_argument(arguments, "--max_recurrent_depth", false, max_recurrent_depth); + + int32_t hidden_layer_size = number_inputs; + get_argument(arguments, "--hidden_layer_size", false, hidden_layer_size); + + if (rnn_type == "lstm") { + genome = create_lstm( + input_parameter_names, num_hidden_layers, hidden_layer_size, 
output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "gru") { + genome = create_gru( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "delta") { + genome = create_delta( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "mgu") { + genome = create_mgu( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "ugrnn") { + genome = create_ugrnn( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "ff") { + genome = create_ff( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "jordan") { + genome = create_jordan( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + + } else if (rnn_type == "elman") { + genome = create_elman( + input_parameter_names, num_hidden_layers, hidden_layer_size, output_parameter_names, + max_recurrent_depth, weight_rules + ); + } else if (rnn_type == "dnas") { + vector node_types = {SIMPLE_NODE, LSTM_NODE, GRU_NODE, MGU_NODE, DELTA_NODE, UGRNN_NODE}; + genome = create_dnas_nn( + input_parameter_names, num_hidden_layers, 1, output_parameter_names, max_recurrent_depth, node_types, + weight_rules + ); + } else { + Log::fatal("ERROR: incorrect rnn type %s\n", rnn_type.c_str()); + Log::fatal("Possibilities are:\n"); + Log::fatal(" lstm\n"); + Log::fatal(" gru\n"); + Log::fatal(" ff\n"); + Log::fatal(" jordan\n"); + Log::fatal(" elman\n"); + exit(1); + } } get_argument(arguments, "--bp_iterations", true, bp_iterations); - 
genome->set_bp_iterations(bp_iterations); + genome->set_bp_iterations(bp_iterations + genome->get_bp_iterations()); get_argument(arguments, "--output_directory", true, output_directory); if (output_directory != "") { @@ -187,6 +211,9 @@ int main(int argc, char** argv) { genome->set_log_filename(output_directory + "/" + log_filename); } + string output_genome_name = "output_genome.bin"; + get_argument(arguments, "--output_genome_name", false, output_genome_name); + genome->set_parameter_names( time_series_sets->get_input_parameter_names(), time_series_sets->get_output_parameter_names() ); @@ -208,7 +235,7 @@ int main(int argc, char** argv) { using_dropout = false; - genome->initialize_randomly(); + genome->set_weights(genome->get_best_parameters()); double learning_rate = 0.001; get_argument(arguments, "--learning_rate", false, learning_rate); @@ -232,6 +259,8 @@ int main(int argc, char** argv) { genome->get_weights(best_parameters); rnn->set_weights(best_parameters); + genome->write_to_file(output_directory + "/output_genome.bin"); + Log::info("TRAINING ERRORS:\n"); Log::info("MSE: %lf\n", genome->get_mse(best_parameters, training_inputs, training_outputs)); Log::info("MAE: %lf\n", genome->get_mae(best_parameters, training_inputs, training_outputs)); diff --git a/rnn_tests/test_dnas_gradients.cxx b/rnn_tests/test_dnas_gradients.cxx index 98078193..df917349 100644 --- a/rnn_tests/test_dnas_gradients.cxx +++ b/rnn_tests/test_dnas_gradients.cxx @@ -49,7 +49,7 @@ int main(int argc, char** argv) { WeightRules* weight_rules = new WeightRules(); weight_rules->initialize_from_args(arguments); - vector node_types = {SIMPLE_NODE, LSTM_NODE, GRU_NODE, MGU_NODE, JORDAN_NODE, ELMAN_NODE, DELTA_NODE}; + vector node_types = {SIMPLE_NODE, LSTM_NODE, GRU_NODE, MGU_NODE, JORDAN_NODE, ELMAN_NODE, DELTA_NODE}; for (int32_t max_recurrent_depth = 1; max_recurrent_depth <= 5; max_recurrent_depth++) { Log::info("testing with max recurrent depth: %d\n", max_recurrent_depth); diff --git 
a/scripts/air_quality/eval_merra.sh b/scripts/air_quality/eval_merra.sh new file mode 100755 index 00000000..d3de8abd --- /dev/null +++ b/scripts/air_quality/eval_merra.sh @@ -0,0 +1,17 @@ +INPUT_PARAMETERS="lon lat lev AIRDENS SO4 SO2 RH PS H O3 T U V" +# OUTPUT_PARAMETERS="CO" +OUTPUT_PARAMETERS="CO SO4 SO2 O3" + +exp_name="/home/aidan/sandbox/DEEPSPrj/output/merra/multivar_B/evaluation" +mkdir -p $exp_name + +../../build/rnn_examples/evaluate_rnn \ +--testing_filenames /home/aidan/sandbox/DEEPSPrj/data/MERRA/merra_eval_1000.csv \ +--time_offset 1 \ +--input_parameter_names $INPUT_PARAMETERS \ +--output_parameter_names $OUTPUT_PARAMETERS \ +--genome_file $1 \ +--output_directory $exp_name \ +--std_message_level INFO \ +--file_message_level ERROR +# --bp_iterations $epochs \ diff --git a/scripts/air_quality/evolve_aq.sh b/scripts/air_quality/evolve_aq.sh new file mode 100755 index 00000000..0136307e --- /dev/null +++ b/scripts/air_quality/evolve_aq.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# This is a script for evolving networks to predict Air Quality data +# This also doubles as an example for the csv delimiter option + +cd build + +# INPUT_PARAMETERS="Date Time PT08.S1(CO) PT08.S2(NMHC) PT08.S3(NOx) PT08.S4(NO2) PT08.S5(O3) T RH AH" +INPUT_PARAMETERS="PT08.S5(O3) T RH AH" +# OUTPUT_PARAMETERS="CO(GT) NO2(GT) NOx(GT) NMHC(GT)" +OUTPUT_PARAMETERS="CO(GT)" + +exp_name="/home/aidan/sandbox/DEEPSPrj/output/univar3" +mkdir -p $exp_name +echo "Running base EXAMM code with UCI Air Quality dataset, results will be saved to: "$exp_name +echo "###-------------------###" + +../../build/multithreaded/examm_mt \ +--training_filenames /home/aidan/sandbox/DEEPSPrj/data/AirQualityUCI.csv \ +--test_filenames /home/aidan/sandbox/DEEPSPrj/data/AirQualityUCI.csv \ +--time_offset 1 \ +--input_parameter_names $INPUT_PARAMETERS \ +--output_parameter_names $OUTPUT_PARAMETERS \ +--number_islands 10 \ +--min_recurrent_depth 10 \ +--max_recurrent_depth 40 \ +--island_size 10 \ +--max_genomes 20000
\ +--number_threads 14 \ +--num_mutations 20 \ +--bp_iterations 20 \ +--normalize min_max \ +--output_directory $exp_name \ +--possible_node_types simple UGRNN MGU GRU delta LSTM \ +--std_message_level INFO \ +--file_message_level NONE \ +--csv_delimiter ";" diff --git a/scripts/air_quality/evolve_merra.sh b/scripts/air_quality/evolve_merra.sh new file mode 100755 index 00000000..d5a71e31 --- /dev/null +++ b/scripts/air_quality/evolve_merra.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# This is a script for evolving networks to predict Air Quality data +# This also doubles as an example for the csv delimiter option + +cd build + +INPUT_PARAMETERS="lon lat lev AIRDENS SO4 SO2 RH PS H O3 T U V" +# OUTPUT_PARAMETERS="CO" +OUTPUT_PARAMETERS="CO SO4 SO2 O3" + +exp_name="/home/aidan/sandbox/DEEPSPrj/output/merra/mv-A" +mkdir -p $exp_name +echo "Running base EXAMM code with MERRA-2 dataset, results will be saved to: "$exp_name +echo "###-------------------###" + +../../build/multithreaded/examm_mt \ +--training_filenames /home/aidan/sandbox/DEEPSPrj/data/MERRA/merra_100k_23.csv \ +--test_filenames /home/aidan/sandbox/DEEPSPrj/data/MERRA/merra_100k_23_test.csv \ +--time_offset 1 \ +--input_parameter_names $INPUT_PARAMETERS \ +--output_parameter_names $OUTPUT_PARAMETERS \ +--number_islands 10 \ +--min_recurrent_depth 1 \ +--max_recurrent_depth 100 \ +--island_size 10 \ +--max_genomes 1000 \ +--number_threads 14 \ +--num_mutations 20 \ +--bp_iterations 5 \ +--normalize none \ +--output_directory $exp_name \ +--possible_node_types simple UGRNN MGU GRU delta LSTM \ +--std_message_level INFO \ +--file_message_level NONE \ +--csv_delimiter "," diff --git a/scripts/air_quality/train_aq.sh b/scripts/air_quality/train_aq.sh new file mode 100755 index 00000000..1d323ab8 --- /dev/null +++ b/scripts/air_quality/train_aq.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# This is an example of running EXAMM MPI version on pa28 dataset, output parameters are non engine parameters +# +# The pa28 dataset is not
normalized +# To run datasets that are not normalized, make sure to add arguments: +# --normalize min_max for Min Max normalization, or +# --normalize avg_std_dev for Z-score normalization + +cd build + +INPUT_PARAMETERS="Date Time PT08.S5(O3) T RH AH" +# OUTPUT_PARAMETERS="CO(GT) NO2(GT) NOx(GT) NMHC(GT)" +OUTPUT_PARAMETERS="CO(GT)" + +exp_name="/home/aidan/sandbox/DEEPSPrj/output/init_mvar_1" +mkdir -p "${exp_name}/training" +echo "Running base EXAMM rnn training code with UCI Air Quality dataset, results will be saved to: "$exp_name +echo "###-------------------###" + +../../build/rnn_examples/train_rnn \ +--training_filenames /home/aidan/sandbox/DEEPSPrj/data/AirQualityUCI.csv \ +--test_filenames /home/aidan/sandbox/DEEPSPrj/data/AirQualityUCI.csv \ +--time_offset 1 \ +--input_parameter_names $INPUT_PARAMETERS \ +--output_parameter_names $OUTPUT_PARAMETERS \ +--bp_iterations 100000 \ +--output_directory "${exp_name}/training" \ +--std_message_level INFO \ +--file_message_level NONE \ +--genome_file $1 \ +--learning_rate 0.001 \ +--csv_delimiter ";" diff --git a/scripts/air_quality/train_merra.sh b/scripts/air_quality/train_merra.sh new file mode 100755 index 00000000..5fd2931e --- /dev/null +++ b/scripts/air_quality/train_merra.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# This is an example of running EXAMM MPI version on pa28 dataset, output parameters are non engine parameters +# +# The pa28 dataset is not normalized +# To run datasets that are not normalized, make sure to add arguments: +# --normalize min_max for Min Max normalization, or +# --normalize avg_std_dev for Z-score normalization + +cd build + +INPUT_PARAMETERS="lon lat lev AIRDENS SO4 SO2 RH PS H O3 T U V" +# OUTPUT_PARAMETERS="CO" +OUTPUT_PARAMETERS="CO SO4 SO2 O3" + +exp_name="/home/aidan/sandbox/DEEPSPrj/output/merra/multivar_B" +mkdir -p "${exp_name}/training" +echo "Running base EXAMM rnn training code with UCI Air Quality dataset, results will be saved to: "$exp_name"/training" +echo
"###-------------------###" + +../../build/rnn_examples/train_rnn \ +--training_filenames /home/aidan/sandbox/DEEPSPrj/data/MERRA/merra_post.csv \ +--test_filenames /home/aidan/sandbox/DEEPSPrj/data/MERRA/poc_merra_test.csv \ +--time_offset 1 \ +--input_parameter_names $INPUT_PARAMETERS \ +--output_parameter_names $OUTPUT_PARAMETERS \ +--bp_iterations 5 \ +--output_directory "${exp_name}/training" \ +--std_message_level INFO \ +--file_message_level NONE \ +--genome_file $1 \ +--learning_rate 0.01 \ +--csv_delimiter "," diff --git a/scripts/dnas/analyze.py b/scripts/dnas/analyze.py new file mode 100644 index 00000000..78d51466 --- /dev/null +++ b/scripts/dnas/analyze.py @@ -0,0 +1,110 @@ +import pandas + +import numpy as np + +import matplotlib.pyplot as plt + +fig, subplts = plt.subplots(6, 1) + +bprange = [8, 16] +plts = {k:v for k, v in zip(bprange, subplts)} +print(plts) +base = plts[bprange[0]] + +for k, v in plts.items(): + v.set_title(f"{k} BPI") + if k == bprange[0]: + continue + v.sharey(base) + v.sharex(base) + +def avg(files, slice_at=-1): + r = {} + for file in files: + x = [] + + for fold in range(8): + f = pandas.read_csv(f"{file}/{fold}/fitness_log.csv")[:slice_at] + print(f"{file}/{fold} -> {len(f)}") + x.append(f) + + + enabled_nodes = [] + enabled_edges = [] + enabled_rec_edges = [] + + bpi_columns = [] + mse_columns = [] + + minlen = 100000000 + + for f in x: + bpi_columns.append(f[' Total BP Epochs'].to_numpy()) + mse_columns.append(f[' Best Val. MSE'].to_numpy()) + enabled_nodes.append(f[' Enabled Nodes'].to_numpy()) + enabled_edges.append(f[' Enabled Edges'].to_numpy()) + enabled_rec_edges.append(f[' Enabled Rec. 
Edges'].to_numpy()) + + minlen = min(minlen, len(bpi_columns[-1])) + + enabled_nodes = list(map(lambda x: x[:minlen], enabled_nodes)) + enabled_edges = list(map(lambda x: x[:minlen], enabled_edges)) + enabled_rec_edges = list(map(lambda x: x[:minlen], enabled_rec_edges)) + bpi_columns = list(map(lambda x: x[:minlen], bpi_columns)) + mse_columns = list(map(lambda x: x[:minlen], mse_columns)) + + nodesmean = np.mean(np.array(enabled_nodes), axis=0) + edgesmean = np.mean(np.array(enabled_edges), axis=0) + redgesmean = np.mean(np.array(enabled_rec_edges), axis=0) + print(f"Nodes at end mean: {nodesmean[-1]}") + print(f"edges at end mean: {edgesmean[-1]}") + print(f"redges at end mean: {redgesmean[-1]}") + + + bpimean = np.mean(np.array(bpi_columns), axis=0) + msemean = np.mean(np.array(mse_columns), axis=0) + msestd = np.std(np.array(mse_columns), axis=0) + + r[file] = { + 'mean_nodes': nodesmean, + 'mean_edges': edgesmean, + 'mean_rec_edges':redgesmean, + 'bpi': bpimean, + 'mean_mse': msemean, + 'std_mse': msestd, + } + return r + +results = {} +for ci in [64]: + results[ci] = {} + for bpe in bprange: + results[ci][bpe] = {} + for k in [1]: + f = f"initial_integration_experiments/results/v7/{ci}/{bpe}/{k}/" + x = avg([f])[f] + results[ci][bpe][k] = x + print(x) + + print(x['mean_mse'] - x['std_mse']) + g = plts[bpe].plot(x['bpi'], x['mean_mse'], label=f"ci={ci}")[0] + plts[bpe].fill_between(x['bpi'], x['mean_mse'] - x['std_mse'], x['mean_mse'] + x['std_mse'], + alpha=0.2, edgecolor=g.get_color(), facecolor=g.get_color(), linewidth=0) + +control_results = {} +for bp in [8, 16]: + key = f"initial_integration_experiments/results/control_v7/{bp}" + r = avg([key])[key] + control_results[bp] = r + print(list(r.keys())) + g = plts[bp].plot(r['bpi'], r['mean_mse'], label=f"control")[0] + plts[bp].fill_between(r['bpi'], r['mean_mse'] - r['std_mse'], r['mean_mse'] + r['std_mse'], + alpha=0.2, edgecolor=g.get_color(), facecolor=g.get_color(), linewidth=0) + + +for k, v in 
plts.items(): + v.set_title(f"{k} BPI") + v.legend(fontsize=12, loc="upper right") + + +plt.show() diff --git a/scripts/dnas/analyze.zsh b/scripts/dnas/analyze.zsh new file mode 100644 index 00000000..5c2876f3 --- /dev/null +++ b/scripts/dnas/analyze.zsh @@ -0,0 +1,12 @@ +#!/usr/bin/zsh +# +for crystalize_iters in 64 128 256 512; do + for bp_epoch in 8 16 32 64 128; do + for k in 1; do + for fold in 0 1 2 3 4 5 6 7; do + output_dir=initial_integration_experiments/results/v2/$crystalize_iters/$bp_epoch/$k/$fold + tail -1 $output_dir/fitness_log.csv + done + done + done +done diff --git a/scripts/dnas/aviation.zsh b/scripts/dnas/aviation.zsh new file mode 100644 index 00000000..7059da3e --- /dev/null +++ b/scripts/dnas/aviation.zsh @@ -0,0 +1,37 @@ +#!/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' + +offset=1 + +run_examm() { + output_dir=results/v0/$bp_epoch/$fold + mkdir -p $output_dir + mpirun -np 32 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=output_params} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level INFO \ + --max_genomes 10000 \ + --island_size 32 \ + --number_islands 4 + + touch $output_dir/completed +} + +for output_params in "E1_CHT1" "Pitch"; do + for bp_epoch in 2 4 8 16 32; do + for fold in 0 1 2 3 4 5 6 7 8 9; do + run_examm + done + done +done diff --git a/scripts/dnas/coal_dnas_control.zsh b/scripts/dnas/coal_dnas_control.zsh new file mode 100644 index 
00000000..9543cc09 --- /dev/null +++ b/scripts/dnas/coal_dnas_control.zsh @@ -0,0 +1,22 @@ +#!/bin/zsh + +let np=8 +#SBATCH --ntasks=8 +#SBATCH --exclude theocho +#SBATCH --time=8-00:00:00 +#SBATCH -A examm +#SBATCH --partition=TIER +#SBATCH -J examm_coal_gp_control +#SBATCH -o /home/jak5763/exact/results/dnas_control/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/dnas_control/slurm_out/%x.%j.err +#SBATCH --mem=64GB + +source lib.zsh + +output_dir_prefix=/home/jak5763/exact/results/gp_control +bp_epoch_set=(8 16 32 64 128) +nfolds=20 +MAX_GENOMES=4000 +ISLAND_SIZE=10 +N_ISLANDS=10 +coal diff --git a/scripts/dnas/coal_gp.zsh b/scripts/dnas/coal_gp.zsh new file mode 100644 index 00000000..c1318793 --- /dev/null +++ b/scripts/dnas/coal_gp.zsh @@ -0,0 +1,22 @@ +#!/bin/zsh + +let np=8 +#SBATCH --ntasks=8 +#SBATCH --exclude theocho +#SBATCH --time=8-00:00:00 +#SBATCH -A examm +#SBATCH --partition=TIER +#SBATCH -J examm_coal_gp_control +#SBATCH -o /home/jak5763/exact/results/gp_control/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/gp_control/slurm_out/%x.%j.err +#SBATCH --mem=64GB + +source lib.zsh + +output_dir_prefix=/home/jak5763/exact/results/gp_control +bp_epoch_set=(8) +nfolds=20 +MAX_GENOMES=10000 +ISLAND_SIZE=10 +N_ISLANDS=10 +coal diff --git a/scripts/dnas/control.zsh b/scripts/dnas/control.zsh new file mode 100644 index 00000000..f3532525 --- /dev/null +++ b/scripts/dnas/control.zsh @@ -0,0 +1,47 @@ +#!/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=results/control_v8/$bp_epoch/$fold + mkdir -p $output_dir + mpirun -np 14 build/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames 
datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types lstm mgu gru ugrnn delta simple \ + --stochastic 0 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 32 \ + --number_islands 4 \ + --synchronous + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k initial_integration_experiments/post_training_dnas.zsh +} + +# bp_ge=(8 8192 16 4096 32 2048) +bp_ge=(8 8192) + +for bp_epoch max_genomes in "${(@kv)bp_ge}"; do + for fold in $(seq 0 1); do + run_examm + done +done diff --git a/scripts/dnas/control_cluster.zsh b/scripts/dnas/control_cluster.zsh new file mode 100644 index 00000000..a848302b --- /dev/null +++ b/scripts/dnas/control_cluster.zsh @@ -0,0 +1,50 @@ +#!/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/control_v8/$bp_epoch/$fold + mkdir -p $output_dir + mpirun -np 8 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types lstm mgu gru ugrnn 
delta simple \ + --stochastic 0 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 32 \ + --number_islands 4 + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k initial_integration_experiments/post_training_dnas.zsh +} + +bp_ge=(8 8192 16 4096 32 2048) + +for bp_epoch max_genomes in "${(@kv)bp_ge}"; do + for fold in 0 1 2 3; do + run_examm & + done + wait + for fold in 4 5 6 7; do + run_examm & + done + wait +done diff --git a/scripts/dnas/debug.zsh b/scripts/dnas/debug.zsh new file mode 100755 index 00000000..ce159c01 --- /dev/null +++ b/scripts/dnas/debug.zsh @@ -0,0 +1,55 @@ +#!/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/debug/$crystalize_iters/$bp_epoch/$k/$fold + mkdir -p $output_dir + mpirun -np 63 --use-hwthread-cpus Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types dnas \ + --stochastic 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names 
${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes 8192 \ + --island_size 32 \ + --number_islands 4 \ + --stochastic \ + --dnas_k $k + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k ./initial_integration_experiments/post_training_dnas.zsh +} + +CELL_TYPE='dnas' +for crystalize_iters in 128; do + for bp_epoch in 8; do + for k in 1; do + for fold in 0; do + run_examm + done + # wait + # for fold in 4 5 6 7; do + # run_examm & + # done + # wait + done + done +done diff --git a/scripts/dnas/dnas.zsh b/scripts/dnas/dnas.zsh new file mode 100644 index 00000000..8b525b09 --- /dev/null +++ b/scripts/dnas/dnas.zsh @@ -0,0 +1,55 @@ +#!/bin/zsh + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/v8/$crystalize_iters/$bp_epoch/$k/$fold + mkdir -p $output_dir + mpirun -np 8 --use-hwthread-cpus Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types dnas \ + --stochastic 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize 
min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 32 \ + --number_islands 4 \ + --dnas_k $k + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k ./initial_integration_experiments/post_training_dnas.zsh +} + +CELL_TYPE='dnas' +bp_ge=(8 8192 16 4096 32 2048) +for crystalize_iters in 256; do + for bp_epoch max_genomes in "${(@kv)bp_ge}"; do + for k in 1; do + for fold in 0 1 2 3; do + run_examm & + done + wait + for fold in 4 5 6 7; do + run_examm & + done + wait + done + done +done diff --git a/scripts/dnas/dnas_cluster.zsh b/scripts/dnas/dnas_cluster.zsh new file mode 100644 index 00000000..55823c0c --- /dev/null +++ b/scripts/dnas/dnas_cluster.zsh @@ -0,0 +1,69 @@ +#!/bin/zsh + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=36 +#SBATCH --exclude theocho +#SBATCH --time=23:00:00 +#SBATCH -A examm +#SBATCH --partition=tier3 +#SBATCH -J examm_dnas_experimental +#SBATCH -o /home/jak5763/exact/results/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/slurm_out/%x.%j.err +#SBATCH --mem=0 + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/v16/$crystalize_iters/$bp_epoch/$k/$fold + mkdir -p $output_dir + srun -n 36 Release/mpi/examm_mpi \ + --training_filenames 
datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types dnas \ + --stochastic 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 10 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 32 \ + --number_islands 8 \ + --num_mutations 2 \ + --use_dnas_seed true \ + --use_burn_in_bp_epoch \ + --burn_in_period 1024 \ + --dnas_k $k + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k ./initial_integration_experiments/post_training_dnas.zsh +} + +run_group() { + for crystalize_iters in 512; do + for k in 1; do + for fold in $(seq 0 19); do + run_examm + done + done + done +} + +CELL_TYPE='dnas' +# bp_ge=(8 8192 16 4096 32 2048 64 1024) +# for bp_epoch max_genomes in "${(@kv)bp_ge}"; do +run_group +# done diff --git a/scripts/dnas/dnas_control.zsh b/scripts/dnas/dnas_control.zsh new file mode 100644 index 00000000..88a7c882 --- /dev/null +++ b/scripts/dnas/dnas_control.zsh @@ -0,0 +1,60 @@ +#!/bin/zsh + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=36 +#SBATCH --exclude theocho +#SBATCH --time=23:00:00 +#SBATCH -A examm +#SBATCH --partition=tier3 +#SBATCH -J examm_dnas_experimental +#SBATCH -o /home/jak5763/exact/results_control/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results_control/slurm_out/%x.%j.err +#SBATCH --mem=0 + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 
E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/control_v13.1/$bp_epoch/$fold + mkdir -p $output_dir + srun -n 36 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types lstm mgu gru ugrnn delta simple \ + --stochastic 0 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --max_recurrent_depth 10 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 32 \ + --number_islands 8 \ + --num_mutations 4 \ + --burn_in_period 1024 \ + --use_burn_in_bp_epoch + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k ./initial_integration_experiments/post_training_dnas.zsh +} + +run_group() { + for k in 1; do + for fold in $(seq 0 19); do + run_examm + done + done +} + +run_group diff --git a/scripts/dnas/dnas_r2_cluster.zsh b/scripts/dnas/dnas_r2_cluster.zsh new file mode 100644 index 00000000..a8bce387 --- /dev/null +++ b/scripts/dnas/dnas_r2_cluster.zsh @@ -0,0 +1,67 @@ +#!/bin/zsh + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=36 +#SBATCH --exclude theocho +#SBATCH --time=23:00:00 +#SBATCH -A examm +#SBATCH --partition=tier3 +#SBATCH -J examm_dnas_experimental +#SBATCH -o 
/home/jak5763/exact/results/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/slurm_out/%x.%j.err +#SBATCH --mem=0 + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4' + +offset=1 + +run_examm() { + output_dir=initial_integration_experiments/results/v9/$crystalize_iters/$bp_epoch/$k/$fold + mkdir -p $output_dir + srun -n 36 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types dnas \ + --stochastic 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --validation_sequence_length 100 \ + --max_recurrent_depth 10 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.001 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 16 \ + --number_islands 8 \ + --num_mutations 4 \ + --use_dnas_seed true \ + --dnas_k $k + + # best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + # BP_ITERS=$crystalize_iters CRYSTALIZE_ITERS=$crystalize_iters GENOME=$best_genome_file OUTPUT_DIRECTORY=$output_dir k=$k ./initial_integration_experiments/post_training_dnas.zsh +} + +run_group() { + for crystalize_iters in 1000000; do + for k in 1; do + for fold in $(seq 0 19); do + run_examm + done + done + done +} + +CELL_TYPE='dnas' +# bp_ge=(8 8192 16 4096 32 2048 64 1024) +# for bp_epoch max_genomes in "${(@kv)bp_ge}"; do +run_group +# done diff --git a/scripts/dnas/examm_bias_exp.zsh 
b/scripts/dnas/examm_bias_exp.zsh new file mode 100644 index 00000000..52816f00 --- /dev/null +++ b/scripts/dnas/examm_bias_exp.zsh @@ -0,0 +1,58 @@ +#!/bin/zsh + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=18 +#SBATCH --exclude theocho +#SBATCH --time=48:00:00 +#SBATCH -A examm +#SBATCH --partition=tier3 +#SBATCH -J examm_bias_ablation +#SBATCH -o /home/jak5763/exact/results/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/slurm_out/%x.%j.err +#SBATCH --mem=64GB + +cd /home/jak5763/exact + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1' + +offset=1 + +run_examm() { + output_dir=results/$synchronous/$scramble_weights/$max_genomes/$fold + mkdir -p $output_dir + srun -n 18 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types simple ugrnn gru mgu lstm delta \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --max_recurrent_depth 10 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 10 \ + --number_islands 10 \ + --num_mutations 1 \ + --$synchronous \ + --$scramble_weights +} + +run_group() { + for fold in $(seq 0 19); do + run_examm + done +} + +let base_genomes=100000 +let max_genomes=$base_genomes/$bp_epoch +run_group diff --git a/scripts/dnas/examm_synchronous.zsh b/scripts/dnas/examm_synchronous.zsh new file mode 100644 index 
00000000..1d970272 --- /dev/null +++ b/scripts/dnas/examm_synchronous.zsh @@ -0,0 +1,55 @@ +#!/bin/zsh + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=36 +#SBATCH --exclude theocho +#SBATCH --time=23:00:00 +#SBATCH -A examm +#SBATCH --partition=tier3 +#SBATCH -J examm_dnas_experimental +#SBATCH -o /home/jak5763/exact/results/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/results/slurm_out/%x.%j.err +#SBATCH --mem=0 + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUT_PARAMETERS='E1_CHT1' + +offset=1 + +run_examm() { + output_dir=results/synchronous/$max_genomes/$fold + mkdir -p $output_dir + srun -n 36 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --possible_node_types simple ugrnn gru mgu lstm delta \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --num_hidden_layers $SIZE \ + --hidden_layer_size $SIZE \ + --max_recurrent_depth 10 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level WARNING \ + --file_message_level WARNING \ + --crystalize_iters $crystalize_iters \ + --max_genomes $max_genomes \ + --island_size 10 \ + --number_islands 10 \ + --num_mutations 1 \ + --synchronous +} + +run_group() { + for fold in $(seq 0 9); do + run_examm + done +} + +let base_genomes=100000 +let max_genomes=$base_genomes/$bp_epoch +run_group diff --git a/scripts/dnas/experiment.zsh b/scripts/dnas/experiment.zsh new file mode 100755 index 00000000..32a1db55 --- /dev/null +++ b/scripts/dnas/experiment.zsh @@ -0,0 +1,34 @@ +#!/bin/zsh +#SBATCH -n 1 +#SBATCH -A examm +#SBATCH 
--partition=tier3 +#SBATCH -o /home/jak5763/exact/aistats/slurm_out/%x.%j.out +#SBATCH -e /home/jak5763/exact/aistats/slurm_out/%x.%j.err +#SBATCH --mem=10G + +spack load gcc +spack load openmpi +spack load /5aoa7oi +spack load /dd7nzzh + +for i in $(seq 0 19); do + export i=$i + export output_dir=/home/jak5763/exact/aistats/$control/maxt$maxt/crystal$crystal/bp$bp/$i + + if [ "$control" = "control" ]; then + node_types="simple UGRNN MGU GRU delta LSTM" + else + node_types="DNAS" + fi + + echo $node_types $control + + export node_types=$node_types + + # ./run_examm.zsh + + best_genome_file=( $output_dir/rnn_genome_*.bin([-1]) ) + export BP_ITERS=1 + export GENOME=$best_genome_file + ./post_training.zsh +done diff --git a/scripts/dnas/gp_control.zsh b/scripts/dnas/gp_control.zsh new file mode 100644 index 00000000..049e9750 --- /dev/null +++ b/scripts/dnas/gp_control.zsh @@ -0,0 +1,59 @@ +#!/bin/zsh + +offset=1 +MAX_GENOMES=10 +N_ISLANDS=4 +ISLAND_SIZE=32 + +run_examm() { + output_dir=test_results/v0/$bp_epoch/$fold + mkdir -p $output_dir + mpirun -np 32 Release/mpi/examm_mpi \ + --training_filenames ${=training_filenames} \ + --test_filenames ${=test_filenames} \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names $output_params \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level INFO \ + --max_genomes $MAX_GENOMES \ + --island_size $ISLAND_SIZE \ + --number_islands $N_ISLANDS + + touch $output_dir/completed +} + +run_group() { + for output_params in $OUTPUTS; do + for bp_epoch in 2 4 8 16 32; do + for fold in 0 1 2 3 4 5 6 7 8 9; do + run_examm + done + done + done +} + +INPUT_PARAMETERS="Conditioner_Inlet_Temp Conditioner_Outlet_Temp Coal_Feeder_Rate Primary_Air_Flow Primary_Air_Split System_Secondary_Air_Flow_Total Secondary_Air_Flow 
Secondary_Air_Split Tertiary_Air_Split Total_Comb_Air_Flow Supp_Fuel_Flow Main_Flm_Int" +training_filenames=(datasets/2018_coal/burner_[0-9].csv) +test_filenames=(datasets/2018_coal/burner_1[0-1].csv) +OUTPUTS=("Main_Flm_Int" "Supp_Fuel_Flow") +run_group + + +INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +OUTPUTS=("E1_CHT1" "Pitch") +training_filenames=(datasets/2019_ngafid_transfer/c172_file_[1-9].csv) +test_filenames=(datasets/2019_ngafid_transfer/c172_file_1[0-2].csv) +run_group + +INPUT_PARAMETERS="Ba_avg Rt_avg DCs_avg Cm_avg P_avg S_avg Cosphi_avg Db1t_avg Db2t_avg Dst_avg Gb1t_avg Gb2t_avg Git_avg Gost_avg Ya_avg Yt_avg Ws_avg Wa_avg Ot_avg Nf_avg Nu_avg Rbt_avg" +OUTPUTS=("Cm_avg" "P_avg") +training_filenames=(datasets/2020_wind_engine/turbine_R80711_2017-2020_[1-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_1[0-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_2[0-4].csv) +test_filenames=(datasets/2020_wind_engine/turbine_R80711_2017-2020_2[5-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_3[0-1].csv) +run_group diff --git a/scripts/dnas/lib.zsh b/scripts/dnas/lib.zsh new file mode 100644 index 00000000..49ebc581 --- /dev/null +++ b/scripts/dnas/lib.zsh @@ -0,0 +1,65 @@ +#!/bin/zsh + +offset=1 +MAX_GENOMES=10 +N_ISLANDS=4 +ISLAND_SIZE=32 + +run_examm() { + output_dir=$output_dir_prefix/bp_$bp_epoch/output_$output_params/$fold + mkdir -p $output_dir + echo srun -n $np Release/mpi/examm_mpi \ + --training_filenames ${=training_filenames} \ + --test_filenames ${=test_filenames} \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names $output_params \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + 
--learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level INFO \ + --max_genomes $MAX_GENOMES \ + --island_size $ISLAND_SIZE \ + --number_islands $N_ISLANDS + + touch $output_dir/completed +} + +run_group() { + for output_params in $OUTPUTS; do + for bp_epoch in $bp_epoch_set; do + for fold in $(seq 1 $nfolds); do + run_examm + done + done + done +} + +coal() { + INPUT_PARAMETERS="Conditioner_Inlet_Temp Conditioner_Outlet_Temp Coal_Feeder_Rate Primary_Air_Flow Primary_Air_Split System_Secondary_Air_Flow_Total Secondary_Air_Flow Secondary_Air_Split Tertiary_Air_Split Total_Comb_Air_Flow Supp_Fuel_Flow Main_Flm_Int" + training_filenames=(datasets/2018_coal/burner_[0-9].csv) + test_filenames=(datasets/2018_coal/burner_1[0-1].csv) + OUTPUTS=("Main_Flm_Int" "Supp_Fuel_Flow") + run_group +} + +aviation() { + INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' + OUTPUTS=("E1_CHT1" "Pitch") + training_filenames=(datasets/2019_ngafid_transfer/c172_file_[1-9].csv) + test_filenames=(datasets/2019_ngafid_transfer/c172_file_1[0-2].csv) + run_group +} + +wind() { + INPUT_PARAMETERS="Ba_avg Rt_avg DCs_avg Cm_avg P_avg S_avg Cosphi_avg Db1t_avg Db2t_avg Dst_avg Gb1t_avg Gb2t_avg Git_avg Gost_avg Ya_avg Yt_avg Ws_avg Wa_avg Ot_avg Nf_avg Nu_avg Rbt_avg" + OUTPUTS=("Cm_avg" "P_avg") + training_filenames=(datasets/2020_wind_engine/turbine_R80711_2017-2020_[1-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_1[0-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_2[0-4].csv) + test_filenames=(datasets/2020_wind_engine/turbine_R80711_2017-2020_2[5-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_3[0-1].csv) + run_group +} + diff --git a/scripts/dnas/mk_jobs.zsh b/scripts/dnas/mk_jobs.zsh new file mode 100644 index 00000000..b996883e --- /dev/null +++ 
b/scripts/dnas/mk_jobs.zsh @@ -0,0 +1,8 @@ +bp=(1 2 3 4 5 10 15 20 30 40 50 100 150 200) +for bp_epoch in $bp; do + for synchronous in "async" "synchronous"; do + for scramble_weights in "epigenetic_weights" "no_epigenetic_weights"; do + bp_epoch=$bp_epoch synchronous="$synchronous" scramble_weights="$scramble_weights" sbatch examm_bias_exp.zsh + done + done +done diff --git a/scripts/dnas/populate_queue.zsh b/scripts/dnas/populate_queue.zsh new file mode 100755 index 00000000..43a09dbb --- /dev/null +++ b/scripts/dnas/populate_queue.zsh @@ -0,0 +1,29 @@ +#!/bin/zsh +export INPUT_PARAMETERS='AltAGL AltB AltGPS AltMSL BaroA E1_CHT1 E1_CHT2 E1_CHT3 E1_CHT4 E1_EGT1 E1_EGT2 E1_EGT3 E1_EGT4 E1_FFlow E1_OilP E1_OilT E1_RPM FQtyL FQtyR GndSpd IAS LatAc NormAc OAT Pitch Roll TAS VSpd VSpdG WndDr WndSpd' +export OUTPUT_PARAMETERS='E1_EGT1' + +export offset=1 +export k=1 + +push_job() { + export maxt=$maxt + export crystal=$crystal + export bp=$bp + export control=$control + sbatch -J $control.maxt$maxt.cr$crystal.bp$bp ./experiment.zsh + +} + +export control="exp" +for maxt in 1.66 1.33 1.0; do + for crystal in 64 128 256; do + for bp in 4 8 16; do + push_job + done + done +done + +export control="control" +for bp in 4 8 16; do + push_job +done diff --git a/scripts/dnas/post_training.zsh b/scripts/dnas/post_training.zsh new file mode 100755 index 00000000..38c2d39d --- /dev/null +++ b/scripts/dnas/post_training.zsh @@ -0,0 +1,28 @@ +#!/usr/bin/zsh +offset=1 + +post_training() { + + echo "genome = $GENOME" + Release/rnn_examples/train_rnn \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $BP_ITERS \ + --stochastic \ + --normalize min_max \ + --genome_file $GENOME \ + --output_directory $output_dir \ + --log_filename post_training.csv \ 
+ --learning_rate 0.01 \ + --weight_update adagrad \ + --train_sequence_length 100 \ + --validation_sequence_length 100 \ + --crystalize_iters $crystal \ + --dnas_k $k + +} + +post_training diff --git a/scripts/dnas/post_training_dnas.zsh b/scripts/dnas/post_training_dnas.zsh new file mode 100755 index 00000000..1c226178 --- /dev/null +++ b/scripts/dnas/post_training_dnas.zsh @@ -0,0 +1,29 @@ +#!/bin/zsh +offset=1 + +post_training() { + + echo "genome = $GENOME" + Release/rnn_examples/train_rnn \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset 1 \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=OUTPUT_PARAMETERS} \ + --bp_iterations $BP_ITERS \ + --stochastic \ + --normalize min_max \ + --genome_file $GENOME \ + --output_directory $OUTPUT_DIRECTORY \ + --log_filename post_training.csv \ + --learning_rate 0.01 \ + --weight_update adagrad \ + --train_sequence_length 1000 \ + --validation_sequence_length 100 \ + --crystalize_iters $CRYSTALIZE_ITERS \ + --dnas_k $k + + tail -1 $OUTPUT_DIRECTORY/post_training.csv +} + +post_training diff --git a/scripts/dnas/posttrain.zsh b/scripts/dnas/posttrain.zsh new file mode 100644 index 00000000..cc54a2eb --- /dev/null +++ b/scripts/dnas/posttrain.zsh @@ -0,0 +1,3 @@ +#!/bin/zsh + + diff --git a/scripts/dnas/run_examm.zsh b/scripts/dnas/run_examm.zsh new file mode 100644 index 00000000..77d2893f --- /dev/null +++ b/scripts/dnas/run_examm.zsh @@ -0,0 +1,25 @@ +#!/bin/zsh + +output_dir=results/v0/$bp_epoch/$fold +mkdir -p $output_dir + +mpirun -np 32 Release/mpi/examm_mpi \ + --training_filenames datasets/2019_ngafid_transfer/c172_file_[1-9].csv \ + --test_filenames datasets/2019_ngafid_transfer/c172_file_1[0-2].csv \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=output_params} \ + --bp_iterations $bp_epoch \ + --normalize min_max 
\ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level INFO \ + --max_genomes 4000 \ + --island_size 32 \ + --number_islands 4 + +touch $output_dir/completed + diff --git a/scripts/dnas/run_experiments.zsh b/scripts/dnas/run_experiments.zsh new file mode 100755 index 00000000..7dd8e956 --- /dev/null +++ b/scripts/dnas/run_experiments.zsh @@ -0,0 +1,4 @@ +#!/bin/zsh + +initial_integration_experiments/control.zsh +initial_integration_experiments/dnas.zsh diff --git a/scripts/dnas/wind.zsh b/scripts/dnas/wind.zsh new file mode 100644 index 00000000..7e68f482 --- /dev/null +++ b/scripts/dnas/wind.zsh @@ -0,0 +1,39 @@ +#!/bin/zsh + +INPUT_PARAMETERS="Ba_avg Rt_avg DCs_avg Cm_avg P_avg S_avg Cosphi_avg Db1t_avg Db2t_avg Dst_avg Gb1t_avg Gb2t_avg Git_avg Gost_avg Ya_avg Yt_avg Ws_avg Wa_avg Ot_avg Nf_avg Nu_avg Rbt_avg" + + +offset=1 + +run_examm() { + output_dir=results/v0/$bp_epoch/$fold + mkdir -p $output_dir + mpirun -np 32 Release/mpi/examm_mpi \ + --training_filenames datasets/2020_wind_engine/turbine_R80711_2017-2020_[1-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_1[0-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_2[0-4].csv \ + --test_filenames datasets/2020_wind_engine/turbine_R80711_2017-2020_2[5-9].csv datasets/2020_wind_engine/turbine_R80711_2017-2020_3[0-1].csv \ + --time_offset $offset \ + --input_parameter_names ${=INPUT_PARAMETERS} \ + --output_parameter_names ${=output_params} \ + --bp_iterations $bp_epoch \ + --normalize min_max \ + --max_recurrent_depth 1 \ + --output_directory $output_dir \ + --log_filename fitness.csv \ + --learning_rate 0.01 \ + --std_message_level INFO \ + --file_message_level INFO \ + --max_genomes 10000 \ + --island_size 32 \ + --number_islands 4 + + touch $output_dir/completed +} + + +for output_params in "Cm_avg" "P_avg"; do + for bp_epoch in 2 4 8 16 32; do + for fold in 0 1 2 3 4 5 6 7 8 
9; do + run_examm + done + done +done diff --git a/time_series/time_series.cxx b/time_series/time_series.cxx index b91c13a2..035bfc3d 100644 --- a/time_series/time_series.cxx +++ b/time_series/time_series.cxx @@ -238,7 +238,7 @@ void TimeSeriesSet::add_time_series(string name) { } } -TimeSeriesSet::TimeSeriesSet(string _filename, const vector& _fields) { +TimeSeriesSet::TimeSeriesSet(string _filename, const vector& _fields, char delim) { filename = _filename; fields = _fields; @@ -251,7 +251,7 @@ TimeSeriesSet::TimeSeriesSet(string _filename, const vector& _fields) { } vector file_fields; - string_split(line, ',', file_fields); + string_split(line, delim, file_fields); for (int32_t i = 0; i < (int32_t) file_fields.size(); i++) { // get rid of carriage returns (sometimes windows messes this up) file_fields[i].erase(std::remove(file_fields[i].begin(), file_fields[i].end(), '\r'), file_fields[i].end()); @@ -308,7 +308,7 @@ TimeSeriesSet::TimeSeriesSet(string _filename, const vector& _fields) { } vector parts; - string_split(line, ',', parts); + string_split(line, delim, parts); if (parts.size() != file_fields.size()) { Log::fatal( @@ -472,7 +472,7 @@ void TimeSeriesSet::export_time_series( if (time_offset == 0) { for (int32_t i = 0; i < (int32_t) requested_fields.size(); i++) { for (int32_t j = 0; j < number_rows; j++) { - data[i][j] = time_series[requested_fields[i]]->get_value(j); + data[i][j] = time_series.at(requested_fields[i])->get_value(j); } } @@ -480,7 +480,7 @@ void TimeSeriesSet::export_time_series( // output data, ignore the first N values for (int32_t i = 0; i < (int32_t) requested_fields.size(); i++) { for (int32_t j = time_offset; j < number_rows; j++) { - data[i][j - time_offset] = time_series[requested_fields[i]]->get_value(j); + data[i][j - time_offset] = time_series.at(requested_fields[i])->get_value(j); } } @@ -492,13 +492,13 @@ void TimeSeriesSet::export_time_series( Log::debug("doing shift for field: '%s'\n", requested_fields[i].c_str()); // 
shift the shifted fields to the same as the output, not the input for (int32_t j = -time_offset; j < number_rows; j++) { - data[i][j + time_offset] = time_series[requested_fields[i]]->get_value(j); + data[i][j + time_offset] = time_series.at(requested_fields[i])->get_value(j); // Log::info("\tdata[%d][%d]: %lf\n", i, j + time_offset, data[i][j + time_offset]); } } else { Log::debug("not doing shift for field: '%s'\n", requested_fields[i].c_str()); for (int32_t j = 0; j < number_rows + time_offset; j++) { - data[i][j] = time_series[requested_fields[i]]->get_value(j); + data[i][j] = time_series.at(requested_fields[i])->get_value(j); } } } @@ -734,7 +734,7 @@ void TimeSeriesSets::load_time_series() { for (int32_t i = 0; i < (int32_t) filenames.size(); i++) { Log::info("\t%s\n", filenames[i].c_str()); - TimeSeriesSet* ts = new TimeSeriesSet(filenames[i], all_parameter_names); + TimeSeriesSet* ts = new TimeSeriesSet(filenames[i], all_parameter_names, this->csv_delimiter); time_series.push_back(ts); rows += ts->get_number_rows(); @@ -831,6 +831,25 @@ TimeSeriesSets* TimeSeriesSets::generate_from_arguments(const vector& ar exit(1); } + if (argument_exists(arguments, "--csv_delimiter")) { + vector delim_vec; + get_argument_vector(arguments, "--csv_delimiter", false, delim_vec); + + // Check the element count before front(): calling front() on an empty vector is undefined behavior. + + if (delim_vec.size() != 1 || delim_vec.front().size() != 1) { + // Exit unless exactly one single-character delimiter was supplied (also covers a flag given with no value) + Log::fatal( + "The delimeter for CSV files should be a single character." 
+ ); + + help_message(); + exit(1); + } + + tss->csv_delimiter = delim_vec.front().at(0); + } + tss->load_time_series(); tss->normalize_type = ""; diff --git a/time_series/time_series.hxx b/time_series/time_series.hxx index fada6f51..2ccc75c0 100644 --- a/time_series/time_series.hxx +++ b/time_series/time_series.hxx @@ -72,7 +72,7 @@ class TimeSeriesSet { TimeSeriesSet(); public: - TimeSeriesSet(string _filename, const vector& _fields); + TimeSeriesSet(string _filename, const vector& _fields, char delim); ~TimeSeriesSet(); void add_time_series(string name); @@ -115,6 +115,8 @@ class TimeSeriesSet { class TimeSeriesSets { private: + char csv_delimiter = ','; + string normalize_type; vector filenames;