diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..eaf91e2 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/Graph_Edit_Distance.iml b/.idea/Graph_Edit_Distance.iml new file mode 100644 index 0000000..d9e6024 --- /dev/null +++ b/.idea/Graph_Edit_Distance.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..e1c2707 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,27 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..7b5bf37 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d06d743 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..9661ac7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/main.cpp b/main.cpp index 990f2dc..693ec34 100644 --- a/main.cpp +++ b/main.cpp @@ -4,6 +4,10 @@ #include "Timer.h" #include "popl.hpp" +#include +#include +#include + using namespace std; using namespace popl; @@ -13,7 +17,7 @@ void print_usage() { } ui label2int(const char *str, map &M) { - if(M.find(string(str)) == M.end()) M[string(str)] = M.size(); + if (M.find(string(str)) == M.end()) M[string(str)] = M.size(); return M[string(str)]; } @@ -22,7 +26,7 @@ ui load_db(const char *file_name, vector &graphs, map &vM, ma const ui MAX_LINE = 1024; char line[MAX_LINE]; - if(fgets(line, MAX_LINE, fin) == NULL) { + if (fgets(line, MAX_LINE, fin) == nullptr) { fclose(fin); return 0; } @@ -34,34 +38,35 @@ ui load_db(const char *file_name, vector &graphs, map &vM, ma string id(buf); line[0] = 'x'; - vector > vertices; - vector,ui> > edges; - while(fgets(line, MAX_LINE, fin) != NULL&&line[0] != 't') { - if(line[0] == 'v') { + vector> vertices; + vector,ui>> edges; + while(fgets(line, MAX_LINE, fin) != nullptr && line[0] != 't') { + if (line[0] == 'v') { int a; sscanf(line+2, "%d%s", &a, buf); - //buf[0] = '1'; vertices.pb(mp(a, label2int(buf, vM))); } - else if(line[0] == 'e') { + else if (line[0] == 'e') { int a, b; sscanf(line+2, "%d%d%s", &a, &b, buf); edges.pb(mp(mp(a,b), label2int(buf, eM))); edges.pb(mp(mp(b,a), label2int(buf, eM))); } - else printf("!!! Unrecongnized first letter in a line when loading DB!\n"); + else printf("!!! Unrecognized first letter in a line when loading DB !!!\n"); line[0] = 'x'; } + int v_size = static_cast(vertices.size()); + int e_size = static_cast(edges.size()); sort(vertices.begin(), vertices.end()); - for(ui i = 0;i < vertices.size();i ++) assert(vertices[i].first == i); - if(vertices.size() > max_n) max_n = vertices.size(); + for (ui i = 0; i < v_size; i++) assert(vertices[i].first == i); + if (v_size > max_n) max_n = v_size; sort(edges.begin(), edges.end()); - for(ui i = 0;i < edges.size();i ++) { - assert(edges[i].first.first >= 0&&edges[i].first.first < vertices.size()); - assert(edges[i].first.second >= 0&&edges[i].first.second < vertices.size()); - if(i > 0) assert(edges[i].first != edges[i-1].first); + for (ui i = 0; i < e_size; i++) { + assert(edges[i].first.first >= 0 && edges[i].first.first < v_size); + assert(edges[i].first.second >= 0 && edges[i].first.second < v_size); + if (i > 0) assert(edges[i].first != edges[i-1].first); assert(edges[i].second < eM.size()); } @@ -74,21 +79,19 @@ ui load_db(const char *file_name, vector &graphs, map &vM, ma void generate_queries(const vector &db, vector &queries, ui q_n) { assert(!db.empty()); - srand(time(NULL)); - for(ui i = 0;i < q_n;i ++) queries.pb(rand()%db.size()); + srand(time(nullptr)); + for (ui i = 0; i < q_n; i ++) queries.pb(rand() % db.size()); } void write_queries(const char *file_name, const vector &db, const vector &queries, const map &vM, const map &eM, bool bss) { vector vlabels(vM.size()); vector elabels(eM.size()); - for(pair p: vM) vlabels[p.second] = p.first; - for(pair p: eM) elabels[p.second] = p.first; + for (pair p: vM) vlabels[p.second] = p.first; + for (pair p: eM) elabels[p.second] = p.first; FILE *fout = Utility::open_file(file_name, "w"); - - for(ui i = 0;i < queries.size();i ++) db[queries[i]]->write_graph(fout, vlabels, elabels, bss); - + for (ui i = 0; i < queries.size(); i++) db[queries[i]]->write_graph(fout, vlabels, elabels, bss); fclose(fout); } @@ -102,7 +105,7 @@ int main(int argc, char *argv[]) { print_usage(); - string mode, paradigm, lower_bound; + string mode, paradigm, lower_bound, save_filepath; int threshold = -1; bool print_ged = false; @@ -110,16 +113,26 @@ int main(int argc, char *argv[]) { auto help_option = op.add("h", "help", "\'produce help message\'"); auto database_option = op.add>("d", "database", "\'database file name\'"); auto query_option = op.add>("q", "query", "\'query file name\'"); - auto mode_option = op.add>("m", "mode", "\'running mode\' (search | pair)", "search", &mode); + auto save_option = op.add>("s", "save", "\'pairwise distance filename", "datasets/pairwise_ged.csv", &save_filepath); + auto mode_option = op.add>("m", "mode", "\'running mode\' (search | pair | pairwise)", "search", &mode); auto paradigm_option = op.add>("p", "paradigm", "\'search paradigm\' (astar | dfs)", "astar", ¶digm); auto lower_bound_option = op.add>("l", "lower_bound", "\'lower bound method\' (LSa | BMao | BMa)", "BMao", &lower_bound); auto threshold_option = op.add>("t", "threshold", "\'threshold for GED verification; if not provided, then GED computation", -1, &threshold); op.add("g", "ged", "\'print_ged\'", &print_ged); - op.parse(argc, argv); - if(help_option->is_set()||argc == 1) cout << op << endl; - if(!database_option->is_set()||!query_option->is_set()) { + if (help_option->is_set()||argc == 1) cout << op << endl; + if (mode == "pairwise") { + if (!database_option->is_set()) { + printf("!!! Database file name not provided! Exit !!!\n"); + return 0; + } else { + query_option = database_option; + } + if (query_option->is_set()) { + printf("Warning: query file ignored. Pairwise distances are calculated for the compounds from database file\n"); + } + } else if (!database_option->is_set() || !query_option->is_set()) { printf("!!! Database file name or query file name is not provided! Exit !!!\n"); return 0; } @@ -130,9 +143,8 @@ int main(int argc, char *argv[]) { map vM, eM; ui max_db_n = load_db(database.c_str(), db, vM, eM); - printf("*** %s %s %s %d: %s %s", mode.c_str(), paradigm.c_str(), lower_bound.c_str(), threshold, database.c_str(), query.c_str()); #ifdef _EXPAND_ALL_ - //printf(" Expand_all"); + // printf(" Expand_all"); #else printf(" Expand_one"); #endif @@ -148,7 +160,7 @@ int main(int argc, char *argv[]) { ui max_query_n = load_db(query.c_str(), queries, vM, eM); ui verify_upper_bound; - if(threshold < 0) verify_upper_bound = INF; + if (threshold < 0) verify_upper_bound = INF; else verify_upper_bound = (ui)threshold; long long search_space = 0; @@ -161,170 +173,195 @@ int main(int argc, char *argv[]) { memset(vlabel_cnt, 0, sizeof(int)*vM.size()); memset(elabel_cnt, 0, sizeof(int)*eM.size()); - if(max_query_n > max_db_n) max_db_n = max_query_n; + if (max_query_n > max_db_n) max_db_n = max_query_n; int *degree_q = new int[max_db_n]; int *degree_g = new int[max_db_n]; int *tmp = new int[max_db_n]; - if(strcmp(mode.c_str(), "pair") != 0&&strcmp(mode.c_str(), "search") != 0) { - printf("!!! Wrong mode (pair | search) selection!\n"); + if (mode != "pair" && mode != "search" && mode != "pairwise") { + printf("!!! Wrong mode (pair | search) selection !!!\n"); return 0; } - if(strcmp(paradigm.c_str(), "astar") != 0&&strcmp(paradigm.c_str(), "dfs") != 0) { - printf("!!! Wrong algorithm (astar | dfs) selection!\n"); + if (paradigm != "astar" && paradigm != "dfs") { + printf("!!! Wrong algorithm (astar | dfs) selection !!!\n"); return 0; } Timer t; - if(strcmp(mode.c_str(), "pair") == 0) { + if (mode == "pair") { long long time1 = 0, cnt1 = 0, ss1 = 0; long long time2 = 0, cnt2 = 0, ss2 = 0; - if(queries.size() != db.size()) { + int db_size = static_cast(db.size()); + int q_size = static_cast(queries.size()); + if (q_size != db_size) { printf("Query size != db size in the pair mode\n"); exit(0); } - if(print_ged) printf("*** GEDs ***\n"); - ui min_ged = 1000000000, max_ged = 0; - for(ui i = 0;i < queries.size();i ++) { - ui current = i*100/queries.size(); - if(current != pre) { + if (print_ged) printf("*** GEDs ***\n"); + ui min_ged = INT_MAX, max_ged = 0; + for (ui i = 0; i < q_size; i++) { + ui current = i*100/q_size; + if (current != pre) { fprintf(stderr, "\r[%d%% finished]", current); fflush(stderr); - //cout<<"\r["<ged_lower_bound_filter(db[i], verify_upper_bound, vlabel_cnt, elabel_cnt, degree_q, degree_g, tmp); - if(lb > verify_upper_bound) continue; + if (lb > verify_upper_bound) continue; - ++ candidates_cnt; + ++candidates_cnt; Timer t1; Application *app = new Application(verify_upper_bound, lower_bound.c_str()); app->init(db[i], queries[i]); int res = INF; - if(strcmp(paradigm.c_str(), "astar") == 0) res = app->AStar(); - else res = app->DFS(NULL); + if (paradigm == "astar") res = app->AStar(); + else res = app->DFS(nullptr); #ifndef NDEBUG assert(res == app->compute_ged_of_BX()); #endif search_space += app->get_search_space(); - if(res <= verify_upper_bound) ++ results_cnt; + if (res <= verify_upper_bound)++results_cnt; else res = -1; - if(print_ged) { + if (print_ged) { printf("%d\n", res); - if(res > max_ged) max_ged = res; - if(res < min_ged) min_ged = res; + if (res > max_ged) max_ged = res; + if (res < min_ged) min_ged = res; } - - if(res == -1) { + if (res == -1) { time2 += t1.elapsed(); ss2 += app->get_search_space(); - ++ cnt2; - } - else { + ++cnt2; + } else { time1 += t1.elapsed(); ss1 += app->get_search_space(); - ++ cnt1; + ++cnt1; } - - //printf("%u %u\n", db[i]->n, queries[i]->n); - //if(db[i]->id.compare(queries[i]->id) < 0) printf("\t(pair_%u %s %s) GED: %d, Time: %s, Search space: %lld\n", i, db[i]->id.c_str(), queries[i]->id.c_str(), res, Utility::integer_to_string(t1.elapsed()).c_str(), app->get_search_space()); - //else printf("\t(pair_%u %s %s) GED: %d, Time: %s, Search space: %lld\n", i, queries[i]->id.c_str(), db[i]->id.c_str(), res, Utility::integer_to_string(t1.elapsed()).c_str(), app->get_search_space()); - //fflush(stdout); - delete app; } fprintf(stderr, "\n"); - if(print_ged) { + if (print_ged) { printf("*** GEDs ***\n"); printf("min_ged: %u, max_ged: %u\n", min_ged, max_ged); } - //printf("%d %d\n", cnt1, cnt2); - if(cnt1 + cnt2 != 0) printf("total average time: %s, total average_ss: %lld\n", Utility::integer_to_string((time1+time2)/(cnt1+cnt2)).c_str(), (ss1+ss2)/(cnt1+cnt2)); - if(verify_upper_bound < INF) { + if (cnt1 + cnt2 != 0) printf("total average time: %s, total average_ss: %lld\n", Utility::integer_to_string((time1+time2)/(cnt1+cnt2)).c_str(), (ss1+ss2)/(cnt1+cnt2)); + if (verify_upper_bound < INF) { printf("Dissimilar (%lld pairs) average time: ", cnt2); - if(cnt2 == 0) printf("0, "); + if (cnt2 == 0) printf("0, "); else printf("%s, ", Utility::integer_to_string(time2/cnt2).c_str()); printf("Dissimilar average space: "); - if(cnt2 == 0) printf("0\n"); + if (cnt2 == 0) printf("0\n"); else printf("%lld\n", ss2/cnt2); printf("Similar (%lld pairs) average time: ", cnt1); - if(cnt1 == 0) printf("0, "); + if (cnt1 == 0) printf("0, "); else printf("%s, ", Utility::integer_to_string(time1/cnt1).c_str()); printf("Similar average space: "); - if(cnt1 == 0) printf("0\n"); + if (cnt1 == 0) printf("0\n"); else printf("%lld\n", ss1/cnt1); } - } - else { + printf("Total time: %s (microseconds), total search space: %lld\n #candidates: %lld, #matches: %lld\n", Utility::integer_to_string(t.elapsed()).c_str(), search_space, candidates_cnt, results_cnt); + } else if (mode == "pairwise") { long long total_res = 0; - if(print_ged) printf("*** GEDs ***\n"); - ui min_ged = 1000000000, max_ged = 0; - for(ui i = 0;i < queries.size();i ++) { - for(ui j = 0; j < db.size();j ++) { - ui current = (i*(long long)(db.size())+j+1)*100/(queries.size()*(long long)(db.size())); - if(current != pre) { + int db_size = static_cast(db.size()); + vector> ged_matrix(db_size, vector(db_size, 0)); + ui min_ged = INT_MAX, max_ged = 0; + + for (ui i = 0; i < db_size; i++) { + for (ui j = i+1; j < db_size; j++) { + Application *app = new Application(verify_upper_bound, lower_bound.c_str()); + app->init(db[i], db[j]); + int res = INF; + if (paradigm == "astar") res = app->AStar(); + else res = app->DFS(nullptr); + ged_matrix[i][j] = ged_matrix[j][i] = res; + if (res > max_ged) max_ged = res; + if (res < min_ged) min_ged = res; + total_res += res; + + delete app; + } + } + std::ofstream out(save_filepath); + for (auto& row : ged_matrix) { + for (int k = 0; k < db_size-1; k++) { + out << row[k] <<','; + } + out << row[db_size-1] << '\n'; + } + printf("*** GEDs ***\n"); + printf("Min ged: %u, max ged: %u avg ged: %.3lf\n", min_ged, max_ged, double(total_res) / (db_size*(db_size-1)/2)); + printf("Total time: %s (microseconds)\n", Utility::integer_to_string(t.elapsed()).c_str()); + printf("Results saved in %s\n", save_filepath.c_str()); + } else { + long long total_res = 0; + int db_size = static_cast(db.size()); + int q_size = static_cast(queries.size()); + + if (print_ged) printf("*** GEDs ***\n"); + ui min_ged = INT_MAX, max_ged = 0; + for (ui i = 0; i < q_size; i++) { + for (ui j = 0; j < db_size; j++) { + ui current = (i*(long long)(db_size)+j+1)*100/(q_size*(long long)(db_size)); + if (current != pre) { fprintf(stderr, "\r[%d%% finished]", current); fflush(stderr); - //cout<<"\r["<ged_lower_bound_filter(db[j], verify_upper_bound, vlabel_cnt, elabel_cnt, degree_q, degree_g, tmp); - if(lb > verify_upper_bound) continue; + if (lb > verify_upper_bound) continue; - ++ candidates_cnt; + ++candidates_cnt; Application *app = new Application(verify_upper_bound, lower_bound.c_str()); - //app->init(db_v[i], db_e[i], query_v[i], query_e[i]); + // app->init(db_v[i], db_e[i], query_v[i], query_e[i]); app->init(db[j], queries[i]); int res = INF; - if(strcmp(paradigm.c_str(), "astar") == 0) res = app->AStar(); - else res = app->DFS(NULL); + if (paradigm == "astar") res = app->AStar(); + else res = app->DFS(nullptr); #ifndef NDEBUG assert(res == app->compute_ged_of_BX()); #endif - if(print_ged) { - if(j) printf(" "); + if (print_ged) { + if (j) printf(" "); printf("%u", res); - if(res > max_ged) max_ged = res; - if(res < min_ged) min_ged = res; + if (res > max_ged) max_ged = res; + if (res < min_ged) min_ged = res; } total_res += res; - //printf("pair %lu (%s, %s): %d\n", i*db.size()+j, queries[i]->id.c_str(), db[j]->id.c_str(), res); + // printf("pair %lu (%s, %s): %d\n", i*db.size()+j, queries[i]->id.c_str(), db[j]->id.c_str(), res); search_space += app->get_search_space(); - if(res <= verify_upper_bound) ++ results_cnt; + if (res <= verify_upper_bound) ++results_cnt; delete app; } - if(print_ged) printf("\n"); + if (print_ged) printf("\n"); } fprintf(stderr, "\n"); - if(print_ged) { + if (print_ged) { printf("*** GEDs ***\n"); - printf("min_ged: %u, max_ged: %u\n", min_ged, max_ged); + printf("Min ged: %u, max ged: %u avg ged: %.3lf\n", min_ged, max_ged, double(total_res) / (q_size*db_size)); } - //printf("Average GED: %.3lf\n", double(total_res)/(queries.size()*db.size())); + printf("Total time: %s (microseconds), total search space: %lld\n #candidates: %lld, #matches: %lld\n", Utility::integer_to_string(t.elapsed()).c_str(), search_space, candidates_cnt, results_cnt); } - printf("Total time: %s (microseconds), total search space: %lld\n #candidates: %lld, #matches: %lld\n", Utility::integer_to_string(t.elapsed()).c_str(), search_space, candidates_cnt, results_cnt); - delete[] vlabel_cnt; vlabel_cnt = NULL; - delete[] elabel_cnt; elabel_cnt = NULL; - delete[] degree_q; degree_q = NULL; - delete[] degree_g; degree_g = NULL; - delete[] tmp; tmp = NULL; + delete[] vlabel_cnt; vlabel_cnt = nullptr; + delete[] elabel_cnt; elabel_cnt = nullptr; + delete[] degree_q; degree_q = nullptr; + delete[] degree_g; degree_g = nullptr; + delete[] tmp; tmp = nullptr; - for(ui i = 0;i < db.size();i ++) { + for (ui i = 0; i < db.size(); i++) { delete db[i]; db[i] = nullptr; } - for(ui i = 0;i < queries.size();i ++) { + for (ui i = 0; i < queries.size(); i++) { delete queries[i]; queries[i] = nullptr; }