diff --git a/install.py b/install.py index cd1b77bd5..a6822d90c 100755 --- a/install.py +++ b/install.py @@ -191,7 +191,10 @@ def find_dependencies(args): } def setup_vcpkg(args): - if args.clean and not args.skip_vcpkg_setup and os.path.isdir("vcpkg"): + if args.skip_vcpkg_setup: + return + + if args.clean and os.path.isdir("vcpkg"): subprocess.call("rm -rf vcpkg", shell=True) if not os.path.isdir("vcpkg"): commands = "git clone https://github.com/Microsoft/vcpkg.git vcpkg && ./vcpkg/bootstrap-vcpkg.sh -disableMetrics && ./vcpkg/vcpkg install" diff --git a/quepistasis/header/cpu_sa.h b/quepistasis/header/cpu_sa.h index 04806980e..13fc53237 100644 --- a/quepistasis/header/cpu_sa.h +++ b/quepistasis/header/cpu_sa.h @@ -29,6 +29,7 @@ #endif #endif +#include #include double get_flip_energy( diff --git a/src/data_model/SNPStorage.cpp b/src/data_model/SNPStorage.cpp index d2d4e0b3e..f508d3607 100644 --- a/src/data_model/SNPStorage.cpp +++ b/src/data_model/SNPStorage.cpp @@ -41,14 +41,21 @@ namespace epi { void SNPStorage::add_SNP_annotations(const std::vector>& annotations) { for (auto & anno : annotations) { - snp_data[anno.first.value].annotations.push_back(anno.second); + auto & anno_vec = snp_data[anno.first.value].annotations; + if (std::find(anno_vec.begin(), anno_vec.end(),anno.second) == anno_vec.end()) { + anno_vec.push_back(anno.second); + } + auto map_item = annotations_map.find(anno.second); if (map_item == annotations_map.end()) { std::vector v; v.push_back(anno.first.value); annotations_map.insert({anno.second, v }); } else { - map_item->second.push_back(anno.first.value); + if (std::find(map_item->second.begin(), map_item->second.end(), anno.first.value) == map_item->second.end()) { + // only add if not in the list yet + map_item->second.push_back(anno.first.value); + } } } } @@ -67,7 +74,7 @@ namespace epi { } } - std::unordered_map> SNPStorage::get_annotations_map() { + const std::unordered_map>& SNPStorage::get_annotations_map() { return annotations_map; } diff --git a/src/data_model/SNPStorage.hpp b/src/data_model/SNPStorage.hpp index e60fa5e06..752bd5f3b 100644 --- a/src/data_model/SNPStorage.hpp +++ b/src/data_model/SNPStorage.hpp @@ -80,7 +80,7 @@ namespace epi { bool contains_name(const std::string& name); void add_SNP_annotations(const std::vector>& annotations); - std::unordered_map> get_annotations_map(); + const std::unordered_map>& get_annotations_map(); std::vector snp_get_annotations(const SNP_t& snp); void snp_set_variable_attribute(const SNP_t& snp, const std::string& key, const std::string& value); diff --git a/src/jobs/NetworkCsvConnector.cpp b/src/jobs/NetworkCsvConnector.cpp index bc4d57896..04787e3bf 100644 --- a/src/jobs/NetworkCsvConnector.cpp +++ b/src/jobs/NetworkCsvConnector.cpp @@ -92,18 +92,42 @@ namespace epi { parser.parse(path, csv_separator); std::vector> col1_splits (parser.num_rows()), col2_splits(parser.num_rows()); - if (col1_separator != -1) { #pragma omp parallel for default(none) shared(has_header, column1, col1_separator, parser, col1_splits) - for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { - col1_splits[i] = string_split(parser.cell(i, column1), col1_separator); + for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { + const auto &line = parser.cell(i, column1); + if (col1_separator != -1) { + col1_splits[i] = string_split(line, col1_separator); + } else { + col1_splits[i] = {}; + col1_splits[i].push_back(line); } } - if (col2_separator != -1) { #pragma omp parallel for default(none) shared(has_header, column2, col2_separator, parser, col2_splits) - for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { - col2_splits[i] = string_split(parser.cell(i, column2), col2_separator); + for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { + const auto &line = parser.cell(i, column2); + if (col2_separator != -1) { + col2_splits[i] = string_split(line, col2_separator); + } else { + col2_splits[i] = {}; + col2_splits[i].push_back(line); + } + } + + // this loop is only for logging purposes: How many annotations of our SNPs can be found in the network file + { + std::unordered_set network_annotations{}; + const auto &snp_annotations = data->snpStorage->get_annotations_map(); + for (const auto splits: {std::cref(col1_splits), std::cref(col2_splits)}) { + for (const auto &col: splits.get()) { + for (const auto &anno: col) { + if (snp_annotations.contains(anno)) { + network_annotations.insert(anno); + } + } + } } + Logger::logLine("Distinct annotations that overlap between network file and annotated SNPs: " + std::to_string(network_annotations.size())); } size_t num_threads = omp_get_max_threads(); @@ -115,20 +139,8 @@ namespace epi { #pragma omp parallel for default(none) shared(col1, col2, parser, col1_splits, col2_splits, data, nodes, edges) for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { size_t thr = omp_get_thread_num(); - if (col1_separator == -1) { - col1[thr].clear(); - col1[thr].push_back(parser.cell(i, column1)); - } else { - // string_split(snps, parser.cell(i, snp_column), snp_separator); - col1[thr] = col1_splits[i]; - } - if (col2_separator == -1) { - col2[thr].clear(); - col2[thr].push_back(parser.cell(i, column2)); - } else { - // string_split(annotations, parser.cell(i, annotation_column), annotation_separator); - col2[thr] = col2_splits[i]; - } + col1[thr] = col1_splits[i]; + col2[thr] = col2_splits[i]; for (auto & gene_symbol_1 : col1[thr]) { for (auto & gene_symbol_2 : col2[thr]) { diff --git a/src/jobs/NetworkStatsPrinter.cpp b/src/jobs/NetworkStatsPrinter.cpp index a6f6607ab..196419441 100644 --- a/src/jobs/NetworkStatsPrinter.cpp +++ b/src/jobs/NetworkStatsPrinter.cpp @@ -18,7 +18,7 @@ namespace epi { // determine the number of annotations std::unordered_set annotations; - auto anno_map = data->snpStorage->get_annotations_map(); + const auto &anno_map = data->snpStorage->get_annotations_map(); for (auto &snp : data->snpNetwork->get_network_snps()) { auto annos = data->snpStorage->snp_get_annotations(snp); annotations.insert(annos.begin(), annos.end()); diff --git a/src/jobs/SameAnnotationConnector.cpp b/src/jobs/SameAnnotationConnector.cpp index 7e3a49516..121772fd5 100644 --- a/src/jobs/SameAnnotationConnector.cpp +++ b/src/jobs/SameAnnotationConnector.cpp @@ -17,7 +17,7 @@ namespace epi { std::unordered_set nodes; std::vector edges; - auto annotations_map = data->snpStorage->get_annotations_map(); + const auto &annotations_map = data->snpStorage->get_annotations_map(); for (auto & anno : annotations_map) { std::vector nodes_repr; for (auto & node : anno.second) { diff --git a/src/jobs/SaveNetwork.cpp b/src/jobs/SaveNetwork.cpp index 8f2904104..736f2a7ba 100644 --- a/src/jobs/SaveNetwork.cpp +++ b/src/jobs/SaveNetwork.cpp @@ -369,7 +369,7 @@ namespace epi { if (rc) { throw epi::Error("SQLITE-error when preparing has_annotation insert statement"); } - auto all_annotations = data->snpStorage->get_annotations_map(); + const auto &all_annotations = data->snpStorage->get_annotations_map(); size_t anno_id = 0; for (auto &anno : network_node_annotations) { @@ -377,7 +377,9 @@ namespace epi { sqlite3_bind_text(insert_annos_stmt_sqlite, 2, anno.c_str(), anno.size(), SQLITE_STATIC); sqlite3_step(insert_annos_stmt_sqlite); - for (auto & node : all_annotations[anno]) { + const auto all_annotations_anno = all_annotations.find(anno); + + for (auto & node : all_annotations_anno->second) { sqlite3_bind_int(has_anno_stmt_sqlite, 1, node); sqlite3_bind_int(has_anno_stmt_sqlite, 2, anno_id); sqlite3_step(has_anno_stmt_sqlite); diff --git a/src/jobs/SnpCsvAnnotator.cpp b/src/jobs/SnpCsvAnnotator.cpp index 9eb442b2a..59381d638 100644 --- a/src/jobs/SnpCsvAnnotator.cpp +++ b/src/jobs/SnpCsvAnnotator.cpp @@ -92,64 +92,67 @@ namespace epi { const auto filter_map = filter_entries(parser); std::vector> snp_splits (parser.num_rows()), annotation_splits(parser.num_rows()); - if (snp_separator != -1) { #pragma omp parallel for default(none) shared(has_header, snp_column, snp_separator, parser, snp_splits, filter_map) - for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { - if (!filter_map[i]) continue; - snp_splits[i] = string_split(parser.cell(i, snp_column), snp_separator); + for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { + if (!filter_map[i]) continue; + const auto & line = parser.cell(i, snp_column); + if (snp_separator != -1) { + snp_splits[i] = string_split(line, snp_separator); + } else { + snp_splits[i] = {}; + snp_splits[i].push_back(line); } } - if (annotation_separator != -1) { #pragma omp parallel for default(none) shared(has_header, annotation_column, annotation_separator, parser, annotation_splits, filter_map) - for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { - if (!filter_map[i]) continue; - annotation_splits[i] = string_split(parser.cell(i, annotation_column), annotation_separator); + for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { + if (!filter_map[i]) continue; + const auto & line = parser.cell(i, annotation_column); + if (annotation_separator != -1) { + annotation_splits[i] = string_split(line, annotation_separator); + } else { + annotation_splits[i] = {}; + annotation_splits[i].push_back(line); } } - // size_t anno_max = 0, anno_sum = 0, anno_count = 0; std::vector snps, annotations; for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) { if (!filter_map[i]) continue; - if (snp_separator == -1) { - snps.clear(); - snps.push_back(parser.cell(i, snp_column)); - } else { - // string_split(snps, parser.cell(i, snp_column), snp_separator); - snps = snp_splits[i]; - } - if (annotation_separator == -1) { - annotations.clear(); - annotations.push_back(parser.cell(i, annotation_column)); - } else { - // string_split(annotations, parser.cell(i, annotation_column), annotation_separator); - annotations = annotation_splits[i]; - } + snps = snp_splits[i]; + annotations = annotation_splits[i]; for (auto &snp: snps) { if (data->snpStorage->contains_name(snp)) { auto snp_t = data->snpStorage->by_name(snp); for (auto &anno: annotations){ - /* - anno_max = std::max(anno_max, anno.size()); - anno_sum += anno.size(); - anno_count ++; - */ all_annotations.emplace_back(snp_t, anno); } } } } - // std::cout << anno_max << ", " << anno_count << ", " << anno_sum << std::endl; - // apply mappings data->snpStorage->add_SNP_annotations(all_annotations); + const auto & annotations_map = data->snpStorage->get_annotations_map(); + size_t num_annotation_strings = annotations_map.size(); + size_t num_annotations = 0; + std::unordered_set annotated_snps{}; + for (const auto & anno : annotations_map) { + annotated_snps.insert(anno.second.begin(), anno.second.end()); + num_annotations += anno.second.size(); + } + + size_t num_annotated_snps = annotated_snps.size(); + logger.stop(); + Logger::logLine("SNP Annotation: "); + Logger::logLine(" SNP-annotation pairs: " + std::to_string(num_annotations)); + Logger::logLine(" distinct annotation strings: " + std::to_string(num_annotation_strings)); + Logger::logLine(" distinct SNPs with annotation: " + std::to_string(num_annotated_snps)); } rapidjson::Value SnpCsvAnnotator::getConfig(rapidjson::Document &doc) {