Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion install.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ def find_dependencies(args):
}

def setup_vcpkg(args):
if args.clean and not args.skip_vcpkg_setup and os.path.isdir("vcpkg"):
if args.skip_vcpkg_setup:
return

if args.clean and os.path.isdir("vcpkg"):
subprocess.call("rm -rf vcpkg", shell=True)
if not os.path.isdir("vcpkg"):
commands = "git clone https://github.com/Microsoft/vcpkg.git vcpkg && ./vcpkg/bootstrap-vcpkg.sh -disableMetrics && ./vcpkg/vcpkg install"
Expand Down
1 change: 1 addition & 0 deletions quepistasis/header/cpu_sa.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#endif
#endif

#include <cstdint>
#include <vector>

double get_flip_energy(
Expand Down
13 changes: 10 additions & 3 deletions src/data_model/SNPStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,21 @@ namespace epi {

void SNPStorage::add_SNP_annotations(const std::vector<std::pair<SNP_t, std::string>>& annotations) {
for (auto & anno : annotations) {
snp_data[anno.first.value].annotations.push_back(anno.second);
auto & anno_vec = snp_data[anno.first.value].annotations;
if (std::find(anno_vec.begin(), anno_vec.end(),anno.second) == anno_vec.end()) {
anno_vec.push_back(anno.second);
}

auto map_item = annotations_map.find(anno.second);
if (map_item == annotations_map.end()) {
std::vector<size_t> v;
v.push_back(anno.first.value);
annotations_map.insert({anno.second, v });
} else {
map_item->second.push_back(anno.first.value);
if (std::find(map_item->second.begin(), map_item->second.end(), anno.first.value) == map_item->second.end()) {
// only add if not in the list yet
map_item->second.push_back(anno.first.value);
}
}
}
}
Expand All @@ -67,7 +74,7 @@ namespace epi {
}
}

std::unordered_map<std::string, std::vector<size_t>> SNPStorage::get_annotations_map() {
const std::unordered_map<std::string, std::vector<size_t>>& SNPStorage::get_annotations_map() {
return annotations_map;
}

Expand Down
2 changes: 1 addition & 1 deletion src/data_model/SNPStorage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ namespace epi {
bool contains_name(const std::string& name);

void add_SNP_annotations(const std::vector<std::pair<SNP_t, std::string>>& annotations);
std::unordered_map<std::string, std::vector<size_t>> get_annotations_map();
const std::unordered_map<std::string, std::vector<size_t>>& get_annotations_map();
std::vector<std::string> snp_get_annotations(const SNP_t& snp);

void snp_set_variable_attribute(const SNP_t& snp, const std::string& key, const std::string& value);
Expand Down
52 changes: 32 additions & 20 deletions src/jobs/NetworkCsvConnector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,42 @@ namespace epi {
parser.parse(path, csv_separator);

std::vector<std::vector<std::string>> col1_splits (parser.num_rows()), col2_splits(parser.num_rows());
if (col1_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, column1, col1_separator, parser, col1_splits)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
col1_splits[i] = string_split(parser.cell(i, column1), col1_separator);
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
const auto &line = parser.cell(i, column1);
if (col1_separator != -1) {
col1_splits[i] = string_split(line, col1_separator);
} else {
col1_splits[i] = {};
col1_splits[i].push_back(line);
}
}

if (col2_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, column2, col2_separator, parser, col2_splits)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
col2_splits[i] = string_split(parser.cell(i, column2), col2_separator);
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
const auto &line = parser.cell(i, column2);
if (col2_separator != -1) {
col2_splits[i] = string_split(line, col2_separator);
} else {
col2_splits[i] = {};
col2_splits[i].push_back(line);
}
}

// this loop is only for logging purposes: How many annotations of our SNPs can be found in the network file
{
std::unordered_set<std::string> network_annotations{};
const auto &snp_annotations = data->snpStorage->get_annotations_map();
for (const auto splits: {std::cref(col1_splits), std::cref(col2_splits)}) {
for (const auto &col: splits.get()) {
for (const auto &anno: col) {
if (snp_annotations.contains(anno)) {
network_annotations.insert(anno);
}
}
}
}
Logger::logLine("Distinct annotations that overlap between network file and annotated SNPs: " + std::to_string(network_annotations.size()));
}

size_t num_threads = omp_get_max_threads();
Expand All @@ -115,20 +139,8 @@ namespace epi {
#pragma omp parallel for default(none) shared(col1, col2, parser, col1_splits, col2_splits, data, nodes, edges)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
size_t thr = omp_get_thread_num();
if (col1_separator == -1) {
col1[thr].clear();
col1[thr].push_back(parser.cell(i, column1));
} else {
// string_split(snps, parser.cell(i, snp_column), snp_separator);
col1[thr] = col1_splits[i];
}
if (col2_separator == -1) {
col2[thr].clear();
col2[thr].push_back(parser.cell(i, column2));
} else {
// string_split(annotations, parser.cell(i, annotation_column), annotation_separator);
col2[thr] = col2_splits[i];
}
col1[thr] = col1_splits[i];
col2[thr] = col2_splits[i];

for (auto & gene_symbol_1 : col1[thr]) {
for (auto & gene_symbol_2 : col2[thr]) {
Expand Down
2 changes: 1 addition & 1 deletion src/jobs/NetworkStatsPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace epi {

// determine the number of annotations
std::unordered_set<std::string> annotations;
auto anno_map = data->snpStorage->get_annotations_map();
const auto &anno_map = data->snpStorage->get_annotations_map();
for (auto &snp : data->snpNetwork->get_network_snps()) {
auto annos = data->snpStorage->snp_get_annotations(snp);
annotations.insert(annos.begin(), annos.end());
Expand Down
2 changes: 1 addition & 1 deletion src/jobs/SameAnnotationConnector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace epi {
std::unordered_set<SNP_t, SNP_t::SNPHash> nodes;
std::vector<SNPEdge> edges;

auto annotations_map = data->snpStorage->get_annotations_map();
const auto &annotations_map = data->snpStorage->get_annotations_map();
for (auto & anno : annotations_map) {
std::vector<SNP_t> nodes_repr;
for (auto & node : anno.second) {
Expand Down
6 changes: 4 additions & 2 deletions src/jobs/SaveNetwork.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,15 +369,17 @@ namespace epi {
if (rc) {
throw epi::Error("SQLITE-error when preparing has_annotation insert statement");
}
auto all_annotations = data->snpStorage->get_annotations_map();
const auto &all_annotations = data->snpStorage->get_annotations_map();

size_t anno_id = 0;
for (auto &anno : network_node_annotations) {
sqlite3_bind_int(insert_annos_stmt_sqlite, 1, anno_id);
sqlite3_bind_text(insert_annos_stmt_sqlite, 2, anno.c_str(), anno.size(), SQLITE_STATIC);
sqlite3_step(insert_annos_stmt_sqlite);

for (auto & node : all_annotations[anno]) {
const auto all_annotations_anno = all_annotations.find(anno);

for (auto & node : all_annotations_anno->second) {
sqlite3_bind_int(has_anno_stmt_sqlite, 1, node);
sqlite3_bind_int(has_anno_stmt_sqlite, 2, anno_id);
sqlite3_step(has_anno_stmt_sqlite);
Expand Down
63 changes: 33 additions & 30 deletions src/jobs/SnpCsvAnnotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,64 +92,67 @@ namespace epi {
const auto filter_map = filter_entries(parser);

std::vector<std::vector<std::string>> snp_splits (parser.num_rows()), annotation_splits(parser.num_rows());
if (snp_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, snp_column, snp_separator, parser, snp_splits, filter_map)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
snp_splits[i] = string_split(parser.cell(i, snp_column), snp_separator);
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
const auto & line = parser.cell(i, snp_column);
if (snp_separator != -1) {
snp_splits[i] = string_split(line, snp_separator);
} else {
snp_splits[i] = {};
snp_splits[i].push_back(line);
}
}

if (annotation_separator != -1) {
#pragma omp parallel for default(none) shared(has_header, annotation_column, annotation_separator, parser, annotation_splits, filter_map)
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
annotation_splits[i] = string_split(parser.cell(i, annotation_column), annotation_separator);
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
const auto & line = parser.cell(i, annotation_column);
if (annotation_separator != -1) {
annotation_splits[i] = string_split(line, annotation_separator);
} else {
annotation_splits[i] = {};
annotation_splits[i].push_back(line);
}
}

// size_t anno_max = 0, anno_sum = 0, anno_count = 0;

std::vector<std::string> snps, annotations;
for (size_t i = has_header ? 1 : 0; i < parser.num_rows(); i++) {
if (!filter_map[i]) continue;
if (snp_separator == -1) {
snps.clear();
snps.push_back(parser.cell(i, snp_column));
} else {
// string_split(snps, parser.cell(i, snp_column), snp_separator);
snps = snp_splits[i];
}
if (annotation_separator == -1) {
annotations.clear();
annotations.push_back(parser.cell(i, annotation_column));
} else {
// string_split(annotations, parser.cell(i, annotation_column), annotation_separator);
annotations = annotation_splits[i];
}
snps = snp_splits[i];
annotations = annotation_splits[i];

for (auto &snp: snps) {
if (data->snpStorage->contains_name(snp)) {
auto snp_t = data->snpStorage->by_name(snp);

for (auto &anno: annotations){
/*
anno_max = std::max(anno_max, anno.size());
anno_sum += anno.size();
anno_count ++;
*/
all_annotations.emplace_back(snp_t, anno);
}
}
}
}

// std::cout << anno_max << ", " << anno_count << ", " << anno_sum << std::endl;

// apply mappings
data->snpStorage->add_SNP_annotations(all_annotations);

const auto & annotations_map = data->snpStorage->get_annotations_map();
size_t num_annotation_strings = annotations_map.size();
size_t num_annotations = 0;
std::unordered_set<SNP> annotated_snps{};
for (const auto & anno : annotations_map) {
annotated_snps.insert(anno.second.begin(), anno.second.end());
num_annotations += anno.second.size();
}

size_t num_annotated_snps = annotated_snps.size();

logger.stop();
Logger::logLine("SNP Annotation: ");
Logger::logLine(" SNP-annotation pairs: " + std::to_string(num_annotations));
Logger::logLine(" distinct annotation strings: " + std::to_string(num_annotation_strings));
Logger::logLine(" distinct SNPs with annotation: " + std::to_string(num_annotated_snps));
}

rapidjson::Value SnpCsvAnnotator::getConfig(rapidjson::Document &doc) {
Expand Down
Loading