From ef4b728f5f22f6f66142c32a08c8eabb36e6f330 Mon Sep 17 00:00:00 2001 From: Marisa Emerson Date: Mon, 28 Jul 2014 23:37:04 -0700 Subject: [PATCH 1/4] Modified the id in variants to match that of ref_feat Added a relation between variants and ref_feat for much faster selections with table joins --- src/BanzaiDB/banzaidb.py | 6 ++++++ src/BanzaiDB/core.py | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py index 631bbbd..0152440 100644 --- a/src/BanzaiDB/banzaidb.py +++ b/src/BanzaiDB/banzaidb.py @@ -146,6 +146,12 @@ def populate_mapping(args): ref, ref_meta = core.reference_genome_features_to_JSON(ref) inserted = r.table('ref').insert(ref).run(connection) inserted = r.table('ref_feat').insert(ref_meta).run(connection) + # Add relations from ref_feat to variants + for feature in ref_meta: + r.table('variants')\ + .filter({"LocusTag" : feature.LocusTag})\ + .update({"RefFeat" : feature.id})\ + .run(connection) def populate_assembly(): diff --git a/src/BanzaiDB/core.py b/src/BanzaiDB/core.py index f435cd0..108c3b7 100644 --- a/src/BanzaiDB/core.py +++ b/src/BanzaiDB/core.py @@ -45,9 +45,6 @@ def nesoni_report_to_JSON(report_file): fin.readline() for line in fin: ref_id, pos, ftype, old, new, evidence, cons = line.split('\t') - tmp = ref_id.split('.') - tmp = '.'.join(tmp[:-1]) - ref_id = tmp obs_count = parsers.parse_evidence(evidence) # Deal with "mixed" features mixed = cons.split(',') From 5286a5dadb6d2aaaa3d179e22caf061690d4ffb5 Mon Sep 17 00:00:00 2001 From: Marisa Emerson Date: Mon, 28 Jul 2014 23:54:39 -0700 Subject: [PATCH 2/4] python is not javascript, fixed syntax --- src/BanzaiDB/banzaidb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py index 0152440..6d86ef9 100644 --- a/src/BanzaiDB/banzaidb.py +++ b/src/BanzaiDB/banzaidb.py @@ -146,11 +146,12 @@ def populate_mapping(args): ref, ref_meta = 
core.reference_genome_features_to_JSON(ref) inserted = r.table('ref').insert(ref).run(connection) inserted = r.table('ref_feat').insert(ref_meta).run(connection) + # Add relations from ref_feat to variants for feature in ref_meta: r.table('variants')\ - .filter({"LocusTag" : feature.LocusTag})\ - .update({"RefFeat" : feature.id})\ + .filter({"LocusTag" : feature['LocusTag']})\ + .update({"RefFeat" : feature['id']})\ .run(connection) From 39f6a1575a32ad34bf545825104a637f332d354a Mon Sep 17 00:00:00 2001 From: Marisa Emerson Date: Thu, 7 Aug 2014 20:17:35 -0700 Subject: [PATCH 3/4] added relations to ref_feat --- src/BanzaiDB/banzaidb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py index 6d86ef9..735ca74 100644 --- a/src/BanzaiDB/banzaidb.py +++ b/src/BanzaiDB/banzaidb.py @@ -154,7 +154,6 @@ def populate_mapping(args): .update({"RefFeat" : feature['id']})\ .run(connection) - def populate_assembly(): """ Populate the database with an assembly run From 0c365ebbf4dbdf45897c9987c61128bcdb6579f1 Mon Sep 17 00:00:00 2001 From: Marisa Emerson Date: Thu, 25 Sep 2014 19:49:12 -0700 Subject: [PATCH 4/4] Added coverage statistics insertion --- src/BanzaiDB/banzaidb.py | 8 +++----- src/BanzaiDB/mapping.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py index fc59eb0..0b20ddf 100644 --- a/src/BanzaiDB/banzaidb.py +++ b/src/BanzaiDB/banzaidb.py @@ -64,7 +64,7 @@ def init_database_with_default_tables(args): """ # Add additional (default) tables here... 
def get_coverage(run_path, sid):
    """
    Return abnormal coverage / length-change records from a .consequences file

    The file read is ``<run_path>/<sid>/<sid>.consequences``. The first line
    is skipped (presumably a header row -- confirm against the file format)
    and every following non-blank line is split on tabs. The fields used
    are:

        0  reference identifier
        1  locus tag
        2  old length
        3  new length
        5  coverage

    A record is reported when its coverage is not exactly 1.0 and/or its old
    and new lengths differ.

    :param run_path: directory containing one sub-directory per strain
    :param sid: the strain identifier; names both the sub-directory and the
                ``.consequences`` file inside it

    :returns: a list of dicts. Each dict always has ``id``
              (``<reference>_<sid>_<locus tag>``), ``StrainID``,
              ``Reference`` and ``LocusTag``; it has ``coverage`` only when
              coverage != 1.0 and ``aa_difference`` (old length minus new
              length) only when that difference is non-zero
    """
    # TODO: change this when we work out the naming convention
    filename = sid + ".consequences"
    strain_features = []

    with open(os.path.join(run_path, sid, filename)) as fin:
        # Discard the first line; it is never parsed as a record
        next(fin, None)
        for line in fin:
            # A blank (e.g. trailing) line has no fields to index into
            if not line.strip():
                continue
            cur = line.split("\t")
            feature = {'id': cur[0] + '_' + sid + '_' + cur[1]}

            # Only store if there is interesting coverage statistics
            coverage = float(cur[5])
            if coverage != 1.0:
                feature['coverage'] = coverage

            # Only store if there is an indel (lengths differ)
            aa_len = int(cur[2]) - int(cur[3])
            if aa_len != 0:
                feature['aa_difference'] = aa_len

            # The key checked here must match the key stored above; testing
            # for 'difference' would drop records that only have an indel
            if 'coverage' in feature or 'aa_difference' in feature:
                feature['StrainID'] = sid
                feature['Reference'] = cur[0]
                feature['LocusTag'] = cur[1]
                strain_features.append(feature)

    return strain_features