From ef4b728f5f22f6f66142c32a08c8eabb36e6f330 Mon Sep 17 00:00:00 2001
From: Marisa Emerson <mje@uq.edu.au>
Date: Mon, 28 Jul 2014 23:37:04 -0700
Subject: [PATCH 1/4] Modified the id in variants to match that of ref_feat.
 Added a relation between variants and ref_feat for much faster selections
 with table joins.

---
 src/BanzaiDB/banzaidb.py | 6 ++++++
 src/BanzaiDB/core.py     | 3 ---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py
index 631bbbd..0152440 100644
--- a/src/BanzaiDB/banzaidb.py
+++ b/src/BanzaiDB/banzaidb.py
@@ -146,6 +146,12 @@ def populate_mapping(args):
         ref, ref_meta = core.reference_genome_features_to_JSON(ref)
         inserted = r.table('ref').insert(ref).run(connection)
         inserted = r.table('ref_feat').insert(ref_meta).run(connection)
+        # Add relations from ref_feat to variants
+        for feature in ref_meta:
+            r.table('variants')\
+                .filter({"LocusTag" : feature.LocusTag})\
+                .update({"RefFeat" : feature.id})\
+                .run(connection)
 
 
 def populate_assembly():
diff --git a/src/BanzaiDB/core.py b/src/BanzaiDB/core.py
index f435cd0..108c3b7 100644
--- a/src/BanzaiDB/core.py
+++ b/src/BanzaiDB/core.py
@@ -45,9 +45,6 @@ def nesoni_report_to_JSON(report_file):
         fin.readline()
         for line in fin:
             ref_id, pos, ftype, old, new, evidence, cons = line.split('\t')
-            tmp = ref_id.split('.')
-            tmp = '.'.join(tmp[:-1])
-            ref_id = tmp
             obs_count = parsers.parse_evidence(evidence)
             # Deal with "mixed" features
             mixed = cons.split(',')

From 5286a5dadb6d2aaaa3d179e22caf061690d4ffb5 Mon Sep 17 00:00:00 2001
From: Marisa Emerson <mje@uq.edu.au>
Date: Mon, 28 Jul 2014 23:54:39 -0700
Subject: [PATCH 2/4] python is not javascript, fixed syntax

---
 src/BanzaiDB/banzaidb.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py
index 0152440..6d86ef9 100644
--- a/src/BanzaiDB/banzaidb.py
+++ b/src/BanzaiDB/banzaidb.py
@@ -146,11 +146,12 @@ def populate_mapping(args):
         ref, ref_meta = core.reference_genome_features_to_JSON(ref)
         inserted = r.table('ref').insert(ref).run(connection)
         inserted = r.table('ref_feat').insert(ref_meta).run(connection)
+
         # Add relations from ref_feat to variants
         for feature in ref_meta:
             r.table('variants')\
-                .filter({"LocusTag" : feature.LocusTag})\
-                .update({"RefFeat" : feature.id})\
+                .filter({"LocusTag" : feature['LocusTag']})\
+                .update({"RefFeat" : feature['id']})\
                 .run(connection)
 
 

From 39f6a1575a32ad34bf545825104a637f332d354a Mon Sep 17 00:00:00 2001
From: Marisa Emerson <mje@uq.edu.au>
Date: Thu, 7 Aug 2014 20:17:35 -0700
Subject: [PATCH 3/4] added relations to ref_feat

---
 src/BanzaiDB/banzaidb.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py
index 6d86ef9..735ca74 100644
--- a/src/BanzaiDB/banzaidb.py
+++ b/src/BanzaiDB/banzaidb.py
@@ -154,7 +154,6 @@ def populate_mapping(args):
                 .update({"RefFeat" : feature['id']})\
                 .run(connection)
 
-
 def populate_assembly():
     """
     Populate the database with an assembly run

From 0c365ebbf4dbdf45897c9987c61128bcdb6579f1 Mon Sep 17 00:00:00 2001
From: Marisa Emerson <mje@uq.edu.au>
Date: Thu, 25 Sep 2014 19:49:12 -0700
Subject: [PATCH 4/4] Added coverage statistics insertion

---
 src/BanzaiDB/banzaidb.py |  8 +++-----
 src/BanzaiDB/mapping.py  | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/BanzaiDB/banzaidb.py b/src/BanzaiDB/banzaidb.py
index fc59eb0..0b20ddf 100644
--- a/src/BanzaiDB/banzaidb.py
+++ b/src/BanzaiDB/banzaidb.py
@@ -64,7 +64,7 @@ def init_database_with_default_tables(args):
     """
     # Add additional (default) tables here...
     def_tables = ['determined_variants', 'strains_under_investigation',
-                  'references', 'reference_features']
+                  'references', 'reference_features', 'strain_features']
     with database.make_connection() as connection:
         try:
             r.db_create(connection.db).run(connection)
@@ -177,10 +177,8 @@ def populate_mapping(args):
         cur_ref = r.table('references').get('current_reference').run(connection)
         ref = cur_ref["reference_id"]+"_"+str(cur_ref["revision"])
         for e in strains:
-            # open the userplot of the current reference & strain
-            not_called = mapping.get_N_char_positions(run_path, e['StrainID'])
-            ranges = misc.get_intervals(not_called)
-            r.table('strains_under_investigation').get(e['StrainID']).update({"reference": ref, "coverage": json.dumps(ranges)}).run(connection)
+            coverage = mapping.get_coverage(run_path, e['StrainID'])
+            r.table('strain_features').insert(coverage).run(connection)
 
         # Add relations from ref_feat to variants
         for feature in ref_meta:
diff --git a/src/BanzaiDB/mapping.py b/src/BanzaiDB/mapping.py
index 94d0872..29639f9 100644
--- a/src/BanzaiDB/mapping.py
+++ b/src/BanzaiDB/mapping.py
@@ -45,3 +45,37 @@ def get_N_char_positions(run_path, sid):
         if e == 'N':
             no_call.append(idx)
     return no_call
+
+def get_coverage(run_path, sid):
+    """
+    Return features of strain `sid` with abnormal coverage or an indel.
+
+    Parses <run_path>/<sid>/<sid>.consequences (tab-separated, one header
+    line) and returns a list of dicts; 'coverage' is set when column 5 is
+    not 1.0, 'aa_difference' when columns 2 and 3 (old/new length) differ.
+    """
+    # TODO: change this when we work out the naming convention
+    filename = sid + ".consequences"
+    strain_features = []
+
+    with open(os.path.join(run_path, sid, filename)) as fin:
+        fin.readline()  # skip the header line
+        for line in fin:
+            if not line.strip():
+                continue  # tolerate blank lines, e.g. at end of file
+            cur = line.split("\t")
+            feature = {'id': cur[0] + '_' + sid + '_' + cur[1]}
+            coverage = float(cur[5])
+            if coverage != 1.0:
+                feature['coverage'] = coverage
+            aa_len = int(cur[2]) - int(cur[3])
+            if aa_len != 0:
+                feature['aa_difference'] = aa_len
+            # The old check for 'difference' never matched; test the real key.
+            if 'coverage' in feature or 'aa_difference' in feature:
+                feature['StrainID'] = sid
+                feature['Reference'] = cur[0]
+                feature['LocusTag'] = cur[1]
+                strain_features.append(feature)
+
+    return strain_features