diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml index 056d4402..1cec4155 100644 --- a/.github/workflows/makefile.yml +++ b/.github/workflows/makefile.yml @@ -44,9 +44,9 @@ jobs: - name: Install dependencies run: | sudo apt-get update - mamba install -c conda-forge -c bioconda 'python=3.11' 'pulp<=2.7.0' 'snakemake-minimal>7,<8' GraphAligner mashmap winnowmap - sudo apt-get install gcc-11 g++-11 libcurl4-openssl-dev liblzma-dev libbz2-dev - pip install networkx + mamba install -c conda-forge -c bioconda 'python=3.11' 'pulp<=2.7.0' 'snakemake-minimal>7,<8' GraphAligner mashmap winnowmap samtools seqtk + sudo apt-get install gcc-11 g++-11 libcurl4-openssl-dev liblzma-dev libbz2-dev bamtools libbamtools-dev + pip install networkx pysam sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 @@ -74,6 +74,10 @@ jobs: echo >> x.py "print(G)" python x.py + echo "" + echo "BAMTOOLS TEST" + echo " "`command -v bamtools` + - name: Compile run: | cd src @@ -120,10 +124,12 @@ jobs: echo > lib/verkko/bin/utgcns '#!/bin/sh' echo >> lib/verkko/bin/utgcns 'if [ ! -e part001.fasta ]; then' echo >> lib/verkko/bin/utgcns 'cp ../../cache/part001.fasta packages/part001.fasta.WORKING' + echo >> lib/verkko/bin/utgcns 'samtools dict ../../cache/part001.fasta | samtools view -b -S > packages/part001.bam00000002.bam' echo >> lib/verkko/bin/utgcns 'fi' echo >> lib/verkko/bin/utgcns '' echo >> lib/verkko/bin/utgcns 'if [ ! -e part002.fasta ]; then' echo >> lib/verkko/bin/utgcns 'cp ../../cache/part002.fasta packages/part002.fasta.WORKING' + echo >> lib/verkko/bin/utgcns 'samtools dict ../../cache/part002.fasta | samtools view -b -S > packages/part002.bam00000001.bam' echo >> lib/verkko/bin/utgcns 'fi' chmod 755 mbg.sh diff --git a/.gitmodules b/.gitmodules index 804bc752..4a9bcb81 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,7 @@ url = https://github.com/marbl/rukki [submodule "src/MBG"] path = src/MBG - url = https://github.com/maickrau/MBG + url = https://github.com/skoren/MBG [submodule "src/hifioverlapper"] path = src/hifioverlapper url = https://github.com/maickrau/hifioverlapper diff --git a/src/MBG b/src/MBG index f312a891..8fc6ca8d 160000 --- a/src/MBG +++ b/src/MBG @@ -1 +1 @@ -Subproject commit f312a8915eca046fcd80658633f3f72c74ed5ce4 +Subproject commit 8fc6ca8d141c131b44931e7c960426f8a8c48bad diff --git a/src/Snakefiles/1-buildGraph.sm b/src/Snakefiles/1-buildGraph.sm index 2538f002..8c875fc1 100644 --- a/src/Snakefiles/1-buildGraph.sm +++ b/src/Snakefiles/1-buildGraph.sm @@ -111,10 +111,13 @@ awk 'BEGIN \\\\ }} \\\\ \$1 == "S" \\\\ {{ \\\\ - if (\$6 != "") {{ - \$4 = \$6; - }} - print \$2, substr(\$4, 6), length(\$3); \\\\ + if (\$6 != "" && match(\$6, /^kl:f:/)) {{ \\\\ + \$4 = \$6; \\\\ + }} \\\\ + if (!match(\$4, /^kl:f:/) && !match(\$4, /^ll:f:/)) {{ \\\\ + print "ERROR: line "\$0" does not match expected format for coverages"; exit 1 \\\\ + }} \\\\ + print \$2, substr(\$4, 6), length(\$3); \\\\ }}' \\\\ < ../{output.graph} \\\\ > ../{output.hifi_node_cov} diff --git a/src/Snakefiles/2-processGraph.sm b/src/Snakefiles/2-processGraph.sm index 0ca6f2a7..88e011d8 100644 --- a/src/Snakefiles/2-processGraph.sm +++ b/src/Snakefiles/2-processGraph.sm @@ -47,17 +47,17 @@ cat > ./processGraph.sh < gapped-once-hifi-resolved.gfa echo "" echo Gap insertion pass 2. 
-{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-once-hifi-resolved.gfa ../{input.paths} 2 300 paths-nogap-2.gaf gaps-hifi-2.gaf gaptwo n \\\\ +{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-once-hifi-resolved.gfa ../{input.paths} 2 50 paths-nogap-2.gaf gaps-hifi-2.gaf gaptwo n \\\\ > gapped-twice-hifi-resolved.gfa echo "" echo Gap insertion pass 3. -{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-hifi-resolved.gfa ../{input.paths} 1 5 paths-nogap-3.gaf gaps-hifi-3.gaf gapthree n \\\\ +{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-hifi-resolved.gfa ../{input.paths} 2 300 paths-nogap-3.gaf gaps-hifi-3.gaf gapthree n \\\\ > gapped-hifi-resolved.gfa echo "" diff --git a/src/Snakefiles/6-layoutContigs.sm b/src/Snakefiles/6-layoutContigs.sm index 5d7aa8af..d7d0e993 100644 --- a/src/Snakefiles/6-layoutContigs.sm +++ b/src/Snakefiles/6-layoutContigs.sm @@ -53,6 +53,7 @@ rule generateLayoutContigsInputs: edges = '6-layoutContigs/combined-edges.gfa', hifialns = '6-layoutContigs/hifi.alignments.gaf', ontalns = '6-layoutContigs/ont.alignments.gaf', + ontgapaln = '6-layoutContigs/ont-gapfill.txt', paths = '6-layoutContigs/consensus_paths.txt', lengths = '6-layoutContigs/nodelens.txt' log: @@ -91,8 +92,8 @@ cat {params.aligns} ../{input.paths} | sed 's/^/hifi_/' \\\\ cat ../{input.ontgaps} | sed 's/^/ont_/' >> ../{output.hifialns} # now get the ONT alignments, again prepending ont_ but ignoring any reads we've already included due to it being used as a gap fill -cat ../{input.ontgaps} |cut -f 1 > ../6-layoutContigs/ont-gapfill.txt -{PYTHON} {VERKKO}/scripts/replace_path_nodes.py ../{input.ontalns} ../{output.nodemap} |grep -F -v -w -f ../6-layoutContigs/ont-gapfill.txt | sed 's/^/ont_/' > ../{output.ontalns} || true +cat ../{input.ontgaps} |cut -f 1 > ../{output.ontgapaln} +{PYTHON} {VERKKO}/scripts/replace_path_nodes.py ../{input.ontalns} ../{output.nodemap} |grep -F -v -w -f ../{output.ontgapaln} | sed 's/^/ont_/' > ../{output.ontalns} || true cat ../{input.graph} \\\\ | awk 'BEGIN \\\\ @@ -144,6 +145,7 @@ rule layoutContigs: output: layout = rules.verkko.input.layout, scfmap = rules.verkko.input.scfmap, + dropped = '6-layoutContigs/unitig-popped.layout.scfmap.dropped', gaps = '6-layoutContigs/gaps.txt' params: haveONT = config['withONT'], @@ -180,15 +182,18 @@ fi ../{input.paths} \\\\ ../{input.lengths} \\\\ ../6-layoutContigs/unitig_initial.layout \\\\ - ../{output.scfmap} + ../6-layoutContigs/unitig_initial.layout.scfmap if [ "{params.haveONT}" = "True" ] && [ "{params.haveBAM}" = "True" ]; then + cp ../6-layoutContigs/unitig_initial.layout.scfmap.dropped ../{output.dropped} {PYTHON} {VERKKO}/scripts/merge_layouts.py \\\\ ../6-layoutContigs/unitig_initial.layout \\\\ ../{input.ontalns} \\\\ - > ../{output.layout} + ../{output.layout} else - cp ../6-layoutContigs/unitig_initial.layout ../{output.layout} + cp ../6-layoutContigs/unitig_initial.layout.scfmap.dropped ../{output.dropped} + cp ../6-layoutContigs/unitig_initial.layout.scfmap ../{output.scfmap} + cp ../6-layoutContigs/unitig_initial.layout ../{output.layout} fi {PYTHON} {VERKKO}/scripts/check_layout_gaps.py < ../{output.layout} > ../{output.gaps} diff --git a/src/Snakefiles/7-combineConsensus.sm b/src/Snakefiles/7-combineConsensus.sm index 1c2627e8..cacfd8f0 100644 --- a/src/Snakefiles/7-combineConsensus.sm +++ b/src/Snakefiles/7-combineConsensus.sm @@ -28,7 +28,7 @@ def combineConsensusI(wildcards): def combineConsensusP(wildcards): return expand( "packages/part{nnnn}.fasta", nnnn = 
glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) def combineConsensusB(wildcards): - return expand( "packages/part{nnnn}.bam*.bam", nnnn = glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) + return expand( "packages/part{nnnn}.bam*.bam", nnnn = glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) rule combineConsensus: input: @@ -36,7 +36,7 @@ rule combineConsensus: tigmap = rules.buildPackages.output.tigmap, scfmap = rules.layoutContigs.output.scfmap, layout = rules.layoutContigs.output.layout, - ontalns = rules.generateLayoutContigsInputs.output.ontalns, + ontalns = rules.generateLayoutContigsInputs.output.ontgapaln if config['withONT'] == "True" else {rules.emptyfile.output}, hificov = rules.verkko.input.hifi5cov, finished = rules.buildPackages.output.finished, pathstsv = rules.rukki.output.pathstsv if config['ruk_enable'] == "True" else rules.emptyfile.output, @@ -59,14 +59,15 @@ rule combineConsensus: short_len = config['short_contig_length'], screens = config['screen'], packbam = combineConsensusB, - haveBAM = config['withBAM'] + haveBAM = config['withBAM'], + quick = config['cns_quick'] threads: 8 resources: job_id = 1, n_cpus = 8, - mem_gb = lambda wildcards, input, attempt: getAlignMemoryRequest(attempt, 2, input.packcns), - time_h = 4 + mem_gb = lambda wildcards, input, attempt: getAlignMemoryRequest(attempt, 5, input.packcns), + time_h = 48 shell: ''' cd 7-consensus @@ -82,8 +83,8 @@ if [ -e ../../label2 ] ; then label2=`cat ../../label2` ; fi cat > ./combineConsensus.sh < 0: mem = factor * sum / 1024 / 1024 / 1024 @@ -141,7 +143,7 @@ def getConsensusMemoryRequest(attempt, jobid): # Figure out what partitionin for l in f: words = l.strip().split(); if jobid == words[5]: - mem = max(mem, 5.0 + 0.80 * float(words[4])) + mem = max(mem, 5.0 + 0.90 * float(words[4])) return int(math.ceil(math.ceil(mem)*scl)) diff --git a/src/canu b/src/canu index 3c2099fb..80d4b50d 160000 --- a/src/canu +++ b/src/canu @@ -1 +1 @@ -Subproject commit 3c2099fbcb5c9b4fb4b4bc6267139e1821d2bc16 +Subproject commit 80d4b50d82b37e4a517295e68901ea58fde0200c diff --git a/src/main.mk b/src/main.mk index 379d9ea0..18559415 100644 --- a/src/main.mk +++ b/src/main.mk @@ -290,7 +290,7 @@ FILES += \ scripts/hic_prefilter.py -> ../lib/verkko/scripts/hic_prefilter.py \ scripts/launch_scaffolding.py -> ../lib/verkko/scripts/launch_scaffolding.py \ scripts/remove_nodes_add_telomere.py -> ../lib/verkko/scripts/remove_nodes_add_telomere.py \ - scripts/scaffolding/logger_wrap.py -> ../lib/verkko/scripts/scaffolding/logger_wrap.py \ + scripts/logging_utils.py -> ../lib/verkko/scripts/logging_utils.py \ scripts/scaffolding/match_graph.py -> ../lib/verkko/scripts/scaffolding/match_graph.py \ scripts/scaffolding/path_storage.py -> ../lib/verkko/scripts/scaffolding/path_storage.py \ scripts/scaffolding/scaff_prefilter.py -> ../lib/verkko/scripts/scaffolding/scaff_prefilter.py \ diff --git a/src/scripts/bam_rename.py b/src/scripts/bam_rename.py index 133a9d99..244ae8bb 100755 --- a/src/scripts/bam_rename.py +++ b/src/scripts/bam_rename.py @@ -11,7 +11,7 @@ bamfile = pysam.AlignmentFile("-", "rb") layout = sys.argv[1] gap_info = sys.argv[2] -scfmap = seq.readScfMap(sys.argv[3]) +scfmap,_ = seq.readScfMap(sys.argv[3]) namedict = seq.readNameMap(sys.argv[4]) lens = dict() offsets = dict() diff --git a/src/scripts/check_layout_gaps.py b/src/scripts/check_layout_gaps.py index 193062d4..679d4d2a 100755 --- a/src/scripts/check_layout_gaps.py +++ b/src/scripts/check_layout_gaps.py @@ -16,7 +16,7 @@ 
def check_alns(current_tig, current_len, current_alns): if current_start > last_end: print(current_tig + "\t" + str(last_end) + "\t" + str(current_start) + " (len " + str(current_len) + ")") if current_start < 0 or current_end > current_len: - print("layout outside contig: " + current_tig + "\t" + current_start + "\t" + current_end + "\t" + " (len " + str(current_len) + ")") + print("layout outside contig: " + current_tig + "\t" + str(current_start) + "\t" + str(current_end) + "\t" + " (len " + str(current_len) + ")") last_end = max(last_end, current_end) if last_end < current_len: print(current_tig + "\t" + str(last_end) + "\t" + str(current_len) + " (len " + str(current_len) + ")") diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index 49827edf..ec7db03e 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -1,13 +1,41 @@ +import logging import sys import random import networkx as nx import math import os -import graph_functions -import copy from networkx.algorithms import community import graph_functions as gf -from scaffolding import match_graph, logger_wrap +from scaffolding import match_graph + + +#TODO: move constants to some more relevant place +MIN_LEN = 200000 # best result so far with 200000 + +#likely do not need +#MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless + +KLIN_STARTS = 100 # number of different starts of kernighan lin +KLIN_ITER = 10000 # number of iterations inside kernighan lin + +MAX_SHORT_COMPONENT = 100 # we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than MAX_SHORT_COMPONENT +MAX_SHORT_LEN=200000 #we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than +MIN_WEIGHT = 10 # Ignore edges with few links +SIGNIFICANT_MAJORITY = 2.5 + +RESULT_FILENAME = "hicverkko.colors.tsv" +FILTERED_GFA_FILENAME = "filtered.gfa" +HIC_COMPRESSED_FILENAME = "hic.byread.compressed" + +CLEAR_HOMOLOGY = 250000 #for clear bulge like structures this limmit is decreased +MIN_ALIGNMENT = 50000 #smaller alingments will be filtered out +MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa +FIXED_HOMOLOGY_WEIGHT = -1000000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets + +MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only +MIN_RDNA_COMPONENT = 500000 + + def check_non_empty(part, G): for p in part: @@ -15,7 +43,6 @@ def check_non_empty(part, G): return True return False - #Currently not in use anymore def IsTip(node, edges): for pref in ['>', '<']: @@ -32,7 +59,6 @@ def IsTip(node, edges): return True return False - #collapse a node, add links from def collapseOrientedNode (edges, node): for pref in ['>', '<']: @@ -62,95 +88,6 @@ def collapseOrientedNode (edges, node): if ornode in edges: edges.pop(ornode) -#returns pair of nodes near PAR that should be connected via fake homology, currently placeholder -#TODO: this is likely subject for reconsidering when we'll move to other species - -def checkXYcomponent(current_component, matchgraph, G, edges): -#minimal comp size. XY is vulnerable to gaps, so not 150M - MIN_COMP_SIZE = 80000000 -#maximal homozygous, PAR region - MAX_HOM_LEN = 5000000 -#cutoff for additional fake homozygocity - MIN_NEAR_PAR = 3000000 -#TODO these constants are ABSOLUTELY human focused, and even if we'll save the function should be reconsidered. -#TODO: right way with new matchGraph... 
- total_l = 0 - total_hom_len = 0 - for node in current_component: - if node in G.nodes: - total_l += G.nodes[node]['length'] - if node in matchgraph: - total_hom_len += G.nodes[node]['length'] - - if total_hom_len == 0 or total_hom_len > MAX_HOM_LEN or total_l < MIN_COMP_SIZE: - return [0, 0] - fake_hom = [] - for f_node in current_component: - for s_node in current_component: - if f_node < s_node and G.nodes[f_node]['length'] > MIN_NEAR_PAR and G.nodes[s_node]['length'] > MIN_NEAR_PAR: - for fp in ['<', '>']: - for sp in ['<', '>']: - f_or = fp + f_node - s_or = sp + s_node - if not (f_or in edges) or not (s_or in edges): - continue - for nxt in edges[f_or]: - if nxt in edges[s_or]: - if len(fake_hom) == 0 or fake_hom[-1][0] != f_node or fake_hom[-1][1] != s_node: - fake_hom.append([f_node, s_node]) - if len(fake_hom) > 1: - sys.stderr.write("MULTIPLE CANDIDATES FOR FAKE HOMOLOGY IN XY\n") - for pair in fake_hom: - sys.stderr.write(f"{pair[0]} --- {pair[1]}\n") - return [0, 0] - elif len(fake_hom) == 1: - sys.stderr.write(f"Adding FAKE HOMOLOGY between {fake_hom[0][0]} --- {fake_hom[0][1]}\n") - return [fake_hom[0][0], fake_hom[0][1]] - return [0, 0] - -def fixUnbalanced(part, C, G): - auxs = [0, 0] - lens = [0, 0] - for ind in range (0, 2): - for node in part[ind]: - if node.startswith('Aux'): - auxs[ind] += 1 - elif node in G.nodes: - lens[ind] += G.nodes[node]['length'] - else: - print(f"Someting went wrong with node {node}") -# print (f"In fix unbalanced {lens[0]} {lens[1]}") - #just _a_ limitation on the swap length is required - maxswap = min(lens[0]/10, lens[1]/10) - curswap = 0 - edge_swapped =0 - for ind in range(0, 2): - #do not move edges _to_ the part where aux edge is present -otherwise it could be swapped with aux - if auxs[1 - ind] == 0: -# print(f'processing components {part[0]} {part[1]}') - changed = True - while changed: - changed = False - for node in part[ind]: - if node in G: - cost = 0 - for i in part[ind]: - if [i, node] in C.edges: - cost += C.edges[i, node]['weight'] - for j in part[1 - ind]: - if [j, node] in C.edges: - cost -= C.edges[j, node]['weight'] - if cost < 0 and curswap + G.nodes[node]['length'] < maxswap: - changed = True - curswap += G.nodes[node]['length'] - part[1 - ind].add(node) - part[ind].remove(node) - edge_swapped +=1 - print (f"SWAPPED {node}") - break - if curswap > 0: - print (f"Fixing uneven component, moved {curswap} bases and {edge_swapped} nodes") - def getMedianCov(nodeset): med_cov = -1 total_length = 0 @@ -159,57 +96,385 @@ def getMedianCov(nodeset): sum_l = 0 sorted_nodes = sorted(nodeset, key=lambda node: node[1]['coverage']) for node in sorted_nodes: - print (f'Node {node[0]} coverage {node[1]["coverage"]} length {node[1]["length"]}') + logging.debug(f'Node {node[0]} coverage {node[1]["coverage"]} length {node[1]["length"]}') sum_l += node[1]['length'] if 2*sum_l > total_length: med_cov = node[1]['coverage'] - print (f'Median coverage is {med_cov}\n') + logging.info(f'Median coverage is {med_cov}') break return med_cov -def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, uneven_depth): - #TODO: move constants to some more relevant place - MIN_LEN = 200000 # best result so far with 200000 - - #likely do not need - #MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless - - KLIN_STARTS = 1000 # number of different starts of kernighan lin - KLIN_ITER = 10000 # number of iterations inside kernighan lin - - MAX_SHORT_COMPONENT = 100 # we remove from consideration each connected compoents 
of edges < MIN_SHORT_LEN that is larger than MAX_SHORT_COMPONENT - MAX_SHORT_LEN=200000 #we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than - MIN_WEIGHT = 10 # Ignore edges with few links - SIGNIFICANT_MAJORITY = 2.5 - - RESULT_FILENAME = "hicverkko.colors.tsv" - FILTERED_GFA_FILENAME = "filtered.gfa" - HIC_COMPRESSED_FILENAME = "hic.byread.compressed" - LOGGING_FILENAME = "hicverkko.log" - - CLEAR_HOMOLOGY = 500000 #for clear bulge like structures this limmit is decreased - MIN_ALIGNMENT = 100000 #smaller alingments will be filtered out - MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa - FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets - - MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only - MIN_RDNA_COMPONENT = 500000 - logger = logger_wrap.initLogger("phasing.log") +def report_phased_node(node_id, hap, output_file): + if hap == 1: + output_file.write(f'{node_id}\t0\t100000\t0:100000\t#8888FF\n') + elif hap == 0: + output_file.write(f'{node_id}\t100000\t0\t100000:0\t#FF8888\n') + +def create_initial_partition_noKL(C, matchGraph): + random.seed(239) + neg_graph = nx.Graph() + for ec in matchGraph.edges(): + if ec[0] in C and ec[1] in C: + if matchGraph.edges[ec]['weight'] < 0: + for i in range (2): + if not ec[i] in neg_graph.nodes(): + neg_graph.add_node(ec[i]) + neg_graph.add_edge(ec[0], ec[1]) + elif (ec[0] in C or ec[1] in C) and matchGraph.edges[ec]['weight'] < 0: + #This may happen since we delete high-covered nodes, although should not be often + logging.info(f"Weird, edge {ec} partially in component{C}") + #exit(1) + dists = dict(nx.all_pairs_shortest_path_length(neg_graph)) + #friends - should always be in the same component + #enemies - should always be in the opposite + friends = {} + enemies = {} + for n in C: + if n in neg_graph: + friends[n] = [] + enemies[n] = [] + for other in dists[n]: + if other != n: + #TODO: possibly random starts needed? 
+ if dists[n][other] % 2 == 0: + friends[n].append(other) + else: + enemies[n].append(other) + logging.debug(f"Friends of {n}: {friends[n]}") + logging.debug(f"Enemies of {n}: {enemies[n]}") + parts = [set(), set()] + C_nodes = list(C.nodes()) + processed_nodes = set() + opposite = {} + for n in C_nodes: + #lets remove non-homologous nodes + if not n in enemies and n.find("Aux") == -1: + C.remove_node(n) + logging.debug(f"Removing node {n}") + else: + start_component = random.randint(0, 1) + if not n in processed_nodes: + friends[n].append(n) + if len (friends[n]) > 1: + f = glue_nodes(C, friends[n]) + else: + f = friends[n][0] + if len (enemies[n]) > 1: + e = glue_nodes(C, enemies[n]) + else: + e = enemies[n][0] + for n in friends[n]: + processed_nodes.add(n) + for n in enemies[n]: + processed_nodes.add(n) + parts[start_component].add(f) + parts[1 - start_component].add(e) + opposite[f] = e + opposite[e] = f + return parts, opposite + +def random_swap(parts, opposite, seed): + random.seed(seed) + used = set() + res = [set(),set()] + for ind in range (2): + for n in parts[ind]: + if not n in used: + swap = random.randint(0,1) + if swap: + n_ind = 1 - ind + else: + n_ind = ind + res[n_ind].add(n) + res[1 - n_ind].add(opposite[n]) + used.add(n) + used.add(opposite[n]) + return res + +def glue_nodes(C, node_list): + new_node = '_'.join(node_list) + C.add_node(new_node) + for n in node_list: + for neighbor in C.neighbors(n): + if neighbor not in C.neighbors(new_node): + C.add_edge(new_node, neighbor, weight=C.edges[n, neighbor]['weight']) + else: + C[new_node][neighbor]['weight'] += C.edges[n, neighbor]['weight'] + C.remove_nodes_from(node_list) + return new_node + +def unglue_nodes(parts): + new_parts = [set(), set()] + for ind in range(2): + for n in parts[ind]: + original_nodes = n.split('_') + for orig in original_nodes: + new_parts[ind].add(orig) + return new_parts + +def optimize_partition(C, parts, swap_together): + cur_score = score_partition(C, parts) + #TODO: without randomizations iterations are useless. + MAX_ITERATIONS = 5 + not_changed = 0 + all_nodes = [] + for i in range (2): + for j in parts[i]: + all_nodes.append(j) + all_nodes.sort() + while not_changed < MAX_ITERATIONS: + for n in all_nodes: + + score_change = score_swap(C, parts, swap_together[n]) + logging.debug(f"Trying to move {n} and its neighbors {swap_together[n]}, old_score {cur_score}, swap score {score_change}") + if score_change > 0: + upd_score = cur_score + score_change + for swap_node in swap_together[n]: + for j in range (2): + if swap_node in parts[j]: + parts[j-1].add(swap_node) + parts[j].remove(swap_node) + break + not_changed = 0 + logging.debug(f"Moved {n} and its neighbors {swap_together[n]},improving score") + not_changed += 1 + + return parts + +def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists): + + C = nx.Graph() + # rebuild the graph from this component using only edges in the hic graph + C.add_nodes_from(current_component) + + # first we ignore any nodes that are too short or have too few links. + short = [] + + not_to_big = [] + for n in C.nodes(): + logging.debug(n) + if G.nodes[n]['coverage'] < MAX_COV: + not_to_big.append((n, G.nodes[n])) + local_max_cov = 1.5 * getMedianCov(not_to_big) + if local_max_cov < 0: + local_max_cov = MAX_COV + for n in C.nodes(): + if n not in G: + sys.stderr.write("Error, got a node not in original graph %s !" 
% (n)) + sys.exit() + if G.nodes[n]['coverage'] > local_max_cov and (not uneven_depth): + logging.debug("While partitoning dropping node %s coverage too high" % (n)) + short.append(n) + elif not (n in matchGraph) and uneven_depth: + logging.info("While partitoning dropping node %s uneven coverage and no matches" % (n)) + short.append(n) + elif G.nodes[n]['length'] < MIN_LEN: + collapseOrientedNode(edges, n) + short.append(n) + else: + good = False + for e in hicGraph.edges(n): + if (e[0] != n or e[1] != n) and hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT \ + and (e[0] in C and e[1] in C): + good = True + #Let's phase homologous nodes randomly then + if n in matchGraph and len(matchGraph.edges(n)) > 0: + for ec in matchGraph.edges(n): + if matchGraph.edges[ec]['weight'] < 0: + good = True + break + + if not good: + logging.info("While partitoning dropping node %s low links count" % (n)) + short.append(n) - # load the assembly gfa - G = nx.Graph() - logging_f = open (os.path.join(output_dir, LOGGING_FILENAME), 'w') - graph_functions.load_indirect_graph(graph_gfa, G) + C.remove_nodes_from(short) + logging.info(f'Currently {C.number_of_nodes()} nodes') + for e in hicGraph.edges(current_component): + # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) + if e[0] in C and e[1] in C and (matchGraph.get_edge_data(e[0], e[1]) == None or matchGraph.get_edge_data(e[0], e[1])['weight'] == 0): + # if edges are too distant in graph, hi-c info is trash + # using homologous edges when counting distances to help with coverage gaps + similar_edges = [set(), set()] + for ind in range(0, 2): + for match_edge in matchGraph.edges(e[ind]): + similar_edges[ind].add(match_edge[0]) + similar_edges[ind].add(match_edge[1]) + similar_edges[ind].add(e[ind]) +#TODO: likely this is not needed anymore since we already added links between homologous edges. + C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) + logging.info(f'Currently {C.number_of_nodes()} in current component') + logging.debug(C.nodes()) + if C.number_of_nodes() > 1: + for u, v, w in matchGraph.edges.data("weight"): + if u in C and v in C: + #No more clearing of links between nonhomologous nodes with similar regions. 
+ if w != None and w != 0: + C.add_edge(u, v, weight=w) + for edge in C.edges: + logging.debug(f'HIC edge: {edge} {C.edges[edge]}') + return C + +def score_partition(C, partition): + sum_w = 0 + for part_id in range(0, 2): + for i in partition[part_id]: + for j in partition[part_id]: + if [i, j] in C.edges(): + sum_w += C.edges[i, j]['weight'] + if C.edges[i, j]['weight'] < 0: + + logging.error(f"Negative edge {i} {j} with weight {C.edges[i, j]['weight']} IN THE SAME PARTITION") + exit() + return sum_w +def score_swap(C, partition, to_swap): + upd = 0 + for n in to_swap: + for i in range (2): + if n in partition[i]: + for neighbour in C.neighbors(n): + if not neighbour in to_swap: + if neighbour in partition[i]: + upd -= C.edges[n, neighbour]['weight'] + elif neighbour in partition[1 - i]: + upd += C.edges[n, neighbour]['weight'] + return upd + +def min_nor_dist(n1, n2, dists): + res = 10000000000 + for or1 in ['-', '+']: + for or2 in ['-', '+']: + or_n1 = n1 + or1 + or_n2 = n2 + or2 + if or_n1 in dists and or_n2 in dists[or_n1]: + res = min (res, dists[or_n1][or_n2]) + return res + +def comnponent_size (current_component, G): + total_len = 0 + for n in current_component: + #oriented or not we do not care + if n in G.nodes(): + total_len += G.nodes[n]['length'] + elif n + '+' in G.nodes(): + total_len += G.nodes[n + '+']['length'] + return total_len + +def is_duplication_like(n1, n2, dists, or_G): + CLOSE_SUSP_HOMOLOGY_DIST = 5000000 + if or_G.nodes[n1 + '+']['length'] > CLOSE_SUSP_HOMOLOGY_DIST and or_G.nodes[n2 + '+']['length'] > CLOSE_SUSP_HOMOLOGY_DIST: + return + for or1 in ['-', '+']: + for or2 in ['-', '+']: + or_n1 = n1 + or1 + or_n2 = n2 + or2 + if or_n1 in dists and or_n2 in dists[or_n1]: + dist = (dists[or_n1][or_n2] - or_G.nodes[or_n1]['length'] - or_G.nodes[or_n2]['length']) / 2 + if dist < CLOSE_SUSP_HOMOLOGY_DIST: + if or_n2 in dists and or_n1 in dists[or_n2] and dists[or_n2][or_n1] < CLOSE_SUSP_HOMOLOGY_DIST: + logging.info(f"Nodes {or_n1} and {or_n2} from haploid component in loop, {dist}") + else: + logging.info(f"Nodes {or_n1} and {or_n2} one direction close {dist} but not another") + return True + return False +def clear_links(node_pairs, mg): + for pair in node_pairs: + mg.matchGraph.remove_edge(pair[0], pair[1]) + +def remove_pseudo_homology(current_component, or_G, dists, mg): + #we never remove true homology between really large nodes + #max 1/3 among all homologies can be cleared + MAX_UNDIPLOID_FRACTION = 3 + + total_len = comnponent_size(current_component, or_G) + clear_diploids = [] + total_homology_length = 0 + clear_homology_length = 0 + for n1 in current_component: + for n2 in current_component: + if n1 < n2 and mg.isHomologousNodes(n1, n2, True): + total_homology_length += mg.getHomologyLength(n1, n2) + if is_duplication_like(n1, n2, dists, or_G): + clear_diploids.append([n1, n2]) + clear_homology_length += mg.getHomologyLength(n1, n2) + if (mg.isDiploid(current_component)): + if clear_homology_length * MAX_UNDIPLOID_FRACTION < total_homology_length: + if clear_homology_length > 0: + logging.info(f"Cleaning diploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + clear_links(clear_diploids, mg) + else: + logging.info(f"NOT cleaning diploid component {clear_diploids}, too high fraction. 
hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + else: + if clear_homology_length > 0: + logging.info(f"Cleaning haploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + if clear_homology_length == total_homology_length: + return "no_diploid" + return "diploid" + +def process_haploid_component(current_component, G, tsv_file, haploid_component_count): + haploid_component_count += 1 + haplotype = haploid_component_count % 2 + nodes_to_report = [] + for n in sorted(current_component): + if G.nodes[n]['length'] > MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: + nodes_to_report.append(n) + #TODO: possibly total length condition? Better not assign colors to distal bits... + if len(nodes_to_report) > 1: + for n in nodes_to_report: + report_phased_node(n, haplotype, tsv_file) + +def output_graph_stats(G): degrees = [val for (node, val) in G.degree()] mean = sum(degrees) / G.number_of_nodes() variance = sum([((x - mean) ** 2) for x in degrees]) / G.number_of_nodes() res = variance ** 0.5 - sys.stderr.write("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f\n" % ( + logging.info("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f" % ( G.number_of_nodes(), G.number_of_edges(), mean, res, mean + 5 * res)) +def loadHiCWithFiltering(hic_byread, mashmap_nonhpc, min_alignment): + #mashmap line: utig4-345 198652527 104460000 104510000 + utig4-838 52114952 34308831 34345700 7 50000 13 id:f:0.951746 kc:f:0.940909 + hicGraph = nx.Graph() + nonhpcHomology = match_graph.HomologyStorage(mashmap_nonhpc, min_alignment) + hic_file = open(hic_byread, 'r') + for line in hic_file: + if "#" in line: + continue + line = line.strip().split() + if len(line) < 3: + continue + #introducing multimappers + if line[1].find(",") != -1 or line[2].find(",") != -1: + continue + if line[1] == line[2]: + continue + hicGraph.add_node(line[1]) + hicGraph.add_node(line[2]) + if len(line) == 6: + first_coord = int(line[4]) + second_coord = int(line[5]) + if nonhpcHomology.isInFilteredInterval(line[1], line[2], first_coord, second_coord): + continue + if len(line) > 3: + add_w = int(line[3]) + else: + add_w = 1 + w = hicGraph.get_edge_data(line[1], line[2], 0) + if w == 0: + hicGraph.add_edge(line[1], line[2], weight=add_w) + else: + w = w['weight'] + add_w + hicGraph[line[1]][line[2]]['weight'] = w + return hicGraph + +#We use nonhpc_mashmap for read filtering and hpc_mashmap for homology detection. Cound use nonhpc for both, but then there'll be hpc/nonhpc length-related issues +def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_dir, no_rdna, uneven_depth): + G = nx.Graph() + gf.load_indirect_graph(graph_gfa, G) + output_graph_stats(G) delete_set = set() largest_component = max(nx.connected_components(G), key=len) + cur_col = 0 old_colors = {} for current_component in sorted(nx.connected_components(G), key=len, reverse=True): @@ -217,19 +482,10 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une old_colors[e] = cur_col cur_col += 1 -#TODO: no_rdna is false, so efficiently we are still removing rDNA component. Should we? 
- if not (no_rdna): - #Store rDNA component, not to add additional links from matchGraph - #sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n") - #Here we remove large connected components of short edge, just to exclude rDNA cluster - delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT) - else: - sys.stderr.write(f"Not using rDNA component removal heuristics\n") filtered_graph = open(os.path.join(output_dir, FILTERED_GFA_FILENAME), 'w') tsv_output = os.path.join(output_dir, RESULT_FILENAME) tsv_file = open(tsv_output, 'w') tsv_file.write("node\tmat\tpat\tmat:pat\tcolor\n") - translate = open(graph_gfa, 'r') for line in translate: @@ -247,6 +503,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une #TODO: only place which used it is likely outdated #Also we have special function... nodelines = [] + or_G = nx.DiGraph() + gf.load_direct_graph(graph_gfa, or_G) edges = {} for l in open(graph_gfa, 'r'): parts = l.strip().split('\t') @@ -263,291 +521,158 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une edges[fromnode].add(tonode) edges[gf.revnode(tonode)].add(gf.revnode(fromnode)) - -#let's find that nodes that have multiple extension - multiple_ext = set() - for node in edges: - if len(node) > 1: - multiple_ext.add(node) #dirty calculation median coverage, considering that most of the phasing is done - sorted_nodes = sorted(G.nodes(data=True), key=lambda node: node[1]['length']) med_cov = getMedianCov(G.nodes(data=True)) MAX_COV = med_cov * 1.5 if (uneven_depth): - sys.stderr.write(f"Will not use coverage based homozygous nodes detection\n") + logging.info(f"Will not use coverage based homozygous nodes detection") else: - sys.stderr.write(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}\n") - - + logging.info(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}") + # load hic connections based on mappings, weight corresponds to number of times we see a connection - hicGraph = graph_functions.loadHiCGraph(hic_byread) + hicGraph = loadHiCWithFiltering(hic_byread, nonhpc_mashmap, MIN_ALIGNMENT) + #hicGraph = gf.loadHiCGraph(hic_byread) compressed_file = open(os.path.join(output_dir, HIC_COMPRESSED_FILENAME), 'w') - max_w = 0 + for node1, node2 in hicGraph.edges(): - if max_w < hicGraph[node1][node2]["weight"]: - max_w = hicGraph[node1][node2]["weight"] compressed_file.write(f'X {node1} {node2} {hicGraph[node1][node2]["weight"]}\n') - FIXED_WEIGHT = max_w - sys.stderr.write(f'Constant for neighbouring edges set to be {FIXED_WEIGHT} (but not used), for homologous edges {-10 * FIXED_WEIGHT} \n') #Adding link between matched edges to include separated sequence to main component #TODO: only one of those should be used - mg = match_graph.MatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) + mg = match_graph.MatchGraph(hpc_mashmap, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT) matchGraph = mg.getMatchGraph() - component_colors = graph_functions.getComponentColors(G) + component_colors = gf.getComponentColors(G) #reconnecting homologous nodes for [v1,v2] in matchGraph.edges(): if v1 in G.nodes and v2 in G.nodes and matchGraph[v1][v2]['weight'] < 0: if component_colors[v1] != component_colors[v2]: - logging_f.write(f"Adding graph link between homologous {v1} {v2}, components {component_colors[v1]} and {component_colors[v2]}\n") + logging.info(f"Adding 
graph link between homologous {v1} {v2}, components {component_colors[v1]} and {component_colors[v2]}") G.add_edge(v1, v2) - sys.stderr.write("Loaded hic info with %d nodes and %d edges\n" % (hicGraph.number_of_nodes(), hicGraph.number_of_edges())) + logging.info(f"Loaded hic info with {hicGraph.number_of_nodes()} nodes and {hicGraph.number_of_edges()} edges") compressed_file.close() dists = dict(nx.all_pairs_dijkstra_path_length(G, weight=lambda u, v, d: G.nodes[v]['length'])) - sys.stderr.write("Distances counted\n") + logging.info("Distances counted") # connected components decomposition and log the IDs and partition each one # currently not going to do the right thing on rDNA component + haploid_component_count = 0 + dists = dict(nx.all_pairs_dijkstra_path_length(or_G, weight=lambda u, v, d: or_G.edges[u, v]['mid_length'])) for current_component in sorted(nx.connected_components(G), key=len, reverse=True): - logging_f.write("Connected component with %d nodes is: %s\n" % (len(current_component), current_component)) + if len(current_component) > 1: + logging.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) + + cleared = remove_pseudo_homology(current_component, or_G, dists, mg) + #only backward compatibility with KL + if not mg.isDiploid(current_component): + haploid_component_count += 1 + if cleared == "no_diploid": + #TODO: backwards compatibility not to change distal bits, to_remove + #if component_size(current_component, G) > 10000000: + process_haploid_component(current_component, G, tsv_file, haploid_component_count) + continue - total_l = 0 + C = create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists) + if len(C.nodes) == 1: + continue + #TODO: cycle for randomized starts + #optimize_partition(C, parts, swap_together) + + best_score = FIXED_HOMOLOGY_WEIGHT * C.number_of_nodes() * C.number_of_nodes() + best_part = [set(), set()] + init_parts, opposite = create_initial_partition_noKL(C, matchGraph) + if len(init_parts[0]) == 0 or len(init_parts[1]) == 0: + logging.warning(f"Trying to phase component with empty set in initial partition, reporting it as haploid") + #May happen in weird cases of uneven coverage when some nodes from connected_component are removed from C + haploid_component_count += 1 + process_haploid_component(current_component, G, tsv_file, haploid_component_count) + continue + for seed in range(0, KLIN_STARTS): # iterate on starting partition + random.seed(seed) + parts = random_swap(init_parts, opposite, seed) + part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) + logging.debug(f"init_part:\n{sorted(part[0])}\n{sorted(part[1])}") + + sum_w = score_partition(C, part) + + if (sum_w > best_score): + logging.info(f'Seed {seed} score {sum_w} improved over {best_score}') + best_part = part + best_score = sum_w + logging.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + best_part = unglue_nodes(best_part) + #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) +# best_part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) +# logging.debug(f"parts:\n{sorted(parts[0])}\n{sorted(parts[1])}") + #logging.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + """ + """ + add_part = [set(), set()] + only_weights = {} for n in current_component: - total_l += 
G.nodes[n]['length'] - #TODO: currently just a debug message! - # we likely need to skip phasing for non-diploids and assign the same color for the component, but this can cause troubles for short arms until we stop removing rDNA tangle. - if total_l > 1000000 and not mg.isDiploid(current_component): - logging_f.write(f"Component is not diploid!\n") - - - C = nx.Graph() - # rebuild the graph from this component using only edges in the hic graph - C.add_nodes_from(current_component) - - # first we ignore any nodes that are too short or have too few links. - short = [] - - not_to_big = [] - for n in C.nodes(): - print (n) - if G.nodes[n]['coverage'] < MAX_COV: - not_to_big.append((n, G.nodes[n])) - local_max_cov = 1.5 * getMedianCov(not_to_big) - if local_max_cov < 0: - local_max_cov = MAX_COV - for n in C.nodes(): - if n not in G: - sys.stderr.write("Error got a node not in original graph %s !" % (n)) - sys.exit() - if G.nodes[n]['coverage'] > local_max_cov and (not uneven_depth): - logging_f.write("While partitoning dropping node %s coverage too high\n" % (n)) - short.append(n) - elif not (n in matchGraph) and uneven_depth: - logging_f.write("While partitoning dropping node %s uneven coverage and no matches\n" % (n)) - short.append(n) - elif G.nodes[n]['length'] < MIN_LEN: +# if G.nodes[n]['length'] < MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: + if (not n in C.nodes()) and G.nodes[n]['coverage'] < MAX_COV: - collapseOrientedNode(edges, n) - short.append(n) - - else: - good = False + weights = [0, 0] + all_weights = [0, 0] + total_w = 0 for e in hicGraph.edges(n): - - if (e[0] != n or e[1] != n) and hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT \ - and (e[0] in C and e[1] in C): - good = True - #Let's phase homologous nodes randomly then - if n in matchGraph and len(matchGraph.edges(n)) > 0: - for ec in matchGraph.edges(n): - if matchGraph.edges[ec]['weight'] < 0: - good = True - break - - if not good: - logging_f.write("While partitoning dropping node %s low links count\n" % (n)) - short.append(n) - - C.remove_nodes_from(short) - logging_f.write(f'Currently {C.number_of_nodes()} nodes\n') - # Adding some auxilary vertices to allow slightly unbalanced partitions - # sqrt/2 is quite arbitrary and subject to change - aux_nodes = int(math.sqrt(C.number_of_nodes() // 2)) - if C.number_of_nodes() <= 3: - aux_nodes = 0 - for i in range(0, aux_nodes): - C.add_node("Aux" + str(i)) - for e in hicGraph.edges(current_component): - # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) - if e[0] in C and e[1] in C and matchGraph.get_edge_data(e[0], e[1]) == None: - # if edges are too distant in graph, hi-c info is trash - # using homologous edges when counting distances to help with coverage gaps - similar_edges = [set(), set()] - for ind in range(0, 2): - for match_edge in matchGraph.edges(e[ind]): - similar_edges[ind].add(match_edge[0]) - similar_edges[ind].add(match_edge[1]) - similar_edges[ind].add(e[ind]) - #TODO: likely this is not needed anymore since we already added links between homologous edges. 
- for e0like in similar_edges[0]: - for e1like in similar_edges[1]: - if e0like in dists and e1like in dists[e0like]: #and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']: - C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) - break - logging_f.write(f'Currently {C.number_of_nodes()} in current component\n') - - - if C.number_of_nodes() > 1: - for u, v, w in matchGraph.edges.data("weight"): - if u in C and v in C: - if w != None: - C.add_edge(u, v, weight=w) - - - - res = checkXYcomponent(current_component, matchGraph, G, edges) - if res != [0, 0]: - sys.stderr.write(f"XY found, {res[0]} {res[1]}, adding fake links\n") - if res[0] in C and res[1] in C: - C.add_edge(res[0], res[1], weight=-10 * FIXED_WEIGHT) - C.add_edge(res[1], res[0], weight=-10 * FIXED_WEIGHT) - for edge in C.edges: - logging_f.write(f'HIC edge: {edge} {C.edges[edge]}\n') - - - best_score = FIXED_WEIGHT * C.number_of_nodes() * C.number_of_nodes() - - - - - for seed in range(0, KLIN_STARTS): # iterate on starting partition - random.seed(seed) - p1 = [] - p2 = [] - for n in sorted(C.nodes()): - if n in matchGraph and len(matchGraph.edges(n)) > 0: - #TODO: replace with something reasonable! multiple nodes will not work here. Transform each homology component into one node before?.. Anyway, it still should be fixed later with K-L iterations. - for ec in matchGraph.edges(n): - if matchGraph.edges[ec]['weight'] >= 0: - continue - if ec[0] == n and ec[1] in p1: - if n not in p1: - p2.append(n) - elif ec[0] == n and ec[1] in p2: - if n not in p2: - p1.append(n) - elif ec[1] == n and ec[0] in p1: - if n not in p1: - p2.append(n) - elif ec[1] == n and ec[0] in p2: - if n not in p2: - p1.append(n) - elif n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - - if n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - else: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - # print("Initial partitions are %s and %s" % (set(p1), set(p2))) - if len(p1) * len(p2) > 0: -# logging_f.write (f"{len(p1)} {len(p2)} {len(C.nodes())}\n") - #Debug for weird crashes - inter = list(set(p1).intersection(p2)) - missing = set(C.nodes()) - set(p1) - set(p2) - if len(inter) > 0: - logging_f.write (f"Intersecting {inter}\n") - if len(missing) > 0: - logging_f.write (f"Missing {missing}\n") - - part = community.kernighan_lin.kernighan_lin_bisection(C, partition=[set(p1), set(p2)], max_iter=KLIN_ITER, - weight='weight', seed=seed) - sum_w = 0 - for i in part[0]: - for j in part[1]: - if [i, j] in C.edges(): - # print (f'{i} {j} edge {C.edges[i,j]}') - sum_w += C.edges[i, j]['weight'] - - if (sum_w < best_score): - #lets forbid Aux only components - if check_non_empty(part[0], G) and check_non_empty(part[1], G): - logging_f.write(f'Seed {seed} score {sum_w} improved over {best_score}\n') - best_part = part - best_score = sum_w - #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) - fixUnbalanced(best_part, C, G) - logging_f.write(f'RES\t{best_part}\n') - - add_part = [set(), set()] - only_weights = {} - for n in current_component: - if G.nodes[n]['length'] < MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: - weights = [0, 0] - all_weights = [0, 0] - total_w = 0 - for e in hicGraph.edges(n): - if (e[0] != n or e[1] != n) and (e[0] in current_component and e[1] in current_component): - logging_f.write(f'DEBUG: edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}\n') - for 
ind in range(0, 2): - if e[0] in best_part[ind] or e[1] in best_part[ind]: - if hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT: - logging_f.write(f'DEBUG: {e} edge to partition {ind}!\n') - weights[ind] += hicGraph[e[0]][e[1]]["weight"] - all_weights[ind] += hicGraph[e[0]][e[1]]["weight"] - total_w += hicGraph[e[0]][e[1]]["weight"] - if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: - add_part[0].add(n) - logging_f.write(f"Edge {n} assigned color 0\n") - elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: - add_part[1].add(n) - logging_f.write(f"Edge {n} assigned color 1\n") - else: - logging_f.write(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}\n") - logging_f.write(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}\n") - if (all_weights[0] > SIGNIFICANT_MAJORITY * all_weights[1] or all_weights[1] > SIGNIFICANT_MAJORITY * all_weights[0]) and (all_weights[0] > MIN_WEIGHT or all_weights[1]> MIN_WEIGHT): - logging_f.write(f"Edge {n} real weights allows to make a decision forbidden by weights!\n") + if (e[0] != n or e[1] != n) and (e[0] in current_component and e[1] in current_component): + #logging.debug(f'edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}') + for ind in range(0, 2): + if e[0] in best_part[ind] or e[1] in best_part[ind]: + if hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT: + logging.debug(f'{e} edge to partition {ind}!') + weights[ind] += hicGraph[e[0]][e[1]]["weight"] + all_weights[ind] += hicGraph[e[0]][e[1]]["weight"] + total_w += hicGraph[e[0]][e[1]]["weight"] + if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: + add_part[0].add(n) + logging.debug(f"Edge {n} assigned color 0") + elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: + add_part[1].add(n) + logging.debug(f"Edge {n} assigned color 1") + else: + logging.debug(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}") + logging.debug(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}") + if (all_weights[0] > SIGNIFICANT_MAJORITY * all_weights[1] or all_weights[1] > SIGNIFICANT_MAJORITY * all_weights[0]) and (all_weights[0] > MIN_WEIGHT or all_weights[1]> MIN_WEIGHT): + logging.debug(f"Edge {n} real weights allows to make a decision forbidden by weights!") + if G.nodes[n]['length'] < MIN_LEN: only_weights[n] = all_weights - elif G.nodes[n]['length'] < MIN_LEN: - logging_f.write(f"Edge {n} did not pass coverage check\n") - - logging_f.write(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition\n') - logging_f.write(f'RES\t{add_part}\n') - best_part[0].update(add_part[0]) - best_part[1].update(add_part[1]) - right = False - for ind in range (0, 2): - for contig in sorted(best_part[ind]): - if contig in current_component: - if ind == 1: - tsv_file.write(f'{contig}\t0\t100000\t0:100000\t#8888FF\n') else: - tsv_file.write(f'{contig}\t100000\t0\t100000:0\t#FF8888\n') - for contig in sorted(only_weights.keys()): - tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') - + if all_weights[0] > all_weights[1]: + add_part[0].add(n) + logging.debug(f"Edge {n} assigned color 0 by additional all_weights") + else: + add_part[1].add(n) + logging.debug(f"Edge {n} assigned color 1 by additional all_weights") + elif G.nodes[n]['length'] < MIN_LEN: + logging.debug(f"Edge {n} did not pass coverage check") + + logging.info(f'Added 
{len(add_part[0]) + len (add_part[1])} short edges to bipartition') + logging.info(f'Added short edges\t{add_part}') + best_part[0].update(add_part[0]) + best_part[1].update(add_part[1]) + for ind in range (0, 2): + for contig in sorted(best_part[ind]): + #filtering out Aux nodes + if contig in current_component: + report_phased_node(contig, ind, tsv_file) + for contig in sorted(only_weights.keys()): + tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') + logging.info("Phasing successfully finished") if __name__ == "__main__": - if len(sys.argv) != 7: - print(f'Usage: {sys.argv[0]} graph.gfa homologous_nodes.matches hic_byread output_dir, no_rdna, uneven_depth') + if len(sys.argv) != 8: + print(f'Usage: {sys.argv[0]} graph.gfa hpc.mashmap nonhpc.mashmap hic_byread output_dir no_rdna uneven_depth') exit() - run_clustering(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]) + run_clustering(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7]) diff --git a/src/scripts/fasta_combine.py b/src/scripts/fasta_combine.py index 8dde500f..a53cdd20 100755 --- a/src/scripts/fasta_combine.py +++ b/src/scripts/fasta_combine.py @@ -23,7 +23,7 @@ mode = sys.argv[1] output_name = sys.argv[2] namedict = seq.readNameMap(sys.argv[3]) - scfmap = seq.readScfMap(sys.argv[4]) + scfmap,_ = seq.readScfMap(sys.argv[4]) filenames = sys.argv[5:] else: diff --git a/src/scripts/fasta_util.py b/src/scripts/fasta_util.py index 757e3cb2..41654f12 100755 --- a/src/scripts/fasta_util.py +++ b/src/scripts/fasta_util.py @@ -144,6 +144,7 @@ def replaceName(name, namedict, mode): # def readScfMap(filename): scfmap = dict() + names = dict() ctgname = "" ctglist = [] @@ -153,11 +154,13 @@ def readScfMap(filename): if words[0] == "path": ctgname = words[1] + path = words[2] ctglist = [] elif words[0] == "end": scfmap[ctgname] = ctglist + names[ctgname] = path else: ctglist.append(words[0]) - return(scfmap) + return(scfmap, names) diff --git a/src/scripts/get_layout_from_mbg.py b/src/scripts/get_layout_from_mbg.py index ad5be91c..ecb5d78d 100755 --- a/src/scripts/get_layout_from_mbg.py +++ b/src/scripts/get_layout_from_mbg.py @@ -5,6 +5,8 @@ import re import graph_functions as gf +random.seed(42) + mapping_file = sys.argv[1] edge_overlap_file = sys.argv[2] @@ -650,7 +652,7 @@ def get_exact_match_length(clusters): # we look at randomly placed reads and pick a single location for each one # if we're first we'll select a location, single value between 1 and number of times it's used if line[0] not in read_output_counts: - read_output_counts[line[0]] = ("", "", 0) + read_output_counts[line[0]] = ("", "", 0, read_actual_counts[line[0]]) #sys.stderr.write("For reach %s which occurs %d times in contigs %s we are seing it for the first time in %s and selecting"%(line[0], read_actual_counts[line[0]], str(read_actual_contigs[line[0]]), contig)) if len(read_actual_contigs[line[0]]) <= 1: #sys.stderr.write(" read is used %d times in %s contigs so placing everywhere "%(read_actual_counts[line[0]], str(read_actual_contigs[line[0]]))) @@ -659,12 +661,12 @@ def get_exact_match_length(clusters): read_actual_counts[line[0]] = random.randint(1, read_actual_counts[line[0]]) #sys.stderr.write(" position %d\n"%(read_actual_counts[line[0]])) #check if we are we right instance - read_output_counts[line[0]] = (read_output_counts[line[0]][0], read_output_counts[line[0]][1], read_output_counts[line[0]][2] 
+ 1) + read_output_counts[line[0]] = (read_output_counts[line[0]][0], read_output_counts[line[0]][1], read_output_counts[line[0]][2] + 1, read_output_counts[line[0]][3]) #sys.stderr.write("The count for read %s is now %s\n"%(line[0], str(read_output_counts[line[0]]))) if read_actual_counts[line[0]] > 0 and read_output_counts[line[0]][2] != read_actual_counts[line[0]]: continue # record the contig and coordinate we will use - read_output_counts[line[0]] = (contig, line[1], read_actual_counts[line[0]]) + read_output_counts[line[0]] = (contig, line[1], read_actual_counts[line[0]], read_output_counts[line[0]][3]) start_pos = min(start_pos, line[1]) start_pos = min(start_pos, line[2]) end_pos = max(end_pos, line[1]) @@ -685,7 +687,7 @@ def get_exact_match_length(clusters): end = line[2] - start_pos #sys.stderr.write("For read %s which has been selected to be in position %s we are expecting %d\n"%(line[0], str(read_output_counts[line[0]]), read_actual_counts[line[0]])) if read_actual_counts[line[0]] < 0 or (read_output_counts[line[0]][0] == contig and read_output_counts[line[0]][1] == line[1]): - print(f"{line[0]}\t{bgn}\t{end}", file=tig_layout_file) + print(f"{line[0]}\t{bgn}\t{end}\t{read_output_counts[line[0]][3]}", file=tig_layout_file) print(f"end", file=tig_layout_file) tig_layout_file.close() diff --git a/src/scripts/get_utig1_from_utig4.py b/src/scripts/get_utig1_from_utig4.py new file mode 100755 index 00000000..a4767e6e --- /dev/null +++ b/src/scripts/get_utig1_from_utig4.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python + +import random +import sys +import re +import graph_functions as gf + +MAX_GAP_SIZE=100000 +mapping_file = sys.argv[1] +edge_overlap_file = sys.argv[2] + +#Either paths from rukki or just single nodes +paths_file = sys.argv[3] + +nodelens_file = sys.argv[4] + +#transform paths to base elements - mbg nodes and gaps. 
+def get_leafs(path, mapping, edge_overlaps, raw_node_lens): + path_len = 0 + for i in range(0, len(path)): + path_len += raw_node_lens[path[i][1:]] + if i > 0: path_len -= edge_overlaps[gf.canon(path[i-1], path[i])] + result = [(n, 0, raw_node_lens[n[1:]]) for n in path] + overlaps = [] + for i in range(1, len(path)): + overlaps.append(edge_overlaps[gf.canon(path[i-1], path[i])]) + current_len = 0 + for i in range(0, len(result)): + assert result[i][2] > result[i][1] + assert result[i][2] <= raw_node_lens[result[i][0][1:]] + assert result[i][1] >= 0 + current_len += result[i][2] - result[i][1] + if i > 0: current_len -= overlaps[i-1] + assert current_len == path_len + while True: + any_replaced = False + new_result = [] + new_overlaps = [] + for i in range(0, len(result)): + if result[i][0][1:] not in mapping: + new_result.append(result[i]) + if i > 0: new_overlaps.append(overlaps[i-1]) + else: + any_replaced = True + part = [(n, 0, raw_node_lens[n[1:]]) for n in mapping[result[i][0][1:]][0]] + part[0] = (part[0][0], part[0][1] + mapping[result[i][0][1:]][1], part[0][2]) + part[-1] = (part[-1][0], part[-1][1], part[-1][2] - mapping[result[i][0][1:]][2]) + if result[i][0][0] == "<": + part = [(gf.revnode(n[0]), raw_node_lens[n[0][1:]] - n[2], raw_node_lens[n[0][1:]] - n[1]) for n in part[::-1]] + old_start_clip = result[i][1] + old_end_clip = (raw_node_lens[result[i][0][1:]] - result[i][2]) + part[0] = (part[0][0], part[0][1] + old_start_clip, part[0][2]) + part[-1] = (part[-1][0], part[-1][1], part[-1][2] - old_end_clip) + new_result += part + if i > 0: new_overlaps.append(overlaps[i-1]) + for j in range(1, len(part)): + new_overlaps.append(edge_overlaps[gf.canon(part[j-1][0], part[j][0])]) + assert len(new_result) == len(new_overlaps)+1 + assert len(new_result) >= len(result) + if not any_replaced: break + result = new_result + overlaps = new_overlaps + current_len = 0 + for i in range(0, len(result)): + # strangely, this assertion is not always true. 
+ # The ONT based k-mer increase can create a node where the overlap is greater than the initial MBG node size + # and in that case the initial MBG node will have a "negative" length within the contig + # assert result[i][2] > result[i][1] + assert result[i][2] <= raw_node_lens[result[i][0][1:]] + assert result[i][1] >= 0 + current_len += result[i][2] - result[i][1] + if i > 0: current_len -= overlaps[i-1] + assert current_len == path_len + return (result, overlaps) + +raw_node_lens = {} +with open(nodelens_file) as f: + for l in f: + parts = l.strip().split('\t') + assert parts[0] not in raw_node_lens or raw_node_lens[parts[0]] == int(parts[1]) + raw_node_lens[parts[0]] = int(parts[1]) + +edge_overlaps = {} +with open(edge_overlap_file) as f: + for l in f: + parts = l.strip().split('\t') + assert parts[0] == "L" + fromnode = (">" if parts[2] == "+" else "<") + parts[1] + tonode = (">" if parts[4] == "+" else "<") + parts[3] + overlap = int(parts[5][:-1]) + key = gf.canon(fromnode, tonode) + if key in edge_overlaps: assert edge_overlaps[key] == overlap + edge_overlaps[key] = overlap + +node_mapping = {} +cut_mapping = {} +with open(mapping_file) as f: + for l in f: + + parts = l.strip().split('\t') + assert parts[0] not in node_mapping + if not re.search(r"utig\d+[a-z]?-" , parts[1]): + continue + path = parts[1].split(':')[0].replace('<', "\t<").replace('>', "\t>").strip().split('\t') + left_clip = int(parts[1].split(':')[1]) + right_clip = int(parts[1].split(':')[2]) + node_mapping[parts[0]] = (path, left_clip, right_clip) + left_len = raw_node_lens[parts[0]] + right_len = 0 + for i in range(0, len(path)): + right_len += raw_node_lens[path[i][1:]] + if i > 0: right_len -= edge_overlaps[gf.canon(path[i-1], path[i])] + assert left_len == right_len - left_clip - right_clip + + # save the mapping of cut nodes to their respective coordinates so we can find them later + if (len(path) == 1 and path[0][1:] in raw_node_lens): + new_name = path[0][1:] + ":" + str(left_clip) + ":" + str(raw_node_lens[path[0][1:]]-right_clip) + cut_mapping[new_name] = parts[0] +pieceid = 0 + +#all these contain info about contigs - here nodes or rukki paths split by N +#paths are transformed into mbg nodes and gaps with get_leafs procedure +contig_lens = {} +contig_node_offsets = {} +contig_pieces = {} +with open(paths_file) as f: + for l in f: + lp = l.strip().split('\t') + + # Find all words that + # begin with [<>], contain anything but [ + # begin with [N, contain digits and end with N] or N:optional-description] + # we dump the description here and only keep the N, digits N] part + # + fullname = lp[0] + pathfull = re.findall(r"([<>][^[]+|\[N\d+N(?:[^\]]+){0,1}\])", lp[1]) + + contig_pieces[fullname] = [] + + for pp in pathfull: + #pp is either path without gaps or gap.
In the latter case do nothing + gp = re.match(r"\[(N\d+N)(?:[^\]]+){0,1}\]", pp) + if gp: + tuned_numn = min(round(int(gp.group(1)[1:-1]) * 1.5), MAX_GAP_SIZE) + contig_pieces[fullname].append("[N" + str(tuned_numn) + "N:gap]") + continue + + pieceid = pieceid + 1 + pathname = fullname + + (path, overlaps) = get_leafs(re.findall(r"[<>][^<>]+", pp), node_mapping, edge_overlaps, raw_node_lens) + # skip a path if the only thing in it is a gapfill + if len(path) == 1 and path[0][0][1:4] == "gap": + continue + + contig_node_offsets[pathname] = [] + pos = 0 + end = -1 + for i in range(0, len(path)-1): + contig_node_offsets[pathname].append(pos) + pos += path[i][2] - path[i][1] + pos -= overlaps[i] + contig_node_offsets[pathname].append(pos) + contig_lens[pathname] = contig_node_offsets[pathname][-1] + path[-1][2] - path[-1][1] + check_len = 0 + for i in range(0, len(path)): + check_len += path[i][2] - path[i][1] + if i > 0: check_len -= overlaps[i-1] + assert contig_lens[pathname] == check_len + pathstr = "" + for i in range(0, len(path)): + # build a name using the contig without the <> but also append coordinates if it's a partial match to check for cut node + # if a cut version exists, use that name instead, otherwise use the original node name + new_name = path[i][0][1:] + if path[i][1] != 0 or path[i][2] != raw_node_lens[new_name]: + if path[i][0][0] == ">": + new_name = path[i][0][1:] + ":" + str(path[i][1]) + ":" + str(path[i][2]) + else: + new_name = path[i][0][1:] + ":" + str(raw_node_lens[new_name]-path[i][2]) + ":" + str(raw_node_lens[new_name]-path[i][1]) + if new_name not in cut_mapping: + new_name = path[i][0][1:] + + # when we see the name in our path already and the offset is earlier than the largest we have already seen, this is an overlap + # we skip these overlapping nodes from the path and continue at the new unique/larger offset node + #sys.stderr.write("Checking node %s with coordinates %d-%d and offset is %d vs %d and is already used is %d\n"%(path[i][0], path[i][1], path[i][2], (contig_node_offsets[pathname][i]-path[i][1]), end, (new_name in pathstr))) + if (contig_node_offsets[pathname][i]-path[i][1]) <= end and new_name in pathstr: + continue + end = contig_node_offsets[pathname][i]-path[i][1] + if path[i][1] != 0 or path[i][2] != raw_node_lens[path[i][0][1:]]: + if (new_name in cut_mapping): + pathstr += path[i][0] + "_" + cut_mapping[new_name].strip().split("_")[-1] + else: + pathstr += path[i][0] + else: + pathstr += path[i][0] + contig_pieces[fullname].append(pathstr) + +for fullname in contig_pieces: + if "name" in fullname: continue + sys.stdout.write(fullname + "\t" + "".join(contig_pieces[fullname]) + "\n") diff --git a/src/scripts/graph_functions.py b/src/scripts/graph_functions.py index 49cf38f3..bae7bbcc 100755 --- a/src/scripts/graph_functions.py +++ b/src/scripts/graph_functions.py @@ -308,6 +308,7 @@ def topological_sort(nodelens, edges): assert belongs_to_component[edge[0]] >= belongs_to_component[node] return (result, belongs_to_component) +#Likely not used anymore def loadHiCGraph(hic_byread_file): hicGraph = nx.Graph() hic_file = open(hic_byread_file, 'r') diff --git a/src/scripts/launch_phasing.py b/src/scripts/launch_phasing.py index 5ee24665..5f112ca1 100755 --- a/src/scripts/launch_phasing.py +++ b/src/scripts/launch_phasing.py @@ -6,6 +6,7 @@ from os import listdir from os.path import isfile, join import cluster +import logging_utils if __name__ == "__main__": if len(sys.argv) < 4: print(f'Usage: {sys.argv[0]} ') @@ -26,6 +27,7 @@
os.makedirs(output_dir, exist_ok=True) matches_file = os.path.join(output_dir, "unitigs_hpc50.mashmap") + nonhpc_mashmap = os.path.join(output_dir, "unitigs_nonhpc50.mashmap") hic_file = os.path.join(output_dir, "hic_mapping.byread.output") if not os.path.exists(hic_file): hic_file = os.path.join(output_dir, "hic.byread.compressed") @@ -34,7 +36,8 @@ hic_file = compressed_hic noseq_gfa = os.path.join(output_dir, "unitigs.hpc.noseq.gfa") - cluster.run_clustering(noseq_gfa, matches_file, hic_file, output_dir, no_rdna, uneven_depth) + logging_utils.setup_logging(os.path.join(output_dir, "phasing.log")) + cluster.run_clustering(noseq_gfa, matches_file, nonhpc_mashmap, hic_file, output_dir, no_rdna, uneven_depth) #Saved for further debug diff --git a/src/scripts/launch_scaffolding.py b/src/scripts/launch_scaffolding.py index 82f44dd5..a7484506 100755 --- a/src/scripts/launch_scaffolding.py +++ b/src/scripts/launch_scaffolding.py @@ -9,8 +9,8 @@ from numpy import argmax import graph_functions as gf import logging -from scaffolding import scaffold_graph, logger_wrap, path_storage - +from scaffolding import scaffold_graph, path_storage +import logging_utils #get all rdna nodes from mashmap #get nodes from seqtk telo #select nodes that are between 500K and 2M and close to both rDNA and telomeres to be short arm @@ -36,16 +36,13 @@ scaff_rukki_gaf_file = os.path.join(hicrun_dir, "scaff_rukki.paths.gaf") translation_paths_file = os.path.join (hicrun_dir, "final_contigs/6-layoutContigs/unitig-popped.layout.scfmap") translation_hap1_file = os.path.join (hicrun_dir, os.pardir, "translation_hap1") -rdna_file = os.path.join(hicrun_dir, os.pardir, "rdnanodes.txt") hic_byread = os.path.join(hicrun_dir, "hic.byread.compressed") -hicverkko_log = os.path.join(hicrun_dir, "hicverkko.log") - G = nx.DiGraph() gf.load_direct_graph(gfa_file, G) -logger = logger_wrap.initLogger(os.path.join(hicrun_dir, "scaffolding.log")) +logging_utils.setup_logging(os.path.join(hicrun_dir, "scaffolding.log")) path_mashmap = os.path.join(hicrun_dir, "paths2ref.mashmap") #path_mashmap = os.path.join(hicrun_dir, "paths2chm13100K.mashmap") rukki_path_storage = path_storage.PathStorage(G) @@ -54,6 +51,6 @@ read_aln = os.path.join(hicrun_dir, "hic_mapping.byread.output") else: read_aln = os.path.join(hicrun_dir, "hic_nodefiltered.bam") -sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec, logger) +sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec) res = sg.generateScaffolds() diff --git a/src/scripts/logging_utils.py b/src/scripts/logging_utils.py new file mode 100755 index 00000000..38ddc2e6 --- /dev/null +++ b/src/scripts/logging_utils.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import sys +import logging +import time +import os +import subprocess + + +def setup_logging(log_file, file_log_level = logging.DEBUG): + log_level = logging.INFO + # Configure logging with runtime from program start + start_time = time.time() + + class RuntimeFormatter(logging.Formatter): + def __init__(self, fmt=None, datefmt=None, start_time=None): + super().__init__(fmt, datefmt) + self.start_time = start_time + + def format(self, record): + record.runtime = time.time() - self.start_time + return super().format(record) + + datefmt = '%H:%M:%S' + + def format_runtime(seconds): + hours = 
int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds = int(seconds % 60) + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + log_format = '%(runtime)s - %(levelname)s - [%(filename)s:%(funcName)s:%(lineno)d] - %(message)s' + formatter = RuntimeFormatter(log_format, datefmt) + + # Always log to both file and console + #log_file = os.path.join(args.outdir, f"{args.basename}.log") + + # Configure root logger + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + # File handler + file_handler = logging.FileHandler(log_file, mode='w') + file_handler.setFormatter(RuntimeFormatter(log_format, datefmt=datefmt, start_time=start_time)) + file_handler.setLevel(file_log_level) + root_logger.addHandler(file_handler) + + # Console handler + console_handler = logging.StreamHandler(sys.stderr) + console_handler.setFormatter(RuntimeFormatter(log_format, datefmt=datefmt, start_time=start_time)) + console_handler.setLevel(logging.INFO) + root_logger.addHandler(console_handler) + + # Log the GitHub commit hash if available + try: + commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=sys.path[0]).strip().decode('utf-8') + logging.info(f"GitHub Commit Hash: {commit_hash}") + except Exception as e: + logging.warning(f"Failed to retrieve GitHub commit hash: {e}") + + # Log the command-line arguments + logging.info(f"Command-line arguments: {' '.join(sys.argv)}") + logging.info(f"Logging to file: {log_file}") + + +def log_assert(condition, message, logger=None): + """Assert a condition and log an error message if it fails.""" + if not condition: + error_msg = f"Assertion failed: {message}" + if logger: + logger.error(error_msg) + else: + logging.error(error_msg) + exit(1) + #raise AssertionError(error_msg) diff --git a/src/scripts/merge_layouts.py b/src/scripts/merge_layouts.py index b42e69a5..d12b695f 100755 --- a/src/scripts/merge_layouts.py +++ b/src/scripts/merge_layouts.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import sys +import fasta_util as seq +import re def read_list(file_path): read_list = set() @@ -31,8 +33,8 @@ def read_layout_file(file_path, flag): elif line == "end": current_tig = None else: - read, start, end = line.split("\t") - layout_data[-1]["reads"].append({"read": read, "start": int(start), "end": int(end), "flag": flag}) + read, start, end, copy = line.split("\t") + layout_data[-1]["reads"].append({"read": read, "start": int(start), "end": int(end), "copy" : int(copy), "flag": flag}) return layout_data @@ -42,6 +44,7 @@ def merge_layout_files(file1, flag1, file2, flag2): layout2 = read_list(file2) merged_layout = [] + dropped = set() # Merge layout data for tigs present in both files for layout1_entry in layout1: @@ -54,24 +57,70 @@ def merge_layout_files(file1, flag1, file2, flag2): else: keepTig = True - if keepTig == True: merged_layout.append(layout1_entry) + if keepTig == True: + merged_layout.append(layout1_entry) + else: + dropped.add(layout1_entry["tig"]) - return merged_layout + return merged_layout, dropped -file1_path = sys.argv[1] -file2_path = sys.argv[2] +file1_path = sys.argv[1] +file2_path = sys.argv[2] +output_prefix = sys.argv[3] +scfmap, scfname = seq.readScfMap(f"{sys.argv[1]}.scfmap") +tig_layout_file = open(f"{output_prefix}", mode="w") +scf_layout_file = open(f"{output_prefix}.scfmap", mode="w") +nul_layout_file = open(f"{output_prefix}.scfmap.dropped", mode="a") -merged_layout = merge_layout_files(file1_path, 0, file2_path, 1) +merged_layout, dropped = merge_layout_files(file1_path, 0, file2_path, 1) # Print 
the merged layout for entry in merged_layout: - print("tig", entry["tig"], sep='\t') - print("len", entry["len"], sep='\t') - print("trm", entry["trm"], sep='\t') - print("rds", entry["num_reads"], sep='\t') + print("tig", entry["tig"], sep='\t', file=tig_layout_file) + print("len", entry["len"], sep='\t', file=tig_layout_file) + print("trm", entry["trm"], sep='\t', file=tig_layout_file) + print("rds", entry["num_reads"], sep='\t', file=tig_layout_file) for read_entry in entry["reads"]: - print(read_entry["read"], read_entry["start"], read_entry["end"], read_entry['flag'], sep='\t') - - print("end") + print(read_entry["read"], read_entry["start"], read_entry["end"], read_entry['copy'], read_entry['flag'], sep='\t', file=tig_layout_file) + + print("end", file=tig_layout_file) + +# print the merged scfmap +for clist in scfmap: + keep = False + for piece in scfmap[clist]: + numn = re.match(r"\[N(\d+)N]", piece) + if numn or piece not in dropped: + keep= True + + # now that we've checked that some pieces are saved, output it + if keep == False: + print(f"{clist} has no reads assigned and is not output.", file=nul_layout_file) + continue + + previous = "NONE" + new_pieces = [] + print(f"path {clist} {scfname[clist]}", file=scf_layout_file) + for piece in scfmap[clist]: + is_gap = re.match(r"\[N\d+N\]", piece) + if is_gap: + if previous == "NONE": + continue + if previous == "piece": + new_pieces.append(f"{piece}") + previous = "gap" + else: + #merging consecutive gaps + last = new_pieces.pop() + previous = "gap" + last_int = int(last[2:-2]) + cur_int = int(piece[2:-2]) + new_pieces.append("[N%dN]"%(last_int + cur_int)) + else: + if piece in dropped: continue + new_pieces.append(f"{piece}") + previous = "piece" + print("\n".join(new_pieces), file=scf_layout_file) + print("end", file=scf_layout_file) diff --git a/src/scripts/scaffolding/logger_wrap.py b/src/scripts/scaffolding/logger_wrap.py deleted file mode 100755 index 82eda519..00000000 --- a/src/scripts/scaffolding/logger_wrap.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -import logging -import time - -class UpdatedFormatter(logging.Formatter): - def __init__(self, fmt=None, datefmt=None, style='%', start_time=None): - super().__init__(fmt, datefmt, style) - self.start_time = start_time or time.time() - - def format(self, record): - elapsed_seconds = time.time() - self.start_time - elapsed_time = self.formatTime(record, datefmt="%H:%M:%S") - elapsed_seconds = record.relativeCreated / 1000 - hours, remainder = divmod(elapsed_seconds, 3600) - minutes, seconds = divmod(remainder, 60) - record.elapsed_seconds = "%02d:%02d:%02d" % (hours, minutes, seconds) - record.elapsed_time = elapsed_time - return super().format(record) - -class UpdatedAdapter(logging.LoggerAdapter): - def __init__(self, logger, classname): - super().__init__(logger, {'classname': classname}) - - def process(self, msg, kwargs): - return f'{self.extra["classname"]} - {msg}', kwargs - - -def initLogger(log_file): - logger = logging.getLogger('HiCPipeline') - # self.logger = ClassNameLoggerAdapter(logger, self.__class__.__name__) - - logger.setLevel(logging.DEBUG) - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - formatter = UpdatedFormatter('%(elapsed_seconds)s %(levelname)s: %(message)s', datefmt='%H:%M:%S') - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - file_handler = logging.FileHandler(log_file, mode='w') - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(formatter) - 
logger.addHandler(file_handler) - return logger \ No newline at end of file diff --git a/src/scripts/scaffolding/match_graph.py b/src/scripts/scaffolding/match_graph.py index 233aaaf2..fe5a03d8 100755 --- a/src/scripts/scaffolding/match_graph.py +++ b/src/scripts/scaffolding/match_graph.py @@ -2,16 +2,18 @@ import networkx as nx import graph_functions as gf import sys -from scaffolding import logger_wrap - +import logging #classes for matchGraph construction + +#TODO: only filtered intervals are really used. So we will fail for genomes with diverged haplotypes... class HomologyInfo: #when counting approximate positions, do not allow jumps longer than this * neighboring intervals_lens JUMP_JOINING_FRACTION = 0.5 #DO not want huge jumps anyway JUMP_JOINING_ABSOLUTE = 5000000 + FILTERED_IDY_CUTOFF = 0.995 - def __init__(self, node1, node2, len1, len2, logger): + def __init__(self, node1, node2, len1, len2): self.nodes = [node1, node2] self.len = [len1, len2] self.covered = [0, 0] @@ -26,14 +28,13 @@ def __init__(self, node1, node2, len1, len2, logger): #TODO: should be used for homology check in scaffolding, with specific IDY cutoff and not sorted self.filtered_intervals = [[],[]] - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) def parseIDY(self, idy): return float(idy.split(":")[2]) def addInterval(self, intervals, orientation, idy): real_idy = self.parseIDY(idy) #Constant not depending on genome, intervals too similar for hi-c alignment to use - if real_idy > 0.995: + if real_idy > self.FILTERED_IDY_CUTOFF: for i in range(0, 2): self.filtered_intervals[i].append(intervals[i]) @@ -48,6 +49,13 @@ def addInterval(self, intervals, orientation, idy): self.orientation = orientation self.largest_interval_center = [(intervals[0][1] + intervals[0][0])/2, (intervals[1][1] + intervals[1][0])/2] + def isInFilteredInterval(self, coord1, coord2): + for i in range(0, 2): + for interval in self.filtered_intervals[i]: + if interval[0] <= coord1 <= interval[1] and interval[0] <= coord2 <= interval[1]: + return True + return False + #TODO: whether we should use orientation. def fillCoverage(self): for ind in range(0, 2): @@ -69,7 +77,7 @@ def fillCoverage(self): if state == 0: current_end = prev covered_intervals.append([current_start, current_end]) - self.logger.debug (f"Covered-intervals {self.nodes} {covered_intervals}") + logging.debug (f"Covered-intervals {self.nodes} {covered_intervals}") #TODO Or possibly use approximate alignment intervals? self.covered[ind] = total_c #if there are short trashy intervals between long ones - do not want them to spoil jumping. 
So dynamic programming here @@ -88,7 +96,7 @@ def fillCoverage(self): if (jump > self.JUMP_JOINING_FRACTION * prev_len and jump > self.JUMP_JOINING_FRACTION * next_len) or jump > self.JUMP_JOINING_ABSOLUTE: continue available_next[i].add(j) - self.logger.debug(f"Available next {self.nodes} {available_next}") + logging.debug(f"Available next {self.nodes} {available_next}") max_i = 0 for i in range (0, len(covered_intervals)): for j in available_next[i]: @@ -110,13 +118,10 @@ def getMinLength(self): return min(self.len[0], self.len[1]) class HomologyStorage: - #{node1: {node2: HomologyInfo(node_1, node_2)}} - def __init__(self, logger, mashmap_file, min_alignment): + #{node1: {node2: HomologyInfo(node_1, node_2)}} + def __init__(self, mashmap_file, min_alignment): self.homologies = {} self.lens = {} - - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) - total_lines = 0 used_lines = 0 for line in open(mashmap_file, 'r'): @@ -133,9 +138,10 @@ def __init__(self, logger, mashmap_file, min_alignment): continue used_lines +=1 #utig4-0 2145330 0 990000 + utig4-0 2145330 12 994065 37 994053 51 id:f:0.999992 kc:f:0.874893 + self.addHomology(arr[0], arr[5], int(arr[1]), int(arr[6]), [[int(arr[2]), int(arr[3])], [int(arr[7]), int(arr[8])]], arr[4], arr[12]) - self.logger.info(f"Loaded {used_lines} out of {total_lines} mashmap lines") - self.logger.info(f"{len(self.homologies)} nodes have at least one used homology") + logging.info(f"Loaded {used_lines} out of {total_lines} mashmap lines") + logging.info(f"{len(self.homologies)} nodes have at least one used homology") self.fillCoverage() #do we want to add other direction of pair? @@ -143,7 +149,7 @@ def addHomology(self, node1, node2, len1, len2, intervals, orientation, idy): if not node1 in self.homologies: self.homologies[node1] = {} if not node2 in self.homologies[node1]: - self.homologies[node1][node2] = HomologyInfo(node1, node2, len1, len2, self.logger) + self.homologies[node1][node2] = HomologyInfo(node1, node2, len1, len2) self.homologies[node1][node2].addInterval(intervals, orientation, idy) self.lens[node1] = len1 self.lens[node2] = len2 @@ -162,6 +168,13 @@ def isValid(self, node1, node2): def getLength(self, node): return self.lens[node] + def isInFilteredInterval(self, node1, node2, coord1, coord2): + if not self.isValid(node1, node2): + return False + if self.homologies[node1][node2].isInFilteredInterval(coord1, coord2): + return True + return False + def getApproximatePositionLength(self, node1, node2, ind): if not self.isValid(node1, node2): return 0 @@ -179,14 +192,11 @@ class MatchGraph: #homologous intervals should cover at least 1/3 of at least one of the nodes in pair REQUIRED_COVERAGE_FRACTION = 1/3 - def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignment, logger): + def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignment): self.matchGraph = nx.Graph() - self.hom_storage = HomologyStorage(logger, mashmap_sim, min_alignment) + self.hom_storage = HomologyStorage(mashmap_sim, min_alignment) self.G = G - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) - - - #Do not want to think whether we use diGraph or Graph + #Do not want to think whether we use diGraph or Graph, internal structures are not oriented neighbours = {} indirect_nodes = set() for node in G.nodes(): @@ -195,15 +205,14 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm for edge in G.edges(): for i in range(0, 2): 
neighbours[edge[i].strip('-+')].add(edge[1 - i].strip('-+')) - self.logger.info(f"Loaded {len(self.hom_storage.homologies)} homologies") + logging.info(f"Loaded {len(self.hom_storage.homologies)} homologies") for node1 in self.hom_storage.homologies: for node2 in self.hom_storage.homologies[node1]: #we deleted some nodes after mashmap - #sys.stderr.write(f"Checking {node1} {node2} {self.hom_storage.homologies[node1][node2].getMinCovered()} {min_big_homology}\n") if node1 in indirect_nodes and node2 in indirect_nodes: cur_homology = self.hom_storage.homologies[node1][node2].getCoveredLen() if cur_homology > min_big_homology: - self.logger.debug(f"Adding normal edge {node1} {node2} {cur_homology}") + logging.debug(f"Adding normal edge {node1} {node2} {cur_homology}") self.matchGraph.add_edge(node1, node2, homology_len = cur_homology) else: #less strict condition for bulge-like structure @@ -212,7 +221,7 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm # possibly check whether we have something already phased nearby? # strict bulge-like condition, either R1[A/B]R2 or R1[A/B] or [A/B]R2. Digraph can be more precise here but anyway if cur_homology > len_cutoff and neighbours[node1] == neighbours[node2] and len(neighbours[node1]) > 0: - self.logger.debug(f"Adding bulge-like edge {node1} {node2} {cur_homology} {len_cutoff}") + logging.debug(f"Adding bulge-like edge {node1} {node2} {cur_homology} {len_cutoff}") self.matchGraph.add_edge(node1, node2, homology_len = cur_homology) # while we build the initial partition give a big bonus edge for putting the homologous nodes into different partitions @@ -247,10 +256,10 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm else: self.matchGraph.remove_edge(ec[0],ec[1]) - self.logger.info("Loaded match info with %d nodes and %d edges\n" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) + logging.info("Loaded match info with %d nodes and %d edges" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) for d in sorted(self.matchGraph.edges()): - self.logger.debug(f"homology edge {d} : {self.matchGraph.edges[d]}") + logging.debug(f"homology edge {d} : {self.matchGraph.edges[d]}") def getMatchGraph(self): return self.matchGraph @@ -276,6 +285,9 @@ def hasEdge(self, node1, node2): def getEdgeAttribute(self, node1, node2, attr): return self.matchGraph.edges[node1, node2][attr] + def getHomologyLength(self, node1, node2): + return self.getEdgeAttribute(node1, node2, 'homology_len') + def isHomologousNodes(self, node1, node2, strict:bool): if self.matchGraph.has_edge(node1, node2) and ((not strict) or self.matchGraph.edges[node1, node2]['weight'] < 0): return True @@ -314,7 +326,7 @@ def isHomologousPath (self, paths, lens): hom_size += self.matchGraph.edges[nor_p0, nor_p1]['homology_len'] if hom_size * 2> lens[0] or hom_size * 2> lens[1]: - self.logger.debug (f"Found homologous paths {paths} with homology size {hom_size}") + logging.debug (f"Found homologous paths {paths} with homology size {hom_size}") return True else: return False @@ -327,15 +339,15 @@ def getHomologousPaths(self, path_storage, path_id): for hom_node in self.matchGraph.neighbors(node): if self.matchGraph.edges[node, hom_node]['weight'] < 0: homologous_nodes.add(hom_node) - self.logger.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}") + logging.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}") paths_to_check = set() for node in 
homologous_nodes: paths_to_check.update(path_storage.getPathsFromNode(node)) - self.logger.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}") + logging.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}") for susp_path_id in paths_to_check: if self.isHomologousPath([path_storage.getPathById(path_id), path_storage.getPathById(susp_path_id)], [path_storage.getLength(path_id), path_storage.getLength(susp_path_id)]): homologous_paths.append(susp_path_id) - self.logger.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}") + logging.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}") return homologous_paths diff --git a/src/scripts/scaffolding/scaffold_graph.py b/src/scripts/scaffolding/scaffold_graph.py index fff229ab..471be7c4 100755 --- a/src/scripts/scaffolding/scaffold_graph.py +++ b/src/scripts/scaffolding/scaffold_graph.py @@ -16,7 +16,8 @@ import pysam #TODO: remove clother to release? import psutil -from scaffolding import logger_wrap, match_graph, path_storage +from scaffolding import match_graph, path_storage + #from line_profiler import LineProfiler class ReferencePosition: @@ -117,26 +118,25 @@ class ScaffoldGraph: MAX_GRAPH_FOR_DISTANCES = 1000000 MAX_COMPONENT_FOR_DISTANCES = 50000 - def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger): - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) + def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec): self.rukki_paths = rukki_paths self.uncompressed_lens = gf.get_lengths(uncompressed_fasta) self.multiplicities = rukki_paths.getEdgeMultiplicities() interesting_nodes = self.getInterestingNodes(self.uncompressed_lens) - self.logger.info(f"Total nodes {len(G.nodes)} interesting nodes {len(interesting_nodes)}") + logging.info(f"Total nodes {len(G.nodes)} interesting nodes {len(interesting_nodes)}") #upd_G - graph with telomeric nodes self.tel_nodes, self.upd_G = gf.get_telomeric_nodes(telomere_locations_file, G) - self.logger.debug("Telomeric nodes") - self.logger.debug(self.tel_nodes) + logging.debug("Telomeric nodes") + logging.debug(self.tel_nodes) - self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT, logger) + self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT) if porec == True: - self.logger.info ("Loading pore-c alignments") + logging.info ("Loading pore-c alignments") all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True) else: - self.logger.info ("Loading Hi-C alignments") + logging.info ("Loading Hi-C alignments") all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True) @@ -179,15 +179,15 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat unique_scores = {} self.dists = {} if G.number_of_nodes() > ScaffoldGraph.MAX_GRAPH_FOR_DISTANCES: - self.logger.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances") + logging.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances") else: max_comp_size = len(max(nx.weakly_connected_components(G), key=len)) if max_comp_size > 
ScaffoldGraph.MAX_COMPONENT_FOR_DISTANCES: - self.logger.info(f"Biggest component is too big {max_comp_size}, not calculating pairwise distances") + logging.info(f"Biggest component is too big {max_comp_size}, not calculating pairwise distances") else: - self.logger.info("Calculating pairwise distances for assembly graph nodes") + logging.info("Calculating pairwise distances for assembly graph nodes") self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length'])) - self.logger.info("Pairwise distances in assembly graph calculated") + logging.info("Pairwise distances in assembly graph calculated") self.haploids = self.getHaploidPaths(self.rukki_paths) self.scores = {} @@ -217,7 +217,7 @@ def isHaploidPath(self, paths, nor_path_id): if total_hom * 2 < path_len: #DEBUG ONLY if path_len > ScaffoldGraph.MIN_EXPECTED_T2T: - self.logger.info(f"Found haploid path {nor_path_id} with homology {total_hom} and len {path_len} ") + logging.info(f"Found haploid path {nor_path_id} with homology {total_hom} and len {path_len} ") return True else: return False @@ -260,7 +260,7 @@ def nodeToPathDist(self, node, path, check_homologous): closest = ScaffoldGraph.TOO_FAR add_dist = 0 for path_node in path: - #self.logger.debug (f"Checking nodepair dist from {node} to {path_node}") + #logging.debug (f"Checking nodepair dist from {node} to {path_node}") if path_node in self.upd_G.nodes: #TODO: likely this is not needed # if path_node == node: @@ -268,9 +268,9 @@ def nodeToPathDist(self, node, path, check_homologous): # else: closest = min(closest, add_dist + self.orNodeDist(node, path_node)) if check_homologous: - #self.logger.debug(f"Checking homologous to node {path_node}: {self.homologousOrNodes(path_node)}") + #logging.debug(f"Checking homologous to node {path_node}: {self.homologousOrNodes(path_node)}") for hom_node in self.match_graph.getHomologousOrNodes(path_node, True): - #self.logger.debug (f"Checking homologous nodepair dist from {node} to {hom_node}") + #logging.debug (f"Checking homologous nodepair dist from {node} to {hom_node}") if hom_node == node: closest = 0 closest = min(closest, add_dist + self.orNodeDist(node, hom_node)) @@ -286,7 +286,7 @@ def pathDist(self, path_from:list, path_to:list, check_homologous_from:bool, che closest = min(closest, self.nodeToPathDist(node, path_to, check_homologous_to) + add_dist) if check_homologous_from: for hom_node in self.match_graph.getHomologousOrNodes(node, True) : - #self.logger.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}") + #logging.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}") closest = min(closest, (self.nodeToPathDist(hom_node, path_to, check_homologous_to) + add_dist)) add_dist += self.compressed_lens[node.strip("-+")] return closest @@ -327,7 +327,7 @@ def norPathIdDist(self, nor_path_id_from:str, nor_path_id_to:str, paths:path_sto def getPathPositions(self, path_mashmap): - hom_storage = match_graph.HomologyStorage(self.logger, path_mashmap, ScaffoldGraph.MIN_HOMOLOGY_REF) + hom_storage = match_graph.HomologyStorage(path_mashmap, ScaffoldGraph.MIN_HOMOLOGY_REF) for path_id in hom_storage.homologies: all_refs = [] if self.rukki_paths.getLength(path_id) < ScaffoldGraph.MIN_PATH_TO_SCAFFOLD: @@ -337,9 +337,9 @@ def getPathPositions(self, path_mashmap): all_refs.sort(reverse=True) if len(all_refs) == 1 or all_refs[0][0] > ScaffoldGraph.RATIO_HOMOLOGY_REF * all_refs[1][0]: best_ref = all_refs[0][1] - 
self.logger.debug(f"Clearly best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") + logging.debug(f"Clearly best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") if all_refs[0][0] > ScaffoldGraph.LEN_FRAC_REF * hom_storage.getLength(path_id): - self.logger.debug(f"Best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") + logging.debug(f"Best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") if not (best_ref+'+') in self.reference_positions: self.reference_positions[best_ref+ "+"] = [] self.reference_positions[best_ref+ "-"] = [] @@ -353,12 +353,12 @@ def getPathPositions(self, path_mashmap): self.reference_positions[best_ref + "-"].append(ReferencePosition(path_id + gf.rc_orientation(hom_info.orientation), best_ref + '-', hom_storage.getLength(best_ref) - hom_info.approximate_positions[1][1], hom_storage.getLength(best_ref) - hom_info.approximate_positions[1][0], hom_storage.getLength(path_id), gf.rc_orientation(hom_info.orientation))) else: - self.logger.debug(f"Best homology for {path_id} is {best_ref} not covered enough frac {all_refs[0][0] / hom_storage.getLength(path_id)}") + logging.debug(f"Best homology for {path_id} is {best_ref} not covered enough frac {all_refs[0][0] / hom_storage.getLength(path_id)}") for ref in self.reference_positions: - self.logger.debug(f"Reference positions for {ref}") + logging.debug(f"Reference positions for {ref}") self.reference_positions[ref] = sorted(self.reference_positions[ref], key=lambda x: x.average_pos) for pos in self.reference_positions[ref]: - self.logger.debug(f"{pos.name_q} {pos.name_r} {pos.average_pos} {pos.ref_start} {pos.ref_end} {pos.query_len} {pos.orientation}") + logging.debug(f"{pos.name_q} {pos.name_r} {pos.average_pos} {pos.ref_start} {pos.ref_end} {pos.query_len} {pos.orientation}") def isNextByRef(self, from_path_id, to_path_id): @@ -395,19 +395,19 @@ def forbiddenPair(self, from_path_id, to_path_id): #Heterogametic chromosomes get more links since there is no homologous one to absorb multiple alignments, so no connection of diploid and long enough ahploids if nor_from_path_id in self.haploids and self.rukki_paths.getLength(nor_from_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_to_path_id in self.haploids): if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END: - self.logger.warning(f"Banning distant link from long haploid {from_path_id} to diploid {to_path_id}") + logging.warning(f"Banning distant link from long haploid {from_path_id} to diploid {to_path_id}") return True else: - self.logger.warning(f"Allowing link from long haploid {from_path_id} to diploid {to_path_id}, close in graph") + logging.warning(f"Allowing link from long haploid {from_path_id} to diploid {to_path_id}, close in graph") if nor_to_path_id in self.haploids and self.rukki_paths.getLength(nor_to_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_from_path_id in self.haploids): if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END: - self.logger.warning(f"Banning distant link from diploid {from_path_id} to long haploid {to_path_id}") + logging.warning(f"Banning distant link from diploid {from_path_id} to long haploid {to_path_id}") return True else: - self.logger.warning(f"Allowing link from diploid {from_path_id} to long haploid {to_path_id}, close in graph") + logging.warning(f"Allowing link from diploid {from_path_id} to long haploid {to_path_id}, close in graph") #relatively short 
fragments with telomere are special case, we may fail to detect orientation there but belive in telomere. if self.rukki_paths.getLength(nor_to_path_id) <= ScaffoldGraph.SHORT_TEL_CUTOFF and self.scaffold_graph.nodes[to_path_id]['telomere'][0]: - self.logger.error(f"Banning link from {from_path_id} into short telomere, should not happen {to_path_id}") + logging.error(f"Banning link from {from_path_id} into short telomere, should not happen {to_path_id}") return True return False @@ -436,7 +436,7 @@ def getScores(self, or_path_id, path_storage, type): connection_nodemap = self.unique_connections_nodemap else: - self.logger.error(f"Unknown type {type}") + logging.error(f"Unknown type {type}") return res nor_path_id = gf.nor_path_id(or_path_id) precounted = True @@ -473,11 +473,11 @@ def getScores(self, or_path_id, path_storage, type): #type: weight/unique_weight def findExtension(self, cur_path_id, type): local_scores = [] - self.logger.info(f"Checking {cur_path_id}") + logging.info(f"Checking {cur_path_id}") if not (cur_path_id in self.scaffold_graph.nodes): return "NONE",0 if self.scaffold_graph.nodes[cur_path_id]['telomere'][1]: - self.logger.info (f"Stopped at the telomere") + logging.info (f"Stopped at the telomere") return "DONE",0 local_scores = self.getScores(cur_path_id, self.rukki_paths, type) @@ -492,20 +492,20 @@ def findExtension(self, cur_path_id, type): second_best_ind = j break if len(local_scores) == 0: - self.logger.info (f"Nothing next") + logging.info (f"Nothing next") return "NONE", 0 elif local_scores[best_ind][1] <= ScaffoldGraph.MIN_LINKS: - self.logger.info (f"very few links, best valid candidate {local_scores[best_ind]}") + logging.info (f"very few links, best valid candidate {local_scores[best_ind]}") return "NONE", 0 #not valid but waay best solution exists elif self.scaffold_graph.nodes[local_scores[best_ind][0]]['telomere'][0]: - self.logger.info (f"Best {local_scores[best_ind]}, is good count but actually are in telomere!") + logging.info (f"Best {local_scores[best_ind]}, is good count but actually are in telomere!") return "NONE", 0 elif len(local_scores) == 1 or (len(local_scores) > 1 and local_scores[second_best_ind][1] == 0): - self.logger.info (f"Only one next, {local_scores[best_ind]}") + logging.info (f"Only one next, {local_scores[best_ind]}") return local_scores[best_ind][0], ScaffoldGraph.CLEAR_MAJORITY*2 else: - self.logger.info (f"Really best {local_scores[best_ind]}, second best {local_scores[second_best_ind]}") + logging.info (f"Really best {local_scores[best_ind]}, second best {local_scores[second_best_ind]}") return local_scores[best_ind][0],local_scores[best_ind][1]/local_scores[second_best_ind][1] #made function to allow to use uniques @@ -513,17 +513,17 @@ def findExtension(self, cur_path_id, type): def findNextPath(self, current_path_id, nor_used_path_ids, type): next_path_id, ratio_fwd = self.findExtension(current_path_id, type) if next_path_id.strip('-+') in nor_used_path_ids: - self.logger.info (f"Extention {next_path_id} looks good but already used") + logging.info (f"Extention {next_path_id} looks good but already used") return "NONE" prev_path_id, ratio_bwd = self.findExtension(gf.rc_path_id(next_path_id), type) if gf.rc_path_id(prev_path_id) != current_path_id: - self.logger.info (f"backward check failed for {next_path_id}") + logging.info (f"backward check failed for {next_path_id}") return "NONE" if ratio_fwd > ScaffoldGraph.CLEAR_MAJORITY and ratio_bwd > ScaffoldGraph.CLEAR_MAJORITY: - self.logger.info (f"Extention {next_path_id} 
looks good") + logging.info (f"Extention {next_path_id} looks good") return next_path_id elif ratio_fwd * ratio_bwd > ScaffoldGraph.CLEAR_MAJORITY * ScaffoldGraph.CLEAR_MAJORITY * ScaffoldGraph.CLEAR_MAJORITY and type == "unique_weight": - self.logger.info (f" Risky extension, not clear majority for one direction but very clear for other {current_path_id} {next_path_id} {ratio_fwd} {ratio_bwd}") + logging.info (f" Risky extension, not clear majority for one direction but very clear for other {current_path_id} {next_path_id} {ratio_fwd} {ratio_bwd}") return next_path_id return "NONE" @@ -548,8 +548,8 @@ def generateScaffolds(self): tel_starting_paths.sort(key=lambda x: self.rukki_paths.getLength(x.strip('-+')), reverse=True) middle_paths.sort(key=lambda x: self.rukki_paths.getLength(x.strip('-+')), reverse=True) starting_paths = tel_starting_paths + middle_paths - self.logger.info ("Starting paths") - self.logger.info (starting_paths) + logging.info ("Starting paths") + logging.info (starting_paths) #from ID to amount of scaffolds scaff_sizes = {} for or_from_path_id in starting_paths: @@ -563,30 +563,30 @@ def generateScaffolds(self): while True: next_path_id = self.findNextPath(cur_path_id, nor_used_path_ids, "weight") if next_path_id == "NONE": - self.logger.info (f"Failed to find regular extension for {cur_path_id}, trying unique") + logging.info (f"Failed to find regular extension for {cur_path_id}, trying unique") next_path_id = self.findNextPath(cur_path_id, nor_used_path_ids, "unique_weight") if next_path_id == "DONE": - self.logger.info ("All done, stopped at telomere") + logging.info ("All done, stopped at telomere") if tried_reverse: break else: - self.logger.info (f"Reversing {cur_scaffold}") + logging.info (f"Reversing {cur_scaffold}") cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info (f"Reversed {cur_scaffold}") + logging.info (f"Reversed {cur_scaffold}") cur_path_id = cur_scaffold[-1] tried_reverse = True continue elif next_path_id == "NONE": - self.logger.info ("Failed to find extension, stopping") + logging.info ("Failed to find extension, stopping") if tried_reverse: break else: - self.logger.info (f"Reversing {cur_scaffold}") + logging.info (f"Reversing {cur_scaffold}") cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info (f"Reversed {cur_scaffold}") + logging.info (f"Reversed {cur_scaffold}") cur_path_id = cur_scaffold[-1] tried_reverse = True continue @@ -600,10 +600,10 @@ def generateScaffolds(self): if nor_prev_path_id in homologous_paths: # self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]): #TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation - self.logger.warning(f"Trying to extend, had homologous path in same scaffold before! {nor_new_path_id} {nor_prev_path_id}") + logging.warning(f"Trying to extend, had homologous path in same scaffold before! 
{nor_new_path_id} {nor_prev_path_id}") hom_before = True if hom_before: - self.logger.info (f"Homologous paths before in scaffold, not extending {cur_path_id} with {next_path_id}") + logging.info (f"Homologous paths before in scaffold, not extending {cur_path_id} with {next_path_id}") if tried_reverse: break else: @@ -612,7 +612,7 @@ def generateScaffolds(self): cur_path_id = cur_scaffold[-1] tried_reverse = True continue - self.logger.info (f"Extending {cur_path_id} with {next_path_id}") + logging.info (f"Extending {cur_path_id} with {next_path_id}") #possibly not do so with paths of length one? They can be more successful in other direction cur_scaffold.append(next_path_id) @@ -621,7 +621,7 @@ def generateScaffolds(self): #Reversing for comparison with previous runs only cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info(f"scaffold {cur_scaffold}\n") + logging.info(f"scaffold {cur_scaffold}\n") if len(cur_scaffold) > 1: res.append(cur_scaffold) else: @@ -642,28 +642,28 @@ def generateScaffolds(self): total_new_t2t += 1 scaffolded_rephased.extend(scaff_result[3]) final_paths.addPath(scaff_result[0]) - self.logger.info (f"Rephased scaffolds to other haplo, need to check homologous {scaffolded_rephased}") + logging.info (f"Rephased scaffolds to other haplo, need to check homologous {scaffolded_rephased}") #this should be better done after completeConnectedComponent but it is very weird case if it matters rephased_fix = self.getAdditionalRephase(scaffolded_rephased, nor_used_path_ids, self.rukki_paths) - self.logger.info(f"Rephased fix {rephased_fix}") - self.logger.info(f"used_ids {nor_used_path_ids}") + logging.info(f"Rephased fix {rephased_fix}") + logging.info(f"used_ids {nor_used_path_ids}") for nor_path_id in self.rukki_paths.getPathIds(): - self.logger.debug(f"Checking {nor_path_id}") + logging.debug(f"Checking {nor_path_id}") if not (nor_path_id in nor_used_path_ids): if nor_path_id in rephased_fix: - self.logger.debug(f"in reph fix {nor_path_id}") + logging.debug(f"in reph fix {nor_path_id}") final_paths.addPath(rephased_fix[nor_path_id]) else: final_paths.addPath(self.rukki_paths.getPathTsv(nor_path_id)) - self.logger.info("Starting last telomere tuning") + logging.info("Starting last telomere tuning") component_tuned_paths = self.completeConnectedComponent(final_paths) telomere_cheating = len(final_paths.getPathIds()) - len(component_tuned_paths.getPathIds()) self.fixHaploidNames(component_tuned_paths) - self.logger.warning (f"Total normal scaffolds {total_scf} last telomere tuned {telomere_cheating} total jumps {total_jumps + telomere_cheating} new T2T {total_new_t2t + telomere_cheating}") + logging.warning (f"Total normal scaffolds {total_scf} last telomere tuned {telomere_cheating} total jumps {total_jumps + telomere_cheating} new T2T {total_new_t2t + telomere_cheating}") self.outputScaffolds(component_tuned_paths) return res @@ -685,17 +685,17 @@ def getAdditionalRephase(self, scaffolded_rephased, used_ids, paths): splitted_id = alt_path_id.split("_") old_prefix = splitted_id[0] if alt_path_id in used_ids: - self.logger.info(f"Not rephasing {alt_path_id} because it is scaffolded anyway") + logging.info(f"Not rephasing {alt_path_id} because it is scaffolded anyway") continue if old_label in label_switch and old_prefix in prefix_switch: new_label = label_switch[old_label] new_prefix = prefix_switch[old_prefix] new_id = new_prefix + "_" + "_".join(splitted_id[1:]) - self.logger.warning(f"Rephasing {alt_path_id} to {new_id} because of 
{nor_old_id} scaffolded") + logging.warning(f"Rephasing {alt_path_id} to {new_id} because of {nor_old_id} scaffolded") new_tsv = "\t".join([new_id, paths.getPathString(alt_path_id), new_label]) res[alt_path_id] = new_tsv else: - self.logger.warning(f"Rephasing failed for {alt_path_id} initiated by {nor_old_id}") + logging.warning(f"Rephasing failed for {alt_path_id} initiated by {nor_old_id}") return res #if only two paths with one telomere missing remains in connected component and all other long nodes are in T2T and some conditions on distance and length then we connect def completeConnectedComponent(self, prefinal_paths): @@ -730,7 +730,7 @@ def completeConnectedComponent(self, prefinal_paths): node_comp[node] = comp_id - self.logger.info(f"Checking {len(nor_components)} weakly connected components ") + logging.info(f"Checking {len(nor_components)} weakly connected components ") for path_id in prefinal_paths.getPathIds(): cur_color = 0 for node in prefinal_paths.getPathById(path_id): @@ -785,12 +785,12 @@ def completeConnectedComponent(self, prefinal_paths): t2t_paths += 1 #component not finished, stopping elif (not start_telo) and not (end_telo): - self.logger.debug(f"Found path {path_id} of length {prefinal_paths.getLength(path_id)} without telomeres, do nothing with component ") + logging.debug(f"Found path {path_id} of length {prefinal_paths.getLength(path_id)} without telomeres, do nothing with component ") missing_telos = 1000 break #only two non-t2t paths of reasonable length in connected component if (total_long_paths > 0): - self.logger.debug(f"Component {comp_id} has {missing_telos} non-t2t telos, {t2t_paths} t2t paths, {total_long_paths} long paths") + logging.debug(f"Component {comp_id} has {missing_telos} non-t2t telos, {t2t_paths} t2t paths, {total_long_paths} long paths") if missing_telos == 2: or_ids = [] @@ -802,7 +802,7 @@ def completeConnectedComponent(self, prefinal_paths): dist = self.orPathIdDist(or_ids[0], or_ids[1], prefinal_paths, True) #Paths are not far away - self.logger.info (f"Potential t2t connectivity cheat {or_ids} dist {dist}") + logging.info (f"Potential t2t connectivity cheat {or_ids} dist {dist}") if dist < ScaffoldGraph.CLOSE_IN_GRAPH: sum_len = prefinal_paths.getLength(to_scaffold[0]) + prefinal_paths.getLength(to_scaffold[1]) @@ -816,7 +816,7 @@ def completeConnectedComponent(self, prefinal_paths): if cur_len > ScaffoldGraph.MIN_EXPECTED_T2T: haploid_paths += 1 else: - self.logger.debug(f"Path {alt_path_id} is haploid but too short") + logging.debug(f"Path {alt_path_id} is haploid but too short") short_haploid_paths += 1 if telos[alt_path_id][0] and telos[alt_path_id][1] and cur_len > ScaffoldGraph.MIN_EXPECTED_T2T: if cur_len > sum_len * ScaffoldGraph.SIMILAR_LEN_FRAC and cur_len * ScaffoldGraph.SIMILAR_LEN_FRAC < sum_len: @@ -826,7 +826,7 @@ def completeConnectedComponent(self, prefinal_paths): #Either haploid or there is homologous T2T of similar length, not too short if ((distant_len_t2t == 0 and haploid_paths ==2) or similar_len_t2t > 0) and sum_len > ScaffoldGraph.MIN_EXPECTED_T2T: node_sec = [] - self.logger.info(f"t2t connectivity cheat worked {or_ids}!") + logging.info(f"t2t connectivity cheat worked {or_ids}!") new_tsv_tuple = self.scaffoldFromOrPathIds(or_ids, prefinal_paths) res.addPath(new_tsv_tuple[0]) if new_tsv_tuple[1]: @@ -835,14 +835,14 @@ def completeConnectedComponent(self, prefinal_paths): for i in range(0, 2): scaffolded_paths.add(to_scaffold[i].strip('-+')) else: - self.logger.info(f"t2t connectivity cheat {or_ids} failed, 
total_len {sum_len} distant len count {distant_len_t2t} haploid paths {haploid_paths} similar len {similar_len_t2t} short haploid paths: {short_haploid_paths}") + logging.info(f"t2t connectivity cheat {or_ids} failed, total_len {sum_len} distant len count {distant_len_t2t} haploid paths {haploid_paths} similar len {similar_len_t2t} short haploid paths: {short_haploid_paths}") else: - self.logger.info(f"t2t connectivity cheat {or_ids} failed, too far {dist}") + logging.info(f"t2t connectivity cheat {or_ids} failed, too far {dist}") for path_id in prefinal_paths.getPathIds(): if not (path_id in scaffolded_paths): res.addPath(prefinal_paths.getPathTsv(path_id)) - self.logger.warning (f"Total last telomere tuned scaffolds (all t2t) {added_t2t}, jumps {total_jumps}") + logging.warning (f"Total last telomere tuned scaffolds (all t2t) {added_t2t}, jumps {total_jumps}") return res #Returns (new_tsv_line, is_t2t, num_jumps, nor_rephased_ids) @@ -887,7 +887,7 @@ def scaffoldFromOrPathIds(self, or_ids, prefinal_paths): for i in range (0, len(or_ids) - 1): swap_info = (or_ids[i].strip('-+'), or_ids[i+1].strip('-+'), or_ids[i][-1] + or_ids[i+1][-1]) if (swap_info) in self.dangerous_swaps: - self.logger.warning (f"consecutive pair, {swap_info} did signficant changes on hi-c counts based on {self.dangerous_swaps[swap_info]}") + logging.warning (f"consecutive pair, {swap_info} did signficant changes on hi-c counts based on {self.dangerous_swaps[swap_info]}") path_str = "\t".join([largest_id, ",".join(scf_path), largest_label]) telo_end = False telo_start = False @@ -897,7 +897,7 @@ def scaffoldFromOrPathIds(self, or_ids, prefinal_paths): if self.upd_G.has_edge(tel_node, scf_path[0]): telo_start = True if len(or_ids) > 1: - self.logger.info (f"Added SCAFFOLD {or_ids} {telo_start} {telo_end} ") + logging.info (f"Added SCAFFOLD {or_ids} {telo_start} {telo_end} ") return (path_str, (telo_start and telo_end), len(or_ids) - 1, rephased) #large haploid paths should be assigned based on connectivity. 
Complete human chrX will always be in hap1 @@ -916,12 +916,12 @@ def fixHaploidNames(self, paths: path_storage.PathStorage): if not (close_diploid_path): large_haploid_ids.append(path_id) else: - self.logger.warning(f"large haploid and diploid paths connected, {path_id} and {alt_path}, lens {paths.getLength(path_id)} {paths.getLength(alt_path)}, no relabeing") + logging.warning(f"large haploid and diploid paths connected, {path_id} and {alt_path}, lens {paths.getLength(path_id)} {paths.getLength(alt_path)}, no relabeing") large_haploid_ids.sort(key=lambda x: paths.getLength(x), reverse=True) for path_id in paths.getPathIds(): if path_id.split("_")[0] == "mat": - self.logger.info("Trio labeling detected, haploid haplotype reassignment cancelled") + logging.info("Trio labeling detected, haploid haplotype reassignment cancelled") return names_prefix = {} @@ -956,18 +956,18 @@ def fixHaploidNames(self, paths: path_storage.PathStorage): if new_label != cur_label: utig_id = path_id.split("_")[-1] new_id = names_prefix[new_label] + utig_id - self.logger.info(f"Reasigning haploid label for {path_id} {cur_label} to {new_id} {new_label}, dist {clothest_dist} len {paths.getLength(path_id)}") + logging.info(f"Reasigning haploid label for {path_id} {cur_label} to {new_id} {new_label}, dist {clothest_dist} len {paths.getLength(path_id)}") reassigned += 1 else: - self.logger.info(f"Reassignment of haploid label for {path_id} not required, dist {clothest_dist} len {paths.getLength(path_id)}") + logging.info(f"Reassignment of haploid label for {path_id} not required, dist {clothest_dist} len {paths.getLength(path_id)}") large_haploid_info.append([new_id, ",".join(paths.getPathById(path_id)), new_label]) for path_id in large_haploid_ids: paths.removePath(path_id) for info in large_haploid_info: paths.addPath("\t".join(info)) - self.logger.info(f"Reassigned {reassigned} large haploids, groups of distant haploid paths detected: {not_connected_hap_assign}") + logging.info(f"Reassigned {reassigned} large haploids, groups of distant haploid paths detected: {not_connected_hap_assign}") if not_connected_hap_assign != 0 and not_connected_hap_assign != 2: - self.logger.warning(f"Strange number of not connected haploid groups {not_connected_hap_assign}") + logging.warning(f"Strange number of not connected haploid groups {not_connected_hap_assign}") def outputScaffolds(self, final_paths): @@ -990,7 +990,7 @@ def get_connections_porec(self, alignment_file, use_multimappers:bool): for line in open (alignment_file): ind += 1 if (ind % 10000000 == 0): - self.logger.debug (f"Processed {ind} pore-c alignments") + logging.debug (f"Processed {ind} pore-c alignments") arr = line.split() first = arr[1] @@ -1009,7 +1009,7 @@ def get_connections_porec(self, alignment_file, use_multimappers:bool): unique_storage[names_pair] = [] res[names_pair].append((coords_pair[0], coords_pair[1], weight)) unique_storage[names_pair].append((coords_pair[0], coords_pair[1], weight)) - self.logger.debug (f"Finally processed {ind} pore-c alignments") + logging.debug (f"Finally processed {ind} pore-c alignments") return res, unique_storage def get_connections_bam(self, bam_filename, use_multimappers:bool): @@ -1034,7 +1034,7 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool): if (total_reads % 10000000 == 0): if (total_reads % 100000000 == 0): before_compressed = total_compressed - self.logger.debug("Starting map compression...") + logging.debug("Starting map compression...") for names_pair in res: if len(res[names_pair]) > 1000: 
                                names_sorted = sorted(res[names_pair])
@@ -1051,14 +1051,14 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool):
                                    compressed.append((names_sorted[ii][0], names_sorted[ii][1], cur_w))
                                    ii = jj
                                res[names_pair] = compressed
-                        self.logger.debug(f"On current iteration compressed {total_compressed - before_compressed}")
-                    self.logger.debug (f"Processed {total_reads} alignment strings")
-                    self.logger.debug (f"Of them unique vaild unique pairs {unique_pairs}, total pairs {all_pairs} total valid {valid_pairs} ")
-                    self.logger.debug (f"Current memory usage {(psutil.virtual_memory().used / 1024 / 1024 / 1024):.2f} GB")
-                    self.logger.debug (f"Created new pairs {created_pair} total inserted pairs = {total_inserted} total compressed = {total_compressed}")
+                        logging.debug(f"On current iteration compressed {total_compressed - before_compressed}")
+                    logging.debug (f"Processed {total_reads} alignment strings")
+                    logging.debug (f"Of them valid unique pairs {unique_pairs}, total pairs {all_pairs}, total valid {valid_pairs}")
+                    logging.debug (f"Current memory usage {(psutil.virtual_memory().used / 1024 / 1024 / 1024):.2f} GB")
+                    logging.debug (f"Created new pairs {created_pair} total inserted pairs = {total_inserted} total compressed = {total_compressed}")
                    #gc.collect()
-                    #self.logger.debug (f"Current memory usage after GC {psutil.virtual_memory().used / 1024 / 1024 / 1024}GB")
-                    #self.logger.debug (f"Mem usage of main map {asizeof.asizeof(res) / 1024 / 1024 / 1024}GB")
+                    #logging.debug (f"Current memory usage after GC {psutil.virtual_memory().used / 1024 / 1024 / 1024}GB")
+                    #logging.debug (f"Mem usage of main map {asizeof.asizeof(res) / 1024 / 1024 / 1024}GB")
                    #if total_reads == 20000000:
                    #    exit()
@@ -1110,7 +1110,7 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool):
                filtered_coords = coords
            lname0 = len(filtered_names[0])
            lname1 = len(filtered_names[1])
-            #self.logger.debug (f" {valid} {lname0} {lname1}")
+            #logging.debug (f" {valid} {lname0} {lname1}")
            if valid and lname0 > 0 and lname1 > 0:
                valid_pairs += 1
                for i in range (0, lname0):
@@ -1176,7 +1176,7 @@ def fixOrientation(self, path_ids, scores):
            if correct_or != "":
                pair_or = ["",""]
                pair_or[i] = correct_or
-                self.logger.debug (f"tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
+                logging.debug (f"tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
                for second_or in ('-', '+'):
                    correct_pair = pair_or.copy()
                    correct_pair[1 - i] = second_or
@@ -1187,16 +1187,16 @@ def fixOrientation(self, path_ids, scores):
                    #just logging
                    if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS* self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]:
                        self.dangerous_swaps[(path_ids[0], path_ids[1], correct_pair)] = "telomeres"
-                        self.logger.debug(f"Dangerous telomeric tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']} from {incorrect_pair} to {correct_pair}")
-                    self.logger.debug (f"moving {incorrect_pair} to {correct_pair}")
+                        logging.debug(f"Dangerous telomeric tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']} from {incorrect_pair} to {correct_pair}")
+                    logging.debug (f"moving {incorrect_pair} to {correct_pair}")
                    if self.rukki_paths.getLength(path_ids[i]) <= self.NEAR_PATH_END:
                        scores[correct_pair] += scores[incorrect_pair]
                    #Logic different with connectivity! we never want to connect through telomere
                    else:
                        if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]:
-                            self.logger.debug(f"Dangerous telomeric tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
+                            logging.debug(f"Dangerous telomeric tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
                        scores[incorrect_pair] = 0
-            self.logger.debug (f"telomeric tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
+            logging.debug (f"telomeric tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")

        for i in range (0, 2):
            #TODO: WIP
@@ -1209,7 +1209,7 @@ def fixOrientation(self, path_ids, scores):
                to_check_ids[1 - i] = path_ids[1 - i] + fixed_orientation
                to_check_ids[i] = path_ids[i] + orient
                shortest_paths[orient] = self.orPathIdDist(to_check_ids[0], to_check_ids[1], self.rukki_paths, True)
-                self.logger.debug(f"Checking dists {to_check_ids} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}")
+                logging.debug(f"Checking dists {to_check_ids} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}")

            if shortest_paths['-'] < min_cutoff and shortest_paths['+'] > max_cutoff:
                correct_or = "-"
@@ -1229,15 +1229,15 @@ def fixOrientation(self, path_ids, scores):
                    #just logging
                    if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]:
                        self.dangerous_swaps[(path_ids[0], path_ids[1], correct_pair)] = "connectivity"
-                        self.logger.debug(f"Dangerous connectivity tuning pair {path_ids}, i {i} scores {scores}from {incorrect_pair} to {correct_pair}")
+                        logging.debug(f"Dangerous connectivity tuning pair {path_ids}, i {i} scores {scores} from {incorrect_pair} to {correct_pair}")
                    if self.rukki_paths.getLength(path_ids[i]) <= self.NEAR_PATH_END:
                        scores[correct_pair] += scores[incorrect_pair]
                        scores[incorrect_pair] = 0
                    else:
                        if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]:
-                            self.logger.debug(f"Dangerous connectivity tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
+                            logging.debug(f"Dangerous connectivity tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
                        #Do not want to trust graph connectivity more than hi-c links, so not setting scores of incorrect links to zero. Not the same as in telomere cheat!
-                    #self.logger.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
+                    #logging.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
        #Code duplication:(
        for first_or in ('-', '+'):
            for second_or in ('-', '+'):
@@ -1246,7 +1246,7 @@ def fixOrientation(self, path_ids, scores):
                dist_paths = self.orPathIdDist(to_check[0], to_check[1], self.rukki_paths, True)
                if dist_paths < self.CLOSE_IN_GRAPH and dist_paths < self.rukki_paths.getLength(path_ids[0]) / 4 and dist_paths < self.rukki_paths.getLength(path_ids[1]) / 4:
                    scores[or_str] *= self.CONNECTIVITY_MULTIPLICATIVE_BONUS
-                    self.logger.debug(f"Connectivity bonus applied pair {to_check_ids}, scores {scores}")
+                    logging.debug(f"Connectivity bonus applied pair {to_check_ids}, scores {scores}")

        return scores

@@ -1322,9 +1322,9 @@ def getNodePairConnections(self, pair, orientations, connections, shift_before,
                    scores[str] += conn[2]
                else:
                    scores["middle"] += conn[2]
-        self.logger.debug (f"Scores for {pair} {scores} {orientations} filtered/not_filtered {filtered} {not_filtered}")
-        #self.logger.debug (f"Shifts {shift_before} {shift_after}")
-        #self.logger.debug (f"Ignored intervals {intervals}")
+        logging.debug (f"Scores for {pair} {scores} {orientations} filtered/not_filtered {filtered} {not_filtered}")
+        #logging.debug (f"Shifts {shift_before} {shift_after}")
+        #logging.debug (f"Ignored intervals {intervals}")
        return scores

    #TODO: remove, not used anymore
@@ -1397,12 +1397,12 @@ def getPathPairConnections(self, path_ids, connections, lens):
                for key in scores_pair:
                    scores[key] += scores_pair[key]
        if self.isPathPairForDebug(path_ids, self.rukki_paths):
-            self.logger.debug (f"Pathscores for {path_ids} {scores}")
+            logging.debug (f"Pathscores for {path_ids} {scores}")
        scores = self.fixOrientation(path_ids, scores)
        for orientation in scores:
            if len(orientation) == 2:
                if self.isNextByRef(path_ids[0] + orientation[0], path_ids[1] + orientation[1]):
-                    self.logger.debug (f"Reference connection found! {path_ids} {orientation}")
+                    logging.debug (f"Reference connection found! {path_ids} {orientation}")
                    scores[orientation] *= self.REFERENCE_MULTIPLICATIVE_BONUS
            scores[orientation] /= self.INT_NORMALIZATION

@@ -1418,7 +1418,7 @@ def getPathPairConnections(self, path_ids, connections, lens):
                #Never want to apply this bonus twice
                if very_long <= 1 and very_short == 0:
                    coeff = self.NEAR_PATH_END / min(self.rukki_paths.getLength(path_ids[0]), self.rukki_paths.getLength(path_ids[1]))
-                    self.logger.debug(f"Normalization is applied to {path_ids} , lens {self.rukki_paths.getLength(path_ids[0])} {self.rukki_paths.getLength(path_ids[1])} coefficent {coeff}")
+                    logging.debug(f"Normalization is applied to {path_ids}, lens {self.rukki_paths.getLength(path_ids[0])} {self.rukki_paths.getLength(path_ids[1])} coefficient {coeff}")
                    scores[orientation] *= coeff
        return scores
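Note on the self.logger-to-logging switch above: module-level logging.debug()/info()/warning() calls go to the root logger, so the messages only surface if the root logger is configured before these methods run. Below is a minimal sketch of such a setup; the handler, format, level, and file name are illustrative assumptions and are not part of this patch.

    import logging

    # Hypothetical setup, not part of this patch: route root-logger records
    # (which module-level logging.debug()/logging.info() calls emit) to a
    # file and to stderr with timestamps and severity levels.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(levelname)s: %(message)s",
        handlers=[logging.FileHandler("scaffolding.log"), logging.StreamHandler()],
    )

    logging.debug("Processed %d pore-c alignments", 10000000)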
diff --git a/src/verkko.sh b/src/verkko.sh
index 77d26676..141d95e6 100755
--- a/src/verkko.sh
+++ b/src/verkko.sh
@@ -209,7 +209,7 @@ hic2=""
 withont="False"
 withhic="False"
 withporec="False"
-withbam="False"
+withbam="True"
 withref="False"

 keepinter="True"
@@ -300,7 +300,7 @@ mbg_unitig_abundance=2
 # split_ont, partitioning ont reads for alignment
 spl_bases=3000000000
 spl_reads=150000
-spl_min_length=0
+spl_min_length=$cor_min_read

 # align_ont, alignment of ont reads to the initial graph
 ali_mxm_length=30
@@ -330,6 +330,11 @@ ruk_hap2=""
 ruk_type=""
 ruk_fract="0.9"

+#consensus
+cns_num_iter="2"
+cns_max_cov="50"
+cns_quick="False"
+
 # HiC heuristics
 haplo_divergence=0.05
 rdna_scaff="True"
@@ -414,7 +419,7 @@ ruk_time_h=4

 # create_layout
 lay_n_cpus=1
-lay_mem_gb=32
+lay_mem_gb=64
 lay_time_h=24

 # get_ont_subset
@@ -423,14 +428,14 @@ sub_mem_gb=16
 sub_time_h=24

 # partition_consensus
-par_n_cpus=8
-par_mem_gb=8
-par_time_h=24
+par_n_cpus=1
+par_mem_gb=16
+par_time_h=96

 # cns
 cns_n_cpus=8
 cns_mem_gb=0
-cns_time_h=24
+cns_time_h=48

 # align_hic stuff
 ahc_n_cpus=24
@@ -440,7 +445,7 @@ ahc_time_h=48
 # fast things in hic pipeline
 fhc_n_cpus=8
 fhc_mem_gb=16
-fhc_time_h=24
+fhc_time_h=36

 # hic scaffolding pipeline
 shc_n_cpus=8
@@ -596,12 +601,14 @@ while [ $# -gt 0 ] ; do
     elif [ "$opt" = "--correct-k-mer-size" ] ;          then mer_size=$arg;            shift
     elif [ "$opt" = "--correct-k-mer-window" ] ;        then mer_window=$arg;          shift
-    elif [ "$opt" = "--correct-k-mer-coverage" ] ;      then mer_coverage=$arg;        shift
-    elif [ "$opt" = "--correct-min-read-length" ] ;     then cor_min_read=$arg;        shift
+    elif [ "$opt" = "--correct-k-mer-coverage" ] ;      then mer_coverage=$arg;        shift
+    elif [ "$opt" = "--correct-min-read-length" ] ;     then cor_min_read=$arg; spl_min_length=$arg;  shift
     elif [ "$opt" = "--correct-min-overlap-length" ] ;  then cor_min_overlap=$arg;     shift
     elif [ "$opt" = "--correct-index-batches" ] ;       then cor_index_batches=$arg;   shift
     elif [ "$opt" = "--correct-overlap-batches" ] ;     then cor_overlap_batches=$arg; shift

+    elif [ "$opt" = "--consensus-num-iterations" ] ;    then cns_num_iter=$arg;        shift
+    elif [ "$opt" = "--consensus-max-coverage" ] ;       then cns_max_cov=$arg;         shift
     #
     #  MBG options
@@ -621,7 +628,6 @@ while [ $# -gt 0 ] ; do
     elif [ "$opt" = "--split-bases" ] ;                 then spl_bases=$arg;           shift
     elif [ "$opt" = "--split-reads" ] ;                 then spl_reads=$arg;           shift
-    elif [ "$opt" = "--min-ont-length" ] ;              then spl_min_length=$arg;      shift

     #
     #  alignONT options
@@ -652,7 +658,7 @@ while [ $# -gt 0 ] ; do
     #  Post-processing options
     #
-    elif [ "$opt" = "--consensus-bam" ] ;               then withbam="True";  lay_mem_gb=`expr $lay_mem_gb \* 2`;
+    elif [ "$opt" = "--no-consensus-bam" ] ;            then withbam="False"; lay_mem_gb=`expr $lay_mem_gb \/ 2`;
     elif [ "$opt" = "--discard-short" ] ;               then short_contig_length=$arg; shift
     elif [ "$opt" = "--screen-human-contaminants" ];    then screen="$screen ebv human-ebv-AJ507799.2.fasta.gz"
@@ -961,12 +967,16 @@ if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" -o "x$withbam" = "xTrue" ]
    elif [ ! -e "$bwa" ] ; then
        errors="${errors}Can't find BWA executable at '$bwa'.\n"
    fi
+   # first instance of consensus should be quick
+   cns_quick="True"
 elif [ "x$withporec" = "xTrue" ]; then
    if [ "x$minimap" = "x" ] ; then
        errors="${errors}Can't find MINIMAP2 executable in \$PATH or \$VERKKO/bin/minimap2.\n"
    elif [ ! -e "$minimap" ] ; then
        errors="${errors}Can't find MINIMAP2 executable at '$bwa'.\n"
    fi
+   # first instance of consensus should be quick
+   cns_quick="True"
 fi
 if [ "x$withhic" = "xTrue" -a "x$withporec" = "xTrue" ]; then
    errors="${errors}Both --hic1/--hic2 and --porec cannot be specified at the same time, please only specify one\n"
@@ -1108,7 +1118,6 @@ if [ "x$help" = "xhelp" -o "x$errors" != "x" ] ; then
    echo "ONT splitting:"
    echo "  --split-bases"
    echo "  --split-reads"
-   echo "  --min-ont-length"
    echo " "
    echo " "
    echo "GraphAligner:"
@@ -1247,6 +1256,10 @@ echo >> ${outd}/verkko.yml "ruk_hap2: '${ruk_hap2}'"
 echo >> ${outd}/verkko.yml "ruk_type: '${ruk_type}'"
 echo >> ${outd}/verkko.yml "ruk_fract: '${ruk_fract}'"
 echo >> ${outd}/verkko.yml ""
+echo >> ${outd}/verkko.yml "# consensus"
+echo >> ${outd}/verkko.yml "cns_num_iter: '${cns_num_iter}'"
+echo >> ${outd}/verkko.yml "cns_max_cov: '${cns_max_cov}'"
+echo >> ${outd}/verkko.yml "cns_quick: '${cns_quick}'"
 echo >> ${outd}/verkko.yml "# HiC algo options"
 echo >> ${outd}/verkko.yml "no_rdna_tangle: '${no_rdna_tangle}'"
 echo >> ${outd}/verkko.yml "uneven_depth: '${uneven_depth}'"
@@ -1527,6 +1540,7 @@ if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" ] ; then
 fi

 cd $newoutd
+sed -i.bak -E "s/^(cns_quick:[[:space:]]*)'True'/\1'False'/" verkko.yml
 sed -i.bak 's/runRukkiHIC/cnspath/g' snakemake.sh
 sed -i.bak 's/HiC_rdnascaff/cnspath/g' snakemake.sh
 ./snakemake.sh
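The sed line added in the last hunk flips cns_quick from 'True' back to 'False' in the second-stage verkko.yml, so only the first consensus pass (before Hi-C or Pore-C scaffolding) runs in quick mode. A rough Python rendering of that substitution is shown below, purely as an illustration of what the regular expression does; the pipeline itself uses the sed call from the diff.

    import re

    def disable_quick_consensus(yml_text: str) -> str:
        # Mirrors: sed -E "s/^(cns_quick:[[:space:]]*)'True'/\1'False'/"
        # Keeps the "cns_quick: " prefix (group 1) and swaps the quoted value.
        return re.sub(r"^(cns_quick:\s*)'True'", r"\1'False'", yml_text, flags=re.MULTILINE)

    print(disable_quick_consensus("cns_quick: 'True'\ncns_max_cov: '50'\n"))
    # -> cns_quick: 'False'
    #    cns_max_cov: '50'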