From ccc91709ed35ea8d46491600f3218b610849a485 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 30 Jul 2025 16:09:24 -0400 Subject: [PATCH 01/19] Phasing reworekd --- src/scripts/cluster.py | 777 ++++++++++++++++--------- src/scripts/scaffolding/logger_wrap.py | 10 +- src/scripts/scaffolding/match_graph.py | 8 +- 3 files changed, 515 insertions(+), 280 deletions(-) diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index 49827edf..be5655ad 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -1,21 +1,48 @@ +import logging import sys import random import networkx as nx import math import os -import graph_functions -import copy from networkx.algorithms import community import graph_functions as gf from scaffolding import match_graph, logger_wrap + +#TODO: move constants to some more relevant place +MIN_LEN = 200000 # best result so far with 200000 + +#likely do not need +#MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless + +KLIN_STARTS = 30 # number of different starts of kernighan lin +KLIN_ITER = 10000 # number of iterations inside kernighan lin + +MAX_SHORT_COMPONENT = 100 # we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than MAX_SHORT_COMPONENT +MAX_SHORT_LEN=200000 #we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than +MIN_WEIGHT = 10 # Ignore edges with few links +SIGNIFICANT_MAJORITY = 2.5 + +RESULT_FILENAME = "hicverkko.colors.tsv" +FILTERED_GFA_FILENAME = "filtered.gfa" +HIC_COMPRESSED_FILENAME = "hic.byread.compressed" + +CLEAR_HOMOLOGY = 250000 #for clear bulge like structures this limmit is decreased +MIN_ALIGNMENT = 50000 #smaller alingments will be filtered out +MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa +FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets + +MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only +MIN_RDNA_COMPONENT = 500000 +logger = logger_wrap.initLogger("phasing.log") + + def check_non_empty(part, G): for p in part: if p in G.nodes: return True return False - #Currently not in use anymore def IsTip(node, edges): for pref in ['>', '<']: @@ -32,7 +59,6 @@ def IsTip(node, edges): return True return False - #collapse a node, add links from def collapseOrientedNode (edges, node): for pref in ['>', '<']: @@ -99,12 +125,12 @@ def checkXYcomponent(current_component, matchgraph, G, edges): if len(fake_hom) == 0 or fake_hom[-1][0] != f_node or fake_hom[-1][1] != s_node: fake_hom.append([f_node, s_node]) if len(fake_hom) > 1: - sys.stderr.write("MULTIPLE CANDIDATES FOR FAKE HOMOLOGY IN XY\n") + logger.warning("MULTIPLE CANDIDATES FOR FAKE HOMOLOGY IN XY\n") for pair in fake_hom: - sys.stderr.write(f"{pair[0]} --- {pair[1]}\n") + logger.warning(f"{pair[0]} --- {pair[1]}\n") return [0, 0] elif len(fake_hom) == 1: - sys.stderr.write(f"Adding FAKE HOMOLOGY between {fake_hom[0][0]} --- {fake_hom[0][1]}\n") + logger.info(f"Adding FAKE HOMOLOGY between {fake_hom[0][0]} --- {fake_hom[0][1]}") return [fake_hom[0][0], fake_hom[0][1]] return [0, 0] @@ -159,57 +185,389 @@ def getMedianCov(nodeset): sum_l = 0 sorted_nodes = sorted(nodeset, key=lambda node: node[1]['coverage']) for node in sorted_nodes: - print (f'Node {node[0]} coverage {node[1]["coverage"]} length {node[1]["length"]}') + logger.debug(f'Node {node[0]} coverage {node[1]["coverage"]} length 
{node[1]["length"]}') sum_l += node[1]['length'] if 2*sum_l > total_length: med_cov = node[1]['coverage'] - print (f'Median coverage is {med_cov}\n') + logger.info(f'Median coverage is {med_cov}') break return med_cov -def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, uneven_depth): - #TODO: move constants to some more relevant place - MIN_LEN = 200000 # best result so far with 200000 - - #likely do not need - #MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless - - KLIN_STARTS = 1000 # number of different starts of kernighan lin - KLIN_ITER = 10000 # number of iterations inside kernighan lin - - MAX_SHORT_COMPONENT = 100 # we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than MAX_SHORT_COMPONENT - MAX_SHORT_LEN=200000 #we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than - MIN_WEIGHT = 10 # Ignore edges with few links - SIGNIFICANT_MAJORITY = 2.5 - - RESULT_FILENAME = "hicverkko.colors.tsv" - FILTERED_GFA_FILENAME = "filtered.gfa" - HIC_COMPRESSED_FILENAME = "hic.byread.compressed" - LOGGING_FILENAME = "hicverkko.log" +def report_phased_node(node_id, hap, output_file): + if hap == 1: + output_file.write(f'{node_id}\t0\t100000\t0:100000\t#8888FF\n') + elif hap == 0: + output_file.write(f'{node_id}\t100000\t0\t100000:0\t#FF8888\n') + +def create_initial_partition_noKL(C, matchGraph, seed): + random.seed(seed) + neg_graph = nx.Graph() + for ec in matchGraph.edges(): + if ec[0] in C and ec[1] in C: + if matchGraph.edges[ec]['weight'] < 0: + for i in range (2): + if not ec[i] in neg_graph.nodes(): + neg_graph.add_node(ec[i]) + neg_graph.add_edge(ec[0], ec[1]) + elif (ec[0] in C or ec[1] in C) and matchGraph.edges[ec]['weight'] < 0: + logger.warning(f"Weird, edge {ec} partially in component{C}") + #exit(1) + dists = dict(nx.all_pairs_shortest_path_length(neg_graph)) + #friends - should always be in the same component + #enemies - should always be in the opposite + friends = {} + enemies = {} + for n in C: + if n in neg_graph: + friends[n] = [] + enemies[n] = [] + for other in dists[n]: + if other != n: + #TODO: possibly random starts needed? 
+ if dists[n][other] % 2 == 0: + friends[n].append(other) + else: + enemies[n].append(other) + logger.debug(f"Friends of {n}: {friends[n]}") + logger.debug(f"Enemies of {n}: {enemies[n]}") - CLEAR_HOMOLOGY = 500000 #for clear bulge like structures this limmit is decreased - MIN_ALIGNMENT = 100000 #smaller alingments will be filtered out - MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa - FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets + parts = [set(), set()] + swap_together = {} - MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only - MIN_RDNA_COMPONENT = 500000 - logger = logger_wrap.initLogger("phasing.log") + #TODO: place for randomization + + C_nodes = list(C.nodes()) + for n in C_nodes: + #lets remove non-homologous nodes + if not n in enemies and n.find("Aux") == -1: + C.remove_node(n) + logger.debug(f"Removing node {n}") + else: + swap_together[n] = [n] + start_component = random.randint(0, 1) + if not n in parts[0] and not n in parts[1]: + parts[start_component].add(n) + if n in friends: + for f in friends[n]: + parts[start_component].add(f) + for f in enemies[n]: + parts[1 - start_component].add(f) + + for n in friends: + for f in friends[n]: + swap_together[n].append(f) + for f in enemies[n]: + swap_together[n].append(f) + + return parts, swap_together + + + +def optimize_partition(C, parts, swap_together): + cur_score = score_partition(C, parts) + #TODO: without randomizations iterations are useless. + MAX_ITERATIONS = 5 + not_changed = 0 + all_nodes = [] + for i in range (2): + for j in parts[i]: + all_nodes.append(j) + all_nodes.sort() + while not_changed < MAX_ITERATIONS: + for n in all_nodes: + + score_change = score_swap(C, parts, swap_together[n]) + logger.debug(f"Trying to move {n} and its neighbors {swap_together[n]}, old_score {cur_score}, swap score {score_change}") + if score_change > 0: + upd_score = cur_score + score_change + for swap_node in swap_together[n]: + for j in range (2): + if swap_node in parts[j]: + parts[j-1].add(swap_node) + parts[j].remove(swap_node) + break + not_changed = 0 + logger.debug(f"Moved {n} and its neighbors {swap_together[n]},improving score") + not_changed += 1 + + return parts + + + +#TODO: rewrite, leaving old version for better tests +def create_initial_partition(C, matchGraph): + p1 = [] + p2 = [] + for n in sorted(C.nodes()): + if n in matchGraph and len(matchGraph.edges(n)) > 0: + #TODO: replace with something reasonable! multiple nodes will not work here. Transform each homology component into one node before?.. Anyway, it still should be fixed later with K-L iterations. 
+ #TODO: distance from starting vertex in match graph % 2 + for ec in matchGraph.edges(n): + if matchGraph.edges[ec]['weight'] >= 0: + continue + if ec[0] == n and ec[1] in p1: + if n not in p1: + p2.append(n) + elif ec[0] == n and ec[1] in p2: + if n not in p2: + p1.append(n) + elif ec[1] == n and ec[0] in p1: + if n not in p1: + p2.append(n) + elif ec[1] == n and ec[0] in p2: + if n not in p2: + p1.append(n) + elif n not in p1 and n not in p2: + if random.random() <= 0.5: + p1.append(n) + else: + p2.append(n) + + if n not in p1 and n not in p2: + if random.random() <= 0.5: + p1.append(n) + else: + p2.append(n) + else: + if random.random() <= 0.5: + p1.append(n) + else: + p2.append(n) +# print("Initial partitions are %s and %s" % (set(p1), set(p2))) + if len(p1) * len(p2) > 0: +# logger.info (f"{len(p1)} {len(p2)} {len(C.nodes())}") + #Debug for weird crashes + inter = list(set(p1).intersection(p2)) + missing = set(C.nodes()) - set(p1) - set(p2) + if len(inter) > 0: + logger.info (f"Intersecting {inter}") + if len(missing) > 0: + logger.info (f"Missing {missing}") + return [set(p1), set(p2)] + +def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists): + + C = nx.Graph() + # rebuild the graph from this component using only edges in the hic graph + C.add_nodes_from(current_component) + + # first we ignore any nodes that are too short or have too few links. + short = [] + + not_to_big = [] + for n in C.nodes(): + logging.debug(n) + if G.nodes[n]['coverage'] < MAX_COV: + not_to_big.append((n, G.nodes[n])) + local_max_cov = 1.5 * getMedianCov(not_to_big) + if local_max_cov < 0: + local_max_cov = MAX_COV + for n in C.nodes(): + if n not in G: + sys.stderr.write("Error, got a node not in original graph %s !" % (n)) + sys.exit() + if G.nodes[n]['coverage'] > local_max_cov and (not uneven_depth): + logger.debug("While partitoning dropping node %s coverage too high" % (n)) + short.append(n) + elif not (n in matchGraph) and uneven_depth: + logger.info("While partitoning dropping node %s uneven coverage and no matches" % (n)) + short.append(n) + elif G.nodes[n]['length'] < MIN_LEN: + collapseOrientedNode(edges, n) + short.append(n) + else: + good = False + for e in hicGraph.edges(n): + if (e[0] != n or e[1] != n) and hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT \ + and (e[0] in C and e[1] in C): + good = True + #Let's phase homologous nodes randomly then + if n in matchGraph and len(matchGraph.edges(n)) > 0: + for ec in matchGraph.edges(n): + if matchGraph.edges[ec]['weight'] < 0: + good = True + break + + if not good: + logger.info("While partitoning dropping node %s low links count" % (n)) + short.append(n) - # load the assembly gfa - G = nx.Graph() - logging_f = open (os.path.join(output_dir, LOGGING_FILENAME), 'w') - graph_functions.load_indirect_graph(graph_gfa, G) + C.remove_nodes_from(short) + logger.info(f'Currently {C.number_of_nodes()} nodes') + # Adding some auxilary vertices to allow slightly unbalanced partitions + # sqrt/2 is quite arbitrary and subject to change + aux_nodes = int(math.sqrt(C.number_of_nodes() // 2)) + if C.number_of_nodes() <= 3: + aux_nodes = 0 + for i in range(0, aux_nodes): + C.add_node("Aux" + str(i)) + for e in hicGraph.edges(current_component): + # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) + if e[0] in C and e[1] in C and matchGraph.get_edge_data(e[0], e[1]) == None: + # if edges are too distant 
in graph, hi-c info is trash + # using homologous edges when counting distances to help with coverage gaps + similar_edges = [set(), set()] + for ind in range(0, 2): + for match_edge in matchGraph.edges(e[ind]): + similar_edges[ind].add(match_edge[0]) + similar_edges[ind].add(match_edge[1]) + similar_edges[ind].add(e[ind]) +#TODO: likely this is not needed anymore since we already added links between homologous edges. + C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) + #Possibly we'll need to restore tis for backcomp +# for e0like in similar_edges[0]: +# for e1like in similar_edges[1]: +# if e0like in dists and e1like in dists[e0like]: #and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']: +# C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) +# break + logger.info(f'Currently {C.number_of_nodes()} in current component') + logger.debug(C.nodes()) + if C.number_of_nodes() > 1: + for u, v, w in matchGraph.edges.data("weight"): + if u in C and v in C: + if w != None: + C.add_edge(u, v, weight=w) + res = checkXYcomponent(current_component, matchGraph, G, edges) + if res != [0, 0]: + sys.stderr.write(f"XY found, {res[0]} {res[1]}, adding fake links\n") + if res[0] in C and res[1] in C: + C.add_edge(res[0], res[1], weight=-10 * FIXED_WEIGHT) + C.add_edge(res[1], res[0], weight=-10 * FIXED_WEIGHT) + for edge in C.edges: + logger.debug(f'HIC edge: {edge} {C.edges[edge]}') + + return C + +def score_partition(C, partition): + sum_w = 0 + for part_id in range(0, 2): + for i in partition[part_id]: + for j in partition[part_id]: + if [i, j] in C.edges(): + sum_w += C.edges[i, j]['weight'] + if C.edges[i, j]['weight'] < 0: + + logger.error(f"Negative edge {i} {j} with weight {C.edges[i, j]['weight']} IN THE SAME PARTITION") + exit() + return sum_w +def score_swap(C, partition, to_swap): + upd = 0 + for n in to_swap: + for i in range (2): + if n in partition[i]: + for neighbour in C.neighbors(n): + if not neighbour in to_swap: + if neighbour in partition[i]: + upd -= C.edges[n, neighbour]['weight'] + elif neighbour in partition[1 - i]: + upd += C.edges[n, neighbour]['weight'] + return upd + +def min_nor_dist(n1, n2, dists): + res = 10000000000 + for or1 in ['-', '+']: + for or2 in ['-', '+']: + or_n1 = n1 + or1 + or_n2 = n2 + or2 + if or_n1 in dists and or_n2 in dists[or_n1]: + res = min (res, dists[or_n1][or_n2]) + return res + +def comnponent_size (current_component, G): + total_len = 0 + for n in current_component: + #oriented or not we do not care + if n in G.nodes(): + total_len += G.nodes[n]['length'] + elif n + '+' in G.nodes(): + total_len += G.nodes[n + '+']['length'] + return total_len + +def is_duplication_like(n1, n2, dists, or_G): + CLOSE_SUSP_HOMOLOGY_DIST = 5000000 + if or_G.nodes[n1 + '+']['length'] > CLOSE_SUSP_HOMOLOGY_DIST and or_G.nodes[n2 + '+']['length'] > CLOSE_SUSP_HOMOLOGY_DIST: + return + for or1 in ['-', '+']: + for or2 in ['-', '+']: + or_n1 = n1 + or1 + or_n2 = n2 + or2 + if or_n1 in dists and or_n2 in dists[or_n1]: + dist = (dists[or_n1][or_n2] - or_G.nodes[or_n1]['length'] - or_G.nodes[or_n2]['length']) / 2 + if dist < CLOSE_SUSP_HOMOLOGY_DIST: + if or_n2 in dists and or_n1 in dists[or_n2] and dists[or_n2][or_n1] < CLOSE_SUSP_HOMOLOGY_DIST: + logger.info(f"Nodes {or_n1} and {or_n2} from haploid component in loop, {dist}") + else: + logger.info(f"Nodes {or_n1} and {or_n2} one direction close {dist} but not another") + return True + return False +def clear_links(node_pairs, mg): + for pair in node_pairs: + 
mg.matchGraph.remove_edge(pair[0], pair[1]) + +def remove_pseudo_homology(current_component, or_G, dists, mg): + #we never remove true homology between really large nodes + #max 1/3 among all homologies can be cleared + MAX_UNDIPLOID_FRACTION = 3 + + total_len = comnponent_size(current_component, or_G) + clear_diploids = [] + total_homology_length = 0 + clear_homology_length = 0 + for n1 in current_component: + for n2 in current_component: + if n1 < n2 and mg.isHomologousNodes(n1, n2, True): + total_homology_length += mg.getHomologyLength(n1, n2) + if is_duplication_like(n1, n2, dists, or_G): + clear_diploids.append([n1, n2]) + clear_homology_length += mg.getHomologyLength(n1, n2) + if (mg.isDiploid(current_component)): + if clear_homology_length * MAX_UNDIPLOID_FRACTION < total_homology_length: + if clear_homology_length > 0: + logger.info(f"Cleaning diploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + clear_links(clear_diploids, mg) + else: + logger.info(f"NOT cleaning diploid component {clear_diploids}, too high fraction. hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + else: + if clear_homology_length > 0: + logger.info(f"Cleaning haploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + if clear_homology_length == total_homology_length: + return "no_diploid" + return "diploid" + +def process_haploid_component(current_component, G, tsv_file, haploid_component_count): + haploid_component_count += 1 + haplotype = haploid_component_count % 2 + nodes_to_report = [] + for n in sorted(current_component): + if G.nodes[n]['length'] > MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: + nodes_to_report.append(n) + #TODO: possibly total length condition? Better not assign colors to distal bits... + if len(nodes_to_report) > 1: + for n in nodes_to_report: + report_phased_node(n, haplotype, tsv_file) + +def only_aux(C): + for n in C.nodes(): + if not n.startswith('Aux'): + return False + return True + +def output_graph_stats(G): degrees = [val for (node, val) in G.degree()] mean = sum(degrees) / G.number_of_nodes() variance = sum([((x - mean) ** 2) for x in degrees]) / G.number_of_nodes() res = variance ** 0.5 - sys.stderr.write("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f\n" % ( + logger.info("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f" % ( G.number_of_nodes(), G.number_of_edges(), mean, res, mean + 5 * res)) +def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, uneven_depth): + G = nx.Graph() + gf.load_indirect_graph(graph_gfa, G) + output_graph_stats(G) delete_set = set() largest_component = max(nx.connected_components(G), key=len) + cur_col = 0 old_colors = {} for current_component in sorted(nx.connected_components(G), key=len, reverse=True): @@ -218,13 +576,15 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une cur_col += 1 #TODO: no_rdna is false, so efficiently we are still removing rDNA component. Should we? 
+ ''' if not (no_rdna): #Store rDNA component, not to add additional links from matchGraph #sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n") #Here we remove large connected components of short edge, just to exclude rDNA cluster - delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT) + delete_set = gf.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT) else: sys.stderr.write(f"Not using rDNA component removal heuristics\n") + ''' filtered_graph = open(os.path.join(output_dir, FILTERED_GFA_FILENAME), 'w') tsv_output = os.path.join(output_dir, RESULT_FILENAME) tsv_file = open(tsv_output, 'w') @@ -247,6 +607,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une #TODO: only place which used it is likely outdated #Also we have special function... nodelines = [] + or_G = nx.DiGraph() + gf.load_direct_graph(graph_gfa, or_G) edges = {} for l in open(graph_gfa, 'r'): parts = l.strip().split('\t') @@ -263,26 +625,19 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une edges[fromnode].add(tonode) edges[gf.revnode(tonode)].add(gf.revnode(fromnode)) - -#let's find that nodes that have multiple extension - multiple_ext = set() - for node in edges: - if len(node) > 1: - multiple_ext.add(node) #dirty calculation median coverage, considering that most of the phasing is done - sorted_nodes = sorted(G.nodes(data=True), key=lambda node: node[1]['length']) med_cov = getMedianCov(G.nodes(data=True)) MAX_COV = med_cov * 1.5 if (uneven_depth): - sys.stderr.write(f"Will not use coverage based homozygous nodes detection\n") + logger.info(f"Will not use coverage based homozygous nodes detection") else: - sys.stderr.write(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}\n") + logger.info(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}") # load hic connections based on mappings, weight corresponds to number of times we see a connection - hicGraph = graph_functions.loadHiCGraph(hic_byread) + hicGraph = gf.loadHiCGraph(hic_byread) compressed_file = open(os.path.join(output_dir, HIC_COMPRESSED_FILENAME), 'w') max_w = 0 for node1, node2 in hicGraph.edges(): @@ -290,7 +645,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une max_w = hicGraph[node1][node2]["weight"] compressed_file.write(f'X {node1} {node2} {hicGraph[node1][node2]["weight"]}\n') FIXED_WEIGHT = max_w - sys.stderr.write(f'Constant for neighbouring edges set to be {FIXED_WEIGHT} (but not used), for homologous edges {-10 * FIXED_WEIGHT} \n') + logger.info(f'Constant for homologous edges: {-10 * FIXED_WEIGHT}') #Adding link between matched edges to include separated sequence to main component @@ -298,252 +653,124 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une mg = match_graph.MatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) matchGraph = mg.getMatchGraph() - component_colors = graph_functions.getComponentColors(G) + component_colors = gf.getComponentColors(G) #reconnecting homologous nodes for [v1,v2] in matchGraph.edges(): if v1 in G.nodes and v2 in G.nodes and matchGraph[v1][v2]['weight'] < 0: if component_colors[v1] != component_colors[v2]: - logging_f.write(f"Adding graph link between homologous {v1} {v2}, components {component_colors[v1]} and {component_colors[v2]}\n") + logger.info(f"Adding 
graph link between homologous {v1} {v2}, components {component_colors[v1]} and {component_colors[v2]}") G.add_edge(v1, v2) - sys.stderr.write("Loaded hic info with %d nodes and %d edges\n" % (hicGraph.number_of_nodes(), hicGraph.number_of_edges())) + logger.info(f"Loaded hic info with {hicGraph.number_of_nodes()} nodes and {hicGraph.number_of_edges()} edges") compressed_file.close() dists = dict(nx.all_pairs_dijkstra_path_length(G, weight=lambda u, v, d: G.nodes[v]['length'])) - sys.stderr.write("Distances counted\n") + logger.info("Distances counted") # connected components decomposition and log the IDs and partition each one # currently not going to do the right thing on rDNA component + haploid_component_count = 0 + dists = dict(nx.all_pairs_dijkstra_path_length(or_G, weight=lambda u, v, d: or_G.edges[u, v]['mid_length'])) for current_component in sorted(nx.connected_components(G), key=len, reverse=True): - logging_f.write("Connected component with %d nodes is: %s\n" % (len(current_component), current_component)) + logger.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) + + cleared = remove_pseudo_homology(current_component, or_G, dists, mg) + #only backward compatibility with KL + if not mg.isDiploid(current_component): + haploid_component_count += 1 + if cleared == "no_diploid": + #TODO: backwards compatibility not to change distal bits, to_remove + #if component_size(current_component, G) > 10000000: + process_haploid_component(current_component, G, tsv_file, haploid_component_count) + continue - total_l = 0 - for n in current_component: - total_l += G.nodes[n]['length'] - #TODO: currently just a debug message! - # we likely need to skip phasing for non-diploids and assign the same color for the component, but this can cause troubles for short arms until we stop removing rDNA tangle. - if total_l > 1000000 and not mg.isDiploid(current_component): - logging_f.write(f"Component is not diploid!\n") - - - C = nx.Graph() - # rebuild the graph from this component using only edges in the hic graph - C.add_nodes_from(current_component) - - # first we ignore any nodes that are too short or have too few links. - short = [] - - not_to_big = [] - for n in C.nodes(): - print (n) - if G.nodes[n]['coverage'] < MAX_COV: - not_to_big.append((n, G.nodes[n])) - local_max_cov = 1.5 * getMedianCov(not_to_big) - if local_max_cov < 0: - local_max_cov = MAX_COV - for n in C.nodes(): - if n not in G: - sys.stderr.write("Error got a node not in original graph %s !" 
% (n)) - sys.exit() - if G.nodes[n]['coverage'] > local_max_cov and (not uneven_depth): - logging_f.write("While partitoning dropping node %s coverage too high\n" % (n)) - short.append(n) - elif not (n in matchGraph) and uneven_depth: - logging_f.write("While partitoning dropping node %s uneven coverage and no matches\n" % (n)) - short.append(n) - elif G.nodes[n]['length'] < MIN_LEN: + C = create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists) - collapseOrientedNode(edges, n) - short.append(n) + #TODO: cycle for randomized starts + #optimize_partition(C, parts, swap_together) + + best_score = - FIXED_WEIGHT * C.number_of_nodes() * C.number_of_nodes() + best_part = [set(), set()] + + for seed in range(0, KLIN_STARTS): # iterate on starting partition + random.seed(seed) + parts, swap_together = create_initial_partition_noKL(C, matchGraph, seed) + if only_aux(C): + logger.info("Only aux nodes left in component, skipping") + continue + part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) + logger.debug(f"init_part:\n{sorted(part[0])}\n{sorted(part[1])}") + + sum_w = score_partition(C, part) + + if (sum_w > best_score): + logger.info(f'Seed {seed} score {sum_w} improved over {best_score}') + best_part = part + best_score = sum_w + logger.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) +# best_part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) +# logger.debug(f"parts:\n{sorted(parts[0])}\n{sorted(parts[1])}") + #logger.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + """ + """ + add_part = [set(), set()] + only_weights = {} + for n in current_component: +# if G.nodes[n]['length'] < MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: + if (not n in C.nodes()) and G.nodes[n]['coverage'] < MAX_COV: - else: - good = False + weights = [0, 0] + all_weights = [0, 0] + total_w = 0 for e in hicGraph.edges(n): - - if (e[0] != n or e[1] != n) and hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT \ - and (e[0] in C and e[1] in C): - good = True - #Let's phase homologous nodes randomly then - if n in matchGraph and len(matchGraph.edges(n)) > 0: - for ec in matchGraph.edges(n): - if matchGraph.edges[ec]['weight'] < 0: - good = True - break - - if not good: - logging_f.write("While partitoning dropping node %s low links count\n" % (n)) - short.append(n) - - C.remove_nodes_from(short) - logging_f.write(f'Currently {C.number_of_nodes()} nodes\n') - # Adding some auxilary vertices to allow slightly unbalanced partitions - # sqrt/2 is quite arbitrary and subject to change - aux_nodes = int(math.sqrt(C.number_of_nodes() // 2)) - if C.number_of_nodes() <= 3: - aux_nodes = 0 - for i in range(0, aux_nodes): - C.add_node("Aux" + str(i)) - for e in hicGraph.edges(current_component): - # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) - if e[0] in C and e[1] in C and matchGraph.get_edge_data(e[0], e[1]) == None: - # if edges are too distant in graph, hi-c info is trash - # using homologous edges when counting distances to help with coverage gaps - similar_edges = [set(), set()] - for ind in range(0, 2): - for match_edge in matchGraph.edges(e[ind]): - 
similar_edges[ind].add(match_edge[0]) - similar_edges[ind].add(match_edge[1]) - similar_edges[ind].add(e[ind]) - #TODO: likely this is not needed anymore since we already added links between homologous edges. - for e0like in similar_edges[0]: - for e1like in similar_edges[1]: - if e0like in dists and e1like in dists[e0like]: #and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']: - C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) - break - logging_f.write(f'Currently {C.number_of_nodes()} in current component\n') - - - if C.number_of_nodes() > 1: - for u, v, w in matchGraph.edges.data("weight"): - if u in C and v in C: - if w != None: - C.add_edge(u, v, weight=w) - - - - res = checkXYcomponent(current_component, matchGraph, G, edges) - if res != [0, 0]: - sys.stderr.write(f"XY found, {res[0]} {res[1]}, adding fake links\n") - if res[0] in C and res[1] in C: - C.add_edge(res[0], res[1], weight=-10 * FIXED_WEIGHT) - C.add_edge(res[1], res[0], weight=-10 * FIXED_WEIGHT) - for edge in C.edges: - logging_f.write(f'HIC edge: {edge} {C.edges[edge]}\n') - - - best_score = FIXED_WEIGHT * C.number_of_nodes() * C.number_of_nodes() - - - - - for seed in range(0, KLIN_STARTS): # iterate on starting partition - random.seed(seed) - p1 = [] - p2 = [] - for n in sorted(C.nodes()): - if n in matchGraph and len(matchGraph.edges(n)) > 0: - #TODO: replace with something reasonable! multiple nodes will not work here. Transform each homology component into one node before?.. Anyway, it still should be fixed later with K-L iterations. - for ec in matchGraph.edges(n): - if matchGraph.edges[ec]['weight'] >= 0: - continue - if ec[0] == n and ec[1] in p1: - if n not in p1: - p2.append(n) - elif ec[0] == n and ec[1] in p2: - if n not in p2: - p1.append(n) - elif ec[1] == n and ec[0] in p1: - if n not in p1: - p2.append(n) - elif ec[1] == n and ec[0] in p2: - if n not in p2: - p1.append(n) - elif n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - - if n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - else: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - # print("Initial partitions are %s and %s" % (set(p1), set(p2))) - if len(p1) * len(p2) > 0: -# logging_f.write (f"{len(p1)} {len(p2)} {len(C.nodes())}\n") - #Debug for weird crashes - inter = list(set(p1).intersection(p2)) - missing = set(C.nodes()) - set(p1) - set(p2) - if len(inter) > 0: - logging_f.write (f"Intersecting {inter}\n") - if len(missing) > 0: - logging_f.write (f"Missing {missing}\n") - - part = community.kernighan_lin.kernighan_lin_bisection(C, partition=[set(p1), set(p2)], max_iter=KLIN_ITER, - weight='weight', seed=seed) - sum_w = 0 - for i in part[0]: - for j in part[1]: - if [i, j] in C.edges(): - # print (f'{i} {j} edge {C.edges[i,j]}') - sum_w += C.edges[i, j]['weight'] - - if (sum_w < best_score): - #lets forbid Aux only components - if check_non_empty(part[0], G) and check_non_empty(part[1], G): - logging_f.write(f'Seed {seed} score {sum_w} improved over {best_score}\n') - best_part = part - best_score = sum_w - #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) - fixUnbalanced(best_part, C, G) - logging_f.write(f'RES\t{best_part}\n') - - add_part = [set(), set()] - only_weights = {} - for n in current_component: - if G.nodes[n]['length'] < MIN_LEN and G.nodes[n]['coverage'] < MAX_COV: - weights = [0, 0] - all_weights = [0, 0] - total_w = 0 - 
for e in hicGraph.edges(n): - if (e[0] != n or e[1] != n) and (e[0] in current_component and e[1] in current_component): - logging_f.write(f'DEBUG: edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}\n') - for ind in range(0, 2): - if e[0] in best_part[ind] or e[1] in best_part[ind]: - if hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT: - logging_f.write(f'DEBUG: {e} edge to partition {ind}!\n') - weights[ind] += hicGraph[e[0]][e[1]]["weight"] - all_weights[ind] += hicGraph[e[0]][e[1]]["weight"] - total_w += hicGraph[e[0]][e[1]]["weight"] - if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: - add_part[0].add(n) - logging_f.write(f"Edge {n} assigned color 0\n") - elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: - add_part[1].add(n) - logging_f.write(f"Edge {n} assigned color 1\n") - else: - logging_f.write(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}\n") - logging_f.write(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}\n") - if (all_weights[0] > SIGNIFICANT_MAJORITY * all_weights[1] or all_weights[1] > SIGNIFICANT_MAJORITY * all_weights[0]) and (all_weights[0] > MIN_WEIGHT or all_weights[1]> MIN_WEIGHT): - logging_f.write(f"Edge {n} real weights allows to make a decision forbidden by weights!\n") + if (e[0] != n or e[1] != n) and (e[0] in current_component and e[1] in current_component): + #logger.debug(f'edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}') + for ind in range(0, 2): + if e[0] in best_part[ind] or e[1] in best_part[ind]: + if hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT: + logger.debug(f'{e} edge to partition {ind}!') + weights[ind] += hicGraph[e[0]][e[1]]["weight"] + all_weights[ind] += hicGraph[e[0]][e[1]]["weight"] + total_w += hicGraph[e[0]][e[1]]["weight"] + if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: + add_part[0].add(n) + logger.info(f"Edge {n} assigned color 0") + elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: + add_part[1].add(n) + logger.debug(f"Edge {n} assigned color 1") + else: + logger.debug(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}") + logger.debug(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}") + if (all_weights[0] > SIGNIFICANT_MAJORITY * all_weights[1] or all_weights[1] > SIGNIFICANT_MAJORITY * all_weights[0]) and (all_weights[0] > MIN_WEIGHT or all_weights[1]> MIN_WEIGHT): + logger.debug(f"Edge {n} real weights allows to make a decision forbidden by weights!") + if G.nodes[n]['length'] < MIN_LEN: only_weights[n] = all_weights - elif G.nodes[n]['length'] < MIN_LEN: - logging_f.write(f"Edge {n} did not pass coverage check\n") - - logging_f.write(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition\n') - logging_f.write(f'RES\t{add_part}\n') - best_part[0].update(add_part[0]) - best_part[1].update(add_part[1]) - right = False - for ind in range (0, 2): - for contig in sorted(best_part[ind]): - if contig in current_component: - if ind == 1: - tsv_file.write(f'{contig}\t0\t100000\t0:100000\t#8888FF\n') else: - tsv_file.write(f'{contig}\t100000\t0\t100000:0\t#FF8888\n') - for contig in sorted(only_weights.keys()): - tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') + if all_weights[0] > all_weights[1]: + add_part[0].add(n) + logger.debug(f"Edge {n} assigned color 0 by additional all_weights") + else: + 
add_part[1].add(n) + logger.debug(f"Edge {n} assigned color 1 by additional all_weights") + elif G.nodes[n]['length'] < MIN_LEN: + logger.debug(f"Edge {n} did not pass coverage check") + + logger.info(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition') + logger.info(f'RES\t{add_part}') + best_part[0].update(add_part[0]) + best_part[1].update(add_part[1]) + for ind in range (0, 2): + for contig in sorted(best_part[ind]): + #filtering out Aux nodes + if contig in current_component: + report_phased_node(contig, ind, tsv_file) + for contig in sorted(only_weights.keys()): + tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') if __name__ == "__main__": diff --git a/src/scripts/scaffolding/logger_wrap.py b/src/scripts/scaffolding/logger_wrap.py index 82eda519..6941ba9e 100755 --- a/src/scripts/scaffolding/logger_wrap.py +++ b/src/scripts/scaffolding/logger_wrap.py @@ -27,8 +27,10 @@ def process(self, msg, kwargs): def initLogger(log_file): logger = logging.getLogger('HiCPipeline') - # self.logger = ClassNameLoggerAdapter(logger, self.__class__.__name__) - + + # Clear any existing handlers to prevent duplication and unexpected behavior + logger.handlers.clear() + logger.setLevel(logging.DEBUG) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) @@ -39,4 +41,8 @@ def initLogger(log_file): file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(formatter) logger.addHandler(file_handler) + + # Prevent propagation to root logger (which might have its own handlers) + logger.propagate = False + return logger \ No newline at end of file diff --git a/src/scripts/scaffolding/match_graph.py b/src/scripts/scaffolding/match_graph.py index 233aaaf2..69de528a 100755 --- a/src/scripts/scaffolding/match_graph.py +++ b/src/scripts/scaffolding/match_graph.py @@ -186,7 +186,7 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) - #Do not want to think whether we use diGraph or Graph + #Do not want to think whether we use diGraph or Graph, internal structures are not oriented neighbours = {} indirect_nodes = set() for node in G.nodes(): @@ -199,7 +199,6 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm for node1 in self.hom_storage.homologies: for node2 in self.hom_storage.homologies[node1]: #we deleted some nodes after mashmap - #sys.stderr.write(f"Checking {node1} {node2} {self.hom_storage.homologies[node1][node2].getMinCovered()} {min_big_homology}\n") if node1 in indirect_nodes and node2 in indirect_nodes: cur_homology = self.hom_storage.homologies[node1][node2].getCoveredLen() if cur_homology > min_big_homology: @@ -247,7 +246,7 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm else: self.matchGraph.remove_edge(ec[0],ec[1]) - self.logger.info("Loaded match info with %d nodes and %d edges\n" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) + self.logger.info("Loaded match info with %d nodes and %d edges" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) for d in sorted(self.matchGraph.edges()): self.logger.debug(f"homology edge {d} : {self.matchGraph.edges[d]}") @@ -276,6 +275,9 @@ def hasEdge(self, node1, node2): def getEdgeAttribute(self, node1, node2, attr): return self.matchGraph.edges[node1, node2][attr] + def getHomologyLength(self, 
node1, node2): + return self.getEdgeAttribute(node1, node2, 'homology_len') + def isHomologousNodes(self, node1, node2, strict:bool): if self.matchGraph.has_edge(node1, node2) and ((not strict) or self.matchGraph.edges[node1, node2]['weight'] < 0): return True From e07422946a3ceb6f7badbc56df926022977c5a4b Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 15 Aug 2025 18:05:48 -0400 Subject: [PATCH 02/19] Cleanup --- src/scripts/cluster.py | 293 ++++++++++------------------------------- 1 file changed, 73 insertions(+), 220 deletions(-) diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index be5655ad..aa07c641 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -15,7 +15,7 @@ #likely do not need #MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless -KLIN_STARTS = 30 # number of different starts of kernighan lin +KLIN_STARTS = 100 # number of different starts of kernighan lin KLIN_ITER = 10000 # number of iterations inside kernighan lin MAX_SHORT_COMPONENT = 100 # we remove from consideration each connected compoents of edges < MIN_SHORT_LEN that is larger than MAX_SHORT_COMPONENT @@ -30,7 +30,7 @@ CLEAR_HOMOLOGY = 250000 #for clear bulge like structures this limmit is decreased MIN_ALIGNMENT = 50000 #smaller alingments will be filtered out MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa -FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets +FIXED_HOMOLOGY_WEIGHT = -1000000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only MIN_RDNA_COMPONENT = 500000 @@ -88,95 +88,6 @@ def collapseOrientedNode (edges, node): if ornode in edges: edges.pop(ornode) -#returns pair of nodes near PAR that should be connected via fake homology, currently placeholder -#TODO: this is likely subject for reconsidering when we'll move to other species - -def checkXYcomponent(current_component, matchgraph, G, edges): -#minimal comp size. XY is vulnerable to gaps, so not 150M - MIN_COMP_SIZE = 80000000 -#maximal homozygous, PAR region - MAX_HOM_LEN = 5000000 -#cutoff for additional fake homozygocity - MIN_NEAR_PAR = 3000000 -#TODO these constants are ABSOLUTELY human focused, and even if we'll save the function should be reconsidered. -#TODO: right way with new matchGraph... 
- total_l = 0 - total_hom_len = 0 - for node in current_component: - if node in G.nodes: - total_l += G.nodes[node]['length'] - if node in matchgraph: - total_hom_len += G.nodes[node]['length'] - - if total_hom_len == 0 or total_hom_len > MAX_HOM_LEN or total_l < MIN_COMP_SIZE: - return [0, 0] - fake_hom = [] - for f_node in current_component: - for s_node in current_component: - if f_node < s_node and G.nodes[f_node]['length'] > MIN_NEAR_PAR and G.nodes[s_node]['length'] > MIN_NEAR_PAR: - for fp in ['<', '>']: - for sp in ['<', '>']: - f_or = fp + f_node - s_or = sp + s_node - if not (f_or in edges) or not (s_or in edges): - continue - for nxt in edges[f_or]: - if nxt in edges[s_or]: - if len(fake_hom) == 0 or fake_hom[-1][0] != f_node or fake_hom[-1][1] != s_node: - fake_hom.append([f_node, s_node]) - if len(fake_hom) > 1: - logger.warning("MULTIPLE CANDIDATES FOR FAKE HOMOLOGY IN XY\n") - for pair in fake_hom: - logger.warning(f"{pair[0]} --- {pair[1]}\n") - return [0, 0] - elif len(fake_hom) == 1: - logger.info(f"Adding FAKE HOMOLOGY between {fake_hom[0][0]} --- {fake_hom[0][1]}") - return [fake_hom[0][0], fake_hom[0][1]] - return [0, 0] - -def fixUnbalanced(part, C, G): - auxs = [0, 0] - lens = [0, 0] - for ind in range (0, 2): - for node in part[ind]: - if node.startswith('Aux'): - auxs[ind] += 1 - elif node in G.nodes: - lens[ind] += G.nodes[node]['length'] - else: - print(f"Someting went wrong with node {node}") -# print (f"In fix unbalanced {lens[0]} {lens[1]}") - #just _a_ limitation on the swap length is required - maxswap = min(lens[0]/10, lens[1]/10) - curswap = 0 - edge_swapped =0 - for ind in range(0, 2): - #do not move edges _to_ the part where aux edge is present -otherwise it could be swapped with aux - if auxs[1 - ind] == 0: -# print(f'processing components {part[0]} {part[1]}') - changed = True - while changed: - changed = False - for node in part[ind]: - if node in G: - cost = 0 - for i in part[ind]: - if [i, node] in C.edges: - cost += C.edges[i, node]['weight'] - for j in part[1 - ind]: - if [j, node] in C.edges: - cost -= C.edges[j, node]['weight'] - if cost < 0 and curswap + G.nodes[node]['length'] < maxswap: - changed = True - curswap += G.nodes[node]['length'] - part[1 - ind].add(node) - part[ind].remove(node) - edge_swapped +=1 - print (f"SWAPPED {node}") - break - if curswap > 0: - print (f"Fixing uneven component, moved {curswap} bases and {edge_swapped} nodes") - def getMedianCov(nodeset): med_cov = -1 total_length = 0 @@ -199,8 +110,8 @@ def report_phased_node(node_id, hap, output_file): elif hap == 0: output_file.write(f'{node_id}\t100000\t0\t100000:0\t#FF8888\n') -def create_initial_partition_noKL(C, matchGraph, seed): - random.seed(seed) +def create_initial_partition_noKL(C, matchGraph): + random.seed(239) neg_graph = nx.Graph() for ec in matchGraph.edges(): if ec[0] in C and ec[1] in C: @@ -230,38 +141,75 @@ def create_initial_partition_noKL(C, matchGraph, seed): enemies[n].append(other) logger.debug(f"Friends of {n}: {friends[n]}") logger.debug(f"Enemies of {n}: {enemies[n]}") - parts = [set(), set()] - swap_together = {} - - #TODO: place for randomization - C_nodes = list(C.nodes()) + processed_nodes = set() + opposite = {} for n in C_nodes: #lets remove non-homologous nodes if not n in enemies and n.find("Aux") == -1: C.remove_node(n) logger.debug(f"Removing node {n}") else: - swap_together[n] = [n] start_component = random.randint(0, 1) - if not n in parts[0] and not n in parts[1]: - parts[start_component].add(n) - if n in friends: - for f in 
friends[n]: - parts[start_component].add(f) - for f in enemies[n]: - parts[1 - start_component].add(f) - - for n in friends: - for f in friends[n]: - swap_together[n].append(f) - for f in enemies[n]: - swap_together[n].append(f) - - return parts, swap_together - + if not n in processed_nodes: + friends[n].append(n) + if len (friends[n]) > 1: + f = glue_nodes(C, friends[n]) + else: + f = friends[n][0] + if len (enemies[n]) > 1: + e = glue_nodes(C, enemies[n]) + else: + e = enemies[n][0] + for n in friends[n]: + processed_nodes.add(n) + for n in enemies[n]: + processed_nodes.add(n) + parts[start_component].add(f) + parts[1 - start_component].add(e) + opposite[f] = e + opposite[e] = f + return parts, opposite + +def random_swap(parts, opposite, seed): + random.seed(seed) + used = set() + res = [set(),set()] + for ind in range (2): + for n in parts[ind]: + if not n in used: + swap = random.randint(0,1) + if swap: + n_ind = 1 - ind + else: + n_ind = ind + res[n_ind].add(n) + res[1 - n_ind].add(opposite[n]) + used.add(n) + used.add(opposite[n]) + return res +def glue_nodes(C, node_list): + new_node = '_'.join(node_list) + C.add_node(new_node) + for n in node_list: + for neighbor in C.neighbors(n): + if neighbor not in C.neighbors(new_node): + C.add_edge(new_node, neighbor, weight=C.edges[n, neighbor]['weight']) + else: + C[new_node][neighbor]['weight'] += C.edges[n, neighbor]['weight'] + C.remove_nodes_from(node_list) + return new_node + +def unglue_nodes(parts): + new_parts = [set(), set()] + for ind in range(2): + for n in parts[ind]: + original_nodes = n.split('_') + for orig in original_nodes: + new_parts[ind].add(orig) + return new_parts def optimize_partition(C, parts, swap_together): cur_score = score_partition(C, parts) @@ -292,59 +240,6 @@ def optimize_partition(C, parts, swap_together): return parts - - -#TODO: rewrite, leaving old version for better tests -def create_initial_partition(C, matchGraph): - p1 = [] - p2 = [] - for n in sorted(C.nodes()): - if n in matchGraph and len(matchGraph.edges(n)) > 0: - #TODO: replace with something reasonable! multiple nodes will not work here. Transform each homology component into one node before?.. Anyway, it still should be fixed later with K-L iterations. 
- #TODO: distance from starting vertex in match graph % 2 - for ec in matchGraph.edges(n): - if matchGraph.edges[ec]['weight'] >= 0: - continue - if ec[0] == n and ec[1] in p1: - if n not in p1: - p2.append(n) - elif ec[0] == n and ec[1] in p2: - if n not in p2: - p1.append(n) - elif ec[1] == n and ec[0] in p1: - if n not in p1: - p2.append(n) - elif ec[1] == n and ec[0] in p2: - if n not in p2: - p1.append(n) - elif n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - - if n not in p1 and n not in p2: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) - else: - if random.random() <= 0.5: - p1.append(n) - else: - p2.append(n) -# print("Initial partitions are %s and %s" % (set(p1), set(p2))) - if len(p1) * len(p2) > 0: -# logger.info (f"{len(p1)} {len(p2)} {len(C.nodes())}") - #Debug for weird crashes - inter = list(set(p1).intersection(p2)) - missing = set(C.nodes()) - set(p1) - set(p2) - if len(inter) > 0: - logger.info (f"Intersecting {inter}") - if len(missing) > 0: - logger.info (f"Missing {missing}") - return [set(p1), set(p2)] - def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists): C = nx.Graph() @@ -394,13 +289,6 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep C.remove_nodes_from(short) logger.info(f'Currently {C.number_of_nodes()} nodes') - # Adding some auxilary vertices to allow slightly unbalanced partitions - # sqrt/2 is quite arbitrary and subject to change - aux_nodes = int(math.sqrt(C.number_of_nodes() // 2)) - if C.number_of_nodes() <= 3: - aux_nodes = 0 - for i in range(0, aux_nodes): - C.add_node("Aux" + str(i)) for e in hicGraph.edges(current_component): # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) if e[0] in C and e[1] in C and matchGraph.get_edge_data(e[0], e[1]) == None: @@ -414,12 +302,6 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep similar_edges[ind].add(e[ind]) #TODO: likely this is not needed anymore since we already added links between homologous edges. 
C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) - #Possibly we'll need to restore tis for backcomp -# for e0like in similar_edges[0]: -# for e1like in similar_edges[1]: -# if e0like in dists and e1like in dists[e0like]: #and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']: -# C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) -# break logger.info(f'Currently {C.number_of_nodes()} in current component') logger.debug(C.nodes()) if C.number_of_nodes() > 1: @@ -427,15 +309,8 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep if u in C and v in C: if w != None: C.add_edge(u, v, weight=w) - res = checkXYcomponent(current_component, matchGraph, G, edges) - if res != [0, 0]: - sys.stderr.write(f"XY found, {res[0]} {res[1]}, adding fake links\n") - if res[0] in C and res[1] in C: - C.add_edge(res[0], res[1], weight=-10 * FIXED_WEIGHT) - C.add_edge(res[1], res[0], weight=-10 * FIXED_WEIGHT) for edge in C.edges: logger.debug(f'HIC edge: {edge} {C.edges[edge]}') - return C def score_partition(C, partition): @@ -547,12 +422,6 @@ def process_haploid_component(current_component, G, tsv_file, haploid_component_ for n in nodes_to_report: report_phased_node(n, haplotype, tsv_file) -def only_aux(C): - for n in C.nodes(): - if not n.startswith('Aux'): - return False - return True - def output_graph_stats(G): degrees = [val for (node, val) in G.degree()] mean = sum(degrees) / G.number_of_nodes() @@ -575,21 +444,10 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une old_colors[e] = cur_col cur_col += 1 -#TODO: no_rdna is false, so efficiently we are still removing rDNA component. Should we? - ''' - if not (no_rdna): - #Store rDNA component, not to add additional links from matchGraph - #sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n") - #Here we remove large connected components of short edge, just to exclude rDNA cluster - delete_set = gf.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT) - else: - sys.stderr.write(f"Not using rDNA component removal heuristics\n") - ''' filtered_graph = open(os.path.join(output_dir, FILTERED_GFA_FILENAME), 'w') tsv_output = os.path.join(output_dir, RESULT_FILENAME) tsv_file = open(tsv_output, 'w') tsv_file.write("node\tmat\tpat\tmat:pat\tcolor\n") - translate = open(graph_gfa, 'r') for line in translate: @@ -639,18 +497,14 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une # load hic connections based on mappings, weight corresponds to number of times we see a connection hicGraph = gf.loadHiCGraph(hic_byread) compressed_file = open(os.path.join(output_dir, HIC_COMPRESSED_FILENAME), 'w') - max_w = 0 + for node1, node2 in hicGraph.edges(): - if max_w < hicGraph[node1][node2]["weight"]: - max_w = hicGraph[node1][node2]["weight"] compressed_file.write(f'X {node1} {node2} {hicGraph[node1][node2]["weight"]}\n') - FIXED_WEIGHT = max_w - logger.info(f'Constant for homologous edges: {-10 * FIXED_WEIGHT}') #Adding link between matched edges to include separated sequence to main component #TODO: only one of those should be used - mg = match_graph.MatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) + mg = match_graph.MatchGraph(mashmap_sim, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) matchGraph = mg.getMatchGraph() component_colors = gf.getComponentColors(G) @@ -676,7 +530,8 @@ def run_clustering (graph_gfa, 
mashmap_sim, hic_byread, output_dir, no_rdna, une haploid_component_count = 0 dists = dict(nx.all_pairs_dijkstra_path_length(or_G, weight=lambda u, v, d: or_G.edges[u, v]['mid_length'])) for current_component in sorted(nx.connected_components(G), key=len, reverse=True): - logger.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) + if len(current_component) > 1: + logger.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) cleared = remove_pseudo_homology(current_component, or_G, dists, mg) #only backward compatibility with KL @@ -693,15 +548,12 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une #TODO: cycle for randomized starts #optimize_partition(C, parts, swap_together) - best_score = - FIXED_WEIGHT * C.number_of_nodes() * C.number_of_nodes() + best_score = FIXED_HOMOLOGY_WEIGHT * C.number_of_nodes() * C.number_of_nodes() best_part = [set(), set()] - + init_parts, opposite = create_initial_partition_noKL(C, matchGraph) for seed in range(0, KLIN_STARTS): # iterate on starting partition random.seed(seed) - parts, swap_together = create_initial_partition_noKL(C, matchGraph, seed) - if only_aux(C): - logger.info("Only aux nodes left in component, skipping") - continue + parts = random_swap(init_parts, opposite, seed) part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) logger.debug(f"init_part:\n{sorted(part[0])}\n{sorted(part[1])}") @@ -712,6 +564,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une best_part = part best_score = sum_w logger.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + best_part = unglue_nodes(best_part) #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) # best_part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) # logger.debug(f"parts:\n{sorted(parts[0])}\n{sorted(parts[1])}") @@ -739,7 +592,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une total_w += hicGraph[e[0]][e[1]]["weight"] if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: add_part[0].add(n) - logger.info(f"Edge {n} assigned color 0") + logger.debug(f"Edge {n} assigned color 0") elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: add_part[1].add(n) logger.debug(f"Edge {n} assigned color 1") @@ -761,7 +614,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une logger.debug(f"Edge {n} did not pass coverage check") logger.info(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition') - logger.info(f'RES\t{add_part}') + logger.info(f'Added short edges\t{add_part}') best_part[0].update(add_part[0]) best_part[1].update(add_part[1]) for ind in range (0, 2): From 1855544d2941bbe166ff4cfcacc90f7a4110ebf2 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 5 Sep 2025 09:53:34 -0400 Subject: [PATCH 03/19] update MBG version --- .gitmodules | 2 +- src/MBG | 2 +- src/Snakefiles/1-buildGraph.sm | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index 804bc752..4a9bcb81 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,7 @@ url = https://github.com/marbl/rukki [submodule "src/MBG"] path = src/MBG - url = https://github.com/maickrau/MBG + url = https://github.com/skoren/MBG [submodule "src/hifioverlapper"] path = 
src/hifioverlapper url = https://github.com/maickrau/hifioverlapper diff --git a/src/MBG b/src/MBG index f312a891..8fc6ca8d 160000 --- a/src/MBG +++ b/src/MBG @@ -1 +1 @@ -Subproject commit f312a8915eca046fcd80658633f3f72c74ed5ce4 +Subproject commit 8fc6ca8d141c131b44931e7c960426f8a8c48bad diff --git a/src/Snakefiles/1-buildGraph.sm b/src/Snakefiles/1-buildGraph.sm index 2538f002..8c875fc1 100644 --- a/src/Snakefiles/1-buildGraph.sm +++ b/src/Snakefiles/1-buildGraph.sm @@ -111,10 +111,13 @@ awk 'BEGIN \\\\ }} \\\\ \$1 == "S" \\\\ {{ \\\\ - if (\$6 != "") {{ - \$4 = \$6; - }} - print \$2, substr(\$4, 6), length(\$3); \\\\ + if (\$6 != "" && match(\$6, /^kl:f:/)) {{ \\\\ + \$4 = \$6; \\\\ + }} \\\\ + if (!match(\$4, /^kl:f:/) && !match(\$4, /^ll:f:/)) {{ \\\\ + print "ERROR: line "\$0" does not match expected format for coverages"; exit 1 \\\\ + }} \\\\ + print \$2, substr(\$4, 6), length(\$3); \\\\ }}' \\\\ < ../{output.graph} \\\\ > ../{output.hifi_node_cov} From d4b403a6bc0c9681841e4952008d9a0c29878f34 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 5 Sep 2025 09:56:08 -0400 Subject: [PATCH 04/19] update canu branch and consensus options for consensus updates update combine and consensus times due to bam generation enable bam output by default and allow consensus options properly process and drop contigs in layout when they have only noisy long reads no accurate read support when merging layouts output layout read use counts used in bam MAPQ and consensus --- src/Snakefiles/6-layoutContigs.sm | 15 ++++-- src/Snakefiles/7-combineConsensus.sm | 15 +++--- src/Snakefiles/7-generateConsensus.sm | 10 ++-- src/Snakefiles/functions.sm | 9 ++-- src/canu | 2 +- src/scripts/bam_rename.py | 2 +- src/scripts/check_layout_gaps.py | 2 +- src/scripts/fasta_combine.py | 2 +- src/scripts/fasta_util.py | 5 +- src/scripts/get_layout_from_mbg.py | 10 ++-- src/scripts/merge_layouts.py | 77 ++++++++++++++++++++++----- src/verkko.sh | 31 ++++++----- 12 files changed, 125 insertions(+), 55 deletions(-) diff --git a/src/Snakefiles/6-layoutContigs.sm b/src/Snakefiles/6-layoutContigs.sm index 5d7aa8af..d7d0e993 100644 --- a/src/Snakefiles/6-layoutContigs.sm +++ b/src/Snakefiles/6-layoutContigs.sm @@ -53,6 +53,7 @@ rule generateLayoutContigsInputs: edges = '6-layoutContigs/combined-edges.gfa', hifialns = '6-layoutContigs/hifi.alignments.gaf', ontalns = '6-layoutContigs/ont.alignments.gaf', + ontgapaln = '6-layoutContigs/ont-gapfill.txt', paths = '6-layoutContigs/consensus_paths.txt', lengths = '6-layoutContigs/nodelens.txt' log: @@ -91,8 +92,8 @@ cat {params.aligns} ../{input.paths} | sed 's/^/hifi_/' \\\\ cat ../{input.ontgaps} | sed 's/^/ont_/' >> ../{output.hifialns} # now get the ONT alignments, again prepending ont_ but ignoring any reads we've already included due to it being used as a gap fill -cat ../{input.ontgaps} |cut -f 1 > ../6-layoutContigs/ont-gapfill.txt -{PYTHON} {VERKKO}/scripts/replace_path_nodes.py ../{input.ontalns} ../{output.nodemap} |grep -F -v -w -f ../6-layoutContigs/ont-gapfill.txt | sed 's/^/ont_/' > ../{output.ontalns} || true +cat ../{input.ontgaps} |cut -f 1 > ../{output.ontgapaln} +{PYTHON} {VERKKO}/scripts/replace_path_nodes.py ../{input.ontalns} ../{output.nodemap} |grep -F -v -w -f ../{output.ontgapaln} | sed 's/^/ont_/' > ../{output.ontalns} || true cat ../{input.graph} \\\\ | awk 'BEGIN \\\\ @@ -144,6 +145,7 @@ rule layoutContigs: output: layout = rules.verkko.input.layout, scfmap = rules.verkko.input.scfmap, + dropped = 
'6-layoutContigs/unitig-popped.layout.scfmap.dropped', gaps = '6-layoutContigs/gaps.txt' params: haveONT = config['withONT'], @@ -180,15 +182,18 @@ fi ../{input.paths} \\\\ ../{input.lengths} \\\\ ../6-layoutContigs/unitig_initial.layout \\\\ - ../{output.scfmap} + ../6-layoutContigs/unitig_initial.layout.scfmap if [ "{params.haveONT}" = "True" ] && [ "{params.haveBAM}" = "True" ]; then + cp ../6-layoutContigs/unitig_initial.layout.scfmap.dropped ../{output.dropped} {PYTHON} {VERKKO}/scripts/merge_layouts.py \\\\ ../6-layoutContigs/unitig_initial.layout \\\\ ../{input.ontalns} \\\\ - > ../{output.layout} + ../{output.layout} else - cp ../6-layoutContigs/unitig_initial.layout ../{output.layout} + cp ../6-layoutContigs/unitig_initial.layout.scfmap.dropped ../{output.dropped} + cp ../6-layoutContigs/unitig_initial.layout.scfmap ../{output.scfmap} + cp ../6-layoutContigs/unitig_initial.layout ../{output.layout} fi {PYTHON} {VERKKO}/scripts/check_layout_gaps.py < ../{output.layout} > ../{output.gaps} diff --git a/src/Snakefiles/7-combineConsensus.sm b/src/Snakefiles/7-combineConsensus.sm index 1c2627e8..1e745cab 100644 --- a/src/Snakefiles/7-combineConsensus.sm +++ b/src/Snakefiles/7-combineConsensus.sm @@ -28,7 +28,7 @@ def combineConsensusI(wildcards): def combineConsensusP(wildcards): return expand( "packages/part{nnnn}.fasta", nnnn = glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) def combineConsensusB(wildcards): - return expand( "packages/part{nnnn}.bam*.bam", nnnn = glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) + return expand( "packages/part{nnnn}.bam*.bam", nnnn = glob_wildcards("7-consensus/packages/part{xxxx}.cnspack").xxxx) rule combineConsensus: input: @@ -36,7 +36,7 @@ rule combineConsensus: tigmap = rules.buildPackages.output.tigmap, scfmap = rules.layoutContigs.output.scfmap, layout = rules.layoutContigs.output.layout, - ontalns = rules.generateLayoutContigsInputs.output.ontalns, + ontalns = rules.generateLayoutContigsInputs.output.ontgapaln if config['withONT'] == "True" else {rules.emptyfile.output}, hificov = rules.verkko.input.hifi5cov, finished = rules.buildPackages.output.finished, pathstsv = rules.rukki.output.pathstsv if config['ruk_enable'] == "True" else rules.emptyfile.output, @@ -65,8 +65,8 @@ rule combineConsensus: resources: job_id = 1, n_cpus = 8, - mem_gb = lambda wildcards, input, attempt: getAlignMemoryRequest(attempt, 2, input.packcns), - time_h = 4 + mem_gb = lambda wildcards, input, attempt: getAlignMemoryRequest(attempt, 5, input.packcns), + time_h = 48 shell: ''' cd 7-consensus @@ -82,8 +82,8 @@ if [ -e ../../label2 ] ; then label2=`cat ../../label2` ; fi cat > ./combineConsensus.sh < 0: mem = factor * sum / 1024 / 1024 / 1024 @@ -141,7 +142,7 @@ def getConsensusMemoryRequest(attempt, jobid): # Figure out what partitionin for l in f: words = l.strip().split(); if jobid == words[5]: - mem = max(mem, 5.0 + 0.80 * float(words[4])) + mem = max(mem, 5.0 + 0.90 * float(words[4])) return int(math.ceil(math.ceil(mem)*scl)) diff --git a/src/canu b/src/canu index 3c2099fb..42d86c03 160000 --- a/src/canu +++ b/src/canu @@ -1 +1 @@ -Subproject commit 3c2099fbcb5c9b4fb4b4bc6267139e1821d2bc16 +Subproject commit 42d86c03d668984cbec79926651e8253a930482a diff --git a/src/scripts/bam_rename.py b/src/scripts/bam_rename.py index 133a9d99..244ae8bb 100755 --- a/src/scripts/bam_rename.py +++ b/src/scripts/bam_rename.py @@ -11,7 +11,7 @@ bamfile = pysam.AlignmentFile("-", "rb") layout = sys.argv[1] gap_info = sys.argv[2] -scfmap = 
seq.readScfMap(sys.argv[3]) +scfmap,_ = seq.readScfMap(sys.argv[3]) namedict = seq.readNameMap(sys.argv[4]) lens = dict() offsets = dict() diff --git a/src/scripts/check_layout_gaps.py b/src/scripts/check_layout_gaps.py index 193062d4..679d4d2a 100755 --- a/src/scripts/check_layout_gaps.py +++ b/src/scripts/check_layout_gaps.py @@ -16,7 +16,7 @@ def check_alns(current_tig, current_len, current_alns): if current_start > last_end: print(current_tig + "\t" + str(last_end) + "\t" + str(current_start) + " (len " + str(current_len) + ")") if current_start < 0 or current_end > current_len: - print("layout outside contig: " + current_tig + "\t" + current_start + "\t" + current_end + "\t" + " (len " + str(current_len) + ")") + print("layout outside contig: " + current_tig + "\t" + str(current_start) + "\t" + str(current_end) + "\t" + " (len " + str(current_len) + ")") last_end = max(last_end, current_end) if last_end < current_len: print(current_tig + "\t" + str(last_end) + "\t" + str(current_len) + " (len " + str(current_len) + ")") diff --git a/src/scripts/fasta_combine.py b/src/scripts/fasta_combine.py index 8dde500f..a53cdd20 100755 --- a/src/scripts/fasta_combine.py +++ b/src/scripts/fasta_combine.py @@ -23,7 +23,7 @@ mode = sys.argv[1] output_name = sys.argv[2] namedict = seq.readNameMap(sys.argv[3]) - scfmap = seq.readScfMap(sys.argv[4]) + scfmap,_ = seq.readScfMap(sys.argv[4]) filenames = sys.argv[5:] else: diff --git a/src/scripts/fasta_util.py b/src/scripts/fasta_util.py index 757e3cb2..41654f12 100755 --- a/src/scripts/fasta_util.py +++ b/src/scripts/fasta_util.py @@ -144,6 +144,7 @@ def replaceName(name, namedict, mode): # def readScfMap(filename): scfmap = dict() + names = dict() ctgname = "" ctglist = [] @@ -153,11 +154,13 @@ def readScfMap(filename): if words[0] == "path": ctgname = words[1] + path = words[2] ctglist = [] elif words[0] == "end": scfmap[ctgname] = ctglist + names[ctgname] = path else: ctglist.append(words[0]) - return(scfmap) + return(scfmap, names) diff --git a/src/scripts/get_layout_from_mbg.py b/src/scripts/get_layout_from_mbg.py index ad5be91c..ecb5d78d 100755 --- a/src/scripts/get_layout_from_mbg.py +++ b/src/scripts/get_layout_from_mbg.py @@ -5,6 +5,8 @@ import re import graph_functions as gf +random.seed(42) + mapping_file = sys.argv[1] edge_overlap_file = sys.argv[2] @@ -650,7 +652,7 @@ def get_exact_match_length(clusters): # we look at randomly placed reads and pick a single location for each one # if we're first we'll select a location, single value between 1 and number of times it's used if line[0] not in read_output_counts: - read_output_counts[line[0]] = ("", "", 0) + read_output_counts[line[0]] = ("", "", 0, read_actual_counts[line[0]]) #sys.stderr.write("For reach %s which occurs %d times in contigs %s we are seing it for the first time in %s and selecting"%(line[0], read_actual_counts[line[0]], str(read_actual_contigs[line[0]]), contig)) if len(read_actual_contigs[line[0]]) <= 1: #sys.stderr.write(" read is used %d times in %s contigs so placing everywhere "%(read_actual_counts[line[0]], str(read_actual_contigs[line[0]]))) @@ -659,12 +661,12 @@ def get_exact_match_length(clusters): read_actual_counts[line[0]] = random.randint(1, read_actual_counts[line[0]]) #sys.stderr.write(" position %d\n"%(read_actual_counts[line[0]])) #check if we are we right instance - read_output_counts[line[0]] = (read_output_counts[line[0]][0], read_output_counts[line[0]][1], read_output_counts[line[0]][2] + 1) + read_output_counts[line[0]] = (read_output_counts[line[0]][0], 
read_output_counts[line[0]][1], read_output_counts[line[0]][2] + 1, read_output_counts[line[0]][3]) #sys.stderr.write("The count for read %s is now %s\n"%(line[0], str(read_output_counts[line[0]]))) if read_actual_counts[line[0]] > 0 and read_output_counts[line[0]][2] != read_actual_counts[line[0]]: continue # record the contig and coordinate we will use - read_output_counts[line[0]] = (contig, line[1], read_actual_counts[line[0]]) + read_output_counts[line[0]] = (contig, line[1], read_actual_counts[line[0]], read_output_counts[line[0]][3]) start_pos = min(start_pos, line[1]) start_pos = min(start_pos, line[2]) end_pos = max(end_pos, line[1]) @@ -685,7 +687,7 @@ def get_exact_match_length(clusters): end = line[2] - start_pos #sys.stderr.write("For read %s which has been selected to be in position %s we are expecting %d\n"%(line[0], str(read_output_counts[line[0]]), read_actual_counts[line[0]])) if read_actual_counts[line[0]] < 0 or (read_output_counts[line[0]][0] == contig and read_output_counts[line[0]][1] == line[1]): - print(f"{line[0]}\t{bgn}\t{end}", file=tig_layout_file) + print(f"{line[0]}\t{bgn}\t{end}\t{read_output_counts[line[0]][3]}", file=tig_layout_file) print(f"end", file=tig_layout_file) tig_layout_file.close() diff --git a/src/scripts/merge_layouts.py b/src/scripts/merge_layouts.py index b42e69a5..d12b695f 100755 --- a/src/scripts/merge_layouts.py +++ b/src/scripts/merge_layouts.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import sys +import fasta_util as seq +import re def read_list(file_path): read_list = set() @@ -31,8 +33,8 @@ def read_layout_file(file_path, flag): elif line == "end": current_tig = None else: - read, start, end = line.split("\t") - layout_data[-1]["reads"].append({"read": read, "start": int(start), "end": int(end), "flag": flag}) + read, start, end, copy = line.split("\t") + layout_data[-1]["reads"].append({"read": read, "start": int(start), "end": int(end), "copy" : int(copy), "flag": flag}) return layout_data @@ -42,6 +44,7 @@ def merge_layout_files(file1, flag1, file2, flag2): layout2 = read_list(file2) merged_layout = [] + dropped = set() # Merge layout data for tigs present in both files for layout1_entry in layout1: @@ -54,24 +57,70 @@ def merge_layout_files(file1, flag1, file2, flag2): else: keepTig = True - if keepTig == True: merged_layout.append(layout1_entry) + if keepTig == True: + merged_layout.append(layout1_entry) + else: + dropped.add(layout1_entry["tig"]) - return merged_layout + return merged_layout, dropped -file1_path = sys.argv[1] -file2_path = sys.argv[2] +file1_path = sys.argv[1] +file2_path = sys.argv[2] +output_prefix = sys.argv[3] +scfmap, scfname = seq.readScfMap(f"{sys.argv[1]}.scfmap") +tig_layout_file = open(f"{output_prefix}", mode="w") +scf_layout_file = open(f"{output_prefix}.scfmap", mode="w") +nul_layout_file = open(f"{output_prefix}.scfmap.dropped", mode="a") -merged_layout = merge_layout_files(file1_path, 0, file2_path, 1) +merged_layout, dropped = merge_layout_files(file1_path, 0, file2_path, 1) # Print the merged layout for entry in merged_layout: - print("tig", entry["tig"], sep='\t') - print("len", entry["len"], sep='\t') - print("trm", entry["trm"], sep='\t') - print("rds", entry["num_reads"], sep='\t') + print("tig", entry["tig"], sep='\t', file=tig_layout_file) + print("len", entry["len"], sep='\t', file=tig_layout_file) + print("trm", entry["trm"], sep='\t', file=tig_layout_file) + print("rds", entry["num_reads"], sep='\t', file=tig_layout_file) for read_entry in entry["reads"]: - print(read_entry["read"], 
read_entry["start"], read_entry["end"], read_entry['flag'], sep='\t') - - print("end") + print(read_entry["read"], read_entry["start"], read_entry["end"], read_entry['copy'], read_entry['flag'], sep='\t', file=tig_layout_file) + + print("end", file=tig_layout_file) + +# print the merged scfmap +for clist in scfmap: + keep = False + for piece in scfmap[clist]: + numn = re.match(r"\[N(\d+)N]", piece) + if numn or piece not in dropped: + keep= True + + # now that we've checked that some pieces are saved, output it + if keep == False: + print(f"{clist} has no reads assigned and is not output.", file=nul_layout_file) + continue + + previous = "NONE" + new_pieces = [] + print(f"path {clist} {scfname[clist]}", file=scf_layout_file) + for piece in scfmap[clist]: + is_gap = re.match(r"\[N\d+N\]", piece) + if is_gap: + if previous == "NONE": + continue + if previous == "piece": + new_pieces.append(f"{piece}") + previous = "gap" + else: + #merging consecutive gaps + last = new_pieces.pop() + previous = "gap" + last_int = int(last[2:-2]) + cur_int = int(piece[2:-2]) + new_pieces.append("[N%dN]"%(last_int + cur_int)) + else: + if piece in dropped: continue + new_pieces.append(f"{piece}") + previous = "piece" + print("\n".join(new_pieces), file=scf_layout_file) + print("end", file=scf_layout_file) diff --git a/src/verkko.sh b/src/verkko.sh index 77d26676..6b48eaf4 100755 --- a/src/verkko.sh +++ b/src/verkko.sh @@ -209,7 +209,7 @@ hic2="" withont="False" withhic="False" withporec="False" -withbam="False" +withbam="True" withref="False" keepinter="True" @@ -300,7 +300,7 @@ mbg_unitig_abundance=2 # split_ont, partitioning ont reads for alignment spl_bases=3000000000 spl_reads=150000 -spl_min_length=0 +spl_min_length=$cor_min_read # align_ont, alignment of ont reads to the initial graph ali_mxm_length=30 @@ -330,6 +330,10 @@ ruk_hap2="" ruk_type="" ruk_fract="0.9" +#consensus +cns_num_iter="2" +cns_max_cov="50" + # HiC heuristics haplo_divergence=0.05 rdna_scaff="True" @@ -414,7 +418,7 @@ ruk_time_h=4 # create_layout lay_n_cpus=1 -lay_mem_gb=32 +lay_mem_gb=64 lay_time_h=24 # get_ont_subset @@ -423,14 +427,14 @@ sub_mem_gb=16 sub_time_h=24 # partition_consensus -par_n_cpus=8 -par_mem_gb=8 -par_time_h=24 +par_n_cpus=1 +par_mem_gb=16 +par_time_h=96 # cns cns_n_cpus=8 cns_mem_gb=0 -cns_time_h=24 +cns_time_h=48 # align_hic stuff ahc_n_cpus=24 @@ -596,12 +600,14 @@ while [ $# -gt 0 ] ; do elif [ "$opt" = "--correct-k-mer-size" ] ; then mer_size=$arg; shift elif [ "$opt" = "--correct-k-mer-window" ] ; then mer_window=$arg; shift - elif [ "$opt" = "--correct-k-mer-coverage" ] ; then mer_coverage=$arg; shift - elif [ "$opt" = "--correct-min-read-length" ] ; then cor_min_read=$arg; shift + elif [ "$opt" = "--correct-k-mer-coverage" ] ; then mer_coverage=$arg; shift + elif [ "$opt" = "--correct-min-read-length" ] ; then cor_min_read=$arg; $spl_min_length=$arg; shift elif [ "$opt" = "--correct-min-overlap-length" ] ; then cor_min_overlap=$arg; shift elif [ "$opt" = "--correct-index-batches" ] ; then cor_index_batches=$arg; shift elif [ "$opt" = "--correct-overlap-batches" ] ; then cor_overlap_batches=$arg; shift + elif [ "$opt" = "--consensus-num-iterations" ] ; then cns_num_iter=$arg; shift + elif [ "$opt" = "--consensus-max-coverage" ] ; then cns_max_cov=$arg; shift # # MBG options @@ -621,7 +627,6 @@ while [ $# -gt 0 ] ; do elif [ "$opt" = "--split-bases" ] ; then spl_bases=$arg; shift elif [ "$opt" = "--split-reads" ] ; then spl_reads=$arg; shift - elif [ "$opt" = "--min-ont-length" ] ; then spl_min_length=$arg; 
shift # # alignONT options @@ -652,7 +657,7 @@ while [ $# -gt 0 ] ; do # Post-processing options # - elif [ "$opt" = "--consensus-bam" ] ; then withbam="True"; lay_mem_gb=`expr $lay_mem_gb \* 2`; + elif [ "$opt" = "--no-consensus-bam" ] ; then withbam="False"; lay_mem_gb=`expr $lay_mem_gb \/ 2`; elif [ "$opt" = "--discard-short" ] ; then short_contig_length=$arg; shift elif [ "$opt" = "--screen-human-contaminants" ]; then screen="$screen ebv human-ebv-AJ507799.2.fasta.gz" @@ -1108,7 +1113,6 @@ if [ "x$help" = "xhelp" -o "x$errors" != "x" ] ; then echo "ONT splitting:" echo " --split-bases" echo " --split-reads" - echo " --min-ont-length" echo " " echo " " echo "GraphAligner:" @@ -1247,6 +1251,9 @@ echo >> ${outd}/verkko.yml "ruk_hap2: '${ruk_hap2}'" echo >> ${outd}/verkko.yml "ruk_type: '${ruk_type}'" echo >> ${outd}/verkko.yml "ruk_fract: '${ruk_fract}'" echo >> ${outd}/verkko.yml "" +echo >> ${outd}/verkko.yml "# consensus" +echo >> ${outd}/verkko.yml "cns_num_iter: '${cns_num_iter}'" +echo >> ${outd}/verkko.yml "cns_max_cov: '${cns_max_cov}'" echo >> ${outd}/verkko.yml "# HiC algo options" echo >> ${outd}/verkko.yml "no_rdna_tangle: '${no_rdna_tangle}'" echo >> ${outd}/verkko.yml "uneven_depth: '${uneven_depth}'" From 958f22e80b1edb20156e22605fcddac121f2bbf0 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 5 Sep 2025 09:59:01 -0400 Subject: [PATCH 05/19] move index only on successfull completion, safety in case jobs terminate early but not reported as failed by grid engine --- src/Snakefiles/c2-findOverlaps.sm | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Snakefiles/c2-findOverlaps.sm b/src/Snakefiles/c2-findOverlaps.sm index 1f51e832..01fbb002 100644 --- a/src/Snakefiles/c2-findOverlaps.sm +++ b/src/Snakefiles/c2-findOverlaps.sm @@ -61,8 +61,12 @@ done -n 1 -w {params.merwindow} \\\\ --max-coverage {params.mercount} \\\\ --tmp-file-count {params.nindex} \\\\ - -o matchchains \\\\ - {input.reads} + -o matchchains.WORKING \\\\ + {input.reads} \\\\ + && \\\\ + mv matchchains.WORKING.metadata matchchains.metadata \\\\ + && \\\\ + mv matchchains.WORKING.positions matchchains.positions if [ ! -s matchchains.positions ]; then echo "Error: indexing failed, check above for any errors" From 5fe146a4068c0410b803153c60f9e1ab9ba0af5c Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 5 Sep 2025 10:00:27 -0400 Subject: [PATCH 06/19] update gap filling criteria, disallow single read gap closing --- src/Snakefiles/2-processGraph.sm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Snakefiles/2-processGraph.sm b/src/Snakefiles/2-processGraph.sm index 0ca6f2a7..88e011d8 100644 --- a/src/Snakefiles/2-processGraph.sm +++ b/src/Snakefiles/2-processGraph.sm @@ -47,17 +47,17 @@ cat > ./processGraph.sh < gapped-once-hifi-resolved.gfa echo "" echo Gap insertion pass 2. -{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-once-hifi-resolved.gfa ../{input.paths} 2 300 paths-nogap-2.gaf gaps-hifi-2.gaf gaptwo n \\\\ +{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-once-hifi-resolved.gfa ../{input.paths} 2 50 paths-nogap-2.gaf gaps-hifi-2.gaf gaptwo n \\\\ > gapped-twice-hifi-resolved.gfa echo "" echo Gap insertion pass 3. 
-{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-hifi-resolved.gfa ../{input.paths} 1 5 paths-nogap-3.gaf gaps-hifi-3.gaf gapthree n \\\\ +{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-hifi-resolved.gfa ../{input.paths} 2 300 paths-nogap-3.gaf gaps-hifi-3.gaf gapthree n \\\\ > gapped-hifi-resolved.gfa echo "" From fb5bd61111733af566040ae7c54b909b32fd3995 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 5 Sep 2025 10:09:17 -0400 Subject: [PATCH 07/19] add bamtools, pysam, samtools, seqtk to dependencies and fix CI test --- .github/workflows/makefile.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/makefile.yml b/.github/workflows/makefile.yml index 056d4402..1cec4155 100644 --- a/.github/workflows/makefile.yml +++ b/.github/workflows/makefile.yml @@ -44,9 +44,9 @@ jobs: - name: Install dependencies run: | sudo apt-get update - mamba install -c conda-forge -c bioconda 'python=3.11' 'pulp<=2.7.0' 'snakemake-minimal>7,<8' GraphAligner mashmap winnowmap - sudo apt-get install gcc-11 g++-11 libcurl4-openssl-dev liblzma-dev libbz2-dev - pip install networkx + mamba install -c conda-forge -c bioconda 'python=3.11' 'pulp<=2.7.0' 'snakemake-minimal>7,<8' GraphAligner mashmap winnowmap samtools seqtk + sudo apt-get install gcc-11 g++-11 libcurl4-openssl-dev liblzma-dev libbz2-dev bamtools libbamtools-dev + pip install networkx pysam sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 @@ -74,6 +74,10 @@ jobs: echo >> x.py "print(G)" python x.py + echo "" + echo "BAMTOOLS TEST" + echo " "`command -v bamtools` + - name: Compile run: | cd src @@ -120,10 +124,12 @@ jobs: echo > lib/verkko/bin/utgcns '#!/bin/sh' echo >> lib/verkko/bin/utgcns 'if [ ! -e part001.fasta ]; then' echo >> lib/verkko/bin/utgcns 'cp ../../cache/part001.fasta packages/part001.fasta.WORKING' + echo >> lib/verkko/bin/utgcns 'samtools dict ../../cache/part001.fasta | samtools view -b -S > packages/part001.bam00000002.bam' echo >> lib/verkko/bin/utgcns 'fi' echo >> lib/verkko/bin/utgcns '' echo >> lib/verkko/bin/utgcns 'if [ ! -e part002.fasta ]; then' echo >> lib/verkko/bin/utgcns 'cp ../../cache/part002.fasta packages/part002.fasta.WORKING' + echo >> lib/verkko/bin/utgcns 'samtools dict ../../cache/part002.fasta | samtools view -b -S > packages/part002.bam00000001.bam' echo >> lib/verkko/bin/utgcns 'fi' chmod 755 mbg.sh From a795867f78832f07d7ed6586add0deae5a0a9746 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Wed, 10 Sep 2025 15:56:51 -0400 Subject: [PATCH 08/19] add utility script to convert from utig4- paths to utig1- paths --- src/scripts/get_utig1_from_utig4.py | 194 ++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100755 src/scripts/get_utig1_from_utig4.py diff --git a/src/scripts/get_utig1_from_utig4.py b/src/scripts/get_utig1_from_utig4.py new file mode 100755 index 00000000..073c6859 --- /dev/null +++ b/src/scripts/get_utig1_from_utig4.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python + +import random +import sys +import re +import graph_functions as gf + +MAX_GAP_SIZE=100000 +mapping_file = sys.argv[1] +edge_overlap_file = sys.argv[2] + +#Either paths from rukki or just single nodes +paths_file = sys.argv[3] + +nodelens_file = sys.argv[4] + +#transform paths to base elements - mbg nodes and gaps. 
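+#Illustrative sketch of the path strings this script expects (hypothetical node names, using the same
+#regexes as the main loop below):
+#  re.findall(r"([<>][^[]+|\[N\d+N(?:[^\]]+){0,1}\])", ">utig4-1<utig4-2[N5000N:gap]>utig4-3")
+#    -> ['>utig4-1<utig4-2', '[N5000N:gap]', '>utig4-3']   # oriented node runs and gap tokens
+#  re.findall(r"[<>][^<>]+", ">utig4-1<utig4-2")
+#    -> ['>utig4-1', '<utig4-2']                            # the oriented nodes handed to get_leafs
+#  For a gap token, int(re.match(r"\[(N\d+N)(?:[^\]]+){0,1}\]", "[N5000N:gap]").group(1)[1:-1]) -> 5000,
+#  which is later scaled by 1.5 and capped at MAX_GAP_SIZE.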
+def get_leafs(path, mapping, edge_overlaps, raw_node_lens): + path_len = 0 + for i in range(0, len(path)): + path_len += raw_node_lens[path[i][1:]] + if i > 0: path_len -= edge_overlaps[gf.canon(path[i-1], path[i])] + result = [(n, 0, raw_node_lens[n[1:]]) for n in path] + overlaps = [] + for i in range(1, len(path)): + overlaps.append(edge_overlaps[gf.canon(path[i-1], path[i])]) + current_len = 0 + for i in range(0, len(result)): + assert result[i][2] > result[i][1] + assert result[i][2] <= raw_node_lens[result[i][0][1:]] + assert result[i][1] >= 0 + current_len += result[i][2] - result[i][1] + if i > 0: current_len -= overlaps[i-1] + assert current_len == path_len + while True: + any_replaced = False + new_result = [] + new_overlaps = [] + for i in range(0, len(result)): + if result[i][0][1:] not in mapping: + new_result.append(result[i]) + if i > 0: new_overlaps.append(overlaps[i-1]) + else: + any_replaced = True + part = [(n, 0, raw_node_lens[n[1:]]) for n in mapping[result[i][0][1:]][0]] + part[0] = (part[0][0], part[0][1] + mapping[result[i][0][1:]][1], part[0][2]) + part[-1] = (part[-1][0], part[-1][1], part[-1][2] - mapping[result[i][0][1:]][2]) + if result[i][0][0] == "<": + part = [(gf.revnode(n[0]), raw_node_lens[n[0][1:]] - n[2], raw_node_lens[n[0][1:]] - n[1]) for n in part[::-1]] + old_start_clip = result[i][1] + old_end_clip = (raw_node_lens[result[i][0][1:]] - result[i][2]) + part[0] = (part[0][0], part[0][1] + old_start_clip, part[0][2]) + part[-1] = (part[-1][0], part[-1][1], part[-1][2] - old_end_clip) + new_result += part + if i > 0: new_overlaps.append(overlaps[i-1]) + for j in range(1, len(part)): + new_overlaps.append(edge_overlaps[gf.canon(part[j-1][0], part[j][0])]) + assert len(new_result) == len(new_overlaps)+1 + assert len(new_result) >= len(result) + if not any_replaced: break + result = new_result + overlaps = new_overlaps + current_len = 0 + for i in range(0, len(result)): + # strangely, this assertion is not always true. 
+ # The ONT based k-mer increase can create a node where the overlap is greater than the initial MBG node size + # and in that case the initial MBG node will have a "negative" length within the contig + # assert result[i][2] > result[i][1] + assert result[i][2] <= raw_node_lens[result[i][0][1:]] + assert result[i][1] >= 0 + current_len += result[i][2] - result[i][1] + if i > 0: current_len -= overlaps[i-1] + assert current_len == path_len + return (result, overlaps) + +raw_node_lens = {} +with open(nodelens_file) as f: + for l in f: + parts = l.strip().split('\t') + assert parts[0] not in raw_node_lens or raw_node_lens[parts[0]] == int(parts[1]) + raw_node_lens[parts[0]] = int(parts[1]) + +edge_overlaps = {} +with open(edge_overlap_file) as f: + for l in f: + parts = l.strip().split('\t') + assert parts[0] == "L" + fromnode = (">" if parts[2] == "+" else "<") + parts[1] + tonode = (">" if parts[4] == "+" else "<") + parts[3] + overlap = int(parts[5][:-1]) + key = gf.canon(fromnode, tonode) + if key in edge_overlaps: assert edge_overlaps[key] == overlap + edge_overlaps[key] = overlap + +node_mapping = {} +cut_mapping = {} +with open(mapping_file) as f: + for l in f: + + parts = l.strip().split('\t') + assert parts[0] not in node_mapping + if not re.search(r"utig\d+[a-z]?-" , parts[1]): + continue + path = parts[1].split(':')[0].replace('<', "\t<").replace('>', "\t>").strip().split('\t') + left_clip = int(parts[1].split(':')[1]) + right_clip = int(parts[1].split(':')[2]) + node_mapping[parts[0]] = (path, left_clip, right_clip) + left_len = raw_node_lens[parts[0]] + right_len = 0 + for i in range(0, len(path)): + right_len += raw_node_lens[path[i][1:]] + if i > 0: right_len -= edge_overlaps[gf.canon(path[i-1], path[i])] + assert left_len == right_len - left_clip - right_clip + + # save the mapping of cut nodes to their respective coordinates so we can find them later + if (len(path) == 1 and path[0][1:] in raw_node_lens): + new_name = path[0][1:] + ":" + str(left_clip) + ":" + str(raw_node_lens[path[0][1:]]-right_clip) + cut_mapping[new_name] = parts[0] +pieceid = 0 + +#all these contains info about contigs - here nodes or rukki paths splitted by N +#paths are transformed into mbg nodes and gaps with get_leafs procedure +contig_lens = {} +contig_nodeseqs = {} +contig_nodeoverlaps = {} +contig_node_offsets = {} +contig_pieces = {} +with open(paths_file) as f: + for l in f: + lp = l.strip().split('\t') + + # Find all words that + # begin with [<>], contain anything but [ + # begin with [N, contain digits and end with N] or N:optional-description] + # we dump the description here and anly keep the N, digits N] part + # + fullname = lp[0] + pathfull = re.findall(r"([<>][^[]+|\[N\d+N(?:[^\]]+){0,1}\])", lp[1]) + + contig_pieces[fullname] = [] + + for pp in pathfull: + #pp is either path without gaps or gap. 
In latest case do nothing + gp = re.match(r"\[(N\d+N)(?:[^\]]+){0,1}\]", pp) + if gp: + tuned_numn = min(round(int(gp.group(1)[1:-1]) * 1.5), MAX_GAP_SIZE) + contig_pieces[fullname].append("[N" + str(tuned_numn) + "N:gap]") + continue + + pieceid = pieceid + 1 + pathname = fullname + + (path, overlaps) = get_leafs(re.findall(r"[<>][^<>]+", pp), node_mapping, edge_overlaps, raw_node_lens) + # skip a path if the only thing in it is a gapfill + if len(path) == 1 and path[0][0][1:4] == "gap": + continue + + contig_nodeseqs[pathname] = path + contig_nodeoverlaps[pathname] = overlaps + contig_node_offsets[pathname] = [] + pos = 0 + for i in range(0, len(path)-1): + contig_node_offsets[pathname].append(pos) + pos += path[i][2] - path[i][1] + pos -= overlaps[i] + contig_node_offsets[pathname].append(pos) + contig_lens[pathname] = contig_node_offsets[pathname][-1] + path[-1][2] - path[-1][1] + check_len = 0 + for i in range(0, len(path)): + check_len += path[i][2] - path[i][1] + if i > 0: check_len -= overlaps[i-1] + assert contig_lens[pathname] == check_len + pathstr = "" + for i in range(0, len(path)): + pathstr += path[i][0] # + ":" + str(path[i][1]) + ":" + str(path[i][2]) + "(" + str(contig_node_offsets[pathname][i]) + ")" + if path[i][1] != 0 or path[i][2] != raw_node_lens[path[i][0][1:]]: + new_name = "" + if path[i][0][0] == ">": + new_name = path[i][0][1:] + ":" + str(path[i][1]) + ":" + str(path[i][2]) + else: + new_name = path[i][0][1:] + ":" + str(raw_node_lens[path[i][0][1:]]-path[i][2]) + ":" + str(raw_node_lens[path[i][0][1:]]-path[i][1]) + if (new_name in cut_mapping): + pathstr += "_" + cut_mapping[new_name].strip().split("_")[-1] + #if i < len(path)-1: pathstr += "-" + str(overlaps[i]) + #sys.stdout.write(pathname + "\t" + pathstr + "\n") + contig_pieces[fullname].append(pathstr) + + + #contig_pieces[fullname].append("end") + +for fullname in contig_pieces: + if "name" in fullname: continue + sys.stdout.write(fullname + "\t" + "".join(contig_pieces[fullname]) + "\n") + From 5fbe6f4e0900b4acfb035a1c99d38a16be238722 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Thu, 11 Sep 2025 09:20:44 -0400 Subject: [PATCH 09/19] utig4 tranlation handle overlaps containing full utig1- nodes --- src/scripts/get_utig1_from_utig4.py | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/scripts/get_utig1_from_utig4.py b/src/scripts/get_utig1_from_utig4.py index 073c6859..df588b52 100755 --- a/src/scripts/get_utig1_from_utig4.py +++ b/src/scripts/get_utig1_from_utig4.py @@ -121,8 +121,6 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): #all these contains info about contigs - here nodes or rukki paths splitted by N #paths are transformed into mbg nodes and gaps with get_leafs procedure contig_lens = {} -contig_nodeseqs = {} -contig_nodeoverlaps = {} contig_node_offsets = {} contig_pieces = {} with open(paths_file) as f: @@ -155,14 +153,16 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): if len(path) == 1 and path[0][0][1:4] == "gap": continue - contig_nodeseqs[pathname] = path - contig_nodeoverlaps[pathname] = overlaps contig_node_offsets[pathname] = [] + contig_offsets = {} pos = 0 + end = -1 for i in range(0, len(path)-1): contig_node_offsets[pathname].append(pos) + contig_offsets[path[i][0]] = pos pos += path[i][2] - path[i][1] pos -= overlaps[i] + contig_offsets[path[-1][0]] = pos contig_node_offsets[pathname].append(pos) contig_lens[pathname] = contig_node_offsets[pathname][-1] + path[-1][2] - path[-1][1] check_len = 0 
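# Illustrative note for the cut-node lookup in the hunk below (hypothetical numbers): names are built
# in forward coordinates as "node:start:end". For a piece ("<utig1-5", 0, 40000) with
# raw_node_lens["utig1-5"] == 100000, the reverse orientation flips the range, so new_name becomes
# "utig1-5:60000:100000" before it is checked against cut_mapping; if it is absent there, the bare
# node name is used instead.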
@@ -172,23 +172,32 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): assert contig_lens[pathname] == check_len pathstr = "" for i in range(0, len(path)): - pathstr += path[i][0] # + ":" + str(path[i][1]) + ":" + str(path[i][2]) + "(" + str(contig_node_offsets[pathname][i]) + ")" - if path[i][1] != 0 or path[i][2] != raw_node_lens[path[i][0][1:]]: - new_name = "" + assert path[i][0] in contig_offsets + # build a name using the contig without the <> but also append coordinates if it's partial match to check for cut node + # if a cut version exists, use that name instead, otherwise use the original node name + new_name = path[i][0][1:] + if path[i][1] != 0 or path[i][2] != raw_node_lens[new_name]: if path[i][0][0] == ">": new_name = path[i][0][1:] + ":" + str(path[i][1]) + ":" + str(path[i][2]) else: - new_name = path[i][0][1:] + ":" + str(raw_node_lens[path[i][0][1:]]-path[i][2]) + ":" + str(raw_node_lens[path[i][0][1:]]-path[i][1]) + new_name = path[i][0][1:] + ":" + str(raw_node_lens[new_name]-path[i][2]) + ":" + str(raw_node_lens[new_name]-path[i][1]) + if new_name not in cut_mapping: + new_name = path[i][0][1:] + + # when we see the name in our path already and the offset is earlier than the largest we have already seen, this is an overlap + # we skip these overlapping nodes from the path and continue at the new unique/larger offset node + if contig_offsets[path[i][0]] <= end and new_name in pathstr: + continue + end = contig_offsets[path[i][0]] + if path[i][1] != 0 or path[i][2] != raw_node_lens[path[i][0][1:]]: if (new_name in cut_mapping): - pathstr += "_" + cut_mapping[new_name].strip().split("_")[-1] - #if i < len(path)-1: pathstr += "-" + str(overlaps[i]) - #sys.stdout.write(pathname + "\t" + pathstr + "\n") + pathstr += path[i][0] + "_" + cut_mapping[new_name].strip().split("_")[-1] + else: + pathstr += path[i][0] + else: + pathstr += path[i][0] contig_pieces[fullname].append(pathstr) - - #contig_pieces[fullname].append("end") - for fullname in contig_pieces: if "name" in fullname: continue sys.stdout.write(fullname + "\t" + "".join(contig_pieces[fullname]) + "\n") - From 43aefbaec81993af5c125205ed031771b5edfb38 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 12 Sep 2025 14:28:22 -0400 Subject: [PATCH 10/19] fix for empty partitions --- src/scripts/cluster.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index aa07c641..1f66b88f 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -121,7 +121,8 @@ def create_initial_partition_noKL(C, matchGraph): neg_graph.add_node(ec[i]) neg_graph.add_edge(ec[0], ec[1]) elif (ec[0] in C or ec[1] in C) and matchGraph.edges[ec]['weight'] < 0: - logger.warning(f"Weird, edge {ec} partially in component{C}") + #This may happen since we delete high-covered nodes, although should not be often + logger.info(f"Weird, edge {ec} partially in component{C}") #exit(1) dists = dict(nx.all_pairs_shortest_path_length(neg_graph)) #friends - should always be in the same component @@ -544,13 +545,20 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une continue C = create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_depth, edges, dists) - - #TODO: cycle for randomized starts + if len(C.nodes) == 1: + continue + #TODO: cycle for randomized starts #optimize_partition(C, parts, swap_together) best_score = FIXED_HOMOLOGY_WEIGHT * C.number_of_nodes() * C.number_of_nodes() best_part = [set(), set()] 
init_parts, opposite = create_initial_partition_noKL(C, matchGraph) + if len(init_parts[0]) == 0 or len(init_parts[1]) == 0: + logger.warning(f"Trying to phase component with empty set in initial partition, reporting it as haploid") + #May happen in weird cases of uneven coverage when some nodes from connected_component are removed from C + haploid_component_count += 1 + process_haploid_component(current_component, G, tsv_file, haploid_component_count) + continue for seed in range(0, KLIN_STARTS): # iterate on starting partition random.seed(seed) parts = random_swap(init_parts, opposite, seed) @@ -624,7 +632,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une report_phased_node(contig, ind, tsv_file) for contig in sorted(only_weights.keys()): tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') - + logger.info("Phasing successfully finished") if __name__ == "__main__": if len(sys.argv) != 7: From 221f7a9c2f2b05929490ad58320bada96ce1416b Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Thu, 18 Sep 2025 14:03:39 -0400 Subject: [PATCH 11/19] fix handling of offset for repeated nodes --- src/scripts/get_utig1_from_utig4.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/scripts/get_utig1_from_utig4.py b/src/scripts/get_utig1_from_utig4.py index df588b52..a4767e6e 100755 --- a/src/scripts/get_utig1_from_utig4.py +++ b/src/scripts/get_utig1_from_utig4.py @@ -154,15 +154,12 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): continue contig_node_offsets[pathname] = [] - contig_offsets = {} pos = 0 end = -1 for i in range(0, len(path)-1): contig_node_offsets[pathname].append(pos) - contig_offsets[path[i][0]] = pos pos += path[i][2] - path[i][1] pos -= overlaps[i] - contig_offsets[path[-1][0]] = pos contig_node_offsets[pathname].append(pos) contig_lens[pathname] = contig_node_offsets[pathname][-1] + path[-1][2] - path[-1][1] check_len = 0 @@ -172,7 +169,6 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): assert contig_lens[pathname] == check_len pathstr = "" for i in range(0, len(path)): - assert path[i][0] in contig_offsets # build a name using the contig without the <> but also append coordinates if it's partial match to check for cut node # if a cut version exists, use that name instead, otherwise use the original node name new_name = path[i][0][1:] @@ -182,13 +178,14 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens): else: new_name = path[i][0][1:] + ":" + str(raw_node_lens[new_name]-path[i][2]) + ":" + str(raw_node_lens[new_name]-path[i][1]) if new_name not in cut_mapping: - new_name = path[i][0][1:] + new_name = path[i][0][1:] # when we see the name in our path already and the offset is earlier than the largest we have already seen, this is an overlap # we skip these overlapping nodes from the path and continue at the new unique/larger offset node - if contig_offsets[path[i][0]] <= end and new_name in pathstr: + #sys.stderr.write("Checking node %s with coordinates %d-%d and offset is %d vs %d and is already used is %d\n"%(path[i][0], path[i][1], path[i][2], (contig_node_offsets[pathname][i]-path[i][1]), end, (new_name in pathstr))) + if (contig_node_offsets[pathname][i]-path[i][1]) <= end and new_name in pathstr: continue - end = contig_offsets[path[i][0]] + end = contig_node_offsets[pathname][i]-path[i][1] if path[i][1] != 0 or path[i][2] != raw_node_lens[path[i][0][1:]]: if (new_name in 
cut_mapping): pathstr += path[i][0] + "_" + cut_mapping[new_name].strip().split("_")[-1] From 5e0a8711c323daebb501be160a588da88f7afcc8 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 23 Sep 2025 19:09:14 -0400 Subject: [PATCH 12/19] changed logic for filtering hic reads from similar regions --- src/scripts/cluster.py | 59 +++++++++++++++++++++----- src/scripts/graph_functions.py | 1 + src/scripts/launch_phasing.py | 3 +- src/scripts/scaffolding/match_graph.py | 23 ++++++++-- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index 1f66b88f..44cd1d6e 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -34,6 +34,8 @@ MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only MIN_RDNA_COMPONENT = 500000 + + logger = logger_wrap.initLogger("phasing.log") @@ -292,7 +294,7 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep logger.info(f'Currently {C.number_of_nodes()} nodes') for e in hicGraph.edges(current_component): # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) - if e[0] in C and e[1] in C and matchGraph.get_edge_data(e[0], e[1]) == None: + if e[0] in C and e[1] in C and (matchGraph.get_edge_data(e[0], e[1]) == None or matchGraph.get_edge_data(e[0], e[1])['weight'] == 0): # if edges are too distant in graph, hi-c info is trash # using homologous edges when counting distances to help with coverage gaps similar_edges = [set(), set()] @@ -308,7 +310,8 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep if C.number_of_nodes() > 1: for u, v, w in matchGraph.edges.data("weight"): if u in C and v in C: - if w != None: + #No more clearing of links between nonhomologous nodes with similar regions. 
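+                #(weight 0 here marks pairs flagged as similar but not treated as homologous: they get
+                # no homology edge in C, while the relaxed condition above still keeps their Hi-C links)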
+ if w != None and w != 0: C.add_edge(u, v, weight=w) for edge in C.edges: logger.debug(f'HIC edge: {edge} {C.edges[edge]}') @@ -431,7 +434,43 @@ def output_graph_stats(G): logger.info("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f" % ( G.number_of_nodes(), G.number_of_edges(), mean, res, mean + 5 * res)) -def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, uneven_depth): +def loadHiCWithFiltering(hic_byread, mashmap_nonhpc, min_alignment): + #mashmap line: utig4-345 198652527 104460000 104510000 + utig4-838 52114952 34308831 34345700 7 50000 13 id:f:0.951746 kc:f:0.940909 + hicGraph = nx.Graph() + nonhpcHomology = match_graph.HomologyStorage(logger, mashmap_nonhpc, min_alignment) + hic_file = open(hic_byread, 'r') + for line in hic_file: + if "#" in line: + continue + line = line.strip().split() + if len(line) < 3: + continue + #introducing multimappers + if line[1].find(",") != -1 or line[2].find(",") != -1: + continue + if line[1] == line[2]: + continue + hicGraph.add_node(line[1]) + hicGraph.add_node(line[2]) + if len(line) == 6: + first_coord = int(line[4]) + second_coord = int(line[5]) + if nonhpcHomology.isInFilteredInterval(line[1], line[2], first_coord, second_coord): + continue + if len(line) > 3: + add_w = int(line[3]) + else: + add_w = 1 + w = hicGraph.get_edge_data(line[1], line[2], 0) + if w == 0: + hicGraph.add_edge(line[1], line[2], weight=add_w) + else: + w = w['weight'] + add_w + hicGraph[line[1]][line[2]]['weight'] = w + return hicGraph + +#We use nonhpc_mashmap for read filtering and hpc_mashmap for homology detection. Cound use nonhpc for both, but then there'll be hpc/nonhpc length-related issues +def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_dir, no_rdna, uneven_depth): G = nx.Graph() gf.load_indirect_graph(graph_gfa, G) output_graph_stats(G) @@ -493,10 +532,10 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une logger.info(f"Will not use coverage based homozygous nodes detection") else: logger.info(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}") - - + # load hic connections based on mappings, weight corresponds to number of times we see a connection - hicGraph = gf.loadHiCGraph(hic_byread) + hicGraph = loadHiCWithFiltering(hic_byread, nonhpc_mashmap, MIN_ALIGNMENT) + #hicGraph = gf.loadHiCGraph(hic_byread) compressed_file = open(os.path.join(output_dir, HIC_COMPRESSED_FILENAME), 'w') for node1, node2 in hicGraph.edges(): @@ -505,7 +544,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une #Adding link between matched edges to include separated sequence to main component #TODO: only one of those should be used - mg = match_graph.MatchGraph(mashmap_sim, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) + mg = match_graph.MatchGraph(hpc_mashmap, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) matchGraph = mg.getMatchGraph() component_colors = gf.getComponentColors(G) @@ -635,7 +674,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une logger.info("Phasing successfully finished") if __name__ == "__main__": - if len(sys.argv) != 7: - print(f'Usage: {sys.argv[0]} graph.gfa homologous_nodes.matches hic_byread output_dir, no_rdna, uneven_depth') + if len(sys.argv) != 8: + print(f'Usage: {sys.argv[0]} graph.gfa hpc.mashmap nonhpc.mashmap hic_byread output_dir no_rdna uneven_depth') exit() - run_clustering(sys.argv[1], sys.argv[2], 
sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]) + run_clustering(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7]) diff --git a/src/scripts/graph_functions.py b/src/scripts/graph_functions.py index 49cf38f3..bae7bbcc 100755 --- a/src/scripts/graph_functions.py +++ b/src/scripts/graph_functions.py @@ -308,6 +308,7 @@ def topological_sort(nodelens, edges): assert belongs_to_component[edge[0]] >= belongs_to_component[node] return (result, belongs_to_component) +#Likely not used anymore def loadHiCGraph(hic_byread_file): hicGraph = nx.Graph() hic_file = open(hic_byread_file, 'r') diff --git a/src/scripts/launch_phasing.py b/src/scripts/launch_phasing.py index 5ee24665..76c92f5b 100755 --- a/src/scripts/launch_phasing.py +++ b/src/scripts/launch_phasing.py @@ -26,6 +26,7 @@ os.makedirs(output_dir, exist_ok=True) matches_file = os.path.join(output_dir, "unitigs_hpc50.mashmap") + nonhpc_mashmap = os.path.join(output_dir, "unitigs_nonhpc50.mashmap") hic_file = os.path.join(output_dir, "hic_mapping.byread.output") if not os.path.exists(hic_file): hic_file = os.path.join(output_dir, "hic.byread.compressed") @@ -34,7 +35,7 @@ hic_file = compressed_hic noseq_gfa = os.path.join(output_dir, "unitigs.hpc.noseq.gfa") - cluster.run_clustering(noseq_gfa, matches_file, hic_file, output_dir, no_rdna, uneven_depth) + cluster.run_clustering(noseq_gfa, matches_file, nonhpc_mashmap, hic_file, output_dir, no_rdna, uneven_depth) #Saved for further debug diff --git a/src/scripts/scaffolding/match_graph.py b/src/scripts/scaffolding/match_graph.py index 69de528a..6ed12579 100755 --- a/src/scripts/scaffolding/match_graph.py +++ b/src/scripts/scaffolding/match_graph.py @@ -3,13 +3,15 @@ import graph_functions as gf import sys from scaffolding import logger_wrap - #classes for matchGraph construction + +#TODO: only filtered intervals are really used. So we will fail for genomes with diverged haplotypes... class HomologyInfo: #when counting approximate positions, do not allow jumps longer than this * neighboring intervals_lens JUMP_JOINING_FRACTION = 0.5 #DO not want huge jumps anyway JUMP_JOINING_ABSOLUTE = 5000000 + FILTERED_IDY_CUTOFF = 0.995 def __init__(self, node1, node2, len1, len2, logger): self.nodes = [node1, node2] @@ -33,7 +35,7 @@ def parseIDY(self, idy): def addInterval(self, intervals, orientation, idy): real_idy = self.parseIDY(idy) #Constant not depending on genome, intervals too similar for hi-c alignment to use - if real_idy > 0.995: + if real_idy > self.FILTERED_IDY_CUTOFF: for i in range(0, 2): self.filtered_intervals[i].append(intervals[i]) @@ -48,6 +50,13 @@ def addInterval(self, intervals, orientation, idy): self.orientation = orientation self.largest_interval_center = [(intervals[0][1] + intervals[0][0])/2, (intervals[1][1] + intervals[1][0])/2] + def isInFilteredInterval(self, coord1, coord2): + for i in range(0, 2): + for interval in self.filtered_intervals[i]: + if interval[0] <= coord1 <= interval[1] and interval[0] <= coord2 <= interval[1]: + return True + return False + #TODO: whether we should use orientation. 
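    # Illustrative sketch of the filtering added above (hypothetical coordinates): with
    # FILTERED_IDY_CUTOFF = 0.995, a mashmap hit reported at id:f:0.999 stores its two intervals in
    # filtered_intervals, e.g. filtered_intervals = [[[100000, 600000]], [[150000, 650000]]].
    # Then isInFilteredInterval(120000, 130000) -> True (loadHiCWithFiltering drops such a Hi-C pair,
    # since reads mapped inside near-identical regions cannot separate the haplotypes), while
    # isInFilteredInterval(1500000, 1600000) -> False and the pair is kept.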
def fillCoverage(self): for ind in range(0, 2): @@ -110,7 +119,7 @@ def getMinLength(self): return min(self.len[0], self.len[1]) class HomologyStorage: - #{node1: {node2: HomologyInfo(node_1, node_2)}} + #{node1: {node2: HomologyInfo(node_1, node_2)}} def __init__(self, logger, mashmap_file, min_alignment): self.homologies = {} self.lens = {} @@ -133,6 +142,7 @@ def __init__(self, logger, mashmap_file, min_alignment): continue used_lines +=1 #utig4-0 2145330 0 990000 + utig4-0 2145330 12 994065 37 994053 51 id:f:0.999992 kc:f:0.874893 + self.addHomology(arr[0], arr[5], int(arr[1]), int(arr[6]), [[int(arr[2]), int(arr[3])], [int(arr[7]), int(arr[8])]], arr[4], arr[12]) self.logger.info(f"Loaded {used_lines} out of {total_lines} mashmap lines") self.logger.info(f"{len(self.homologies)} nodes have at least one used homology") @@ -162,6 +172,13 @@ def isValid(self, node1, node2): def getLength(self, node): return self.lens[node] + def isInFilteredInterval(self, node1, node2, coord1, coord2): + if not self.isValid(node1, node2): + return False + if self.homologies[node1][node2].isInFilteredInterval(coord1, coord2): + return True + return False + def getApproximatePositionLength(self, node1, node2, ind): if not self.isValid(node1, node2): return 0 From c1a8675cfa1f43a2350751ca99a5d1842d2299e8 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Sat, 27 Sep 2025 18:39:53 -0400 Subject: [PATCH 13/19] logging simplified --- src/Snakefiles/8-hicPipeline.sm | 1 + src/scripts/cluster.py | 104 +++++------ src/scripts/launch_phasing.py | 2 + src/scripts/launch_scaffolding.py | 11 +- src/scripts/logging_utils.py | 83 +++++++++ src/scripts/scaffolding/logger_wrap.py | 48 ----- src/scripts/scaffolding/match_graph.py | 45 ++--- src/scripts/scaffolding/scaffold_graph.py | 216 +++++++++++----------- 8 files changed, 268 insertions(+), 242 deletions(-) create mode 100755 src/scripts/logging_utils.py delete mode 100755 src/scripts/scaffolding/logger_wrap.py diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm index 204ada61..2ee2647a 100644 --- a/src/Snakefiles/8-hicPipeline.sm +++ b/src/Snakefiles/8-hicPipeline.sm @@ -489,6 +489,7 @@ chmod +x ./transform_bwa.sh rule hicPhasing: input: mashmap_matches = rules.runMashMap.output.unitigs_hpc50, + mashmap_nohpcmatches = rules.runMashMap.output.unitigs_nonhpc50, hicmapping_byread = '8-hicPipeline/hic_mapping.byread.output', graph='8-hicPipeline/unitigs.hpc.noseq.gfa' output: diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index 44cd1d6e..ec7db03e 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -6,7 +6,7 @@ import os from networkx.algorithms import community import graph_functions as gf -from scaffolding import match_graph, logger_wrap +from scaffolding import match_graph #TODO: move constants to some more relevant place @@ -36,8 +36,6 @@ MIN_RDNA_COMPONENT = 500000 -logger = logger_wrap.initLogger("phasing.log") - def check_non_empty(part, G): for p in part: @@ -98,11 +96,11 @@ def getMedianCov(nodeset): sum_l = 0 sorted_nodes = sorted(nodeset, key=lambda node: node[1]['coverage']) for node in sorted_nodes: - logger.debug(f'Node {node[0]} coverage {node[1]["coverage"]} length {node[1]["length"]}') + logging.debug(f'Node {node[0]} coverage {node[1]["coverage"]} length {node[1]["length"]}') sum_l += node[1]['length'] if 2*sum_l > total_length: med_cov = node[1]['coverage'] - logger.info(f'Median coverage is {med_cov}') + logging.info(f'Median coverage is {med_cov}') break return med_cov @@ -124,7 +122,7 @@ def 
create_initial_partition_noKL(C, matchGraph): neg_graph.add_edge(ec[0], ec[1]) elif (ec[0] in C or ec[1] in C) and matchGraph.edges[ec]['weight'] < 0: #This may happen since we delete high-covered nodes, although should not be often - logger.info(f"Weird, edge {ec} partially in component{C}") + logging.info(f"Weird, edge {ec} partially in component{C}") #exit(1) dists = dict(nx.all_pairs_shortest_path_length(neg_graph)) #friends - should always be in the same component @@ -142,8 +140,8 @@ def create_initial_partition_noKL(C, matchGraph): friends[n].append(other) else: enemies[n].append(other) - logger.debug(f"Friends of {n}: {friends[n]}") - logger.debug(f"Enemies of {n}: {enemies[n]}") + logging.debug(f"Friends of {n}: {friends[n]}") + logging.debug(f"Enemies of {n}: {enemies[n]}") parts = [set(), set()] C_nodes = list(C.nodes()) processed_nodes = set() @@ -152,7 +150,7 @@ def create_initial_partition_noKL(C, matchGraph): #lets remove non-homologous nodes if not n in enemies and n.find("Aux") == -1: C.remove_node(n) - logger.debug(f"Removing node {n}") + logging.debug(f"Removing node {n}") else: start_component = random.randint(0, 1) if not n in processed_nodes: @@ -228,7 +226,7 @@ def optimize_partition(C, parts, swap_together): for n in all_nodes: score_change = score_swap(C, parts, swap_together[n]) - logger.debug(f"Trying to move {n} and its neighbors {swap_together[n]}, old_score {cur_score}, swap score {score_change}") + logging.debug(f"Trying to move {n} and its neighbors {swap_together[n]}, old_score {cur_score}, swap score {score_change}") if score_change > 0: upd_score = cur_score + score_change for swap_node in swap_together[n]: @@ -238,7 +236,7 @@ def optimize_partition(C, parts, swap_together): parts[j].remove(swap_node) break not_changed = 0 - logger.debug(f"Moved {n} and its neighbors {swap_together[n]},improving score") + logging.debug(f"Moved {n} and its neighbors {swap_together[n]},improving score") not_changed += 1 return parts @@ -265,10 +263,10 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep sys.stderr.write("Error, got a node not in original graph %s !" 
% (n)) sys.exit() if G.nodes[n]['coverage'] > local_max_cov and (not uneven_depth): - logger.debug("While partitoning dropping node %s coverage too high" % (n)) + logging.debug("While partitoning dropping node %s coverage too high" % (n)) short.append(n) elif not (n in matchGraph) and uneven_depth: - logger.info("While partitoning dropping node %s uneven coverage and no matches" % (n)) + logging.info("While partitoning dropping node %s uneven coverage and no matches" % (n)) short.append(n) elif G.nodes[n]['length'] < MIN_LEN: collapseOrientedNode(edges, n) @@ -287,11 +285,11 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep break if not good: - logger.info("While partitoning dropping node %s low links count" % (n)) + logging.info("While partitoning dropping node %s low links count" % (n)) short.append(n) C.remove_nodes_from(short) - logger.info(f'Currently {C.number_of_nodes()} nodes') + logging.info(f'Currently {C.number_of_nodes()} nodes') for e in hicGraph.edges(current_component): # currently only added edges if these nodes are in the component and not matches (homologous) but should allow links to singletons too (to phase disconnected nodes correctly) if e[0] in C and e[1] in C and (matchGraph.get_edge_data(e[0], e[1]) == None or matchGraph.get_edge_data(e[0], e[1])['weight'] == 0): @@ -305,8 +303,8 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep similar_edges[ind].add(e[ind]) #TODO: likely this is not needed anymore since we already added links between homologous edges. C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight']) - logger.info(f'Currently {C.number_of_nodes()} in current component') - logger.debug(C.nodes()) + logging.info(f'Currently {C.number_of_nodes()} in current component') + logging.debug(C.nodes()) if C.number_of_nodes() > 1: for u, v, w in matchGraph.edges.data("weight"): if u in C and v in C: @@ -314,7 +312,7 @@ def create_graph_to_phase(current_component, G, matchGraph, hicGraph, uneven_dep if w != None and w != 0: C.add_edge(u, v, weight=w) for edge in C.edges: - logger.debug(f'HIC edge: {edge} {C.edges[edge]}') + logging.debug(f'HIC edge: {edge} {C.edges[edge]}') return C def score_partition(C, partition): @@ -325,8 +323,8 @@ def score_partition(C, partition): if [i, j] in C.edges(): sum_w += C.edges[i, j]['weight'] if C.edges[i, j]['weight'] < 0: - - logger.error(f"Negative edge {i} {j} with weight {C.edges[i, j]['weight']} IN THE SAME PARTITION") + + logging.error(f"Negative edge {i} {j} with weight {C.edges[i, j]['weight']} IN THE SAME PARTITION") exit() return sum_w def score_swap(C, partition, to_swap): @@ -374,9 +372,9 @@ def is_duplication_like(n1, n2, dists, or_G): dist = (dists[or_n1][or_n2] - or_G.nodes[or_n1]['length'] - or_G.nodes[or_n2]['length']) / 2 if dist < CLOSE_SUSP_HOMOLOGY_DIST: if or_n2 in dists and or_n1 in dists[or_n2] and dists[or_n2][or_n1] < CLOSE_SUSP_HOMOLOGY_DIST: - logger.info(f"Nodes {or_n1} and {or_n2} from haploid component in loop, {dist}") + logging.info(f"Nodes {or_n1} and {or_n2} from haploid component in loop, {dist}") else: - logger.info(f"Nodes {or_n1} and {or_n2} one direction close {dist} but not another") + logging.info(f"Nodes {or_n1} and {or_n2} one direction close {dist} but not another") return True return False @@ -403,13 +401,13 @@ def remove_pseudo_homology(current_component, or_G, dists, mg): if (mg.isDiploid(current_component)): if clear_homology_length * MAX_UNDIPLOID_FRACTION < total_homology_length: if clear_homology_length > 0: - 
logger.info(f"Cleaning diploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + logging.info(f"Cleaning diploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") clear_links(clear_diploids, mg) else: - logger.info(f"NOT cleaning diploid component {clear_diploids}, too high fraction. hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + logging.info(f"NOT cleaning diploid component {clear_diploids}, too high fraction. hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") else: if clear_homology_length > 0: - logger.info(f"Cleaning haploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") + logging.info(f"Cleaning haploid component {clear_diploids} hom to clear:{clear_homology_length} hom total: {total_homology_length} size total: {total_len}") if clear_homology_length == total_homology_length: return "no_diploid" return "diploid" @@ -431,13 +429,13 @@ def output_graph_stats(G): mean = sum(degrees) / G.number_of_nodes() variance = sum([((x - mean) ** 2) for x in degrees]) / G.number_of_nodes() res = variance ** 0.5 - logger.info("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f" % ( + logging.info("Loaded a graph with %d nodes and %d edges avg degree %f and stdev %f max is %f" % ( G.number_of_nodes(), G.number_of_edges(), mean, res, mean + 5 * res)) def loadHiCWithFiltering(hic_byread, mashmap_nonhpc, min_alignment): #mashmap line: utig4-345 198652527 104460000 104510000 + utig4-838 52114952 34308831 34345700 7 50000 13 id:f:0.951746 kc:f:0.940909 hicGraph = nx.Graph() - nonhpcHomology = match_graph.HomologyStorage(logger, mashmap_nonhpc, min_alignment) + nonhpcHomology = match_graph.HomologyStorage(mashmap_nonhpc, min_alignment) hic_file = open(hic_byread, 'r') for line in hic_file: if "#" in line: @@ -529,9 +527,9 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d MAX_COV = med_cov * 1.5 if (uneven_depth): - logger.info(f"Will not use coverage based homozygous nodes detection") + logging.info(f"Will not use coverage based homozygous nodes detection") else: - logger.info(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}") + logging.info(f"Will use coverage based homozygous nodes detection, cutoff: {MAX_COV}") # load hic connections based on mappings, weight corresponds to number of times we see a connection hicGraph = loadHiCWithFiltering(hic_byread, nonhpc_mashmap, MIN_ALIGNMENT) @@ -544,7 +542,7 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d #Adding link between matched edges to include separated sequence to main component #TODO: only one of those should be used - mg = match_graph.MatchGraph(hpc_mashmap, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) + mg = match_graph.MatchGraph(hpc_mashmap, G, FIXED_HOMOLOGY_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT) matchGraph = mg.getMatchGraph() component_colors = gf.getComponentColors(G) @@ -553,16 +551,16 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d for [v1,v2] in matchGraph.edges(): if v1 in G.nodes and v2 in G.nodes and matchGraph[v1][v2]['weight'] < 0: if component_colors[v1] != component_colors[v2]: - logger.info(f"Adding graph link between homologous {v1} 
{v2}, components {component_colors[v1]} and {component_colors[v2]}") + logging.info(f"Adding graph link between homologous {v1} {v2}, components {component_colors[v1]} and {component_colors[v2]}") G.add_edge(v1, v2) - logger.info(f"Loaded hic info with {hicGraph.number_of_nodes()} nodes and {hicGraph.number_of_edges()} edges") + logging.info(f"Loaded hic info with {hicGraph.number_of_nodes()} nodes and {hicGraph.number_of_edges()} edges") compressed_file.close() dists = dict(nx.all_pairs_dijkstra_path_length(G, weight=lambda u, v, d: G.nodes[v]['length'])) - logger.info("Distances counted") + logging.info("Distances counted") # connected components decomposition and log the IDs and partition each one @@ -571,7 +569,7 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d dists = dict(nx.all_pairs_dijkstra_path_length(or_G, weight=lambda u, v, d: or_G.edges[u, v]['mid_length'])) for current_component in sorted(nx.connected_components(G), key=len, reverse=True): if len(current_component) > 1: - logger.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) + logging.info("Connected component with %d nodes is: %s" % (len(current_component), current_component)) cleared = remove_pseudo_homology(current_component, or_G, dists, mg) #only backward compatibility with KL @@ -593,7 +591,7 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d best_part = [set(), set()] init_parts, opposite = create_initial_partition_noKL(C, matchGraph) if len(init_parts[0]) == 0 or len(init_parts[1]) == 0: - logger.warning(f"Trying to phase component with empty set in initial partition, reporting it as haploid") + logging.warning(f"Trying to phase component with empty set in initial partition, reporting it as haploid") #May happen in weird cases of uneven coverage when some nodes from connected_component are removed from C haploid_component_count += 1 process_haploid_component(current_component, G, tsv_file, haploid_component_count) @@ -602,20 +600,20 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d random.seed(seed) parts = random_swap(init_parts, opposite, seed) part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) - logger.debug(f"init_part:\n{sorted(part[0])}\n{sorted(part[1])}") + logging.debug(f"init_part:\n{sorted(part[0])}\n{sorted(part[1])}") sum_w = score_partition(C, part) if (sum_w > best_score): - logger.info(f'Seed {seed} score {sum_w} improved over {best_score}') + logging.info(f'Seed {seed} score {sum_w} improved over {best_score}') best_part = part best_score = sum_w - logger.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") + logging.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") best_part = unglue_nodes(best_part) #try to move relatively short edges to fix case of unbalanced haplo sizes (complex repeats on only one haplo) # best_part = community.kernighan_lin.kernighan_lin_bisection(C, partition=parts, max_iter=KLIN_ITER, weight='weight', seed=seed) -# logger.debug(f"parts:\n{sorted(parts[0])}\n{sorted(parts[1])}") - #logger.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") +# logging.debug(f"parts:\n{sorted(parts[0])}\n{sorted(parts[1])}") + #logging.debug(f"best_part:\n{sorted(best_part[0])}\n{sorted(best_part[1])}") """ """ add_part = [set(), set()] @@ -629,39 +627,39 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, 
hic_byread, output_d total_w = 0 for e in hicGraph.edges(n): if (e[0] != n or e[1] != n) and (e[0] in current_component and e[1] in current_component): - #logger.debug(f'edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}') + #logging.debug(f'edge {e} to short {hicGraph[e[0]][e[1]]["weight"]}') for ind in range(0, 2): if e[0] in best_part[ind] or e[1] in best_part[ind]: if hicGraph[e[0]][e[1]]["weight"] > MIN_WEIGHT: - logger.debug(f'{e} edge to partition {ind}!') + logging.debug(f'{e} edge to partition {ind}!') weights[ind] += hicGraph[e[0]][e[1]]["weight"] all_weights[ind] += hicGraph[e[0]][e[1]]["weight"] total_w += hicGraph[e[0]][e[1]]["weight"] if weights[0] > SIGNIFICANT_MAJORITY * weights[1]: add_part[0].add(n) - logger.debug(f"Edge {n} assigned color 0") + logging.debug(f"Edge {n} assigned color 0") elif weights[1] > SIGNIFICANT_MAJORITY * weights[0]: add_part[1].add(n) - logger.debug(f"Edge {n} assigned color 1") + logging.debug(f"Edge {n} assigned color 1") else: - logger.debug(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}") - logger.debug(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}") + logging.debug(f"Edge {n} weights {weights[0]} and {weights[1]} coverage {G.nodes[n]['coverage']}") + logging.debug(f"Edge {n} real weights {all_weights[0]} and {all_weights[1]} coverage {G.nodes[n]['coverage']} total weight {total_w}") if (all_weights[0] > SIGNIFICANT_MAJORITY * all_weights[1] or all_weights[1] > SIGNIFICANT_MAJORITY * all_weights[0]) and (all_weights[0] > MIN_WEIGHT or all_weights[1]> MIN_WEIGHT): - logger.debug(f"Edge {n} real weights allows to make a decision forbidden by weights!") + logging.debug(f"Edge {n} real weights allows to make a decision forbidden by weights!") if G.nodes[n]['length'] < MIN_LEN: only_weights[n] = all_weights else: if all_weights[0] > all_weights[1]: add_part[0].add(n) - logger.debug(f"Edge {n} assigned color 0 by additional all_weights") + logging.debug(f"Edge {n} assigned color 0 by additional all_weights") else: add_part[1].add(n) - logger.debug(f"Edge {n} assigned color 1 by additional all_weights") + logging.debug(f"Edge {n} assigned color 1 by additional all_weights") elif G.nodes[n]['length'] < MIN_LEN: - logger.debug(f"Edge {n} did not pass coverage check") + logging.debug(f"Edge {n} did not pass coverage check") - logger.info(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition') - logger.info(f'Added short edges\t{add_part}') + logging.info(f'Added {len(add_part[0]) + len (add_part[1])} short edges to bipartition') + logging.info(f'Added short edges\t{add_part}') best_part[0].update(add_part[0]) best_part[1].update(add_part[1]) for ind in range (0, 2): @@ -671,7 +669,7 @@ def run_clustering (graph_gfa, hpc_mashmap, nonhpc_mashmap, hic_byread, output_d report_phased_node(contig, ind, tsv_file) for contig in sorted(only_weights.keys()): tsv_file.write(f'{contig}\t{only_weights[contig][0]}\t{only_weights[contig][1]}\t{only_weights[contig][0]}:{only_weights[contig][1]}\t#88FF88\n') - logger.info("Phasing successfully finished") + logging.info("Phasing successfully finished") if __name__ == "__main__": if len(sys.argv) != 8: diff --git a/src/scripts/launch_phasing.py b/src/scripts/launch_phasing.py index 76c92f5b..5f112ca1 100755 --- a/src/scripts/launch_phasing.py +++ b/src/scripts/launch_phasing.py @@ -6,6 +6,7 @@ from os import listdir from os.path import isfile, join import cluster +import logging_utils if 
__name__ == "__main__": if len(sys.argv) < 4: print(f'Usage: {sys.argv[0]} ') @@ -35,6 +36,7 @@ hic_file = compressed_hic noseq_gfa = os.path.join(output_dir, "unitigs.hpc.noseq.gfa") + logging_utils.setup_logging(os.path.join(output_dir, "phasing.log")) cluster.run_clustering(noseq_gfa, matches_file, nonhpc_mashmap, hic_file, output_dir, no_rdna, uneven_depth) #Saved for further debug diff --git a/src/scripts/launch_scaffolding.py b/src/scripts/launch_scaffolding.py index 82f44dd5..a7484506 100755 --- a/src/scripts/launch_scaffolding.py +++ b/src/scripts/launch_scaffolding.py @@ -9,8 +9,8 @@ from numpy import argmax import graph_functions as gf import logging -from scaffolding import scaffold_graph, logger_wrap, path_storage - +from scaffolding import scaffold_graph, path_storage +import logging_utils #get all rdna nodes from mashmap #get nodes from seqtk telo #select nodes that are between 500K and 2M and close to both rDNA and telomeres to be short arm @@ -36,16 +36,13 @@ scaff_rukki_gaf_file = os.path.join(hicrun_dir, "scaff_rukki.paths.gaf") translation_paths_file = os.path.join (hicrun_dir, "final_contigs/6-layoutContigs/unitig-popped.layout.scfmap") translation_hap1_file = os.path.join (hicrun_dir, os.pardir, "translation_hap1") -rdna_file = os.path.join(hicrun_dir, os.pardir, "rdnanodes.txt") hic_byread = os.path.join(hicrun_dir, "hic.byread.compressed") -hicverkko_log = os.path.join(hicrun_dir, "hicverkko.log") - G = nx.DiGraph() gf.load_direct_graph(gfa_file, G) -logger = logger_wrap.initLogger(os.path.join(hicrun_dir, "scaffolding.log")) +logging_utils.setup_logging(os.path.join(hicrun_dir, "scaffolding.log")) path_mashmap = os.path.join(hicrun_dir, "paths2ref.mashmap") #path_mashmap = os.path.join(hicrun_dir, "paths2chm13100K.mashmap") rukki_path_storage = path_storage.PathStorage(G) @@ -54,6 +51,6 @@ read_aln = os.path.join(hicrun_dir, "hic_mapping.byread.output") else: read_aln = os.path.join(hicrun_dir, "hic_nodefiltered.bam") -sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec, logger) +sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec) res = sg.generateScaffolds() diff --git a/src/scripts/logging_utils.py b/src/scripts/logging_utils.py new file mode 100755 index 00000000..aa59cdc8 --- /dev/null +++ b/src/scripts/logging_utils.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +import sys +import logging +import time +import os +import subprocess + + +def setup_logging(log_file, file_log_level = logging.DEBUG): + log_level = logging.INFO + # Configure logging with runtime from program start + start_time = time.time() + + datefmt = '%H:%M:%S' + + def format_runtime(seconds): + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds = int(seconds % 60) + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + class RuntimeFormatter(logging.Formatter): + def __init__(self, fmt=None, datefmt=None, start_time=None): + super().__init__(fmt, datefmt) + self.start_time = start_time + + def format(self, record):
runtime_seconds = time.time() - self.start_time + record.runtime = format_runtime(runtime_seconds) + return super().format(record) + + log_format = '%(runtime)s - %(levelname)s - [%(filename)s:%(funcName)s:%(lineno)d] - %(message)s' + formatter = RuntimeFormatter(log_format, datefmt) + + # Always log to both file and console + #log_file = os.path.join(args.outdir, f"{args.basename}.log") + + # Configure root logger + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + # File handler + file_handler = logging.FileHandler(log_file, mode='w') + file_handler.setFormatter(RuntimeFormatter(log_format, datefmt=datefmt, start_time=start_time)) + file_handler.setLevel(file_log_level) + root_logger.addHandler(file_handler) + + # Console handler + console_handler = logging.StreamHandler(sys.stderr) + console_handler.setFormatter(RuntimeFormatter(log_format, datefmt=datefmt, start_time=start_time)) + console_handler.setLevel(logging.INFO) + root_logger.addHandler(console_handler) + + # Log the GitHub commit hash if available + try: + commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=sys.path[0]).strip().decode('utf-8') + logging.info(f"GitHub Commit Hash: {commit_hash}") + except Exception as e: + logging.warning(f"Failed to retrieve GitHub commit hash: {e}") + + # Log the command-line arguments + logging.info(f"Command-line arguments: {' '.join(sys.argv)}") + logging.info(f"Logging to file: {log_file}") + + +def log_assert(condition, message, logger=None): + """Assert a condition and log an error message if it fails.""" + if not condition: + error_msg = f"Assertion failed: {message}" + if logger: + logger.error(error_msg) + else: + logging.error(error_msg) + exit(1) + #raise AssertionError(error_msg) diff --git a/src/scripts/scaffolding/logger_wrap.py b/src/scripts/scaffolding/logger_wrap.py deleted file mode 100755 index 6941ba9e..00000000 --- a/src/scripts/scaffolding/logger_wrap.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -import logging -import time - -class UpdatedFormatter(logging.Formatter): - def __init__(self, fmt=None, datefmt=None, style='%', start_time=None): - super().__init__(fmt, datefmt, style) - self.start_time = start_time or time.time() - - def format(self, record): - elapsed_seconds = time.time() - self.start_time - elapsed_time = self.formatTime(record, datefmt="%H:%M:%S") - elapsed_seconds = record.relativeCreated / 1000 - hours, remainder = divmod(elapsed_seconds, 3600) - minutes, seconds = divmod(remainder, 60) - record.elapsed_seconds = "%02d:%02d:%02d" % (hours, minutes, seconds) - record.elapsed_time = elapsed_time - return super().format(record) - -class UpdatedAdapter(logging.LoggerAdapter): - def __init__(self, logger, classname): - super().__init__(logger, {'classname': classname}) - - def process(self, msg, kwargs): - return f'{self.extra["classname"]} - {msg}', kwargs - - -def initLogger(log_file): - logger = logging.getLogger('HiCPipeline') - - # Clear any existing handlers to prevent duplication and unexpected behavior - logger.handlers.clear() - - logger.setLevel(logging.DEBUG) - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - formatter = UpdatedFormatter('%(elapsed_seconds)s %(levelname)s: %(message)s', datefmt='%H:%M:%S') - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - file_handler = logging.FileHandler(log_file, mode='w') - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - - # 
Prevent propagation to root logger (which might have its own handlers) - logger.propagate = False - - return logger \ No newline at end of file diff --git a/src/scripts/scaffolding/match_graph.py b/src/scripts/scaffolding/match_graph.py index 6ed12579..fe5a03d8 100755 --- a/src/scripts/scaffolding/match_graph.py +++ b/src/scripts/scaffolding/match_graph.py @@ -2,7 +2,7 @@ import networkx as nx import graph_functions as gf import sys -from scaffolding import logger_wrap +import logging #classes for matchGraph construction #TODO: only filtered intervals are really used. So we will fail for genomes with diverged haplotypes... @@ -13,7 +13,7 @@ class HomologyInfo: JUMP_JOINING_ABSOLUTE = 5000000 FILTERED_IDY_CUTOFF = 0.995 - def __init__(self, node1, node2, len1, len2, logger): + def __init__(self, node1, node2, len1, len2): self.nodes = [node1, node2] self.len = [len1, len2] self.covered = [0, 0] @@ -28,7 +28,6 @@ def __init__(self, node1, node2, len1, len2, logger): #TODO: should be used for homology check in scaffolding, with specific IDY cutoff and not sorted self.filtered_intervals = [[],[]] - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) def parseIDY(self, idy): return float(idy.split(":")[2]) @@ -78,7 +77,7 @@ def fillCoverage(self): if state == 0: current_end = prev covered_intervals.append([current_start, current_end]) - self.logger.debug (f"Covered-intervals {self.nodes} {covered_intervals}") + logging.debug (f"Covered-intervals {self.nodes} {covered_intervals}") #TODO Or possibly use approximate alignment intervals? self.covered[ind] = total_c #if there are short trashy intervals between long ones - do not want them to spoil jumping. So dynamic programming here @@ -97,7 +96,7 @@ def fillCoverage(self): if (jump > self.JUMP_JOINING_FRACTION * prev_len and jump > self.JUMP_JOINING_FRACTION * next_len) or jump > self.JUMP_JOINING_ABSOLUTE: continue available_next[i].add(j) - self.logger.debug(f"Available next {self.nodes} {available_next}") + logging.debug(f"Available next {self.nodes} {available_next}") max_i = 0 for i in range (0, len(covered_intervals)): for j in available_next[i]: @@ -120,12 +119,9 @@ def getMinLength(self): class HomologyStorage: #{node1: {node2: HomologyInfo(node_1, node_2)}} - def __init__(self, logger, mashmap_file, min_alignment): + def __init__(self, mashmap_file, min_alignment): self.homologies = {} self.lens = {} - - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) - total_lines = 0 used_lines = 0 for line in open(mashmap_file, 'r'): @@ -144,8 +140,8 @@ def __init__(self, logger, mashmap_file, min_alignment): #utig4-0 2145330 0 990000 + utig4-0 2145330 12 994065 37 994053 51 id:f:0.999992 kc:f:0.874893 self.addHomology(arr[0], arr[5], int(arr[1]), int(arr[6]), [[int(arr[2]), int(arr[3])], [int(arr[7]), int(arr[8])]], arr[4], arr[12]) - self.logger.info(f"Loaded {used_lines} out of {total_lines} mashmap lines") - self.logger.info(f"{len(self.homologies)} nodes have at least one used homology") + logging.info(f"Loaded {used_lines} out of {total_lines} mashmap lines") + logging.info(f"{len(self.homologies)} nodes have at least one used homology") self.fillCoverage() #do we want to add other direction of pair? 
@@ -153,7 +149,7 @@ def addHomology(self, node1, node2, len1, len2, intervals, orientation, idy): if not node1 in self.homologies: self.homologies[node1] = {} if not node2 in self.homologies[node1]: - self.homologies[node1][node2] = HomologyInfo(node1, node2, len1, len2, self.logger) + self.homologies[node1][node2] = HomologyInfo(node1, node2, len1, len2) self.homologies[node1][node2].addInterval(intervals, orientation, idy) self.lens[node1] = len1 self.lens[node2] = len2 @@ -196,13 +192,10 @@ class MatchGraph: #homologous intervals should cover at least 1/3 of at least one of the nodes in pair REQUIRED_COVERAGE_FRACTION = 1/3 - def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignment, logger): + def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignment): self.matchGraph = nx.Graph() - self.hom_storage = HomologyStorage(logger, mashmap_sim, min_alignment) + self.hom_storage = HomologyStorage(mashmap_sim, min_alignment) self.G = G - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) - - #Do not want to think whether we use diGraph or Graph, internal structures are not oriented neighbours = {} indirect_nodes = set() @@ -212,14 +205,14 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm for edge in G.edges(): for i in range(0, 2): neighbours[edge[i].strip('-+')].add(edge[1 - i].strip('-+')) - self.logger.info(f"Loaded {len(self.hom_storage.homologies)} homologies") + logging.info(f"Loaded {len(self.hom_storage.homologies)} homologies") for node1 in self.hom_storage.homologies: for node2 in self.hom_storage.homologies[node1]: #we deleted some nodes after mashmap if node1 in indirect_nodes and node2 in indirect_nodes: cur_homology = self.hom_storage.homologies[node1][node2].getCoveredLen() if cur_homology > min_big_homology: - self.logger.debug(f"Adding normal edge {node1} {node2} {cur_homology}") + logging.debug(f"Adding normal edge {node1} {node2} {cur_homology}") self.matchGraph.add_edge(node1, node2, homology_len = cur_homology) else: #less strict condition for bulge-like structure @@ -228,7 +221,7 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm # possibly check whether we have something already phased nearby? # strict bulge-like condition, either R1[A/B]R2 or R1[A/B] or [A/B]R2. 
Digraph can be more precise here but anyway if cur_homology > len_cutoff and neighbours[node1] == neighbours[node2] and len(neighbours[node1]) > 0: - self.logger.debug(f"Adding bulge-like edge {node1} {node2} {cur_homology} {len_cutoff}") + logging.debug(f"Adding bulge-like edge {node1} {node2} {cur_homology} {len_cutoff}") self.matchGraph.add_edge(node1, node2, homology_len = cur_homology) # while we build the initial partition give a big bonus edge for putting the homologous nodes into different partitions @@ -263,10 +256,10 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm else: self.matchGraph.remove_edge(ec[0],ec[1]) - self.logger.info("Loaded match info with %d nodes and %d edges" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) + logging.info("Loaded match info with %d nodes and %d edges" % (self.matchGraph.number_of_nodes(), self.matchGraph.number_of_edges())) for d in sorted(self.matchGraph.edges()): - self.logger.debug(f"homology edge {d} : {self.matchGraph.edges[d]}") + logging.debug(f"homology edge {d} : {self.matchGraph.edges[d]}") def getMatchGraph(self): return self.matchGraph @@ -333,7 +326,7 @@ def isHomologousPath (self, paths, lens): hom_size += self.matchGraph.edges[nor_p0, nor_p1]['homology_len'] if hom_size * 2> lens[0] or hom_size * 2> lens[1]: - self.logger.debug (f"Found homologous paths {paths} with homology size {hom_size}") + logging.debug (f"Found homologous paths {paths} with homology size {hom_size}") return True else: return False @@ -346,15 +339,15 @@ def getHomologousPaths(self, path_storage, path_id): for hom_node in self.matchGraph.neighbors(node): if self.matchGraph.edges[node, hom_node]['weight'] < 0: homologous_nodes.add(hom_node) - self.logger.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}") + logging.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}") paths_to_check = set() for node in homologous_nodes: paths_to_check.update(path_storage.getPathsFromNode(node)) - self.logger.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}") + logging.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}") for susp_path_id in paths_to_check: if self.isHomologousPath([path_storage.getPathById(path_id), path_storage.getPathById(susp_path_id)], [path_storage.getLength(path_id), path_storage.getLength(susp_path_id)]): homologous_paths.append(susp_path_id) - self.logger.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}") + logging.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}") return homologous_paths diff --git a/src/scripts/scaffolding/scaffold_graph.py b/src/scripts/scaffolding/scaffold_graph.py index fff229ab..471be7c4 100755 --- a/src/scripts/scaffolding/scaffold_graph.py +++ b/src/scripts/scaffolding/scaffold_graph.py @@ -16,7 +16,8 @@ import pysam #TODO: remove clother to release? 
import psutil -from scaffolding import logger_wrap, match_graph, path_storage +from scaffolding import match_graph, path_storage + #from line_profiler import LineProfiler class ReferencePosition: @@ -117,26 +118,25 @@ class ScaffoldGraph: MAX_GRAPH_FOR_DISTANCES = 1000000 MAX_COMPONENT_FOR_DISTANCES = 50000 - def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger): - self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) + def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec): self.rukki_paths = rukki_paths self.uncompressed_lens = gf.get_lengths(uncompressed_fasta) self.multiplicities = rukki_paths.getEdgeMultiplicities() interesting_nodes = self.getInterestingNodes(self.uncompressed_lens) - self.logger.info(f"Total nodes {len(G.nodes)} interesting nodes {len(interesting_nodes)}") + logging.info(f"Total nodes {len(G.nodes)} interesting nodes {len(interesting_nodes)}") #upd_G - graph with telomeric nodes self.tel_nodes, self.upd_G = gf.get_telomeric_nodes(telomere_locations_file, G) - self.logger.debug("Telomeric nodes") - self.logger.debug(self.tel_nodes) + logging.debug("Telomeric nodes") + logging.debug(self.tel_nodes) - self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT, logger) + self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT) if porec == True: - self.logger.info ("Loading pore-c alignments") + logging.info ("Loading pore-c alignments") all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True) else: - self.logger.info ("Loading Hi-C alignments") + logging.info ("Loading Hi-C alignments") all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True) @@ -179,15 +179,15 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat unique_scores = {} self.dists = {} if G.number_of_nodes() > ScaffoldGraph.MAX_GRAPH_FOR_DISTANCES: - self.logger.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances") + logging.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances") else: max_comp_size = len(max(nx.weakly_connected_components(G), key=len)) if max_comp_size > ScaffoldGraph.MAX_COMPONENT_FOR_DISTANCES: - self.logger.info(f"Biggest component is too big {max_comp_size}, not calculating pairwise distances") + logging.info(f"Biggest component is too big {max_comp_size}, not calculating pairwise distances") else: - self.logger.info("Calculating pairwise distances for assembly graph nodes") + logging.info("Calculating pairwise distances for assembly graph nodes") self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length'])) - self.logger.info("Pairwise distances in assembly graph calculated") + logging.info("Pairwise distances in assembly graph calculated") self.haploids = self.getHaploidPaths(self.rukki_paths) self.scores = {} @@ -217,7 +217,7 @@ def isHaploidPath(self, paths, nor_path_id): if total_hom * 2 < path_len: #DEBUG ONLY if path_len > ScaffoldGraph.MIN_EXPECTED_T2T: - self.logger.info(f"Found haploid path {nor_path_id} with homology {total_hom} and len {path_len} ") + logging.info(f"Found haploid path {nor_path_id} 
with homology {total_hom} and len {path_len} ") return True else: return False @@ -260,7 +260,7 @@ def nodeToPathDist(self, node, path, check_homologous): closest = ScaffoldGraph.TOO_FAR add_dist = 0 for path_node in path: - #self.logger.debug (f"Checking nodepair dist from {node} to {path_node}") + #logging.debug (f"Checking nodepair dist from {node} to {path_node}") if path_node in self.upd_G.nodes: #TODO: likely this is not needed # if path_node == node: @@ -268,9 +268,9 @@ def nodeToPathDist(self, node, path, check_homologous): # else: closest = min(closest, add_dist + self.orNodeDist(node, path_node)) if check_homologous: - #self.logger.debug(f"Checking homologous to node {path_node}: {self.homologousOrNodes(path_node)}") + #logging.debug(f"Checking homologous to node {path_node}: {self.homologousOrNodes(path_node)}") for hom_node in self.match_graph.getHomologousOrNodes(path_node, True): - #self.logger.debug (f"Checking homologous nodepair dist from {node} to {hom_node}") + #logging.debug (f"Checking homologous nodepair dist from {node} to {hom_node}") if hom_node == node: closest = 0 closest = min(closest, add_dist + self.orNodeDist(node, hom_node)) @@ -286,7 +286,7 @@ def pathDist(self, path_from:list, path_to:list, check_homologous_from:bool, che closest = min(closest, self.nodeToPathDist(node, path_to, check_homologous_to) + add_dist) if check_homologous_from: for hom_node in self.match_graph.getHomologousOrNodes(node, True) : - #self.logger.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}") + #logging.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}") closest = min(closest, (self.nodeToPathDist(hom_node, path_to, check_homologous_to) + add_dist)) add_dist += self.compressed_lens[node.strip("-+")] return closest @@ -327,7 +327,7 @@ def norPathIdDist(self, nor_path_id_from:str, nor_path_id_to:str, paths:path_sto def getPathPositions(self, path_mashmap): - hom_storage = match_graph.HomologyStorage(self.logger, path_mashmap, ScaffoldGraph.MIN_HOMOLOGY_REF) + hom_storage = match_graph.HomologyStorage(path_mashmap, ScaffoldGraph.MIN_HOMOLOGY_REF) for path_id in hom_storage.homologies: all_refs = [] if self.rukki_paths.getLength(path_id) < ScaffoldGraph.MIN_PATH_TO_SCAFFOLD: @@ -337,9 +337,9 @@ def getPathPositions(self, path_mashmap): all_refs.sort(reverse=True) if len(all_refs) == 1 or all_refs[0][0] > ScaffoldGraph.RATIO_HOMOLOGY_REF * all_refs[1][0]: best_ref = all_refs[0][1] - self.logger.debug(f"Clearly best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") + logging.debug(f"Clearly best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") if all_refs[0][0] > ScaffoldGraph.LEN_FRAC_REF * hom_storage.getLength(path_id): - self.logger.debug(f"Best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") + logging.debug(f"Best homology for {path_id} is {best_ref} with size{all_refs[0][0]}") if not (best_ref+'+') in self.reference_positions: self.reference_positions[best_ref+ "+"] = [] self.reference_positions[best_ref+ "-"] = [] @@ -353,12 +353,12 @@ def getPathPositions(self, path_mashmap): self.reference_positions[best_ref + "-"].append(ReferencePosition(path_id + gf.rc_orientation(hom_info.orientation), best_ref + '-', hom_storage.getLength(best_ref) - hom_info.approximate_positions[1][1], hom_storage.getLength(best_ref) - hom_info.approximate_positions[1][0], hom_storage.getLength(path_id), gf.rc_orientation(hom_info.orientation))) else: - self.logger.debug(f"Best homology for 
{path_id} is {best_ref} not covered enough frac {all_refs[0][0] / hom_storage.getLength(path_id)}") + logging.debug(f"Best homology for {path_id} is {best_ref} not covered enough frac {all_refs[0][0] / hom_storage.getLength(path_id)}") for ref in self.reference_positions: - self.logger.debug(f"Reference positions for {ref}") + logging.debug(f"Reference positions for {ref}") self.reference_positions[ref] = sorted(self.reference_positions[ref], key=lambda x: x.average_pos) for pos in self.reference_positions[ref]: - self.logger.debug(f"{pos.name_q} {pos.name_r} {pos.average_pos} {pos.ref_start} {pos.ref_end} {pos.query_len} {pos.orientation}") + logging.debug(f"{pos.name_q} {pos.name_r} {pos.average_pos} {pos.ref_start} {pos.ref_end} {pos.query_len} {pos.orientation}") def isNextByRef(self, from_path_id, to_path_id): @@ -395,19 +395,19 @@ def forbiddenPair(self, from_path_id, to_path_id): #Heterogametic chromosomes get more links since there is no homologous one to absorb multiple alignments, so no connection of diploid and long enough ahploids if nor_from_path_id in self.haploids and self.rukki_paths.getLength(nor_from_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_to_path_id in self.haploids): if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END: - self.logger.warning(f"Banning distant link from long haploid {from_path_id} to diploid {to_path_id}") + logging.warning(f"Banning distant link from long haploid {from_path_id} to diploid {to_path_id}") return True else: - self.logger.warning(f"Allowing link from long haploid {from_path_id} to diploid {to_path_id}, close in graph") + logging.warning(f"Allowing link from long haploid {from_path_id} to diploid {to_path_id}, close in graph") if nor_to_path_id in self.haploids and self.rukki_paths.getLength(nor_to_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_from_path_id in self.haploids): if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END: - self.logger.warning(f"Banning distant link from diploid {from_path_id} to long haploid {to_path_id}") + logging.warning(f"Banning distant link from diploid {from_path_id} to long haploid {to_path_id}") return True else: - self.logger.warning(f"Allowing link from diploid {from_path_id} to long haploid {to_path_id}, close in graph") + logging.warning(f"Allowing link from diploid {from_path_id} to long haploid {to_path_id}, close in graph") #relatively short fragments with telomere are special case, we may fail to detect orientation there but belive in telomere. 
if self.rukki_paths.getLength(nor_to_path_id) <= ScaffoldGraph.SHORT_TEL_CUTOFF and self.scaffold_graph.nodes[to_path_id]['telomere'][0]: - self.logger.error(f"Banning link from {from_path_id} into short telomere, should not happen {to_path_id}") + logging.error(f"Banning link from {from_path_id} into short telomere, should not happen {to_path_id}") return True return False @@ -436,7 +436,7 @@ def getScores(self, or_path_id, path_storage, type): connection_nodemap = self.unique_connections_nodemap else: - self.logger.error(f"Unknown type {type}") + logging.error(f"Unknown type {type}") return res nor_path_id = gf.nor_path_id(or_path_id) precounted = True @@ -473,11 +473,11 @@ def getScores(self, or_path_id, path_storage, type): #type: weight/unique_weight def findExtension(self, cur_path_id, type): local_scores = [] - self.logger.info(f"Checking {cur_path_id}") + logging.info(f"Checking {cur_path_id}") if not (cur_path_id in self.scaffold_graph.nodes): return "NONE",0 if self.scaffold_graph.nodes[cur_path_id]['telomere'][1]: - self.logger.info (f"Stopped at the telomere") + logging.info (f"Stopped at the telomere") return "DONE",0 local_scores = self.getScores(cur_path_id, self.rukki_paths, type) @@ -492,20 +492,20 @@ def findExtension(self, cur_path_id, type): second_best_ind = j break if len(local_scores) == 0: - self.logger.info (f"Nothing next") + logging.info (f"Nothing next") return "NONE", 0 elif local_scores[best_ind][1] <= ScaffoldGraph.MIN_LINKS: - self.logger.info (f"very few links, best valid candidate {local_scores[best_ind]}") + logging.info (f"very few links, best valid candidate {local_scores[best_ind]}") return "NONE", 0 #not valid but waay best solution exists elif self.scaffold_graph.nodes[local_scores[best_ind][0]]['telomere'][0]: - self.logger.info (f"Best {local_scores[best_ind]}, is good count but actually are in telomere!") + logging.info (f"Best {local_scores[best_ind]}, is good count but actually are in telomere!") return "NONE", 0 elif len(local_scores) == 1 or (len(local_scores) > 1 and local_scores[second_best_ind][1] == 0): - self.logger.info (f"Only one next, {local_scores[best_ind]}") + logging.info (f"Only one next, {local_scores[best_ind]}") return local_scores[best_ind][0], ScaffoldGraph.CLEAR_MAJORITY*2 else: - self.logger.info (f"Really best {local_scores[best_ind]}, second best {local_scores[second_best_ind]}") + logging.info (f"Really best {local_scores[best_ind]}, second best {local_scores[second_best_ind]}") return local_scores[best_ind][0],local_scores[best_ind][1]/local_scores[second_best_ind][1] #made function to allow to use uniques @@ -513,17 +513,17 @@ def findExtension(self, cur_path_id, type): def findNextPath(self, current_path_id, nor_used_path_ids, type): next_path_id, ratio_fwd = self.findExtension(current_path_id, type) if next_path_id.strip('-+') in nor_used_path_ids: - self.logger.info (f"Extention {next_path_id} looks good but already used") + logging.info (f"Extention {next_path_id} looks good but already used") return "NONE" prev_path_id, ratio_bwd = self.findExtension(gf.rc_path_id(next_path_id), type) if gf.rc_path_id(prev_path_id) != current_path_id: - self.logger.info (f"backward check failed for {next_path_id}") + logging.info (f"backward check failed for {next_path_id}") return "NONE" if ratio_fwd > ScaffoldGraph.CLEAR_MAJORITY and ratio_bwd > ScaffoldGraph.CLEAR_MAJORITY: - self.logger.info (f"Extention {next_path_id} looks good") + logging.info (f"Extention {next_path_id} looks good") return next_path_id elif ratio_fwd * 
ratio_bwd > ScaffoldGraph.CLEAR_MAJORITY * ScaffoldGraph.CLEAR_MAJORITY * ScaffoldGraph.CLEAR_MAJORITY and type == "unique_weight": - self.logger.info (f" Risky extension, not clear majority for one direction but very clear for other {current_path_id} {next_path_id} {ratio_fwd} {ratio_bwd}") + logging.info (f" Risky extension, not clear majority for one direction but very clear for other {current_path_id} {next_path_id} {ratio_fwd} {ratio_bwd}") return next_path_id return "NONE" @@ -548,8 +548,8 @@ def generateScaffolds(self): tel_starting_paths.sort(key=lambda x: self.rukki_paths.getLength(x.strip('-+')), reverse=True) middle_paths.sort(key=lambda x: self.rukki_paths.getLength(x.strip('-+')), reverse=True) starting_paths = tel_starting_paths + middle_paths - self.logger.info ("Starting paths") - self.logger.info (starting_paths) + logging.info ("Starting paths") + logging.info (starting_paths) #from ID to amount of scaffolds scaff_sizes = {} for or_from_path_id in starting_paths: @@ -563,30 +563,30 @@ def generateScaffolds(self): while True: next_path_id = self.findNextPath(cur_path_id, nor_used_path_ids, "weight") if next_path_id == "NONE": - self.logger.info (f"Failed to find regular extension for {cur_path_id}, trying unique") + logging.info (f"Failed to find regular extension for {cur_path_id}, trying unique") next_path_id = self.findNextPath(cur_path_id, nor_used_path_ids, "unique_weight") if next_path_id == "DONE": - self.logger.info ("All done, stopped at telomere") + logging.info ("All done, stopped at telomere") if tried_reverse: break else: - self.logger.info (f"Reversing {cur_scaffold}") + logging.info (f"Reversing {cur_scaffold}") cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info (f"Reversed {cur_scaffold}") + logging.info (f"Reversed {cur_scaffold}") cur_path_id = cur_scaffold[-1] tried_reverse = True continue elif next_path_id == "NONE": - self.logger.info ("Failed to find extension, stopping") + logging.info ("Failed to find extension, stopping") if tried_reverse: break else: - self.logger.info (f"Reversing {cur_scaffold}") + logging.info (f"Reversing {cur_scaffold}") cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info (f"Reversed {cur_scaffold}") + logging.info (f"Reversed {cur_scaffold}") cur_path_id = cur_scaffold[-1] tried_reverse = True continue @@ -600,10 +600,10 @@ def generateScaffolds(self): if nor_prev_path_id in homologous_paths: # self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]): #TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation - self.logger.warning(f"Trying to extend, had homologous path in same scaffold before! {nor_new_path_id} {nor_prev_path_id}") + logging.warning(f"Trying to extend, had homologous path in same scaffold before! 
{nor_new_path_id} {nor_prev_path_id}") hom_before = True if hom_before: - self.logger.info (f"Homologous paths before in scaffold, not extending {cur_path_id} with {next_path_id}") + logging.info (f"Homologous paths before in scaffold, not extending {cur_path_id} with {next_path_id}") if tried_reverse: break else: @@ -612,7 +612,7 @@ def generateScaffolds(self): cur_path_id = cur_scaffold[-1] tried_reverse = True continue - self.logger.info (f"Extending {cur_path_id} with {next_path_id}") + logging.info (f"Extending {cur_path_id} with {next_path_id}") #possibly not do so with paths of length one? They can be more successful in other direction cur_scaffold.append(next_path_id) @@ -621,7 +621,7 @@ def generateScaffolds(self): #Reversing for comparison with previous runs only cur_scaffold = [gf.rc_path_id(x) for x in cur_scaffold] cur_scaffold.reverse() - self.logger.info(f"scaffold {cur_scaffold}\n") + logging.info(f"scaffold {cur_scaffold}\n") if len(cur_scaffold) > 1: res.append(cur_scaffold) else: @@ -642,28 +642,28 @@ def generateScaffolds(self): total_new_t2t += 1 scaffolded_rephased.extend(scaff_result[3]) final_paths.addPath(scaff_result[0]) - self.logger.info (f"Rephased scaffolds to other haplo, need to check homologous {scaffolded_rephased}") + logging.info (f"Rephased scaffolds to other haplo, need to check homologous {scaffolded_rephased}") #this should be better done after completeConnectedComponent but it is very weird case if it matters rephased_fix = self.getAdditionalRephase(scaffolded_rephased, nor_used_path_ids, self.rukki_paths) - self.logger.info(f"Rephased fix {rephased_fix}") - self.logger.info(f"used_ids {nor_used_path_ids}") + logging.info(f"Rephased fix {rephased_fix}") + logging.info(f"used_ids {nor_used_path_ids}") for nor_path_id in self.rukki_paths.getPathIds(): - self.logger.debug(f"Checking {nor_path_id}") + logging.debug(f"Checking {nor_path_id}") if not (nor_path_id in nor_used_path_ids): if nor_path_id in rephased_fix: - self.logger.debug(f"in reph fix {nor_path_id}") + logging.debug(f"in reph fix {nor_path_id}") final_paths.addPath(rephased_fix[nor_path_id]) else: final_paths.addPath(self.rukki_paths.getPathTsv(nor_path_id)) - self.logger.info("Starting last telomere tuning") + logging.info("Starting last telomere tuning") component_tuned_paths = self.completeConnectedComponent(final_paths) telomere_cheating = len(final_paths.getPathIds()) - len(component_tuned_paths.getPathIds()) self.fixHaploidNames(component_tuned_paths) - self.logger.warning (f"Total normal scaffolds {total_scf} last telomere tuned {telomere_cheating} total jumps {total_jumps + telomere_cheating} new T2T {total_new_t2t + telomere_cheating}") + logging.warning (f"Total normal scaffolds {total_scf} last telomere tuned {telomere_cheating} total jumps {total_jumps + telomere_cheating} new T2T {total_new_t2t + telomere_cheating}") self.outputScaffolds(component_tuned_paths) return res @@ -685,17 +685,17 @@ def getAdditionalRephase(self, scaffolded_rephased, used_ids, paths): splitted_id = alt_path_id.split("_") old_prefix = splitted_id[0] if alt_path_id in used_ids: - self.logger.info(f"Not rephasing {alt_path_id} because it is scaffolded anyway") + logging.info(f"Not rephasing {alt_path_id} because it is scaffolded anyway") continue if old_label in label_switch and old_prefix in prefix_switch: new_label = label_switch[old_label] new_prefix = prefix_switch[old_prefix] new_id = new_prefix + "_" + "_".join(splitted_id[1:]) - self.logger.warning(f"Rephasing {alt_path_id} to {new_id} because of 
{nor_old_id} scaffolded") + logging.warning(f"Rephasing {alt_path_id} to {new_id} because of {nor_old_id} scaffolded") new_tsv = "\t".join([new_id, paths.getPathString(alt_path_id), new_label]) res[alt_path_id] = new_tsv else: - self.logger.warning(f"Rephasing failed for {alt_path_id} initiated by {nor_old_id}") + logging.warning(f"Rephasing failed for {alt_path_id} initiated by {nor_old_id}") return res #if only two paths with one telomere missing remains in connected component and all other long nodes are in T2T and some conditions on distance and length then we connect def completeConnectedComponent(self, prefinal_paths): @@ -730,7 +730,7 @@ def completeConnectedComponent(self, prefinal_paths): node_comp[node] = comp_id - self.logger.info(f"Checking {len(nor_components)} weakly connected components ") + logging.info(f"Checking {len(nor_components)} weakly connected components ") for path_id in prefinal_paths.getPathIds(): cur_color = 0 for node in prefinal_paths.getPathById(path_id): @@ -785,12 +785,12 @@ def completeConnectedComponent(self, prefinal_paths): t2t_paths += 1 #component not finished, stopping elif (not start_telo) and not (end_telo): - self.logger.debug(f"Found path {path_id} of length {prefinal_paths.getLength(path_id)} without telomeres, do nothing with component ") + logging.debug(f"Found path {path_id} of length {prefinal_paths.getLength(path_id)} without telomeres, do nothing with component ") missing_telos = 1000 break #only two non-t2t paths of reasonable length in connected component if (total_long_paths > 0): - self.logger.debug(f"Component {comp_id} has {missing_telos} non-t2t telos, {t2t_paths} t2t paths, {total_long_paths} long paths") + logging.debug(f"Component {comp_id} has {missing_telos} non-t2t telos, {t2t_paths} t2t paths, {total_long_paths} long paths") if missing_telos == 2: or_ids = [] @@ -802,7 +802,7 @@ def completeConnectedComponent(self, prefinal_paths): dist = self.orPathIdDist(or_ids[0], or_ids[1], prefinal_paths, True) #Paths are not far away - self.logger.info (f"Potential t2t connectivity cheat {or_ids} dist {dist}") + logging.info (f"Potential t2t connectivity cheat {or_ids} dist {dist}") if dist < ScaffoldGraph.CLOSE_IN_GRAPH: sum_len = prefinal_paths.getLength(to_scaffold[0]) + prefinal_paths.getLength(to_scaffold[1]) @@ -816,7 +816,7 @@ def completeConnectedComponent(self, prefinal_paths): if cur_len > ScaffoldGraph.MIN_EXPECTED_T2T: haploid_paths += 1 else: - self.logger.debug(f"Path {alt_path_id} is haploid but too short") + logging.debug(f"Path {alt_path_id} is haploid but too short") short_haploid_paths += 1 if telos[alt_path_id][0] and telos[alt_path_id][1] and cur_len > ScaffoldGraph.MIN_EXPECTED_T2T: if cur_len > sum_len * ScaffoldGraph.SIMILAR_LEN_FRAC and cur_len * ScaffoldGraph.SIMILAR_LEN_FRAC < sum_len: @@ -826,7 +826,7 @@ def completeConnectedComponent(self, prefinal_paths): #Either haploid or there is homologous T2T of similar length, not too short if ((distant_len_t2t == 0 and haploid_paths ==2) or similar_len_t2t > 0) and sum_len > ScaffoldGraph.MIN_EXPECTED_T2T: node_sec = [] - self.logger.info(f"t2t connectivity cheat worked {or_ids}!") + logging.info(f"t2t connectivity cheat worked {or_ids}!") new_tsv_tuple = self.scaffoldFromOrPathIds(or_ids, prefinal_paths) res.addPath(new_tsv_tuple[0]) if new_tsv_tuple[1]: @@ -835,14 +835,14 @@ def completeConnectedComponent(self, prefinal_paths): for i in range(0, 2): scaffolded_paths.add(to_scaffold[i].strip('-+')) else: - self.logger.info(f"t2t connectivity cheat {or_ids} failed, 
total_len {sum_len} distant len count {distant_len_t2t} haploid paths {haploid_paths} similar len {similar_len_t2t} short haploid paths: {short_haploid_paths}") + logging.info(f"t2t connectivity cheat {or_ids} failed, total_len {sum_len} distant len count {distant_len_t2t} haploid paths {haploid_paths} similar len {similar_len_t2t} short haploid paths: {short_haploid_paths}") else: - self.logger.info(f"t2t connectivity cheat {or_ids} failed, too far {dist}") + logging.info(f"t2t connectivity cheat {or_ids} failed, too far {dist}") for path_id in prefinal_paths.getPathIds(): if not (path_id in scaffolded_paths): res.addPath(prefinal_paths.getPathTsv(path_id)) - self.logger.warning (f"Total last telomere tuned scaffolds (all t2t) {added_t2t}, jumps {total_jumps}") + logging.warning (f"Total last telomere tuned scaffolds (all t2t) {added_t2t}, jumps {total_jumps}") return res #Returns (new_tsv_line, is_t2t, num_jumps, nor_rephased_ids) @@ -887,7 +887,7 @@ def scaffoldFromOrPathIds(self, or_ids, prefinal_paths): for i in range (0, len(or_ids) - 1): swap_info = (or_ids[i].strip('-+'), or_ids[i+1].strip('-+'), or_ids[i][-1] + or_ids[i+1][-1]) if (swap_info) in self.dangerous_swaps: - self.logger.warning (f"consecutive pair, {swap_info} did signficant changes on hi-c counts based on {self.dangerous_swaps[swap_info]}") + logging.warning (f"consecutive pair, {swap_info} did significant changes on hi-c counts based on {self.dangerous_swaps[swap_info]}") path_str = "\t".join([largest_id, ",".join(scf_path), largest_label]) telo_end = False telo_start = False @@ -897,7 +897,7 @@ def scaffoldFromOrPathIds(self, or_ids, prefinal_paths): if self.upd_G.has_edge(tel_node, scf_path[0]): telo_start = True if len(or_ids) > 1: - self.logger.info (f"Added SCAFFOLD {or_ids} {telo_start} {telo_end} ") + logging.info (f"Added SCAFFOLD {or_ids} {telo_start} {telo_end} ") return (path_str, (telo_start and telo_end), len(or_ids) - 1, rephased) #large haploid paths should be assigned based on connectivity.
Complete human chrX will always be in hap1 @@ -916,12 +916,12 @@ def fixHaploidNames(self, paths: path_storage.PathStorage): if not (close_diploid_path): large_haploid_ids.append(path_id) else: - self.logger.warning(f"large haploid and diploid paths connected, {path_id} and {alt_path}, lens {paths.getLength(path_id)} {paths.getLength(alt_path)}, no relabeing") + logging.warning(f"large haploid and diploid paths connected, {path_id} and {alt_path}, lens {paths.getLength(path_id)} {paths.getLength(alt_path)}, no relabeling") large_haploid_ids.sort(key=lambda x: paths.getLength(x), reverse=True) for path_id in paths.getPathIds(): if path_id.split("_")[0] == "mat": - self.logger.info("Trio labeling detected, haploid haplotype reassignment cancelled") + logging.info("Trio labeling detected, haploid haplotype reassignment cancelled") return names_prefix = {} @@ -956,18 +956,18 @@ def fixHaploidNames(self, paths: path_storage.PathStorage): if new_label != cur_label: utig_id = path_id.split("_")[-1] new_id = names_prefix[new_label] + utig_id - self.logger.info(f"Reasigning haploid label for {path_id} {cur_label} to {new_id} {new_label}, dist {clothest_dist} len {paths.getLength(path_id)}") + logging.info(f"Reassigning haploid label for {path_id} {cur_label} to {new_id} {new_label}, dist {clothest_dist} len {paths.getLength(path_id)}") reassigned += 1 else: - self.logger.info(f"Reassignment of haploid label for {path_id} not required, dist {clothest_dist} len {paths.getLength(path_id)}") + logging.info(f"Reassignment of haploid label for {path_id} not required, dist {clothest_dist} len {paths.getLength(path_id)}") large_haploid_info.append([new_id, ",".join(paths.getPathById(path_id)), new_label]) for path_id in large_haploid_ids: paths.removePath(path_id) for info in large_haploid_info: paths.addPath("\t".join(info)) - self.logger.info(f"Reassigned {reassigned} large haploids, groups of distant haploid paths detected: {not_connected_hap_assign}") + logging.info(f"Reassigned {reassigned} large haploids, groups of distant haploid paths detected: {not_connected_hap_assign}") if not_connected_hap_assign != 0 and not_connected_hap_assign != 2: - self.logger.warning(f"Strange number of not connected haploid groups {not_connected_hap_assign}") + logging.warning(f"Strange number of not connected haploid groups {not_connected_hap_assign}") def outputScaffolds(self, final_paths): @@ -990,7 +990,7 @@ def get_connections_porec(self, alignment_file, use_multimappers:bool): for line in open (alignment_file): ind += 1 if (ind % 10000000 == 0): - self.logger.debug (f"Processed {ind} pore-c alignments") + logging.debug (f"Processed {ind} pore-c alignments") arr = line.split() first = arr[1] @@ -1009,7 +1009,7 @@ def get_connections_porec(self, alignment_file, use_multimappers:bool): unique_storage[names_pair] = [] res[names_pair].append((coords_pair[0], coords_pair[1], weight)) unique_storage[names_pair].append((coords_pair[0], coords_pair[1], weight)) - self.logger.debug (f"Finally processed {ind} pore-c alignments") + logging.debug (f"Finally processed {ind} pore-c alignments") return res, unique_storage def get_connections_bam(self, bam_filename, use_multimappers:bool): @@ -1034,7 +1034,7 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool): if (total_reads % 10000000 == 0): if (total_reads % 100000000 == 0): before_compressed = total_compressed - self.logger.debug("Starting map compression...") + logging.debug("Starting map compression...") for names_pair in res: if len(res[names_pair]) > 1000:
names_sorted = sorted(res[names_pair]) @@ -1051,14 +1051,14 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool): compressed.append((names_sorted[ii][0], names_sorted[ii][1], cur_w)) ii = jj res[names_pair] = compressed - self.logger.debug(f"On current iteration compressed {total_compressed - before_compressed}") - self.logger.debug (f"Processed {total_reads} alignment strings") - self.logger.debug (f"Of them unique vaild unique pairs {unique_pairs}, total pairs {all_pairs} total valid {valid_pairs} ") - self.logger.debug (f"Current memory usage {(psutil.virtual_memory().used / 1024 / 1024 / 1024):.2f} GB") - self.logger.debug (f"Created new pairs {created_pair} total inserted pairs = {total_inserted} total compressed = {total_compressed}") + logging.debug(f"On current iteration compressed {total_compressed - before_compressed}") + logging.debug (f"Processed {total_reads} alignment strings") + logging.debug (f"Of them unique vaild unique pairs {unique_pairs}, total pairs {all_pairs} total valid {valid_pairs} ") + logging.debug (f"Current memory usage {(psutil.virtual_memory().used / 1024 / 1024 / 1024):.2f} GB") + logging.debug (f"Created new pairs {created_pair} total inserted pairs = {total_inserted} total compressed = {total_compressed}") #gc.collect() - #self.logger.debug (f"Current memory usage after GC {psutil.virtual_memory().used / 1024 / 1024 / 1024}GB") - #self.logger.debug (f"Mem usage of main map {asizeof.asizeof(res) / 1024 / 1024 / 1024}GB") + #logging.debug (f"Current memory usage after GC {psutil.virtual_memory().used / 1024 / 1024 / 1024}GB") + #logging.debug (f"Mem usage of main map {asizeof.asizeof(res) / 1024 / 1024 / 1024}GB") #if total_reads == 20000000: # exit() @@ -1110,7 +1110,7 @@ def get_connections_bam(self, bam_filename, use_multimappers:bool): filtered_coords = coords lname0 = len(filtered_names[0]) lname1 = len(filtered_names[1]) - #self.logger.debug (f" {valid} {lname0} {lname1}") + #logging.debug (f" {valid} {lname0} {lname1}") if valid and lname0 > 0 and lname1 > 0: valid_pairs += 1 for i in range (0, lname0): @@ -1176,7 +1176,7 @@ def fixOrientation(self, path_ids, scores): if correct_or != "": pair_or = ["",""] pair_or[i] = correct_or - self.logger.debug (f"tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") + logging.debug (f"tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") for second_or in ('-', '+'): correct_pair = pair_or.copy() correct_pair[1 - i] = second_or @@ -1187,16 +1187,16 @@ def fixOrientation(self, path_ids, scores): #just logging if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS* self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]: self.dangerous_swaps[(path_ids[0], path_ids[1], correct_pair)] = "telomeres" - self.logger.debug(f"Dangerous telomeric tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']} from {incorrect_pair} to {correct_pair}") - self.logger.debug (f"moving {incorrect_pair} to {correct_pair}") + logging.debug(f"Dangerous telomeric tuning pair {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']} from {incorrect_pair} to {correct_pair}") + logging.debug (f"moving {incorrect_pair} to {correct_pair}") if self.rukki_paths.getLength(path_ids[i]) <= self.NEAR_PATH_END: scores[correct_pair] += scores[incorrect_pair] #Logic different with connectivity! 
we never want to connect through telomere else: if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]: - self.logger.debug(f"Dangerous telomeric tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") + logging.debug(f"Dangerous telomeric tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") scores[incorrect_pair] = 0 - self.logger.debug (f"telomeric tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") + logging.debug (f"telomeric tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") for i in range (0, 2): #TODO: WIP @@ -1209,7 +1209,7 @@ def fixOrientation(self, path_ids, scores): to_check_ids[1 - i] = path_ids[1 - i] + fixed_orientation to_check_ids[i] = path_ids[i] + orient shortest_paths[orient] = self.orPathIdDist(to_check_ids[0], to_check_ids[1], self.rukki_paths, True) - self.logger.debug(f"Checking dists {to_check_ids} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}") + logging.debug(f"Checking dists {to_check_ids} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}") if shortest_paths['-'] < min_cutoff and shortest_paths['+'] > max_cutoff: correct_or = "-" @@ -1229,15 +1229,15 @@ def fixOrientation(self, path_ids, scores): #just logging if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]: self.dangerous_swaps[(path_ids[0], path_ids[1], correct_pair)] = "connectivity" - self.logger.debug(f"Dangerous connectivity tuning pair {path_ids}, i {i} scores {scores}from {incorrect_pair} to {correct_pair}") + logging.debug(f"Dangerous connectivity tuning pair {path_ids}, i {i} scores {scores}from {incorrect_pair} to {correct_pair}") if self.rukki_paths.getLength(path_ids[i]) <= self.NEAR_PATH_END: scores[correct_pair] += scores[incorrect_pair] scores[incorrect_pair] = 0 else: if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]: - self.logger.debug(f"Dangerous connectivity tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") + logging.debug(f"Dangerous connectivity tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") #Do not want to trust graph connectivity more than hi-c links, so not setting scores of incorrect links to zero. Not the same as in telomere cheat! 
- #self.logger.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") + #logging.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}") #Code duplication:( for first_or in ('-', '+'): for second_or in ('-', '+'): @@ -1246,7 +1246,7 @@ def fixOrientation(self, path_ids, scores): dist_paths = self.orPathIdDist(to_check[0], to_check[1], self.rukki_paths, True) if dist_paths < self.CLOSE_IN_GRAPH and dist_paths < self.rukki_paths.getLength(path_ids[0]) / 4 and dist_paths < self.rukki_paths.getLength(path_ids[1]) / 4: scores[or_str] *= self.CONNECTIVITY_MULTIPLICATIVE_BONUS - self.logger.debug(f"Connectivity bonus applied pair {to_check_ids}, scores {scores}") + logging.debug(f"Connectivity bonus applied pair {to_check_ids}, scores {scores}") return scores @@ -1322,9 +1322,9 @@ def getNodePairConnections(self, pair, orientations, connections, shift_before, scores[str] += conn[2] else: scores["middle"] += conn[2] - self.logger.debug (f"Scores for {pair} {scores} {orientations} filtered/not_filtered {filtered} {not_filtered}") - #self.logger.debug (f"Shifts {shift_before} {shift_after}") - #self.logger.debug (f"Ignored intervals {intervals}") + logging.debug (f"Scores for {pair} {scores} {orientations} filtered/not_filtered {filtered} {not_filtered}") + #logging.debug (f"Shifts {shift_before} {shift_after}") + #logging.debug (f"Ignored intervals {intervals}") return scores #TODO: remove, not used anymore @@ -1397,12 +1397,12 @@ def getPathPairConnections(self, path_ids, connections, lens): for key in scores_pair: scores[key] += scores_pair[key] if self.isPathPairForDebug(path_ids, self.rukki_paths): - self.logger.debug (f"Pathscores for {path_ids} {scores}") + logging.debug (f"Pathscores for {path_ids} {scores}") scores = self.fixOrientation(path_ids, scores) for orientation in scores: if len(orientation) == 2: if self.isNextByRef(path_ids[0] + orientation[0], path_ids[1] + orientation[1]): - self.logger.debug (f"Reference connection found! {path_ids} {orientation}") + logging.debug (f"Reference connection found! 
{path_ids} {orientation}") scores[orientation] *= self.REFERENCE_MULTIPLICATIVE_BONUS scores[orientation] /= self.INT_NORMALIZATION @@ -1418,7 +1418,7 @@ def getPathPairConnections(self, path_ids, connections, lens): #Never want to apply this bonus twice if very_long <= 1 and very_short == 0: coeff = self.NEAR_PATH_END / min(self.rukki_paths.getLength(path_ids[0]), self.rukki_paths.getLength(path_ids[1])) - self.logger.debug(f"Normalization is applied to {path_ids} , lens {self.rukki_paths.getLength(path_ids[0])} {self.rukki_paths.getLength(path_ids[1])} coefficent {coeff}") + logging.debug(f"Normalization is applied to {path_ids} , lens {self.rukki_paths.getLength(path_ids[0])} {self.rukki_paths.getLength(path_ids[1])} coefficent {coeff}") scores[orientation] *= coeff return scores From 70d6ce555c254a3dd8b6c37be073a80d37b1ee86 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Sat, 27 Sep 2025 18:48:06 -0400 Subject: [PATCH 14/19] missed makefile change --- src/main.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.mk b/src/main.mk index 379d9ea0..18559415 100644 --- a/src/main.mk +++ b/src/main.mk @@ -290,7 +290,7 @@ FILES += \ scripts/hic_prefilter.py -> ../lib/verkko/scripts/hic_prefilter.py \ scripts/launch_scaffolding.py -> ../lib/verkko/scripts/launch_scaffolding.py \ scripts/remove_nodes_add_telomere.py -> ../lib/verkko/scripts/remove_nodes_add_telomere.py \ - scripts/scaffolding/logger_wrap.py -> ../lib/verkko/scripts/scaffolding/logger_wrap.py \ + scripts/logging_utils.py -> ../lib/verkko/scripts/logging_utils.py \ scripts/scaffolding/match_graph.py -> ../lib/verkko/scripts/scaffolding/match_graph.py \ scripts/scaffolding/path_storage.py -> ../lib/verkko/scripts/scaffolding/path_storage.py \ scripts/scaffolding/scaff_prefilter.py -> ../lib/verkko/scripts/scaffolding/scaff_prefilter.py \ From 613079dea50d8e307dbfe5bb869743b02f682dbb Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 29 Sep 2025 17:21:50 -0400 Subject: [PATCH 15/19] excessive requirement removed --- src/Snakefiles/8-hicPipeline.sm | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm index 2ee2647a..d6c7cda5 100644 --- a/src/Snakefiles/8-hicPipeline.sm +++ b/src/Snakefiles/8-hicPipeline.sm @@ -419,7 +419,6 @@ chmod +x ./scaffold_mergeBWA.sh rule mergeBWA: input: split_finished = rules.splitHIC.output.split_finished, - index = '8-hicPipeline/unitigs.fasta.bwt', alignments = lambda wildcards: expand("8-hicPipeline/mapped{nnnn}.bam", nnnn = splitHICoutputs(wildcards)) output: From 74bbdcebd5fd3e1d808dc0705326808865514650 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Tue, 18 Nov 2025 16:44:59 -0500 Subject: [PATCH 16/19] fix name typo --- src/Snakefiles/8-hicPipeline.sm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm index d6c7cda5..b0537861 100644 --- a/src/Snakefiles/8-hicPipeline.sm +++ b/src/Snakefiles/8-hicPipeline.sm @@ -488,7 +488,7 @@ chmod +x ./transform_bwa.sh rule hicPhasing: input: mashmap_matches = rules.runMashMap.output.unitigs_hpc50, - mashmap_nohpcmatches = rules.runMashMap.output.unitigs_nonhpc50, + mashmap_nohpcmatches = rules.runMashMap.output.unitigs_nohpc50, hicmapping_byread = '8-hicPipeline/hic_mapping.byread.output', graph='8-hicPipeline/unitigs.hpc.noseq.gfa' output: From 62594e19464c48f17d1b1acad7c5fb1432f9d950 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Tue, 18 Nov 2025 16:45:43 -0500 Subject: [PATCH 
17/19] increase open files while sorting bam outputs --- src/Snakefiles/7-combineConsensus.sm | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Snakefiles/7-combineConsensus.sm b/src/Snakefiles/7-combineConsensus.sm index 1e745cab..c4c8aa92 100644 --- a/src/Snakefiles/7-combineConsensus.sm +++ b/src/Snakefiles/7-combineConsensus.sm @@ -121,6 +121,17 @@ if [ "{params.haveBAM}" = "True" ]; then echo "--------------------" echo "Combining consensus bam results." echo "" + # increase ulimit for files in case we have lots of bam files to merge + max=\`ulimit -Hn\` + bef=\`ulimit -Sn\` + if [ \$bef -lt \$max ] ; then + ulimit -Sn \$max + aft=\`ulimit -Sn\` + echo " Changed max open files from \$bef to \$aft (max \$max)." + else + echo " Max open files limited to \$bef, no increase possible." + fi + mem_per_core=\`(expr \( {resources.mem_gb} \* 70 \) / \( 100 \* {threads} \) | awk '{{if (\$1 < 1) print "1G"; else print \$1"G"}}') || true\` rm -f ../{output.bam}.tmp.* From 9f1f9595996db5b9867803a7a8dfacdbc7ed7d0c Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Tue, 18 Nov 2025 16:47:41 -0500 Subject: [PATCH 18/19] update canu tag when running initial consensus (for Hi-C phasing), skip bam output/iterations to save time --- src/Snakefiles/7-combineConsensus.sm | 5 +++-- src/Snakefiles/7-generateConsensus.sm | 14 +++++++++++--- src/canu | 2 +- src/verkko.sh | 9 ++++++++- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/Snakefiles/7-combineConsensus.sm b/src/Snakefiles/7-combineConsensus.sm index c4c8aa92..cacfd8f0 100644 --- a/src/Snakefiles/7-combineConsensus.sm +++ b/src/Snakefiles/7-combineConsensus.sm @@ -59,7 +59,8 @@ rule combineConsensus: short_len = config['short_contig_length'], screens = config['screen'], packbam = combineConsensusB, - haveBAM = config['withBAM'] + haveBAM = config['withBAM'], + quick = config['cns_quick'] threads: 8 resources: @@ -117,7 +118,7 @@ fi # Copy the screened assembly to the output. cp assembly.fasta ../{output.cns} -if [ "{params.haveBAM}" = "True" ]; then +if [ "{params.haveBAM}" = "True" ] && [ '{params.quick}' != 'True' ]; then echo "--------------------" echo "Combining consensus bam results." 
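The ulimit block added to 7-combineConsensus.sm in the patch above raises the soft open-files limit to the hard limit before merging many temporary BAM files. The same check can be expressed with Python's standard `resource` module; this is only an illustrative equivalent of the shell snippet, under the assumption that the process may adjust its own soft limit, and is not code from the patch.

    import resource

    def raise_open_file_limit():
        # Mirror the `ulimit -Hn` / `ulimit -Sn` logic: lift the soft
        # RLIMIT_NOFILE up to the hard limit when there is headroom.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft < hard:
            resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
            new_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            print(f"Changed max open files from {soft} to {new_soft} (max {hard}).")
        else:
            print(f"Max open files limited to {soft}, no increase possible.")

    raise_open_file_limit()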
echo "" diff --git a/src/Snakefiles/7-generateConsensus.sm b/src/Snakefiles/7-generateConsensus.sm index d8fd1e22..237db7c0 100644 --- a/src/Snakefiles/7-generateConsensus.sm +++ b/src/Snakefiles/7-generateConsensus.sm @@ -36,7 +36,8 @@ rule generateConsensus: bam = '7-consensus/packages/part{nnnn}.bam' if config['withBAM'] == "True" else rules.emptyfile.output, minread = config['cor_min_read'], iterations = config['cns_num_iter'], - coverage = config['cns_max_cov'] + coverage = config['cns_max_cov'], + quick = config['cns_quick'] threads: int(config['cns_n_cpus']), resources: @@ -65,18 +66,25 @@ if [ "\$(expr {params.minread} - 500)" -gt 0 ]; then minread=500 fi +# when generating initial consensus skip bam generation and only 1 round +if [ '{params.quick}' = 'True' ]; then + align="-norealign" + iter="-I 1" +else + iter="-I {params.iterations}" +fi + {VERKKO}/bin/utgcns \\\\ -threads {threads} \\\\ -import ../{input.package} \\\\ -A ../{output.consensus}.WORKING \\\\ - -I {params.iterations} -C 2 \\\\ + \$iter -C 2 \\\\ \$align \\\\ -maxcoverage {params.coverage} \\\\ -e 0.05 \\\\ -em 0.10 \\\\ -EM {params.ont_ids} \\\\ -l \$minread \\\\ - -edlib \\\\ && \\\\ mv ../{output.consensus}.WORKING ../{output.consensus} \\\\ && \\\\ diff --git a/src/canu b/src/canu index 42d86c03..80d4b50d 160000 --- a/src/canu +++ b/src/canu @@ -1 +1 @@ -Subproject commit 42d86c03d668984cbec79926651e8253a930482a +Subproject commit 80d4b50d82b37e4a517295e68901ea58fde0200c diff --git a/src/verkko.sh b/src/verkko.sh index 6b48eaf4..f30812e0 100755 --- a/src/verkko.sh +++ b/src/verkko.sh @@ -333,6 +333,7 @@ ruk_fract="0.9" #consensus cns_num_iter="2" cns_max_cov="50" +cns_quick="False" # HiC heuristics haplo_divergence=0.05 @@ -444,7 +445,7 @@ ahc_time_h=48 # fast things in hic pipeline fhc_n_cpus=8 fhc_mem_gb=16 -fhc_time_h=24 +fhc_time_h=36 # hic scaffolding pipeline shc_n_cpus=8 @@ -966,12 +967,16 @@ if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" -o "x$withbam" = "xTrue" ] elif [ ! -e "$bwa" ] ; then errors="${errors}Can't find BWA executable at '$bwa'.\n" fi + # first instance of consensus should be quick + cns_quick="True" elif [ "x$withporec" = "xTrue" ]; then if [ "x$minimap" = "x" ] ; then errors="${errors}Can't find MINIMAP2 executable in \$PATH or \$VERKKO/bin/minimap2.\n" elif [ ! 
-e "$minimap" ] ; then errors="${errors}Can't find MINIMAP2 executable at '$bwa'.\n" fi + # first instance of consensus should be quick + cns_quick="True" fi if [ "x$withhic" = "xTrue" -a "x$withporec" = "xTrue" ]; then errors="${errors}Both --hic1/--hic2 and --porec cannot be specified at the same time, please only specify one\n" @@ -1254,6 +1259,7 @@ echo >> ${outd}/verkko.yml "" echo >> ${outd}/verkko.yml "# consensus" echo >> ${outd}/verkko.yml "cns_num_iter: '${cns_num_iter}'" echo >> ${outd}/verkko.yml "cns_max_cov: '${cns_max_cov}'" +echo >> ${outd}/verkko.yml "cns_quick: '${cns_quick}'" echo >> ${outd}/verkko.yml "# HiC algo options" echo >> ${outd}/verkko.yml "no_rdna_tangle: '${no_rdna_tangle}'" echo >> ${outd}/verkko.yml "uneven_depth: '${uneven_depth}'" @@ -1534,6 +1540,7 @@ if [ "x$withhic" = "xTrue" -o "x$withporec" = "xTrue" ] ; then fi cd $newoutd + sed -i.bak -E "s/^(cns_quick:[[:space:]]*)'True'/\1'False'/" verkko.yml sed -i.bak 's/runRukkiHIC/cnspath/g' snakemake.sh sed -i.bak 's/HiC_rdnascaff/cnspath/g' snakemake.sh ./snakemake.sh From 34cd46993b6c77745e4daa6a4ec3aa7e8996b263 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Fri, 21 Nov 2025 13:12:25 -0500 Subject: [PATCH 19/19] copilot fixes --- src/Snakefiles/functions.sm | 1 + src/scripts/logging_utils.py | 10 ---------- src/verkko.sh | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/Snakefiles/functions.sm b/src/Snakefiles/functions.sm index ea5710e3..44385ebe 100644 --- a/src/Snakefiles/functions.sm +++ b/src/Snakefiles/functions.sm @@ -14,6 +14,7 @@ import math import re +import glob def getNumLines(input): infile = input.pop() diff --git a/src/scripts/logging_utils.py b/src/scripts/logging_utils.py index aa59cdc8..38ddc2e6 100755 --- a/src/scripts/logging_utils.py +++ b/src/scripts/logging_utils.py @@ -28,16 +28,6 @@ def format_runtime(seconds): seconds = int(seconds % 60) return f"{hours:02d}:{minutes:02d}:{seconds:02d}" - class RuntimeFormatter(logging.Formatter): - def __init__(self, fmt=None, datefmt=None, start_time=None): - super().__init__(fmt, datefmt) - self.start_time = start_time - - def format(self, record): - runtime_seconds = time.time() - self.start_time - record.runtime = format_runtime(runtime_seconds) - return super().format(record) - log_format = '%(runtime)s - %(levelname)s - [%(filename)s:%(funcName)s:%(lineno)d] - %(message)s' formatter = RuntimeFormatter(log_format, datefmt) diff --git a/src/verkko.sh b/src/verkko.sh index f30812e0..141d95e6 100755 --- a/src/verkko.sh +++ b/src/verkko.sh @@ -602,7 +602,7 @@ while [ $# -gt 0 ] ; do elif [ "$opt" = "--correct-k-mer-size" ] ; then mer_size=$arg; shift elif [ "$opt" = "--correct-k-mer-window" ] ; then mer_window=$arg; shift elif [ "$opt" = "--correct-k-mer-coverage" ] ; then mer_coverage=$arg; shift - elif [ "$opt" = "--correct-min-read-length" ] ; then cor_min_read=$arg; $spl_min_length=$arg; shift + elif [ "$opt" = "--correct-min-read-length" ] ; then cor_min_read=$arg; spl_min_length=$arg; shift elif [ "$opt" = "--correct-min-overlap-length" ] ; then cor_min_overlap=$arg; shift elif [ "$opt" = "--correct-index-batches" ] ; then cor_index_batches=$arg; shift