From f265a796b4aa32e7c371e8953b128ef1abe58898 Mon Sep 17 00:00:00 2001 From: ahcorcha Date: Mon, 25 Feb 2019 16:51:34 -0500 Subject: [PATCH 1/3] HTML report --- RCOpt.sh | 31 ++++- src/_Python/__init__.py | 0 src/_Python/my_utils.py | 220 ++++++++++++++++++++++++++++++++++++ src/_Python/summary_html.py | 114 +++++++++++++++++++ 4 files changed, 360 insertions(+), 5 deletions(-) create mode 100644 src/_Python/__init__.py create mode 100644 src/_Python/my_utils.py create mode 100644 src/_Python/summary_html.py diff --git a/RCOpt.sh b/RCOpt.sh index e0ed6ce..273d39f 100644 --- a/RCOpt.sh +++ b/RCOpt.sh @@ -3,9 +3,9 @@ ####################### define executables FASTAtoRF="./bin/FASTAtoRF" rndForest="./src/_R/_predict.RF.R" +summaryHTML="./src/_Python/summary_html.py" RCOpt="./bin/RCOpt" memebin="" - ####################### identify the input arguments jobid=$1 proteins=$2 @@ -16,10 +16,13 @@ if [ "$jobid" = "" ]; then exit fi + echo "Job ID: "$jobid echo "Input FASTA file for the target protein(s): "$proteins echo "Input FASTA file for the peaks: "$peaks + + if [ -e "$proteins" ]; then echo "Protein sequence file found." else @@ -84,12 +87,19 @@ sed 's/"//g' $RF_out > $out_file.RF_out.txt $RCOpt -rf $out_file.RF_out.txt -fasta $all -out $out_file -mode 3 >>$out_folder/log.step2.txt +report=$out_folder/results.report.txt +results_pfm=${out_folder}/results.PFM.txt +# echo ${report} +# [ -f ${report} ] && echo "Found" || echo "Not found" +# test=${out_file}.PFM.txt +# echo ${test} +# [ -f ${test} ] && echo "Found" || echo "Not found" +######################################################################################## +######################################################################################## +######################################################################################## - - - - +### the next lines create and fill log.info.txt #***************************************************************************************** # The following lines check the input/output, and produce appropriate messages @@ -166,6 +176,17 @@ else info="$numC2H2 sequences were found in the input FASTA for C2H2-ZF proteins.\n"$info fi +############################################################################## +echo "Creating HTML report" +## Selects opt motif , based on the pearson corr (its pval < 0.001). +## Write matrices in meme format, creates PNG, runs centrimo and graphs it's output. +## creates summary html table. +python3 ${summaryHTML} --peaks ${peaks} \ + --report ${report} \ + --pfms ${results_pfm} \ + --prefix ${out_folder} \ + --job_id ${jobid} + ####################### write the appropriate messages to the output diff --git a/src/_Python/__init__.py b/src/_Python/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/_Python/my_utils.py b/src/_Python/my_utils.py new file mode 100644 index 0000000..cab04eb --- /dev/null +++ b/src/_Python/my_utils.py @@ -0,0 +1,220 @@ +import numpy +import matplotlib.pyplot as plt +import subprocess + +meme_matrix_template = "MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.25000 C 0.25000 G 0.25000 T 0.25000\nMOTIFS%s\nletter-probability matrix: alength= 4 w= 18 nsites= 1 E= 0\n%s" + +def run_cmd(cmd): + subprocess.call(cmd, shell=True) + + +def select_opt_motif(report_path): + + with open(report_path) as report: + header = next(report) # skip and store header + # 0:EXPERIMENT,1:MOTIF,2:OPTIMIZED,3:INITIAL_AUC, + # 4:INITIAL_P,5:OPTIMIZED_AUC,6:OPTIMIZED_P,7:CORRELATION,8:CORRELATION_P + corr_p = int(8) + opt_p = int(6) + # Mock motif + selected_motif = ['', 'error', '0', '0.5', '1', \ + '-1', '1', '0', '1'] + + for line in report: + current_motif = line.replace("\n", "").split("\t") + + # Do not inclde motifs without correlation to their intial sequence. + if float(current_motif[corr_p]) > float(0.001): + continue + + # Redefine selected motif as the one with the smallest pval. + if float(current_motif[opt_p]) < float(selected_motif[opt_p]): + selected_motif = current_motif + + + if selected_motif[1] == "error": + print("There is no motif with a CORRELATION_P smaller than 0.001.\nStopping") + exit() + + return selected_motif + +def convert_cisbp_to_meme(matrix): + + name = "None" + position = False + frequencies = "" + matrix_ = matrix.split("\n") + + for line in matrix_: + + if line.find("Motif\t") >= 0: + name = line.split("\t")[1] + continue + + if position: + line_ = "\t".join(line.split("\t")[1:]) + frequencies += "%s\n" % (line_) + + if line == "Pos\tA\tC\tG\tT": + position = True + + return meme_matrix_template % (name, frequencies) + + +def write_cisbp_meme_motif(motif_id, motif_cisbp_file, motif_meme_file, PFMS_PATH): + + with open(PFMS_PATH, 'r') as matrices: + matrices = matrices.read().split("\n\n") + motif_id_line = "Motif\t" + motif_id + "\n" + + for matrix in matrices: + + if matrix.find(motif_id_line) >= 0: + + cisbp_motif = open(motif_cisbp_file, 'w') + cisbp_motif.write(matrix) + cisbp_motif.close() + + meme_motif = open(motif_meme_file, 'w') + meme_matrix = convert_cisbp_to_meme(matrix) + meme_motif.write(meme_matrix) + meme_motif.close() + + + +def smooth(y, box_pts): + box = numpy.ones(box_pts)/box_pts + y_smooth = numpy.convolve(y, box, mode='same') + return y_smooth + +def write_centrimo_plot(site_table, out_centrimo_graph, centrimo_score): + points, sites, counts, norm_counts, sum_counts = [], [], [], [], 0.0 + + with open(site_table, 'r') as centrimo_table: + next(centrimo_table) # Header + for line in centrimo_table: + line = line.replace("\n", "").split("\t") + sites.append(float(line[0])) + counts.append(float(line[1])) + sum_counts += float(line[1]) + + for count in counts: + # Normalization + norm_counts.append(count / sum_counts) + + fig, ax = plt.subplots(figsize=(10,5), facecolor=("white")) + ax.set_facecolor("lightgray") + ax.set_title("Motif Probability Graph (score ≥ %s bits)" % centrimo_score) + # ax.plot(sites, smooth(norm_counts, 3), color='red', lw=0.2) + ax.plot(sites, smooth(norm_counts, 20), color='navy', lw=1.5) + ax.set_xlabel('Position of Best Site In Sequence') + ax.set_ylabel('Probability') + plt.grid() + + ## Supported formats: png, pdf, ps, eps and svg + plt.savefig(out_centrimo_graph, dpi=100, facecolor='White', edgecolor='w', \ + orientation='portrait', format="png", pad_inches=0.01) + + +def write_html(metadata, centrimo_graph, PREFIX, job_id): + # ['', 'CTCF_HUMAN:3-8', '1', '0.814348', '1.20937e-66', + # '0.867272', '3.51351e-90', '0.86303', '1.55052e-07'] + # 0:EXPERIMENT,1:MOTIF,2:OPTIMIZED,3:INITIAL_AUC, + # 4:INITIAL_P,5:OPTIMIZED_AUC,6:OPTIMIZED_P,7:CORRELATION,8:CORRELATION_P + motif_name = metadata[1].replace("-", "_").replace(":", "_") + + ## Logo PNG. + logo_png = "logo" + motif_name + ".png" + logo_opt_png = "logo" + motif_name + "_opt.png" + ## SEED + seed_auc = metadata[3] + seed_pval = metadata[4] + ## Optimized + opt_auc = metadata[5] + opt_pval = metadata[6] + ## CORR + pcorr = metadata[7] + corr_pval = metadata[8] + + # job_id, motif_name, seed_auc, seed_pval, opt_auc, opt_pval + # pearson_cor, cor_pval, seed_png, opt_png, sites_png + template = load_html_template() + + html_file = template % (job_id, job_id, motif_name, seed_auc, \ + seed_pval, opt_auc, \ + opt_pval, pcorr, corr_pval, logo_png, \ + logo_opt_png, centrimo_graph) + + html_path = PREFIX + "/report.html" + OUT_html = open(html_path, 'w') + OUT_html.write(html_file) + OUT_html.close() + + +def load_html_template(): + + template = """ + + RCADE2 %s + + +
+

RCADE2
%s

+
+ + + + +

%s

+ + + + + + + + + + + + + + + + +
AUCp-value
Seed%s%s
Optimized%s%s
+

Pearson correlation: %s, p-value: %s

+
+

Seed

+ +

Optimized

+ +
+
+

Site counts

+
+ +
+
+

References:

+ Najafabadi, H. S., Albu, M., & Hughes, T. R. (2015). Identification of C2H2-ZF binding preferences from ChIP-seq data using RCADE. Bioinformatics, 31(17), 2879-2881. doi:10.1093/bioinformatics/btv284. [full text]
+ Bailey, T. L., & Machanick, P. (2012). Inferring direct DNA binding from ChIP-seq. Nucleic Acids Res, 40(17), e128. doi:10.1093/nar/gks433 [full text] + + """ + return template + diff --git a/src/_Python/summary_html.py b/src/_Python/summary_html.py new file mode 100644 index 0000000..02af5b1 --- /dev/null +++ b/src/_Python/summary_html.py @@ -0,0 +1,114 @@ +#!/bin/env python3 +import os +import argparse +import my_utils + +parser = argparse.ArgumentParser(description='Create a HTML summary page for RCADE2') + +parser.add_argument('--peaks', dest='PEAK_PATH', action="store", \ + type=str, \ + help='Path to peak file.') + +parser.add_argument('--report', dest='REPORT_PATH', action="store",\ + type=str, \ + help='PFMs report table of tested motifs.') + +parser.add_argument('--pfms', dest='PFMS_PATH', action="store", \ + type=str, + help='Path to PFMs, all seeds/optimized motifs in CIS-BP format.') + +parser.add_argument('--prefix', dest='PREFIX', action="store", \ + type=str, + help='Out path prefix for results.') + +parser.add_argument('--job_id', dest='JOB_ID', action="store", \ + type=str, default="JOB_ID", \ + help='Job ID for RCADE2.') + +args = parser.parse_args() + +## Header example for results..report.txt +# HEADER=["EXPERIMENT", "MOTIF", "OPTIMIZED", "INITIAL_AUC", \ +# "INITIAL_P", "OPTIMIZED_AUC", "OPTIMIZED_P", \ +# "CORRELATION", "CORRELATION_P"] + +def main(): + + if os.path.getsize(args.REPORT_PATH) == 0 or \ + os.path.getsize(args.PFMS_PATH) == 0 or \ + os.path.getsize(args.PEAK_PATH) == 0: + print("Input files are empty.") + exit() + + + log_file = open(args.PREFIX + "/centrimo/centrimo.log", "w") + ########################################################################### + ## Return the information of the selected motif, name and scores + selected_motif = my_utils.select_opt_motif(args.REPORT_PATH) + + ## Base name of the selected motif. + seed_id = str(selected_motif[1]) + ## Name of the optimized selected motif. + opt_id = seed_id + "|opt" + + ########################################################################### + ## Selected seed motif ## + ########################################################################### + # Out file name: results.seed.PFM.txt + seed_cisbp_path = args.PREFIX + "/results.seed.PFM.txt" + # Out file name: results.opt.PFM.meme.txt + seed_meme_path = args.PREFIX + "/results.seed.PFM.meme.txt" + + ## Use name to write selected motif to one file. + my_utils.write_cisbp_meme_motif(seed_id, seed_cisbp_path, seed_meme_path, \ + args.PFMS_PATH) + + ########################################################################### + ## Selected optimized motif ## + ########################################################################### + # Out file name: results.seed.PFM.txt + opt_cisbp_path = args.PREFIX + "/results.opt.PFM.txt" + ## Convert matrix to meme format + opt_meme_path = args.PREFIX + "/results.opt.PFM.meme.txt" + + ## Use name to write selected motif in two formats/files + my_utils.write_cisbp_meme_motif(opt_id, opt_cisbp_path, opt_meme_path, \ + args.PFMS_PATH) + + ########################################################################### + ## Run Centrimo ## + ########################################################################### + centrimo_score = 5 # Default. + # Must be directory as it creates another directory in it. + centrimo_out = args.PREFIX + "/centrimo" + cmd_centrimo = "centrimo --oc %s --score %s %s %s &>> log.step3.txt" % \ + (centrimo_out, centrimo_score, args.PEAK_PATH, opt_meme_path) + + print("Running centrimo", file=log_file) + my_utils.run_cmd(cmd_centrimo) + site_table = centrimo_out + "/site_counts.txt" + centrimo_graph_name = "motif_sites_graph.png" + out_centrimo_graph = args.PREFIX + "/" + centrimo_graph_name + my_utils.write_centrimo_plot(site_table, out_centrimo_graph, centrimo_score) + + ########################################################################### + ## Create PNG image of the logo for optimized and normal motifs ## + ########################################################################### + #### Selected seed + cmd_seed_image = "meme2images -png %s %s" % (seed_meme_path, args.PREFIX) + my_utils.run_cmd(cmd_seed_image) + + #### Selected optimized + cmd_opt_image = "meme2images -png %s %s" % (opt_meme_path, args.PREFIX) + my_utils.run_cmd(cmd_opt_image) + + print("Creating HTML report", file=log_file) + ############################################################ + my_utils.write_html(selected_motif, centrimo_graph_name, args.PREFIX, args.JOB_ID) + ############################################################ + rm_cmd = "rm %s/results.opt.ps" % (args.PREFIX) + my_utils.run_cmd(rm_cmd) + +if __name__ == "__main__": + + main() From 6095e14c8f7a9974704488b970aa2fa9e7cd3d7e Mon Sep 17 00:00:00 2001 From: ahcorcha Date: Mon, 25 Feb 2019 18:13:33 -0500 Subject: [PATCH 2/3] type fix MOTIFS instead of MOTIF in meme matrix --- src/_Python/my_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_Python/my_utils.py b/src/_Python/my_utils.py index cab04eb..861508b 100644 --- a/src/_Python/my_utils.py +++ b/src/_Python/my_utils.py @@ -2,7 +2,7 @@ import matplotlib.pyplot as plt import subprocess -meme_matrix_template = "MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.25000 C 0.25000 G 0.25000 T 0.25000\nMOTIFS%s\nletter-probability matrix: alength= 4 w= 18 nsites= 1 E= 0\n%s" +meme_matrix_template = "MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.25000 C 0.25000 G 0.25000 T 0.25000\nMOTIF\t%s\nletter-probability matrix: alength= 4 w= 18 nsites= 1 E= 0\n%s" def run_cmd(cmd): subprocess.call(cmd, shell=True) From 0335860929c50aa5d0ce49ad662ddb6f581ec2ea Mon Sep 17 00:00:00 2001 From: ahcorcha Date: Wed, 17 Apr 2019 14:01:41 -0400 Subject: [PATCH 3/3] fixed bug on template meme pfm format --- src/_Python/my_utils.py | 6 +++-- src/_Python/summary_html.py | 51 ++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/src/_Python/my_utils.py b/src/_Python/my_utils.py index 861508b..2f8c8ef 100644 --- a/src/_Python/my_utils.py +++ b/src/_Python/my_utils.py @@ -4,6 +4,8 @@ meme_matrix_template = "MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.25000 C 0.25000 G 0.25000 T 0.25000\nMOTIF\t%s\nletter-probability matrix: alength= 4 w= 18 nsites= 1 E= 0\n%s" +meme_matrix_template = "MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.25000 C 0.25000 G 0.25000 T 0.25000\nMOTIF\t%s\nletter-probability matrix:\n%s" + def run_cmd(cmd): subprocess.call(cmd, shell=True) @@ -23,7 +25,8 @@ def select_opt_motif(report_path): for line in report: current_motif = line.replace("\n", "").split("\t") - # Do not inclde motifs without correlation to their intial sequence. + ## Do not inclde motifs without correlation to + # their intial sequence. if float(current_motif[corr_p]) > float(0.001): continue @@ -31,7 +34,6 @@ def select_opt_motif(report_path): if float(current_motif[opt_p]) < float(selected_motif[opt_p]): selected_motif = current_motif - if selected_motif[1] == "error": print("There is no motif with a CORRELATION_P smaller than 0.001.\nStopping") exit() diff --git a/src/_Python/summary_html.py b/src/_Python/summary_html.py index 02af5b1..4f13f66 100644 --- a/src/_Python/summary_html.py +++ b/src/_Python/summary_html.py @@ -27,7 +27,7 @@ args = parser.parse_args() -## Header example for results..report.txt +## Header example for results.report.txt # HEADER=["EXPERIMENT", "MOTIF", "OPTIMIZED", "INITIAL_AUC", \ # "INITIAL_P", "OPTIMIZED_AUC", "OPTIMIZED_P", \ # "CORRELATION", "CORRELATION_P"] @@ -40,9 +40,10 @@ def main(): print("Input files are empty.") exit() - - log_file = open(args.PREFIX + "/centrimo/centrimo.log", "w") - ########################################################################### + log_path = args.PREFIX + "/centrimo.log" + log_file = open(log_path, "w") + + ######################################################################### ## Return the information of the selected motif, name and scores selected_motif = my_utils.select_opt_motif(args.REPORT_PATH) @@ -51,21 +52,22 @@ def main(): ## Name of the optimized selected motif. opt_id = seed_id + "|opt" - ########################################################################### - ## Selected seed motif ## - ########################################################################### + ######################################################################### + ## Selected seed motif ## + ######################################################################### # Out file name: results.seed.PFM.txt seed_cisbp_path = args.PREFIX + "/results.seed.PFM.txt" # Out file name: results.opt.PFM.meme.txt seed_meme_path = args.PREFIX + "/results.seed.PFM.meme.txt" ## Use name to write selected motif to one file. - my_utils.write_cisbp_meme_motif(seed_id, seed_cisbp_path, seed_meme_path, \ + my_utils.write_cisbp_meme_motif(seed_id, seed_cisbp_path, \ + seed_meme_path, \ args.PFMS_PATH) - ########################################################################### - ## Selected optimized motif ## - ########################################################################### + ######################################################################### + ## Selected optimized motif ## + ######################################################################### # Out file name: results.seed.PFM.txt opt_cisbp_path = args.PREFIX + "/results.opt.PFM.txt" ## Convert matrix to meme format @@ -75,25 +77,28 @@ def main(): my_utils.write_cisbp_meme_motif(opt_id, opt_cisbp_path, opt_meme_path, \ args.PFMS_PATH) - ########################################################################### - ## Run Centrimo ## - ########################################################################### + ######################################################################### + ## Run Centrimo ## + ######################################################################### centrimo_score = 5 # Default. # Must be directory as it creates another directory in it. centrimo_out = args.PREFIX + "/centrimo" - cmd_centrimo = "centrimo --oc %s --score %s %s %s &>> log.step3.txt" % \ - (centrimo_out, centrimo_score, args.PEAK_PATH, opt_meme_path) + cmd_centrimo = "centrimo --oc %s --score %s %s %s &>> %s" % \ + (centrimo_out, centrimo_score, args.PEAK_PATH, \ + opt_meme_path, log_path) print("Running centrimo", file=log_file) my_utils.run_cmd(cmd_centrimo) site_table = centrimo_out + "/site_counts.txt" centrimo_graph_name = "motif_sites_graph.png" out_centrimo_graph = args.PREFIX + "/" + centrimo_graph_name - my_utils.write_centrimo_plot(site_table, out_centrimo_graph, centrimo_score) + + my_utils.write_centrimo_plot(site_table, out_centrimo_graph, \ + centrimo_score) - ########################################################################### - ## Create PNG image of the logo for optimized and normal motifs ## - ########################################################################### + ######################################################################### + ## Create PNG image of the logo for optimized and normal motifs ## + ######################################################################### #### Selected seed cmd_seed_image = "meme2images -png %s %s" % (seed_meme_path, args.PREFIX) my_utils.run_cmd(cmd_seed_image) @@ -104,11 +109,11 @@ def main(): print("Creating HTML report", file=log_file) ############################################################ - my_utils.write_html(selected_motif, centrimo_graph_name, args.PREFIX, args.JOB_ID) + my_utils.write_html(selected_motif, centrimo_graph_name, \ + args.PREFIX, args.JOB_ID) ############################################################ rm_cmd = "rm %s/results.opt.ps" % (args.PREFIX) my_utils.run_cmd(rm_cmd) - -if __name__ == "__main__": +if __name__ == "__main__": main()