diff --git a/ms1searchpy/directms1quantmulti.py b/ms1searchpy/directms1quantmulti.py index 28eb79f..8ed5b38 100755 --- a/ms1searchpy/directms1quantmulti.py +++ b/ms1searchpy/directms1quantmulti.py @@ -35,6 +35,7 @@ def run(): parser.add_argument('-d', '-db', help='path to uniprot fasta file used for ms1searchpy', required=True) parser.add_argument('-samples', help='tsv table with sample details', required=True) parser.add_argument('-out', help='name of DirectMS1quant output files', default='DQmulti') + parser.add_argument('-out_folder', help='folder to store DirectMS1quant output files, default: folder of input files', default='') parser.add_argument('-norm', help='LFQ normalization: (1) using median of CONTROL group, default; (2) using median of all groups', default=1, type=int) parser.add_argument('-proteins_for_figure', help='path to proteins for figure plotting', default='', type=str) parser.add_argument('-figdir', help='path to output folder for figures', default='') @@ -50,9 +51,13 @@ def run(): def process_files(args): - + infolder = args['pdir'] ms1folder = args['pdir'] + if args['out_folder'] : + out_folder = path.abspath(args['out_folder']) + else : + out_folder = ms1folder path_to_fasta = args['d'] df1 = pd.read_table(args['samples'], dtype={'group': str, 'condition': str, 'vs': str}) @@ -82,7 +87,7 @@ def process_files(args): for fn in listdir(ms1folder): if fn.endswith('_proteins_full.tsv'): - f_name = fn.replace('.features_proteins_full.tsv', '') + f_name = fn.replace('_proteins_full.tsv', '') if f_name in f_dict_map: s_files_dict[f_dict_map[f_name]].append(path.join(ms1folder, fn)) @@ -116,7 +121,7 @@ def process_files(args): logger.info('Starting Stage 1: Run pairwise DirectMS1Quant runs...') for i2, i1_val in all_conditions.items(): - out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val)) + out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val)) dquant_params = copy(dquant_params_base) dquant_params['S1'] = s_files_dict[(control_label, i1_val)] dquant_params['S2'] = s_files_dict[i2] @@ -132,7 +137,7 @@ def process_files(args): pep_cnt = Counter() pep_cnt_up = Counter() for i2, i1_val in all_conditions.items(): - out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val)) + out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val)) # if os.path.isfile(out_name): df0_full = pd.read_table(out_name.replace('.tsv', '_quant_peptides.tsv'), usecols=['origseq', 'up', 'down', 'proteins']) @@ -171,7 +176,7 @@ def process_files(args): args_local['allowed_proteins'] = '' for z in listdir(infolder): if z.endswith(replace_label): - zname = z.split('.')[0] + zname = z.replace(replace_label, '') if zname in df1_filenames: args_local['S1'].append(path.join(infolder, z)) names_map[args_local['S1'][-1]] = zname @@ -195,7 +200,7 @@ def process_files(args): all_s_lbls[sample_num].append(label) all_lbls = all_s_lbls['S1'] - out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, )) + out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, )) df_final = pd.read_table(out_name) logger.info('Skipping Stage 2: Prepare full LFQ peptide table...') @@ -206,7 +211,6 @@ def process_files(args): sample_num = 'S1' all_s_lbls[sample_num] = [] - for z in args_local[sample_num]: label = sample_num + '_' + z.replace(replace_label, '') all_s_lbls[sample_num].append(label) @@ -268,7 +272,7 @@ def process_files(args): df_final = df_final[df_final['nonmissing']] logger.info('Total number of PFMs: %d', len(df_final)) - out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, )) + out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, )) df_final.to_csv(out_name, sep='\t', index=False) @@ -365,7 +369,7 @@ def process_files(args): origfnames = [] for cc in all_lbls: - origfn = cc.split('/')[-1].split('.')[0] + origfn = path.basename(cc) origfnames.append(origfn) dft = pd.DataFrame.from_dict([df_final.groupby('proteins')[cc].median().to_dict() for cc in all_lbls]) @@ -381,15 +385,15 @@ def process_files(args): df1x['sample+condition'] = df1x['sample'].apply(lambda x: x + ' ') + df1x['condition'] - out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, )) - out_namex = path.join(ms1folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, )) + out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, )) + out_namex = path.join(out_folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, )) df1.to_csv(out_name, sep='\t', index=False) df1x.to_csv(out_namex, sep='\t', index=False) else: logger.info('Skipping Stage 3: Prepare full LFQ protein table...') - out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, )) + out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, )) df1 = pd.read_table(out_name) if args['start_stage'] <= 4: @@ -452,7 +456,7 @@ def process_files(args): if not path.isdir(out_figdir): makedirs(out_figdir) else: - out_figdir = infolder + out_figdir = out_folder plt.savefig(path.join(out_figdir, '%s_%s.png' % (args['out'], gname, ))) diff --git a/ms1searchpy/group_specific.py b/ms1searchpy/group_specific.py index 538b8d9..9d515f0 100755 --- a/ms1searchpy/group_specific.py +++ b/ms1searchpy/group_specific.py @@ -62,31 +62,31 @@ def run(): cnt = Counter(dbname_map.values()) - if group_to_use not in ['dbname', 'OX']: - for ox in cnt.keys(): + # if group_to_use not in ['dbname', 'OX']: + for ox in cnt.keys(): - line = ncbi.get_lineage(ox) - ranks = ncbi.get_rank(line) - if group_to_use not in ranks.values(): - logger.warning('%s does not have %s', str(ox), group_to_use) - group_custom = 'OX:' + ox - # print('{} does not have {}'.format(i, group_to_use)) - # continue + line = ncbi.get_lineage(ox) + ranks = ncbi.get_rank(line) + if group_to_use not in ranks.values(): + logger.warning('%s does not have %s', str(ox), group_to_use) + group_custom = 'OX:' + ox + # print('{} does not have {}'.format(i, group_to_use)) + # continue - else: - ranks_rev = {k[1]:k[0] for k in ranks.items()} - # print(ranks_rev) - group_custom = ranks_rev[group_to_use] + else: + ranks_rev = {k[1]:k[0] for k in ranks.items()} + # print(ranks_rev) + group_custom = group_to_use+':'+str(ranks_rev[group_to_use]) - ox_map[ox] = group_custom + ox_map[ox] = group_custom - for dbname in list(dbname_map.keys()): - dbname_map[dbname] = ox_map[dbname_map[dbname]] + for dbname in list(dbname_map.keys()): + dbname_map[dbname] = ox_map[dbname_map[dbname]] - cnt = Counter(dbname_map.values()) + cnt = Counter(dbname_map.values()) - print(cnt.most_common()) + logging.debug(cnt.most_common()) # return -1 @@ -149,9 +149,10 @@ def run(): escore = lambda x: -x[1] fdr = float(args['fdr']) / 100 - # all_proteins = [] + all_proteins = [] base_out_name = args['out'] + group_to_use + '.tsv' + proteins_out_name = args['out'] + 'proteins_' + group_to_use + '.tsv' out_dict = dict() @@ -192,18 +193,26 @@ def run(): top_proteins = final_iteration(resdict, mass_diff, rt_diff, pept_prot, protsN_tmp, base_out_name, prefix, isdecoy, isdecoy_key, escore, fdr, args['nproc'], prots_spc_basic2=prots_spc_basic2, output_all=False) - # all_proteins.extend(top_proteins) + tax_level, taxonomy_id = group_name.split(':') + ext_top_proteins = [] + for el in top_proteins : + tmp_lst = [elem for elem in el] + [taxonomy_id, tax_level] + ext_top_proteins.append(tmp_lst) + all_proteins.extend(ext_top_proteins) out_dict[group_name] = len(top_proteins) # print(top_proteins) print('\n') # break with open(base_out_name, 'w') as output: - output.write('taxid\tproteins\n') + output.write('group\ttaxid\tproteins\n') for k, v in out_dict.items(): - output.write('\t'.join((str(k), str(v))) + '\n') + output.write('\t'.join(map(str, k.split(':')+[v])) + '\n') - + with open(proteins_out_name, 'w') as output: + output.write('dbname\tscore\tmatched peptides\ttheoretical peptides\tdecoy\ttaxid\ttax_level\n') + for x in all_proteins: + output.write('\t'.join(x) + '\n') if __name__ == '__main__': run()