markmipt · PostoenkoVI · Oct 4, 2024 · Oct 8, 2024 · Oct 9, 2024 · Jul 8, 2025
diff --git a/ms1searchpy/directms1quantmulti.py b/ms1searchpy/directms1quantmulti.py
@@ -35,6 +35,7 @@ def run():
     parser.add_argument('-d', '-db', help='path to uniprot fasta file used for ms1searchpy', required=True)
     parser.add_argument('-samples', help='tsv table with sample details', required=True)
     parser.add_argument('-out', help='name of DirectMS1quant output files', default='DQmulti')
+    parser.add_argument('-out_folder', help='folder to store DirectMS1quant output files, default: folder of input files', default='')
     parser.add_argument('-norm', help='LFQ normalization: (1) using median of CONTROL group, default; (2) using median of all groups', default=1, type=int)
     parser.add_argument('-proteins_for_figure', help='path to proteins for figure plotting', default='', type=str)
     parser.add_argument('-figdir', help='path to output folder for figures', default='')
@@ -50,9 +51,13 @@ def run():
 
 
 def process_files(args):
-
+    
     infolder = args['pdir']
     ms1folder = args['pdir']
+    if args['out_folder'] :
+        out_folder = path.abspath(args['out_folder'])
+    else :
+        out_folder = ms1folder
     path_to_fasta = args['d']
 
     df1 = pd.read_table(args['samples'], dtype={'group': str, 'condition': str, 'vs': str})
@@ -82,7 +87,7 @@ def process_files(args):
 
     for fn in listdir(ms1folder):
         if fn.endswith('_proteins_full.tsv'):
-            f_name = fn.replace('.features_proteins_full.tsv', '')
+            f_name = fn.replace('_proteins_full.tsv', '')
             if f_name in f_dict_map:
                 s_files_dict[f_dict_map[f_name]].append(path.join(ms1folder, fn))
 
@@ -116,7 +121,7 @@ def process_files(args):
         logger.info('Starting Stage 1: Run pairwise DirectMS1Quant runs...')
 
         for i2, i1_val in all_conditions.items():
-            out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val))
+            out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val))
             dquant_params = copy(dquant_params_base)
             dquant_params['S1'] = s_files_dict[(control_label, i1_val)]
             dquant_params['S2'] = s_files_dict[i2]
@@ -132,7 +137,7 @@ def process_files(args):
     pep_cnt = Counter()
     pep_cnt_up = Counter()
     for i2, i1_val in all_conditions.items():
-        out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val))
+        out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val))
         # if os.path.isfile(out_name):
         df0_full = pd.read_table(out_name.replace('.tsv', '_quant_peptides.tsv'), usecols=['origseq', 'up', 'down', 'proteins'])
 
@@ -171,7 +176,7 @@ def process_files(args):
     args_local['allowed_proteins'] = ''
     for z in listdir(infolder):
         if z.endswith(replace_label):
-            zname = z.split('.')[0]
+            zname = z.replace(replace_label, '')
             if zname in df1_filenames:
                 args_local['S1'].append(path.join(infolder, z))
                 names_map[args_local['S1'][-1]] = zname
@@ -195,7 +200,7 @@ def process_files(args):
             all_s_lbls[sample_num].append(label)
 
         all_lbls = all_s_lbls['S1']
-        out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
+        out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
         df_final = pd.read_table(out_name)
 
         logger.info('Skipping Stage 2: Prepare full LFQ peptide table...')
@@ -206,7 +211,6 @@ def process_files(args):
         sample_num = 'S1'
 
         all_s_lbls[sample_num] = []
-
         for z in args_local[sample_num]:
             label = sample_num + '_' + z.replace(replace_label, '')
             all_s_lbls[sample_num].append(label)
@@ -268,7 +272,7 @@ def process_files(args):
         df_final = df_final[df_final['nonmissing']]
         logger.info('Total number of PFMs: %d', len(df_final))
 
-        out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
+        out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
         df_final.to_csv(out_name, sep='\t', index=False)
 
 
@@ -365,7 +369,7 @@ def process_files(args):
 
         origfnames = []
         for cc in all_lbls:
-            origfn = cc.split('/')[-1].split('.')[0]
+            origfn = path.basename(cc)
             origfnames.append(origfn)
 
         dft = pd.DataFrame.from_dict([df_final.groupby('proteins')[cc].median().to_dict() for cc in all_lbls])
@@ -381,15 +385,15 @@ def process_files(args):
         df1x['sample+condition'] = df1x['sample'].apply(lambda x: x + ' ') + df1x['condition']
 
 
-        out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
-        out_namex = path.join(ms1folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, ))
+        out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
+        out_namex = path.join(out_folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, ))
         df1.to_csv(out_name, sep='\t', index=False)
         df1x.to_csv(out_namex, sep='\t', index=False)
 
     else:
 
         logger.info('Skipping Stage 3: Prepare full LFQ protein table...')
-        out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
+        out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
         df1 = pd.read_table(out_name)
 
     if args['start_stage'] <= 4:
@@ -452,7 +456,7 @@ def process_files(args):
                             if not path.isdir(out_figdir):
                                 makedirs(out_figdir)
                         else:
-                            out_figdir = infolder
+                            out_figdir = out_folder
                         plt.savefig(path.join(out_figdir, '%s_%s.png' % (args['out'], gname, )))
 
 

diff --git a/ms1searchpy/group_specific.py b/ms1searchpy/group_specific.py
@@ -62,31 +62,31 @@ def run():
 
     cnt = Counter(dbname_map.values())
 
-    if group_to_use not in ['dbname', 'OX']:
-        for ox in cnt.keys():
+    # if group_to_use not in ['dbname', 'OX']:
+    for ox in cnt.keys():
 
-            line = ncbi.get_lineage(ox)
-            ranks = ncbi.get_rank(line)
-            if group_to_use not in ranks.values():
-                logger.warning('%s does not have %s', str(ox), group_to_use)
-                group_custom = 'OX:' + ox
-                # print('{} does not have {}'.format(i, group_to_use))
-                # continue
+        line = ncbi.get_lineage(ox)
+        ranks = ncbi.get_rank(line)
+        if group_to_use not in ranks.values():
+            logger.warning('%s does not have %s', str(ox), group_to_use)
+            group_custom = 'OX:' + ox
+            # print('{} does not have {}'.format(i, group_to_use))
+            # continue
 
-            else:
-                ranks_rev = {k[1]:k[0] for k in ranks.items()}
-                # print(ranks_rev)
-                group_custom = ranks_rev[group_to_use]
+        else:
+            ranks_rev = {k[1]:k[0] for k in ranks.items()}
+            # print(ranks_rev)
+            group_custom = group_to_use+':'+str(ranks_rev[group_to_use])
 
-            ox_map[ox] = group_custom
+        ox_map[ox] = group_custom
 
 
-        for dbname in list(dbname_map.keys()):
-            dbname_map[dbname] = ox_map[dbname_map[dbname]]
+    for dbname in list(dbname_map.keys()):
+        dbname_map[dbname] = ox_map[dbname_map[dbname]]
 
-        cnt = Counter(dbname_map.values())
+    cnt = Counter(dbname_map.values())
 
-    print(cnt.most_common())
+    logging.debug(cnt.most_common())
 
     # return -1
 
@@ -149,9 +149,10 @@ def run():
     escore = lambda x: -x[1]
     fdr = float(args['fdr']) / 100
 
-    # all_proteins = []
+    all_proteins = []
 
     base_out_name = args['out'] + group_to_use + '.tsv'
+    proteins_out_name = args['out'] + 'proteins_' + group_to_use + '.tsv'
 
     out_dict = dict()
 
@@ -192,18 +193,26 @@ def run():
 
 
         top_proteins = final_iteration(resdict, mass_diff, rt_diff, pept_prot, protsN_tmp, base_out_name, prefix, isdecoy, isdecoy_key, escore, fdr, args['nproc'], prots_spc_basic2=prots_spc_basic2, output_all=False)
-        # all_proteins.extend(top_proteins)
+        tax_level, taxonomy_id = group_name.split(':')
+        ext_top_proteins = []
+        for el in top_proteins :
+            tmp_lst = [elem for elem in el] + [taxonomy_id, tax_level]
+            ext_top_proteins.append(tmp_lst)
+        all_proteins.extend(ext_top_proteins)
         out_dict[group_name] = len(top_proteins)
         # print(top_proteins)
         print('\n')
         # break
 
     with open(base_out_name, 'w') as output:
-        output.write('taxid\tproteins\n')
+        output.write('group\ttaxid\tproteins\n')
         for k, v in out_dict.items():
-            output.write('\t'.join((str(k), str(v))) + '\n')
+            output.write('\t'.join(map(str, k.split(':')+[v])) + '\n')
 
-
+    with open(proteins_out_name, 'w') as output:
+        output.write('dbname\tscore\tmatched peptides\ttheoretical peptides\tdecoy\ttaxid\ttax_level\n')
+        for x in all_proteins:
+            output.write('\t'.join(x) + '\n')    
 
 if __name__ == '__main__':
     run()