Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 17 additions & 13 deletions ms1searchpy/directms1quantmulti.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def run():
parser.add_argument('-d', '-db', help='path to uniprot fasta file used for ms1searchpy', required=True)
parser.add_argument('-samples', help='tsv table with sample details', required=True)
parser.add_argument('-out', help='name of DirectMS1quant output files', default='DQmulti')
parser.add_argument('-out_folder', help='folder to store DirectMS1quant output files, default: folder of input files', default='')
parser.add_argument('-norm', help='LFQ normalization: (1) using median of CONTROL group, default; (2) using median of all groups', default=1, type=int)
parser.add_argument('-proteins_for_figure', help='path to proteins for figure plotting', default='', type=str)
parser.add_argument('-figdir', help='path to output folder for figures', default='')
Expand All @@ -50,9 +51,13 @@ def run():


def process_files(args):

infolder = args['pdir']
ms1folder = args['pdir']
if args['out_folder'] :
out_folder = path.abspath(args['out_folder'])
else :
out_folder = ms1folder
path_to_fasta = args['d']

df1 = pd.read_table(args['samples'], dtype={'group': str, 'condition': str, 'vs': str})
Expand Down Expand Up @@ -82,7 +87,7 @@ def process_files(args):

for fn in listdir(ms1folder):
if fn.endswith('_proteins_full.tsv'):
f_name = fn.replace('.features_proteins_full.tsv', '')
f_name = fn.replace('_proteins_full.tsv', '')
if f_name in f_dict_map:
s_files_dict[f_dict_map[f_name]].append(path.join(ms1folder, fn))

Expand Down Expand Up @@ -116,7 +121,7 @@ def process_files(args):
logger.info('Starting Stage 1: Run pairwise DirectMS1Quant runs...')

for i2, i1_val in all_conditions.items():
out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val))
out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s' % (outlabel, ''.join(list(i2)), control_label, i1_val))
dquant_params = copy(dquant_params_base)
dquant_params['S1'] = s_files_dict[(control_label, i1_val)]
dquant_params['S2'] = s_files_dict[i2]
Expand All @@ -132,7 +137,7 @@ def process_files(args):
pep_cnt = Counter()
pep_cnt_up = Counter()
for i2, i1_val in all_conditions.items():
out_name = path.join(ms1folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val))
out_name = path.join(out_folder, '%s_directms1quant_out_%s_vs_%s%s.tsv' % (outlabel, ''.join(list(i2)), control_label, i1_val))
# if os.path.isfile(out_name):
df0_full = pd.read_table(out_name.replace('.tsv', '_quant_peptides.tsv'), usecols=['origseq', 'up', 'down', 'proteins'])

Expand Down Expand Up @@ -171,7 +176,7 @@ def process_files(args):
args_local['allowed_proteins'] = ''
for z in listdir(infolder):
if z.endswith(replace_label):
zname = z.split('.')[0]
zname = z.replace(replace_label, '')
if zname in df1_filenames:
args_local['S1'].append(path.join(infolder, z))
names_map[args_local['S1'][-1]] = zname
Expand All @@ -195,7 +200,7 @@ def process_files(args):
all_s_lbls[sample_num].append(label)

all_lbls = all_s_lbls['S1']
out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
df_final = pd.read_table(out_name)

logger.info('Skipping Stage 2: Prepare full LFQ peptide table...')
Expand All @@ -206,7 +211,6 @@ def process_files(args):
sample_num = 'S1'

all_s_lbls[sample_num] = []

for z in args_local[sample_num]:
label = sample_num + '_' + z.replace(replace_label, '')
all_s_lbls[sample_num].append(label)
Expand Down Expand Up @@ -268,7 +272,7 @@ def process_files(args):
df_final = df_final[df_final['nonmissing']]
logger.info('Total number of PFMs: %d', len(df_final))

out_name = path.join(ms1folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
out_name = path.join(out_folder, '%s_peptide_LFQ.tsv' % (outlabel, ))
df_final.to_csv(out_name, sep='\t', index=False)


Expand Down Expand Up @@ -365,7 +369,7 @@ def process_files(args):

origfnames = []
for cc in all_lbls:
origfn = cc.split('/')[-1].split('.')[0]
origfn = path.basename(cc)
origfnames.append(origfn)

dft = pd.DataFrame.from_dict([df_final.groupby('proteins')[cc].median().to_dict() for cc in all_lbls])
Expand All @@ -381,15 +385,15 @@ def process_files(args):
df1x['sample+condition'] = df1x['sample'].apply(lambda x: x + ' ') + df1x['condition']


out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
out_namex = path.join(ms1folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, ))
out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
out_namex = path.join(out_folder, '%s_peptide_LFQ_processed.tsv' % (outlabel, ))
df1.to_csv(out_name, sep='\t', index=False)
df1x.to_csv(out_namex, sep='\t', index=False)

else:

logger.info('Skipping Stage 3: Prepare full LFQ protein table...')
out_name = path.join(ms1folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
out_name = path.join(out_folder, '%s_proteins_LFQ.tsv' % (outlabel, ))
df1 = pd.read_table(out_name)

if args['start_stage'] <= 4:
Expand Down Expand Up @@ -452,7 +456,7 @@ def process_files(args):
if not path.isdir(out_figdir):
makedirs(out_figdir)
else:
out_figdir = infolder
out_figdir = out_folder
plt.savefig(path.join(out_figdir, '%s_%s.png' % (args['out'], gname, )))


Expand Down
55 changes: 32 additions & 23 deletions ms1searchpy/group_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,31 +62,31 @@ def run():

cnt = Counter(dbname_map.values())

if group_to_use not in ['dbname', 'OX']:
for ox in cnt.keys():
# if group_to_use not in ['dbname', 'OX']:
for ox in cnt.keys():

line = ncbi.get_lineage(ox)
ranks = ncbi.get_rank(line)
if group_to_use not in ranks.values():
logger.warning('%s does not have %s', str(ox), group_to_use)
group_custom = 'OX:' + ox
# print('{} does not have {}'.format(i, group_to_use))
# continue
line = ncbi.get_lineage(ox)
ranks = ncbi.get_rank(line)
if group_to_use not in ranks.values():
logger.warning('%s does not have %s', str(ox), group_to_use)
group_custom = 'OX:' + ox
# print('{} does not have {}'.format(i, group_to_use))
# continue

else:
ranks_rev = {k[1]:k[0] for k in ranks.items()}
# print(ranks_rev)
group_custom = ranks_rev[group_to_use]
else:
ranks_rev = {k[1]:k[0] for k in ranks.items()}
# print(ranks_rev)
group_custom = group_to_use+':'+str(ranks_rev[group_to_use])

ox_map[ox] = group_custom
ox_map[ox] = group_custom


for dbname in list(dbname_map.keys()):
dbname_map[dbname] = ox_map[dbname_map[dbname]]
for dbname in list(dbname_map.keys()):
dbname_map[dbname] = ox_map[dbname_map[dbname]]

cnt = Counter(dbname_map.values())
cnt = Counter(dbname_map.values())

print(cnt.most_common())
logging.debug(cnt.most_common())

# return -1

Expand Down Expand Up @@ -149,9 +149,10 @@ def run():
escore = lambda x: -x[1]
fdr = float(args['fdr']) / 100

# all_proteins = []
all_proteins = []

base_out_name = args['out'] + group_to_use + '.tsv'
proteins_out_name = args['out'] + 'proteins_' + group_to_use + '.tsv'

out_dict = dict()

Expand Down Expand Up @@ -192,18 +193,26 @@ def run():


top_proteins = final_iteration(resdict, mass_diff, rt_diff, pept_prot, protsN_tmp, base_out_name, prefix, isdecoy, isdecoy_key, escore, fdr, args['nproc'], prots_spc_basic2=prots_spc_basic2, output_all=False)
# all_proteins.extend(top_proteins)
tax_level, taxonomy_id = group_name.split(':')
ext_top_proteins = []
for el in top_proteins :
tmp_lst = [elem for elem in el] + [taxonomy_id, tax_level]
ext_top_proteins.append(tmp_lst)
all_proteins.extend(ext_top_proteins)
out_dict[group_name] = len(top_proteins)
# print(top_proteins)
print('\n')
# break

with open(base_out_name, 'w') as output:
output.write('taxid\tproteins\n')
output.write('group\ttaxid\tproteins\n')
for k, v in out_dict.items():
output.write('\t'.join((str(k), str(v))) + '\n')
output.write('\t'.join(map(str, k.split(':')+[v])) + '\n')


with open(proteins_out_name, 'w') as output:
output.write('dbname\tscore\tmatched peptides\ttheoretical peptides\tdecoy\ttaxid\ttax_level\n')
for x in all_proteins:
output.write('\t'.join(x) + '\n')

if __name__ == '__main__':
run()