-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
326 lines (258 loc) · 12.7 KB
/
load_data.py
File metadata and controls
326 lines (258 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import pandas as pd
import requests
import os.path
import bs4
import requests
import urllib3
import csv
from os import path
#Data loader functions belong here. This is where
# information about the data files is found.
def load_proteomics(version='current', level='protein',
                    prefix="", suffix="Total Intensity",
                    contains=None, prepend_label="", not_contains=None):
    """Load a quantification table and return the selected intensity columns.

    Parameters:
    version (str): Key of the data file, resolved through get_file.
    level (str): 'protein' or 'peptide'. Chooses which column becomes the
        index and whether 'Only identified by site' rows are dropped.
    prefix (str): Keep only columns starting with this text (ignored if empty).
    suffix (str): Keep only columns ending with this text (ignored if empty).
    contains (iterable of str): Every item must appear in a kept column name.
    prepend_label (str): Optional label (e.g. "HeLa") put in front of each
        cleaned column name as "<label>_<name>".
    not_contains (iterable of str): No item may appear in a kept column name.

    Returns:
    pandas.DataFrame with the selected, renamed columns, or False when the
    file could not be downloaded.
    """
    contains = [] if contains is None else list(contains)
    not_contains = [] if not_contains is None else list(not_contains)

    # Step 1. Which file are we reading from?
    file = get_file(key=version)
    if file == 1:
        print("Error with file download.")
        return False

    # Step 2. Read the file
    df = pd.read_csv(file, sep='\t', header=0)

    # We want to use "Protein", "Protein IDs" or "Protein ID" as the index,
    # or "Sequence" in the case of peptides; use the first one that exists.
    if level == 'peptide':
        index_candidates = ["Sequence"]
    elif level == 'protein':
        index_candidates = ["Protein", "Protein IDs", "Protein ID"]
    else:
        index_candidates = []
    index = next((name for name in index_candidates if name in df.columns), None)
    if index is not None:
        # set_index returns a new frame; the result must be kept.
        df = df.set_index(index)

    # Step 3. Filter columns
    headings = list(df.columns)
    if suffix:
        headings = [i for i in headings if i.endswith(suffix)]
    if prefix:
        headings = [i for i in headings if i.startswith(prefix)]
    for req in contains:
        headings = [i for i in headings if req in i]
    for req in not_contains:
        headings = [i for i in headings if req not in i]

    # Optional 3b: Filter rows
    # Drop rows flagged with '+' as contaminants or decoys. The
    # 'Only identified by site' flag is only applied at the protein level.
    flag_columns = ['Potential contaminant', 'Reverse']
    if level == 'protein':
        flag_columns.append('Only identified by site')
    for flag in flag_columns:
        if flag in df.columns:
            df = df[df[flag] != '+']

    df = df[headings]

    # Step 4. Clean headers
    # Remove the prefix and the suffix (ie, "Total Intensity") from the
    # column names and optionally prepend a sample type (ie, "HeLa").
    new_names = {}
    for c in df.columns:
        sample_name = c
        if prefix and sample_name.startswith(prefix):
            sample_name = sample_name[len(prefix):]
        if suffix and sample_name.endswith(suffix):
            sample_name = sample_name[:-len(suffix)]
        sample_name = sample_name.strip()
        if prepend_label:
            sample_name = "{0}_{1}".format(prepend_label, sample_name)
        new_names[c] = sample_name
    df = df.rename(columns=new_names)

    # Return data
    return df
def load_max_quant(version = 'current', level='protein',
                   prefix="Intensity", contains=("_",),
                   sample_type=""
                   ):
    """Load a MaxQuant proteinGroups or peptides table as a DataFrame.

    The data file is expected at data/proteinGroups_<version>.txt (or
    data/peptides_<version>.txt). When it is missing it is downloaded from
    the URL stored in the matching *_url.txt file.

    Parameters:
    version (str): Version tag used in the data and URL file names.
    level (str): 'protein' or 'peptide'.
    prefix (str): Keep only columns starting with this text; it is removed
        from the returned column names.
    contains (iterable of str): Every item must appear in a kept column name.
    sample_type (str): Label put in front of each column name, joined by "_".

    Returns:
    pandas.DataFrame indexed by the first column of the file, or False for
    an unknown level or a failed download.
    """
    if level == 'protein':
        data_path = "data/proteinGroups_{0}.txt".format(version)
        url_file_path = "data/proteinGroups_{0}_url.txt".format(version)
    elif level == "peptide":
        data_path = "data/peptides_{0}.txt".format(version)
        url_file_path = "data/peptides_{0}_url.txt".format(version)
    else:
        # unknown level
        print ("Please specify either 'protein' or 'peptide' level.")
        return False

    # download_file takes the URL itself, not the path of a file holding it.
    # The URL file is optional when the data file is already on disk.
    url = ''
    if path.exists(url_file_path):
        with open(url_file_path, 'r') as url_file:
            url = url_file.read().strip()
    file = download_file(download_to_path=data_path, url=url)
    if file == 1:
        print("Error with file download.")
        return False

    # read in data
    df = pd.read_csv(file, sep='\t', header=0, index_col=0)

    # filter the columns based on the prefix and other "contains" requirements
    headings = list(df.columns)
    if prefix:
        headings = [i for i in headings if i.startswith(prefix)]
    for req in contains:
        headings = [i for i in headings if req in i]

    # Drop contaminants and decoys; 'Only identified by site' is only
    # applied at the protein level.
    flag_columns = ['Potential contaminant', 'Reverse']
    if level == 'protein':
        flag_columns.append('Only identified by site')
    for flag in flag_columns:
        if flag in df.columns:
            df = df[df[flag] != '+']

    df = df[headings]

    # Remove the prefix (ie, "Intensity") from the column names and
    # prepend the sample type (ie, "HeLa").
    new_names = {}
    for c in df.columns.values:
        sample_name = c[len(prefix):].strip()
        new_names[c] = "{0}_{1}".format(sample_type, sample_name)
    df = df.rename(columns=new_names)
    return df
def get_file(key = 'current'):
    """Resolve a data-file key to a local file, downloading it if needed.

    The URL of an index table is read from data/index_url.txt. The table is
    saved as data/index_table.tsv and read with its 'key' column as index;
    the first remaining column of the matching row is used as the URL of the
    data file, which is saved as data/<key>.tsv.

    Parameters:
    key (str): Row label to look up in the index table.

    Returns:
    The local path returned by download_file, or 1 when the index table or
    the data file could not be downloaded or the key is not in the table.
    """
    with open('data/index_url.txt', 'r') as url_file:
        index_url = url_file.read().strip()

    table_file_path = download_file(download_to_path="data/index_table.tsv", url=index_url)
    if table_file_path == 1:
        # Propagate the error code instead of handing it to read_csv.
        return 1

    table = pd.read_csv(table_file_path, sep='\t', header=0, index_col='key')
    if key not in table.index:
        print("Unknown data file key: {0}".format(key))
        return 1

    # Explicit positional access: the row is a Series labelled by column name.
    file_url = table.loc[key].iloc[0]
    file_name = "data/{0}.tsv".format(key)
    return download_file(download_to_path=file_name, url=file_url, redownload=False)
def load_FragPipe(version = 'current', contains=[],level='protein',
                  suffix="Total Intensity"):
    """Load a FragPipe table and return its selected intensity columns.

    Parameters:
    version (str): Key of the data file, resolved through get_file.
    contains (list of str): Every item must appear in a kept column name.
    level (str): 'protein' uses the fourth column of the file as index; any
        other value uses the first column.
    suffix (str): Keep only columns ending with this text (ignored if empty).

    Returns:
    pandas.DataFrame whose column names are cut at the first space, or
    False when the file could not be downloaded.
    """
    source = get_file(key=version)
    if source == 1:
        print("Error with file download.")
        return False

    # The 'June' data has an extra replicate ('15') that was flagged as poor
    # quality, so it is excluded at this level.
    excluded = ['15'] if version == 'June' else []

    # For peptides and by default, take the first column as index.
    index_position = 3 if level == 'protein' else 0
    table = pd.read_csv(source, sep='\t', header=0, index_col=index_position)

    # Narrow the column list step by step: suffix, required, excluded.
    selected = table.columns
    if suffix:
        selected = [col for col in selected if col.endswith(suffix)]
    for required in contains:
        selected = [col for col in selected if required in col]
    for banned in excluded:
        selected = [col for col in selected if banned not in col]
    table = table[selected]

    # Remove the "Total Intensity" part: keep the text before the first space.
    short_names = {col: col.split(' ')[0] for col in table.columns.values}
    table.rename(columns=short_names, inplace=True)
    return table
def download_file(download_to_path="data/datafile.txt", url='',
                  password_file_path="data/password.txt", redownload=False):
    """Download a file from a given url to the specified location.

    Nothing is downloaded when the file already exists locally, unless
    redownload is True.

    Parameters:
    download_to_path (str): The path to save the file to on the local machine.
    url (str): The shared link to download from. Required whenever a
        download actually takes place.
    password_file_path (str): Path of a text file holding the password, read
        only when the page served at url contains a "request_token" element.
    redownload (bool): Download even if the file already exists locally.

    Returns:
    str: The path the file was downloaded to (or already existed at).
    int: 1 when a download is needed but url is empty, or when a password
        is required and the password file is missing.
    """
    # Skip the download when the file is already present and no refresh is wanted.
    if not redownload and path.exists(download_to_path):
        return download_to_path

    if url == '':
        print("URL MUST BE SPECIFIED FOR DOWNLOAD")
        return 1

    # Construct the url for the POST / direct download request
    post_url = url.replace("https://byu.box.com/shared", "https://byu.app.box.com/public")

    # NOTE(review): the whole request sequence runs twice and the second pass
    # overwrites the first; presumably a workaround for the server - confirm
    # before removing.
    for _ in range(2):
        with requests.Session() as session:  # Use a session object to save cookies
            # Send initial GET request and parse the request token out of the response
            get_response = session.get(url)
            soup = bs4.BeautifulSoup(get_response.text, "html.parser")
            token_tag = soup.find(id="request_token")

            if token_tag is not None:
                # A request token is treated as a password prompt.
                if not path.exists(password_file_path):
                    print("MISSING PASSWORD FILE")
                    return 1
                with open(password_file_path, 'r') as password_file:
                    password = password_file.read().strip()

                # Send a POST request, with the password and token, to get the data
                payload = {
                    'password': password,
                    'request_token': token_tag.get("value")}
                response = session.post(post_url, data=payload)
            else:
                # The file was not password protected; fetch it directly.
                response = requests.get(post_url, allow_redirects=True)

            with open(download_to_path, 'wb') as dest:
                dest.write(response.content)

    return download_to_path
def load_fasta(file="data/uniprot-filtered-proteome_3AUP000005640_reviewed_human.fasta"):
    """Map protein accessions to their names using the FASTA header lines.

    Header lines are expected to look like:
    >sp|Q96IY4|CBPB2_HUMAN Carboxypeptidase B2 OS=Homo sapiens OX=9606 GN=CPB2 PE=1 SV=2
    The accession is the second '|'-separated field; the name is the text of
    the third field up to (not including) " OS=". Header lines with fewer
    than three '|'-separated fields are skipped.

    Parameters:
    file (str): Path of the FASTA file to read.

    Returns:
    pandas.Series of names indexed by accession.
    """
    names = {}
    with open(file) as f:
        for line in f:
            if not line.startswith('>'):  # sequence line
                continue
            fields = line.split('|')
            if len(fields) < 3:
                continue
            accession = fields[1]
            # Cut at the " OS=" field marker. str.strip('OS') would remove the
            # characters 'O' and 'S' from both ends of the name instead.
            names[accession] = fields[2].partition(' OS=')[0].strip()
    return pd.Series(list(names.values()), index=list(names.keys()))
def names_max_quant():
    """Load the protein naming columns of the MaxQuant proteinGroups table.

    The table is expected at data/proteinGroups.txt; when it is missing it
    is downloaded from the URL stored in data/proteinGroups_url.txt.

    Returns:
    pandas.DataFrame restricted to the 'Protein IDs', 'Gene names' and
    'Fasta headers' columns, with the first of those (in file order) as
    index, or False when the file could not be downloaded.
    """
    url_file_path = "data/proteinGroups_url.txt"
    # download_file takes the URL itself, not the path of a file holding it.
    # The URL file is optional when the data file is already on disk.
    url = ''
    if path.exists(url_file_path):
        with open(url_file_path, 'r') as url_file:
            url = url_file.read().strip()
    file = download_file(download_to_path="data/proteinGroups.txt", url=url)
    if file == 1:
        print("Error with file download.")
        return False
    df = pd.read_csv(file, sep='\t', header=0, index_col=0,
                     usecols=['Protein IDs', 'Gene names', 'Fasta headers'])
    return df
def names_FragPipe(month='June', contains=('Subject1',)):
    """Load the protein naming columns of a FragPipe combined protein table.

    The table is expected at data/combined_protein_<month>_FP.tsv; when it
    is missing it is downloaded from the URL stored in
    data/combined_protein_<month>_FP_url.txt.

    Parameters:
    month (str): Tag used in the data and URL file names.
    contains: Unused; kept so existing callers that pass it keep working.

    Returns:
    pandas.DataFrame restricted to the 'Protein ID', 'Gene Names' and
    'Description' columns, with the first of those (in file order) as
    index, or False when the file could not be downloaded.
    """
    file_name = "data/combined_protein_{0}_FP.tsv".format(month)
    url_file_path = "data/combined_protein_{0}_FP_url.txt".format(month)
    # download_file takes the URL itself, not the path of a file holding it.
    # The URL file is optional when the data file is already on disk.
    url = ''
    if path.exists(url_file_path):
        with open(url_file_path, 'r') as url_file:
            url = url_file.read().strip()
    file = download_file(download_to_path=file_name, url=url)
    if file == 1:
        print("Error with file download.")
        return False
    df = pd.read_csv(file, sep='\t', header=0, index_col=0,
                     usecols=['Protein ID', 'Gene Names', 'Description'])
    return df
# The following functions are for CBC and metabolite data.
def to_num(x, exclude=['date', 'Testing entity']):
    """Convert a Series to numeric unless its name is listed in exclude.

    Parameters:
    x (pandas.Series): The column to convert.
    exclude (list of str): Names of Series that are returned untouched.

    Returns:
    pandas.Series: x itself when excluded, otherwise pd.to_numeric(x).
    """
    return x if x.name in exclude else pd.to_numeric(x)
def load_cbc(sub=0, time = '2020_09'):
    """Load the CBC tables of both subjects as one numeric DataFrame.

    Reads data/<time>/cbc_sub1.tsv and data/<time>/cbc_sub2.tsv, drops rows
    without a subject, converts every column except those excluded by to_num
    to numeric and parses the 'date' column as datetimes.

    Parameters:
    sub: Subject to keep; a falsy value (default 0) keeps all subjects.
    time (str): Name of the sub-directory of data/ to read from.

    Returns:
    pandas.DataFrame of CBC measurements.
    """
    cbc_data = pd.concat([pd.read_csv("data/{0}/cbc_sub1.tsv".format(time), sep='\t'),
                          pd.read_csv("data/{0}/cbc_sub2.tsv".format(time), sep='\t')])

    # Rows with no subject are removed before the numeric conversion.
    # NOTE(review): presumably these are the units rows at the top of each
    # file - confirm against the data. dropna is used because a comparison
    # with NaN (== nan) is never true.
    cbc_data = cbc_data.dropna(subset=['subject'])

    cbc_data = cbc_data.apply(to_num)
    cbc_data['date'] = pd.to_datetime(cbc_data['date'])

    if sub:  # if the user specifies they only want a particular subject
        cbc_data = cbc_data[cbc_data['subject'] == sub]
    return cbc_data
def load_metabolites(sub=0, time = '2020_09'):
    """Load the metabolite tables of both subjects as one DataFrame.

    Reads data/<time>/metab_sub1.tsv and data/<time>/metab_sub2.tsv,
    concatenates them and parses the 'date' column as datetimes.

    Parameters:
    sub: Subject to keep; a falsy value (default 0) keeps all subjects.
    time (str): Name of the sub-directory of data/ to read from.

    Returns:
    pandas.DataFrame of metabolite measurements.
    """
    metab_data = pd.concat([pd.read_csv("data/{0}/metab_sub1.tsv".format(time), sep='\t'),
                            pd.read_csv("data/{0}/metab_sub2.tsv".format(time), sep='\t')],
                           sort=False)
    metab_data['date'] = pd.to_datetime(metab_data['date'])
    if sub:
        # Column-wise comparison builds the row mask without a per-row apply.
        metab_data = metab_data[metab_data['Subject'] == sub]
    return metab_data
def load_reference(time = '2020_09'):
    """Load the metabolite and CBC reference tables side by side.

    Reads data/<time>/metab_reference.tsv and data/<time>/cbc_reference.tsv,
    each with its first column as index, and joins them column-wise.

    Parameters:
    time (str): Name of the sub-directory of data/ to read from.

    Returns:
    pandas.DataFrame with the metabolite columns followed by the CBC columns.
    """
    templates = ("data/{0}/metab_reference.tsv", "data/{0}/cbc_reference.tsv")
    tables = [pd.read_csv(template.format(time), sep='\t', index_col=0)
              for template in templates]
    return pd.concat(tables, axis=1)