-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.py
More file actions
197 lines (159 loc) · 7.99 KB
/
load_data.py
File metadata and controls
197 lines (159 loc) · 7.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import pandas as pd
import requests
import os.path
import bs4
import requests
import urllib3
import csv
from os import path
#Data loader functions belong here. This is where
# information about the data files is found.
def load_max_quant(version='current'):
    """Load a MaxQuant proteinGroups file as a DataFrame of raw intensity values.

    Parameters:
        version (str): key identifying which file version to download
            (default 'current').

    Returns:
        pandas.DataFrame: indexed by the first column ('Protein IDs'),
        containing only the 'Intensity' columns whose names contain an
        underscore, with contaminant / reverse / site-only rows removed.
    """
    # NOTE(review): max_quant files still need to be added to the index_url
    # on box so their keys can be used here.
    file = get_file(key=version)
    prefix = "Intensity"
    contains = ["_"]
    # Read just the header line to decide which intensity columns to keep.
    with open(file, 'r') as _file:
        header_line = _file.readline().strip()
    headings = [h.strip('"') for h in header_line.split('\t')]
    if prefix:  # keep only columns beginning with the prefix
        headings = [h for h in headings if h.startswith(prefix)]
    for req in contains:
        headings = [h for h in headings if req in h]
    # (Removed dead code: a `final_col` list was built here but never used.)
    df = pd.read_csv(file, sep='\t', header=0, index_col=0)
    # Drop rows MaxQuant flags as unreliable identifications.
    df = df.drop(df[df['Potential contaminant'] == '+'].index)
    df = df.drop(df[df.Reverse == '+'].index)
    df = df.drop(df[df['Only identified by site'] == '+'].index)
    return df[headings]
def get_file(key='current'):
    """Download the data file identified by `key` and return its local path.

    Reads the index url from data/index_url.txt, downloads the index table,
    and looks up `key` in that table to find the url of the requested file.

    Parameters:
        key (str): row key in the index table (default 'current').

    Returns:
        str: local path the file was downloaded to (or 1 on download error,
        propagated from download_file).
    """
    # Context manager guarantees the handle is closed even on error.
    with open('data/index_url.txt', 'r') as url_file:
        url = url_file.read().strip()
    table_file_path = download_file(download_to_path="data/index_table.tsv", url=url)
    table = pd.read_csv(table_file_path, sep='\t', header=0, index_col='key')
    file_name = "data/{0}.tsv".format(key)
    # Use .iloc[0] for positional access: plain `series[0]` positional
    # indexing is deprecated and removed in modern pandas.
    url_name = table.loc[key].iloc[0]
    return download_file(download_to_path=file_name, url=url_name, redownload=True)
def load_FragPipe(version='current', contains=None):
    """Load a FragPipe combined_protein file as a DataFrame of total intensities.

    Parameters:
        version (str): key identifying which file version to download
            (default 'current').
        contains (list[str] | None): substrings a column name must contain to
            be kept; defaults to ['Subject1']. (Default moved to None to avoid
            a mutable default argument; behavior is unchanged.)

    Returns:
        pandas.DataFrame | bool: intensity columns (renamed to their first
        space-separated token), indexed by column 3 of the file, or False if
        the download failed.
    """
    if contains is None:
        contains = ['Subject1']
    file = get_file(key=version)
    if file == 1:
        print("Error with file download.")
        return False
    suffix = "Total Intensity"
    # Drop extra replicate for the June data - Yiran said these two weren't
    # good quality; for now exclude it at this level.
    not_contains = ['15'] if version == 'June' else []
    # Read just the header line to decide which columns to keep.
    with open(file, 'r') as _file:
        header_line = _file.readline().strip()
    headings = [h.strip('"') for h in header_line.split('\t')]
    if suffix:  # keep only columns ending with the suffix
        headings = [h for h in headings if h.endswith(suffix)]
    for req in contains:
        headings = [h for h in headings if req in h]
    for req in not_contains:
        headings = [h for h in headings if req not in h]
    # (Removed dead code: an unused `final_col` list and a discarded
    # df.head() call.)
    df = pd.read_csv(file, sep='\t', header=0, index_col=3)
    df = df[headings]
    # Remove the "Total Intensity" part of the column names.
    df.rename(columns={c: c.split(' ')[0] for c in df.columns.values}, inplace=True)
    return df
def download_file(download_to_path="data/datafile.txt", url='',
                  password_file_path="data/password.txt", redownload=False):
    """Download a file from a given url to the specified location.

    Handles BYU Box shared links that may be password protected: if the
    landing page contains a request token, the password is read from
    `password_file_path` and POSTed along with that token.

    Parameters:
        download_to_path (str): local path to save the file to.
        url (str): the shared link to download from.
        password_file_path (str): path of a file holding the Box password.
        redownload (bool): if True, download even when the file already exists.

    Returns:
        str | int: the path the file was downloaded to, or 1 on error
        (missing url or missing password file).
    """
    # If we already have the file and no refresh was requested, keep it.
    if path.exists(download_to_path) and not redownload:
        return download_to_path
    if url == '':
        print("URL MUST BE SPECIFIED FOR DOWNLOAD")
        return 1
    with requests.Session() as session:  # session object saves cookies between requests
        # Construct the urls for our GET and POST requests.
        get_url = url
        post_url = get_url.replace("https://byu.box.com/shared", "https://byu.app.box.com/public")
        # Initial GET; the page contains a request token iff a password is required.
        get_response = session.get(get_url)
        soup = bs4.BeautifulSoup(get_response.text, "html.parser")
        token_tag = soup.find(id="request_token")
        if token_tag is not None:
            # Password protected: we need the local password file.
            if not path.exists(password_file_path):
                print("MISSING PASSWORD FILE")
                return 1
            with open(password_file_path, 'r') as password_file:
                password = password_file.read().strip()
            token = token_tag.get("value")
            # POST the password and token to get the data.
            payload = {
                'password': password,
                'request_token': token}
            response = session.post(post_url, data=payload)
        else:
            # No password needed; a plain GET (following redirects) suffices.
            response = session.get(post_url, allow_redirects=True)
        # BUGFIX: the old `for i in range(2)` retry loop re-ran the whole
        # password flow and downloaded the file twice; download exactly once.
        with open(download_to_path, 'wb') as dest:
            dest.write(response.content)
    return download_to_path
def load_fasta():
    """Parse the UniProt fasta file and return a Series mapping protein ID -> name.

    Header lines are formatted like:
    >sp|Q96IY4|CBPB2_HUMAN Carboxypeptidase B2 OS=Homo sapiens OX=9606 GN=CPB2 PE=1 SV=2
    MKLCS...

    Returns:
        pandas.Series: indexed by UniProt accession (e.g. 'Q96IY4'), values
        are the entry name + description (e.g. 'CBPB2_HUMAN Carboxypeptidase B2').
    """
    file = "data/uniprot-filtered-proteome_3AUP000005640_reviewed_human.fasta"
    headings = {}
    with open(file) as f:
        for line in f:
            if line.startswith('>'):  # header line
                fields = line.split('|')
                ID = fields[1]
                # Take everything before the ' OS=' field as the name.
                # BUGFIX: the old `.split('=')[0].strip('OS')` stripped any
                # leading/trailing 'O'/'S' *characters*, so e.g.
                # 'SYUA_HUMAN ...' lost its leading 'S', and it left a
                # trailing space on every name.
                name = fields[2].split(' OS=')[0]
                headings[ID] = name
    return pd.Series(headings)
def names_max_quant():
    """Return 'Gene names' and 'Fasta headers' for each MaxQuant protein group.

    Returns:
        pandas.DataFrame: indexed by 'Protein IDs', with the gene-name and
        fasta-header columns from the downloaded proteinGroups file.
    """
    # BUGFIX: download_file has no 'url_file_path' parameter (the old call
    # raised TypeError). Read the url from the url file ourselves, the same
    # way get_file does, and pass it as `url`.
    with open("data/proteinGroups_url.txt", 'r') as url_file:
        url = url_file.read().strip()
    file = download_file(download_to_path="data/proteinGroups.txt", url=url)
    df = pd.read_csv(file, sep='\t', header=0, index_col=0,
                     usecols=['Protein IDs', 'Gene names', 'Fasta headers'])
    return df
def names_FragPipe(month='June', contains=['Subject1']):
    """Return 'Gene Names' and 'Description' for each FragPipe protein.

    Parameters:
        month (str): which month's combined_protein file to load.
        contains (list[str]): accepted for interface parity with
            load_FragPipe but currently unused in this function.

    Returns:
        pandas.DataFrame: indexed by 'Protein ID', with the gene-name and
        description columns from the downloaded file.
    """
    file_name = "data/combined_protein_{0}_FP.tsv".format(month)
    url_file_path = "data/combined_protein_{0}_FP_url.txt".format(month)
    # BUGFIX: download_file has no 'url_file_path' parameter (the old call
    # raised TypeError). Read the url from the url file and pass it as `url`.
    with open(url_file_path, 'r') as url_file:
        url = url_file.read().strip()
    file = download_file(download_to_path=file_name, url=url)
    df = pd.read_csv(file, sep='\t', header=0, index_col=0,
                     usecols=['Protein ID', 'Gene Names', 'Description'])
    return df