-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_for_rf.py
More file actions
39 lines (33 loc) · 1.31 KB
/
split_for_rf.py
File metadata and controls
39 lines (33 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import numpy as np
import pickle
# Chromosome label passed to get_gene_annotation() below to select genes.
chrom = "22"
def get_gene_annotation(gene_anot_filepath, chrom, gene_types=("protein_coding",)):
    """Load a tab-separated gene annotation table and filter it.

    Rows are kept when their ``chr`` column equals ``chrom`` and their
    ``gene_type`` column is one of ``gene_types``.

    Args:
        gene_anot_filepath: Path to a tab-separated file that has at least
            the columns ``chr`` and ``gene_type``.
        chrom: Chromosome label to keep; compared by its string form, so
            ``22`` and ``"22"`` are equivalent.
        gene_types: Iterable of ``gene_type`` values to keep.

    Returns:
        A ``pandas.DataFrame`` containing only the matching rows.
    """
    gene_anot = pd.read_csv(gene_anot_filepath, sep="\t")
    # read_csv may parse "chr" as integers (file with only numeric
    # chromosomes, or mixed types from chunked parsing); comparing those
    # against a string would silently match nothing, so compare as strings.
    on_chrom = gene_anot["chr"].astype(str) == str(chrom)
    wanted_type = gene_anot["gene_type"].isin(list(gene_types))
    return gene_anot[on_chrom & wanted_type]
def get_gene_expression(gene_expression_file_name, gene_annot):
    """Load an expression table and keep only the annotated genes.

    The tab-separated file is read with its first column as the index and
    its first row as the header, then transposed, so the file's row labels
    become the columns of the result. Only columns whose label appears in
    ``gene_annot['gene_id']`` are kept.

    Args:
        gene_expression_file_name: Path to the tab-separated expression file.
        gene_annot: Table with a ``gene_id`` column listing the genes to keep.

    Returns:
        A ``pandas.DataFrame`` restricted to the shared gene ids, with the
        columns in the same order as in the expression file.
    """
    expr_df = pd.read_csv(gene_expression_file_name, header=0, index_col=0,
                          delimiter='\t')
    expr_df = expr_df.T
    # Select with a boolean mask rather than through a set intersection:
    # set iteration order for strings changes between interpreter runs, which
    # would make the column order (and anything split by position from it)
    # non-reproducible.
    keep = expr_df.columns.isin(gene_annot['gene_id'])
    expr_df = expr_df.loc[:, keep]
    return expr_df
# Input tables (both tab-separated): the expression matrix and the gene
# annotation used to pick the genes on `chrom`.
gene_expression_file = "Z:/data/mesa_models/meqtl_sorted_AFA_MESA_Epi_GEX_data_sidno_Nk-10.txt"
gene_annotation_file = "Z:/data/mesa_models/gencode.v18.annotation.parsed.txt"

geneannot = get_gene_annotation(gene_annotation_file, chrom)
expr_df = get_gene_expression(gene_expression_file, geneannot)
genes = list(expr_df.columns)

# Split the gene list into 22 consecutive, non-overlapping chunks and pickle
# each one to its own file. Chunks 1..21 hold 11 genes each; chunk 22 takes
# whatever is left.
# NOTE(review): the previous slicing started at index 11, so the first 11
# genes were never written and chunk 21 duplicated the start of chunk 22 —
# confirm that no downstream job relied on the old chunk contents.
chunk_size = 11
n_chunks = 22
for i in range(1, n_chunks + 1):
    start = chunk_size * (i - 1)
    if i == n_chunks:
        # Last chunk: take the remainder instead of a hard-coded end index.
        ls1 = genes[start:]
    else:
        ls1 = genes[start:start + chunk_size]
    # File name is derived from `chrom` so it stays in sync with the genes.
    with open("Z:/data/paper_hyperopt/RF/chr" + str(chrom) + "_" + str(i), "wb") as chunk:
        pickle.dump(ls1, chunk)