-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclasses.py
More file actions
133 lines (125 loc) · 6.27 KB
/
classes.py
File metadata and controls
133 lines (125 loc) · 6.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import subprocess

import networkx as nx
import numpy as np
import pandas as pd
class datab:
    """Hold an OTU occurrence table and the correlation network derived from it.

    The table is read from a csv file whose columns are OTUs (or another
    taxonomic grouping) and whose rows are metagenomic samples.

    Attributes:
        samples: pandas DataFrame with the occurrence data (None when the
            object is created empty).
        filename: path of the source file without its extension; used as the
            prefix for the files written by ``get_sparcc_matrix``.
        description: dict that records the operations applied so far, with
            the keys
            "Classification level" (otu, phylum, family, class, order, ...),
            "Filtering procedure" (log of the column filters applied),
            "Normalization" (how the data were normalized),
            "Correlation" (how the correlation matrix was obtained),
            "Graph filtering procedure" (how the graph was pruned).
        c_matrix: pandas DataFrame with the correlation matrix, or None.
        graph: networkx graph built from ``c_matrix``, or None.
    """

    def __init__(self, file_name=None):
        """Create the object, optionally loading an occurrence csv file.

        Args:
            file_name: path to the occurrence file (first column is used as
                the index). When omitted an empty object is created, to be
                filled later with ``init_form_panda``.
        """
        self.c_matrix = None
        self.graph = None
        if file_name is None:
            self.description = {}
            self.filename = ""
            self.samples = None
        else:
            self.samples = pd.read_csv(file_name, index_col=0)
            # splitext only strips the final extension, so dots in directory
            # names (e.g. "../data/otu.csv") do not truncate the path.
            self.filename = os.path.splitext(file_name)[0]
            self.description = {
                "Classification level": "otu",
                "Filtering procedure": "",
                "Normalization": "nothing",
                "Correlation": "not yet",
                # Same key that make_graph() writes.
                "Graph filtering procedure": "not yet",
            }

    def init_form_panda(self, samples, description, filename="no_file"):
        """Fill an empty object from in-memory values.

        Args:
            samples: pandas DataFrame with the occurrence data.
            description: description dict for the new object.
            filename: optional name tying the data to an external file.
        """
        self.samples = samples
        self.description = description
        self.filename = filename

    def _log_filter(self, message):
        """Append ``message`` to the filtering log, creating the entry if missing."""
        previous = self.description.get("Filtering procedure", "")
        self.description["Filtering procedure"] = previous + message

    def get_info(self):
        """Print a summary of the object and return graph centralities.

        Returns:
            dict with "Nodes" (betweenness centrality per node) and "Edges"
            (betweenness centrality per edge) when a graph has been built,
            otherwise an empty dict.
        """
        graph_info = {}
        print("Samples informations")
        if self.samples is None:
            print("No samples loaded")
        else:
            # DataFrame.info() prints by itself and returns None.
            self.samples.info()
        print("Pipline history")
        print(self.description)
        if self.graph is None:
            # Message: "graph not built yet".
            print("Grafico non ancora costruito")
        else:
            # Message: "returning the following information about the network".
            print("Returno le seguenti informazioni sul network")
            print("Nodes info : {names : betweenness centrality}")
            graph_info["Nodes"] = nx.betweenness_centrality(self.graph)
            graph_info["Edges"] = nx.edge_betweenness_centrality(self.graph)
        return graph_info

    def filter_median(self, m):
        """Drop every column whose median is below ``m``.

        Returns:
            DataFrame holding a copy of the removed columns.
        """
        medians = self.samples.median()
        dropped = medians.index[medians < m].tolist()
        removed = self.samples[dropped].copy()
        self.samples = self.samples.drop(columns=dropped)
        self._log_filter("Filtered out clumn with median under " + str(m) + "\n")
        return removed

    def filter_prevalence(self, p):
        """Drop every column that has less than ``p`` percent non-zero values.

        Returns:
            DataFrame holding a copy of the removed columns.
        """
        # Maximum number of zeros a column may contain and still be kept.
        limit = (100 - p) * self.samples.shape[0] / 100
        zero_counts = (self.samples == 0).sum(axis=0)
        dropped = zero_counts.index[zero_counts > limit].tolist()
        removed = self.samples[dropped].copy()
        self.samples = self.samples.drop(columns=dropped)
        self._log_filter("Filtered out column with more then " + str(p) + "% 0 values\n")
        return removed

    def get_sparcc_matrix(self, iterations, sparcc_script="../SparCC3/SparCC.py"):
        """Compute the correlation matrix with SparCC and store it in ``c_matrix``.

        The transposed table is written to ``<filename>_sparcc_utility.tsv``,
        the SparCC script is run on it with ``python3``, and the result is
        read back from ``<filename>_sparcc_cor_matrix.tsv``.

        Args:
            iterations: number of SparCC iterations (``-i`` option).
            sparcc_script: path to ``SparCC.py``.

        Returns:
            The correlation matrix, or None when the data have already been
            normalized (in that case nothing is run).

        Raises:
            subprocess.CalledProcessError: if the SparCC process fails.
        """
        if self.description.get("Normalization", "nothing") != "nothing":
            # Message: "for this method the data must not be normalized".
            print("Per questo metodo, i dati non devono essere normalizzati")
            return None
        prefix = self.filename + "_sparcc"
        counts_path = prefix + "_utility.tsv"
        cor_path = prefix + "_cor_matrix.tsv"
        self.samples.T.to_csv(counts_path, sep="\t", encoding="utf-8")
        # Argument list (no shell): paths with spaces or shell metacharacters
        # are passed through verbatim, and a failing run raises instead of
        # letting a stale matrix file be read below.
        subprocess.run(
            [
                "python3",
                sparcc_script,
                counts_path,
                "-i",
                str(iterations),
                "--cor_file=" + cor_path,
            ],
            check=True,
        )
        self.c_matrix = pd.read_csv(cor_path, sep="\t", index_col=0)
        self.description["Correlation"] = "Correelation evaluated with SparCC"
        return self.c_matrix

    def get_pearson_matrix(self, mode="L1"):
        """Normalize the samples and store their Pearson correlation in ``c_matrix``.

        The normalization replaces ``self.samples`` in place, so calling this
        method again normalizes the already-normalized data.

        Args:
            mode: "L1" (default) divides each sample (row) by its sum;
                "CLR" applies the centered log-ratio to each sample, with
                zeros replaced by 0.5 beforehand.

        Returns:
            The correlation matrix, or 0 when ``mode`` is not supported.
        """
        if mode == "L1":
            row_sums = self.samples.sum(axis=1)
            self.samples = self.samples.div(row_sums, axis=0)
        elif mode == "CLR":
            # Pseudo-count for zeros so that the logarithm stays finite.
            positive = self.samples.astype(float).replace(0.0, 0.5)
            log_values = np.log(positive)
            # log(x / G) == log(x) - log(G); the log of the geometric mean G
            # is the mean of the logs, which avoids overflowing a raw product.
            log_gmean = log_values.sum(axis=1) / log_values.shape[1]
            self.samples = log_values.sub(log_gmean, axis=0)
        else:
            print("normalization mode not implemented")
            return 0
        self.description["Normalization"] = mode
        self.c_matrix = self.samples.corr(method="pearson")
        self.description["Correlation"] = "Correlation evalueted with Pearson"
        return self.c_matrix

    def make_graph(self, density):
        """Build ``graph`` from ``c_matrix`` and prune it to the given density.

        ``c_matrix`` is used as a weighted adjacency matrix of an undirected
        graph. Self loops are removed, then edges are removed in order of
        increasing absolute weight until the graph density is not greater
        than ``density`` (or no edges are left).

        Raises:
            ValueError: if no correlation matrix has been computed yet.
        """
        if self.c_matrix is None:
            raise ValueError("c_matrix is not set: compute a correlation matrix first")
        self.graph = nx.from_pandas_adjacency(self.c_matrix)
        self.graph.remove_edges_from(nx.selfloop_edges(self.graph))
        weakest_first = sorted(
            self.graph.edges(data="weight"), key=lambda edge: abs(edge[2])
        )
        for source, target, _weight in weakest_first:
            if nx.density(self.graph) <= density:
                break
            self.graph.remove_edge(source, target)
        self.description["Graph filtering procedure"] = (
            "Removing self loops\nRemoving edges untill i get the density of "
            + str(density)
            + "\n"
        )