-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdataPreprocessing.py
More file actions
196 lines (146 loc) · 7.58 KB
/
dataPreprocessing.py
File metadata and controls
196 lines (146 loc) · 7.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import sys
import os
sys.path.append("../")
sys.path.append("../..")
class DataPreprocessing:
    '''This class has interfaces of Data Preprocessing.

    **Data Preprocessing Modules**::

            Refine Data, Remove Outlier, Impute Missing Data

    Every ``get_*`` method returns its result and also stores it on the
    instance (``refinedData``, ``datawithMoreCertainNaN``,
    ``datawithMoreUnCertainNaN``, ``imputedData``).
    '''

    def __init__(self):
        pass

    @staticmethod
    def _is_enabled(section):
        """Tell whether a parameter section switches its module on.

        :param section: one sub-dictionary of a parameter set; must contain 'flag'
        :type section: json

        :return: result of comparing ``section['flag']`` with ``True``
        :rtype: Boolean
        """
        # Deliberately an equality test rather than a truthiness test:
        # a truthy value such as the string 'False' must not enable a module.
        return section['flag'] == True  # noqa: E712

    def get_refinedData(self, data, refine_param):
        """
        This function gets refined data with static frequency, without redundancy data.
        It refines a copy of the input adaptively depending on flag status. (removeDuplication, staticFrequency)

        "removeDuplication" : It removes duplicated data.
        "staticFrequency" : The data will have a constant timestamp index.

        :param data: input data (it is copied, not modified in place)
        :type data: DataFrame

        :param refine_param: refinement parameter
        :type refine_param: json

            refine_param['removeDuplication']={'flag':(Boolean)}
            refine_param['staticFrequency'] ={'flag':(Boolean), 'frequency':[None|timeinfo]}
            refine_param['staticFrequency']['frequency'] == None (or key omitted) -> infer original frequency and make static time stamp.

        :return: New refined DataFrame output (also stored as ``self.refinedData``)
        :rtype: DataFrame

        Example
        -------
        >>> data = input_data
        >>> from Clust.clust.preprocessing.dataPreprocessing import DataPreprocessing
        >>> refine_param = {'removeDuplication': {'flag': True}, 'staticFrequency': {'flag': True, 'frequency': '1H'}}
        >>> output = DataPreprocessing().get_refinedData(data, refine_param)
        """
        result = data.copy()

        if self._is_enabled(refine_param['removeDuplication']):
            # Imported lazily so the module is only required when the step runs.
            from Clust.clust.preprocessing.refinement import redundancy
            result = redundancy.ExcludeRedundancy().get_result(result)

        static_frequency = refine_param['staticFrequency']
        if self._is_enabled(static_frequency):
            from Clust.clust.preprocessing.refinement import frequency
            # A missing 'frequency' key is handled like an explicit None.
            target_freq = static_frequency.get('frequency')
            result = frequency.RefineFrequency().get_RefinedData(result, target_freq)

        self.refinedData = result
        return self.refinedData

    def get_errorToNaNData(self, data, outlier_param):
        """
        This function gets data with more NaN. This function converts data identified as errors to NaN.
        This module finds fake data generated due to network errors, etc., and converts it to NaN.

        The uncertain-error step is applied to the output of the certain-error step,
        not to the raw input.

        :param data: input data
        :type data: DataFrame

        :param outlier_param: outlier parameter; must contain the keys
            'certainErrorToNaN' and 'unCertainErrorToNaN'
        :type outlier_param: json

        :return: datawithMoreCertainNaN, datawithMoreUnCertainNaN
            (also stored on the instance under the same names)
        :rtype: tuple of two DataFrames

        **Two Outlier Detection Modules**::

                datawithMoreCertainNaN, datawithMoreUnCertainNaN

        ``datawithMoreCertainNaN``: Clear Error to NaN

        ``datawithMoreUnCertainNaN``: UnClear Error to NaN

        Example
        -------
        >>> outlier_param = {'certainErrorToNaN': {'flag': True}, 'unCertainErrorToNaN': {----}}
        >>> datawithMoreCertainNaN, datawithMoreUnCertainNaN = DataPreprocessing().get_errorToNaNData(data, outlier_param)
        """
        from Clust.clust.preprocessing.errorDetection import errorToNaN

        self.datawithMoreCertainNaN = errorToNaN.errorToNaN().getDataWithCertainNaN(
            data, outlier_param['certainErrorToNaN'])
        self.datawithMoreUnCertainNaN = errorToNaN.errorToNaN().getDataWithUncertainNaN(
            self.datawithMoreCertainNaN, outlier_param['unCertainErrorToNaN'])
        return self.datawithMoreCertainNaN, self.datawithMoreUnCertainNaN

    def get_imputedData(self, data, imputation_param):
        """ Get imputed data

        The input is copied first; serial imputation runs only when
        ``imputation_param['serialImputation']['flag']`` equals ``True``.

        :param data: input data
        :type data: DataFrame
        :param imputation_param: imputation parameter
        :type imputation_param: json

        :return: New Dataframe after imputation (also stored as ``self.imputedData``)
        :rtype: DataFrame

        example
            >>> imputation_param = {'serialImputation': {'flag': True, 'imputation_method': [{'min': 0, 'max': 3, 'method': 'KNN', 'parameter': {}}, {'min': 4, 'max': 6, 'method': 'mean', 'parameter': {}}], 'totalNonNanRatio': 80}}
            >>> output = DataPreprocessing().get_imputedData(data, imputation_param)
        """
        self.imputedData = data.copy()

        serial_param = imputation_param['serialImputation']
        if self._is_enabled(serial_param):
            from Clust.clust.preprocessing.imputation import Imputation
            self.imputedData = Imputation.SerialImputation().get_dataWithSerialImputationMethods(
                self.imputedData, serial_param)

        return self.imputedData
# Add New Function
class DataProcessing(DataPreprocessing):
    '''This class provides functions that package the preprocessing procedure
    (refinement -> error to NaN -> imputation) on top of DataPreprocessing.
    '''

    def __init__(self, process_param):
        '''Set process_param related to each preprocessing module.

        :param process_param: must contain the keys 'refine_param',
            'outlier_param' and 'imputation_param'
        :type process_param: json
        '''
        super().__init__()
        self.refine_param = process_param['refine_param']
        self.outlier_param = process_param['outlier_param']
        self.imputation_param = process_param['imputation_param']

    def preprocessing(self, input_data, flag):
        """ Runs the single preprocessing step selected by ``flag``.

        :param input_data: input data
        :type input_data: DataFrame
        :param flag: preprocessing name, one of 'refine', 'errorToNaN', 'imputation', 'all'
        :type flag: string

        :return: output of the selected step. 'refine' and 'imputation' give the
            output of get_refinedData / get_imputedData, 'errorToNaN' gives the
            tuple (datawithMoreCertainNaN, datawithMoreUnCertainNaN), and 'all'
            gives the dictionary produced by all_preprocessing.
        :rtype: DataFrame, tuple or json depending on flag

        :raises ValueError: if ``flag`` is not one of the supported names

        example
            >>> output = DataProcessing(process_param).preprocessing(data, 'refine')
        """
        if flag == 'refine':
            return self.get_refinedData(input_data, self.refine_param)
        if flag == 'errorToNaN':
            return self.get_errorToNaNData(input_data, self.outlier_param)
        if flag == 'imputation':
            return self.get_imputedData(input_data, self.imputation_param)
        if flag == 'all':
            return self.all_preprocessing(input_data)
        # Fail with an explicit message instead of falling through to an
        # unbound local variable.
        raise ValueError(
            "Unknown preprocessing flag {!r}; expected one of "
            "'refine', 'errorToNaN', 'imputation', 'all'".format(flag))

    def all_preprocessing(self, input_data):
        """ Produces partial Processing data depending on process_param

        The steps are chained: the refined data feeds the error-to-NaN step,
        and its uncertain-error output feeds the imputation step.

        :param input_data: input data
        :type input_data: DataFrame

        :return: every intermediate result, under the keys 'original',
            'refined_data', 'datawithMoreCertainNaN', 'datawithMoreUnCertainNaN'
            and 'imputed_data'
        :rtype: json (key: process name, value : output DataFrame)

        example
            >>> output = DataProcessing(process_param).all_preprocessing(data)
        """
        # Step 1: refinement
        refined_data = self.get_refinedData(input_data, self.refine_param)
        # Step 2: error detection (certain, then uncertain)
        datawithMoreCertainNaN, datawithMoreUnCertainNaN = self.get_errorToNaNData(
            refined_data, self.outlier_param)
        # Step 3: imputation of the data carrying both kinds of NaN
        imputed_data = self.get_imputedData(datawithMoreUnCertainNaN, self.imputation_param)

        return {
            'original': input_data,
            'refined_data': refined_data,
            'datawithMoreCertainNaN': datawithMoreCertainNaN,
            'datawithMoreUnCertainNaN': datawithMoreUnCertainNaN,
            'imputed_data': imputed_data,
        }

    ## Get Multiple output
    def multiDataset_all_preprocessing(self, multiple_dataset):
        """ Produces multiple DataFrame Processing result depending on process_param

        :param multiple_dataset: datasets to process, keyed by name
        :type multiple_dataset: json (having DataFrame value)

        :return: for every key of the input, the all_preprocessing result of its DataFrame
        :rtype: json (value : all_preprocessing output)

        example
            >>> output = DataProcessing(process_param).multiDataset_all_preprocessing(multiple_dataset)
        """
        return {
            key: self.all_preprocessing(dataset)
            for key, dataset in multiple_dataset.items()
        }