Skip to content

Commit 4af2419

Browse files
authored
support handling large time series with high sampling rates (#13)
Changes: add changed files; fix lint issues; test using Python 3.8.
1 parent e321b3f commit 4af2419

File tree

12 files changed

+360
-258
lines changed

12 files changed

+360
-258
lines changed

.github/workflows/latest-dependencies.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ jobs:
88
runs-on: ubuntu-latest
99
steps:
1010
- uses: actions/checkout@v2
11-
- name: Set up Python 3.7
11+
- name: Set up Python 3.8
1212
uses: actions/setup-python@v2
1313
with:
14-
python-version: 3.7
14+
python-version: 3.8
1515
- name: Update dependencies
1616
run: |
1717
python -m pip install --upgrade pip

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ jobs:
1111
runs-on: ubuntu-latest
1212
steps:
1313
- uses: actions/checkout@v2
14-
- name: Set up Python 3.7
14+
- name: Set up Python 3.8
1515
uses: actions/setup-python@v2
1616
with:
17-
python-version: 3.7
17+
python-version: 3.8
1818
- name: Install dependencies
1919
run: |
2020
python -m pip install --upgrade pip

.github/workflows/minimum.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
mongodb-version: [3.6]
1717
steps:

.github/workflows/restapi.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
mongodb-version: [3.6]
1717
steps:

.github/workflows/unit.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
steps:
1717
- uses: actions/checkout@v2

setup.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919

2020
install_requires = [
2121
# Sintel
22-
'orion-ml>=0.2.0,<1',
23-
22+
# 'orion-ml>=0.4.0',
23+
'orion-ml@git+https://git@github.com/sintel-dev/Orion.git',
2424
# General
2525
'termcolor==1.1.0',
2626
'PyYAML==5.1',
@@ -29,7 +29,7 @@
2929

3030
# Auth
3131
'oauthlib==3.1.0',
32-
'pyOpenSSL==19.1.0',
32+
'pyOpenSSL==23.0.0',
3333

3434
# Math
3535
'pyts==0.10.0',
@@ -41,7 +41,7 @@
4141
'itsdangerous==2.0.1',
4242
'MarkupSafe==2.0.1',
4343
'requests==2.24.0',
44-
'Werkzeug==0.15.3',
44+
'Werkzeug==0.15.5',
4545
'gevent>=21.12.0',
4646
'flasgger==0.9.5',
4747
'Jinja2>=2.10,<3.1',
@@ -97,7 +97,8 @@
9797
'Natural Language :: English',
9898
'Programming Language :: Python :: 3',
9999
'Programming Language :: Python :: 3.6',
100-
'Programming Language :: Python :: 3.7'
100+
'Programming Language :: Python :: 3.7',
101+
'Programming Language :: Python :: 3.8'
101102
],
102103
description=("Sintel (Signal Intelligence): A Machine Learning Framework"
103104
"to Extract Insights from Signals"),
@@ -119,7 +120,7 @@
119120
keywords='sintel',
120121
name='sintel',
121122
packages=find_packages(include=['sintel', 'sintel.*']),
122-
python_requires='>=3.6, <3.8',
123+
python_requires='>=3.6, <3.9',
123124
test_suite='tests',
124125
url='https://github.com/sintel-dev/sintel',
125126
version='0.1.0.dev0',

sintel/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from gridfs import GridFS
99
from mongoengine import connect
1010
from pymongo import MongoClient
11+
from pymongo.database import Database
1112
from termcolor import colored
1213

1314
from sintel import g
@@ -26,9 +27,12 @@ def __init__(self, cf, docker=False):
2627
if not docker:
2728
self._db = connect(db=cf['db'], host=cf['host'], port=cf['port'],
2829
username=cf['username'], password=cf['password'])
30+
_fs = GridFS(Database(self._db, cf['db']))
2931
else:
3032
self._db = connect(db=cf['dk_db'], host=cf['dk_host'], port=cf['dk_port'],
3133
username=cf['dk_username'], password=cf['dk_password'])
34+
_fs = GridFS(Database(self._db, cf['dk_db']))
35+
g['_fs'] = _fs
3236

3337
def _init_flask_app(self, env):
3438
app = Flask(

sintel/db/explorer.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import logging
88
import os
9+
import pickle
910
from datetime import datetime, timezone
1011

1112
import numpy as np
@@ -18,6 +19,7 @@
1819
from pymongo.database import Database
1920
from sklearn.impute import SimpleImputer
2021

22+
from sintel import g
2123
from sintel.data import load_signal
2224
from sintel.db import schema
2325

@@ -1142,42 +1144,49 @@ def get_prediction(cls, signalrun, start_time=None, stop_time=None):
11421144
"""
11431145

11441146
signalrun_doc = schema.Signalrun.find_one(signalrun=signalrun)
1145-
signal_doc = signalrun_doc.signal
1146-
1147-
signal_start_year = datetime.utcfromtimestamp(signal_doc.start_time).year
1148-
1149-
if start_time is None:
1150-
start_time = signal_doc.start_time
1151-
if stop_time is None:
1152-
stop_time = signal_doc.stop_time
1153-
1154-
start_dt = datetime.utcfromtimestamp(start_time)
1155-
stop_dt = datetime.utcfromtimestamp(stop_time)
1156-
start_idx = (start_dt.year - signal_start_year) * 12 + start_dt.month
1157-
stop_idx = (stop_dt.year - signal_start_year) * 12 + stop_dt.month
1158-
1159-
pred_docs = schema.Prediction.find(signalrun=signalrun,
1160-
index__gte=start_idx, index__lte=stop_idx)
1161-
pred_docs = pred_docs.order_by('+index')
1162-
1147+
# signal_doc = signalrun_doc.signal
1148+
1149+
# signal_start_year = datetime.utcfromtimestamp(signal_doc.start_time).year
1150+
1151+
# if start_time is None:
1152+
# start_time = signal_doc.start_time
1153+
# if stop_time is None:
1154+
# stop_time = signal_doc.stop_time
1155+
1156+
# start_dt = datetime.utcfromtimestamp(start_time)
1157+
# stop_dt = datetime.utcfromtimestamp(stop_time)
1158+
# start_idx = (start_dt.year - signal_start_year) * 12 + start_dt.month
1159+
# stop_idx = (stop_dt.year - signal_start_year) * 12 + stop_dt.month
1160+
1161+
# pred_docs = schema.Prediction.find(signalrun=signalrun,
1162+
# index__gte=start_idx, index__lte=stop_idx)
1163+
# pred_docs = pred_docs.order_by('+index')
1164+
1165+
# prediction_results = dict()
1166+
# data = list()
1167+
# for idx, doc in enumerate(pred_docs):
1168+
# if idx == 0:
1169+
# # first month
1170+
# prediction_results['attrs'] = doc.attrs
1171+
# for d in doc.data:
1172+
# if d[0] >= start_time and d[0] <= stop_time:
1173+
# data.append(d)
1174+
# elif idx != 0 and idx == len(pred_docs) - 1:
1175+
# # last month but not the first
1176+
# for d in doc.data:
1177+
# if d[0] >= start_time and d[0] <= stop_time:
1178+
# data.append(d)
1179+
# else:
1180+
# data.extend(doc.data)
1181+
1182+
# prediction_results['data'] = data
11631183
prediction_results = dict()
1164-
data = list()
1165-
for idx, doc in enumerate(pred_docs):
1166-
if idx == 0:
1167-
# first month
1168-
prediction_results['attrs'] = doc.attrs
1169-
for d in doc.data:
1170-
if d[0] >= start_time and d[0] <= stop_time:
1171-
data.append(d)
1172-
elif idx != 0 and idx == len(pred_docs) - 1:
1173-
# last month but not the first
1174-
for d in doc.data:
1175-
if d[0] >= start_time and d[0] <= stop_time:
1176-
data.append(d)
1177-
else:
1178-
data.extend(doc.data)
1184+
grid_out_doc = g['_fs'].find_one(
1185+
{'filename': f'sp-{signalrun_doc.id}'}, no_cursor_timeout=True)
1186+
pdata = pickle.loads(grid_out_doc.read())
1187+
prediction_results['attrs'] = pdata['attrs']
1188+
prediction_results['data'] = pdata['data']
11791189

1180-
prediction_results['data'] = data
11811190
return prediction_results
11821191

11831192
# ########## #

sintel/db/utils.py

Lines changed: 63 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -134,42 +134,60 @@ def _inverse_scale_transform(v, a0, b0, a1, b1):
134134
return k * (b1 - a1) + a1
135135

136136

137-
def _split_large_prediction_data(doc, signal):
138-
current_year = -1
139-
current_month = -1
140-
year_month_data = list()
141-
142-
signal_start_dt = datetime.utcfromtimestamp(signal.start_time)
143-
144-
for d in doc['data']:
145-
dt = datetime.utcfromtimestamp(d[0])
146-
y_idx = dt.year - signal_start_dt.year
147-
m_idx = dt.month
148-
index = y_idx * 12 + (m_idx - 1)
149-
if (dt.year != current_year or current_month != dt.month):
150-
if len(year_month_data) > 0:
151-
pred_doc = {
152-
'signalrun': doc['signalrun'],
153-
'attrs': doc['attrs'],
154-
'index': index,
155-
'data': year_month_data
156-
}
157-
schema.Prediction.insert(**pred_doc)
158-
year_month_data = list()
159-
current_year = dt.year
160-
current_month = dt.month
161-
162-
year_month_data.append(d)
163-
164-
# handle the last one
165-
if len(year_month_data) > 0:
166-
pred_doc = {
167-
'signalrun': doc['signalrun'],
168-
'attrs': doc['attrs'],
169-
'index': index,
170-
'data': year_month_data
171-
}
172-
schema.Prediction.insert(**pred_doc)
137+
def _split_large_prediction_data(doc, signalrun):
138+
139+
# save as gridfs
140+
kwargs = {
141+
"filename": f'sp-{signalrun.id}',
142+
"variable": 'prediction doc'
143+
}
144+
with g_fs.new_file(**kwargs) as f:
145+
pickle.dump(doc, f)
146+
147+
return
148+
# test load
149+
# for grid_out in g_fs.find({'filename': f'sp-{signalrun.id}'}, no_cursor_timeout=True):
150+
# daa = pickle.loads(grid_out.read())
151+
# print(daa.keys())
152+
# grid_out_doc = g_fs.find_one({'filename': f'sp-{signalrun.id}'}, no_cursor_timeout=True)
153+
# daa = pickle.loads(grid_out_doc.read())
154+
# print(daa.keys())
155+
156+
# current_year = -1
157+
# current_month = -1
158+
# year_month_data = list()
159+
160+
# signal_start_dt = datetime.utcfromtimestamp(signalrun.signal.start_time)
161+
162+
# for d in doc['data']:
163+
# dt = datetime.utcfromtimestamp(d[0])
164+
# y_idx = dt.year - signal_start_dt.year
165+
# m_idx = dt.month
166+
# index = y_idx * 12 + (m_idx - 1)
167+
# if (dt.year != current_year or current_month != dt.month):
168+
# if len(year_month_data) > 0:
169+
# pred_doc = {
170+
# 'signalrun': doc['signalrun'],
171+
# 'attrs': doc['attrs'],
172+
# 'index': index,
173+
# 'data': year_month_data
174+
# }
175+
# schema.Prediction.insert(**pred_doc)
176+
# year_month_data = list()
177+
# current_year = dt.year
178+
# current_month = dt.month
179+
180+
# year_month_data.append(d)
181+
182+
# # handle the last one
183+
# if len(year_month_data) > 0:
184+
# pred_doc = {
185+
# 'signalrun': doc['signalrun'],
186+
# 'attrs': doc['attrs'],
187+
# 'index': index,
188+
# 'data': year_month_data
189+
# }
190+
# schema.Prediction.insert(**pred_doc)
173191

174192

175193
def _update_prediction(signalrun, v, stock=False):
@@ -288,7 +306,7 @@ def _update_prediction(signalrun, v, stock=False):
288306
'data': data_
289307
}
290308

291-
_split_large_prediction_data(doc, signalrun.signal)
309+
_split_large_prediction_data(doc, signalrun)
292310
except Exception as e:
293311
print(e)
294312

@@ -303,13 +321,15 @@ def _update_period(signalrun, v, stock=False):
303321
# optimal interval for periodical description
304322
diff = (v['raw_index'][1] - v['raw_index'][0]) / 60
305323
my_interval = 1440
306-
for interval in [30, 60, 120, 180, 240, 360, 480, 720]:
324+
for interval in [6, 30, 60, 120, 180, 240, 360, 480, 720]:
307325
if diff <= interval:
308326
my_interval = interval
309327
break
310328

311329
day_bin_num = 24 * 60 // my_interval
312330

331+
print(f'*update period* my_interval: {my_interval}m, day_bin_num: {day_bin_num}')
332+
313333
docs = []
314334
# year
315335
for y in range(year_start, year_end + 1):
@@ -348,11 +368,12 @@ def _update_period(signalrun, v, stock=False):
348368
schema.Period.insert_many(docs)
349369

350370

351-
def _update_raw(signal, interval=21600, method=['mean'], stock=False):
371+
def _update_raw(signal, interval=360, method=['mean'], stock=False):
352372
# interval should be changed case by case
353373
# ses -> 360 seconds
354374
# nasa -> 4 hours
355375
# stock -> 1 day
376+
print(f'*update raw* interval: {interval}s')
356377
X = load_signal(signal.data_location, timestamp_column=signal.timestamp_column,
357378
value_column=signal.value_column, stock=stock)
358379

@@ -428,6 +449,9 @@ def _update_raw(signal, interval=21600, method=['mean'], stock=False):
428449

429450
def update_db(fs, exp_filter=None, stock=False):
430451

452+
global g_fs
453+
g_fs = fs
454+
431455
# get signalrun list
432456

433457
# TODO: remove utc setting, it should be always True

0 commit comments

Comments (0)