Skip to content

Commit 4af2419

Browse files
authored
support handling large time series with high sampling rates (#13)
Changes: add changed files; fix lint issues; test using Python 3.8.
1 parent e321b3f commit 4af2419

File tree

12 files changed

+360
-258
lines changed

12 files changed

+360
-258
lines changed

.github/workflows/latest-dependencies.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ jobs:
88
runs-on: ubuntu-latest
99
steps:
1010
- uses: actions/checkout@v2
11-
- name: Set up Python 3.7
11+
- name: Set up Python 3.8
1212
uses: actions/setup-python@v2
1313
with:
14-
python-version: 3.7
14+
python-version: 3.8
1515
- name: Update dependencies
1616
run: |
1717
python -m pip install --upgrade pip

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ jobs:
1111
runs-on: ubuntu-latest
1212
steps:
1313
- uses: actions/checkout@v2
14-
- name: Set up Python 3.7
14+
- name: Set up Python 3.8
1515
uses: actions/setup-python@v2
1616
with:
17-
python-version: 3.7
17+
python-version: 3.8
1818
- name: Install dependencies
1919
run: |
2020
python -m pip install --upgrade pip

.github/workflows/minimum.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
mongodb-version: [3.6]
1717
steps:

.github/workflows/restapi.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
mongodb-version: [3.6]
1717
steps:

.github/workflows/unit.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ${{ matrix.os }}
1212
strategy:
1313
matrix:
14-
python-version: [3.6, 3.7]
14+
python-version: [3.8]
1515
os: [ubuntu-latest]
1616
steps:
1717
- uses: actions/checkout@v2

setup.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919

2020
install_requires = [
2121
# Sintel
22-
'orion-ml>=0.2.0,<1',
23-
22+
# 'orion-ml>=0.4.0',
23+
'orion-ml@git+https://git@github.com/sintel-dev/Orion.git',
2424
# General
2525
'termcolor==1.1.0',
2626
'PyYAML==5.1',
@@ -29,7 +29,7 @@
2929

3030
# Auth
3131
'oauthlib==3.1.0',
32-
'pyOpenSSL==19.1.0',
32+
'pyOpenSSL==23.0.0',
3333

3434
# Math
3535
'pyts==0.10.0',
@@ -41,7 +41,7 @@
4141
'itsdangerous==2.0.1',
4242
'MarkupSafe==2.0.1',
4343
'requests==2.24.0',
44-
'Werkzeug==0.15.3',
44+
'Werkzeug==0.15.5',
4545
'gevent>=21.12.0',
4646
'flasgger==0.9.5',
4747
'Jinja2>=2.10,<3.1',
@@ -97,7 +97,8 @@
9797
'Natural Language :: English',
9898
'Programming Language :: Python :: 3',
9999
'Programming Language :: Python :: 3.6',
100-
'Programming Language :: Python :: 3.7'
100+
'Programming Language :: Python :: 3.7',
101+
'Programming Language :: Python :: 3.8'
101102
],
102103
description=("Sintel (Signal Intelligence): A Machine Learning Framework"
103104
"to Extract Insights from Signals"),
@@ -119,7 +120,7 @@
119120
keywords='sintel',
120121
name='sintel',
121122
packages=find_packages(include=['sintel', 'sintel.*']),
122-
python_requires='>=3.6, <3.8',
123+
python_requires='>=3.6, <3.9',
123124
test_suite='tests',
124125
url='https://github.com/sintel-dev/sintel',
125126
version='0.1.0.dev0',

sintel/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from gridfs import GridFS
99
from mongoengine import connect
1010
from pymongo import MongoClient
11+
from pymongo.database import Database
1112
from termcolor import colored
1213

1314
from sintel import g
@@ -26,9 +27,12 @@ def __init__(self, cf, docker=False):
2627
if not docker:
2728
self._db = connect(db=cf['db'], host=cf['host'], port=cf['port'],
2829
username=cf['username'], password=cf['password'])
30+
_fs = GridFS(Database(self._db, cf['db']))
2931
else:
3032
self._db = connect(db=cf['dk_db'], host=cf['dk_host'], port=cf['dk_port'],
3133
username=cf['dk_username'], password=cf['dk_password'])
34+
_fs = GridFS(Database(self._db, cf['dk_db']))
35+
g['_fs'] = _fs
3236

3337
def _init_flask_app(self, env):
3438
app = Flask(

sintel/db/explorer.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import logging
88
import os
9+
import pickle
910
from datetime import datetime, timezone
1011

1112
import numpy as np
@@ -18,6 +19,7 @@
1819
from pymongo.database import Database
1920
from sklearn.impute import SimpleImputer
2021

22+
from sintel import g
2123
from sintel.data import load_signal
2224
from sintel.db import schema
2325

@@ -1142,42 +1144,49 @@ def get_prediction(cls, signalrun, start_time=None, stop_time=None):
11421144
"""
11431145

11441146
signalrun_doc = schema.Signalrun.find_one(signalrun=signalrun)
1145-
signal_doc = signalrun_doc.signal
1146-
1147-
signal_start_year = datetime.utcfromtimestamp(signal_doc.start_time).year
1148-
1149-
if start_time is None:
1150-
start_time = signal_doc.start_time
1151-
if stop_time is None:
1152-
stop_time = signal_doc.stop_time
1153-
1154-
start_dt = datetime.utcfromtimestamp(start_time)
1155-
stop_dt = datetime.utcfromtimestamp(stop_time)
1156-
start_idx = (start_dt.year - signal_start_year) * 12 + start_dt.month
1157-
stop_idx = (stop_dt.year - signal_start_year) * 12 + stop_dt.month
1158-
1159-
pred_docs = schema.Prediction.find(signalrun=signalrun,
1160-
index__gte=start_idx, index__lte=stop_idx)
1161-
pred_docs = pred_docs.order_by('+index')
1162-
1147+
# signal_doc = signalrun_doc.signal
1148+
1149+
# signal_start_year = datetime.utcfromtimestamp(signal_doc.start_time).year
1150+
1151+
# if start_time is None:
1152+
# start_time = signal_doc.start_time
1153+
# if stop_time is None:
1154+
# stop_time = signal_doc.stop_time
1155+
1156+
# start_dt = datetime.utcfromtimestamp(start_time)
1157+
# stop_dt = datetime.utcfromtimestamp(stop_time)
1158+
# start_idx = (start_dt.year - signal_start_year) * 12 + start_dt.month
1159+
# stop_idx = (stop_dt.year - signal_start_year) * 12 + stop_dt.month
1160+
1161+
# pred_docs = schema.Prediction.find(signalrun=signalrun,
1162+
# index__gte=start_idx, index__lte=stop_idx)
1163+
# pred_docs = pred_docs.order_by('+index')
1164+
1165+
# prediction_results = dict()
1166+
# data = list()
1167+
# for idx, doc in enumerate(pred_docs):
1168+
# if idx == 0:
1169+
# # first month
1170+
# prediction_results['attrs'] = doc.attrs
1171+
# for d in doc.data:
1172+
# if d[0] >= start_time and d[0] <= stop_time:
1173+
# data.append(d)
1174+
# elif idx != 0 and idx == len(pred_docs) - 1:
1175+
# # last month but not the first
1176+
# for d in doc.data:
1177+
# if d[0] >= start_time and d[0] <= stop_time:
1178+
# data.append(d)
1179+
# else:
1180+
# data.extend(doc.data)
1181+
1182+
# prediction_results['data'] = data
11631183
prediction_results = dict()
1164-
data = list()
1165-
for idx, doc in enumerate(pred_docs):
1166-
if idx == 0:
1167-
# first month
1168-
prediction_results['attrs'] = doc.attrs
1169-
for d in doc.data:
1170-
if d[0] >= start_time and d[0] <= stop_time:
1171-
data.append(d)
1172-
elif idx != 0 and idx == len(pred_docs) - 1:
1173-
# last month but not the first
1174-
for d in doc.data:
1175-
if d[0] >= start_time and d[0] <= stop_time:
1176-
data.append(d)
1177-
else:
1178-
data.extend(doc.data)
1184+
grid_out_doc = g['_fs'].find_one(
1185+
{'filename': f'sp-{signalrun_doc.id}'}, no_cursor_timeout=True)
1186+
pdata = pickle.loads(grid_out_doc.read())
1187+
prediction_results['attrs'] = pdata['attrs']
1188+
prediction_results['data'] = pdata['data']
11791189

1180-
prediction_results['data'] = data
11811190
return prediction_results
11821191

11831192
# ########## #

sintel/db/utils.py

Lines changed: 63 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -134,42 +134,60 @@ def _inverse_scale_transform(v, a0, b0, a1, b1):
134134
return k * (b1 - a1) + a1
135135

136136

137-
def _split_large_prediction_data(doc, signal):
138-
current_year = -1
139-
current_month = -1
140-
year_month_data = list()
141-
142-
signal_start_dt = datetime.utcfromtimestamp(signal.start_time)
143-
144-
for d in doc['data']:
145-
dt = datetime.utcfromtimestamp(d[0])
146-
y_idx = dt.year - signal_start_dt.year
147-
m_idx = dt.month
148-
index = y_idx * 12 + (m_idx - 1)
149-
if (dt.year != current_year or current_month != dt.month):
150-
if len(year_month_data) > 0:
151-
pred_doc = {
152-
'signalrun': doc['signalrun'],
153-
'attrs': doc['attrs'],
154-
'index': index,
155-
'data': year_month_data
156-
}
157-
schema.Prediction.insert(**pred_doc)
158-
year_month_data = list()
159-
current_year = dt.year
160-
current_month = dt.month
161-
162-
year_month_data.append(d)
163-
164-
# handle the last one
165-
if len(year_month_data) > 0:
166-
pred_doc = {
167-
'signalrun': doc['signalrun'],
168-
'attrs': doc['attrs'],
169-
'index': index,
170-
'data': year_month_data
171-
}
172-
schema.Prediction.insert(**pred_doc)
137+
def _split_large_prediction_data(doc, signalrun):
138+
139+
# save as gridfs
140+
kwargs = {
141+
"filename": f'sp-{signalrun.id}',
142+
"variable": 'prediction doc'
143+
}
144+
with g_fs.new_file(**kwargs) as f:
145+
pickle.dump(doc, f)
146+
147+
return
148+
# test load
149+
# for grid_out in g_fs.find({'filename': f'sp-{signalrun.id}'}, no_cursor_timeout=True):
150+
# daa = pickle.loads(grid_out.read())
151+
# print(daa.keys())
152+
# grid_out_doc = g_fs.find_one({'filename': f'sp-{signalrun.id}'}, no_cursor_timeout=True)
153+
# daa = pickle.loads(grid_out_doc.read())
154+
# print(daa.keys())
155+
156+
# current_year = -1
157+
# current_month = -1
158+
# year_month_data = list()
159+
160+
# signal_start_dt = datetime.utcfromtimestamp(signalrun.signal.start_time)
161+
162+
# for d in doc['data']:
163+
# dt = datetime.utcfromtimestamp(d[0])
164+
# y_idx = dt.year - signal_start_dt.year
165+
# m_idx = dt.month
166+
# index = y_idx * 12 + (m_idx - 1)
167+
# if (dt.year != current_year or current_month != dt.month):
168+
# if len(year_month_data) > 0:
169+
# pred_doc = {
170+
# 'signalrun': doc['signalrun'],
171+
# 'attrs': doc['attrs'],
172+
# 'index': index,
173+
# 'data': year_month_data
174+
# }
175+
# schema.Prediction.insert(**pred_doc)
176+
# year_month_data = list()
177+
# current_year = dt.year
178+
# current_month = dt.month
179+
180+
# year_month_data.append(d)
181+
182+
# # handle the last one
183+
# if len(year_month_data) > 0:
184+
# pred_doc = {
185+
# 'signalrun': doc['signalrun'],
186+
# 'attrs': doc['attrs'],
187+
# 'index': index,
188+
# 'data': year_month_data
189+
# }
190+
# schema.Prediction.insert(**pred_doc)
173191

174192

175193
def _update_prediction(signalrun, v, stock=False):
@@ -288,7 +306,7 @@ def _update_prediction(signalrun, v, stock=False):
288306
'data': data_
289307
}
290308

291-
_split_large_prediction_data(doc, signalrun.signal)
309+
_split_large_prediction_data(doc, signalrun)
292310
except Exception as e:
293311
print(e)
294312

@@ -303,13 +321,15 @@ def _update_period(signalrun, v, stock=False):
303321
# optimal interval for periodical description
304322
diff = (v['raw_index'][1] - v['raw_index'][0]) / 60
305323
my_interval = 1440
306-
for interval in [30, 60, 120, 180, 240, 360, 480, 720]:
324+
for interval in [6, 30, 60, 120, 180, 240, 360, 480, 720]:
307325
if diff <= interval:
308326
my_interval = interval
309327
break
310328

311329
day_bin_num = 24 * 60 // my_interval
312330

331+
print(f'*update period* my_interval: {my_interval}m, day_bin_num: {day_bin_num}')
332+
313333
docs = []
314334
# year
315335
for y in range(year_start, year_end + 1):
@@ -348,11 +368,12 @@ def _update_period(signalrun, v, stock=False):
348368
schema.Period.insert_many(docs)
349369

350370

351-
def _update_raw(signal, interval=21600, method=['mean'], stock=False):
371+
def _update_raw(signal, interval=360, method=['mean'], stock=False):
352372
# interval should be changed case by case
353373
# ses -> 360 seconds
354374
# nasa -> 4 hours
355375
# stock -> 1 day
376+
print(f'*update raw* interval: {interval}s')
356377
X = load_signal(signal.data_location, timestamp_column=signal.timestamp_column,
357378
value_column=signal.value_column, stock=stock)
358379

@@ -428,6 +449,9 @@ def _update_raw(signal, interval=21600, method=['mean'], stock=False):
428449

429450
def update_db(fs, exp_filter=None, stock=False):
430451

452+
global g_fs
453+
g_fs = fs
454+
431455
# get signalrun list
432456

433457
# TODO: remove utc setting, it should be always True

0 commit comments

Comments (0)