onex/ONEXBindings.cpp at master · c2research/onex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#include <boost/python.hpp>
#include <algorithm>
#include "OnlineSession.h"

#include <boost/foreach.hpp>
#include <boost/range/combine.hpp>

// Short alias for the Boost.Python namespace used throughout this file.
namespace py = boost::python;

// Session object that the binding functions in this file forward their calls to.
OnlineSession os;

/**
 * Reduce the number of data points by grouping consecutive points into bins
 * and replacing each bin with the average of its members.
 *
 * The last bin may hold fewer than binSize points when the sequence length is
 * not a multiple of binSize; it is averaged over the points it actually
 * contains, so the tail of the sequence is not scaled down.
 *
 * \param seq     the sequence to be reduced.
 * \param binSize number of data points in a bin. Values smaller than 1 are
 *                treated as 1, which returns the sequence unchanged.
 * \return the reduced sequence (one point per bin).
 */
vector<seqitem_t> reduceSequence(vector<seqitem_t> seq, int binSize) {
  // A non-positive bin size would never advance the scan (and would divide by
  // zero), so fall back to one point per bin.
  const size_t step = binSize < 1 ? 1 : static_cast<size_t>(binSize);
  const size_t total = seq.size();

  vector<seqitem_t> reduced;
  reduced.reserve((total + step - 1) / step);
  for (size_t binStart = 0; binStart < total; binStart += step) {
    // Clip the bin at the end of the sequence.
    const size_t binEnd = (total - binStart < step) ? total : binStart + step;
    seqitem_t binSum = 0;
    for (size_t k = binStart; k < binEnd; ++k) {
      binSum += seq[k];
    }
    // Divide by the real number of members, not by the nominal bin size.
    reduced.push_back(binSum / static_cast<seqitem_t>(binEnd - binStart));
  }
  return reduced;
}

/**
 * Load a dataset from a file.
 *
 * \param path location of the dataset, relative to the server.
 *
 * \return index of the dataset in the dataset list, as reported by
 *         OnlineSession::loaddb.
 */
int loadDataset(const char* path)
{
  return os.loaddb(path);
}

/**
 * Load a dataset from a file whose layout is described by the caller: how many
 * sequences it holds, how long each one is, and how many leading columns of
 * every row should be skipped.
 *
 * \param path             path to the dataset.
 * \param seqCount         number of time series sequences in the dataset.
 * \param seqLength        length of each sequence.
 * \param firstColumnsDrop number of starting columns to drop at each row.
 *
 * \return index of the dataset in the dataset list, as reported by
 *         OnlineSession::loadolddb.
 */
int loadDatasetWithParams(const char* path, int seqCount, int seqLength, int firstColumnsDrop)
{
  return os.loadolddb(path, seqCount, seqLength, firstColumnsDrop);
}

/**
 * Unload a dataset from the session.
 *
 * \param index index of the dataset in the dataset list.
 * \return the value returned by OnlineSession::killdb.
 */
int unloadDataset(int index)
{
  const int outcome = os.killdb(index);
  return outcome;
}

/**
 * Normalize a dataset.
 *
 * \param index index of the dataset in the dataset list.
 * \return a Python tuple holding the two values returned by
 *         OnlineSession::normalize, in the same order.
 *         NOTE(review): presumably the value range used for the
 *         normalization - confirm against OnlineSession::normalize.
 */
py::tuple normalizeDataset(int index)
{
  const pair<seqitem_t, seqitem_t> outcome = os.normalize(index);
  return py::make_tuple(outcome.first, outcome.second);
}

/**
 * Group the contents of a dataset.
 *
 * \param index index of the dataset in the dataset list.
 * \param ST    the ST value handed to OnlineSession::initdbgroups.
 * \return number of groups.
 */
int groupDataset(int index, double ST)
{
  const int groupCount = os.initdbgroups(index, ST);
  return groupCount;
}

/**
  * Find the subsequence of a dataset that is most similar to a query.
  *
  * \param dbIndex index of the dataset in which the match is searched for.
  * \param qIndex  index of the dataset containing the query.
  * \param qSeq    index of the sequence containing the query.
  * \param qStart  starting position of the query in the sequence.
  * \param qEnd    ending position of the query in the sequence.
  * \param strat   search strategy, one of:
  *                  INTERMIX      = 0
  *                  EHIGHER_LOWER = 1
  *                  ELOWER_HIGHER = 2
  *                  EBOTTOM_TOP   = 3
  *                  ETOP_BOTTOM   = 4
  * \param warp    warping argument forwarded unchanged to
  *                OnlineSession::similar.
  *                NOTE(review): presumably the DTW warping window - confirm.
  * \return a Python tuple describing the best match:
  *         (dist, seq, start, end)
  *             dist  - distance from the query to the match
  *             seq   - index of the sequence containing the match in the
  *                     dataset being searched.
  *             start - starting position of the match in the sequence.
  *             end   - ending position of the match in the sequence.
  */
py::tuple findSimilar(int dbIndex, int qIndex, int qSeq,
                      int qStart, int qEnd, int strat, int warp)
{
  TimeInterval queryRange(qStart, qEnd);
  kBest match = os.similar(dbIndex, qIndex, qSeq, queryRange, strat, warp);
  return py::make_tuple(match.dist,
                        match.seq,
                        match.interval.start,
                        match.interval.end);
}

/**
 * Fetch a subsequence of a dataset, optionally compressed by binning.
 *
 * The requested interval is copied point by point, passed through
 * reduceSequence with the given bin size, and returned as a Python list.
 *
 * \param dbIndex index of a dataset in the dataset list.
 * \param dbSeq   index of a sequence in the dataset.
 * \param dbStart starting position of the subsequence in the sequence.
 * \param dbEnd   ending position of the subsequence in the sequence.
 * \param binSize number of data points per bin used for data compression
 *                (defaults to 1 for C++ callers).
 *
 * \return a Python list containing the data points of the (reduced)
 *         subsequence.
 */
py::list getSubsequence(int dbIndex, int dbSeq, int dbStart, int dbEnd, int binSize = 1)
{
  TimeSeriesInterval source = os.getinterval(dbIndex, dbSeq, TimeInterval(dbStart, dbEnd));

  // Copy the interval into a plain vector so it can be handed to reduceSequence.
  vector<seqitem_t> points;
  const int pointCount = source.length();
  int pos = 0;
  while (pos < pointCount) {
    points.push_back(source[pos]);
    ++pos;
  }

  const vector<seqitem_t> binned = reduceSequence(points, binSize);

  py::list result;
  for (vector<seqitem_t>::const_iterator it = binned.begin(); it != binned.end(); ++it) {
    result.append(*it);
  }
  return result;
}

/**
 * Four-argument form of getSubsequence: returns the subsequence with a bin
 * size of 1, i.e. one data point per bin.
 */
py::list getSubsequenceDefault(int dbIndex, int dbSeq, int dbStart, int dbEnd) {
  const int onePointPerBin = 1;
  return getSubsequence(dbIndex, dbSeq, dbStart, dbEnd, onePointPerBin);
}

// py::list getSequences(int dbIndex, py::list indices, int binSize)
// {
//   py::list seqs;
//   int seqLength = os.getdbseqlength(dbIndex);
//   for (int i = 0; i < py::len(indices); i++) {
//     int index = py::extract<int>(indices[i]);
//     py::list seq = getSubsequence(dbIndex, index, 0, seqLength - 1, binSize);
//     seqs.append(seq);
//   }
//   return seqs;
// }

/**
 * Fetch every sequence of a dataset together with the group it belongs to.
 *
 * \param dbIndex index of a dataset in the dataset list.
 * \param binSize number of data points per bin used for data compression.
 * \return a Python list with one tuple (ts, groupId) per sequence, where:
 *         ts:      Python list holding the (reduced) points of the sequence
 *         groupId: second component of OnlineSession::getGroupIndex for the
 *                  full-length interval of that sequence
 */
py::list getAllSequences(int dbIndex, int binSize)
{
  py::list result;
  const int total = os.getdbseqcount(dbIndex);
  // Every sequence is requested over the interval [0, lastPos].
  const int lastPos = os.getdbseqlength(dbIndex) - 1;

  for (int seq = 0; seq < total; ++seq) {
    py::list series = getSubsequence(dbIndex, seq, 0, lastPos, binSize);
    TimeInterval whole(0, lastPos);
    const int groupId = os.getGroupIndex(dbIndex, seq, whole).second;
    result.append(py::make_tuple(series, groupId));
  }
  return result;
}

/**
 * Compute the distance between two subsequences using the "dtw_lp2" metric.
 *
 * \param dbIndexA index of the first dataset in memory
 * \param dbSeqA   index of a sequence in the first dataset
 * \param startA   starting position of the first subsequence
 * \param endA     ending position of the first subsequence
 * \param dbIndexB index of the second dataset in memory
 * \param dbSeqB   index of a sequence in the second dataset
 * \param startB   starting position of the second subsequence
 * \param endB     ending position of the second subsequence
 * \return dtw distance between the two subsequences.
 */
seqitem_t getDistance(int dbIndexA, int dbSeqA, int startA, int endA,
                     int dbIndexB, int dbSeqB, int startB, int endB)
{
  TimeInterval rangeA(startA, endA);
  TimeInterval rangeB(startB, endB);
  // OnlineSession::findDist takes both dataset indices first, then both
  // sequence indices.
  return os.findDist(dbIndexA, dbIndexB, dbSeqA, dbSeqB,
                     rangeA, rangeB,
                     getDistMetric("dtw_lp2"));
}

/**
 * Compute the warping path between two subsequences using the "dtw_lp2"
 * metric.
 *
 * \param dbIndexA index of the first dataset in memory
 * \param dbSeqA   index of a sequence in the first dataset
 * \param startA   starting position of the first subsequence
 * \param endA     ending position of the first subsequence
 * \param dbIndexB index of the second dataset in memory
 * \param dbSeqB   index of a sequence in the second dataset
 * \param startB   starting position of the second subsequence
 * \param endB     ending position of the second subsequence
 * \return a Python list of two-element Python lists; each inner list holds a
 *         pair of indices matching a point of the first subsequence to a
 *         point of the second subsequence.
 */
py::list getWarpingPath(int dbIndexA, int dbSeqA, int startA, int endA,
                        int dbIndexB, int dbSeqB, int startB, int endB)
{
  TimeInterval rangeA(startA, endA);
  TimeInterval rangeB(startB, endB);
  warping_path_t path = os.getWarpingPath(dbIndexA, dbIndexB, dbSeqA, dbSeqB,
                                          rangeA, rangeB,
                                          getDistMetric("dtw_lp2"));

  py::list result;
  for (size_t step = 0; step < path.size(); ++step) {
    py::list match;
    match.append(path[step].first);
    match.append(path[step].second);
    result.append(match);
  }
  return result;
}

bool _intervalCmp(kBest A, kBest B) {
  return A.interval.end < B.interval.end;
}

/**
 * Find seasonal (repeated) patterns of a given length in one sequence.
 *
 * For every group returned by OnlineSession::seasonalSimilarity the members
 * are sorted by ending position and scanned once, keeping each subsequence
 * that starts after the end of the previously kept one. Groups that end up
 * with fewer than two kept subsequences are left out of the result.
 *
 * \param dbIndex index of a dataset
 * \param dbSeq   index of a sequence in the dataset
 * \param length  length of the desired repeated patterns
 * \return a Python list of seasonal patterns. Each pattern is a Python list
 *         of [start, end] lists giving the positions of non-overlapping
 *         subsequences which are similar to each other.
 */
py::list getSeasonal(int dbIndex, int dbSeq, int length)
{
  vector< vector<kBest> > groups = os.seasonalSimilarity(dbIndex, dbSeq, length);

  py::list seasonals;
  for (size_t g = 0; g < groups.size(); ++g) {
    vector<kBest>& members = groups[g];
    // The single-pass selection below relies on ascending end positions.
    std::sort(members.begin(), members.end(), _intervalCmp);

    py::list chosen;
    int coveredUpTo = -1;
    for (vector<kBest>::const_iterator it = members.begin(); it != members.end(); ++it) {
      const int first = it->interval.start;
      const int last = it->interval.end;
      // Skip anything that overlaps the subsequence kept most recently.
      if (first <= coveredUpTo) {
        continue;
      }
      py::list bounds;
      bounds.append(first);
      bounds.append(last);
      chosen.append(bounds);
      coveredUpTo = last;
    }

    // A pattern needs at least two occurrences to count as seasonal.
    if (py::len(chosen) > 1) {
      seasonals.append(chosen);
    }
  }
  return seasonals;
}

/**
 * Get the 'representative' of every group in a dataset.
 *
 * Representatives and member counts are fetched separately from the session
 * and paired up by position.
 *
 * \param dbIndex index of a dataset.
 * \return a Python list of tuples (rep, count) where rep is a Python list of
 *         doubles and count is the number of members in the group.
 */
py::list getGroupRepresentatives(int dbIndex)
{
  vector<vector<seqitem_t> > representatives = os.getGroupRepresentatives(dbIndex);
  vector<int> counts = os.getGroupCounts(dbIndex);

  py::list result;
  for (size_t g = 0; g < representatives.size(); ++g) {
    const vector<seqitem_t>& points = representatives[g];

    py::list rep_py;
    for (size_t k = 0; k < points.size(); ++k) {
      // Values are handed to Python as doubles.
      const double value = points[k];
      rep_py.append(value);
    }

    const int memberCount = counts[g];
    result.append(py::make_tuple(rep_py, memberCount));
  }

  return result;
}

/**
 * Get the locations of all the time series in a group.
 *
 * \param dbIndex    index of a dataset.
 * \param length     length argument forwarded to OnlineSession::getGroupValues.
 *                   NOTE(review): presumably the subsequence length whose
 *                   set of groups is addressed - confirm.
 * \param groupIndex index of the group.
 * \return a Python list of tuples (seq, start, end): the sequence number and
 *         the start/end positions of each member's interval.
 */
py::list getGroupValues(int dbIndex, int length, int groupIndex)
{
  vector<TimeSeriesInterval> members = os.getGroupValues(dbIndex, length, groupIndex);

  py::list sequenceLocations;
  for (size_t m = 0; m < members.size(); ++m) {
    TimeSeriesInterval& member = members[m];
    TimeInterval range = member.getInterval();
    sequenceLocations.append(
        py::make_tuple(member.getSeqNum(), range.start, range.end));
  }
  return sequenceLocations;
}

/**
 * Look up the group a time series belongs to.
 *
 * \param dbIndex index of a dataset
 * \param dbSeq   index of a sequence in the dataset
 * \param start   starting position of the time series in the sequence
 * \param end     ending position of the time series in the sequence
 * \return a Python tuple (len, idx) where len is the length of the time
 *         series and idx is the index of the group among the set of groups
 *         of length len.
 */
py::tuple getGroupIndex(int dbIndex, int dbSeq, int start, int end)
{
  TimeInterval range(start, end);
  const pair<int, int> location = os.getGroupIndex(dbIndex, dbSeq, range);
  return py::make_tuple(location.first, location.second);
}

/**
 * Report how many sequences a dataset holds.
 *
 * \param dbIndex index of a dataset.
 * \return number of sequences in the dataset.
 */
int getDatasetSeqCount(int dbIndex)
{
  const int sequenceCount = os.getdbseqcount(dbIndex);
  return sequenceCount;
}

/**
 * Report the length of the sequences in a dataset.
 *
 * \param dbIndex index of a dataset.
 * \return length of each sequence in the dataset.
 */
int getDatasetSeqLength(int dbIndex)
{
  const int sequenceLength = os.getdbseqlength(dbIndex);
  return sequenceLength;
}


/**
 * Definition of the Python extension module "ONEXBindings".
 *
 * Each py::def call below exposes one of the functions defined in this file
 * to Python under the quoted name.
 */
BOOST_PYTHON_MODULE(ONEXBindings)
{
  py::def("loadDataset", loadDataset);
  py::def("loadDatasetWithParams", loadDatasetWithParams);
  py::def("unloadDataset", unloadDataset);
  py::def("normalizeDataset", normalizeDataset);
  py::def("groupDataset", groupDataset);
  py::def("findSimilar", findSimilar);
  // "getSubsequence" is registered twice so Python can call it with or without
  // binSize: a C++ default argument is not visible through a function pointer,
  // so the four-argument form is exposed via getSubsequenceDefault.
  py::def("getSubsequence", getSubsequence);
  py::def("getSubsequence", getSubsequenceDefault);
  // py::def("getSequences", getSequences);
  py::def("getAllSequences", getAllSequences);
  py::def("getDatasetSeqCount", getDatasetSeqCount);
  py::def("getDatasetSeqLength", getDatasetSeqLength);
  py::def("getDistance", getDistance);
  py::def("getWarpingPath", getWarpingPath);
  py::def("getSeasonal", getSeasonal);
  py::def("getGroupRepresentatives", getGroupRepresentatives);
  py::def("getGroupValues", getGroupValues);
  py::def("getGroupIndex", getGroupIndex);
}