Skip to content

Commit 6c70614

Browse files
authored
[FSTORE-414] feature_view.create_train_test_split returns empty df (#854)
Bug fix: `feature_view.create_train_test_split` returned an empty DataFrame.
1 parent 23e1fb4 commit 6c70614

File tree

9 files changed

+164
-83
lines changed

9 files changed

+164
-83
lines changed

java/src/main/java/com/logicalclocks/hsfs/engine/SparkEngine.java

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.apache.spark.sql.Row;
5050
import org.apache.spark.sql.SaveMode;
5151
import org.apache.spark.sql.SparkSession;
52+
import org.apache.spark.sql.functions;
5253
import org.apache.spark.sql.streaming.DataStreamReader;
5354
import org.apache.spark.sql.streaming.DataStreamWriter;
5455
import org.apache.spark.sql.streaming.StreamingQuery;
@@ -57,6 +58,7 @@
5758
import org.apache.spark.sql.types.BinaryType;
5859
import org.apache.spark.sql.types.BooleanType;
5960
import org.apache.spark.sql.types.ByteType;
61+
import org.apache.spark.sql.types.DataTypes;
6062
import org.apache.spark.sql.types.DateType;
6163
import org.apache.spark.sql.types.DecimalType;
6264
import org.apache.spark.sql.types.DoubleType;
@@ -303,27 +305,43 @@ private Dataset<Row>[] timeSeriesSplit(TrainingDataset trainingDataset, Query qu
303305
int i = 0;
304306
for (Split split : splits) {
305307
if (dataset.count() > 0) {
308+
String eventTime = query.getLeftFeatureGroup().getEventTime();
306309
String eventTimeType =
307-
query.getLeftFeatureGroup().getFeature(query.getLeftFeatureGroup().getEventTime()).getType();
310+
query.getLeftFeatureGroup().getFeature(eventTime).getType();
311+
308312
if (BIGINT.getType().equals(eventTimeType)) {
313+
String tmpEventTime = eventTime + "_hopsworks_tmp";
314+
sparkSession.sqlContext()
315+
.udf()
316+
.register("checkEpochUDF", (Long input) -> {
317+
if (Long.toString(input).length() > 10) {
318+
input = input / 1000;
319+
return input.longValue();
320+
} else {
321+
return input;
322+
}
323+
}, DataTypes.LongType);
324+
dataset = dataset.withColumn(tmpEventTime,functions.callUDF(
325+
"checkEpochUDF", dataset.col(eventTime)));
326+
309327
// event time in second. `getTime()` return in millisecond.
310328
datasetSplits[i] = dataset.filter(
311329
String.format(
312330
"%d/1000 <= `%s` and `%s` < %d/1000",
313331
split.getStartTime().getTime(),
314-
query.getLeftFeatureGroup().getEventTime(),
315-
query.getLeftFeatureGroup().getEventTime(),
332+
tmpEventTime,
333+
tmpEventTime,
316334
split.getEndTime().getTime()
317335
)
318-
);
336+
).drop(tmpEventTime);
319337
} else if (DATE.getType().equals(eventTimeType) || TIMESTAMP.getType().equals(eventTimeType)) {
320338
// unix_timestamp return in second. `getTime()` return in millisecond.
321339
datasetSplits[i] = dataset.filter(
322340
String.format(
323341
"%d/1000 <= unix_timestamp(`%s`) and unix_timestamp(`%s`) < %d/1000",
324342
split.getStartTime().getTime(),
325-
query.getLeftFeatureGroup().getEventTime(),
326-
query.getLeftFeatureGroup().getEventTime(),
343+
eventTime,
344+
eventTime,
327345
split.getEndTime().getTime()
328346
)
329347
);

python/hsfs/engine/spark.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
concat,
4040
col,
4141
from_json,
42-
unix_timestamp,
42+
udf,
4343
)
4444
from pyspark.sql.avro.functions import from_avro, to_avro
4545
from pyspark.sql.types import (
@@ -492,14 +492,13 @@ def _random_split(self, dataset, training_dataset):
492492
def _time_series_split(
493493
self, training_dataset, dataset, event_time, drop_event_time=False
494494
):
495-
result_dfs = {}
496-
ts_type = dataset.select(event_time).dtypes[0][1]
497-
ts_col = (
498-
unix_timestamp(col(event_time)) * 1000
499-
if ts_type in ["date", "timestamp"]
500-
# jdbc supports timestamp precision up to second only.
501-
else col(event_time) * 1000
495+
# registering the UDF
496+
_convert_event_time_to_timestamp = udf(
497+
util.convert_event_time_to_timestamp, LongType()
502498
)
499+
500+
result_dfs = {}
501+
ts_col = _convert_event_time_to_timestamp(col(event_time))
503502
for split in training_dataset.splits:
504503
result_df = dataset.filter(ts_col >= split.start_time).filter(
505504
ts_col < split.end_time

python/hsfs/feature_view.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,9 @@ def get_batch_query(
190190
191191
# Arguments
192192
start_time: Start event time for the batch query. Optional. Strings should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`,
193-
`%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`.
193+
`%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
194194
end_time: End event time for the batch query. Optional. Strings should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`,
195-
`%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`.
195+
`%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
196196
197197
# Returns
198198
`str`: batch query
@@ -272,10 +272,10 @@ def get_batch_data(
272272
# Arguments
273273
start_time: Start event time for the batch query. Optional. Strings should be
274274
formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
275-
or `%Y-%m-%d %H:%M:%S.%f`.
275+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
276276
end_time: End event time for the batch query. Optional. Strings should be
277277
formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
278-
or `%Y-%m-%d %H:%M:%S.%f`.
278+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
279279
read_options: User provided read options. Defaults to `{}`.
280280
"""
281281

@@ -336,10 +336,10 @@ def create_training_data(
336336
# Arguments
337337
start_time: Start event time for the training dataset query. Optional. Strings should
338338
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
339-
or `%Y-%m-%d %H:%M:%S.%f`.
339+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
340340
end_time: End event time for the training dataset query. Optional. Strings should
341341
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
342-
or `%Y-%m-%d %H:%M:%S.%f`.
342+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
343343
storage_connector: Storage connector defining the sink location for the
344344
training dataset, defaults to `None`, and materializes training dataset
345345
on HopsFS.
@@ -444,16 +444,16 @@ def create_train_test_split(
444444
test_size: size of test set.
445445
train_start: Start event time for the train split query. Strings should
446446
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
447-
or `%Y-%m-%d %H:%M:%S.%f`.
447+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
448448
train_end: End event time for the train split query. Strings should
449449
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
450-
or `%Y-%m-%d %H:%M:%S.%f`.
450+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
451451
test_start: Start event time for the test split query. Strings should
452452
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
453-
or `%Y-%m-%d %H:%M:%S.%f`.
453+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
454454
test_end: End event time for the test split query. Strings should
455455
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
456-
or `%Y-%m-%d %H:%M:%S.%f`.
456+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
457457
storage_connector: Storage connector defining the sink location for the
458458
training dataset, defaults to `None`, and materializes training dataset
459459
on HopsFS.
@@ -570,22 +570,22 @@ def create_train_validation_test_split(
570570
test_size: size of test set.
571571
train_start: Start event time for the train split query. Strings should
572572
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
573-
or `%Y-%m-%d %H:%M:%S.%f`.
573+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
574574
train_end: End event time for the train split query. Strings should
575575
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
576-
or `%Y-%m-%d %H:%M:%S.%f`.
576+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
577577
validation_start: Start event time for the validation split query. Strings
578578
should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
579-
or `%Y-%m-%d %H:%M:%S.%f`.
579+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
580580
validation_end: End event time for the validation split query. Strings
581581
should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
582-
or `%Y-%m-%d %H:%M:%S.%f`.
582+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
583583
test_start: Start event time for the test split query. Strings should
584584
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
585-
or `%Y-%m-%d %H:%M:%S.%f`.
585+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
586586
test_end: End event time for the test split query. Strings should
587587
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
588-
or `%Y-%m-%d %H:%M:%S.%f`.
588+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
589589
storage_connector: Storage connector defining the sink location for the
590590
training dataset, defaults to `None`, and materializes training dataset
591591
on HopsFS.
@@ -718,11 +718,11 @@ def training_data(
718718
start_time: Start event time for the training dataset query. Strings should
719719
be formatted in one of the following
720720
formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
721-
or `%Y-%m-%d %H:%M:%S.%f`.
721+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
722722
end_time: End event time for the training dataset query. Strings should be
723723
formatted in one of the following
724724
formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
725-
or `%Y-%m-%d %H:%M:%S.%f`.
725+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
726726
description: A string describing the contents of the training dataset to
727727
improve discoverability for Data Scientists, defaults to empty string
728728
`""`.
@@ -794,13 +794,13 @@ def train_test_split(
794794
or `%Y-%m-%d %H:%M:%S.%f`.
795795
train_end: End event time for the train split query. Strings should
796796
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
797-
or `%Y-%m-%d %H:%M:%S.%f`.
797+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
798798
test_start: Start event time for the test split query. Strings should
799799
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
800-
or `%Y-%m-%d %H:%M:%S.%f`.
800+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
801801
test_end: End event time for the test split query. Strings should
802802
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
803-
or `%Y-%m-%d %H:%M:%S.%f`.
803+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
804804
description: A string describing the contents of the training dataset to
805805
improve discoverability for Data Scientists, defaults to empty string
806806
`""`.
@@ -893,22 +893,22 @@ def train_validation_test_split(
893893
test_size: size of test set. Should be between 0 and 1.
894894
train_start: Start event time for the train split query. Strings should
895895
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
896-
or `%Y-%m-%d %H:%M:%S.%f`.
896+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
897897
train_end: End event time for the train split query. Strings should
898898
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
899-
or `%Y-%m-%d %H:%M:%S.%f`.
899+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
900900
validation_start: Start event time for the validation split query. Strings
901901
should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
902-
or `%Y-%m-%d %H:%M:%S.%f`.
902+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
903903
validation_end: End event time for the validation split query. Strings
904904
should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
905-
or `%Y-%m-%d %H:%M:%S.%f`.
905+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
906906
test_start: Start event time for the test split query. Strings should
907907
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
908-
or `%Y-%m-%d %H:%M:%S.%f`.
908+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
909909
test_end: End event time for the test split query. Strings should
910910
be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`,
911-
or `%Y-%m-%d %H:%M:%S.%f`.
911+
or `%Y-%m-%d %H:%M:%S.%f`. Int, i.e Unix Epoch should be in seconds.
912912
description: A string describing the contents of the training dataset to
913913
improve discoverability for Data Scientists, defaults to empty string
914914
`""`.

python/hsfs/training_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,8 +238,8 @@ def _append_time_split(
238238
TrainingDatasetSplit(
239239
name=split_name,
240240
split_type=TrainingDatasetSplit.TIME_SERIES_SPLIT,
241-
start_time=util.convert_event_time_to_timestamp(start_time),
242-
end_time=util.convert_event_time_to_timestamp(end_time),
241+
start_time=start_time,
242+
end_time=end_time,
243243
)
244244
)
245245

python/hsfs/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def convert_event_time_to_timestamp(event_time):
180180
if event_time == 0:
181181
raise ValueError("Event time should be greater than 0.")
182182
# jdbc supports timestamp precision up to second only.
183-
if len(str(event_time)) < 13:
183+
if len(str(event_time)) <= 10:
184184
event_time = event_time * 1000
185185
return event_time
186186
else:

python/tests/core/test_feature_view_engine.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,9 @@ def test_save_time_travel_query(self, mocker):
134134
)
135135

136136
fv = feature_view.FeatureView(
137-
name="fv_name", query=query.as_of(1000), featurestore_id=feature_store_id
137+
name="fv_name",
138+
query=query.as_of(1000000000),
139+
featurestore_id=feature_store_id,
138140
)
139141

140142
# Act
@@ -449,7 +451,10 @@ def test_get_batch_query(self, mocker):
449451

450452
# Act
451453
fv_engine.get_batch_query(
452-
feature_view_obj=fv, start_time=1, end_time=2, with_label=False
454+
feature_view_obj=fv,
455+
start_time=1000000000,
456+
end_time=2000000000,
457+
with_label=False,
453458
)
454459

455460
# Assert
@@ -486,7 +491,7 @@ def test_get_batch_query_string(self, mocker):
486491

487492
# Act
488493
result = fv_engine.get_batch_query_string(
489-
feature_view_obj=fv, start_time=1, end_time=2
494+
feature_view_obj=fv, start_time=1000000000, end_time=2000000000
490495
)
491496

492497
# Assert
@@ -526,7 +531,7 @@ def test_get_batch_query_string_pit_query(self, mocker):
526531

527532
# Act
528533
result = fv_engine.get_batch_query_string(
529-
feature_view_obj=fv, start_time=1, end_time=2
534+
feature_view_obj=fv, start_time=1000000000, end_time=2000000000
530535
)
531536

532537
# Assert

0 commit comments

Comments
 (0)