5
5
#include < exprtk.hpp>
6
6
#include < numpy/ndarrayobject.h>
7
7
8
+ #include < arrow/type.h>
9
+ #include < arrow/table.h>
10
+ #include < arrow/c/abi.h>
11
+ #include < arrow/c/bridge.h>
12
+
13
+ #include < csp/adapters/parquet/ParquetReader.h>
14
+ #include < csp/adapters/utils/StructAdapterInfo.h>
15
+ #include < csp/adapters/utils/ValueDispatcher.h>
16
+
8
17
static void * init_nparray ()
9
18
{
10
19
csp::python::AcquireGIL gil;
@@ -325,6 +334,137 @@ DECLARE_CPPNODE( exprtk_impl )
325
334
326
335
EXPORT_CPPNODE ( exprtk_impl );
327
336
337
+ DECLARE_CPPNODE ( record_batches_to_struct )
338
+ {
339
+ using InMemoryTableParquetReader = csp::adapters::parquet::InMemoryTableParquetReader;
340
+ using SingleTableParquetReader = csp::adapters::parquet::SingleTableParquetReader;
341
+ class MyTableReader : public InMemoryTableParquetReader
342
+ {
343
+ public:
344
+ MyTableReader ( std::vector<std::string> columns, std::shared_ptr<arrow::Schema> schema ):
345
+ InMemoryTableParquetReader ( nullptr , columns, false , {}, false )
346
+ {
347
+ m_schema = schema;
348
+ }
349
+ std::string getCurFileOrTableName () const override { return " IN_RECORD_BATCH" ; }
350
+ void initialize () { setColumnAdaptersFromCurrentTable (); }
351
+ void parseBatches ( std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches )
352
+ {
353
+ // TODO: Check if the schema has not changed
354
+ auto table_result = arrow::Table::FromRecordBatches (record_batches);
355
+ if ( !table_result.ok () )
356
+ CSP_THROW ( NotImplemented, " Unable to make table from record batches" );
357
+
358
+ setTable ( table_result.ValueUnsafe () );
359
+
360
+ if ( !readNextRowGroup () )
361
+ CSP_THROW ( NotImplemented, " Unable to read row group from table" );
362
+
363
+ while ( readNextRow () )
364
+ {
365
+ for ( auto & adapter: getStructAdapters () )
366
+ {
367
+ adapter -> dispatchValue ( nullptr );
368
+ }
369
+ }
370
+ }
371
+ void stop ()
372
+ {
373
+ InMemoryTableParquetReader::clear ();
374
+ }
375
+ protected:
376
+ bool openNextFile () override { return false ; }
377
+ void clear () override { setTable ( nullptr ); }
378
+ };
379
+
380
+ SCALAR_INPUT ( DialectGenericType, schema_ptr );
381
+ SCALAR_INPUT ( StructMetaPtr, cls );
382
+ SCALAR_INPUT ( DictionaryPtr, properties );
383
+ TS_INPUT ( Generic, data );
384
+
385
+ TS_OUTPUT ( Generic );
386
+
387
+ std::shared_ptr<MyTableReader> reader;
388
+ CspTypePtr outType;
389
+ std::vector<StructPtr>* m_structsVecPtr;
390
+
391
+ using StructAdapterInfo = csp::adapters::utils::StructAdapterInfo;
392
+ using ValueDispatcher = csp::adapters::utils::ValueDispatcher<StructPtr &>;
393
+
394
+ INIT_CPPNODE ( record_batches_to_struct )
395
+ {
396
+ auto & input_def = tsinputDef ( " data" );
397
+ if ( input_def.type -> type () != CspType::Type::ARRAY )
398
+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array type, got " << input_def.type -> type () );
399
+
400
+ auto * aType = static_cast <const CspArrayType *>( input_def.type .get () );
401
+ CspTypePtr elemType = aType -> elemType ();
402
+ if ( elemType -> type () != CspType::Type::DIALECT_GENERIC )
403
+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array of DIALECT_GENERIC type, got " << elemType -> type () );
404
+
405
+ auto & output_def = tsoutputDef ( " " );
406
+ if ( output_def.type -> type () != CspType::Type::ARRAY )
407
+ CSP_THROW ( NotImplemented, " record_batches_to_struct expected ts array type, got " << output_def.type -> type () );
408
+ }
409
+
410
+ START ()
411
+ {
412
+ // Create Adapters for Schema
413
+ PyObject* capsule = csp::python::toPythonBorrowed (schema_ptr);
414
+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer (capsule, " arrow_schema" ) );
415
+ auto result = arrow::ImportSchema (c_schema);
416
+ if ( !result.ok () )
417
+ CSP_THROW ( NotImplemented, " Unable to import schema" );
418
+ std::shared_ptr<arrow::Schema> schema = result.ValueUnsafe ();
419
+ std::vector<std::string> columns;
420
+ for ( int idx = 0 ; idx < schema -> num_fields (); idx++ )
421
+ {
422
+ auto & field = schema -> field ( idx );
423
+ columns.push_back (field -> name ());
424
+ }
425
+ reader = std::make_shared<MyTableReader>( columns, schema );
426
+ reader -> initialize ();
427
+
428
+ outType = std::make_shared<csp::CspStructType>( cls.value () );
429
+ auto field_map = properties.value () -> get<DictionaryPtr>( " field_map" );
430
+ StructAdapterInfo key{ outType, field_map };
431
+ auto & struct_adapter = reader -> getStructAdapter ( key );
432
+ struct_adapter.addSubscriber ( [this ]( StructPtr * s )
433
+ {
434
+ if ( s ) this -> m_structsVecPtr -> push_back ( *s );
435
+ else CSP_THROW ( NotImplemented, " StructPtr was null" );
436
+ }, {} );
437
+ }
438
+
439
+ INVOKE ()
440
+ {
441
+ if ( csp.ticked ( data ) )
442
+ {
443
+ auto & py_batches = data.lastValue <std::vector<DialectGenericType>>();
444
+ std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
445
+ for ( auto & py_batch: py_batches )
446
+ {
447
+ PyObject* py_tuple = csp::python::toPythonBorrowed ( py_batch );
448
+ PyObject* py_schema = PyTuple_GET_ITEM ( py_tuple, 0 );
449
+ PyObject* py_array = PyTuple_GET_ITEM ( py_tuple, 1 );
450
+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer ( py_schema, " arrow_schema" ) );
451
+ struct ArrowArray * c_array = reinterpret_cast <struct ArrowArray *>( PyCapsule_GetPointer ( py_array, " arrow_array" ) );
452
+ auto result = arrow::ImportRecordBatch (c_array, c_schema);
453
+ if ( !result.ok () )
454
+ CSP_THROW ( NotImplemented, " Unable to import record batch from c interface" );
455
+ batches.emplace_back (result.ValueUnsafe ());
456
+ }
457
+ std::vector<StructPtr> & out = unnamed_output ().reserveSpace <std::vector<StructPtr>>();
458
+ out.clear ();
459
+ m_structsVecPtr = &out;
460
+ reader -> parseBatches ( batches );
461
+ m_structsVecPtr = nullptr ;
462
+ }
463
+ }
464
+ };
465
+
466
+ EXPORT_CPPNODE ( record_batches_to_struct );
467
+
328
468
}
329
469
330
470
// Base nodes
@@ -350,6 +490,7 @@ REGISTER_CPPNODE( csp::cppnodes, struct_fromts );
350
490
// Node registrations for the csp::cppnodes namespace; record_batches_to_struct is the newly
// added entry. NOTE(review): presumably REGISTER_CPPNODE exposes each node through the Python
// module defined below -- confirm against the macro definition.
REGISTER_CPPNODE ( csp::cppnodes, struct_collectts );
351
491
352
492
REGISTER_CPPNODE ( csp::cppnodes, exprtk_impl );
493
+ REGISTER_CPPNODE ( csp::cppnodes, record_batches_to_struct );
353
494
354
495
static PyModuleDef _cspbaselibimpl_module = {
355
496
PyModuleDef_HEAD_INIT,
0 commit comments