55#include < exprtk.hpp>
66#include < numpy/ndarrayobject.h>
77
8+ #include < arrow/type.h>
9+ #include < arrow/table.h>
10+ #include < arrow/c/abi.h>
11+ #include < arrow/c/bridge.h>
12+
13+ #include < csp/adapters/parquet/ParquetReader.h>
14+ #include < csp/adapters/utils/StructAdapterInfo.h>
15+ #include < csp/adapters/utils/ValueDispatcher.h>
16+
817static void * init_nparray ()
918{
1019 csp::python::AcquireGIL gil;
@@ -325,6 +334,137 @@ DECLARE_CPPNODE( exprtk_impl )
325334
326335EXPORT_CPPNODE ( exprtk_impl );
327336
337+ DECLARE_CPPNODE ( record_batches_to_struct )
338+ {
339+ using InMemoryTableParquetReader = csp::adapters::parquet::InMemoryTableParquetReader;
340+ using SingleTableParquetReader = csp::adapters::parquet::SingleTableParquetReader;
341+ class MyTableReader : public InMemoryTableParquetReader
342+ {
343+ public:
344+ MyTableReader ( std::vector<std::string> columns, std::shared_ptr<arrow::Schema> schema ):
345+ InMemoryTableParquetReader ( nullptr , columns, false , {}, false )
346+ {
347+ m_schema = schema;
348+ }
349+ std::string getCurFileOrTableName () const override { return " IN_RECORD_BATCH" ; }
350+ void initialize () { setColumnAdaptersFromCurrentTable (); }
351+ void parseBatches ( std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches )
352+ {
353+ // TODO: Check if the schema has not changed
354+ auto table_result = arrow::Table::FromRecordBatches (record_batches);
355+ if ( !table_result.ok () )
356+ CSP_THROW ( NotImplemented, " Unable to make table from record batches" );
357+
358+ setTable ( table_result.ValueUnsafe () );
359+
360+ if ( !readNextRowGroup () )
361+ CSP_THROW ( NotImplemented, " Unable to read row group from table" );
362+
363+ while ( readNextRow () )
364+ {
365+ for ( auto & adapter: getStructAdapters () )
366+ {
367+ adapter -> dispatchValue ( nullptr );
368+ }
369+ }
370+ }
371+ void stop ()
372+ {
373+ InMemoryTableParquetReader::clear ();
374+ }
375+ protected:
376+ bool openNextFile () override { return false ; }
377+ void clear () override { setTable ( nullptr ); }
378+ };
379+
380+ SCALAR_INPUT ( DialectGenericType, schema_ptr );
381+ SCALAR_INPUT ( StructMetaPtr, cls );
382+ SCALAR_INPUT ( DictionaryPtr, properties );
383+ TS_INPUT ( Generic, data );
384+
385+ TS_OUTPUT ( Generic );
386+
387+ std::shared_ptr<MyTableReader> reader;
388+ CspTypePtr outType;
389+ std::vector<StructPtr>* m_structsVecPtr;
390+
391+ using StructAdapterInfo = csp::adapters::utils::StructAdapterInfo;
392+ using ValueDispatcher = csp::adapters::utils::ValueDispatcher<StructPtr &>;
393+
394+ INIT_CPPNODE ( record_batches_to_struct )
395+ {
396+ auto & input_def = tsinputDef ( " data" );
397+ if ( input_def.type -> type () != CspType::Type::ARRAY )
398+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array type, got " << input_def.type -> type () );
399+
400+ auto * aType = static_cast <const CspArrayType *>( input_def.type .get () );
401+ CspTypePtr elemType = aType -> elemType ();
402+ if ( elemType -> type () != CspType::Type::DIALECT_GENERIC )
403+ CSP_THROW ( TypeError, " record_batches_to_struct expected ts array of DIALECT_GENERIC type, got " << elemType -> type () );
404+
405+ auto & output_def = tsoutputDef ( " " );
406+ if ( output_def.type -> type () != CspType::Type::ARRAY )
407+ CSP_THROW ( NotImplemented, " record_batches_to_struct expected ts array type, got " << output_def.type -> type () );
408+ }
409+
410+ START ()
411+ {
412+ // Create Adapters for Schema
413+ PyObject* capsule = csp::python::toPythonBorrowed (schema_ptr);
414+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer (capsule, " arrow_schema" ) );
415+ auto result = arrow::ImportSchema (c_schema);
416+ if ( !result.ok () )
417+ CSP_THROW ( NotImplemented, " Unable to import schema" );
418+ std::shared_ptr<arrow::Schema> schema = result.ValueUnsafe ();
419+ std::vector<std::string> columns;
420+ for ( int idx = 0 ; idx < schema -> num_fields (); idx++ )
421+ {
422+ auto & field = schema -> field ( idx );
423+ columns.push_back (field -> name ());
424+ }
425+ reader = std::make_shared<MyTableReader>( columns, schema );
426+ reader -> initialize ();
427+
428+ outType = std::make_shared<csp::CspStructType>( cls.value () );
429+ auto field_map = properties.value () -> get<DictionaryPtr>( " field_map" );
430+ StructAdapterInfo key{ outType, field_map };
431+ auto & struct_adapter = reader -> getStructAdapter ( key );
432+ struct_adapter.addSubscriber ( [this ]( StructPtr * s )
433+ {
434+ if ( s ) this -> m_structsVecPtr -> push_back ( *s );
435+ else CSP_THROW ( NotImplemented, " StructPtr was null" );
436+ }, {} );
437+ }
438+
439+ INVOKE ()
440+ {
441+ if ( csp.ticked ( data ) )
442+ {
443+ auto & py_batches = data.lastValue <std::vector<DialectGenericType>>();
444+ std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
445+ for ( auto & py_batch: py_batches )
446+ {
447+ PyObject* py_tuple = csp::python::toPythonBorrowed ( py_batch );
448+ PyObject* py_schema = PyTuple_GET_ITEM ( py_tuple, 0 );
449+ PyObject* py_array = PyTuple_GET_ITEM ( py_tuple, 1 );
450+ struct ArrowSchema * c_schema = reinterpret_cast <struct ArrowSchema *>( PyCapsule_GetPointer ( py_schema, " arrow_schema" ) );
451+ struct ArrowArray * c_array = reinterpret_cast <struct ArrowArray *>( PyCapsule_GetPointer ( py_array, " arrow_array" ) );
452+ auto result = arrow::ImportRecordBatch (c_array, c_schema);
453+ if ( !result.ok () )
454+ CSP_THROW ( NotImplemented, " Unable to import record batch from c interface" );
455+ batches.emplace_back (result.ValueUnsafe ());
456+ }
457+ std::vector<StructPtr> & out = unnamed_output ().reserveSpace <std::vector<StructPtr>>();
458+ out.clear ();
459+ m_structsVecPtr = &out;
460+ reader -> parseBatches ( batches );
461+ m_structsVecPtr = nullptr ;
462+ }
463+ }
464+ };
465+
466+ EXPORT_CPPNODE ( record_batches_to_struct );
467+
328468}
329469
330470// Base nodes
@@ -350,6 +490,7 @@ REGISTER_CPPNODE( csp::cppnodes, struct_fromts );
350490REGISTER_CPPNODE ( csp::cppnodes, struct_collectts );
351491
352492REGISTER_CPPNODE ( csp::cppnodes, exprtk_impl );
493+ REGISTER_CPPNODE ( csp::cppnodes, record_batches_to_struct );
353494
354495static PyModuleDef _cspbaselibimpl_module = {
355496 PyModuleDef_HEAD_INIT,
0 commit comments