diff --git a/python/requirements.txt b/python/requirements.txt index 81136fb0..d4b4859e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ Sphinx>=4.0.2 -pyarrow==15.0.2 +pyarrow==16.1.0 pandas>=1.2.5 opentelemetry-api>=1.0.0 -opentelemetry-sdk>=1.0.0 \ No newline at end of file +opentelemetry-sdk>=1.0.0 diff --git a/python/source/flight.rst b/python/source/flight.rst index 67c359c1..66b581eb 100644 --- a/python/source/flight.rst +++ b/python/source/flight.rst @@ -967,4 +967,126 @@ middleware, and one in the server middleware. :hide: # Shutdown the server - server.shutdown() \ No newline at end of file + server.shutdown() + +Sending Mixed Schema Data over a single Flight Stream +===================================================== + +In some applications it is desirable to transmit multiple record batches or tables with +*different* schemas over a single Arrow Flight stream. Although the IPC stream +specification does not allow for multiple schemas within a stream, it is possible to +work around this limitation by embedding the mixed schema data within a +:class:`pyarrow.UnionArray`. + +The folllowing example shows how to transmit two record batches with different schemas +over a single Flight stream. + +.. testcode:: + + import pyarrow as pa + + # Create record batches with different schemas + n_legs = pa.array([2, 2, 4, 4, 5, 100]) + animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + animals_recbatch = pa.RecordBatch.from_arrays( + [n_legs, animals], names=["n_legs", "animals"], + ) + + years = pa.array([2020, 2022, 2021, 2022]) + months = pa.array([3, 5, 7, 9]) + days = pa.array([1, 5, 9, 13]) + dates_recbatch = pa.RecordBatch.from_arrays( + [years, months, days], names=["year", "month", "day"], + ) + + # Convert the record batches to struct arrays + animals_array = animals_recbatch.to_struct_array() + dates_array = dates_recbatch.to_struct_array() + + # Create a UnionArray containing the animals data followed by the dates data + n_animals, n_dates = animals_recbatch.num_rows, dates_recbatch.num_rows + types = pa.array(([0] * n_animals) + ([1] * n_dates), type=pa.int8()) + offsets = pa.array(list(range(n_animals)) + list(range(n_dates)), type=pa.int32()) + union_array = pa.UnionArray.from_dense( + types, offsets, [animals_array, dates_array], ['animals', 'dates'], + ) + + # Finally create a single column RecordBatch from the UnionArray + union_recbatch = pa.RecordBatch.from_arrays([union_array], ['animals_dates_union']) + +.. testcode:: + :hide: + + print(union_recbatch) + +.. testoutput:: + :hide: + + pyarrow.RecordBatch + animals_dates_union: dense_union=0, dates: struct=1> + child 0, animals: struct + child 0, n_legs: int64 + child 1, animals: string + child 1, dates: struct + child 0, year: int64 + child 1, month: int64 + child 2, day: int64 + ---- + animals_dates_union: -- is_valid: all not null-- type_ids: [0,0,0,0,0,0,1,1,1,1]-- value_offsets: [0,1,2,3,4,5,0,1,2,3] + -- child 0 type: struct + -- is_valid: all not null + -- child 0 type: int64 + [2,2,4,4,5,100] + -- child 1 type: string + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + -- child 1 type: struct + -- is_valid: all not null + -- child 0 type: int64 + [2020,2022,2021,2022] + -- child 1 type: int64 + [3,5,7,9] + -- child 2 type: int64 + [1,5,9,13] + +We can then use :meth:`pyarrow.flight.FlightStreamWriter.write_batch` to write the +"mixed schema" record batch to the Flight stream. On the client side we can reconstruct +the original record batches as follows. + +.. testcode:: + :hide: + + chunk = (union_recbatch, None) + +.. testcode:: + + import pyarrow as pa + + # Unpack chunk from FlightStreamReader.read_chunk() + data, metadata = chunk + union_array = data.column(0) + animals_recbatch = pa.RecordBatch.from_struct_array(union_array.field(0)) + dates_recbatch = pa.RecordBatch.from_struct_array(union_array.field(1)) + +.. testcode:: + :hide: + + print(animals_recbatch) + print(dates_recbatch) + +.. testoutput:: + :hide: + + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13]