"""Utilities for Apache Arrow serialization."""
|
|
|
|
import pyarrow as pa
|
|
|
|
chunk_size = 1000 # disk writes are batched based on this number of rows
|
|
|
|
|
|
class ArrowWriter:
|
|
"""Writes an arrow file in a streaming fashion"""
|
|
|
|
    def __init__(self, file_name: str):
        """Create a new ArrowWriter object.

        file_name (str): The name of the file to write to.
        """
        self.sink = pa.OSFile(file_name, "wb")  # type: ignore
        self.new_rows: list[dict] = []
        self.schema: pa.Schema | None = None  # haven't yet learned the schema
        # pa.ipc.new_stream returns a RecordBatchStreamWriter
        self.writer: pa.RecordBatchStreamWriter | None = None
    def close(self):
        """Flush any buffered rows and close the stream."""
        self._write()
        if self.writer:
            self.writer.close()
        self.sink.close()
    def _write(self):
        """Write any buffered rows to the file."""
        if len(self.new_rows) > 0:
            if self.schema is None:
                # Learn the schema from the first chunk of rows
                self.schema = pa.Table.from_pylist(self.new_rows).schema
                self.writer = pa.ipc.new_stream(self.sink, self.schema)

            # Build the batch against the learned schema so later chunks
            # can't drift from it (e.g. a column that is all-null here)
            self.writer.write_batch(
                pa.RecordBatch.from_pylist(self.new_rows, schema=self.schema)
            )
            self.new_rows = []
    def add_row(self, row_dict: dict):
        """Add a row to the Arrow file.

        The schema is learned automatically from the first chunk of rows;
        all later rows must conform to it.
        """
        self.new_rows.append(row_dict)
        if len(self.new_rows) >= chunk_size:
            self._write()
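

# A minimal usage sketch (illustrative, not part of the original module):
# write a few rows and read them back. The file name "demo.arrow" and the
# row fields are hypothetical. Because ArrowWriter opens its sink with
# pa.ipc.new_stream, the output is the Arrow IPC *stream* format, so it is
# read back with pa.ipc.open_stream (not pa.ipc.open_file).
if __name__ == "__main__":
    writer = ArrowWriter("demo.arrow")
    writer.add_row({"node_id": 1, "rssi": -90})
    writer.add_row({"node_id": 2, "rssi": -85})
    writer.close()

    with pa.OSFile("demo.arrow", "rb") as source:
        table = pa.ipc.open_stream(source).read_all()
    print(table.to_pylist())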