Add optional record filter by field/value #23
base: main
Changes from all commits
@@ -4,6 +4,7 @@
 import time

 import pyarrow as pa
+import pyarrow.compute as pc
 from confluent_kafka import TopicPartition

 from millpond import arrow_converter, config, consumer, ducklake, logging_config, metrics, schema, server
@@ -27,6 +28,21 @@ def _convert_batch(values: list[bytes]) -> pa.Table | None:
     return table


+def _apply_filter(table: pa.Table, cfg: config.Config) -> pa.Table:
+    """Filter table by field/value if configured. Returns the (possibly smaller) table."""
+    if cfg.filter_field is None:
+        return table
+    if cfg.filter_field not in table.column_names:
+        log.warning("Filter field %r not in table schema, keeping all records", cfg.filter_field)
+        return table
+    column = pc.cast(table[cfg.filter_field], pa.string())
+    filtered = table.filter(pc.equal(column, cfg.filter_value))
+    filtered_out = len(table) - len(filtered)
+    if filtered_out > 0:
+        metrics.records_skipped_total.labels(reason="filter").inc(filtered_out)
+    return filtered
+
+
 def _write_with_retry(db, table_name, consolidated, schema_mgr, partition_by=None):
     """Write to DuckLake with exponential backoff on transient failures."""
     for attempt in range(_WRITE_MAX_RETRIES):
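For reviewers less familiar with Arrow's compute layer, here is a minimal standalone sketch of the cast-then-compare-then-filter pattern `_apply_filter` uses. The column name and table contents below are made up for illustration; nothing here is from the PR beyond the pyarrow calls themselves.

```python
# Minimal sketch of the pattern used in _apply_filter, assuming only pyarrow.
# The "region" column and its values are illustrative.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"region": ["eu", "us", "eu"], "value": [1, 2, 3]})

# Cast to string first so the equality check also works for non-string
# columns (ints, dictionary-encoded, etc.), mirroring _apply_filter.
column = pc.cast(table["region"], pa.string())
filtered = table.filter(pc.equal(column, "eu"))

print(len(filtered))               # 2 rows kept
print(len(table) - len(filtered))  # 1 row filtered out
```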
@@ -53,21 +69,16 @@ def _write_with_retry(db, table_name, consolidated, schema_mgr, partition_by=None):
             time.sleep(delay)


-def _flush(db, cfg, kafka, consolidated, pending_bytes, pending_records, offsets, elapsed, schema_mgr, trigger="time"):
-    """Write to DuckLake, commit offsets, update metrics."""
-    t0 = time.monotonic()
-    _write_with_retry(db, cfg.ducklake_table, consolidated, schema_mgr, cfg.partition_by)
-    write_duration = time.monotonic() - t0
-
-    # Commit offsets synchronously — at-least-once requires knowing commit succeeded
+def _commit_offsets(kafka, offsets):
+    """Commit Kafka offsets with retries. Returns the TopicPartition list committed."""
     tp_offsets = [
         TopicPartition(topic, partition, offset + 1)  # +1: committed offset is next-to-fetch
         for (topic, partition), offset in offsets.items()
     ]
     for attempt in range(_COMMIT_MAX_RETRIES):
         try:
             kafka.commit(offsets=tp_offsets, asynchronous=False)
-            break
+            return tp_offsets
         except Exception:
             metrics.errors_total.labels(type="offset_commit").inc()
             if attempt == _COMMIT_MAX_RETRIES - 1:
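A quick illustration of the `+1` comment in `_commit_offsets`: Kafka treats the committed offset as the next message to fetch, not the last one read, so committing the last processed offset as-is would replay that record after a restart. The topic name and offsets below are made up; the only API used is confluent_kafka's `TopicPartition`.

```python
# Why the committed offset is last_processed + 1: Kafka resumes from the
# committed offset, so it must point one past the last record written.
from confluent_kafka import TopicPartition

# (topic, partition) -> offset of the last record durably written
offsets = {("events", 0): 41}

tp_offsets = [
    TopicPartition(topic, partition, offset + 1)
    for (topic, partition), offset in offsets.items()
]
assert tp_offsets[0].offset == 42  # consumer resumes at 42, not re-reading 41
```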
@@ -87,6 +98,15 @@ def _flush(db, cfg, kafka, consolidated, pending_bytes, pending_records, offsets
             )
             time.sleep(delay)

+
+def _flush(db, cfg, kafka, consolidated, pending_bytes, pending_records, offsets, elapsed, schema_mgr, trigger="time"):
+    """Write to DuckLake, commit offsets, update metrics."""
+    t0 = time.monotonic()
+    _write_with_retry(db, cfg.ducklake_table, consolidated, schema_mgr, cfg.partition_by)
+    write_duration = time.monotonic() - t0
+
+    tp_offsets = _commit_offsets(kafka, offsets)
+
     log.info(
         "Flush: %d records, %d bytes, %d columns, write=%.2fs, elapsed=%.1fs",
         len(consolidated),
@@ -178,9 +198,11 @@ def on_signal(signum, _frame):
         table = _convert_batch(values)
         if table is not None:
             skipped = len(values) - len(table)
-            pending.append(table)
-            pending_bytes += table.nbytes
-            pending_records += len(table)
+            table = _apply_filter(table, cfg)
+            if len(table) > 0:
+                pending.append(table)
+                pending_bytes += table.nbytes
+                pending_records += len(table)
             metrics.pending_bytes.set(pending_bytes)
         else:
             skipped = len(values)
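For context, the filter is driven by two new optional settings on `config.Config`. Only the attribute names `filter_field` and `filter_value` are visible in this diff; the dataclass shape below is an assumption for illustration, not the project's actual config code.

```python
# Assumed sketch of the filter-related config fields; only the names
# filter_field / filter_value appear in the diff, the rest is guesswork.
from dataclasses import dataclass

@dataclass
class Config:
    filter_field: str | None = None  # column to match; None disables filtering
    filter_value: str | None = None  # compared as a string after the cast
```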
@@ -226,6 +248,13 @@ def on_signal(signum, _frame):
             metrics.pending_bytes.set(0)
             last_flush = time.monotonic()

+        elif time_triggered and offsets:
+            # No pending records but offsets advanced (e.g. all records filtered out).
+            # Commit offsets so restarts don't replay already-processed data.
+            _commit_offsets(kafka, offsets)
+
||||||||||||
| _commit_offsets(kafka, offsets) | |
| tp_offsets = _commit_offsets(kafka, offsets) | |
| if tp_offsets: | |
| for tp in tp_offsets: | |
| metrics.last_committed_offset.labels(topic=tp.topic, partition=tp.partition).set(tp.offset) |
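If this suggestion is adopted, `millpond.metrics` would also need to define `last_committed_offset`. The existing `.labels(...).inc()` calls suggest prometheus_client; a sketch of what the gauge might look like (the metric name string is an assumption):

```python
# Sketch of the gauge the suggested change assumes; the metric name is
# hypothetical, but the labels match .labels(topic=..., partition=...).
from prometheus_client import Gauge

last_committed_offset = Gauge(
    "millpond_last_committed_offset",
    "Last committed Kafka offset (next-to-fetch) per topic/partition",
    ["topic", "partition"],
)
```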
The Record Filtering docs state that Millpond “keeps only records where a specified column matches a given string”, but the current implementation keeps all records when FILTER_FIELD isn’t present in a batch’s schema (it only logs a warning). Please clarify the intended behavior for batches missing the filter column (drop all vs. keep all) and make sure the documentation matches the actual behavior.
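For concreteness, if the docs are right and batches missing the filter column should be dropped, the missing-column guard inside `_apply_filter` could look roughly like this. This is hypothetical and not in the PR; in particular the `filter_missing_column` reason label is made up.

```python
# Hypothetical stricter variant of the guard in _apply_filter: drop the
# whole batch when the filter column is missing, instead of keeping it.
if cfg.filter_field not in table.column_names:
    log.warning(
        "Filter field %r not in table schema, dropping %d records",
        cfg.filter_field, len(table),
    )
    metrics.records_skipped_total.labels(reason="filter_missing_column").inc(len(table))
    return table.slice(0, 0)  # empty table, schema preserved
```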