|
12 | 12 |
|
13 | 13 |
|
14 | 14 | use anyhow::{anyhow, Result}; |
15 | | -use arrow_array::builder::StringBuilder; |
16 | | -use arrow_array::RecordBatch; |
| 15 | +use arrow_array::builder::{BinaryBuilder, StringBuilder, TimestampNanosecondBuilder}; |
| 16 | +use arrow_array::{ArrayRef, RecordBatch}; |
17 | 17 | use arrow_json::reader::ReaderBuilder; |
18 | | -use arrow_schema::SchemaRef; |
| 18 | +use arrow_schema::{Schema, SchemaRef}; |
| 19 | +use std::collections::HashMap; |
19 | 20 | use std::sync::Arc; |
20 | 21 |
|
| 22 | +use crate::sql::common::TIMESTAMP_FIELD; |
| 23 | + |
21 | 24 | use super::config::{BadDataPolicy, Format}; |
22 | 25 |
|
23 | 26 | pub struct DataDeserializer { |
24 | 27 | format: Format, |
25 | | - schema: SchemaRef, |
| 28 | + final_schema: SchemaRef, |
| 29 | + decoder_schema: SchemaRef, |
26 | 30 | bad_data_policy: BadDataPolicy, |
27 | 31 | } |
28 | 32 |
|
29 | 33 | impl DataDeserializer { |
30 | 34 | pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy) -> Self { |
| 35 | + let decoder_schema = schema_without_timestamp(schema.as_ref()); |
31 | 36 | Self { |
32 | 37 | format, |
33 | | - schema, |
| 38 | + final_schema: schema, |
| 39 | + decoder_schema, |
34 | 40 | bad_data_policy, |
35 | 41 | } |
36 | 42 | } |
37 | 43 |
|
38 | 44 | pub fn deserialize_batch(&self, messages: &[&[u8]]) -> Result<RecordBatch> { |
| 45 | + self.deserialize_batch_with_kafka_timestamps(messages, &[]) |
| 46 | + } |
| 47 | + |
| 48 | + pub fn deserialize_batch_with_kafka_timestamps( |
| 49 | + &self, |
| 50 | + messages: &[&[u8]], |
| 51 | + kafka_timestamps_ms: &[u64], |
| 52 | + ) -> Result<RecordBatch> { |
39 | 53 | match &self.format { |
40 | | - Format::Json(_) => self.deserialize_json(messages), |
41 | | - Format::RawString => self.deserialize_raw_string(messages), |
42 | | - Format::RawBytes => self.deserialize_raw_bytes(messages), |
| 54 | + Format::Json(_) => self.deserialize_json(messages, kafka_timestamps_ms), |
| 55 | + Format::RawString => self.deserialize_raw_string(messages, kafka_timestamps_ms), |
| 56 | + Format::RawBytes => self.deserialize_raw_bytes(messages, kafka_timestamps_ms), |
43 | 57 | } |
44 | 58 | } |
45 | 59 |
|
46 | | - fn deserialize_json(&self, messages: &[&[u8]]) -> Result<RecordBatch> { |
| 60 | + fn deserialize_json(&self, messages: &[&[u8]], kafka_timestamps_ms: &[u64]) -> Result<RecordBatch> { |
47 | 61 | let mut buffer = Vec::with_capacity(messages.len() * 256); |
48 | 62 | for msg in messages { |
49 | 63 | buffer.extend_from_slice(msg); |
50 | 64 | buffer.push(b'\n'); |
51 | 65 | } |
52 | 66 |
|
53 | 67 | let allow_bad_data = self.bad_data_policy == BadDataPolicy::Drop; |
54 | | - let mut decoder = ReaderBuilder::new(self.schema.clone()) |
55 | | - .with_strict_mode(!allow_bad_data) |
| 68 | + let mut decoder = ReaderBuilder::new(self.decoder_schema.clone()) |
| 69 | + .with_strict_mode(false) |
56 | 70 | .build_decoder()?; |
57 | 71 |
|
58 | 72 | decoder.decode(&buffer)?; |
59 | 73 |
|
60 | | - let batch = if allow_bad_data { |
61 | | - let (batch, _mask, _, _errors) = decoder.flush_with_bad_data()?.unwrap(); |
62 | | - batch |
| 74 | + let (batch, valid_indices) = if allow_bad_data { |
| 75 | + let Some((batch, mask, _, _errors)) = decoder.flush_with_bad_data()? else { |
| 76 | + return Ok(RecordBatch::new_empty(self.final_schema.clone())); |
| 77 | + }; |
| 78 | + let mut indices = Vec::with_capacity(batch.num_rows()); |
| 79 | + for i in 0..mask.len() { |
| 80 | + if mask.value(i) { |
| 81 | + indices.push(i); |
| 82 | + } |
| 83 | + } |
| 84 | + (batch, indices) |
63 | 85 | } else { |
64 | | - decoder |
| 86 | + let batch = decoder |
65 | 87 | .flush()? |
66 | | - .ok_or_else(|| anyhow!("JSON decoder returned no batch"))? |
| 88 | + .unwrap_or_else(|| RecordBatch::new_empty(self.decoder_schema.clone())); |
| 89 | + let indices: Vec<usize> = (0..batch.num_rows()).collect(); |
| 90 | + (batch, indices) |
67 | 91 | }; |
68 | 92 |
|
69 | | - Ok(batch) |
| 93 | + self.rebuild_with_timestamp(batch, kafka_timestamps_ms, &valid_indices) |
70 | 94 | } |
71 | 95 |
|
72 | | - fn deserialize_raw_string(&self, messages: &[&[u8]]) -> Result<RecordBatch> { |
| 96 | + fn deserialize_raw_string(&self, messages: &[&[u8]], kafka_timestamps_ms: &[u64]) -> Result<RecordBatch> { |
| 97 | + let value_idx = self |
| 98 | + .decoder_schema |
| 99 | + .index_of("value") |
| 100 | + .map_err(|_| anyhow!("RawString format requires a 'value' column"))?; |
| 101 | + |
73 | 102 | let mut builder = StringBuilder::with_capacity(messages.len(), messages.len() * 64); |
74 | 103 | for msg in messages { |
75 | 104 | builder.append_value(String::from_utf8_lossy(msg)); |
76 | 105 | } |
77 | 106 |
|
78 | | - let array = Arc::new(builder.finish()); |
79 | | - RecordBatch::try_new(self.schema.clone(), vec![array]) |
80 | | - .map_err(|e| anyhow!("build RawString batch: {e}")) |
| 107 | + let mut columns = vec![None; self.decoder_schema.fields().len()]; |
| 108 | + columns[value_idx] = Some(Arc::new(builder.finish()) as ArrayRef); |
| 109 | + let decoded_columns = columns |
| 110 | + .into_iter() |
| 111 | + .map(|c| c.ok_or_else(|| anyhow!("missing RawString decoded column"))) |
| 112 | + .collect::<Result<Vec<_>>>()?; |
| 113 | + let decoded_batch = RecordBatch::try_new(self.decoder_schema.clone(), decoded_columns) |
| 114 | + .map_err(|e| anyhow!("build RawString decoded batch: {e}"))?; |
| 115 | + let valid_indices: Vec<usize> = (0..decoded_batch.num_rows()).collect(); |
| 116 | + self.rebuild_with_timestamp(decoded_batch, kafka_timestamps_ms, &valid_indices) |
81 | 117 | } |
82 | 118 |
|
83 | | - fn deserialize_raw_bytes(&self, messages: &[&[u8]]) -> Result<RecordBatch> { |
84 | | - use arrow_array::builder::BinaryBuilder; |
85 | | - |
| 119 | + fn deserialize_raw_bytes(&self, messages: &[&[u8]], kafka_timestamps_ms: &[u64]) -> Result<RecordBatch> { |
| 120 | + let value_idx = self |
| 121 | + .decoder_schema |
| 122 | + .index_of("value") |
| 123 | + .map_err(|_| anyhow!("RawBytes format requires a 'value' column"))?; |
86 | 124 | let mut builder = BinaryBuilder::with_capacity(messages.len(), messages.len() * 64); |
87 | 125 | for msg in messages { |
88 | 126 | builder.append_value(msg); |
89 | 127 | } |
90 | 128 |
|
91 | | - let array = Arc::new(builder.finish()); |
92 | | - RecordBatch::try_new(self.schema.clone(), vec![array]) |
93 | | - .map_err(|e| anyhow!("build RawBytes batch: {e}")) |
| 129 | + let mut columns = vec![None; self.decoder_schema.fields().len()]; |
| 130 | + columns[value_idx] = Some(Arc::new(builder.finish()) as ArrayRef); |
| 131 | + let decoded_columns = columns |
| 132 | + .into_iter() |
| 133 | + .map(|c| c.ok_or_else(|| anyhow!("missing RawBytes decoded column"))) |
| 134 | + .collect::<Result<Vec<_>>>()?; |
| 135 | + let decoded_batch = RecordBatch::try_new(self.decoder_schema.clone(), decoded_columns) |
| 136 | + .map_err(|e| anyhow!("build RawBytes decoded batch: {e}"))?; |
| 137 | + let valid_indices: Vec<usize> = (0..decoded_batch.num_rows()).collect(); |
| 138 | + self.rebuild_with_timestamp(decoded_batch, kafka_timestamps_ms, &valid_indices) |
| 139 | + } |
| 140 | + |
| 141 | + fn rebuild_with_timestamp( |
| 142 | + &self, |
| 143 | + decoded_batch: RecordBatch, |
| 144 | + kafka_timestamps_ms: &[u64], |
| 145 | + valid_indices: &[usize], |
| 146 | + ) -> Result<RecordBatch> { |
| 147 | + let mut by_name: HashMap<String, ArrayRef> = decoded_batch |
| 148 | + .schema() |
| 149 | + .fields() |
| 150 | + .iter() |
| 151 | + .zip(decoded_batch.columns().iter()) |
| 152 | + .map(|(f, a)| (f.name().to_string(), a.clone())) |
| 153 | + .collect(); |
| 154 | + |
| 155 | + let mut ts_builder = TimestampNanosecondBuilder::with_capacity(valid_indices.len()); |
| 156 | + for idx in valid_indices { |
| 157 | + let ms = kafka_timestamps_ms.get(*idx).copied().unwrap_or(0); |
| 158 | + ts_builder.append_value((ms as i64).saturating_mul(1_000_000)); |
| 159 | + } |
| 160 | + let timestamp_col: ArrayRef = Arc::new(ts_builder.finish()); |
| 161 | + |
| 162 | + let mut columns = Vec::with_capacity(self.final_schema.fields().len()); |
| 163 | + for field in self.final_schema.fields() { |
| 164 | + if field.name() == TIMESTAMP_FIELD { |
| 165 | + columns.push(timestamp_col.clone()); |
| 166 | + } else { |
| 167 | + let array = by_name |
| 168 | + .remove(field.name()) |
| 169 | + .ok_or_else(|| anyhow!("decoded JSON missing field '{}'", field.name()))?; |
| 170 | + columns.push(array); |
| 171 | + } |
| 172 | + } |
| 173 | + |
| 174 | + RecordBatch::try_new(self.final_schema.clone(), columns) |
| 175 | + .map_err(|e| anyhow!("build JSON batch with _timestamp: {e}")) |
94 | 176 | } |
95 | 177 | } |
| 178 | + |
| 179 | +fn schema_without_timestamp(schema: &Schema) -> SchemaRef { |
| 180 | + let fields = schema |
| 181 | + .fields() |
| 182 | + .iter() |
| 183 | + .filter(|f| f.name() != TIMESTAMP_FIELD) |
| 184 | + .cloned() |
| 185 | + .collect::<Vec<_>>(); |
| 186 | + Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone())) |
| 187 | +} |
0 commit comments