Skip to content

Commit 35ad745

Browse files
author
Devdutt Shenoi
committed
refactor: schema mismatch check
1 parent 18c173b commit 35ad745

File tree

1 file changed

+47
-66
lines changed

1 file changed

+47
-66
lines changed

src/event/format/mod.rs

Lines changed: 47 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -171,13 +171,18 @@ pub trait EventFormat: Sized {
171171
)),
172172
);
173173

174-
// prepare the record batch and new fields to be added
175-
let mut new_schema = Arc::new(Schema::new(schema));
176-
if !is_schema_matching(new_schema.clone(), storage_schema, static_schema_flag) {
174+
if !is_schema_matching(&schema, storage_schema, static_schema_flag) {
177175
return Err(anyhow!("Schema mismatch"));
178176
}
179-
new_schema =
180-
update_field_type_in_schema(new_schema, None, time_partition, None, schema_version);
177+
178+
// prepare the record batch and new fields to be added
179+
let new_schema = update_field_type_in_schema(
180+
Arc::new(Schema::new(schema)),
181+
None,
182+
time_partition,
183+
None,
184+
schema_version,
185+
);
181186

182187
let mut rb = Self::decode(data, new_schema.clone())?;
183188
rb = replace_columns(
@@ -204,26 +209,22 @@ pub trait EventFormat: Sized {
204209
) -> Result<Event, AnyError>;
205210
}
206211

212+
/// Determines if a schema matches the storage schema based on configuration.
213+
/// Returns `true` if the schemas match according to the rules:
214+
/// - If `static_schema_flag` is `false`, always returns `true` (flexible schema mode)
215+
/// - If `static_schema_flag` is `true`, returns `true` only if all fields in `schema`
216+
/// exist in `storage_schema` with exactly matching properties
207217
fn is_schema_matching(
208-
new_schema: Arc<Schema>,
218+
schema: &[Arc<Field>],
209219
storage_schema: &HashMap<String, Arc<Field>>,
210220
static_schema_flag: bool,
211221
) -> bool {
212-
if !static_schema_flag {
213-
return true;
214-
}
215-
for field in new_schema.fields() {
216-
let Some(storage_field) = storage_schema.get(field.name()) else {
217-
return false;
218-
};
219-
if field.name() != storage_field.name() {
220-
return false;
221-
}
222-
if field.data_type() != storage_field.data_type() {
223-
return false;
224-
}
225-
}
226-
true
222+
!static_schema_flag
223+
|| !schema.iter().any(|field| {
224+
storage_schema
225+
.get(field.name())
226+
.is_none_or(|storage_field| storage_field != field)
227+
})
227228
}
228229

229230
pub fn get_existing_field_names(
@@ -374,7 +375,7 @@ pub fn override_data_type(
374375
mod tests {
375376
use std::{collections::HashMap, sync::Arc};
376377

377-
use arrow_schema::{DataType, Field, Schema};
378+
use arrow_schema::{DataType, Field};
378379

379380
use super::*;
380381

@@ -383,13 +384,8 @@ mod tests {
383384
Arc::new(Field::new(name.to_string(), data_type, true))
384385
}
385386

386-
// Helper function to create a test schema
387-
fn create_schema(fields: Vec<Arc<Field>>) -> Arc<Schema> {
388-
Arc::new(Schema::new(fields))
389-
}
390-
391387
// Helper function to create a storage schema HashMap
392-
fn create_storage_schema(fields: Vec<Arc<Field>>) -> HashMap<String, Arc<Field>> {
388+
fn create_storage_schema(fields: &[Arc<Field>]) -> HashMap<String, Arc<Field>> {
393389
let mut storage_schema = HashMap::new();
394390
for field in fields {
395391
storage_schema.insert(field.name().to_string(), field.clone());
@@ -403,11 +399,11 @@ mod tests {
403399
let field1 = create_field("id", DataType::Int32);
404400
let field2 = create_field("name", DataType::Utf8);
405401

406-
let schema = create_schema(vec![field1.clone(), field2.clone()]);
407-
let storage_schema = create_storage_schema(vec![field1.clone()]); // Missing field2
402+
let schema = [field1.clone(), field2.clone()];
403+
let storage_schema = create_storage_schema(&[field1.clone()]); // Missing field2
408404

409405
// Even though schemas don't match, function should return true because static_schema_flag is false
410-
assert!(is_schema_matching(schema, &storage_schema, false));
406+
assert!(is_schema_matching(&schema, &storage_schema, false));
411407
}
412408

413409
#[test]
@@ -416,10 +412,10 @@ mod tests {
416412
let field1 = create_field("id", DataType::Int32);
417413
let field2 = create_field("name", DataType::Utf8);
418414

419-
let schema = create_schema(vec![field1.clone(), field2.clone()]);
420-
let storage_schema = create_storage_schema(vec![field1.clone(), field2.clone()]);
415+
let schema = [field1.clone(), field2.clone()];
416+
let storage_schema = create_storage_schema(&schema);
421417

422-
assert!(is_schema_matching(schema, &storage_schema, true));
418+
assert!(is_schema_matching(&schema, &storage_schema, true));
423419
}
424420

425421
#[test]
@@ -428,10 +424,10 @@ mod tests {
428424
let field1 = create_field("id", DataType::Int32);
429425
let field2 = create_field("name", DataType::Utf8);
430426

431-
let schema = create_schema(vec![field1.clone(), field2.clone()]);
432-
let storage_schema = create_storage_schema(vec![field1.clone()]); // Missing field2
427+
let schema = [field1.clone(), field2.clone()];
428+
let storage_schema = create_storage_schema(&[field1.clone()]); // Missing field2
433429

434-
assert!(!is_schema_matching(schema, &storage_schema, true));
430+
assert!(!is_schema_matching(&schema, &storage_schema, true));
435431
}
436432

437433
#[test]
@@ -442,10 +438,10 @@ mod tests {
442438
let field1_different_type = create_field("id", DataType::Int64);
443439
let field2 = create_field("name", DataType::Utf8);
444440

445-
let schema = create_schema(vec![field1.clone(), field2.clone()]);
446-
let storage_schema = create_storage_schema(vec![field1_different_type, field2.clone()]);
441+
let schema = [field1.clone(), field2.clone()];
442+
let storage_schema = create_storage_schema(&[field1_different_type, field2.clone()]);
447443

448-
assert!(!is_schema_matching(schema, &storage_schema, true));
444+
assert!(!is_schema_matching(&schema, &storage_schema, true));
449445
}
450446

451447
#[test]
@@ -456,53 +452,38 @@ mod tests {
456452
let field2 = create_field("name", DataType::Utf8);
457453
let extra_field = create_field("extra", DataType::Boolean);
458454

459-
let schema = create_schema(vec![field1.clone(), field2.clone()]);
460-
let storage_schema =
461-
create_storage_schema(vec![field1.clone(), field2.clone(), extra_field]);
455+
let schema = [field1.clone(), field2.clone()];
456+
let storage_schema = create_storage_schema(&[field1.clone(), field2.clone(), extra_field]);
462457

463-
assert!(is_schema_matching(schema, &storage_schema, true));
458+
assert!(is_schema_matching(&schema, &storage_schema, true));
464459
}
465460

466461
#[test]
467462
fn test_empty_new_schema() {
468463
// When new schema is empty, should return true
469464
let field1 = create_field("id", DataType::Int32);
470465

471-
let empty_schema = create_schema(vec![]);
472-
let storage_schema = create_storage_schema(vec![field1.clone()]);
466+
let storage_schema = create_storage_schema(&[field1.clone()]);
473467

474-
assert!(is_schema_matching(
475-
empty_schema,
476-
&storage_schema,
477-
true
478-
));
468+
assert!(is_schema_matching(&[], &storage_schema, true));
479469
}
480470

481471
#[test]
482472
fn test_empty_storage_schema() {
483473
// When storage schema is empty but new schema has fields, should return false
484474
let field1 = create_field("id", DataType::Int32);
485475

486-
let schema = create_schema(vec![field1.clone()]);
487-
let empty_storage_schema: HashMap<String, Arc<Field>> = HashMap::new();
476+
let schema = [field1.clone()];
477+
let empty_storage_schema = HashMap::new();
488478

489-
assert!(!is_schema_matching(
490-
schema,
491-
&empty_storage_schema,
492-
true
493-
));
479+
assert!(!is_schema_matching(&schema, &empty_storage_schema, true));
494480
}
495481

496482
#[test]
497483
fn test_both_empty_schemas() {
498484
// When both schemas are empty, should return true
499-
let empty_schema = create_schema(vec![]);
500-
let empty_storage_schema: HashMap<String, Arc<Field>> = HashMap::new();
501-
502-
assert!(is_schema_matching(
503-
empty_schema,
504-
&empty_storage_schema,
505-
true
506-
));
485+
let empty_storage_schema = HashMap::new();
486+
487+
assert!(is_schema_matching(&[], &empty_storage_schema, true));
507488
}
508489
}

0 commit comments

Comments
 (0)