|
| 1 | +use assert_cmd::Command; |
| 2 | +use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; |
| 3 | +use std::fs; |
| 4 | +use std::fs::File; |
| 5 | +use std::io::Read; |
| 6 | +use std::path::Path; |
| 7 | +use std::sync::Arc; |
| 8 | +use tempfile::tempdir; |
| 9 | +use tpchgen::generators::OrderGenerator; |
| 10 | +use tpchgen_arrow::{OrderArrow, RecordBatchIterator}; |
| 11 | + |
| 12 | +/// Test TBL output for scale factor 0.001 using tpchgen-cli |
| 13 | +#[test] |
| 14 | +fn test_tpchgen_cli_tbl_scale_factor_0_001() { |
| 15 | + // Create a temporary directory |
| 16 | + let temp_dir = tempdir().expect("Failed to create temporary directory"); |
| 17 | + |
| 18 | + // Run the tpchgen-cli command |
| 19 | + Command::cargo_bin("tpchgen-cli") |
| 20 | + .expect("Binary not found") |
| 21 | + .arg("--scale-factor") |
| 22 | + .arg("0.001") |
| 23 | + .arg("--output-dir") |
| 24 | + .arg(temp_dir.path()) |
| 25 | + .assert() |
| 26 | + .success(); |
| 27 | + |
| 28 | + // List of expected files |
| 29 | + let expected_files = vec![ |
| 30 | + "customer.tbl", |
| 31 | + "lineitem.tbl", |
| 32 | + "nation.tbl", |
| 33 | + "orders.tbl", |
| 34 | + "part.tbl", |
| 35 | + "partsupp.tbl", |
| 36 | + "region.tbl", |
| 37 | + "supplier.tbl", |
| 38 | + ]; |
| 39 | + |
| 40 | + // Verify that all expected files are created |
| 41 | + for file in &expected_files { |
| 42 | + let generated_file = temp_dir.path().join(file); |
| 43 | + assert!( |
| 44 | + generated_file.exists(), |
| 45 | + "File {:?} does not exist", |
| 46 | + generated_file |
| 47 | + ); |
| 48 | + let generated_contents = fs::read(generated_file).expect("Failed to read generated file"); |
| 49 | + let generated_contents = String::from_utf8(generated_contents) |
| 50 | + .expect("Failed to convert generated contents to string"); |
| 51 | + |
| 52 | + // load the reference file |
| 53 | + let reference_file = format!("../tpchgen/data/sf-0.001/{}.gz", file); |
| 54 | + let reference_contents = match read_gzipped_file_to_string(&reference_file) { |
| 55 | + Ok(contents) => contents, |
| 56 | + Err(e) => { |
| 57 | + panic!("Failed to read reference file {reference_file}: {e}"); |
| 58 | + } |
| 59 | + }; |
| 60 | + |
| 61 | + assert_eq!( |
| 62 | + generated_contents, reference_contents, |
| 63 | + "Contents of {:?} do not match reference", |
| 64 | + file |
| 65 | + ); |
| 66 | + } |
| 67 | +} |
| 68 | + |
| 69 | +/// Test generating the order table using --parts and --part options |
| 70 | +#[test] |
| 71 | +fn test_tpchgen_cli_parts() { |
| 72 | + // Create a temporary directory |
| 73 | + let temp_dir = tempdir().expect("Failed to create temporary directory"); |
| 74 | + |
| 75 | + // generate 4 parts of the orders table with scale factor 0.001 |
| 76 | + // into directories /part1, /part2, /part3, /part4 |
| 77 | + let num_parts = 4; |
| 78 | + for part in 1..=num_parts { |
| 79 | + let part_dir = temp_dir.path().join(format!("part{part}")); |
| 80 | + fs::create_dir(&part_dir).expect("Failed to create part directory"); |
| 81 | + |
| 82 | + // Run the tpchgen-cli command for each part |
| 83 | + Command::cargo_bin("tpchgen-cli") |
| 84 | + .expect("Binary not found") |
| 85 | + .arg("--scale-factor") |
| 86 | + .arg("0.001") |
| 87 | + .arg("--output-dir") |
| 88 | + .arg(&part_dir) |
| 89 | + .arg("--parts") |
| 90 | + .arg(num_parts.to_string()) |
| 91 | + .arg("--part") |
| 92 | + .arg(part.to_string()) |
| 93 | + .arg("--tables") |
| 94 | + .arg("orders") |
| 95 | + .assert() |
| 96 | + .success(); |
| 97 | + } |
| 98 | + |
| 99 | + // Read the generated files into a single buffer and compare them |
| 100 | + // to the contents of the reference file |
| 101 | + let mut output_contents = Vec::new(); |
| 102 | + for part in 1..=4 { |
| 103 | + let generated_file = temp_dir |
| 104 | + .path() |
| 105 | + .join(format!("part{part}")) |
| 106 | + .join("orders.tbl"); |
| 107 | + assert!( |
| 108 | + generated_file.exists(), |
| 109 | + "File {:?} does not exist", |
| 110 | + generated_file |
| 111 | + ); |
| 112 | + let generated_contents = |
| 113 | + fs::read_to_string(generated_file).expect("Failed to read generated file"); |
| 114 | + output_contents.append(&mut generated_contents.into_bytes()); |
| 115 | + } |
| 116 | + let output_contents = |
| 117 | + String::from_utf8(output_contents).expect("Failed to convert output contents to string"); |
| 118 | + |
| 119 | + // load the reference file |
| 120 | + let reference_file = read_reference_file("orders", "0.001"); |
| 121 | + assert_eq!(output_contents, reference_file); |
| 122 | +} |
| 123 | + |
| 124 | +#[tokio::test] |
| 125 | +async fn test_write_parquet_orders() { |
| 126 | + // Run the CLI command to generate parquet data |
| 127 | + let output_dir = tempdir().unwrap(); |
| 128 | + let output_path = output_dir.path().join("orders.parquet"); |
| 129 | + Command::cargo_bin("tpchgen-cli") |
| 130 | + .expect("Binary not found") |
| 131 | + .arg("--format") |
| 132 | + .arg("parquet") |
| 133 | + .arg("--tables") |
| 134 | + .arg("orders") |
| 135 | + .arg("--scale-factor") |
| 136 | + .arg("0.001") |
| 137 | + .arg("--output-dir") |
| 138 | + .arg(output_dir.path()) |
| 139 | + .assert() |
| 140 | + .success(); |
| 141 | + |
| 142 | + let batch_size = 4000; |
| 143 | + |
| 144 | + // Create the reference Arrow data using OrderArrow |
| 145 | + let generator = OrderGenerator::new(0.001, 1, 1); |
| 146 | + let mut arrow_generator = OrderArrow::new(generator).with_batch_size(batch_size); |
| 147 | + |
| 148 | + // Read the generated parquet file |
| 149 | + let file = File::open(&output_path).expect("Failed to open parquet file"); |
| 150 | + let options = ArrowReaderOptions::new().with_schema(Arc::clone(arrow_generator.schema())); |
| 151 | + |
| 152 | + let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options) |
| 153 | + .expect("Failed to create ParquetRecordBatchReaderBuilder") |
| 154 | + .with_batch_size(batch_size) |
| 155 | + .build() |
| 156 | + .expect("Failed to build ParquetRecordBatchReader"); |
| 157 | + |
| 158 | + // Compare the record batches |
| 159 | + for batch in reader { |
| 160 | + let parquet_batch = batch.expect("Failed to read record batch from parquet"); |
| 161 | + let arrow_batch = arrow_generator |
| 162 | + .next() |
| 163 | + .expect("Failed to generate record batch from OrderArrow"); |
| 164 | + assert_eq!( |
| 165 | + parquet_batch, arrow_batch, |
| 166 | + "Mismatch between parquet and arrow record batches" |
| 167 | + ); |
| 168 | + } |
| 169 | +} |
| 170 | + |
| 171 | +fn read_gzipped_file_to_string<P: AsRef<Path>>(path: P) -> Result<String, std::io::Error> { |
| 172 | + let file = File::open(path)?; |
| 173 | + let mut decoder = flate2::read::GzDecoder::new(file); |
| 174 | + let mut contents = Vec::new(); |
| 175 | + decoder.read_to_end(&mut contents)?; |
| 176 | + let contents = String::from_utf8(contents) |
| 177 | + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; |
| 178 | + Ok(contents) |
| 179 | +} |
| 180 | + |
| 181 | +/// Reads the reference file for the specified table and scale factor. |
| 182 | +/// |
| 183 | +/// example usage: `read_reference_file("orders", "0.001")` |
| 184 | +fn read_reference_file(table_name: &str, scale_factor: &str) -> String { |
| 185 | + let reference_file = format!("../tpchgen/data/sf-{scale_factor}/{table_name}.tbl.gz"); |
| 186 | + match read_gzipped_file_to_string(&reference_file) { |
| 187 | + Ok(contents) => contents, |
| 188 | + Err(e) => { |
| 189 | + panic!("Failed to read reference file {reference_file}: {e}"); |
| 190 | + } |
| 191 | + } |
| 192 | +} |
0 commit comments