Commit 914612f

feat: Add integration tests for tpchgen-cli (#156)

* Add basic tpchgen-cli integration tests
* Add test for reading parts
* Add parquet conversion test
* Cleanup
* Rename test
* Remove println
* fix: Bug introduced by refactor

1 parent 19812e2 commit 914612f

File tree

3 files changed: +200 -2 lines changed


tpchgen-cli/Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -20,3 +20,9 @@ futures = "0.3.31"
 num_cpus = "1.0"
 log = "0.4.26"
 env_logger = "0.11.7"
+
+[dev-dependencies]
+assert_cmd = "2.0"
+predicates = "3.0"
+tempfile = "3.20.0"
+flate2 = "1.1.0"

tpchgen-cli/src/plan.rs

Lines changed: 2 additions & 2 deletions
@@ -126,15 +126,15 @@ impl GenerationPlan {
     }
 }

-/// Converts the `GenerationPlan` into an iterator of (num_parts, part_number)
+/// Converts the `GenerationPlan` into an iterator of (part_number, num_parts)
 impl IntoIterator for GenerationPlan {
     type Item = (i32, i32);
     type IntoIter = std::vec::IntoIter<Self::Item>;

     fn into_iter(self) -> Self::IntoIter {
         self.part_list
             .into_iter()
-            .map(|part_number| (self.part_count, part_number))
+            .map(|part_number| (part_number, self.part_count))
             .collect::<Vec<_>>()
             .into_iter()
     }
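
With this change the doc comment matches the code: each iterator item is (part_number, num_parts), part number first. A minimal sketch of a consumer under that assumption, where `plan` stands for a `GenerationPlan` obtained through the crate's own constructors (the loop body is illustrative only, not code from this commit):

    // Destructure each item in the corrected (part_number, num_parts) order.
    for (part_number, num_parts) in plan {
        // e.g. a four-part plan yields (1, 4), (2, 4), (3, 4), (4, 4)
        println!("generating part {part_number} of {num_parts}");
    }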

tpchgen-cli/tests/cli_integration.rs

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
use assert_cmd::Command;
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use std::fs;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::sync::Arc;
use tempfile::tempdir;
use tpchgen::generators::OrderGenerator;
use tpchgen_arrow::{OrderArrow, RecordBatchIterator};

/// Test TBL output for scale factor 0.001 using tpchgen-cli
#[test]
fn test_tpchgen_cli_tbl_scale_factor_0_001() {
    // Create a temporary directory
    let temp_dir = tempdir().expect("Failed to create temporary directory");

    // Run the tpchgen-cli command
    Command::cargo_bin("tpchgen-cli")
        .expect("Binary not found")
        .arg("--scale-factor")
        .arg("0.001")
        .arg("--output-dir")
        .arg(temp_dir.path())
        .assert()
        .success();

    // List of expected files
    let expected_files = vec![
        "customer.tbl",
        "lineitem.tbl",
        "nation.tbl",
        "orders.tbl",
        "part.tbl",
        "partsupp.tbl",
        "region.tbl",
        "supplier.tbl",
    ];

    // Verify that all expected files are created
    for file in &expected_files {
        let generated_file = temp_dir.path().join(file);
        assert!(
            generated_file.exists(),
            "File {:?} does not exist",
            generated_file
        );
        let generated_contents = fs::read(generated_file).expect("Failed to read generated file");
        let generated_contents = String::from_utf8(generated_contents)
            .expect("Failed to convert generated contents to string");

        // load the reference file
        let reference_file = format!("../tpchgen/data/sf-0.001/{}.gz", file);
        let reference_contents = match read_gzipped_file_to_string(&reference_file) {
            Ok(contents) => contents,
            Err(e) => {
                panic!("Failed to read reference file {reference_file}: {e}");
            }
        };

        assert_eq!(
            generated_contents, reference_contents,
            "Contents of {:?} do not match reference",
            file
        );
    }
}

/// Test generating the order table using --parts and --part options
#[test]
fn test_tpchgen_cli_parts() {
    // Create a temporary directory
    let temp_dir = tempdir().expect("Failed to create temporary directory");

    // generate 4 parts of the orders table with scale factor 0.001
    // into directories /part1, /part2, /part3, /part4
    let num_parts = 4;
    for part in 1..=num_parts {
        let part_dir = temp_dir.path().join(format!("part{part}"));
        fs::create_dir(&part_dir).expect("Failed to create part directory");

        // Run the tpchgen-cli command for each part
        Command::cargo_bin("tpchgen-cli")
            .expect("Binary not found")
            .arg("--scale-factor")
            .arg("0.001")
            .arg("--output-dir")
            .arg(&part_dir)
            .arg("--parts")
            .arg(num_parts.to_string())
            .arg("--part")
            .arg(part.to_string())
            .arg("--tables")
            .arg("orders")
            .assert()
            .success();
    }

    // Read the generated files into a single buffer and compare them
    // to the contents of the reference file
    let mut output_contents = Vec::new();
    for part in 1..=4 {
        let generated_file = temp_dir
            .path()
            .join(format!("part{part}"))
            .join("orders.tbl");
        assert!(
            generated_file.exists(),
            "File {:?} does not exist",
            generated_file
        );
        let generated_contents =
            fs::read_to_string(generated_file).expect("Failed to read generated file");
        output_contents.append(&mut generated_contents.into_bytes());
    }
    let output_contents =
        String::from_utf8(output_contents).expect("Failed to convert output contents to string");

    // load the reference file
    let reference_file = read_reference_file("orders", "0.001");
    assert_eq!(output_contents, reference_file);
}

#[tokio::test]
async fn test_write_parquet_orders() {
    // Run the CLI command to generate parquet data
    let output_dir = tempdir().unwrap();
    let output_path = output_dir.path().join("orders.parquet");
    Command::cargo_bin("tpchgen-cli")
        .expect("Binary not found")
        .arg("--format")
        .arg("parquet")
        .arg("--tables")
        .arg("orders")
        .arg("--scale-factor")
        .arg("0.001")
        .arg("--output-dir")
        .arg(output_dir.path())
        .assert()
        .success();

    let batch_size = 4000;

    // Create the reference Arrow data using OrderArrow
    let generator = OrderGenerator::new(0.001, 1, 1);
    let mut arrow_generator = OrderArrow::new(generator).with_batch_size(batch_size);

    // Read the generated parquet file
    let file = File::open(&output_path).expect("Failed to open parquet file");
    let options = ArrowReaderOptions::new().with_schema(Arc::clone(arrow_generator.schema()));

    let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)
        .expect("Failed to create ParquetRecordBatchReaderBuilder")
        .with_batch_size(batch_size)
        .build()
        .expect("Failed to build ParquetRecordBatchReader");

    // Compare the record batches
    for batch in reader {
        let parquet_batch = batch.expect("Failed to read record batch from parquet");
        let arrow_batch = arrow_generator
            .next()
            .expect("Failed to generate record batch from OrderArrow");
        assert_eq!(
            parquet_batch, arrow_batch,
            "Mismatch between parquet and arrow record batches"
        );
    }
}

fn read_gzipped_file_to_string<P: AsRef<Path>>(path: P) -> Result<String, std::io::Error> {
    let file = File::open(path)?;
    let mut decoder = flate2::read::GzDecoder::new(file);
    let mut contents = Vec::new();
    decoder.read_to_end(&mut contents)?;
    let contents = String::from_utf8(contents)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
    Ok(contents)
}

/// Reads the reference file for the specified table and scale factor.
///
/// example usage: `read_reference_file("orders", "0.001")`
fn read_reference_file(table_name: &str, scale_factor: &str) -> String {
    let reference_file = format!("../tpchgen/data/sf-{scale_factor}/{table_name}.tbl.gz");
    match read_gzipped_file_to_string(&reference_file) {
        Ok(contents) => contents,
        Err(e) => {
            panic!("Failed to read reference file {reference_file}: {e}");
        }
    }
}
