+use std::vec;
+
 use arrow::array::{
     Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array,
     Int16Array, Int32Array, Int64Array, Int8Array, StringArray, TimestampMicrosecondArray,
 };
 use arrow::datatypes::DataType;
 use clap::Parser;
 use futures::{StreamExt, TryStreamExt};
+use iceberg::spec::TableMetadata;
 use lakehouse_loader::delta_destination::object_store_keys_from_env;
 use lakehouse_loader::error::DataLoadingError;
 use lakehouse_loader::pg_arrow_source::PgArrowSource;
@@ -57,6 +60,10 @@ async fn test_pg_to_delta_e2e() {
     assert!(paths[2].to_string().ends_with("-c000.snappy.parquet"));
 }
 
+const DATA_FILEPATH_PATTERN: &str = r"^iceberg/data/part-00000-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.parquet$";
+const MANIFEST_FILEPATH_PATTERN: &str = r"^iceberg/metadata/manifest-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.avro$";
+const MANIFEST_LIST_FILEPATH_PATTERN: &str = r"^iceberg/metadata/manifest-list-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.avro$";
+
 #[tokio::test]
 async fn test_pg_to_iceberg() {
     let target_url = "s3://lhl-test-bucket/iceberg";
@@ -78,23 +85,40 @@ async fn test_pg_to_iceberg() {
     let (store, path) =
         object_store::parse_url_opts(&Url::parse(target_url).unwrap(), config).unwrap();
 
+    // THEN iceberg data and metadata files are written
     let mut paths = store
         .list(Some(&path))
         .map_ok(|m| m.location)
         .boxed()
         .try_collect::<Vec<Path>>()
         .await
         .unwrap();
-
     paths.sort();
-
-    // THEN iceberg data and metadata files are written
     assert_eq!(paths.len(), 5);
-    assert!(Regex::new(r"^iceberg/data/part-00000-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.parquet$").unwrap().is_match(paths[0].as_ref()));
-    assert!(Regex::new(r"^iceberg/metadata/manifest-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.avro$").unwrap().is_match(paths[1].as_ref()));
-    assert!(Regex::new(r"^iceberg/metadata/manifest-list-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}.avro$").unwrap().is_match(paths[2].as_ref()));
+    assert!(Regex::new(DATA_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[0].as_ref()));
+    assert!(Regex::new(MANIFEST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[1].as_ref()));
+    assert!(Regex::new(MANIFEST_LIST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[2].as_ref()));
     assert_eq!(&paths[3].to_string(), "iceberg/metadata/v0.metadata.json");
     assert_eq!(&paths[4].to_string(), "iceberg/metadata/version-hint.text");
+    // THEN iceberg metadata can be parsed
+    let metadata_bytes = store.get(&paths[3]).await.unwrap().bytes().await.unwrap();
+    let metadata_str = core::str::from_utf8(&metadata_bytes).unwrap();
+    let metadata = serde_json::from_str::<TableMetadata>(metadata_str).unwrap();
+    // THEN metadata contains a single snapshot with sequence number 1
+    assert_eq!(metadata.last_sequence_number(), 1);
+    assert_eq!(
+        metadata
+            .snapshots()
+            .map(|s| s.sequence_number())
+            .collect::<Vec<_>>(),
+        vec![1]
+    );
 
     // WHEN we try to write to an existing table without passing the overwrite flag
     // THEN the command errors out
@@ -149,6 +173,50 @@ async fn test_pg_to_iceberg() {
         "--overwrite",
     ];
     assert!(do_main(Cli::parse_from(args.clone())).await.is_ok());
+
+    // THEN iceberg data and metadata files are written
+    let mut paths = store
+        .list(Some(&path))
+        .map_ok(|m| m.location)
+        .boxed()
+        .try_collect::<Vec<Path>>()
+        .await
+        .unwrap();
+    paths.sort();
+    assert_eq!(paths.len(), 9);
+    assert!(Regex::new(DATA_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[0].as_ref()));
+    assert!(Regex::new(DATA_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[1].as_ref()));
+    assert!(Regex::new(MANIFEST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[2].as_ref()));
+    assert!(Regex::new(MANIFEST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[3].as_ref()));
+    assert!(Regex::new(MANIFEST_LIST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[4].as_ref()));
+    assert!(Regex::new(MANIFEST_LIST_FILEPATH_PATTERN)
+        .unwrap()
+        .is_match(paths[5].as_ref()));
+    assert_eq!(&paths[6].to_string(), "iceberg/metadata/v0.metadata.json");
+    assert_eq!(&paths[7].to_string(), "iceberg/metadata/v1.metadata.json");
+    assert_eq!(&paths[8].to_string(), "iceberg/metadata/version-hint.text");
+    // THEN iceberg metadata can be parsed
+    let metadata_bytes = store.get(&paths[7]).await.unwrap().bytes().await.unwrap();
+    let metadata_str = core::str::from_utf8(&metadata_bytes).unwrap();
+    let metadata = serde_json::from_str::<TableMetadata>(metadata_str).unwrap();
+    // THEN metadata contains two snapshots with sequence numbers 1 and 2
+    assert_eq!(metadata.last_sequence_number(), 2);
+    let mut snapshot_ids = metadata
+        .snapshots()
+        .map(|s| s.sequence_number())
+        .collect::<Vec<_>>();
+    snapshot_ids.sort();
+    assert_eq!(snapshot_ids, vec![1, 2]);
 }
 
 #[tokio::test]