Skip to content

Commit 1e5d6e5

Browse files
authored
apacheGH-40633: [C++][Python] Fix ORC crash when file contains unknown timezone (apache#45051)
### Rationale for this change If the timezone database is present on the system, but does not contain a timezone referenced in a ORC file, the ORC reader will crash with an uncaught C++ exception. This can happen for example on Ubuntu 24.04 where some timezone aliases have been removed from the main `tzdata` package to a `tzdata-legacy` package. If `tzdata-legacy` is not installed, trying to read a ORC file that references e.g. the "US/Pacific" timezone would crash. Here is a backtrace excerpt: ``` dremio#12 0x00007f1a3ce23a55 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6 dremio#13 0x00007f1a3ce39391 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6 dremio#14 0x00007f1a3f4accc4 in orc::loadTZDB(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#15 0x00007f1a3f4ad392 in std::call_once<orc::LazyTimezone::getImpl() const::{lambda()dremio#1}>(std::once_flag&, orc::LazyTimezone::getImpl() const::{lambda()dremio#1}&&)::{lambda()dremio#2}::_FUN() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#16 0x00007f1a4298bec3 in __pthread_once_slow (once_control=0xa5ca7c8, init_routine=0x7f1a3ce69420 <__once_proxy>) at ./nptl/pthread_once.c:116 dremio#17 0x00007f1a3f4a9ad0 in orc::LazyTimezone::getEpoch() const () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#18 0x00007f1a3f4e76b1 in orc::TimestampColumnReader::TimestampColumnReader(orc::Type const&, orc::StripeStreams&, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#19 0x00007f1a3f4e84ad in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#20 0x00007f1a3f4e8dd7 in orc::StructColumnReader::StructColumnReader(orc::Type const&, orc::StripeStreams&, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#21 0x00007f1a3f4e8532 in orc::buildReader(orc::Type const&, orc::StripeStreams&, bool, bool, bool) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#22 0x00007f1a3f4925e9 in orc::RowReaderImpl::startNextStripe() () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#23 0x00007f1a3f492c9d in orc::RowReaderImpl::next(orc::ColumnVectorBatch&) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 dremio#24 0x00007f1a3e6b251f in arrow::adapters::orc::ORCFileReader::Impl::ReadBatch(orc::RowReaderOptions const&, std::shared_ptr<arrow::Schema> const&, long) () from /tmp/arrow-HEAD.ArqTs/venv-wheel-3.12-manylinux_2_17_x86_64.manylinux2014_x86_64/lib/python3.12/site-packages/pyarrow/libarrow.so.1900 ``` ### What changes are included in this PR? Catch C++ exceptions when iterating ORC batches instead of letting them slip through. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: apache#40633 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 5ec8b64 commit 1e5d6e5

File tree

2 files changed

+65
-9
lines changed

2 files changed

+65
-9
lines changed

cpp/src/arrow/adapters/orc/adapter.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,28 +145,29 @@ class OrcStripeReader : public RecordBatchReader {
145145

146146
Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
147147
std::unique_ptr<liborc::ColumnVectorBatch> batch;
148-
ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
148+
std::unique_ptr<RecordBatchBuilder> builder;
149+
150+
ORC_BEGIN_CATCH_NOT_OK
151+
batch = row_reader_->createRowBatch(batch_size_);
149152

150153
const liborc::Type& type = row_reader_->getSelectedType();
151154
if (!row_reader_->next(*batch)) {
152155
out->reset();
153156
return Status::OK();
154157
}
155158

156-
std::unique_ptr<RecordBatchBuilder> builder;
157159
ARROW_ASSIGN_OR_RAISE(builder,
158160
RecordBatchBuilder::Make(schema_, pool_, batch->numElements));
159-
160161
// The top-level type must be a struct to read into an arrow table
161162
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
162163

163164
for (int i = 0; i < builder->num_fields(); i++) {
164165
RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
165166
batch->numElements, builder->GetField(i)));
166167
}
168+
ORC_END_CATCH_NOT_OK
167169

168-
ARROW_ASSIGN_OR_RAISE(*out, builder->Flush());
169-
return Status::OK();
170+
return builder->Flush().Value(out);
170171
}
171172

172173
private:
@@ -470,15 +471,13 @@ class ORCFileReader::Impl {
470471
int64_t nrows) {
471472
std::unique_ptr<liborc::RowReader> row_reader;
472473
std::unique_ptr<liborc::ColumnVectorBatch> batch;
474+
std::unique_ptr<RecordBatchBuilder> builder;
473475

474476
ORC_BEGIN_CATCH_NOT_OK
475477
row_reader = reader_->createRowReader(opts);
476478
batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
477-
ORC_END_CATCH_NOT_OK
478479

479-
std::unique_ptr<RecordBatchBuilder> builder;
480480
ARROW_ASSIGN_OR_RAISE(builder, RecordBatchBuilder::Make(schema, pool_, nrows));
481-
482481
// The top-level type must be a struct to read into an arrow table
483482
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
484483

@@ -489,6 +488,7 @@ class ORCFileReader::Impl {
489488
batch->numElements, builder->GetField(i)));
490489
}
491490
}
491+
ORC_END_CATCH_NOT_OK
492492

493493
return builder->Flush();
494494
}

python/pyarrow/tests/test_orc.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
import pytest
1918
import decimal
2019
import datetime
20+
from pathlib import Path
21+
import shutil
22+
import subprocess
23+
import sys
24+
25+
import pytest
2126

2227
import pyarrow as pa
2328
from pyarrow import fs
@@ -140,6 +145,57 @@ def test_example_using_json(filename, datadir):
140145
check_example_file(path, table, need_fix=True)
141146

142147

148+
def test_timezone_database_absent(datadir):
149+
# Example file relies on the timezone "US/Pacific". It should gracefully
150+
# fail, not crash, if the timezone database is not found.
151+
path = datadir / 'TestOrcFile.testDate1900.orc'
152+
code = f"""if 1:
153+
import os
154+
os.environ['TZDIR'] = '/tmp/non_existent'
155+
156+
from pyarrow import orc
157+
try:
158+
orc_file = orc.ORCFile({str(path)!r})
159+
orc_file.read()
160+
except Exception as e:
161+
assert "time zone database" in str(e).lower(), e
162+
else:
163+
assert False, "Should have raised exception"
164+
"""
165+
subprocess.run([sys.executable, "-c", code], check=True)
166+
167+
168+
def test_timezone_absent(datadir, tmpdir):
169+
# Example file relies on the timezone "US/Pacific". It should gracefully
170+
# fail, not crash, if the timezone database is present but the timezone
171+
# is not found (GH-40633).
172+
source_tzdir = Path('/usr/share/zoneinfo')
173+
if not source_tzdir.exists():
174+
pytest.skip(f"Test needs timezone database in {source_tzdir}")
175+
tzdir = Path(tmpdir / 'zoneinfo')
176+
try:
177+
shutil.copytree(source_tzdir, tzdir, symlinks=True)
178+
except OSError as e:
179+
pytest.skip(f"Failed to copy timezone database: {e}")
180+
(tzdir / 'US' / 'Pacific').unlink(missing_ok=True)
181+
182+
path = datadir / 'TestOrcFile.testDate1900.orc'
183+
code = f"""if 1:
184+
import os
185+
os.environ['TZDIR'] = {str(tzdir)!r}
186+
187+
from pyarrow import orc
188+
orc_file = orc.ORCFile({str(path)!r})
189+
try:
190+
orc_file.read()
191+
except Exception as e:
192+
assert "zoneinfo/US/Pacific" in str(e), e
193+
else:
194+
assert False, "Should have raised exception"
195+
"""
196+
subprocess.run([sys.executable, "-c", code], check=True)
197+
198+
143199
def test_orcfile_empty(datadir):
144200
from pyarrow import orc
145201

0 commit comments

Comments
 (0)