Skip to content

Commit 4679309

Browse files
committed
ZIPXLSExtractor extends XLSExtractor and allows the extraction of an XLS file from a zipped archive
1 parent dea0273 commit 4679309

File tree

1 file changed

+51
-1
lines changed

1 file changed

+51
-1
lines changed

ooetl/extractors/__init__.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"""
1515
import logging
1616

17-
from io import BytesIO as _io
17+
from io import BytesIO as _io # noqa
1818

1919
import zipfile
2020
from abc import ABCMeta, abstractmethod
@@ -437,6 +437,56 @@ def extract(self, **kwargs):
437437
return df
438438

439439

440+
class ZIPXLSExtractor(XLSExtractor):
    """:class:`Extractor` for remote data, exposed in compressed xls files

    The zip archive is downloaded from the URL stored in ``self.io``; the
    first archive member whose name contains ``xls_filepath`` is then parsed
    with :func:`pandas.read_excel`.
    """

    def __init__(self, io, xls_filepath, **kwargs):
        """Constructor method

        :param io: the source URL of the zipped file
        :param xls_filepath: the path (or a fragment of it) of the excel
            file to extract from the archive
        :param kwargs: kwargs used by XLSExtractor and ``verify_tls``,
            used to avoid https errors due to TLS verification failures
        """
        # ``verify_tls`` is consumed here so it is not forwarded to the
        # parent constructor along with the remaining kwargs.
        self.verify_tls = kwargs.pop('verify_tls', True)
        self.xls_filepath = xls_filepath
        super().__init__(io, **kwargs)

    def extract(self, **kwargs):
        """Extracts data from remote, zipped xls source

        Args:
            **kwargs: Arbitrary keyword arguments.

        Returns:
            :class:`pandas.DataFrame`: The dataframe containing extracted items

        Raises:
            requests.HTTPError: If the server answers with an error status.
            Exception: If no archive member matches ``xls_filepath``.
        """
        r = requests.get(self.io, verify=self.verify_tls)
        # Fail early on HTTP errors instead of trying to unzip an error page.
        r.raise_for_status()

        # Context managers make sure both the archive and the extracted
        # member are closed once pandas has finished reading.
        with zipfile.ZipFile(_io(r.content)) as z:
            # The match is a substring test: the first member whose name
            # contains ``xls_filepath`` wins.
            z_filename = next(
                (name for name in z.namelist() if self.xls_filepath in name),
                None,
            )
            if z_filename is None:
                raise Exception(f"Could not find file {self.xls_filepath} in zipped file from {self.io}")

            with z.open(z_filename) as xls_file:
                df = pd.read_excel(
                    io=xls_file,
                    skiprows=self.skiprows,
                    skipfooter=self.skipfooter,
                    sheet_name=self.sheet_name,
                    header=self.header,
                    dtype=self.dtype,
                    converters=self.converters,
                    na_filter=self.na_filter,
                    na_values=self.na_values,
                    keep_default_na=self.keep_default_na,
                )

        return df
488+
489+
440490
class FakeExtractor(Extractor):
441491
"""Extractor that does nothing"""
442492

0 commit comments

Comments
 (0)