|
14 | 14 | """ |
15 | 15 | import logging |
16 | 16 |
|
17 | | -from io import BytesIO as _io |
| 17 | +from io import BytesIO as _io # noqa |
18 | 18 |
|
19 | 19 | import zipfile |
20 | 20 | from abc import ABCMeta, abstractmethod |
@@ -437,6 +437,56 @@ def extract(self, **kwargs): |
437 | 437 | return df |
438 | 438 |
|
439 | 439 |
|
| 440 | +class ZIPXLSExtractor(XLSExtractor): |
| 441 | + """:class:`Extractor` for remote data, exposed in compressed xls files |
| 442 | + """ |
| 443 | + |
| 444 | + def __init__(self, io, xls_filepath, **kwargs): |
| 445 | + """Constructor method |
| 446 | +
|
| 447 | + :param io: the source URL of the zipped file |
| 448 | + :param xls_filepath: the path of the excel file to extract |
| 449 | + :param kwargs: kwargs used by XLSExtractor, plus ``verify_tls`` (default ``True``), |
| 450 | + which can be set to ``False`` to avoid https errors due to TLS verification failures |
| 451 | + """ |
| 452 | + self.verify_tls = kwargs.pop('verify_tls', True) |
| 453 | + self.xls_filepath = xls_filepath |
| 454 | + super().__init__(io, **kwargs) |
| 455 | + |
| 456 | + def extract(self, **kwargs): |
| 457 | + """Extracts data from remote, zipped xls source |
| 458 | +
|
| 459 | + Args: |
| 460 | + **kwargs: Arbitrary keyword arguments. |
| 461 | +
|
| 462 | + Returns: |
| 463 | + :class:`pandas.DataFrame`: The dataframe containing extracted items |
| 464 | + """ |
| 465 | + r = requests.get(self.io, verify=self.verify_tls) |
| 466 | + z = zipfile.ZipFile(_io(r.content)) |
| 467 | + try: |
| 468 | + z_filename = next( |
| 469 | + f.filename for f in z.filelist if self.xls_filepath in f.filename |
| 470 | + ) |
| 471 | + except StopIteration: |
| 472 | + raise Exception(f"Could not find file {self.xls_filepath} in zipped file from {self.io}") |
| 473 | + else: |
| 474 | + df = pd.read_excel( |
| 475 | + io=z.open(z_filename), |
| 476 | + skiprows=self.skiprows, |
| 477 | + skipfooter=self.skipfooter, |
| 478 | + sheet_name=self.sheet_name, |
| 479 | + header=self.header, |
| 480 | + dtype=self.dtype, |
| 481 | + converters=self.converters, |
| 482 | + na_filter=self.na_filter, |
| 483 | + na_values=self.na_values, |
| 484 | + keep_default_na=self.keep_default_na, |
| 485 | + ) |
| 486 | + |
| 487 | + return df |
| 488 | + |
| 489 | + |
440 | 490 | class FakeExtractor(Extractor): |
441 | 491 | """Extractor that does nothing""" |
442 | 492 |
|
|
0 commit comments