diff --git a/.travis.yml b/.travis.yml
index d682f24..212072f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,7 @@ addons:
   apt:
     packages:
     - antiword
+    - poppler-utils
     - language-pack-he
 install:
 - ".travis/install.sh"
diff --git a/knesset_data/protocols/base.py b/knesset_data/protocols/base.py
index 21f1744..1772b45 100644
--- a/knesset_data/protocols/base.py
+++ b/knesset_data/protocols/base.py
@@ -2,7 +2,7 @@
 import contextlib
 from tempfile import mkstemp
 import os
-from .utils import antiword, antixml
+from .utils import antiword, antixml, pdftotext
 from cached_property import cached_property
 import io, requests
 import logging
@@ -15,8 +15,9 @@ class BaseProtocolFile(object):
 
     temp_file_suffix = "temp_knesset_data_protocols_"
 
-    def __init__(self, file, proxies=None):
+    def __init__(self, file, proxies=None, extension=None):
         self._file_type, self._file_data = file
+        self.extension = extension
         self._cleanup = []
         self._proxies = proxies if proxies else {}
 
@@ -55,6 +56,8 @@ def file_extension(self):
         if self._file_type in ("filename", "url") and self._file_data:
             filename, file_extension = os.path.splitext(self._file_data)
             return file_extension[1:]
+        if self.extension is not None:
+            return self.extension
         else:
             return None
 
@@ -86,13 +89,24 @@ def antiword_xml(self):
     def antiword_text(self):
         return antixml(self.antiword_xml)
 
+    @cached_property
+    def pdf_text(self):
+        """ Uses pdftotext to extract text from a PDF document.
+
+        Pages are separated by a 0x0c (form feed) character.
+        """
+
+        text = pdftotext(self.file_name).decode('utf-8')
+        # FIXME: remove explicit bidi characters?
+        return text
+
     def _close(self):
         [func() for func in self._cleanup]
 
     @classmethod
     @contextlib.contextmanager
-    def _get_from(cls, file_type, file_data, proxies=None):
-        obj = cls((file_type, file_data), proxies=proxies)
+    def _get_from(cls, file_type, file_data, proxies=None, extension=None):
+        obj = cls((file_type, file_data), proxies=proxies, extension=extension)
         try:
             yield obj
         finally:
@@ -110,8 +124,8 @@ def get_from_url(cls, url, proxies=None):
 
     @classmethod
     @contextlib.contextmanager
-    def get_from_file(cls, file):
-        with cls._get_from('file', file) as p: yield p
+    def get_from_file(cls, file, extension=None):
+        with cls._get_from('file', file, extension=extension) as p: yield p
 
     @classmethod
     @contextlib.contextmanager
diff --git a/knesset_data/protocols/committee.py b/knesset_data/protocols/committee.py
index 9d36774..aded9a2 100644
--- a/knesset_data/protocols/committee.py
+++ b/knesset_data/protocols/committee.py
@@ -50,7 +50,14 @@ def text(self):
         if self._file_type == 'text':
             return self._file_data
         else:
-            text = decode(self.antiword_text, 'utf-8')
+            # Callers that give no extension keep the legacy antiword path.
+            extension = (self.file_extension or 'doc').lower()
+            if extension == 'doc':
+                text = decode(self.antiword_text, 'utf-8')
+            elif extension == 'pdf':
+                text = decode(self.pdf_text, 'utf-8')
+            else:
+                text = ''
             tmp = text.split('OMNITECH')
             if len(tmp)==2 and len(tmp[0]) < 40:
                 text = tmp[1]
diff --git a/knesset_data/protocols/exceptions.py b/knesset_data/protocols/exceptions.py
index 89a78cf..a252162 100644
--- a/knesset_data/protocols/exceptions.py
+++ b/knesset_data/protocols/exceptions.py
@@ -8,3 +8,22 @@ def __str__(self):
             return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
         else:
             return "antiword processing failed: {output}".format(output=self.output.split("\n")[0])
+
+
+class PdftotextNotInstalledException(Exception):
+    """ raised when the pdftotext binary could not be found """
+    def __str__(self):
+        return "pdftotext binary does not seem to be installed. Try installing it using e.g. 'sudo apt-get install poppler-utils'"
+
+
+class PdftotextException(CalledProcessError):
+    """ raised when pdftotext exits with a non-zero return code """
+    def __str__(self):
+        output = self.output
+        # check_output gives bytes on python3; splitting bytes by "\n" fails
+        if output and not isinstance(output, str):
+            output = output.decode('utf-8', 'replace')
+        if not output:
+            return "pdftotext processing silently failed."
+        else:
+            return "pdftotext processing failed: {output}".format(output=output.split("\n")[0])
diff --git a/knesset_data/protocols/utils.py b/knesset_data/protocols/utils.py
index 62ddae3..7467167 100644
--- a/knesset_data/protocols/utils.py
+++ b/knesset_data/protocols/utils.py
@@ -2,8 +2,10 @@
 import logging
 import subprocess
 import os
+import errno
 import xml.etree.ElementTree as ET
-from .exceptions import AntiwordException
+from .exceptions import AntiwordException, PdftotextException, \
+        PdftotextNotInstalledException
 import six
 
 # solve issues with unicode for python3/2
@@ -44,6 +46,31 @@ def antiword(filename):
     return xmldata
 
 
+def pdftotext(filename):
+    """ returns the text of a PDF file given by its file name.
+
+    Uses pdftotext from package poppler-utils on Debian.
+
+    Returns the raw pdftotext output (bytes); decoding is left to the caller.
+    Raises IOError if the file does not exist,
+    PdftotextNotInstalledException if the pdftotext binary is not found and
+    PdftotextException if pdftotext exits with a non-zero return code.
+    """
+    if not os.path.exists(filename):
+        raise IOError('File not found: %s'%filename)
+    try:
+        text = subprocess.check_output(['pdftotext', filename, '-'],
+                stderr=subprocess.STDOUT)
+    except OSError as e:
+        # FileNotFoundError does not exist on python2; ENOENT covers both
+        if e.errno == errno.ENOENT:
+            raise PdftotextNotInstalledException()
+        raise
+    except subprocess.CalledProcessError as e:
+        raise PdftotextException(e.returncode, e.cmd, e.output)
+    return text
+
+
 def fix_hyphens(text):
     return text.replace(u"\n\n–\n\n",u" – ")
 