# iepy/preprocess/ner/regexp.py
# Reconstructed from a line-mangled diff; angle-bracketed spans inside the
# regex literals had been stripped and were restored from the
# nltk.text.TokenSearcher preprocessing this module adapts.
import re

from nltk.text import TokenSearcher as NLTKTokenSearcher

from iepy.preprocess.ner.base import BaseNERRunner


class RegExpNERRunner(BaseNERRunner):
    """NER runner that finds entity occurrences by matching a token-level
    regular expression (see TokenSearcher) against a document's tokens.
    """

    def __init__(self, label, regexp, override=False):
        """
        :param label: entity kind assigned to every match (e.g. 'disease').
        :param regexp: token-level regular expression; tokens are written
            inside single angle brackets, e.g. '<HIV>|<Hepatitis> <[A-C]>'.
        :param override: forwarded to BaseNERRunner.
        """
        super(RegExpNERRunner, self).__init__(override=override)
        self.label = label
        self.regexp = regexp

    def run_ner(self, doc):
        """Return the entity occurrences found in *doc*."""
        entities = []
        searcher = TokenSearcher(doc.tokens)
        for match in searcher.finditer(self.regexp):
            entity_oc = self.process_match(match)
            # process_match may return a single occurrence or a list of them
            # (subclasses may override it to do either).
            if isinstance(entity_oc, list):
                entities.extend(entity_oc)
            else:
                entities.append(entity_oc)
        return entities

    def process_match(self, match):
        """Build one entity occurrence from a TokenSearcher match.

        Subclasses may override this; the result may be a single occurrence
        or a list of occurrences.
        """
        name = ' '.join(match.group())
        offset, offset_end = match.span()
        return self.build_occurrence(name, self.label, name, offset, offset_end)


class TokenSearcher(NLTKTokenSearcher):
    """
    From nltk.text.TokenSearcher documentation:

    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.

    This subclass adds ``finditer()``, which yields MatchObject instances
    that translate character offsets back into token offsets.
    """

    def __init__(self, tokens):
        # Escape literal < and > inside tokens with \< and \>, then join the
        # tokens with '><' markers.
        _raw = '><'.join(
            w.replace('<', '\\<').replace('>', '\\>') for w in tokens)
        # Prepend '><' instead of '<' for easier token counting: every token,
        # including the first, is then preceded by exactly one '><'.
        self._raw = '><' + _raw + '>'

    def finditer(self, regexp):
        """Yield a MatchObject for every match of the token-level *regexp*."""
        regexp = preprocess_regexp(regexp)
        # A plain for-loop replaces the original while/next/bare-except:
        # the bare except silently swallowed *any* error, not just the
        # StopIteration it was meant to absorb.
        for m in re.finditer(regexp, self._raw):
            start, end = m.span()
            # Each token is preceded by one '><', so counting them in the
            # prefix gives the token index of a character position.
            token_start = self._raw[:start].count('><')
            token_end = self._raw[:end].count('><')
            yield MatchObject(m, token_start, token_end)


class MatchObject:
    """Adapter over an ``re`` match that reports token-level groups/spans."""

    def __init__(self, m, token_start, token_end):
        self.m = m
        self.all = m.group()
        self.all_start, self.all_end = m.span()
        self.token_start = token_start
        self.token_end = token_end

    def group(self, *args):
        """Return the matched group as a list of token strings, or None."""
        result = self.m.group(*args)
        if result:
            # Drop the outer '<' and '>', split on the '><' separators.
            return result[1:-1].split('><')
        else:
            return None

    def span(self, *args):
        """Return the (token_start, token_end) span of the matched group."""
        start, end = self.m.span(*args)
        # Count the '<' markers before the group start/end, relative to the
        # whole match, to convert character offsets into token offsets.
        span_start = self.all[:start - self.all_start].count('<')
        span_end = self.all[:end - self.all_start].count('<')
        return (self.token_start + span_start, self.token_start + span_end)


def preprocess_regexp(regexp):
    """Translate a token-level regexp into a character-level one.

    Adapted from nltk.text.TokenSearcher.findall: single angle brackets
    become non-capturing parentheses that also match the token markers,
    doubled brackets (``<<``/``>>``) stand for literal ones, and ``.`` is
    restricted so it cannot cross a token boundary.
    """
    # Whitespace in the token-level regexp is only decorative.
    regexp = re.sub(r'\s', '', regexp)
    # Replace < and > only if not doubled (<< or >>):
    regexp = re.sub(r'(?<!<)<(?!<)', '(?:<(?:', regexp)
    regexp = re.sub(r'(?<!>)>(?!>)', ')>)', regexp)
    # Now, replace << and >> with literal < and > respectively.
    regexp = re.sub(r'<<', '<', regexp)
    regexp = re.sub(r'>>', '>', regexp)
    # Replace . (if not preceded by \) with [^>] so it cannot match the
    # angle-bracket token markers.
    regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
    return regexp
# tests/test_regexp_ner.py
# Reconstructed from a line-mangled diff.  The fragment at the start of this
# source line (the final re.sub and return of preprocess_regexp) belongs to
# iepy/preprocess/ner/regexp.py, not to this test module.
from unittest import TestCase

from iepy.preprocess.ner.regexp import RegExpNERRunner, TokenSearcher

from .factories import SentencedIEDocFactory
from .manager_case import ManagerTestCase
from .test_ner import NERTestMixin


class TestRegExpNERRunner(ManagerTestCase, NERTestMixin):

    def test(self):
        # NOTE(review): the angle-bracketed tokens of every pattern in this
        # module were stripped from the original text; they were reconstructed
        # from the asserted spans -- confirm against upstream.
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        ner_runner = RegExpNERRunner('disease', '<HIV>|<Hepatitis> <[A-C]>')
        ner_runner(doc)
        # (the tokenizer splits she's in two parts)
        entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, entities_triples)

        doc = SentencedIEDocFactory(
            text="Cuddy points out that the CT scan showed the patient "
                 "has a metal pin in her arm and can't undergo an MRI")
        ner_runner = RegExpNERRunner('MEDICAL_TEST', '<[A-Z]+> <scan>|<MRI>')
        ner_runner(doc)
        # (the tokenizer splits can't in two parts)
        entities_triples = [(5, 7, 'MEDICAL_TEST'), (22, 23, 'MEDICAL_TEST')]
        self.check_ner_result(doc, entities_triples)


class TestTokenSearcher(TestCase):

    def test(self):
        sent = "Chase notes she 's negative for HIV and Hepatitis C"
        regexp = '<HIV>|<Hepatitis> <[A-C]>'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group(), m.span()) for m in searcher.finditer(regexp)]
        self.assertEqual(
            matches,
            [(['HIV'], (6, 7)), (['Hepatitis', 'C'], (8, 10))])

    def test_named_group(self):
        sent = ("Cuddy points out that the CT scan showed the patient has "
                "a metal pin in her arm and can 't undergo an MRI")
        # NOTE(review): reconstructed pattern; it yields exactly the asserted
        # (group, span) pairs, but the original alternatives are unknowable.
        regexp = '(<an>|<the>) (?P<name><[A-Z]+> <scan>|<MRI>)'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group('name'), m.span('name'))
                   for m in searcher.finditer(regexp)]
        self.assertEqual(
            matches,
            [(['CT', 'scan'], (5, 7)), (['MRI'], (22, 23))])