diff --git a/benchmarks/nel/configs/nel.cfg b/benchmarks/nel/configs/nel.cfg
index ffc612369..d2cdc56c7 100644
--- a/benchmarks/nel/configs/nel.cfg
+++ b/benchmarks/nel/configs/nel.cfg
@@ -17,7 +17,7 @@ gpu_allocator = null

 [nlp]
 lang = "en"
-pipeline = ["senter","ner","entity_linker"]
+pipeline = ["entity_linker"]
 disabled = []
 before_creation = null
 after_creation = null
@@ -27,23 +27,19 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

 [components]

-[components.senter]
-source = "${paths.base_nlp}"
-
-[components.ner]
-source = "${paths.base_nlp}"
-component = "ner"
-
 [components.entity_linker]
 factory = "entity_linker"
 entity_vector_length = 64
 incl_context = true
 incl_prior = true
 labels_discard = []
 get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+get_candidates_all = {"@misc":"spacy.CandidateAllGenerator.v1"}
+candidates_doc_mode = true
+generate_empty_kb = {"@misc":"spacy.EmptyWikiKB.v1"}

 [components.entity_linker.model]
-@architectures = "spacy.EntityLinker.v1"
+@architectures = "spacy.EntityLinker.v2"
 nO = null

 [components.entity_linker.model.tok2vec]
@@ -68,7 +64,7 @@ lookups = null

 [initialize.components.entity_linker]

 [initialize.components.entity_linker.kb_loader]
-@misc = "spacy.KBFromFile.v1"
+@misc = "spacy.WikiKBFromFile.v1"
 kb_path = ${paths.kb}

 [initialize.tokenizer]
@@ -77,12 +73,14 @@ kb_path = ${paths.kb}

 [corpora]

 [corpora.train]
-@readers = "spacy.Corpus.v1"
+@readers = "EntityEnrichedCorpusReader.v1"
 path = ${paths.train}
+path_nlp_base = ${paths.base_nlp}

 [corpora.dev]
-@readers = "spacy.Corpus.v1"
+@readers = "EntityEnrichedCorpusReader.v1"
 path = ${paths.dev}
+path_nlp_base = ${paths.base_nlp}

 [training]
@@ -94,15 +92,15 @@ dropout = 0.2
 patience = 10000
 eval_frequency = 200
 accumulate_gradient = 2
-max_epochs = 0
-max_steps = 500
-annotating_components = ["senter"]
-frozen_components = ["senter","ner"]
+max_epochs = 25
+max_steps = 10000
+annotating_components = []
+frozen_components = []
 before_to_disk = null

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
+progress_bar = true

 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
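The switch to `spacy.EntityLinker.v2` with `candidates_doc_mode = true` moves candidate retrieval from one KB query per mention to one batched query per document. A minimal sketch of the call shape the `get_candidates_all` hook is expected to have — the actual `spacy.CandidateAllGenerator.v1` implementation ships with the pinned spaCy fork and `wikid`, so everything below is illustrative only:

# Illustrative sketch only -- the real "spacy.CandidateAllGenerator.v1" is provided by
# the pinned spaCy fork and wikid, not defined here. It just shows the call shape that
# candidates_doc_mode = true wires up: one batched lookup per Doc instead of per Span.
from typing import Iterable, Iterator

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


def get_candidates_all(
    kb: KnowledgeBase, mentions_per_doc: Iterator[Iterable[Span]]
) -> Iterator[Iterable[Iterable[Candidate]]]:
    for mentions in mentions_per_doc:
        # A real implementation would batch the KB/ANN queries here; this stand-in
        # falls back to a plain alias lookup per mention.
        yield [kb.get_alias_candidates(mention.text) for mention in mentions]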
diff --git a/benchmarks/nel/project.yml b/benchmarks/nel/project.yml
index 0f7e93fee..de4bc0edb 100644
--- a/benchmarks/nel/project.yml
+++ b/benchmarks/nel/project.yml
@@ -1,19 +1,21 @@
 title: 'NEL Benchmark'
 description: "Pipeline for benchmarking NEL approaches (incl. candidate generation and entity disambiguation)."
 vars:
-  run: "cg-default"
+  run: "default"
   language: "en"
   config: "nel.cfg"
-  vectors_model: "en_core_web_lg"
-  version: "0.0.5"
+  base_model: "en_core_web_lg"
+  version: "0.0.6"
   dataset: "mewsli_9"
   gpu_id: ""
   download_all_wiki_assets: ""  # "--extra" to download full Wiki dumps.
   filter: "True"  # Whether to only use parts of Wiki data and corpus containing filter terms.
-  training_max_steps: 1000
+  training_max_steps: 10000
   eval_highlight_metric: "F"  # one of ("F", "r", "p")

-directories: ["assets", "training", "configs", "scripts", "corpora", "evaluation"]
+directories: ["assets", "training", "configs", "src", "corpora", "evaluation"]
+
+check_requirements: True

 workflows:
   all:
@@ -24,7 +26,7 @@
     - wikid_download_assets
     - wikid_parse
     - wikid_create_kb
-    - parse_corpus
+    - extract_annotations
     - compile_corpora
    - train
     - evaluate
@@ -37,27 +39,25 @@ commands:
   - name: download_mewsli9
     help: Download Mewsli-9 dataset.
     script:
-      - bash scripts/datasets/download_mewsli-9.sh
+      - bash src/datasets/download_mewsli-9.sh
     outputs:
-      - assets/mewsli_9/raw
+      - assets/mewsli_9/

   - name: download_model
     help: "Download a model with pretrained vectors and NER component."
     script:
-      - "python -m spacy download ${vars.vectors_model}"
+      - "python -m spacy download ${vars.base_model}"

   - name: wikid_clone
-    help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`.
+    help: "Clone `wikid` to prepare Wiki database and `KnowledgeBase`."
     script:
-      - git clone https://github.com/explosion/wikid.git --branch main
-      - pip install -r wikid/requirements.txt
-    outputs:
-      - wikid
+      - "git clone https://github.com/rmitsch/wikid.git --branch fix/reestablish-db-connection-after-load"
+      - "pip install -r wikid/requirements.txt"

   - name: preprocess
     help: Preprocess and clean corpus data.
     script:
-      - "env PYTHONPATH=. python ./scripts/clean_data.py ${vars.dataset} ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_clean_data.py ${vars.dataset} ${vars.language}"
     deps:
       - "assets/${vars.dataset}/raw"
     outputs:
@@ -73,67 +73,77 @@
   - name: wikid_parse
     help: "Parse Wikipedia dumps. This can take a long time if you're not using the filtered dumps!"
     script:
-      - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter True"
+      - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter ${vars.filter}"
     outputs:
       - "wikid/output/${vars.language}/wiki.sqlite3"

   - name: wikid_create_kb
     help: "Create the knowledge base and write it to file."
     script:
-      - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model}"
+      - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.base_model} --force"
     deps:
       - "wikid/output/${vars.language}/wiki.sqlite3"
-    outputs_no_cache:
+    outputs:
       - "wikid/output/${vars.language}/kb"
-      - "wikid/output/${vars.language}/nlp"
+      - "wikid/output/${vars.language}/wiki.annoy"

-  - name: parse_corpus
-    help: "Parse corpus to generate entity and annotation lookups used for corpora compilation."
+  - name: extract_annotations
+    help: "Extract annotations from corpus."
     script:
-      - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_extract_annotations.py ${vars.dataset} ${vars.language}"
     deps:
       - "assets/${vars.dataset}/clean"
       - "wikid/output/${vars.language}/wiki.sqlite3"
     outputs:
-      - "assets/${vars.dataset}/entities.pkl"
-      - "assets/${vars.dataset}/entities_failed_lookup.pkl"
       - "assets/${vars.dataset}/annotations.pkl"

   - name: compile_corpora
     help: "Compile corpora, separated in train/dev/test sets."
     script:
-      - "env PYTHONPATH=. python ./scripts/compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}"
+      - "env PYTHONPATH=. python ./src/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.base_model} ${vars.filter}"
     deps:
-      - "assets/${vars.dataset}/entities.pkl"
-      - "assets/${vars.dataset}/entities_failed_lookups.pkl"
       - "assets/${vars.dataset}/annotations.pkl"
       - "wikid/output/${vars.language}/kb"
-      - "wikid/output/${vars.language}/nlp"
       - "configs/datasets.yml"
     outputs:
       - "corpora/${vars.dataset}/train.spacy"
       - "corpora/${vars.dataset}/dev.spacy"
       - "corpora/${vars.dataset}/test.spacy"

+  - name: retrieve_mentions_candidates
+    help: "Retrieve candidates for mentions in corpus and persist them to a file. This is an optional step, but speeds up training and evaluation."
+    script:
+      - "env PYTHONPATH=. python ./src/cli_retrieve_mentions_candidates.py ${vars.dataset} ${vars.language}"
+    deps:
+      - "wikid/output/${vars.language}/kb"
+      - "wikid/output/${vars.language}/wiki.annoy"
+      - "wikid/output/${vars.language}/wiki.sqlite3"
+      - "assets/${vars.dataset}/annotations.pkl"
+    outputs:
+      - "corpora/${vars.dataset}/mentions_candidates.pkl"
+
   - name: train
     help: "Train a new Entity Linking component. Pass --vars.gpu_id GPU_ID to train with GPU. Training with some datasets may take a long time!"
     script:
-      - "bash scripts/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}"
+      - "bash src/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}"
     outputs:
       - "training/${vars.dataset}/${vars.run}"
     deps:
       - "wikid/output/${vars.language}/kb"
-      - "wikid/output/${vars.language}/nlp"
+      - "wikid/output/${vars.language}/wiki.annoy"
+      - "training/base-nlp/${vars.language}"
       - "corpora/${vars.dataset}/train.spacy"
       - "corpora/${vars.dataset}/dev.spacy"

   - name: evaluate
     help: "Evaluate on the test set."
     script:
-      - "env PYTHONPATH=. python ./scripts/evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language} ${vars.gpu_id}"
     deps:
       - "training/${vars.dataset}/${vars.run}/model-best"
-      - "wikid/output/${vars.language}/nlp"
+      - "training/base-nlp/${vars.language}"
+      - "wikid/output/${vars.language}/wiki.annoy"
       - "corpora/${vars.dataset}/dev.spacy"
     outputs:
       - "evaluation/${vars.dataset}"
@@ -141,7 +151,7 @@
   - name: compare_evaluations
     help: "Compare available set of evaluation runs."
     script:
-      - "env PYTHONPATH=. python ./scripts/compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}"
+      - "env PYTHONPATH=. python ./src/cli_compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}"
     deps:
       - "evaluation/${vars.dataset}"
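The new `retrieve_mentions_candidates` step persists a plain pickle that maps mention surface forms to their candidate lists (see `Dataset.retrieve_candidates_for_mentions()` further down). A quick way to sanity-check that artifact, assuming the default `mewsli_9` layout (sketch, not part of the PR):

# Sketch (not part of the PR): inspect the artifact written by retrieve_mentions_candidates.
# Keys are mention surface forms; values are lists of WikiKBCandidate objects.
import pickle
from pathlib import Path

with open(Path("corpora") / "mewsli_9" / "mentions_candidates.pkl", "rb") as file:
    mentions_candidates = pickle.load(file)

for mention, candidates in list(mentions_candidates.items())[:3]:
    print(mention, [candidate.entity_ for candidate in candidates])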
diff --git a/benchmarks/nel/requirements.txt b/benchmarks/nel/requirements.txt
index 2fe786dcc..dbc056b70 100644
--- a/benchmarks/nel/requirements.txt
+++ b/benchmarks/nel/requirements.txt
@@ -1,7 +1,6 @@
+spacy @ git+https://github.com/rmitsch/spaCy.git@feature/candidate-generation-by-docs
 pyyaml
 tqdm
 prettytable
-scikit-learn
-fuzzyset2
-spacyfishing
-virtualenv
\ No newline at end of file
+virtualenv
+spacyfishing
\ No newline at end of file
diff --git a/benchmarks/nel/scripts/candidate_generation/base.py b/benchmarks/nel/scripts/candidate_generation/base.py
deleted file mode 100644
index 38f25d035..000000000
--- a/benchmarks/nel/scripts/candidate_generation/base.py
+++ /dev/null
@@ -1,70 +0,0 @@
-""" Base class generation for candidate selection. """
-import abc
-import pickle
-from typing import Dict, Any, Optional, Iterable, Tuple
-
-import spacy
-from spacy import Language
-from spacy.kb import KnowledgeBase, Candidate
-from spacy.tokens import Span
-
-from datasets.dataset import Dataset
-
-
-class NearestNeighborCandidateSelector(abc.ABC):
-    """Callable object selecting candidates via nearest neighbour search."""
-
-    _pipeline: Optional[Language] = None
-    _lookup_struct: Optional[Any] = None
-    _entities: Dict[str, Any] = {}
-
-    def __call__(
-        self, kb: KnowledgeBase, span: Span, dataset_id: str, language: str, max_n_candidates: int, **kwargs
-    ) -> Iterable[Candidate]:
-        """Identifies entity candidates.
-        kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates.
-        span (Span): Span to match potential entity candidates with.
-        dataset_id (str): ID of dataset for which to select candidates.
-        language (str): Language.
-        max_n_candidates (int): Numbers of nearest neighbours to query.
-        RETURNS (Iterator[Candidate]): Candidates for specified entity.
-        """
-
-        if self._pipeline is None:
-            # Load pipeline and pickled entities. Run name doesn't matter for either of those.
-            paths = Dataset.assemble_paths(dataset_id, "", language)
-            self._pipeline = spacy.load(paths["nlp_base"])
-            with open(paths["entities"], "rb") as file:
-                self._entities[dataset_id] = pickle.load(file)
-        if self._lookup_struct is None:
-            self._lookup_struct = self._init_lookup_structure(kb, max_n_candidates, **kwargs)
-
-        # Retrieve candidates from KB.
-        return self._fetch_candidates(dataset_id, span, kb, max_n_candidates, **kwargs)
-
-    @abc.abstractmethod
-    def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> Any:
-        """Init container for lookups for new dataset. Doesn't do anything if initialized for this dataset already.
-        kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates.
-        max_n_candidates (int): Max. number of candidates to generate.
-        RETURNS (Any): Initialized container.
-        """
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def _fetch_candidates(
-        self,
-        dataset_id: str,
-        span: Span,
-        kb: KnowledgeBase,
-        max_n_candidates: int,
-        **kwargs
-    ) -> Iterable[Candidate]:
-        """Fetches candidates for entity in span.text.
-        dataset_id (str): ID of dataset for which to select candidates.
-        span (Span): candidate span.
-        kb (KnowledgeBase): KnowledgeBase containing all possible entity candidates.
-        max_n_candidates (int): Max. number of candidates to generate.
-        RETURNS (Iterator[Candidate]): Candidates for specified entity.
-        """
-        raise NotImplementedError
diff --git a/benchmarks/nel/scripts/candidate_generation/embeddings.py b/benchmarks/nel/scripts/candidate_generation/embeddings.py
deleted file mode 100644
index bd4f49987..000000000
--- a/benchmarks/nel/scripts/candidate_generation/embeddings.py
+++ /dev/null
@@ -1,53 +0,0 @@
-""" Candidate generation via distance in embedding space. """
-import time
-from typing import Iterable, List, Set
-
-import numpy
-from sklearn.neighbors import NearestNeighbors
-
-from spacy.kb import KnowledgeBase
-from spacy.tokens import Span
-from .base import NearestNeighborCandidateSelector
-from rapidfuzz.string_metric import normalized_levenshtein
-
-
-class EmbeddingCandidateSelector(NearestNeighborCandidateSelector):
-    """Callable object selecting candidates as nearest neighbours in embedding space."""
-
-    _entity_ids: List[str] = []
-
-    def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> NearestNeighbors:
-        container = NearestNeighbors(n_neighbors=max_n_candidates, metric="cosine", n_jobs=1)
-        container.fit(numpy.asarray([kb.get_vector(ent_id) for ent_id in kb.get_entity_strings()]))
-        self._entity_ids = kb.get_entity_strings()
-
-        return container
-
-    def _fetch_candidates(
-        self,
-        dataset_id: str,
-        span: Span,
-        kb: KnowledgeBase,
-        max_n_candidates: int,
-        lexical_similarity_cutoff: float = 0.5,
-    ) -> Iterable[int]:
-        target_vec = span.vector
-        if not isinstance(target_vec, numpy.ndarray):
-            target_vec = target_vec.get()
-
-        nn_idx = self._lookup_struct.kneighbors(target_vec.reshape((1, -1)))[1][0]
-        nn_entities = {self._entity_ids[i]: self._entities[dataset_id][self._entity_ids[i]] for i in nn_idx}
-        candidate_entity_ids: Set[str] = set()
-        for nne in nn_entities:
-            for name in nn_entities[nne].aliases:
-                if normalized_levenshtein(name.lower(), span.text.lower()) / 100 >= lexical_similarity_cutoff:
-                    candidate_entity_ids.add(nne)
-                    break
-
-        return {
-            cand
-            for cands_for_alias in [
-                kb.get_alias_candidates("_" + cei + "_") for cei in candidate_entity_ids
-            ]
-            for cand in cands_for_alias
-        }
diff --git a/benchmarks/nel/scripts/candidate_generation/lexical.py b/benchmarks/nel/scripts/candidate_generation/lexical.py
deleted file mode 100644
index 3c5c40155..000000000
--- a/benchmarks/nel/scripts/candidate_generation/lexical.py
+++ /dev/null
@@ -1,29 +0,0 @@
-""" Candidate generation via distance in lexical space. """
-from typing import Iterable
-
-from spacy.kb import KnowledgeBase, Candidate
-from spacy.tokens import Span
-from .base import NearestNeighborCandidateSelector
-from cfuzzyset import cFuzzySet as FuzzySet
-
-
-class LexicalCandidateSelector(NearestNeighborCandidateSelector):
-    """Callable object selecting candidates as nearest neighbours in lexical space."""
-
-    def _init_lookup_structure(self, kb: KnowledgeBase, max_n_candidates: int, **kwargs) -> FuzzySet:
-        return FuzzySet(kb.get_alias_strings())
-
-    def _fetch_candidates(
-        self,
-        dataset_id: str,
-        span: Span,
-        kb: KnowledgeBase,
-        max_n_candidates: int,
-        similarity_cutoff: float = 0.5,
-    ) -> Iterable[int]:
-        all_cands = [
-            kb.get_alias_candidates(entry[1]) for entry in self._lookup_struct.get(span.text, [])
-            if entry[0] >= similarity_cutoff
-        ][:max_n_candidates]
-
-        return {cand for cands_for_alias in all_cands for cand in cands_for_alias}
diff --git a/benchmarks/nel/scripts/custom_functions.py b/benchmarks/nel/scripts/custom_functions.py
deleted file mode 100644
index 2ecb0e753..000000000
--- a/benchmarks/nel/scripts/custom_functions.py
+++ /dev/null
@@ -1,65 +0,0 @@
-""" Custom functions to be hooked up into the registry. """
-from functools import partial
-
-from typing import Iterable, Callable
-import typing
-import spacy
-from spacy.kb import Candidate, KnowledgeBase
-from spacy.tokens import Span
-
-from scripts.candidate_generation import embeddings
-from scripts.candidate_generation import lexical
-
-embedding_candidate_selector = embeddings.EmbeddingCandidateSelector()
-fuzzy_lexical_candidate_selector = lexical.LexicalCandidateSelector()
-
-
-@spacy.registry.misc("EmbeddingGetCandidates.v1")
-def create_candidates_via_embeddings(
-    dataset_name: str, language: str, max_n_candidates: int, lexical_similarity_cutoff: float
-) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
-    """Returns Callable for identification of candidates via their embeddings.
-    dataset_name (str): Dataset name.
-    langugage (str): Language.
-    max_n_candidates (int): Numbers of nearest neighbours to query.
-    RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates.
-    """
-
-    # More elegant way to enforce proper typing for partial object?
-    return typing.cast(
-        Callable[[KnowledgeBase, Span], Iterable[Candidate]],
-        partial(
-            embedding_candidate_selector,
-            dataset_id=dataset_name,
-            language=language,
-            max_n_candidates=max_n_candidates,
-            lexical_similarity_cutoff=lexical_similarity_cutoff,
-        ),
-    )
-
-
-@spacy.registry.misc("FuzzyStringGetCandidates.v1")
-def create_candidates_via_fuzzy_string_matching(
-    dataset_name: str, language: str, max_n_candidates: int, similarity_cutoff: float
-) -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
-    """Returns Callable for identification of candidates via NN search in lexical space.
-    dataset_name (str): Dataset name.
-    langugage (str): Language.
-    max_n_candidates (int): Numbers of nearest neighbours to query.
-    similarity_cutoff (float): Similarity value below which candidates won't be included.
-    RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Callable for identification of entity candidates.
-    """
-
-    assert 0 <= similarity_cutoff <= 1
-
-    # More elegant way to enforce proper typing for partial object?
-    return typing.cast(
-        Callable[[KnowledgeBase, Span], Iterable[Candidate]],
-        partial(
-            fuzzy_lexical_candidate_selector,
-            dataset_id=dataset_name,
-            language=language,
-            max_n_candidates=max_n_candidates,
-            similarity_cutoff=similarity_cutoff,
-        ),
-    )
diff --git a/benchmarks/nel/scripts/datasets/__init__.py b/benchmarks/nel/scripts/datasets/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmarks/nel/scripts/datasets/mewsli_9.py b/benchmarks/nel/scripts/datasets/mewsli_9.py
deleted file mode 100644
index 1623f73c8..000000000
--- a/benchmarks/nel/scripts/datasets/mewsli_9.py
+++ /dev/null
@@ -1,98 +0,0 @@
-""" Dataset class for Mewsli-9 dataset. """
-import csv
-import distutils.dir_util
-from typing import Tuple, Set, List, Dict, Optional
-
-import tqdm
-from spacy.tokens import Doc
-
-from datasets.dataset import Dataset
-from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation
-from wikid import schemas
-
-
-class Mewsli9Dataset(Dataset):
-    """Mewsli-9 dataset."""
-
-    @property
-    def name(self) -> str:
-        return "mewsli_9"
-
-    def _parse_corpus(
-        self, **kwargs
-    ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]:
-        entity_qids: Set[str] = set()
-        annotations: Dict[str, List[schemas.Annotation]] = {}
-
-        with open(
-            self._paths["assets"] / "clean" / "en" / "mentions.tsv", encoding="utf-8"
-        ) as file_path:
-            for i, row in enumerate(csv.DictReader(file_path, delimiter="\t")):
-                assert len(row) == 9
-
-                entity_qids.add(row["qid"])
-                if row["docid"] not in annotations:
-                    annotations[row["docid"]] = []
-                annotations[row["docid"]].append(
-                    schemas.Annotation(
-                        entity_name=row["url"].split("/")[-1].replace("_", " "),
-                        entity_id=row["qid"],
-                        start_pos=int(row["position"]),
-                        end_pos=int(row["position"]) + int(row["length"]),
-                    )
-                )
-
-        entities, failed_entity_lookups, _ = fetch_entity_information(tuple(entity_qids), self._language)
-
-        return entities, failed_entity_lookups, annotations
-
-    def clean_assets(self) -> None:
-        # No cleaning necessary, just copy all data into /clean.
-        distutils.dir_util.copy_tree(str(self._paths["assets"] / "raw"), str(self._paths["assets"] / "clean"))
-
-    def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> List[Doc]:
-        annotated_docs: List[Doc] = []
-
-        with open(
-            self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8"
-        ) as title_file:
-            row_count = sum(1 for _ in title_file)
-            title_file.seek(0)
-            n_annots_available = 0
-            n_annots_assigned = 0
-
-            with tqdm.tqdm(
-                desc="Creating doc objects", total=row_count, leave=False
-            ) as pbar:
-                for row in csv.DictReader(title_file, delimiter="\t"):
-                    with open(
-                        self._paths["assets"] / "clean" / "en" / "text" / row["docid"],
-                        encoding="utf-8",
-                    ) as text_file:
-                        # Replace newlines with whitespace and \xa0 (non-breaking whitespace) appearing after titles
-                        # with a period. This maintains the correct offsets in the dataset annotations.
-                        doc_text = "".join([
-                            line.replace("\n", " ").replace("\xa0", ".") for line in text_file.readlines()
-                        ])
-
-                    if filter_terms and not any([ft in doc_text for ft in filter_terms]):
-                        pbar.update(1)
-                        continue
-
-                    doc = self._nlp_base(doc_text)
-                    doc_annots = self._annotations.get(row["docid"], [])
-                    doc.ents, _ = create_spans_from_doc_annotation(
-                        doc=doc,
-                        entities_info=self._entities,
-                        annotations=doc_annots,
-                        harmonize_with_doc_ents=True,
-                    )
-                    annotated_docs.append(doc)
-                    n_annots_available += len(doc_annots)
-                    n_annots_assigned += len(doc.ents)
-                    pbar.update(1)
-
-        print(f"Assigned {n_annots_assigned} out of {n_annots_available} annotations "
-              f"({(n_annots_assigned / n_annots_available * 100):.2f}%) in {pbar.n} docs.")
-
-        return annotated_docs
diff --git a/benchmarks/nel/scripts/__init__.py b/benchmarks/nel/src/__init__.py
similarity index 100%
rename from benchmarks/nel/scripts/__init__.py
rename to benchmarks/nel/src/__init__.py
diff --git a/benchmarks/nel/scripts/clean_data.py b/benchmarks/nel/src/cli_clean_data.py
similarity index 100%
rename from benchmarks/nel/scripts/clean_data.py
rename to benchmarks/nel/src/cli_clean_data.py
diff --git a/benchmarks/nel/scripts/compare_evaluations.py b/benchmarks/nel/src/cli_compare_evaluations.py
similarity index 100%
rename from benchmarks/nel/scripts/compare_evaluations.py
rename to benchmarks/nel/src/cli_compare_evaluations.py
diff --git a/benchmarks/nel/scripts/compile_corpora.py b/benchmarks/nel/src/cli_compile_corpora.py
similarity index 53%
rename from benchmarks/nel/scripts/compile_corpora.py
rename to benchmarks/nel/src/cli_compile_corpora.py
index dce834ce8..4b682ee8b 100644
--- a/benchmarks/nel/scripts/compile_corpora.py
+++ b/benchmarks/nel/src/cli_compile_corpora.py
@@ -1,20 +1,29 @@
 """ Compiles train/dev/test corpora. """
+from pathlib import Path
+from typing import Set, Optional
 import typer

 from datasets.dataset import Dataset
-from wikid import read_filter_terms


-def main(dataset_name: str, language: str, use_filter_terms: bool = typer.Argument(False)):
+def main(dataset_name: str, language: str, model: str, use_filter_terms: bool = typer.Argument(False)):
     """Create corpora in spaCy format.
     dataset_name (str): Dataset name.
     language (str): Language.
+    model (str): Name or path of model with tokenizer, tok2vec and parser.
     use_filter_terms (bool): Whether to use the filter terms defined in the dataset config. If True, only documents
         containing at least one of the specified terms will be included in corpora. If False, all documents are
         included.
     """
+    filter_terms: Optional[Set[str]] = None
+    if use_filter_terms:
+        with open(
+            Path(__file__).parent.parent / "wikid" / "configs" / "filter_terms.txt", "r"
+        ) as file:
+            filter_terms = {ft.replace("\n", "") for ft in file.readlines()}
+
     # Run name isn't relevant for corpora compilation.
-    Dataset.generate_from_id(dataset_name, language).compile_corpora(read_filter_terms() if use_filter_terms else None)
+    Dataset.generate_from_id(dataset_name, language).compile_corpora(model, filter_terms)


 if __name__ == "__main__":
diff --git a/benchmarks/nel/scripts/evaluate.py b/benchmarks/nel/src/cli_evaluate.py
similarity index 57%
rename from benchmarks/nel/scripts/evaluate.py
rename to benchmarks/nel/src/cli_evaluate.py
index 3dc1bf242..411c269c7 100644
--- a/benchmarks/nel/scripts/evaluate.py
+++ b/benchmarks/nel/src/cli_evaluate.py
@@ -1,17 +1,21 @@
 """ Evaluation on test data. """
""" +from typing import Optional + from datasets.dataset import Dataset +import custom_functions import typer -from custom_functions import create_candidates_via_embeddings -def main(dataset_name: str, run_name: str, language: str): +def main(dataset_name: str, run_name: str, language: str, gpu_id: Optional[int] = typer.Argument(None)): """Evaluate the trained EL component by applying it to unseen text. dataset_name (str): Name of dataset to evaluate on. run_name (str): Run name. language (str): Language. + gpu_id (Optional[int]): ID of GPU to utilize for evaluation. """ - Dataset.generate_from_id(dataset_name, language, run_name).evaluate(run_name=run_name) + Dataset.generate_from_id(dataset_name, language, run_name).evaluate(gpu_id=gpu_id) if __name__ == "__main__": + # main("mewsli_9", "default", "en", 0) typer.run(main) diff --git a/benchmarks/nel/src/cli_extract_annotations.py b/benchmarks/nel/src/cli_extract_annotations.py new file mode 100644 index 000000000..6731f8546 --- /dev/null +++ b/benchmarks/nel/src/cli_extract_annotations.py @@ -0,0 +1,15 @@ +""" Parse corpus. """ +from datasets.dataset import Dataset +import typer + + +def main(dataset_name: str, language: str): + """Parse corpus. + dataset_name (str): Name of dataset to evaluate on. + language (str): Language. + """ + Dataset.generate_from_id(dataset_name, language, "").extract_annotations() + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/nel/src/cli_retrieve_mentions_candidates.py b/benchmarks/nel/src/cli_retrieve_mentions_candidates.py new file mode 100644 index 000000000..1600c3379 --- /dev/null +++ b/benchmarks/nel/src/cli_retrieve_mentions_candidates.py @@ -0,0 +1,16 @@ +"""Retrieve candidates for mentions in corpus.""" +import typer as typer + +from datasets.dataset import Dataset + + +def main(dataset_name: str, language: str): + """Retrieve candidates for mentions in corpus. + dataset_name (str): Name of dataset to evaluate on. + language (str): Language. + """ + Dataset.generate_from_id(dataset_name, language, "").retrieve_candidates_for_mentions() + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/nel/src/custom_functions.py b/benchmarks/nel/src/custom_functions.py new file mode 100644 index 000000000..1b999972b --- /dev/null +++ b/benchmarks/nel/src/custom_functions.py @@ -0,0 +1,81 @@ +from pathlib import Path +from typing import Callable, Iterable + +import spacy +from spacy import registry, Vocab, Language +from spacy.training import Example +from spacy.pipeline import EntityLinker + +from wikid.src.kb import WikiKB + + +@spacy.registry.readers("EntityEnrichedCorpusReader.v1") +def create_docbin_reader(path: Path, path_nlp_base: Path) -> Callable[[Language], Iterable[Example]]: + """Returns Callable generating a corpus reader function that enriches read documents with the correct entities as + specified in the corpus annotations. + path (Path): Path to DocBin file with documents to prepare. + path_nlp_base (Path): Path to pipeline for tokenization/sentence. + """ + def read_docbin(_: Language) -> Iterable[Example]: + """Read DocBin for training. Set all entities as they appear in the annotated corpus, but set entity type and KB + ID to NIL. + nlp (Language): Pipeline to use for creating document used in EL from reference document. 
+ """ + nlp = spacy.load(path_nlp_base) + + for example in spacy.training.Corpus(path)(nlp): + example.predicted = nlp(example.predicted) + example.predicted.ents = [ + example.predicted.char_span(ent.start_char, ent.end_char, label=EntityLinker.NIL, kb_id=EntityLinker.NIL) + for ent in example.reference.ents + ] + sents = list(example.predicted.sents) + sents_orig = list(example.reference.sents) + + assert len(sents) == len(sents_orig) + assert len(sents) > 0 and len(sents_orig) > 0 + assert all([ent is not None for ent in example.predicted.ents]) + assert len(example.reference.ents) == len(example.predicted.ents) + assert len(example.reference.ents) > 0 + + yield Example(example.predicted, example.reference) + + return read_docbin + + +@registry.misc("spacy.WikiKBFromFile.v1") +def load_kb(kb_path: Path) -> Callable[[Vocab], WikiKB]: + """Loads WikiKB instance from disk. + kb_path (Path): Path to WikiKB path. + mention_candidates_path (Path): Path to pre-computed file with candidates per mention. + RETURNS (Callable[[Vocab], WikiKB]): Callable generating WikiKB from disk. + """ + def kb_from_file(_: Vocab) -> WikiKB: + return WikiKB.generate_from_disk(path=kb_path) + + return kb_from_file + + +@registry.misc("spacy.EmptyWikiKB.v1") +def empty_wiki_kb() -> Callable[[Vocab, int], WikiKB]: + """Generates empty WikiKB instance. + RETURNS (Callable[[Vocab, int], WikiKB]): Callable generating WikiKB from disk. + """ + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + """Generates new WikiKB instance. + Since WikiKB relies on an external DB file that we have no information on at this point, this instance will not + have initialized its DB connection. Also, its parameters specified at init are arbitrarily chosen. This only + serves to return a placeholder WikiKB instance to be overwritten using .from_bytes() or .from_disk(). + vocab (Vocab): Vocab instance. + entity_vector_length (int): Entity vector length. + """ + return WikiKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + db_path=Path("."), + annoy_path=Path(".annoy"), + language=".", + establish_db_connection_at_init=False + ) + + return empty_kb_factory diff --git a/benchmarks/nel/scripts/candidate_generation/__init__.py b/benchmarks/nel/src/datasets/__init__.py similarity index 100% rename from benchmarks/nel/scripts/candidate_generation/__init__.py rename to benchmarks/nel/src/datasets/__init__.py diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/src/datasets/dataset.py similarity index 62% rename from benchmarks/nel/scripts/datasets/dataset.py rename to benchmarks/nel/src/datasets/dataset.py index 9a139c5e7..dfc93a72c 100644 --- a/benchmarks/nel/scripts/datasets/dataset.py +++ b/benchmarks/nel/src/datasets/dataset.py @@ -1,5 +1,6 @@ """ Dataset class. """ import abc +import copy import csv import datetime import importlib @@ -9,7 +10,7 @@ import pickle from collections import defaultdict from pathlib import Path -from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict, Union +from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict import numpy import prettytable @@ -18,12 +19,12 @@ import yaml from spacy import Language from spacy.kb import KnowledgeBase -from spacy.pipeline.legacy import EntityLinker_v1 from spacy.tokens import Doc, DocBin from spacy.training import Example from spacy.pipeline import EntityLinker from wikid import schemas +from wikid.src.kb import WikiKB, WikiKBCandidate from . 
diff --git a/benchmarks/nel/scripts/candidate_generation/__init__.py b/benchmarks/nel/src/datasets/__init__.py
similarity index 100%
rename from benchmarks/nel/scripts/candidate_generation/__init__.py
rename to benchmarks/nel/src/datasets/__init__.py
diff --git a/benchmarks/nel/scripts/datasets/dataset.py b/benchmarks/nel/src/datasets/dataset.py
similarity index 62%
rename from benchmarks/nel/scripts/datasets/dataset.py
rename to benchmarks/nel/src/datasets/dataset.py
index 9a139c5e7..dfc93a72c 100644
--- a/benchmarks/nel/scripts/datasets/dataset.py
+++ b/benchmarks/nel/src/datasets/dataset.py
@@ -1,5 +1,6 @@
 """ Dataset class. """
 import abc
+import copy
 import csv
 import datetime
 import importlib
@@ -9,7 +10,7 @@
 import pickle
 from collections import defaultdict
 from pathlib import Path
-from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict, Union
+from typing import Tuple, Set, List, Optional, TypeVar, Type, Dict

 import numpy
 import prettytable
@@ -18,12 +19,12 @@
 import yaml
 from spacy import Language
 from spacy.kb import KnowledgeBase
-from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.tokens import Doc, DocBin
 from spacy.training import Example
 from spacy.pipeline import EntityLinker

 from wikid import schemas
+from wikid.src.kb import WikiKB, WikiKBCandidate
 from . import evaluation
 from utils import get_logger
@@ -47,11 +48,8 @@
         with open(self._paths["root"] / "configs" / "datasets.yml", "r") as stream:
             self._options = yaml.safe_load(stream)[self.name]

-        self._entities: Optional[Dict[str, schemas.Entity]] = None
-        self._failed_entity_lookups: Optional[Set[str]] = None
         self._annotations: Optional[Dict[str, List[schemas.Annotation]]] = None
         self._kb: Optional[KnowledgeBase] = None
-        self._nlp_base: Optional[Language] = None
         self._nlp_best: Optional[Language] = None
         self._annotated_docs: Optional[List[Doc]] = None

@@ -65,20 +63,19 @@
         """
         root_path = Path(os.path.abspath(__file__)).parent.parent.parent
-        wikid_path = root_path / "wikid" / "output"
+        wikid_output_path = root_path / "wikid" / "output"
         assets_path = root_path / "assets" / dataset_name

         return {
             "root": root_path,
             "evaluation": root_path / "configs" / "evaluation.yml",
             "assets": assets_path,
-            "nlp_base": wikid_path / language / "nlp",
-            "kb": wikid_path / language / "kb",
-            "entities": assets_path / "entities.pkl",
-            "failed_entity_lookups": assets_path / "entities_failed_lookups.pkl",
+            "kb": wikid_output_path / language / "kb",
             "annotations": assets_path / "annotations.pkl",
+            "nlp_base": root_path / "training" / "base-nlp" / language,
             "nlp_best": root_path / "training" / dataset_name / run_name / "model-best",
-            "corpora": root_path / "corpora" / dataset_name
+            "corpora": root_path / "corpora" / dataset_name,
+            "mentions_candidates": root_path / "corpora" / dataset_name / "mentions_candidates.pkl",
         }

     @property
@@ -86,52 +83,50 @@
     def name(self) -> str:
         """Returns dataset name."""
         raise NotImplementedError

-    def compile_corpora(self, filter_terms: Optional[Set[str]] = None) -> None:
+    def compile_corpora(self, model: str, filter_terms: Optional[Set[str]] = None) -> None:
         """Creates train/dev/test corpora for dataset.
+        model (str): Name or path of model with tokenizer, tok2vec, tagger and parser.
         filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the
             specified terms will be included in corpora. If None, all documents are included.
         """
-        self._load_resource("entities")
-        self._load_resource("failed_entity_lookups")
-        self._load_resource("annotations")
-        self._load_resource("nlp_base")
+        with open(self._paths["annotations"], "rb") as file:
+            self._annotations = pickle.load(file)
         Doc.set_extension("overlapping_annotations", default=None)
-        self._annotated_docs = self._create_annotated_docs(filter_terms)
+        nlp = spacy.load(model)
+
+        # Incorporate annotations from corpus into documents. Only keep docs with entities (relevant mostly when
+        # working with filtered data).
+        self._annotated_docs = [doc for doc in self._create_annotated_docs(nlp, filter_terms) if len(doc.ents)]
+
+        # Serialize pipeline and corpora.
+        self._paths["nlp_base"].parent.mkdir(parents=True, exist_ok=True)
+        nlp.to_disk(
+            self._paths["nlp_base"],
+            # exclude=[comp for comp in nlp.component_names if comp not in [*nlp_components]]
+        )
         self._serialize_corpora()

-    def _create_annotated_docs(self, filter_terms: Optional[Set[str]] = None) -> List[Doc]:
-        """Creates docs annotated with entities.
+    def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]:
+        """Creates docs annotated with entities. This should set documents' `ents` attribute.
+        nlp (Language): Model with tokenizer, tok2vec and parser.
         filter_terms (Optional[Set[str]]): Set of filter terms. Only documents containing at least one of the
             specified terms will be included in corpora. If None, all documents are included.
         RETURN (List[Doc]): List of docs reflecting all entity annotations.
         """
         raise NotImplementedError

-    def parse_corpus(self, **kwargs) -> None:
-        """Parses corpus. Loads data on entities and mentions.
-        Populates self._entities, self._failed_entity_lookups, self._annotations.
-        RETURNS (Tuple[Dict[str, Entity], Set[str], Dict[str, List[Annotation]]]): entities, titles of failed entity
-        lookups, annotations.
+    def extract_annotations(self, **kwargs) -> None:
+        """Parses corpus and extracts annotations. Loads data on entities and mentions.
+        Populates self._annotations.
         """
-        self._load_resource("nlp_base")
-        logger.info("Parsing external corpus")
-        (
-            self._entities,
-            self._failed_entity_lookups,
-            self._annotations,
-        ) = self._parse_corpus(**kwargs)
-
-        # Serialize entity information.
-        for to_serialize in (
-            (self._paths["entities"], self._entities),
-            (self._paths["failed_entity_lookups"], self._failed_entity_lookups),
-            (self._paths["annotations"], self._annotations),
-        ):
-            with open(to_serialize[0], "wb") as fp:
-                pickle.dump(to_serialize[1], fp)
-        logger.info("Successfully parsed corpus.")
-
-    def _parse_corpus(
+        logger.info("Extracting annotations from corpus")
+        self._annotations = self._extract_annotations_from_corpus(**kwargs)
+        with open(self._paths["annotations"], "wb") as fp:
+            pickle.dump(self._annotations, fp)
+
+        logger.info("Successfully extracted annotations from corpus.")
+
+    def _extract_annotations_from_corpus(
         self, **kwargs
     ) -> Tuple[Dict[str, schemas.Entity], Set[str], Dict[str, List[schemas.Annotation]]]:
         """Parses corpus. Loads data on entities and mentions.
@@ -169,55 +164,26 @@
         for key, idx in indices.items():
             corpus = DocBin(store_user_data=True, docs=[self._annotated_docs[i] for i in idx])
-            if not self._paths["corpora"].exists():
-                self._paths["corpora"].mkdir()
+            self._paths["corpora"].mkdir(parents=True, exist_ok=True)
             corpus.to_disk(self._paths["corpora"] / f"{key}.spacy")
         logger.info(f"Completed serializing corpora at {self._paths['corpora']}.")

-    def _load_resource(self, key: str, force: bool = False) -> None:
-        """Loads serialized resource.
-        key (str): Resource key. Must be in self._paths.
-        force (bool): Load from disk even if already not None.
-        """
-
-        path = self._paths[key]
-
-        if key == "nlp_base" and (force or not self._nlp_base):
-            self._nlp_base = spacy.load(path)
-        elif key == "nlp_best" and (force or not self._nlp_best):
-            self._nlp_best = spacy.load(path)
-        elif key == "kb" and (force or not self._kb):
-            self._load_resource("nlp_base")
-            self._kb = KnowledgeBase(
-                vocab=self._nlp_base.vocab,
-                entity_vector_length=self._nlp_base.vocab.vectors_length,
-            )
-            self._kb.from_disk(path)
-        elif key == "annotations" and (force or not self._annotations):
-            with open(path, "rb") as file:
-                self._annotations = pickle.load(file)
-        elif key == "entities" and (force or not self._entities):
-            with open(path, "rb") as file:
-                self._entities = pickle.load(file)
-        elif key == "failed_entity_lookups" and (
-            force or not self._failed_entity_lookups
-        ):
-            with open(self._paths["failed_entity_lookups"], "rb") as file:
-                self._failed_entity_lookups = pickle.load(file)
-
-    def evaluate(self, run_name: str) -> None:
+    def evaluate(self, gpu_id: Optional[int] = None) -> None:
         """Evaluates trained pipeline on test set.
-        run_name (str): Run name.
+        gpu_id (Optional[int]): ID of GPU to utilize.
         """
-        self._load_resource("nlp_best")
-        self._load_resource("nlp_base")
-        self._load_resource("kb")
+        if gpu_id is not None:
+            spacy.require_gpu(gpu_id)
+
+        nlp_base = spacy.load(self._paths["nlp_base"])
+        self._nlp_best = spacy.load(self._paths["nlp_best"])
+        self._kb = WikiKB.generate_from_disk(self._paths["kb"])
         with open(self._paths["evaluation"], "r") as config_file:
             eval_config = yaml.safe_load(config_file)

         if eval_config["external"]["spacyfishing"]:
-            self._nlp_base.add_pipe("entityfishing", last=True)
+            nlp_base.add_pipe("entityfishing", last=True)

         # Apply config overrides, if defined.
         if "config_overrides" in eval_config and eval_config["config_overrides"]:
@@ -227,26 +193,18 @@
         # Infer test set.
         test_set_path = self._paths["corpora"] / "test.spacy"
         docs = list(DocBin().from_disk(test_set_path).get_docs(self._nlp_best.vocab))
-        # spaCy sometimes includes leading articles in entities, our benchmark datasets don't. Hence we drop all
-        # leading "the " and adjust the entity positions accordingly.
-        for doc in docs:
-            doc.ents = [
-                doc.char_span(ent.start_char + 4, ent.end_char, label=ent.label, kb_id=ent.kb_id)
-                if ent.text.lower().startswith("the ") else ent
-                for ent in doc.ents
-            ]
         test_set = [
             Example(predicted_doc, doc)
             for predicted_doc, doc in zip(
                 [
                     doc for doc in tqdm.tqdm(
-                        self._nlp_best.pipe(texts=[doc.text for doc in docs], n_process=-1, batch_size=500),
+                        self._nlp_best.pipe(texts=docs, n_process=1 if gpu_id is not None else -1, batch_size=500),
                         desc="Inferring entities for test set",
                         total=len(docs)
                     )
                 ],
-                DocBin().from_disk(self._paths["corpora"] / "test.spacy").get_docs(self._nlp_best.vocab)
+                docs
             )
         ]

@@ -261,19 +219,19 @@
         for example in tqdm.tqdm(test_set, total=len(test_set), leave=True, desc="Evaluating test set"):
             example: Example
             if len(example) > 0:
-                entity_linker: Union[EntityLinker, EntityLinker_v1] = \
-                    self._nlp_best.get_pipe("entity_linker")  # type: ignore
+                entity_linker: EntityLinker = self._nlp_best.get_pipe("entity_linker")  # type: ignore
                 ent_gold_ids = {
                     evaluation.offset(ent.start_char, ent.end_char): ent.kb_id_ for ent in example.reference.ents
                 }
                 if len(ent_gold_ids) == 0:
                     continue
                 ent_pred_labels = {(ent.start_char, ent.end_char): ent.label_ for ent in example.predicted.ents}
-                ent_cands = {
-                    (ent.start_char, ent.end_char): {
-                        cand.entity_: cand for cand in entity_linker.get_candidates(self._kb, ent)
-                    }
-                    for ent in example.reference.ents
+                ent_cands_by_offset = {
+                    (ent.start_char, ent.end_char): {cand.entity_: cand for cand in ent_cands}
+                    for ent, ent_cands in zip(
+                        example.reference.ents,
+                        next(entity_linker.get_candidates_all(self._kb, (ents for ents in [example.reference.ents])))
+                    )
                 }

                 # Update candidate generation stats.
@@ -283,7 +241,9 @@
                     # For the candidate generation evaluation also mis-aligned entities are considered.
                     label = ent_pred_labels.get(ent_offset, "NIL")
                     cand_gen_label_counts[label] += 1
-                    candidate_results.update_metrics(label, ent.kb_id_, set(ent_cands.get(ent_offset, {}).keys()))
+                    candidate_results.update_metrics(
+                        label, ent.kb_id_, set(ent_cands_by_offset.get(ent_offset, {}).keys())
+                    )

                 # Update entity disambiguation stats for baselines.
                 evaluation.add_disambiguation_baseline(
                     baseline_results,
                     label_counts,
                     example.predicted,
                     ent_gold_ids,
-                    ent_cands,
+                    ent_cands_by_offset,
                 )

                 # Update entity disambiguation stats for trained model.
-                evaluation.add_disambiguation_eval_result(trained_results, example.predicted, ent_gold_ids, ent_cands)
+                evaluation.add_disambiguation_eval_result(trained_results, example.predicted, ent_gold_ids, ent_cands_by_offset)

-                if eval_config["external"]["spacyfishing"]:
+                if eval_config["external"].get("spacyfishing", False):
                     try:
-                        doc = self._nlp_base(example.reference.text)
+                        doc = nlp_base(example.reference.text)
                     except TypeError:
                         doc = None
                     evaluation.add_disambiguation_spacyfishing_eval_result(spacyfishing_results, doc, ent_gold_ids)
@@ -317,7 +277,7 @@
             eval_results.append(spacyfishing_results)

         logger.info(dict(cand_gen_label_counts))
-        evaluation.EvaluationResults.report(tuple(eval_results), run_name=run_name, dataset_name=self.name)
+        evaluation.EvaluationResults.report(tuple(eval_results), run_name=self._run_name, dataset_name=self.name)

     def compare_evaluations(self, highlight_criterion: str) -> None:
         """Generate and display table for comparison of all available runs for this dataset.
@@ -404,3 +364,85 @@
         automatically.
         """
         raise NotImplementedError
+
+    def _collapse_spaces(self, doc_id: str, doc_text: str) -> Tuple[str, List[schemas.Annotation]]:
+        """
+        Replaces multiple spaces with single ones to avoid tokenization & sentence splitting issues later in the
+        pipeline.
+        doc_id (str): Doc ID to be looked up in self._annotations.
+        doc_text (str): Doc text.
+        RETURNS (Tuple[str, List[schemas.Annotation]]): Potentially updated (1) doc text, (2) annotations (start/end
+            positions may have changed).
+        """
+        doc_annots = self._annotations.get(doc_id, [])
+        doc_text_orig = doc_text
+        annots_orig = copy.deepcopy(doc_annots)
+
+        # This is inefficient and could surely be optimized.
+        multi_space_start_idx = doc_text.find("  ")
+        while multi_space_start_idx >= 0:
+            multi_space_stop_idx = multi_space_start_idx + 2
+            while multi_space_stop_idx < len(doc_text) and doc_text[multi_space_stop_idx] == " ":
+                multi_space_stop_idx += 1
+
+            # Shrink multiple whitespaces to one.
+            doc_text = doc_text[:multi_space_start_idx] + " " + doc_text[multi_space_stop_idx:]
+
+            # Adjust annotation indices accordingly.
+            for i, annot in enumerate(doc_annots):
+                annot_text_orig = doc_text_orig[annots_orig[i].start_pos:annots_orig[i].end_pos]
+                if multi_space_start_idx <= annot.start_pos < multi_space_stop_idx:
+                    offset = annot.start_pos - (multi_space_start_idx + 1)
+                    annot.start_pos -= offset
+                    annot.end_pos -= offset
+                elif annot.start_pos >= multi_space_stop_idx:
+                    offset = multi_space_stop_idx - multi_space_start_idx - 1
+                    annot.start_pos -= offset
+                    annot.end_pos -= offset
+
+                # New annotation should match old one, except for leading/trailing spaces.
+                assert " ".join(doc_text[annot.start_pos:annot.end_pos].split()) == " ".join(annot_text_orig.split())
+
+            multi_space_start_idx = doc_text.find("  ", multi_space_start_idx)
+
+        assert doc_text.find("  ") == -1
+
+        return doc_text, doc_annots
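A worked example of the invariant `_collapse_spaces` maintains (illustrative only, not part of the diff): collapsing each run of n spaces to a single space shifts every annotation starting after that run left by n - 1 characters, so mention offsets keep pointing at the same surface text.

# Illustrative only: offset bookkeeping when collapsing whitespace runs.
text = "Berlin  is  in Germany"   # two 2-space runs before "Germany" (starts at index 15)
collapsed = "Berlin is in Germany"
assert collapsed.index("Germany") == text.index("Germany") - 2  # one char saved per run
assert " ".join(text.split()) == collapsed                      # surface text preserved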
+
+    def retrieve_candidates_for_mentions(self) -> None:
+        """Retrieves candidates for all mentions in corpus."""
+        logger.info("Retrieving candidates for all mentions in corpus.")
+
+        self._kb = WikiKB.generate_from_disk(self._paths["kb"])
+        # Ensure KB is not using an outdated mentions_candidates map (which happens if the current step was already
+        # done and is now repeated).
+        self._kb._mentions_candidates = None
+
+        # Our entity corpora incorporate annotated mentions in their .ents attributes at this point, so we can
+        # extract all mentions from there.
+        mentions = {
+            ent.text: ent
+            for corpus_name in ("train", "dev", "test")
+            for doc in DocBin().from_disk(self._paths["corpora"] / (corpus_name + ".spacy")).get_docs(self._kb.vocab)
+            for ent in doc.ents
+        }
+        mention_texts = list(mentions.keys())
+
+        # Select candidates.
+        mention_candidates: Dict[str, List[WikiKBCandidate]] = {
+            mention_text: candidates
+            for mention_text, candidates in zip(
+                mention_texts,
+                list(self._kb.get_candidates_all([[mentions[mt] for mt in mention_texts]]))[0]
+            )
+        }
+        for mention in mention_candidates:
+            assert all([mention == mc.mention for mc in mention_candidates[mention]])
+
+        # Store results.
+        self._paths["mentions_candidates"].parent.mkdir(parents=True, exist_ok=True)
+        with open(self._paths["mentions_candidates"], "wb") as file:
+            pickle.dump(mention_candidates, file)
+        # Update hash in KB, persist updated KB.
+        self._kb.update_path("mentions_candidates", self._paths["mentions_candidates"])
+        self._kb.to_disk(self._paths["kb"])
diff --git a/benchmarks/nel/scripts/datasets/download_mewsli-9.sh b/benchmarks/nel/src/datasets/download_mewsli-9.sh
similarity index 100%
rename from benchmarks/nel/scripts/datasets/download_mewsli-9.sh
rename to benchmarks/nel/src/datasets/download_mewsli-9.sh
diff --git a/benchmarks/nel/scripts/datasets/evaluation.py b/benchmarks/nel/src/datasets/evaluation.py
similarity index 100%
rename from benchmarks/nel/scripts/datasets/evaluation.py
rename to benchmarks/nel/src/datasets/evaluation.py
diff --git a/benchmarks/nel/src/datasets/mewsli_9.py b/benchmarks/nel/src/datasets/mewsli_9.py
new file mode 100644
index 000000000..0b1794773
--- /dev/null
+++ b/benchmarks/nel/src/datasets/mewsli_9.py
@@ -0,0 +1,155 @@
+""" Dataset class for Mewsli-9 dataset. """
+import copy
+import csv
+import distutils.dir_util
+import pickle
+import time
+from typing import Tuple, Set, List, Dict, Optional
+
+import spacy
+import tqdm
+from spacy import Language
+from spacy.tokens import Doc
+
+from datasets.dataset import Dataset
+from datasets.utils import fetch_entity_information, create_spans_from_doc_annotation
+from wikid import schemas, load_entities
+
+
+class Mewsli9Dataset(Dataset):
+    """Mewsli-9 dataset."""
+
+    @property
+    def name(self) -> str:
+        return "mewsli_9"
+
+    def _extract_annotations_from_corpus(
+        self, **kwargs
+    ) -> Dict[str, List[schemas.Annotation]]:
+        annotations: Dict[str, List[schemas.Annotation]] = {}
+
+        with open(
+            self._paths["assets"] / "clean" / "en" / "mentions.tsv", encoding="utf-8"
+        ) as file_path:
+            curr_article: Optional[str] = None
+            curr_docid: Optional[str] = None
+
+            for i, row in enumerate(csv.DictReader(file_path, delimiter="\t")):
+                assert len(row) == 9
+                if row["docid"] not in annotations:
+                    annotations[row["docid"]] = []
+
+                # Read article, if this annotation refers to a new document.
+                if row["docid"] != curr_docid:
+                    curr_docid = row["docid"]
+                    curr_article = self._read_article_file(curr_docid)
+
+                # Correct leading/trailing whitespaces.
+                annot_start = int(row["position"])
+                annot_end = annot_start + int(row["length"])
+                while curr_article[annot_start] == " ":
+                    annot_start += 1
+                while curr_article[annot_end - 1] == " ":
+                    annot_end -= 1
+                annot_text = curr_article[annot_start:annot_end]
+                assert annot_text.startswith(" ") is False and annot_text.endswith(" ") is False
+
+                annotations[row["docid"]].append(
+                    schemas.Annotation(
+                        entity_name=row["url"].split("/")[-1].replace("_", " "),
+                        entity_id=row["qid"],
+                        start_pos=annot_start,
+                        end_pos=annot_end,
+                    )
+                )
+
+        return annotations
+
+    def clean_assets(self) -> None:
+        # No cleaning necessary, just copy all data into /clean.
+        distutils.dir_util.copy_tree(str(self._paths["assets"] / "raw"), str(self._paths["assets"] / "clean"))
+
+    def _read_article_file(self, doc_id: str) -> str:
+        """Reads article file for specified doc ID.
+        doc_id (str): Doc ID of article to read.
+        RETURNS (str): Article text as single string.
+        """
+        with open(
+            self._paths["assets"] / "clean" / "en" / "text" / doc_id,
+            encoding="utf-8",
+        ) as text_file:
+            # Replace newlines with whitespace and \xa0 (non-breaking whitespace) appearing after titles
+            # with a period. This maintains the correct offsets in the dataset annotations.
+            return "".join([
+                line.replace("\n", " ").replace("\xa0", ".") for line in text_file.readlines()
+            ])
+
+    def _create_annotated_docs(self, nlp: Language, filter_terms: Optional[Set[str]] = None) -> List[Doc]:
+        annotated_docs: List[Doc] = []
+
+        with open(
+            self._paths["assets"] / "clean" / "en" / "docs.tsv", encoding="utf-8"
+        ) as title_file:
+            row_count = sum(1 for _ in title_file)
+            title_file.seek(0)
+            n_annots_available = 0
+            n_annots_assigned = 0
+
+            # Load entities batched to avoid hitting max. number of parameters supported by SQLite.
+            batch_size = 2**14
+            qids = tuple({annot.entity_id for annots in self._annotations.values() for annot in annots})
+            entities = {
+                qid: entity_info
+                for entity_batch in
+                [
+                    load_entities(qids=qids[i:i + batch_size], language=self._language)
+                    for i in range(0, len(qids), batch_size)
+                ]
+                for qid, entity_info in entity_batch.items()
+            }
+
+            with tqdm.tqdm(
+                desc="Reading files", total=row_count, leave=False
+            ) as pbar:
+                docs_info_rows: List[Dict[str, str]] = []
+                doc_texts: List[str] = []
+                for row in csv.DictReader(title_file, delimiter="\t"):
+                    doc_text = self._read_article_file(row["docid"])
+                    if filter_terms and not any([ft in doc_text for ft in filter_terms]):
+                        pbar.update(1)
+                        continue
+                    docs_info_rows.append(row)
+                    doc_texts.append(doc_text)
+                    pbar.update(1)
+
+            docs = list(
+                nlp.pipe(
+                    tqdm.tqdm(
+                        doc_texts,
+                        desc="Creating doc objects",
+                        leave=False
+                    ),
+                    n_process=-1,
+                    batch_size=64,
+                )
+            )
+
+            # This is an embarrassingly parallel scenario - speed is fine for ~10k articles though.
+            with tqdm.tqdm(
+                desc="Parsing annotations", total=len(docs), leave=False
+            ) as pbar:
+                for doc, row in zip(docs, docs_info_rows):
+                    doc_annots = self._annotations.get(row["docid"], [])
+                    doc.ents, _ = create_spans_from_doc_annotation(
+                        doc=doc, entities_info=entities, annotations=doc_annots,
+                    )
+                    annotated_docs.append(doc)
+                    n_annots_available += len(doc_annots)
+                    n_annots_assigned += len(doc.ents)
+
+                    pbar.update(1)
+
+        print(f"Assigned {n_annots_assigned} out of {n_annots_available} annotations "
+              f"({(n_annots_assigned / n_annots_available * 100):.2f}%) in {pbar.n} docs.")
+
+        return annotated_docs
diff --git a/benchmarks/nel/scripts/datasets/utils.py b/benchmarks/nel/src/datasets/utils.py
similarity index 84%
rename from benchmarks/nel/scripts/datasets/utils.py
rename to benchmarks/nel/src/datasets/utils.py
index 5d7d4688a..6dd5c6e44 100644
--- a/benchmarks/nel/scripts/datasets/utils.py
+++ b/benchmarks/nel/src/datasets/utils.py
@@ -3,6 +3,7 @@ from typing import Dict, List, Set, Tuple

 import tqdm
 from spacy.tokens import Token, Span, Doc
+from spacy.pipeline import EntityLinker

 from wikid import schemas, load_entities

@@ -67,29 +68,17 @@
 def create_spans_from_doc_annotation(
     doc: Doc,
     entities_info: Dict[str, schemas.Entity],
     annotations: List[schemas.Annotation],
-    harmonize_with_doc_ents: bool,
 ) -> Tuple[List[Span], List[schemas.Annotation]]:
     """Creates spans from annotations for one document.
     doc (Doc): Document for whom to create spans.
     entities_info (Dict[str, Entity]): All available entities.
     annotation (List[Dict[str, Union[Set[str], str, int]]]): Annotations for this post/comment.
-    harmonize_harmonize_with_doc_ents (Language): Whether to only keep those annotations matched by entities in the
-        provided Doc object.
     RETURNS (Tuple[List[Span], List[Dict[str, Union[Set[str], str, int]]]]): List of doc spans for annotated entities;
         list of overlapping entities.
     """
-    doc_ents_idx = {
-        # spaCy sometimes includes leading articles in entities, our benchmark datasets don't. Hence we drop all
-        # leading "the " and adjust the entity positions accordingly.
-        (ent.start_char + (0 if not ent.text.lower().startswith("the ") else 4), ent.end_char)
-        for ent in doc.ents
-    }
     doc_annots: List[schemas.Annotation] = []
     overlapping_doc_annotations: List[schemas.Annotation] = []

-    if harmonize_with_doc_ents and len(doc_ents_idx) == 0:
-        return [], []
-
     for i, annot_data in enumerate(
         sorted(
             [
@@ -120,10 +109,6 @@
                 annot.end_pos = token.idx + len(token)
                 break

-        # After token alignment: filter with NER pipeline, if available.
-        if harmonize_with_doc_ents and (annot.start_pos, annot.end_pos) not in doc_ents_idx:
-            continue
-
         # If there is an overlap between annotation's start and end position and this token's parsed start
         # and end, we try to create a span with this token's position.
         overlaps = False
@@ -143,7 +128,7 @@
     doc_spans = [
         # No label/entity type information available.
         doc.char_span(
-            annot.start_pos, annot.end_pos, label="NIL", kb_id=annot.entity_id
+            annot.start_pos, annot.end_pos, label=EntityLinker.NIL, kb_id=annot.entity_id
         )
         for annot in doc_annots
     ]
diff --git a/benchmarks/nel/scripts/parse_corpus.py b/benchmarks/nel/src/parse_corpus.py
similarity index 100%
rename from benchmarks/nel/scripts/parse_corpus.py
rename to benchmarks/nel/src/parse_corpus.py
diff --git a/benchmarks/nel/src/train.py b/benchmarks/nel/src/train.py
new file mode 100644
index 000000000..7dc20b2f0
--- /dev/null
+++ b/benchmarks/nel/src/train.py
@@ -0,0 +1,25 @@
+"""
+API call for training. Mainly for debugging purposes.
+"""
+from pathlib import Path
+import custom_functions
+from spacy.cli.train import train
+
+if __name__ == '__main__':
+    root = Path(__file__).parent.parent
+    train(
+        root / "configs" / "nel.cfg",
+        output_path=root / "training" / "mewsli_9" / "default",
+        use_gpu=0,
+        overrides={
+            "paths.dataset_name": "mewsli_9",
+            "paths.train": str(root / "corpora/mewsli_9/train.spacy"),
+            "paths.dev": str(root / "corpora/mewsli_9/dev.spacy"),
+            "paths.kb": str(root / "wikid/output/en/kb"),
+            "paths.db": str(root / "wikid/output/en/wiki.sqlite3"),
+            "paths.base_nlp": str(root / "training/base-nlp/en"),
+            "paths.mentions_candidates": str(root / "corpora" / "mewsli_9" / "mentions_candidates.pkl"),
+            "paths.language": "en",
+            "training.max_steps": 10,
+        }
+    )
diff --git a/benchmarks/nel/scripts/train.sh b/benchmarks/nel/src/train.sh
similarity index 66%
rename from benchmarks/nel/scripts/train.sh
rename to benchmarks/nel/src/train.sh
index 6a615ecab..c0c3ff0db 100644
--- a/benchmarks/nel/scripts/train.sh
+++ b/benchmarks/nel/src/train.sh
@@ -9,14 +9,16 @@ gpu_id="${6:--1}"
 # (4) config file name,
 # (5) max. steps.
 # (6) GPU information if GPU is to be used.
-PYTHONPATH=scripts python -m spacy train configs/$4 \
+PYTHONPATH='src' python -m spacy train configs/$4 \
   --paths.dataset_name $1 \
   --output training/$1/$2 \
   --paths.train corpora/$1/train.spacy \
   --paths.dev corpora/$1/dev.spacy \
   --paths.kb wikid/output/$3/kb \
-  --paths.base_nlp wikid/output/$3/nlp \
+  --paths.db wikid/output/$3/wiki.sqlite3 \
+  --paths.base_nlp training/base-nlp/$3 \
+  --paths.mentions_candidates corpora/$1/mentions_candidates.pkl \
   --paths.language $3 \
   --training.max_steps $5 \
-  -c scripts/custom_functions.py \
+  -c src/custom_functions.py \
   --gpu-id $gpu_id
\ No newline at end of file
diff --git a/benchmarks/nel/scripts/utils.py b/benchmarks/nel/src/utils.py
similarity index 100%
rename from benchmarks/nel/scripts/utils.py
rename to benchmarks/nel/src/utils.py
diff --git a/benchmarks/nel/test_nel_benchmark.py b/benchmarks/nel/test_nel_benchmark.py
index 54b26d217..55be2f3cf 100644
--- a/benchmarks/nel/test_nel_benchmark.py
+++ b/benchmarks/nel/test_nel_benchmark.py
@@ -11,7 +11,6 @@ def test_nel_benchmark():
     overrides_key = "SPACY_CONFIG_OVERRIDES"
     root = Path(__file__).parent
-
     project_run(root, "download_mewsli9", capture=True)
     project_run(root, "download_model", capture=True)
     project_run(root, "wikid_clone", capture=True)
@@ -25,8 +24,9 @@
     # Re-enable config overrides, if set before.
     if overrides:
         os.environ[overrides_key] = overrides
-    project_run(root, "parse_corpus", capture=True)
+    project_run(root, "extract_annotations", capture=True)
     project_run(root, "compile_corpora", capture=True)
+    project_run(root, "retrieve_mentions_candidates", capture=True)
     project_run(root, "train", capture=True, overrides={"vars.training_max_steps": 1, "vars.training_max_epochs": 1})
     project_run(root, "evaluate", capture=True)
    project_run(root, "compare_evaluations", capture=True)
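The test temporarily clears `SPACY_CONFIG_OVERRIDES` so that user-level config overrides don't leak into steps that need a pristine config, then restores it afterwards. Isolated, the pattern looks like this (a sketch, under the assumption that the project runner picks that environment variable up):

# Sketch (not part of the PR): save/restore pattern for SPACY_CONFIG_OVERRIDES.
import os

overrides_key = "SPACY_CONFIG_OVERRIDES"
overrides = os.environ.pop(overrides_key, None)  # disable overrides temporarily
try:
    ...  # run steps that must not see user-supplied config overrides
finally:
    if overrides:
        os.environ[overrides_key] = overrides  # re-enable, mirroring the test above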