diff --git a/kgdata/wikidata/datasets/entities.py b/kgdata/wikidata/datasets/entities.py index 24bac7a..907537f 100644 --- a/kgdata/wikidata/datasets/entities.py +++ b/kgdata/wikidata/datasets/entities.py @@ -21,7 +21,7 @@ @lru_cache() -def entities(lang: str = "en") -> Dataset[WDEntity]: +def entities(lang: str = "en", with_dep: bool = False) -> Dataset[WDEntity]: """Normalize Wikidata entity from Wikidata entity json dumps. In the json dumps, an entity can linked to an entity that either: @@ -38,6 +38,14 @@ def entities(lang: str = "en") -> Dataset[WDEntity]: """ cfg = WikidataDirCfg.get_instance() + if not with_dep: + return Dataset( + cfg.entities / lang / "*.zst", + deserialize=deser_entity, + name=f"entities/{lang}/fixed", + dependencies=[], + ) + if cfg.has_json_dump(): prelim_ent_ds = entity_dump() prelim_ent_rdd = prelim_ent_ds.get_extended_rdd().map( diff --git a/kgdata/wikidata/datasets/entity_types_and_degrees.py b/kgdata/wikidata/datasets/entity_types_and_degrees.py index 27a6543..fedd5e9 100644 --- a/kgdata/wikidata/datasets/entity_types_and_degrees.py +++ b/kgdata/wikidata/datasets/entity_types_and_degrees.py @@ -25,13 +25,13 @@ class EntityTypeAndDegree(Record): @lru_cache() -def entity_types_and_degrees() -> Dataset[EntityTypeAndDegree]: +def entity_types_and_degrees(with_dep: bool = False) -> Dataset[EntityTypeAndDegree]: cfg = WikidataDirCfg.get_instance() ds = Dataset( cfg.entity_types_and_degrees / "*.gz", deserialize=EntityTypeAndDegree.deser, name="entity-types-and-degrees", - dependencies=[entity_all_types(), entity_degrees()], + dependencies=[entity_all_types(), entity_degrees()] if with_dep else [], ) if not ds.has_complete_data(): ( @@ -48,7 +48,7 @@ def entity_types_and_degrees() -> Dataset[EntityTypeAndDegree]: def merge_type_degree( - tup: tuple[str, tuple[EntityAllTypes, EntityDegree]] + tup: tuple[str, tuple[EntityAllTypes, EntityDegree]], ) -> EntityTypeAndDegree: return EntityTypeAndDegree( id=tup[0], diff --git a/kgdata/wikidata/datasets/properties.py b/kgdata/wikidata/datasets/properties.py index b2acb5f..40d10fc 100644 --- a/kgdata/wikidata/datasets/properties.py +++ b/kgdata/wikidata/datasets/properties.py @@ -3,7 +3,6 @@ from typing import List import orjson - from kgdata.dataset import Dataset from kgdata.db import deser_from_dict, ser_to_dict from kgdata.misc.hierarchy import build_ancestors @@ -19,7 +18,7 @@ from kgdata.wikidata.models.wdentity import WDEntity -def properties(lang="en") -> Dataset[WDProperty]: +def properties(lang="en", with_dep: bool = False) -> Dataset[WDProperty]: cfg = WikidataDirCfg.get_instance() if not does_result_dir_exist(cfg.properties / "ids"): @@ -39,7 +38,7 @@ def properties(lang="en") -> Dataset[WDProperty]: cfg.properties / f"{subdir}-{lang}/*.gz", deserialize=partial(deser_from_dict, WDProperty), name=f"properties/{subdir}/{lang}", - dependencies=[entities(lang)], + dependencies=[entities(lang)] if with_dep else [], ) basic_ds = get_ds("basic") full_ds = get_ds("full") diff --git a/pyproject.toml b/pyproject.toml index 79d1708..8731b06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "kgdata" -version = "7.0.10" +version = "7.0.11" description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)" authors = ["Binh Vu "] license = "MIT"