Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion kgdata/wikidata/datasets/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@


@lru_cache()
def entities(lang: str = "en") -> Dataset[WDEntity]:
def entities(lang: str = "en", with_dep: bool = False) -> Dataset[WDEntity]:
"""Normalize Wikidata entity from Wikidata entity json dumps.

In the json dumps, an entity can be linked to an entity that either:
Expand All @@ -38,6 +38,14 @@ def entities(lang: str = "en") -> Dataset[WDEntity]:
"""
cfg = WikidataDirCfg.get_instance()

if not with_dep:
return Dataset(
cfg.entities / lang / "*.zst",
deserialize=deser_entity,
name=f"entities/{lang}/fixed",
dependencies=[],
)

if cfg.has_json_dump():
prelim_ent_ds = entity_dump()
prelim_ent_rdd = prelim_ent_ds.get_extended_rdd().map(
Expand Down
6 changes: 3 additions & 3 deletions kgdata/wikidata/datasets/entity_types_and_degrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ class EntityTypeAndDegree(Record):


@lru_cache()
def entity_types_and_degrees() -> Dataset[EntityTypeAndDegree]:
def entity_types_and_degrees(with_dep: bool = False) -> Dataset[EntityTypeAndDegree]:
cfg = WikidataDirCfg.get_instance()
ds = Dataset(
cfg.entity_types_and_degrees / "*.gz",
deserialize=EntityTypeAndDegree.deser,
name="entity-types-and-degrees",
dependencies=[entity_all_types(), entity_degrees()],
dependencies=[entity_all_types(), entity_degrees()] if with_dep else [],
)
if not ds.has_complete_data():
(
Expand All @@ -48,7 +48,7 @@ def entity_types_and_degrees() -> Dataset[EntityTypeAndDegree]:


def merge_type_degree(
tup: tuple[str, tuple[EntityAllTypes, EntityDegree]]
tup: tuple[str, tuple[EntityAllTypes, EntityDegree]],
) -> EntityTypeAndDegree:
return EntityTypeAndDegree(
id=tup[0],
Expand Down
5 changes: 2 additions & 3 deletions kgdata/wikidata/datasets/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import List

import orjson

from kgdata.dataset import Dataset
from kgdata.db import deser_from_dict, ser_to_dict
from kgdata.misc.hierarchy import build_ancestors
Expand All @@ -19,7 +18,7 @@
from kgdata.wikidata.models.wdentity import WDEntity


def properties(lang="en") -> Dataset[WDProperty]:
def properties(lang="en", with_dep: bool = False) -> Dataset[WDProperty]:
cfg = WikidataDirCfg.get_instance()

if not does_result_dir_exist(cfg.properties / "ids"):
Expand All @@ -39,7 +38,7 @@ def properties(lang="en") -> Dataset[WDProperty]:
cfg.properties / f"{subdir}-{lang}/*.gz",
deserialize=partial(deser_from_dict, WDProperty),
name=f"properties/{subdir}/{lang}",
dependencies=[entities(lang)],
dependencies=[entities(lang)] if with_dep else [],
)
basic_ds = get_ds("basic")
full_ds = get_ds("full")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "kgdata"
version = "7.0.10"
version = "7.0.11"
description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)"
authors = ["Binh Vu <binh@toan2.com>"]
license = "MIT"
Expand Down