2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
@@ -9,7 +9,7 @@
from fundus import Crawler, PublisherCollection
from fundus.logging import create_logger, set_log_level
from fundus.publishers.base_objects import Publisher
from fundus.scraping.article import Article
from fundus.scraping.publication import Article
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper
2 changes: 1 addition & 1 deletion scripts/publisher_coverage.py
@@ -11,7 +11,7 @@

from fundus import Crawler, PublisherCollection
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.scraping.article import Article
from fundus.scraping.publication import Article
from fundus.scraping.session import socket_timeout


2 changes: 1 addition & 1 deletion src/fundus/__init__.py
@@ -3,9 +3,9 @@
from langdetect import DetectorFactory

from fundus.publishers import PublisherCollection
from fundus.scraping.article import Article
from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
from fundus.scraping.filter import Requires
from fundus.scraping.publication import Article
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

__module_path__ = pathlib.Path(__file__).parent
60 changes: 54 additions & 6 deletions src/fundus/parser/data.py
@@ -332,14 +332,14 @@ def __eq__(self, other: object) -> bool:
class TextSequenceTree(ABC):
"""Base class to traverse and build trees of TextSequence."""

def as_text_sequence(self) -> TextSequence:
texts = [text for tl in self.df_traversal() for text in tl]
def as_text_sequence(self, iterator: Optional[Iterator[Any]] = None) -> TextSequence:
texts = [text for tl in self.df_traversal(iterator=iterator) for text in tl]
return TextSequence(texts)

def text(self, join_on: str = "\n\n") -> str:
return join_on.join(self.as_text_sequence())
def text(self, join_on: str = "\n\n", iterator: Optional[Iterator[Any]] = None) -> str:
return join_on.join(self.as_text_sequence(iterator=iterator))

def df_traversal(self) -> Iterable[TextSequence]:
def df_traversal(self, iterator: Optional[Iterator[Any]] = None) -> Iterable[TextSequence]:
def recursion(o: object):
if isinstance(o, TextSequence):
yield o
@@ -349,7 +349,7 @@ def recursion(o: object):
else:
yield o

for value in self:
for value in iter(self) if not iterator else iterator:
yield from recursion(value)

@abstractmethod
@@ -413,6 +413,54 @@ def __bool__(self):
return any(bool(section) for section in self.sections)


@dataclass
class LiveTickerBody(TextSequenceTree):
summary: TextSequence
entries: List[ArticleBody]
entry_meta_information: List[Dict[str, Any]]

def serialize(self) -> Dict[str, Any]:
return {
"summary": list(self.summary),
"entries": [entry.serialize() for entry in self.entries],
"entry_meta_information": self.entry_meta_information,
}

@classmethod
def deserialize(cls, serialized: Dict[str, Any]) -> Self:
return cls(
summary=TextSequence(serialized["summary"]),
entries=[ArticleBody.deserialize(entry) for entry in serialized["entries"]],
entry_meta_information=serialized["entry_meta_information"],
)

def __bool__(self):
return any(bool(entry) for entry in self.entries)

def __iter__(self) -> Iterator[Any]:
field_values = [
getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
]
field_values.extend([entry.sections for entry in self.entries])
yield from field_values

def __meta_iter__(self) -> Iterator[Any]:
field_values = [
getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
]
for entry, meta in zip(self.entries, self.entry_meta_information):
field_values.append(
TextSequence(
[f"LiveTicker entry from {meta.get('publishing_date')} by {', '.join(meta.get('authors', []))}"]
)
)
field_values.extend([entry.sections])
yield from field_values

def pretty_print(self):
return self.text(iterator=self.__meta_iter__())


@total_ordering
@dataclass
class Dimension(DataclassSerializationMixin):
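A hedged sketch (not part of the diff) of how the new LiveTickerBody could be consumed: the field names mirror the dataclass above, ArticleBody/ArticleSection/TextSequence are the existing helpers in fundus.parser.data, and the sample texts, date, and author name are invented.

from datetime import datetime

from fundus.parser.data import ArticleBody, ArticleSection, LiveTickerBody, TextSequence

# one ticker entry, built like a small ArticleBody
entry = ArticleBody(
    summary=TextSequence([]),
    sections=[
        ArticleSection(
            TextSequence(["Kick-off"]),  # subheadline(s) of the section
            TextSequence(["First paragraph of the ticker entry."]),  # paragraphs of the section
        )
    ],
)

ticker = LiveTickerBody(
    summary=TextSequence(["Teaser text of the live ticker."]),
    entries=[entry],
    entry_meta_information=[{"publishing_date": datetime(2024, 5, 1, 10, 30), "authors": ["Jane Doe"]}],
)

print(ticker.text())          # summary plus entry sections as plain text
print(ticker.pretty_print())  # same, but each entry is prefixed with its date/author meta line
restored = LiveTickerBody.deserialize(ticker.serialize())  # round-trips via the methods above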
139 changes: 134 additions & 5 deletions src/fundus/parser/utility.py
@@ -10,6 +10,7 @@
from datetime import datetime
from functools import total_ordering
from typing import (
Any,
Callable,
ClassVar,
Dict,
@@ -33,7 +34,8 @@
import validators
from dateutil import parser
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from lxml.etree import XPath, tostring
from lxml.html import Element

from fundus.logging import create_logger
from fundus.parser.data import (
@@ -44,6 +46,7 @@
Image,
ImageVersion,
LinkedDataMapping,
LiveTickerBody,
TextSequence,
)
from fundus.utils.regex import _get_match_dict
@@ -69,7 +72,7 @@ def normalize_whitespace(text: str) -> str:
@total_ordering
@dataclass(eq=False)
class Node:
position: int
position: float
node: lxml.html.HtmlElement = field(compare=False)
_break_selector: ClassVar[XPath] = XPath("*//br")

@@ -124,10 +127,37 @@ class SummaryNode(Node):
pass


@dataclass(eq=False)
class BoundaryNode(Node):
def __post_init__(self):
self.position -= 0.5 # in case a content node is also a boundary node, we want the boundary to come first


class SubheadNode(Node):
pass


@dataclass(eq=False)
class DateNode(Node):
_datetime_selector = XPath("./@datetime")
_timestamp: Optional[str] = None

def __post_init__(self):
if (timestamp := self._datetime_selector(self.node)) is not None:
self._timestamp = " ".join(generic_nodes_to_text(timestamp))

def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str:
return self._timestamp if self._timestamp else super().text_content(excluded_tags, tag_filter)


class AuthorNode(Node):
pass


class ImageNode(Node):
pass


class ParagraphNode(Node):
pass

@@ -190,6 +220,105 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
return ArticleBody(summary=summary, sections=sections)


def extract_live_ticker_body_with_selector(
doc: lxml.html.HtmlElement,
paragraph_selector: XPath,
summary_selector: Optional[XPath] = None,
subheadline_selector: Optional[XPath] = None,
entry_boundary_selector: Optional[XPath] = None,
tag_filter: Optional[XPath] = None,
date_selector: Optional[XPath] = None,
author_selector: Optional[XPath] = None,
image_selector: Optional[XPath] = None,
image_selection_helper: Optional[Callable[[lxml.html.HtmlElement], List[Image]]] = None,
) -> LiveTickerBody:
# depth first index for each element in tree
df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())}

def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlElement = doc) -> List[Node]:
if not selector and node_type:
raise ValueError("Both a selector and node type are required")

return [node for element in selector(root) if (node := node_type(df_idx_by_ref[element], element))]

summary_nodes = extract_nodes(summary_selector, SummaryNode) if summary_selector else []
boundary_nodes = extract_nodes(entry_boundary_selector, BoundaryNode) if entry_boundary_selector else []
paragraph_nodes = extract_nodes(paragraph_selector, ParagraphNode)
subhead_nodes = extract_nodes(subheadline_selector, SubheadNode) if subheadline_selector else []
date_nodes = extract_nodes(date_selector, DateNode) if date_selector else []
author_nodes = extract_nodes(author_selector, AuthorNode) if author_selector else []
image_nodes = extract_nodes(image_selector, ImageNode) if image_selector else []
nodes = sorted(
summary_nodes + boundary_nodes + subhead_nodes + paragraph_nodes + date_nodes + author_nodes + image_nodes
)

if not nodes[: len(summary_nodes)] == summary_nodes:
raise ValueError(f"All summary nodes should be at the beginning of the article")

summary = TextSequence(
map(
lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
summary_nodes,
)
)

entries: List[ArticleBody] = []
entries_meta_information: List[Dict[str, Any]] = []
entry_nodes = more_itertools.split_at(nodes[len(summary_nodes) :], pred=lambda x: isinstance(x, BoundaryNode))

for entry in entry_nodes:
if not entry:
continue
content_nodes = filter(lambda x: isinstance(x, ParagraphNode) or isinstance(x, SubheadNode), entry)
instructions = more_itertools.split_when(content_nodes, pred=lambda x, y: type(x) != type(y))
subhead_nodes = []
paragraph_nodes = []
entry_date = None
entry_authors = []
entry_images: List[Image] = []
wrapper = Element("div")
for node in entry:
wrapper.append(node.node)
if isinstance(node, SubheadNode):
subhead_nodes.append(node)
elif isinstance(node, ParagraphNode):
paragraph_nodes.append(node)
elif isinstance(node, DateNode):
entry_date = generic_date_parsing("".join(node.text_content()))
elif isinstance(node, AuthorNode):
entry_authors = generic_author_parsing(node.text_content())
elif isinstance(node, ImageNode):
entry_images = image_selection_helper(node.node) if image_selection_helper else []
else:
raise ValueError(f"Unsupported node type: {type(node)}")

if not subhead_nodes or (paragraph_nodes and subhead_nodes[0] > paragraph_nodes[0]):
first = next(instructions)
instructions = itertools.chain([first, []], instructions)

sections: List[ArticleSection] = []

for chunk in more_itertools.chunked(instructions, 2):
if len(chunk) == 1:
chunk.append([])
texts = [
list(
map(
lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
c,
)
)
for c in chunk
]
sections.append(ArticleSection(*map(TextSequence, texts)))

entries.append(ArticleBody(summary=TextSequence([]), sections=sections))
entries_meta_information.append(
{"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": tostring(wrapper)}
)
return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)


_ld_node_selector = XPath("//script[@type='application/ld+json']")
_json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
_json_undefined = re.compile(r'(?P<key>"[^"]*?"):\s*undefined')
@@ -502,10 +631,10 @@ class CustomParserInfo(parser.parserinfo):
("Jul", "July", "Juli"),
("Aug", "August"),
("Sep", "Sept", "September"),
("Oct", "October", "Oktober", "Okt"),
("Oct", "October", "Oktober", "Okt"), # type: ignore[list-item]
("Nov", "November"),
("Dec", "December", "Dezember", "Dez"),
]
("Dec", "December", "Dezember", "Dez"), # type: ignore[list-item]
] # type ignore due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806


def generic_date_parsing(date_str: Optional[str]) -> Optional[datetime]:
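A hedged, self-contained sketch of the entry-splitting step used by extract_live_ticker_body_with_selector above: nodes are sorted by their depth-first position (BoundaryNode shifts itself by -0.5, so a boundary that shares a position with content still sorts first), then more_itertools.split_at cuts the flat node list into one group per ticker entry. The positions and the dummy element are invented; the import path assumes the node classes stay in fundus.parser.utility.

import more_itertools
from lxml.html import fragment_fromstring

from fundus.parser.utility import BoundaryNode, ParagraphNode  # the node classes defined in this file

el = fragment_fromstring("<div>dummy</div>")  # stand-in element; only the positions matter here
nodes = sorted(
    [ParagraphNode(4, el), BoundaryNode(3, el), ParagraphNode(2, el), BoundaryNode(6, el), ParagraphNode(7, el)]
)
# the boundaries now sit at positions 2.5 and 5.5, so the order is: P@2, B@2.5, P@4, B@5.5, P@7

entries = list(more_itertools.split_at(nodes, pred=lambda n: isinstance(n, BoundaryNode)))
# -> [[P@2], [P@4], [P@7]]: one list of content nodes per entry, boundary nodes dropped by split_at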
37 changes: 36 additions & 1 deletion src/fundus/publishers/de/sz.py
@@ -1,12 +1,14 @@
import datetime
from typing import List, Optional
from typing import List, Optional, Union

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.data import LiveTickerBody
from fundus.parser.utility import (
extract_article_body_with_selector,
extract_live_ticker_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
@@ -65,3 +67,36 @@ class V1_1(V1):
"//div[@itemprop='articleBody']//h3[@data-manual='subheadline'] |"
"//div[@itemprop='articleBody']//h2[@data-manual='subheadline']"
)

_live_ticker_boundary_selector = XPath("//div[contains(@class, 'event__body')]")
_live_ticker_paragraph_selector = XPath(
"//article//div[contains(@class, 'event__body')]//li|//article//div[contains(@class, 'event__body')]//div[@class='tik4-rich-text tik4-rich-text--de']/div"
)
_live_ticker_subheadline_selector = XPath(
"//article//div[contains(@class, 'event__body')]//h2|//article//div[contains(@class, 'event__body')]//h3"
)
_live_ticker_date_selector = XPath("//article//div[contains(@class, 'event__body')]//time")
_live_ticker_author_selector = XPath(
"//article//div[contains(@class, 'event__body')]//div[@class='tik4-author__name']"
)
_live_ticker_summary_selector = XPath("//p[@data-manual='teaserText']")

@attribute
def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
if not self._live_ticker_boundary_selector(self.precomputed.doc):
return extract_article_body_with_selector(
self.precomputed.doc,
summary_selector=self._summary_selector,
subheadline_selector=self._subheadline_selector,
paragraph_selector=self._paragraph_selector,
)
else:
return extract_live_ticker_body_with_selector(
self.precomputed.doc,
summary_selector=self._live_ticker_summary_selector,
subheadline_selector=self._live_ticker_subheadline_selector,
paragraph_selector=self._live_ticker_paragraph_selector,
entry_boundary_selector=self._live_ticker_boundary_selector,
author_selector=self._live_ticker_author_selector,
date_selector=self._live_ticker_date_selector,
)
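A hedged usage sketch for the parser change above: with this PR, article.body for SZ can be either a regular ArticleBody or a LiveTickerBody, while the crawl itself is standard fundus usage; max_articles is an arbitrary example value.

from fundus import Crawler, PublisherCollection
from fundus.parser.data import LiveTickerBody

crawler = Crawler(PublisherCollection.de.SZ)

for article in crawler.crawl(max_articles=5):
    if isinstance(article.body, LiveTickerBody):
        # live ticker: each entry prefixed with its date/author meta line
        print(article.body.pretty_print())
    elif article.body is not None:
        # regular article body
        print(article.body.text())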