2 changes: 1 addition & 1 deletion scripts/generate_parser_test_files.py
@@ -9,7 +9,7 @@
from fundus import Crawler, PublisherCollection
from fundus.logging import create_logger, set_log_level
from fundus.publishers.base_objects import Publisher
from fundus.scraping.article import Article
from fundus.scraping.publication import Article
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper
2 changes: 1 addition & 1 deletion scripts/publisher_coverage.py
@@ -11,7 +11,7 @@

from fundus import Crawler, PublisherCollection
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.scraping.article import Article
from fundus.scraping.publication import Article
from fundus.scraping.session import socket_timeout


2 changes: 1 addition & 1 deletion src/fundus/__init__.py
@@ -3,9 +3,9 @@
from langdetect import DetectorFactory

from fundus.publishers import PublisherCollection
from fundus.scraping.article import Article
from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
from fundus.scraping.filter import Requires
from fundus.scraping.publication import Article
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

__module_path__ = pathlib.Path(__file__).parent
60 changes: 54 additions & 6 deletions src/fundus/parser/data.py
@@ -332,14 +332,14 @@ def __eq__(self, other: object) -> bool:
class TextSequenceTree(ABC):
"""Base class to traverse and build trees of TextSequence."""

def as_text_sequence(self) -> TextSequence:
texts = [text for tl in self.df_traversal() for text in tl]
def as_text_sequence(self, iterator: Optional[Iterator[Any]] = None) -> TextSequence:
texts = [text for tl in self.df_traversal(iterator=iterator) for text in tl]
return TextSequence(texts)

def text(self, join_on: str = "\n\n") -> str:
return join_on.join(self.as_text_sequence())
def text(self, join_on: str = "\n\n", iterator: Optional[Iterator[Any]] = None) -> str:
return join_on.join(self.as_text_sequence(iterator=iterator))

def df_traversal(self) -> Iterable[TextSequence]:
def df_traversal(self, iterator: Optional[Iterator[Any]] = None) -> Iterable[TextSequence]:
def recursion(o: object):
if isinstance(o, TextSequence):
yield o
@@ -349,7 +349,7 @@ def recursion(o: object):
else:
yield o

for value in self:
for value in iter(self) if not iterator else iterator:
yield from recursion(value)

@abstractmethod
@@ -413,6 +413,54 @@ def __bool__(self):
return any(bool(section) for section in self.sections)


@dataclass
class LiveTickerBody(TextSequenceTree):
summary: TextSequence
entries: List[ArticleBody]
entry_meta_information: List[Dict[str, Any]]

def serialize(self) -> Dict[str, Any]:
return {
"summary": list(self.summary),
"entries": [entry.serialize() for entry in self.entries],
"entry_meta_information": self.entry_meta_information,
}

@classmethod
def deserialize(cls, serialized: Dict[str, Any]) -> Self:
return cls(
summary=TextSequence(serialized["summary"]),
entries=[ArticleBody.deserialize(entry) for entry in serialized["entries"]],
entry_meta_information=serialized["entry_meta_information"],
)

def __bool__(self):
return any(bool(entry) for entry in self.entries)

def __iter__(self) -> Iterator[Any]:
field_values = [
getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
]
field_values.extend([entry.sections for entry in self.entries])
yield from field_values

def __meta_iter__(self) -> Iterator[Any]:
field_values = [
getattr(self, f.name) for f in fields(self) if f.name not in ("entry_meta_information", "entries")
]
for entry, meta in zip(self.entries, self.entry_meta_information):
field_values.append(
TextSequence(
[f"LiveTicker entry from {meta.get('publishing_date')} by {', '.join(meta.get('authors', []))}"]
)
)
field_values.extend([entry.sections])
yield from field_values

def pretty_print(self):
return self.text(iterator=self.__meta_iter__())


@total_ordering
@dataclass
class Dimension(DataclassSerializationMixin):
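A hedged sketch (not part of the diff) of how the new LiveTickerBody could be consumed: the field names mirror the dataclass above, ArticleBody/ArticleSection/TextSequence are the existing helpers in fundus.parser.data, and the sample texts, date, and author name are invented.

from datetime import datetime

from fundus.parser.data import ArticleBody, ArticleSection, LiveTickerBody, TextSequence

# one ticker entry, built like a small ArticleBody
entry = ArticleBody(
    summary=TextSequence([]),
    sections=[
        ArticleSection(
            TextSequence(["Kick-off"]),  # subheadline(s) of the section
            TextSequence(["First paragraph of the ticker entry."]),  # paragraphs of the section
        )
    ],
)

ticker = LiveTickerBody(
    summary=TextSequence(["Teaser text of the live ticker."]),
    entries=[entry],
    entry_meta_information=[{"publishing_date": datetime(2024, 5, 1, 10, 30), "authors": ["Jane Doe"]}],
)

print(ticker.text())          # summary plus entry sections as plain text
print(ticker.pretty_print())  # same, but each entry is prefixed with its date/author meta line
restored = LiveTickerBody.deserialize(ticker.serialize())  # round-trips via the methods above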
139 changes: 134 additions & 5 deletions src/fundus/parser/utility.py
@@ -10,6 +10,7 @@
from datetime import datetime
from functools import total_ordering
from typing import (
Any,
Callable,
ClassVar,
Dict,
@@ -33,7 +34,8 @@
import validators
from dateutil import parser
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from lxml.etree import XPath, tostring
from lxml.html import Element

from fundus.logging import create_logger
from fundus.parser.data import (
@@ -44,6 +46,7 @@
Image,
ImageVersion,
LinkedDataMapping,
LiveTickerBody,
TextSequence,
)
from fundus.utils.regex import _get_match_dict
@@ -69,7 +72,7 @@ def normalize_whitespace(text: str) -> str:
@total_ordering
@dataclass(eq=False)
class Node:
position: int
position: float
node: lxml.html.HtmlElement = field(compare=False)
_break_selector: ClassVar[XPath] = XPath("*//br")

@@ -124,10 +127,37 @@ class SummaryNode(Node):
pass


@dataclass(eq=False)
class BoundaryNode(Node):
def __post_init__(self):
self.position -= 0.5 # in case a content node is also a boundary node, we want the boundary to come first


class SubheadNode(Node):
pass


@dataclass(eq=False)
class DateNode(Node):
_datetime_selector = XPath("./@datetime")
_timestamp: Optional[str] = None

def __post_init__(self):
if (timestamp := self._datetime_selector(self.node)) is not None:
self._timestamp = " ".join(generic_nodes_to_text(timestamp))

def text_content(self, excluded_tags: Optional[List[str]] = None, tag_filter: Optional[XPath] = None) -> str:
return self._timestamp if self._timestamp else super().text_content(excluded_tags, tag_filter)


class AuthorNode(Node):
pass


class ImageNode(Node):
pass


class ParagraphNode(Node):
pass

@@ -190,6 +220,105 @@ def extract_nodes(selector: XPath, node_type: Type[Node]) -> List[Node]:
return ArticleBody(summary=summary, sections=sections)


def extract_live_ticker_body_with_selector(
doc: lxml.html.HtmlElement,
paragraph_selector: XPath,
summary_selector: Optional[XPath] = None,
subheadline_selector: Optional[XPath] = None,
entry_boundary_selector: Optional[XPath] = None,
tag_filter: Optional[XPath] = None,
date_selector: Optional[XPath] = None,
author_selector: Optional[XPath] = None,
image_selector: Optional[XPath] = None,
image_selection_helper: Optional[Callable[[lxml.html.HtmlElement], List[Image]]] = None,
) -> LiveTickerBody:
# depth first index for each element in tree
df_idx_by_ref = {element: i for i, element in enumerate(doc.iter())}

def extract_nodes(selector: XPath, node_type: Type[Node], root: lxml.html.HtmlElement = doc) -> List[Node]:
if not selector and node_type:
raise ValueError("Both a selector and node type are required")

return [node for element in selector(root) if (node := node_type(df_idx_by_ref[element], element))]

summary_nodes = extract_nodes(summary_selector, SummaryNode) if summary_selector else []
boundary_nodes = extract_nodes(entry_boundary_selector, BoundaryNode) if entry_boundary_selector else []
paragraph_nodes = extract_nodes(paragraph_selector, ParagraphNode)
subhead_nodes = extract_nodes(subheadline_selector, SubheadNode) if subheadline_selector else []
date_nodes = extract_nodes(date_selector, DateNode) if date_selector else []
author_nodes = extract_nodes(author_selector, AuthorNode) if author_selector else []
image_nodes = extract_nodes(image_selector, ImageNode) if image_selector else []
nodes = sorted(
summary_nodes + boundary_nodes + subhead_nodes + paragraph_nodes + date_nodes + author_nodes + image_nodes
)

if not nodes[: len(summary_nodes)] == summary_nodes:
raise ValueError(f"All summary nodes should be at the beginning of the article")

summary = TextSequence(
map(
lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
summary_nodes,
)
)

entries: List[ArticleBody] = []
entries_meta_information: List[Dict[str, Any]] = []
entry_nodes = more_itertools.split_at(nodes[len(summary_nodes) :], pred=lambda x: isinstance(x, BoundaryNode))

for entry in entry_nodes:
if not entry:
continue
content_nodes = filter(lambda x: isinstance(x, ParagraphNode) or isinstance(x, SubheadNode), entry)
instructions = more_itertools.split_when(content_nodes, pred=lambda x, y: type(x) != type(y))
subhead_nodes = []
paragraph_nodes = []
entry_date = None
entry_authors = []
entry_images: List[Image] = []
wrapper = Element("div")
for node in entry:
wrapper.append(node.node)
if isinstance(node, SubheadNode):
subhead_nodes.append(node)
elif isinstance(node, ParagraphNode):
paragraph_nodes.append(node)
elif isinstance(node, DateNode):
entry_date = generic_date_parsing("".join(node.text_content()))
elif isinstance(node, AuthorNode):
entry_authors = generic_author_parsing(node.text_content())
elif isinstance(node, ImageNode):
entry_images = image_selection_helper(node.node) if image_selection_helper else []
else:
raise ValueError(f"Unsupported node type: {type(node)}")

if not subhead_nodes or (paragraph_nodes and subhead_nodes[0] > paragraph_nodes[0]):
first = next(instructions)
instructions = itertools.chain([first, []], instructions)

sections: List[ArticleSection] = []

for chunk in more_itertools.chunked(instructions, 2):
if len(chunk) == 1:
chunk.append([])
texts = [
list(
map(
lambda x: normalize_whitespace(x.text_content(excluded_tags=["script"], tag_filter=tag_filter)),
c,
)
)
for c in chunk
]
sections.append(ArticleSection(*map(TextSequence, texts)))

entries.append(ArticleBody(summary=TextSequence([]), sections=sections))
entries_meta_information.append(
{"publishing_date": entry_date, "authors": entry_authors, "images": entry_images, "html": tostring(wrapper)}
)
return LiveTickerBody(summary=summary, entries=entries, entry_meta_information=entries_meta_information)


_ld_node_selector = XPath("//script[@type='application/ld+json']")
_json_pattern = re.compile(r"(?P<json>{[\s\S]*}|\[\s*{[\s\S]*}\s*](?!\s*}))")
_json_undefined = re.compile(r'(?P<key>"[^"]*?"):\s*undefined')
@@ -502,10 +631,10 @@ class CustomParserInfo(parser.parserinfo):
("Jul", "July", "Juli"),
("Aug", "August"),
("Sep", "Sept", "September"),
("Oct", "October", "Oktober", "Okt"),
("Oct", "October", "Oktober", "Okt"), # type: ignore[list-item]
("Nov", "November"),
("Dec", "December", "Dezember", "Dez"),
]
("Dec", "December", "Dezember", "Dez"), # type: ignore[list-item]
] # type ignore due to types-python-dateutil==2.9.0.20251008, see https://github.com/flairNLP/fundus/issues/806


def generic_date_parsing(date_str: Optional[str]) -> Optional[datetime]:
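A hedged, self-contained sketch of the entry-splitting step used by extract_live_ticker_body_with_selector above: nodes are sorted by their depth-first position (BoundaryNode shifts itself by -0.5, so a boundary that shares a position with content still sorts first), then more_itertools.split_at cuts the flat node list into one group per ticker entry. The positions and the dummy element are invented; the import path assumes the node classes stay in fundus.parser.utility.

import more_itertools
from lxml.html import fragment_fromstring

from fundus.parser.utility import BoundaryNode, ParagraphNode  # the node classes defined in this file

el = fragment_fromstring("<div>dummy</div>")  # stand-in element; only the positions matter here
nodes = sorted(
    [ParagraphNode(4, el), BoundaryNode(3, el), ParagraphNode(2, el), BoundaryNode(6, el), ParagraphNode(7, el)]
)
# the boundaries now sit at positions 2.5 and 5.5, so the order is: P@2, B@2.5, P@4, B@5.5, P@7

entries = list(more_itertools.split_at(nodes, pred=lambda n: isinstance(n, BoundaryNode)))
# -> [[P@2], [P@4], [P@7]]: one list of content nodes per entry, boundary nodes dropped by split_at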
37 changes: 36 additions & 1 deletion src/fundus/publishers/de/sz.py
@@ -1,12 +1,14 @@
import datetime
from typing import List, Optional
from typing import List, Optional, Union

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.data import LiveTickerBody
from fundus.parser.utility import (
extract_article_body_with_selector,
extract_live_ticker_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
@@ -65,3 +67,36 @@ class V1_1(V1):
"//div[@itemprop='articleBody']//h3[@data-manual='subheadline'] |"
"//div[@itemprop='articleBody']//h2[@data-manual='subheadline']"
)

_live_ticker_boundary_selector = XPath("//div[contains(@class, 'event__body')]")
_live_ticker_paragraph_selector = XPath(
"//article//div[contains(@class, 'event__body')]//li|//article//div[contains(@class, 'event__body')]//div[@class='tik4-rich-text tik4-rich-text--de']/div"
)
_live_ticker_subheadline_selector = XPath(
"//article//div[contains(@class, 'event__body')]//h2|//article//div[contains(@class, 'event__body')]//h3"
)
_live_ticker_date_selector = XPath("//article//div[contains(@class, 'event__body')]//time")
_live_ticker_author_selector = XPath(
"//article//div[contains(@class, 'event__body')]//div[@class='tik4-author__name']"
)
_live_ticker_summary_selector = XPath("//p[@data-manual='teaserText']")

@attribute
def body(self) -> Optional[Union[ArticleBody, LiveTickerBody]]:
if not self._live_ticker_boundary_selector(self.precomputed.doc):
return extract_article_body_with_selector(
self.precomputed.doc,
summary_selector=self._summary_selector,
subheadline_selector=self._subheadline_selector,
paragraph_selector=self._paragraph_selector,
)
else:
return extract_live_ticker_body_with_selector(
self.precomputed.doc,
summary_selector=self._live_ticker_summary_selector,
subheadline_selector=self._live_ticker_subheadline_selector,
paragraph_selector=self._live_ticker_paragraph_selector,
entry_boundary_selector=self._live_ticker_boundary_selector,
author_selector=self._live_ticker_author_selector,
date_selector=self._live_ticker_date_selector,
)
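A hedged usage sketch for the parser change above: with this PR, article.body for SZ can be either a regular ArticleBody or a LiveTickerBody, while the crawl itself is standard fundus usage; max_articles is an arbitrary example value.

from fundus import Crawler, PublisherCollection
from fundus.parser.data import LiveTickerBody

crawler = Crawler(PublisherCollection.de.SZ)

for article in crawler.crawl(max_articles=5):
    if isinstance(article.body, LiveTickerBody):
        # live ticker: each entry prefixed with its date/author meta line
        print(article.body.pretty_print())
    elif article.body is not None:
        # regular article body
        print(article.body.text())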