diff --git a/edgar/company_reports/_base.py b/edgar/company_reports/_base.py index b52ce6ee..349c0472 100644 --- a/edgar/company_reports/_base.py +++ b/edgar/company_reports/_base.py @@ -199,14 +199,26 @@ def view(self, item_or_part: str): if item_text: print(item_text) - def _focused_context(self, focus, detail: str = 'standard') -> str: + def _focused_context(self, focus, detail: str = 'standard', output_format: str = 'text') -> str: """Generate cross-cutting context for specific topic(s). Pulls statement line items, note content, and policies together. + + Args: + focus: Topic or list of topics + detail: 'minimal', 'standard', or 'full' + output_format: 'text' (default) or 'markdown' for GFM pipe tables """ if isinstance(focus, str): focus = [focus] + notes = self.notes + + # Markdown mode: delegate to Notes.to_markdown with focus filtering + if output_format == 'markdown' and notes: + return notes.to_markdown(detail=detail, focus=focus, optimize_for_llm=True) + + # Text mode (original behavior) form_label = self.form or 'Filing' lines = [] topic_str = ', '.join(focus) @@ -220,7 +232,6 @@ def _focused_context(self, focus, detail: str = 'standard') -> str: pass lines.append("") - notes = self.notes if not notes: lines.append("(No notes available)") return "\n".join(lines) diff --git a/edgar/company_reports/ten_k.py b/edgar/company_reports/ten_k.py index 4504ad6f..33584924 100644 --- a/edgar/company_reports/ten_k.py +++ b/edgar/company_reports/ten_k.py @@ -349,7 +349,8 @@ def id_parse_document(self, markdown:bool=False): def __str__(self): return f"""TenK('{self.company}')""" - def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = None) -> str: + def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = None, + output_format: str = 'text') -> str: """ AI-optimized context string. 
@@ -358,10 +359,11 @@ def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = focus: Optional topic or list of topics for cross-cutting context. When set, returns statement lines + note + policy for that topic. Example: focus='debt' or focus=['debt', 'revenue'] + output_format: 'text' (default) or 'markdown' for GFM with pipe tables """ # Handle focus mode — cross-cutting topic context if focus: - return self._focused_context(focus, detail) + return self._focused_context(focus, detail, output_format=output_format) from edgar.display.formatting import format_currency_short diff --git a/edgar/company_reports/ten_q.py b/edgar/company_reports/ten_q.py index 3e816349..17cf4070 100644 --- a/edgar/company_reports/ten_q.py +++ b/edgar/company_reports/ten_q.py @@ -79,7 +79,8 @@ def __init__(self, filing): def __str__(self): return f"""TenQ('{self.company}')""" - def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = None) -> str: + def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = None, + output_format: str = 'text') -> str: """ AI-optimized context string. @@ -88,10 +89,11 @@ def to_context(self, detail: str = 'standard', focus: 'str | list[str] | None' = focus: Optional topic or list of topics for cross-cutting context. When set, returns statement lines + note + policy for that topic. Example: focus='debt' or focus=['debt', 'revenue'] + output_format: 'text' (default) or 'markdown' for GFM with pipe tables """ # Handle focus mode — cross-cutting topic context if focus: - return self._focused_context(focus, detail) + return self._focused_context(focus, detail, output_format=output_format) from edgar.display.formatting import format_currency_short diff --git a/edgar/markdown.py b/edgar/markdown.py new file mode 100644 index 00000000..da29e65a --- /dev/null +++ b/edgar/markdown.py @@ -0,0 +1,1330 @@ +""" +Markdown formatting utilities for financial data. 
+ +Converts HTML tables, XBRL data, and structured records into +LLM-optimized GitHub-Flavored Markdown. + +Key features: +- Smart table cell preprocessing (currency/percent merging) +- Intelligent column deduplication +- Noise filtering (XBRL metadata, verbose labels) +- Duplicate detection +- Markdown generation optimized for LLM readability +""" + +import re +from collections import deque +from typing import Optional, Tuple + +from bs4 import BeautifulSoup + +__all__ = [ + 'preprocess_currency_cells', + 'preprocess_percent_cells', + 'is_width_grid_row', + 'is_xbrl_metadata_table', + 'html_to_json', + 'list_of_dicts_to_table', + 'create_markdown_table', + 'process_content', + 'clean_text', + 'is_noise_text', + 'postprocess_text', +] + + +# ----------------------------- +# Text Utilities +# ----------------------------- + +def clean_text(text: str) -> str: + """Clean and normalize text.""" + if not text: + return "" + text = text.replace("\xa0", " ").replace(" ", " ") + return re.sub(r"\s+", " ", text).strip() + + +def is_noise_text(text: str) -> bool: + """Check if text is XBRL metadata or other noise.""" + text_lower = (text or "").lower() + + noise_patterns = [ + "reference 1:", + "http://fasb.org", + "http://www.xbrl.org", + "no definition available", + "namespace prefix:", + "balance type:", + "period type:", + "axis:", + "domain:", + "documentation of verbose label", + "documentation of label", + "verbose label", + "auth_ref", + ] + + return any(p in text_lower for p in noise_patterns) + + +def should_skip_duplicate(text: str, recent: deque, window: int = 8) -> bool: + """Check if text is a recent duplicate.""" + t = clean_text(text).lower() + if not t: + return True + return t in list(recent)[-window:] + + +def is_page_number(text: str) -> bool: + """Detect if text is a standalone page number (1-999).""" + text_clean = clean_text(text) + return bool(re.fullmatch(r'\d{1,3}', text_clean)) + + +def is_header_footer_text(text: str) -> bool: + """Detect common 
header/footer text that should be filtered.""" + text_lower = clean_text(text).lower() + + # Common header/footer patterns + patterns = [ + r'^table of contents?$', + r'^page \d+$', + r'^\d+$', # Standalone page numbers + r'^continued$', + r'^end of page$', + ] + + return any(re.match(pattern, text_lower) for pattern in patterns) + + +def format_page_reference(text: str, next_text: str = None) -> Optional[str]: + """ + Format page number references. + + If text is a bare page number followed by "Table of Contents", + replace with "page XX" or remove entirely based on context. + + Returns: + None if should be removed, formatted string otherwise + """ + if is_page_number(text): + # Check if next text is "Table of Contents" + if next_text and "table of contents" in clean_text(next_text).lower(): + # Skip both the page number and TOC text + return None + # Format as "page XX" + return f"page {clean_text(text)}" + return text + + +def postprocess_text(text: str) -> str: + """ + Post-process extracted text to filter page numbers and TOC references. 
+ + Removes: + - Bare page numbers followed by "Table of Contents" + - Standalone "Table of Contents" text + - Formats standalone page numbers as "page XX" + + Args: + text: Text content to process + + Returns: + Cleaned text with filtered page references + """ + if not text: + return text + + lines = text.split('\n') + processed_lines = [] + skip_next = False + + for i, line in enumerate(lines): + if skip_next: + skip_next = False + continue + + line_clean = line.strip() + + # Skip standalone "Table of Contents" + if line_clean.lower() == 'table of contents': + continue + + # Check for bare page number + if is_page_number(line_clean): + # Look ahead for "Table of Contents" + if i + 1 < len(lines): + next_line = lines[i + 1].strip() + if next_line.lower() == 'table of contents': + # Skip both this line and next + skip_next = True + continue + # Standalone page number - format as "page XX" + processed_lines.append(f"page {line_clean}") + else: + processed_lines.append(line) + + return '\n'.join(processed_lines) + + +# ----------------------------- +# Subsection Detection +# ----------------------------- + +def is_subsection_heading(element) -> tuple[bool, str]: + """ + Detect if an element is a subsection heading. 
+ + Subsection headings are typically: + - tags with font-weight:700 (bold) or font-style:italic + - Standalone in parent div (no siblings except whitespace) + - Short text (< 80 chars) starting with capital letter + - Parent div has top margin + + Returns: + tuple[bool, str]: (is_subsection, heading_level) + heading_level is "###" for bold or "####" for italic + """ + # Must be a span or div tag + if element.name not in ('span', 'div'): + return False, "" + + # Get text + text = element.get_text().strip() + + # Must be short (< 80 chars) and start with capital + if not text or len(text) > 80: + return False, "" + + if not text[0].isupper(): + return False, "" + + # Check for noise patterns + if text.lower() in ('table of contents', 'page'): + return False, "" + + # Exclude form headers and common SEC filing text + text_lower = text.lower() + + # Exact match filters (these are complete section names to exclude) + exact_noise = [ + 'united states', 'securities and exchange commission', 'washington', + 'commission file number', 'form 10-k', 'form 10-q', + 'signatures', 'part i', 'part ii', 'part iii', 'part iv', + 'table of contents' + ] + if any(keyword == text_lower for keyword in exact_noise): + return False, "" + + # Prefix match filters (exclude if text starts with these) + prefix_noise = ['exhibit', 'index to'] + if any(text_lower.startswith(keyword) for keyword in prefix_noise): + return False, "" + + # Exclude very short text (likely abbreviations or form codes) + if len(text) < 4: + return False, "" + + # For div elements, check if it contains a single span child + if element.name == 'div': + # Check if this div contains a single span as subsection + spans = element.find_all('span', recursive=False) + if len(spans) == 1: + span = spans[0] + span_text = span.get_text().strip() + # Check if span text matches the div text (meaning it's the only content) + if span_text == text: + element = span # Use the span for style checking + else: + return False, "" + else: 
+ return False, "" + + # Must have siblings check (for span elements) + parent = element.parent + if parent: + # Get all children, filtering out whitespace-only text nodes + # Note: NavigableString has name=None, Tag elements have name set + children = [ + child for child in parent.children + if (hasattr(child, 'name') and child.name is not None) or (isinstance(child, str) and child.strip()) + ] + + # If more than one non-whitespace child, it's not standalone + if len(children) > 1: + return False, "" + + # Check parent style for top margin (indicates section break) + parent_style = parent.get('style', '') + if 'margin-top' not in parent_style: + return False, "" + + # Exclude centered text (usually form headers) + if 'text-align:center' in parent_style: + return False, "" + + # Check style attributes for bold or italic + style = element.get('style', '') + + # Determine heading level based on style + is_bold = 'font-weight:700' in style or 'font-weight:bold' in style + is_italic = 'font-style:italic' in style + + if is_bold: + return True, "###" # Level 1 subsection + elif is_italic: + return True, "####" # Level 2 subsection + + return False, "" + + +# ----------------------------- +# Table Preprocessing +# -------------------------------------- + +def is_xbrl_metadata_table(soup_table) -> bool: + """Detect XBRL metadata tables (should be skipped).""" + text = soup_table.get_text().lower() + + if "namespace prefix" in text or "xbrli:string" in text: + return True + + if "us-gaap_" in text: + # Check if it's actual financial data (has $ and years) + if "$" in text and re.search(r"20\d{2}", text): + return False + return True + + return False + + +# ----------------------------- +# Table Title Extraction +# ----------------------------- + +def _is_valid_title(text: str) -> bool: + """ + Validate if text is a reasonable table title. 
+ + Args: + text: Candidate title string + + Returns: + True if text is a valid title + """ + if not text: + return False + + text = text.strip() + + # Length check (flexible, not magic numbers) + if len(text) < 2 or len(text) > 200: + return False + + # Must contain at least one letter + if not any(c.isalpha() for c in text): + return False + + # Filter noise patterns + noise_patterns = [ + r'^\d+$', # Just numbers + r'^col_?\d+$', # Placeholder columns + r'^\$?[\d,]+\.?\d*$', # Just a number/currency + r'^[\s\-_]+$', # Just whitespace/separators + ] + + for pattern in noise_patterns: + if re.match(pattern, text, re.IGNORECASE): + return False + + return True + + +def _extract_from_caption_tag(table_element) -> Optional[str]: + """Extract title from HTML tag (HTML standard).""" + caption = table_element.find('caption') + if caption: + text = caption.get_text(strip=True) + if _is_valid_title(text): + return text + return None + + +def _extract_from_spanning_row(table_element, max_rows: int = 3) -> Optional[str]: + """ + Extract title from spanning row in first N rows. 
+ + Improvements over original: + - Checks first N rows (not just first) + - More flexible colspan detection + - Better validation + - Non-destructive + + Args: + table_element: BeautifulSoup table element + max_rows: Maximum number of rows to check + + Returns: + Title string if found, None otherwise + """ + rows = table_element.find_all('tr') + + for row_idx in range(min(max_rows, len(rows))): + row = rows[row_idx] + cells = row.find_all(['th', 'td']) + + if not cells: + continue + + # Case 1: Single cell with large colspan + if len(cells) == 1: + cell = cells[0] + colspan = int(cell.get('colspan', 1)) + text = cell.get_text(strip=True) + + # If colspan is large (3+) and text is valid + if colspan >= 3 and _is_valid_title(text): + return text + + # Case 2: Multiple cells with identical text (merged visually) + texts = [c.get_text(strip=True) for c in cells] + unique_texts = set(t for t in texts if t) + + if len(unique_texts) == 1: + text = list(unique_texts)[0] + if _is_valid_title(text): + return text + + return None + + +def _infer_from_content(table_element) -> Optional[str]: + """ + Infer table title from content analysis. 
+ + Strategies: + - Financial statement detection (Revenue + Net Income = Income Statement) + - Date-based patterns ("Year Ended", "Quarter Ended") + - Segment/category patterns + + Args: + table_element: BeautifulSoup table element + + Returns: + Inferred title if pattern matched, None otherwise + """ + rows = table_element.find_all('tr') + if not rows: + return None + + # Get text from first few rows + all_text = ' '.join([r.get_text(' ', strip=True).lower() for r in rows[:5]]) + + # Financial statement patterns + financial_patterns = { + 'Income Statement': ['revenue', 'net income'], + 'Statement of Operations': ['revenue', 'operating income'], + 'Balance Sheet': ['assets', 'liabilities', 'equity'], + 'Cash Flow Statement': ['cash flow', 'operating activities'], + 'Statement of Cash Flows': ['cash provided', 'operating activities'], + } + + for statement_name, keywords in financial_patterns.items(): + if all(kw in all_text for kw in keywords): + return statement_name + + # Date-based patterns + date_pattern = r'(year|years|quarter|quarters|month|months|period)s?\s+ended\s+\w+' + match = re.search(date_pattern, all_text, re.IGNORECASE) + if match: + return match.group(0).title() + + # Segment/geographic patterns + if 'segment' in all_text and 'revenue' in all_text: + return 'Segment Information' + if 'geographic' in all_text or 'region' in all_text: + return 'Geographic Information' + + return None + + +def extract_table_title( + table_element, + section_title: Optional[str] = None, + context: Optional[dict] = None +) -> Tuple[Optional[str], str]: + """ + Extract table title from multiple sources with priority hierarchy. + + This replaces the brittle derived_title logic with a robust multi-source approach. + + Priority order: + 1. HTML tag (HTML standard) + 2. HTML summary attribute + 3. Preceding heading from context + 4. Spanning row (improved - checks first 3 rows) + 5. Inferred from content + 6. 
Section title from context + + Args: + table_element: BeautifulSoup table element + section_title: Optional section title + context: Optional context dict with 'preceding_heading', etc. + + Returns: + Tuple of (title, source) where source indicates where title came from + For debugging and quality tracking + """ + context = context or {} + + # Priority 1: HTML tag (HTML standard) + caption_title = _extract_from_caption_tag(table_element) + if caption_title: + return caption_title, 'caption_tag' + + # Priority 2: HTML summary attribute + summary = table_element.get('summary') + if summary and _is_valid_title(summary): + return summary, 'summary_attr' + + # Priority 3: Preceding heading from context + if 'preceding_heading' in context: + heading = context['preceding_heading'] + if _is_valid_title(heading): + return heading, 'preceding_heading' + + # Priority 4: Spanning row (improved - checks first 3 rows) + spanning_title = _extract_from_spanning_row(table_element, max_rows=3) + if spanning_title: + return spanning_title, 'spanning_row' + + # Priority 5: Infer from table content + inferred_title = _infer_from_content(table_element) + if inferred_title: + return inferred_title, 'inferred' + + # Priority 6: Section title from context + if section_title and _is_valid_title(section_title) and section_title != "Table": + return section_title, 'section_context' + + # No title found + return None, 'none' + + +def is_width_grid_row(tr) -> bool: + """Detect layout rows (empty cells with width styling).""" + tds = tr.find_all(["td", "th"]) + if not tds: + return False + if tr.get_text(strip=True): # Has text content + return False + + width_cells = 0 + for td in tds: + style = (td.get("style") or "").lower() + if "width" in style: + width_cells += 1 + + return width_cells >= 6 and (width_cells / max(1, len(tds))) >= 0.6 + + +def preprocess_currency_cells(table_soup): + """ + Merge standalone currency symbols with adjacent values. 
+ + Example: [$] [100] -> [$100] with colspan adjustment + + Note: Modifies the BeautifulSoup table in-place. + Safe for SEC financial tables where all rows have consistent structure. + """ + rows = table_soup.find_all("tr") + for row in rows: + cells = row.find_all(["td", "th"]) + i = 0 + while i < len(cells): + cell = cells[i] + txt = clean_text(cell.get_text()) + # If standalone $ symbol and has next cell + if txt in ["$"] and i + 1 < len(cells): + next_cell = cells[i + 1] + # Merge: prepend $ to next cell content + next_cell.string = txt + clean_text(next_cell.get_text()) + # Adjust colspan (next cell now spans both positions) + next_cell["colspan"] = str(int(next_cell.get("colspan", 1)) + 1) + # Remove the $ cell + cell.decompose() + i += 1 + + +def preprocess_percent_cells(table_soup): + """ + Merge standalone percent symbols with adjacent values. + + Example: [5] [%] -> [5%] with colspan adjustment + + Note: Scans right-to-left to merge % with preceding cell. + """ + rows = table_soup.find_all("tr") + for row in rows: + cells = row.find_all(["td", "th"]) + i = len(cells) - 1 + while i > 0: + cell = cells[i] + txt = clean_text(cell.get_text()) + # If standalone % symbol and has previous cell + if txt in ["%", "%)", "pts"]: + prev_cell = cells[i - 1] + prev_txt = clean_text(prev_cell.get_text()) + if prev_txt: + # Merge: append % to previous cell + prev_cell.string = prev_txt + txt + # Adjust colspan + prev_cell["colspan"] = str( + int(prev_cell.get("colspan", 1)) + + int(cell.get("colspan", 1)) + ) + # Remove the % cell + cell.decompose() + i -= 1 + + +def build_row_values(cells, max_cols): + """Build row value list with colspan expansion.""" + row_values = [] + for cell in cells: + try: + colspan = int(cell.get("colspan", 1)) + except (TypeError, ValueError): + colspan = 1 + txt = clean_text(cell.get_text(" ", strip=True)).replace("|", r"\|") + row_values.append(txt) + # Repeat value for colspan + for _ in range(colspan - 1): + row_values.append(txt) + + # 
Pad to max_cols + if len(row_values) < max_cols: + row_values.extend([""] * (max_cols - len(row_values))) + return row_values[:max_cols] + + +# ----------------------------- +# HTML to JSON Conversion +# ----------------------------- + +def html_to_json(table_soup): + """ + Convert HTML table to JSON intermediate format. + + Returns: + (text_blocks, records, derived_title) tuple where: + - text_blocks: List of long-form text extracted from table + - records: List of dicts with 'label' and 'col_N' keys + - derived_title: Extracted table title if found + + This intermediate format enables intelligent column deduplication + and header merging before markdown generation. + """ + table_soup_copy = BeautifulSoup(str(table_soup), "html.parser") + preprocess_currency_cells(table_soup_copy) + preprocess_percent_cells(table_soup_copy) + + rows = table_soup_copy.find_all("tr") + if not rows: + return None, [], None + + # Filter layout rows + rows = [r for r in rows if not is_width_grid_row(r)] + if not rows: + return None, [], None + + # Calculate max columns + max_cols = 0 + widths = [] + for row in rows: + cells = row.find_all(["th", "td"]) + if not cells: + continue + width = sum(int(cell.get("colspan", 1)) for cell in cells) + widths.append(width) + max_cols = max(max_cols, width) + + if max_cols == 0: + return None, [], None + + # Use 90th percentile to handle outliers + if len(widths) >= 5: + sorted_widths = sorted(widths) + p90 = sorted_widths[int(0.9 * (len(sorted_widths) - 1))] + if p90 >= 2 and max_cols > p90 * 2: + max_cols = p90 + + matrix = [] + row_flags = [] + output_blocks = [] + + # Build matrix + for row in rows: + cells = row.find_all(["td", "th"]) + if not cells: + continue + row_has_th = any(cell.name == "th" for cell in cells) + row_text = " ".join([c.get_text(" ", strip=True) for c in cells]) + + # Extract long text as separate blocks + if len(row_text) > 300: + if not is_noise_text(row_text): + output_blocks.append( + {"type": "text", "content": 
clean_text(row_text)} + ) + continue + + row_vals = build_row_values(cells, max_cols) + if not any(v.strip() for v in row_vals): + continue + matrix.append(row_vals) + row_flags.append(row_has_th) + + if not matrix: + return output_blocks, [], None + + # Skip sparse tables (>50 cols with <5% filled) + if max_cols >= 50: + total_cells = len(matrix) * max_cols + filled = sum(1 for row in matrix for val in row if val.strip()) + if total_cells and (filled / total_cells) < 0.05: + return output_blocks, [], None + + # Extract derived title (first row with single unique value spanning all columns) + derived_title = None + if len(matrix) > 1: + first_row = matrix[0] + unique_vals = set(v for v in first_row if v.strip()) + if len(unique_vals) == 1: + title_candidate = list(unique_vals)[0] + if 3 < len(title_candidate) < 150: + derived_title = title_candidate + matrix.pop(0) + row_flags.pop(0) + + # Detect label column (column with most text content) + def is_numericish(s): + """Check if string is primarily numeric data (pure numbers, currency, percentages).""" + s_stripped = s.strip() + if not s_stripped: + return False + + # Remove common formatting + s_clean = s_stripped.replace('$', '').replace(',', '').replace('%', '').replace('(', '').replace(')', '').strip() + + # Check if it's a pure number (possibly with decimal) + try: + float(s_clean) + return True + except ValueError: + pass + + # Check if it's mostly digits (more than 50% digits) + if s_clean: + digit_ratio = sum(c.isdigit() for c in s_clean) / len(s_clean) + if digit_ratio > 0.5: + return True + + return False + + def is_labelish(s): + """Check if string looks like a row label rather than data. 
+ + Recognizes: + - Pure text: "Revenue", "Assets" + - Text with numbers: "Q1", "Item 1", "Note 5" + - Excludes: pure numbers, currency values, percentages + """ + s_stripped = s.strip() + if not s_stripped: + return False + + # Must contain at least one letter + if not re.search(r'[A-Za-z]', s_stripped): + return False + + # Exclude if it's primarily numeric + if is_numericish(s_stripped): + return False + + return True + + # Calculate label scores only from data rows (not header rows with th tags) + # NOTE: This heuristic-based label detection could be simplified if + # TableNode.has_row_headers metadata were available at this level. + # Current approach: count "labelish" values per column and prefer leftmost. + label_scores = [] + for c in range(max_cols): + # Only count labelish values in rows without th tags (data rows) + score = sum( + 1 for i, row in enumerate(matrix) + if not row_flags[i] and is_labelish(row[c]) + ) + label_scores.append(score) + + # If no clear label column from data rows, fall back to all rows + if max(label_scores) == 0: + for c in range(max_cols): + score = sum(1 for r in matrix if is_labelish(r[c])) + label_scores[c] = score + + # Select label column: highest score, prefer leftmost column on ties + # (Most tables have row headers in the first column) + label_col = max(range(max_cols), key=lambda c: (label_scores[c], -c)) + + year_re = re.compile(r"\b(20\d{2}|19\d{2})\b") + + # Convert matrix to records with intelligent header detection + records = [] + for row_index, row in enumerate(matrix): + row_has_th = row_flags[row_index] + record = {} + is_header = row_has_th + + # Detect headers by content (year patterns, date headings) + for c in range(max_cols): + if c == label_col: + continue + if row[c] == row[label_col]: + continue + if year_re.search(row[c]): + is_header = True + break + + # Empty label + numeric values might still be header + label_text = (row[label_col] or "").lower() + if not is_header and not label_text: + data_values 
= [ + row[c] + for c in range(max_cols) + if c != label_col and row[c].strip() + ] + if data_values and len(set(data_values)) == 1: + is_header = True + + # Build record + if is_header: + record["label"] = "" + else: + record["label"] = row[label_col] + + for c in range(max_cols): + if c != label_col: + record[f"col_{c}"] = row[c] + + records.append(record) + + return output_blocks, records, derived_title + + +# ----------------------------- +# JSON to Markdown Conversion +# ----------------------------- + +def _normalize_table_value(value: str) -> str: + """Normalize value for comparison.""" + return clean_text(str(value)).lower() + + +def _is_total_row(row_dict, label_key): + """ + Detect if a row is a total row based on its label. + + Args: + row_dict: Dictionary representing the row + label_key: Key for the label column + + Returns: + bool: True if this appears to be a total row + """ + if not label_key or label_key not in row_dict: + return False + + label_text = str(row_dict.get(label_key, "")).lower().strip() + + if not label_text: + return False + + # Check for total keywords + total_keywords = ['total', 'sum', 'subtotal', 'grand total', 'net total'] + + # Check for exact match or starts with total keyword + for keyword in total_keywords: + if label_text == keyword or label_text.startswith(keyword + ' '): + return True + + return False + + +def create_markdown_table(headers, rows, alignments=None): + """ + Create markdown table from headers and rows with optional alignment. 
+ + Args: + headers: List of header strings + rows: List of row data (lists) + alignments: Optional list of 'left', 'right', 'center' for each column + If None, all columns are left-aligned + + Returns: + str: Markdown formatted table + """ + if not headers or not rows: + return "" + + # Build header row + md = f"| {' | '.join(map(str, headers))} |\n" + + # Build separator row with alignment + if alignments: + separators = [] + for align in alignments: + if align == 'right': + separators.append('---:') + elif align == 'center': + separators.append(':---:') + else: # left or None + separators.append('---') + md += f"| {' | '.join(separators)} |\n" + else: + md += f"| {' | '.join(['---'] * len(headers))} |\n" + + # Build data rows + for row in rows: + padded_row = list(row) + [""] * (len(headers) - len(row)) + cleaned_row = [str(x) if x is not None else "" for x in padded_row] + md += f"| {' | '.join(cleaned_row)} |\n" + return md + + +def list_of_dicts_to_table(data_list): + """ + Convert list of dicts to markdown table with intelligent column handling. + + Features: + - Deduplicates columns with identical signatures + - Filters placeholder columns (col_0, col_1, etc.) 
+ - Merges multi-row headers intelligently + - Removes blank value columns + - Auto-detects numeric columns for right-alignment + """ + if not data_list: + return "" + + all_keys = set().union(*(d.keys() for d in data_list)) + + def natural_keys(text): + return [ + int(c) if c.isdigit() else c.lower() + for c in re.split(r"(\d+)", text) + ] + + def is_numeric_column(values): + """Detect if a column contains primarily numeric data.""" + if not values: + return False + + numeric_count = 0 + total_count = 0 + + for val in values: + val_str = str(val).strip() + if not val_str or val_str == '-': + continue + + total_count += 1 + # Check for numeric patterns: numbers, currency, percentages + # Remove common formatting: $, commas, parentheses, %, and scale indicators (M, K, B, T) + cleaned = re.sub(r'[\$,\(\)%]', '', val_str).strip() + # Remove scale indicators (Millions, Thousands, Billions, Trillions) + cleaned = re.sub(r'[MKBT]$', '', cleaned, flags=re.IGNORECASE).strip() + + # Check if it's a number (possibly with minus sign or decimal) + if re.match(r'^-?\d+\.?\d*$', cleaned): + numeric_count += 1 + + # Consider numeric if >70% of non-empty values are numeric + if total_count == 0: + return False + return (numeric_count / total_count) > 0.7 + + sorted_keys = sorted(list(all_keys), key=natural_keys) + label_key = next( + (k for k in sorted_keys if k.lower() in ["label", "metric", "name"]), None + ) + + # Separate header rows from data rows + header_rows = [] + data_rows = [] + + if label_key: + for item in data_list: + if not str(item.get(label_key, "")).strip(): + header_rows.append(item) + else: + data_rows.append(item) + else: + data_rows = data_list + + # Build headers + if header_rows: + # Group columns by header signature + column_groups = {} + value_keys = [k for k in sorted_keys if k != label_key] + + for key in value_keys: + signature = tuple(str(row.get(key, "")).strip() for row in header_rows) + if signature not in column_groups: + column_groups[signature] 
= [] + column_groups[signature].append(key) + + final_headers = [label_key if label_key else "Row"] + final_keys = [label_key] if label_key else [] + processed_signatures = set() + + for key in value_keys: + signature = tuple(str(row.get(key, "")).strip() for row in header_rows) + if signature in processed_signatures: + continue + processed_signatures.add(signature) + + candidate_keys = column_groups[signature] + # Choose key with most non-empty data + best_key = max( + candidate_keys, + key=lambda k: sum( + 1 + for row in data_rows + if str(row.get(k, "")).strip() not in ["", "-"] + ), + ) + + # Skip empty columns + if sum( + 1 for row in data_rows if str(row.get(best_key, "")).strip() + ) == 0: + continue + + # Build header string from signature + header_str = " - ".join([p for p in signature if p]) or best_key + final_headers.append(header_str) + final_keys.append(best_key) + else: + # No multi-row headers + final_headers = sorted_keys + final_keys = sorted_keys + if label_key and label_key in final_headers: + final_headers.insert(0, final_headers.pop(final_headers.index(label_key))) + final_keys.insert(0, final_keys.pop(final_keys.index(label_key))) + + # Filter placeholder headers and duplicate columns + if data_rows and final_headers and final_keys: + def is_placeholder_header(header): + header_text = clean_text(str(header)).lower() + if not header_text: + return True + if re.fullmatch(r"col_?\d+", header_text): + return True + if header_text == "row": + return True + return False + + def is_blank_value(value): + if not value: + return True + return bool(re.fullmatch(r"-+", value)) + + keep_headers = [] + keep_keys = [] + seen = set() + locked_index = 0 + + for idx, (header, key) in enumerate(zip(final_headers, final_keys)): + # Always keep first column (labels) + if idx == locked_index: + keep_headers.append(header) + keep_keys.append(key) + continue + + # Get column values + values = tuple( + _normalize_table_value(item.get(key, "")) for item in data_rows + 
) + + # Skip all-blank columns + if all(is_blank_value(value) for value in values): + continue + + # Skip columns that duplicate the label column + if idx > locked_index and label_key: + label_values = tuple( + _normalize_table_value(item.get(label_key, "")) for item in data_rows + ) + if values == label_values: + # This column duplicates the label column + continue + + # Build signature (header + values) + header_norm = _normalize_table_value(header) + signature = ( + "" if is_placeholder_header(header) else header_norm, + values, + ) + + # Skip duplicates + if signature in seen: + continue + seen.add(signature) + + keep_headers.append(header) + keep_keys.append(key) + + final_headers = keep_headers + final_keys = keep_keys + + # Build table rows with total row highlighting + table_rows = [] + for item in data_rows: + # Check if this is a total row + is_total = _is_total_row(item, label_key) + + # Build row, bolding values if it's a total row + if is_total: + row = [f"**{item.get(k, '')}**" if item.get(k, '') else "" for k in final_keys] + else: + row = [item.get(k, "") for k in final_keys] + table_rows.append(row) + + # Detect numeric columns for right-alignment + alignments = [] + for idx, key in enumerate(final_keys): + # First column (labels) should be left-aligned + if idx == 0 and label_key and key == label_key: + alignments.append('left') + else: + # Extract column values (without bold formatting) + column_values = [str(item.get(key, "")).replace("**", "") for item in data_rows] + # Check if numeric + if is_numeric_column(column_values): + alignments.append('right') + else: + alignments.append('left') + + return create_markdown_table(final_headers, table_rows, alignments) + + +# ----------------------------- +# Content Processing +# ----------------------------- + +def process_content(content, section_title=None, track_filtered=False): + """ + Process HTML content to LLM-optimized markdown. 
+ + Features: + - Extracts tables with intelligent preprocessing + - Filters XBRL metadata tables + - Deduplicates tables via signature matching + - Extracts headings and text + - Optimizes for token efficiency + + Args: + content: HTML content to process + section_title: Title of the section + track_filtered: If True, return (markdown, filtered_metadata) tuple + + Returns: + str if track_filtered=False, else (str, dict) with filtered metadata + """ + if not content: + return ("", {}) if track_filtered else "" + + raw_str = str(content) + is_html = bool(re.search(r"<(table|div|p|h[1-6])", raw_str, re.IGNORECASE)) + + if not is_html: + result = f"\n{raw_str.strip()}\n" + return (result, {}) if track_filtered else result + + soup = BeautifulSoup(raw_str, "html.parser") + for tag in soup(["script", "style", "head", "meta"]): + tag.decompose() + + output_parts = [] + processed_tables = set() + table_signatures = set() + recent_text = deque(maxlen=32) + normalized_section = clean_text(section_title or "").lower() + table_counter = 0 + + # Track filtered items + filtered_metadata = { + "xbrl_metadata_tables": 0, + "duplicate_tables": 0, + "filtered_text_blocks": 0, + "details": [] + } if track_filtered else None + + elements = soup.find_all( + ["p", "div", "table", "ul", "ol", "h1", "h2", "h3", "h4", "h5", "h6"] + ) + + skip_next = False # Flag to skip elements (e.g., TOC after page number) + + for idx, element in enumerate(elements): + # Skip if marked by previous iteration + if skip_next: + skip_next = False + continue + # Skip if already processed as part of another table + if element.find_parent("table") in processed_tables: + continue + + # Process tables + if element.name == "table": + # Skip nested tables + if element.find("table"): + continue + # Skip XBRL metadata + if is_xbrl_metadata_table(element): + if filtered_metadata is not None: + filtered_metadata["xbrl_metadata_tables"] += 1 + table_text = element.get_text()[:100] + 
filtered_metadata["details"].append({ + "type": "xbrl_metadata_table", + "reason": "Contains XBRL namespace/type metadata (non-financial content)", + "preview": clean_text(table_text) + }) + continue + + # Extract table title using improved multi-source extraction + table_title, title_source = extract_table_title( + element, + section_title=section_title, + context={} # Could be enhanced with preceding_heading in future + ) + + # Convert to JSON intermediate format (for backward compat, still returns derived_title) + text_blocks, records, derived_title = html_to_json(element) + + # Add text blocks + for block in text_blocks: + if block["type"] == "text" and not is_noise_text(block["content"]): + output_parts.append(block["content"]) + + # Process table records + if records: + # Generate signature for deduplication + def _table_signature(records, title, max_rows=8): + if not records: + return None + keys = sorted({key for record in records for key in record.keys()}) + if not keys: + return None + row_sig = [] + for record in records[:max_rows]: + row_sig.append(tuple(_normalize_table_value(record.get(key, "")) for key in keys)) + title_sig = _normalize_table_value(title or "") + return (title_sig, tuple(keys), tuple(row_sig), len(records)) + + signature = _table_signature(records, table_title) + + # Skip duplicate tables + if signature and signature in table_signatures: + if filtered_metadata is not None: + filtered_metadata["duplicate_tables"] += 1 + filtered_metadata["details"].append({ + "type": "duplicate_table", + "reason": "Duplicate of earlier table (identical structure and data)", + "title": table_title or "Untitled" + }) + processed_tables.add(element) + continue + if signature: + table_signatures.add(signature) + + # Convert to markdown + table_counter += 1 + md_table = list_of_dicts_to_table(records) + if md_table: + # Generate header with improved title + if table_title: + header_str = f"#### Table: {table_title}" + else: + header_str = f"#### Table 
{table_counter}: {section_title or 'Data'}" + + output_parts.append(f"\n{header_str}\n{md_table}\n") + + processed_tables.add(element) + continue + + # Process headings + if element.name.startswith("h"): + txt = clean_text(element.get_text()) + if txt and not is_noise_text(txt): + output_parts.append(f"\n### {txt}\n") + continue + + # Check for subsection headings (bold/italic spans in standalone divs) + is_subsection, heading_level = is_subsection_heading(element) + if is_subsection: + txt = clean_text(element.get_text()) + if txt and not is_noise_text(txt): + output_parts.append(f"\n{heading_level} {txt}\n") + continue + + # Process lists + if element.name in ["ul", "ol"]: + lines = [ + f"- {clean_text(li.get_text())}" + for li in element.find_all("li") + if clean_text(li.get_text()) + ] + if lines: + output_parts.append("\n".join(lines)) + continue + + # Process paragraphs and divs + if element.find("table"): # Skip containers with tables + continue + text = clean_text(element.get_text()) + if len(text) <= 5 or is_noise_text(text): + continue + if normalized_section and text.lower() == normalized_section: + continue + + # Filter header/footer text (Table of Contents, etc.) 
+ if is_header_footer_text(text): + continue + + # Check for page number + TOC combination + if is_page_number(text): + # Look ahead to see if next element is "Table of Contents" + next_text = None + if idx + 1 < len(elements): + next_element = elements[idx + 1] + next_text = clean_text(next_element.get_text()) + if "table of contents" in next_text.lower(): + # Skip both this page number and the next TOC text + skip_next = True + continue + # Standalone page number without TOC - format as "page XX" + text = f"page {text}" + + if should_skip_duplicate(text, recent_text): + continue + + output_parts.append(text) + recent_text.append(text.lower()) + + result = "\n\n".join(output_parts) + + if track_filtered: + return (result, filtered_metadata) + return result diff --git a/edgar/xbrl/notes.py b/edgar/xbrl/notes.py index fc685285..eb42520a 100644 --- a/edgar/xbrl/notes.py +++ b/edgar/xbrl/notes.py @@ -201,6 +201,71 @@ def to_context(self, detail: str = 'standard') -> str: return "\n".join(lines) + def to_markdown(self, detail: str = 'standard', optimize_for_llm: bool = True) -> str: + """Render this note as GitHub-Flavored Markdown. 
+ + Args: + detail: 'minimal' (title + table names), 'standard' (+ tables + narrative), + 'full' (+ policies + details) + optimize_for_llm: Use LLM-optimized table formatting via process_content() + """ + parts = [] + parts.append(f"## Note {self.number}: {self.title}") + + # Expands metadata + expands = self.expands + if expands: + parts.append(f"**Expands:** {', '.join(expands)}") + expands_stmts = self.expands_statements + if expands_stmts: + parts.append(f"**From:** {', '.join(expands_stmts)}") + + if detail == 'minimal': + # Minimal: just title + table names list + if self.tables: + table_names = [_extract_table_name(t, self.short_name) for t in self.tables] + parts.append("**Tables:** " + ", ".join(table_names)) + return '\n\n'.join(part for part in parts if part) + + # Tables — always included for standard and full + for table_stmt in self.tables: + table_name = _extract_table_name(table_stmt, self.short_name) + parts.append(f"### {table_name}") + md = _render_statement_to_markdown(table_stmt, table_name, optimize_for_llm) + if md: + parts.append(md) + + # Narrative — extract from HTML with tables stripped to avoid duplication + note_html = self.html + if note_html: + narrative = _extract_narrative_markdown(note_html, optimize_for_llm) + if narrative: + parts.append("### Narrative") + parts.append(narrative) + elif self.text: + # Fallback: plain text if no HTML available + parts.append("### Narrative") + parts.append(self.text) + + # Policies — full only + if detail == 'full': + for policy_stmt in self.policies: + policy_name = _extract_table_name(policy_stmt, self.short_name) + parts.append(f"### Policy: {policy_name}") + policy_text = policy_stmt.text() + if policy_text: + parts.append(policy_text) + + # Details — full only + for detail_stmt in self.details: + detail_name = _extract_table_name(detail_stmt, self.short_name) + parts.append(f"### {detail_name}") + md = _render_statement_to_markdown(detail_stmt, detail_name, optimize_for_llm) + if md: + 
parts.append(md) + + return '\n\n'.join(part for part in parts if part) + def __rich__(self): parts = [] @@ -500,6 +565,48 @@ def with_tables(self) -> List[Note]: """Notes that have child tables.""" return [n for n in self._notes if n.has_tables] + def to_markdown(self, detail: str = 'standard', focus=None, optimize_for_llm: bool = True) -> str: + """Render all notes as a single GitHub-Flavored Markdown document. + + Args: + detail: 'minimal' (titles only), 'standard' (+ tables + narrative), 'full' (everything) + focus: Optional topic string or list of topics to filter notes (uses search()) + optimize_for_llm: Use LLM-optimized table formatting + """ + header_parts = ["# Notes to Financial Statements"] + subtitle_bits = [] + if self.entity_name: + subtitle_bits.append(self.entity_name) + if self.form: + subtitle_bits.append(self.form) + if self.period: + subtitle_bits.append(f"Period ending {self.period}") + if subtitle_bits: + header_parts.append(f"**{' · '.join(subtitle_bits)}**") + + # Select notes + notes_to_render = self._notes + if focus: + if isinstance(focus, str): + focus = [focus] + seen = set() + notes_to_render = [] + for topic in focus: + for note in self.search(topic): + if note.number not in seen: + seen.add(note.number) + notes_to_render.append(note) + + # Render each note + note_parts = [] + for note in notes_to_render: + note_parts.append(note.to_markdown(detail=detail, optimize_for_llm=optimize_for_llm)) + + header = '\n\n'.join(header_parts) + if note_parts: + return header + '\n\n---\n\n' + '\n\n---\n\n'.join(note_parts) + return header + def to_context(self, detail: str = 'standard', focus: Optional[List[str]] = None) -> str: """ AI-optimized context string for all notes. @@ -595,6 +702,181 @@ def __bool__(self): # === Helpers === + +def _is_garbled_markdown(md: str) -> bool: + """Detect if process_content() produced garbled output from a complex table. 
+ + Checks for signs of broken colspan parsing: + - Header cells with merged data values (digits + separators in headers > 40 chars) + - Excessive placeholder column names (col_N pattern) + """ + lines = md.split('\n') + for line in lines: + if '|' not in line or '---' in line: + continue + cells = [c.strip() for c in line.split('|') if c.strip()] + for cell in cells: + # Skip the label column and normal header names + if cell in ('label', ''): + continue + # Header cells with merged values: long + contain digits + separator chars + # Legitimate headers: "Jun 30, 2025" (13 chars), "FY 2024" (7 chars) + # Garbled headers: "2025 - Effective Interest Rate - $86,781 - 4,500" (48 chars) + if len(cell) > 40 and any(ch.isdigit() for ch in cell) and ' - ' in cell: + return True + # Count placeholder column names (col_N) in header rows + placeholder_count = 0 + for line in lines[:3]: + if '|' in line: + placeholder_count += len(re.findall(r'\bcol_\d+\b', line)) + if placeholder_count > 5: + return True + return False + + +def _render_statement_to_markdown(stmt: 'Statement', section_title: str, + optimize_for_llm: bool) -> Optional[str]: + """Render a sub-table/detail Statement to markdown with fallback. + + A single XBRL table Statement may contain multiple HTML tags. + Each is processed individually: clean ones become pipe tables, garbled + ones (complex colspans) fall back to aligned plain text. Non-table + content (headings, paragraphs, footnotes) is preserved between tables. + """ + html = stmt.text(raw_html=True) + if not html: + plain = stmt.text() + return plain if plain else None + + if not optimize_for_llm: + plain = stmt.text() + return plain if plain else None + + try: + from bs4 import BeautifulSoup, NavigableString + from edgar.markdown import process_content + soup = BeautifulSoup(html, 'html.parser') + html_tables = soup.find_all('table') + + if not html_tables: + return process_content(html, section_title=section_title) or None + + # Replace each
with a placeholder, then process_content the + # whole document for text/headings, and splice tables back in. + placeholders = {} + for i, table_tag in enumerate(html_tables): + marker = f'__TABLE_PLACEHOLDER_{i}__' + table_html = str(table_tag) + + # Try pipe-table conversion for this individual table + md = process_content(table_html, section_title=section_title) + if md and not _is_garbled_markdown(md): + placeholders[marker] = md + else: + # Garbled — use aligned plain text + plain = _html_table_to_plain_text(table_tag) + placeholders[marker] = plain or '' + + # Replace the tag in the soup with a text marker + table_tag.replace_with(NavigableString(f'\n{marker}\n')) + + # Now process the modified HTML (text + placeholders) for headings/paragraphs + modified_html = str(soup) + text_md = process_content(modified_html) + + # Splice table renderings back in place of placeholders + if text_md: + result = text_md + for marker, table_md in placeholders.items(): + result = result.replace(marker, f'\n\n{table_md}\n\n' if table_md else '') + else: + # No text content — just join the tables + result = '\n\n'.join(md for md in placeholders.values() if md) + + # Clean up excessive blank lines + result = re.sub(r'\n{3,}', '\n\n', result).strip() + return result if result else None + + except (ValueError, TypeError, AttributeError, KeyError) as e: + log.warning(f"Per-table rendering failed for '{section_title}': {e}") + plain = stmt.text() + return plain if plain else None + + +def _html_table_to_plain_text(table_tag) -> Optional[str]: + """Convert a BeautifulSoup
tag to aligned plain text. + + Extracts rows and pads columns for readable alignment when pipe-table + conversion fails due to complex colspans. + """ + rows = table_tag.find_all('tr') + if not rows: + return None + + matrix = [] + for row in rows: + cells = row.find_all(['td', 'th']) + row_text = [c.get_text(strip=True) for c in cells] + if any(row_text): # Skip fully empty rows + matrix.append(row_text) + + if not matrix: + return None + + # Determine max columns and pad + max_cols = max(len(r) for r in matrix) + for r in matrix: + r.extend([''] * (max_cols - len(r))) + + # Calculate column widths + col_widths = [0] * max_cols + for r in matrix: + for i, val in enumerate(r): + col_widths[i] = max(col_widths[i], len(val)) + + # Build aligned text + lines = [] + for r in matrix: + line = ' '.join(val.ljust(col_widths[i]) for i, val in enumerate(r)) + lines.append(line.rstrip()) + + return '\n'.join(lines) + + +def _extract_narrative_markdown(html: str, optimize_for_llm: bool) -> Optional[str]: + """Extract narrative text from note HTML, stripping tables to avoid duplication. + + Tables are rendered separately via sub-table Statements, so we strip them + from the TextBlock HTML before extracting narrative text. + """ + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Remove all
elements — they're rendered separately + for table_tag in soup.find_all('table'): + table_tag.decompose() + + remaining = str(soup).strip() + if not remaining or remaining in ('', ''): + return None + + if optimize_for_llm: + from edgar.markdown import process_content + md = process_content(remaining) + return md if md and md.strip() else None + else: + # Plain text extraction + text = soup.get_text(separator=' ', strip=True) + # Fix missing spaces between adjacent spans (e.g., "hadno" → "had no") + text = re.sub(r'([a-z])([A-Z$])', r'\1 \2', text) + text = re.sub(r'(\w)([$])', r'\1 \2', text) + return text if text.strip() else None + except (ValueError, TypeError, AttributeError, KeyError) as e: + log.warning(f"Narrative extraction failed: {e}") + return None + + def _extract_short_name(definition: str) -> str: """Extract a clean short name from a definition string. @@ -672,8 +954,8 @@ def _append_statement_lines(lines: list, statement: 'Statement', indent: int = 4 if label: lines.append(f"{prefix}{label}: {val_str}") count += 1 - except Exception as e: - log.debug(f"Failed to render statement lines for {statement.role_or_type}: {e}") + except (ValueError, TypeError, AttributeError, KeyError) as e: + log.warning(f"Failed to render statement lines for {statement.role_or_type}: {e}") # Suffixes for structural XBRL concepts (not real financial data) diff --git a/edgar/xbrl/rendering.py b/edgar/xbrl/rendering.py index 619ade80..78374179 100644 --- a/edgar/xbrl/rendering.py +++ b/edgar/xbrl/rendering.py @@ -656,41 +656,68 @@ def to_dataframe(self, include_unit: bool = False, include_point_in_time: bool = except ImportError: return "Pandas is required for DataFrame conversion" - def to_markdown(self) -> str: - """Convert to a markdown table representation""" - lines = [] + def to_markdown(self, detail: str = 'standard', optimize_for_llm: bool = False) -> str: + """Convert to a GitHub-Flavored Markdown table. 
- # Add title as a header - lines.append(f"## {self.title}") - lines.append("") + Args: + detail: 'minimal' (table only), 'standard' (with header), 'full' (header + footer) + optimize_for_llm: When True, drop abstract-only rows with no values + """ + import re as _re + lines = [] - # Add subtitle info if available - if self.fiscal_period_indicator or self.units_note: - subtitle_parts = [] - if self.fiscal_period_indicator: - subtitle_parts.append(f"**{self.fiscal_period_indicator}**") - if self.units_note: - # Remove rich formatting tags from units note - clean_units = self.units_note.replace('[italic]', '').replace('[/italic]', '') - subtitle_parts.append(f"*{clean_units}*") + # Clean title — remove internal terminology + clean_title = self.title.replace("(Standardized)", "").strip() - lines.append(" ".join(subtitle_parts)) + if detail != 'minimal': + # Header: ## Statement Title + lines.append(f"## {clean_title}") lines.append("") - # Create header row + # Company + ticker subtitle + company_name = self.metadata.get('company_name', '') + ticker = self.metadata.get('ticker', '') + if company_name: + subtitle = f"**{company_name}**" + if ticker: + subtitle += f" ({ticker.upper()})" + lines.append(subtitle) + lines.append("") + + # Fiscal period + units + if self.fiscal_period_indicator or self.units_note: + subtitle_parts = [] + if self.fiscal_period_indicator: + subtitle_parts.append(f"**{self.fiscal_period_indicator}**") + if self.units_note: + # Strip ALL Rich markup tags + clean_units = _re.sub(r'\[/?[^\]]+\]', '', self.units_note) + subtitle_parts.append(f"*{clean_units}*") + lines.append(" ".join(subtitle_parts)) + lines.append("") + + # Column header row — right-align numeric columns header = [""] + self.header.columns lines.append("| " + " | ".join(header) + " |") - # Add separator row - separator = ["---"] + ["---" for _ in self.header.columns] + separator = ["---"] + ["---:" for _ in self.header.columns] lines.append("| " + " | ".join(separator) + " |") - 
# Add data rows + # Non-breaking space for indentation (regular spaces stripped in pipe cells) + NBSP = "\u00A0" + for row in self.rows: - # Handle indentation for row label - indent = " " * row.level + # Optionally skip abstract rows with no values + if optimize_for_llm and row.is_abstract: + has_values = any( + cell.value is not None and cell.value != "" + for cell in row.cells + ) + if not has_values: + continue + + indent = (NBSP * 2) * row.level - # Format row label based on properties if row.is_abstract: label = f"**{indent}{row.label}**" elif row.is_dimension: @@ -698,7 +725,6 @@ def to_markdown(self) -> str: else: label = f"{indent}{row.label}" - # Format cell values cell_values = [] for cell in row.cells: cell_value = cell.formatter(cell.value) @@ -709,10 +735,20 @@ def to_markdown(self) -> str: else: cell_values.append(cell_value) - # Add the row row_data = [label] + cell_values lines.append("| " + " | ".join(row_data) + " |") + # Footer for 'full' detail + if detail == 'full': + lines.append("") + clean_units = "" + if self.units_note: + clean_units = _re.sub(r'\[/?[^\]]+\]', '', self.units_note) + footer_parts = ["*Source: SEC XBRL*"] + if clean_units: + footer_parts.append(f"*{clean_units}*") + lines.append(" · ".join(footer_parts)) + return "\n".join(lines) diff --git a/edgar/xbrl/statements.py b/edgar/xbrl/statements.py index f1c31109..0701d8ac 100644 --- a/edgar/xbrl/statements.py +++ b/edgar/xbrl/statements.py @@ -750,6 +750,15 @@ def __str__(self): rendered_statement = self.render() return str(rendered_statement) # Delegates to RenderedStatement.__str__() + def to_markdown(self, detail: str = 'standard', optimize_for_llm: bool = False) -> str: + """Render this statement as GitHub-Flavored Markdown. 
+ + Args: + detail: 'minimal' (table only), 'standard' (with header), 'full' (header + footer) + optimize_for_llm: Simplify output for LLM consumption + """ + return self.render().to_markdown(detail=detail, optimize_for_llm=optimize_for_llm) + def to_context(self, detail: str = 'standard') -> str: """ AI-optimized context string. @@ -1899,9 +1908,10 @@ def __getitem__(self, label: str) -> Optional['StatementLineItem']: return None label_lower = label.lower() + columns = rendered.header.columns if rendered.header else [] for row in rendered.rows: if row.label.lower() == label_lower: - return StatementLineItem(row, self.xbrl) + return StatementLineItem(row, self.xbrl, columns=columns) return None @@ -1947,7 +1957,8 @@ def search(self, keyword: str) -> List['StatementLineItem']: scored.append((4, row)) scored.sort(key=lambda x: x[0]) - return [StatementLineItem(row, self.xbrl) for _, row in scored] + columns = rendered.header.columns if rendered.header else [] + return [StatementLineItem(row, self.xbrl, columns=columns) for _, row in scored] class StatementLineItem: @@ -1963,11 +1974,12 @@ class StatementLineItem: >>> item.notes # → [Note, ...] (all related notes) >>> item.values # {'instant_2024-12-31': 98071000000, ...} """ - __slots__ = ('_row', '_xbrl') + __slots__ = ('_row', '_xbrl', '_columns') - def __init__(self, row, xbrl): + def __init__(self, row, xbrl, columns=None): self._row = row self._xbrl = xbrl + self._columns = columns or [] @property def label(self) -> str: @@ -1997,6 +2009,40 @@ def notes(self) -> List[Any]: from edgar.xbrl.notes import get_notes_for_concept return get_notes_for_concept(self.concept, xbrl) + def to_markdown(self, include_note: bool = True) -> str: + """Render this line item as markdown with formatted values and optional note link. 
+ + Args: + include_note: Include a blockquote linking to the related Note + """ + parts = [] + + # Format values with period labels from the rendered statement header + cells = self._row.cells or [] + columns = self._columns or [] + formatted_pairs = [] + for i, cell in enumerate(cells): + if cell.value is not None and cell.value != "": + formatted_val = str(cell.formatter(cell.value)) + if formatted_val: + if i < len(columns) and columns[i]: + formatted_pairs.append(f"{formatted_val} ({columns[i]})") + else: + formatted_pairs.append(formatted_val) + + if formatted_pairs: + parts.append(f"**{self.label}**: {', '.join(formatted_pairs)}") + else: + parts.append(f"**{self.label}**") + + # Note reference + if include_note: + note = self.note + if note: + parts.append(f"> Related: Note {note.number} \u2014 {note.title}") + + return '\n\n'.join(parts) + def __repr__(self): concept_str = f", concept='{self.concept}'" if self.concept else "" return f"StatementLineItem('{self.label}'{concept_str})" diff --git a/tests/demo_to_markdown.ipynb b/tests/demo_to_markdown.ipynb new file mode 100644 index 00000000..b885526f --- /dev/null +++ b/tests/demo_to_markdown.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# EdgarTools `to_markdown()` Demo\n\nDemonstrates the new markdown output methods added to drill-down objects in the `markdown` branch.\n\n**Features covered:**\n- `create_markdown_table()` and `process_content()` from `edgar.markdown`\n- `RenderedStatement.to_markdown()` with `detail` and `optimize_for_llm` params\n- `Statement.to_markdown()`\n- `StatementLineItem.to_markdown()` with note references and period labels\n- `Note.to_markdown()` with tables rendered as pipe tables\n- `Notes.to_markdown()` with focus filtering\n- `TenK.to_context(focus=..., output_format='markdown')`" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Low-Level Formatting Utilities\n", + "\n", + "These are the building blocks ported from `quant/markdown/helpers.py`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from edgar.markdown import create_markdown_table, process_content\n", + "\n", + "# create_markdown_table: headers + rows + optional alignment\n", + "md = create_markdown_table(\n", + " headers=['Metric', 'FY 2024', 'FY 2023'],\n", + " rows=[\n", + " ['Revenue', '$394B', '$383B'],\n", + " ['Net Income', '$97B', '$94B'],\n", + " ['EPS (diluted)', '$6.42', '$6.13'],\n", + " ],\n", + " alignments=['left', 'right', 'right'],\n", + ")\n", + "print(md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# process_content: converts raw HTML (tables, headings, text) to LLM-optimized markdown\n", + "html = \"\"\"\n", + "

Debt Maturity Schedule

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
2025202620272028
Term Debt$10,500$9,750$8,250$7,000
Commercial Paper$5,900$4,800$3,200$2,100
Total$16,400$14,550$11,450$9,100
\n", + "

The Company issues unsecured short-term promissory notes under a commercial paper program.

\n", + "\"\"\"\n", + "print(process_content(html))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. RenderedStatement.to_markdown()\n", + "\n", + "Upgraded with company header, NBSP indentation, Rich tag stripping, and detail levels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from edgar.xbrl.rendering import RenderedStatement, StatementRow, StatementCell, StatementHeader\n", + "\n", + "def cell(value, formatted=None):\n", + " fmt = formatted or (f'{value:,.0f}' if isinstance(value, (int, float)) and value else '')\n", + " return StatementCell(value=value, formatter=lambda v, f=fmt: f if v is not None else '')\n", + "\n", + "rows = [\n", + " StatementRow(label='Net sales', level=0, is_abstract=True,\n", + " cells=[cell(391035), cell(383285)], metadata={'concept': 'Revenue'}),\n", + " StatementRow(label='Products', level=1,\n", + " cells=[cell(224578), cell(220272)], metadata={}),\n", + " StatementRow(label='Services', level=1,\n", + " cells=[cell(166457), cell(163013)], metadata={}),\n", + " StatementRow(label='Cost of sales', level=0,\n", + " cells=[cell(210352), cell(214137)], metadata={}),\n", + " StatementRow(label='Gross profit', level=0, is_abstract=True,\n", + " cells=[cell(180683), cell(169148)], metadata={}),\n", + "]\n", + "\n", + "rs = RenderedStatement(\n", + " title='Income Statement',\n", + " header=StatementHeader(columns=['2024-09-28', '2023-09-30']),\n", + " rows=rows,\n", + " metadata={'company_name': 'Apple Inc.', 'ticker': 'AAPL'},\n", + " statement_type='IncomeStatement',\n", + " fiscal_period_indicator='FY 2024',\n", + " units_note='[italic]In millions[/italic]',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard detail (default)\n", + "print(rs.to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# Minimal — just the pipe table, no header\n", + "print(rs.to_markdown(detail='minimal'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full — header + footer with source attribution\n", + "print(rs.to_markdown(detail='full'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optimize_for_llm: skips abstract rows that have no values\n", + "from edgar.xbrl.rendering import StatementRow, StatementCell\n", + "\n", + "rows_with_empty_abstract = [\n", + " StatementRow(label='ASSETS', level=0, is_abstract=True,\n", + " cells=[cell(None), cell(None)], metadata={}),\n", + " StatementRow(label='Cash and equivalents', level=1,\n", + " cells=[cell(29965), cell(29965)], metadata={}),\n", + " StatementRow(label='Short-term investments', level=1,\n", + " cells=[cell(31590), cell(27699)], metadata={}),\n", + "]\n", + "\n", + "rs2 = RenderedStatement(\n", + " title='Balance Sheet', header=StatementHeader(columns=['2024-09-28', '2023-09-30']),\n", + " rows=rows_with_empty_abstract,\n", + " metadata={'company_name': 'Apple Inc.', 'ticker': 'AAPL'},\n", + " statement_type='BalanceSheet',\n", + ")\n", + "\n", + "print('=== Normal ===')\n", + "print(rs2.to_markdown(detail='minimal'))\n", + "print()\n", + "print('=== LLM Optimized (ASSETS header dropped) ===')\n", + "print(rs2.to_markdown(detail='minimal', optimize_for_llm=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Statement.to_markdown() — Convenience Wrapper\n", + "\n", + "Calls `self.render().to_markdown()` under the hood." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This requires a real Filing + XBRL parse, so we use a live example.\n", + "# If you don't have network access, skip this cell.\n", + "\n", + "from edgar import Company\n", + "\n", + "company = Company(\"MSFT\")\n", + "filing = company.get_filings(form=\"10-K\").latest()\n", + "financials = filing.xbrl().statements\n", + "income = financials.income_statement()\n", + "\n", + "print(income.to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Minimal detail — just the pipe table\n", + "print(income.to_markdown(detail='minimal'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. StatementLineItem.to_markdown()\n", + "\n", + "Compact markdown for a single line item with values and optional note link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Requires network — uses MSFT 10-K\n", + "from edgar import Company\n", + "\n", + "company = Company(\"MSFT\")\n", + "tenk = company.get_filings(form=\"10-K\").latest().obj()\n", + "bs = tenk.financials.balance_sheet\n", + "\n", + "# Look up a line item\n", + "item = bs['Goodwill']\n", + "if item:\n", + " print('=== With note reference ===')\n", + " print(item.to_markdown())\n", + " print()\n", + " print('=== Without note reference ===')\n", + " print(item.to_markdown(include_note=False))\n", + "else:\n", + " print('Goodwill not found in balance sheet — try searching:')\n", + " for match in bs.search('goodwill'):\n", + " print(f' {match}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Note.to_markdown()\n", + "\n", + "Full markdown output for a single note — tables rendered as pipe tables via `process_content()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Requires network — uses MSFT 10-K\n", + "from edgar import Company\n", + "\n", + "company = Company(\"MSFT\")\n", + "tenk = company.get_filings(form=\"10-K\").latest().obj()\n", + "notes = tenk.notes\n", + "\n", + "# Find a note about debt\n", + "debt_notes = notes.search('debt')\n", + "if debt_notes:\n", + " note = debt_notes[0]\n", + " print(f'Found: {note}\\n')\n", + " print(note.to_markdown(detail='standard'))\n", + "else:\n", + " print('No debt note found. Available notes:')\n", + " for n in notes:\n", + " print(f' {n}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Minimal detail — just the title and table names\n", + "if debt_notes:\n", + " print(debt_notes[0].to_markdown(detail='minimal'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full detail — includes policies and detail breakdowns\n", + "if debt_notes:\n", + " print(debt_notes[0].to_markdown(detail='full'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Notes.to_markdown()\n", + "\n", + "Render all notes (or a focused subset) as a single markdown document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Full document — all notes (minimal detail to keep output manageable)\n", + "print(notes.to_markdown(detail='minimal'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Focused on specific topics\n", + "print(notes.to_markdown(detail='standard', focus='revenue'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Multi-topic focus\n", + "print(notes.to_markdown(detail='standard', focus=['debt', 'revenue']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## 7. TenK.to_context(focus=..., output_format='markdown')\n\nThe `output_format='markdown'` param routes through `Notes.to_markdown()` for GFM output." + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Text format (default — backward compatible)\nprint('=== TEXT FORMAT ===')\nprint(tenk.to_context(focus='debt', detail='minimal'))\nprint()\nprint('=== MARKDOWN FORMAT ===')\nprint(tenk.to_context(focus='debt', detail='minimal', output_format='markdown'))" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Full markdown with multiple topics\nprint(tenk.to_context(focus=['debt', 'goodwill'], detail='standard', output_format='markdown'))" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Side-by-Side Comparison: text vs markdown\n", + "\n", + "Compare the old `to_context()` plain text with the new markdown output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if debt_notes:\n", + " note = debt_notes[0]\n", + " print('=' * 60)\n", + " print('to_context() — plain text')\n", + " print('=' * 60)\n", + " print(note.to_context(detail='standard'))\n", + " print()\n", + " print('=' * 60)\n", + " print('to_markdown() — GFM with pipe tables')\n", + " print('=' * 60)\n", + " print(note.to_markdown(detail='standard'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Save Markdown to File\n", + "\n", + "Write the output to a `.md` file and open in any markdown viewer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# Save full notes document\n", + "output = notes.to_markdown(detail='standard', focus=['debt', 'revenue'])\n", + "output_path = Path('demo_output.md')\n", + "output_path.write_text(output, encoding='utf-8')\n", + "print(f'Saved {len(output):,} characters to {output_path.resolve()}')\n", + "print(f'Open in VS Code: code {output_path}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/tests/test_to_markdown.py b/tests/test_to_markdown.py new file mode 100644 index 00000000..914a55ef --- /dev/null +++ b/tests/test_to_markdown.py @@ -0,0 +1,547 @@ +""" +Tests for to_markdown() methods on drill-down objects. 
+ +Covers: +- edgar.markdown: create_markdown_table, process_content +- RenderedStatement.to_markdown(): detail levels, Rich tag stripping, NBSP indentation +- Statement.to_markdown(): delegation +- StatementLineItem.to_markdown(): values + note reference +- Note.to_markdown(): tables, narrative, policies, details, detail levels +- Notes.to_markdown(): full document, focus filtering +- TenK/TenQ.to_context(output_format='markdown'): wired through _focused_context +""" +import re +from dataclasses import field +from unittest.mock import Mock, MagicMock, patch + +import pytest + +from edgar.markdown import create_markdown_table, process_content, clean_text, list_of_dicts_to_table +from edgar.xbrl.rendering import ( + RenderedStatement, StatementRow, StatementCell, StatementHeader, +) +from edgar.xbrl.notes import Note, Notes + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _cell(value, formatted=None): + """Create a StatementCell with a simple formatter.""" + fmt = formatted or str(value) if value is not None else "" + return StatementCell(value=value, formatter=lambda v, f=fmt: f if v is not None else "") + + +def _row(label, values, level=0, is_abstract=False, is_dimension=False, concept=''): + """Create a StatementRow with cells.""" + cells = [_cell(v) for v in values] + return StatementRow( + label=label, + level=level, + cells=cells, + metadata={'concept': concept}, + is_abstract=is_abstract, + is_dimension=is_dimension, + ) + + +def _rendered_statement(title='Income Statement', company='Apple Inc.', ticker='AAPL', + columns=None, rows=None, units_note=None, fiscal_period=None): + """Build a RenderedStatement with sensible defaults.""" + columns = columns or ['2024-09-28', '2023-09-30'] + rows = rows or [ + _row('Net sales', [391035, 383285], level=0, is_abstract=True, concept='us-gaap_Revenue'), + _row('Products', [224578, 220272], 
level=1, concept='us-gaap_ProductRevenue'), + _row('Services', [166457, 163013], level=1, concept='us-gaap_ServiceRevenue'), + ] + header = StatementHeader(columns=columns) + return RenderedStatement( + title=title, + header=header, + rows=rows, + metadata={'company_name': company, 'ticker': ticker}, + statement_type='IncomeStatement', + fiscal_period_indicator=fiscal_period, + units_note=units_note, + ) + + +def _make_note(number=1, title='Debt', short_name='Debt', role='http://co/role/Debt', + tables=None, policies=None, details=None, xbrl=None, statement=None): + """Build a Note with sensible defaults.""" + return Note( + number=number, title=title, short_name=short_name, role=role, + statement=statement, tables=tables or [], policies=policies or [], + details=details or [], xbrl=xbrl, + ) + + +def _make_notes_collection(count=5): + """Build a Notes object with N numbered notes.""" + titles = [ + 'Organization and Summary of Significant Accounting Policies', + 'Revenue Recognition', + 'Debt', + 'Income Taxes', + 'Share-Based Compensation', + 'Goodwill and Intangible Assets', + 'Commitments and Contingencies', + 'Earnings Per Share', + ] + notes = [] + for i in range(min(count, len(titles))): + notes.append(_make_note( + number=i + 1, title=titles[i], short_name=titles[i], + role=f'http://co/role/Note{i + 1}', + )) + return Notes(notes, entity_name='Test Corp', form='10-K', period='2024-12-31') + + +# =========================================================================== +# edgar.markdown module +# =========================================================================== + +class TestCreateMarkdownTable: + + @pytest.mark.fast + def test_basic_table(self): + md = create_markdown_table(['Item', 'Value'], [['Revenue', '$100M'], ['Net Income', '$25M']]) + assert '| Item | Value |' in md + assert '| Revenue | $100M |' in md + assert '| Net Income | $25M |' in md + + @pytest.mark.fast + def test_right_alignment(self): + md = create_markdown_table(['Label', 
'Amount'], [['A', '100']], alignments=['left', 'right']) + assert '---:' in md + + @pytest.mark.fast + def test_empty_table(self): + md = create_markdown_table(['A'], []) + # Empty rows produces empty string (no table at all) + assert md == '' + + @pytest.mark.fast + def test_short_rows_padded(self): + md = create_markdown_table(['A', 'B', 'C'], [['x']]) + # Row should be padded to 3 columns + assert md.count('|') >= 4 # at least | x | | | + + +class TestProcessContent: + + @pytest.mark.fast + def test_html_table_to_markdown(self): + html = '
<table><tr><th>Metric</th><th>2024</th></tr><tr><td>Revenue</td><td>$100M</td></tr></table>
' + md = process_content(html) + assert 'Revenue' in md + assert '$100M' in md + assert '|' in md # pipe table + + @pytest.mark.fast + def test_plain_text_passthrough(self): + text = 'The Company issues unsecured promissory notes.' + md = process_content(text) + assert 'promissory notes' in md + + @pytest.mark.fast + def test_heading_extraction(self): + html = '
<div><h3>Debt Summary</h3><p>Total debt was $50B.</p></div>
' + md = process_content(html) + assert 'Debt Summary' in md + assert '$50B' in md + + @pytest.mark.fast + def test_noise_filtered(self): + html = '
<div><p>http://fasb.org/us-gaap/2024</p><p>Real content here.</p></div>
' + md = process_content(html) + assert 'fasb.org' not in md + assert 'Real content' in md + + @pytest.mark.fast + def test_track_filtered_returns_tuple(self): + html = '
<table><tr><td>Data</td></tr></table>
' + result = process_content(html, track_filtered=True) + assert isinstance(result, tuple) + assert len(result) == 2 + + +class TestCleanText: + + @pytest.mark.fast + def test_nbsp_collapsed(self): + assert clean_text('hello\xa0world') == 'hello world' + + @pytest.mark.fast + def test_multiple_spaces(self): + assert clean_text('a b c') == 'a b c' + + +class TestListOfDictsToTable: + + @pytest.mark.fast + def test_basic_conversion(self): + data = [ + {'label': 'Revenue', 'col_0': '$100M', 'col_1': '$90M'}, + {'label': 'Net Income', 'col_0': '$25M', 'col_1': '$20M'}, + ] + md = list_of_dicts_to_table(data) + assert 'Revenue' in md + assert '$100M' in md + assert '|' in md + + +# =========================================================================== +# RenderedStatement.to_markdown() +# =========================================================================== + +class TestRenderedStatementToMarkdown: + + @pytest.mark.fast + def test_standard_includes_company_header(self): + rs = _rendered_statement() + md = rs.to_markdown() + assert '## Income Statement' in md + assert 'Apple Inc.' 
in md + assert 'AAPL' in md + + @pytest.mark.fast + def test_minimal_no_header(self): + rs = _rendered_statement() + md = rs.to_markdown(detail='minimal') + assert '## ' not in md + assert 'Apple' not in md + # But table is present + assert '| ' in md + assert 'Net sales' in md + + @pytest.mark.fast + def test_full_includes_footer(self): + rs = _rendered_statement(units_note='[italic]In millions[/italic]') + md = rs.to_markdown(detail='full') + assert 'Source: SEC XBRL' in md + assert 'In millions' in md + + @pytest.mark.fast + def test_rich_tags_stripped(self): + rs = _rendered_statement(units_note='[italic]In millions[/italic], [dim]except per share[/dim]') + md = rs.to_markdown() + assert '[italic]' not in md + assert '[/italic]' not in md + assert '[dim]' not in md + assert '[/dim]' not in md + assert 'In millions' in md + + @pytest.mark.fast + def test_right_aligned_separator(self): + rs = _rendered_statement() + md = rs.to_markdown() + assert '---:' in md + + @pytest.mark.fast + def test_nbsp_indentation(self): + rs = _rendered_statement() + md = rs.to_markdown() + # Level-1 rows should use NBSP for indentation + assert '\u00A0' in md + + @pytest.mark.fast + def test_abstract_row_bold(self): + rs = _rendered_statement() + md = rs.to_markdown() + assert '**' in md # abstract row is bolded + + @pytest.mark.fast + def test_optimize_for_llm_skips_empty_abstract(self): + rows = [ + _row('ASSETS', [None, None], is_abstract=True), + _row('Cash', [50000, 40000]), + ] + rs = _rendered_statement(rows=rows) + md_normal = rs.to_markdown() + md_llm = rs.to_markdown(optimize_for_llm=True) + assert 'ASSETS' in md_normal + assert 'ASSETS' not in md_llm + assert 'Cash' in md_llm + + @pytest.mark.fast + def test_backward_compatible_no_args(self): + """Calling to_markdown() with no args should work like old behavior.""" + rs = _rendered_statement() + md = rs.to_markdown() + assert isinstance(md, str) + assert len(md) > 50 + + +# 
=========================================================================== +# Statement.to_markdown() +# =========================================================================== + +class TestStatementToMarkdown: + + @pytest.mark.fast + def test_delegates_to_rendered(self): + """Statement.to_markdown() should delegate to render().to_markdown().""" + mock_rendered = Mock() + mock_rendered.to_markdown.return_value = '## Test\n| A | B |' + + stmt = Mock() + stmt.render.return_value = mock_rendered + + # Call the actual method implementation + from edgar.xbrl.statements import Statement + result = Statement.to_markdown(stmt, detail='full', optimize_for_llm=True) + mock_rendered.to_markdown.assert_called_once_with(detail='full', optimize_for_llm=True) + + +# =========================================================================== +# StatementLineItem.to_markdown() +# =========================================================================== + +class TestStatementLineItemToMarkdown: + + @pytest.mark.fast + def test_with_values(self): + from edgar.xbrl.statements import StatementLineItem + cells = [ + StatementCell(value=67886000000, formatter=lambda v: '67,886' if v else ''), + StatementCell(value=65413000000, formatter=lambda v: '65,413' if v else ''), + ] + row = StatementRow(label='Goodwill', level=0, cells=cells, + metadata={'concept': 'us-gaap_Goodwill'}) + columns = ['2024-09-28', '2023-09-30'] + item = StatementLineItem(row, xbrl=None, columns=columns) + md = item.to_markdown(include_note=False) + assert '**Goodwill**' in md + assert '67,886 (2024-09-28)' in md + assert '65,413 (2023-09-30)' in md + + @pytest.mark.fast + def test_with_note_reference(self): + from edgar.xbrl.statements import StatementLineItem + cells = [StatementCell(value=67886000000, formatter=lambda v: '67,886' if v else '')] + row = StatementRow(label='Goodwill', level=0, cells=cells, + metadata={'concept': 'us-gaap_Goodwill'}) + mock_note = _make_note(number=7, title='Goodwill and 
Intangible Assets') + + item = StatementLineItem(row, xbrl=None, columns=['2024-09-28']) + + with patch.object(StatementLineItem, 'note', new_callable=lambda: property(lambda self: mock_note)): + md = item.to_markdown(include_note=True) + assert 'Note 7' in md + assert 'Goodwill and Intangible Assets' in md + assert '>' in md # blockquote + + @pytest.mark.fast + def test_no_note_no_blockquote(self): + from edgar.xbrl.statements import StatementLineItem + cells = [StatementCell(value=50000, formatter=lambda v: '50,000' if v else '')] + row = StatementRow(label='Cash', level=0, cells=cells, + metadata={'concept': 'us-gaap_Cash'}) + item = StatementLineItem(row, xbrl=None) + md = item.to_markdown(include_note=True) + # No note found (xbrl=None), so no blockquote + assert '>' not in md + assert '**Cash**' in md + + @pytest.mark.fast + def test_no_values(self): + from edgar.xbrl.statements import StatementLineItem + cells = [ + StatementCell(value=None, formatter=lambda v: ''), + StatementCell(value=None, formatter=lambda v: ''), + ] + row = StatementRow(label='Abstract Header', level=0, cells=cells, + metadata={'concept': ''}, is_abstract=True) + item = StatementLineItem(row, xbrl=None) + md = item.to_markdown(include_note=False) + assert '**Abstract Header**' in md + + +# =========================================================================== +# Note.to_markdown() +# =========================================================================== + +class TestNoteToMarkdown: + + @pytest.mark.fast + def test_minimal_title_only(self): + note = _make_note(number=3, title='Debt') + md = note.to_markdown(detail='minimal') + assert '## Note 3: Debt' in md + # No narrative or tables + assert '### Narrative' not in md + + @pytest.mark.fast + def test_minimal_with_table_names(self): + table_stmt = Mock() + table_stmt.render.return_value = Mock(title='Schedule of Debt Maturities') + note = _make_note(number=3, title='Debt', tables=[table_stmt]) + md = 
note.to_markdown(detail='minimal') + assert '## Note 3: Debt' in md + assert '**Tables:**' in md + + @pytest.mark.fast + def test_standard_with_tables_and_narrative(self): + # Table stmt that returns HTML + table_stmt = Mock() + table_stmt.render.return_value = Mock(title='Debt Maturities') + table_stmt.text.side_effect = lambda raw_html=False: ( + '
<table><tr><td>2025</td><td>$10B</td></tr></table>
' if raw_html + else '2025: $10B' + ) + # Main statement with narrative + main_stmt = Mock() + main_stmt.text.return_value = 'The Company issues short-term promissory notes.' + + note = _make_note(number=9, title='Debt', tables=[table_stmt], statement=main_stmt) + md = note.to_markdown(detail='standard') + + assert '## Note 9: Debt' in md + # _extract_table_name strips parent prefix "Debt" → "Maturities" + assert 'Maturities' in md + assert '### Narrative' in md + assert 'promissory notes' in md + + @pytest.mark.fast + def test_full_includes_policies_and_details(self): + policy_stmt = Mock() + policy_stmt.render.return_value = Mock(title='Debt - Accounting Policy') + policy_stmt.text.return_value = 'The Company records debt at amortized cost.' + + detail_stmt = Mock() + detail_stmt.render.return_value = Mock(title='Long-term Debt Components') + detail_stmt.text.side_effect = lambda raw_html=False: ( + '
<table><tr><td>Term Debt</td><td>$50B</td></tr></table>
' if raw_html + else 'Term Debt: $50B' + ) + + main_stmt = Mock() + main_stmt.text.return_value = 'Narrative text here.' + + note = _make_note( + number=9, title='Debt', + tables=[], policies=[policy_stmt], details=[detail_stmt], + statement=main_stmt, + ) + md = note.to_markdown(detail='full') + + assert '### Policy:' in md + assert 'amortized cost' in md + assert 'Long-term Debt' in md + + @pytest.mark.fast + def test_optimize_for_llm_false_uses_plain_text(self): + table_stmt = Mock() + table_stmt.render.return_value = Mock(title='Summary') + table_stmt.text.side_effect = lambda raw_html=False: ( + '
<table><tr><td>A</td></tr></table>
' if raw_html + else 'Plain text fallback' + ) + + note = _make_note(number=1, title='Test', tables=[table_stmt]) + md = note.to_markdown(detail='standard', optimize_for_llm=False) + assert 'Plain text fallback' in md + + @pytest.mark.fast + def test_expands_metadata(self): + note = _make_note(number=5, title='Revenue') + # Patch expands properties + with patch.object(type(note), 'expands', new_callable=lambda: property( + lambda self: ['Net sales', 'Product revenue'])): + with patch.object(type(note), 'expands_statements', new_callable=lambda: property( + lambda self: ['IncomeStatement'])): + md = note.to_markdown(detail='minimal') + assert '**Expands:**' in md + assert 'Net sales' in md + assert '**From:**' in md + assert 'IncomeStatement' in md + + +# =========================================================================== +# Notes.to_markdown() +# =========================================================================== + +class TestNotesToMarkdown: + + @pytest.mark.fast + def test_full_document_header(self): + notes = _make_notes_collection(3) + md = notes.to_markdown(detail='minimal') + assert '# Notes to Financial Statements' in md + assert 'Test Corp' in md + assert '10-K' in md + assert '2024-12-31' in md + + @pytest.mark.fast + def test_all_notes_included(self): + notes = _make_notes_collection(3) + md = notes.to_markdown(detail='minimal') + assert 'Note 1' in md + assert 'Note 2' in md + assert 'Note 3' in md + + @pytest.mark.fast + def test_focus_filters_notes(self): + notes = _make_notes_collection(5) + md = notes.to_markdown(detail='minimal', focus='Debt') + assert 'Note 3: Debt' in md + # Other notes should NOT be present + assert 'Revenue Recognition' not in md + assert 'Income Taxes' not in md + + @pytest.mark.fast + def test_focus_list(self): + notes = _make_notes_collection(5) + md = notes.to_markdown(detail='minimal', focus=['Debt', 'Revenue']) + assert 'Debt' in md + assert 'Revenue' in md + # Non-matching notes excluded + assert 'Income 
Taxes' not in md + + @pytest.mark.fast + def test_separator_between_notes(self): + notes = _make_notes_collection(3) + md = notes.to_markdown(detail='minimal') + assert '---' in md + + @pytest.mark.fast + def test_empty_notes(self): + notes = Notes([], entity_name='Empty Corp') + md = notes.to_markdown() + assert '# Notes to Financial Statements' in md + assert 'Note 1' not in md + + +# =========================================================================== +# to_context(output_format='markdown') integration +# =========================================================================== + +class TestToContextMarkdownFormat: + + @pytest.mark.fast + def test_focused_context_text_default(self): + """format='text' (default) should return plain text, not markdown.""" + from edgar.company_reports._base import CompanyReport + report = Mock(spec=CompanyReport) + report.form = '10-K' + report.company = 'Test Corp' + report.notes = _make_notes_collection(5) + report.period_of_report = '2024-12-31' + + # Call _focused_context directly + result = CompanyReport._focused_context(report, focus='Debt', detail='minimal') + assert '10-K: Test Corp' in result + assert '## Debt' in result # text mode uses ## for topic headers + + @pytest.mark.fast + def test_focused_context_markdown_format(self): + """output_format='markdown' should delegate to Notes.to_markdown().""" + from edgar.company_reports._base import CompanyReport + report = Mock(spec=CompanyReport) + report.notes = _make_notes_collection(5) + + result = CompanyReport._focused_context(report, focus='Debt', detail='minimal', output_format='markdown') + assert '# Notes to Financial Statements' in result + assert 'Note 3: Debt' in result