diff --git a/Readme.md b/Readme.md index 5e2739e..40ed4d6 100644 --- a/Readme.md +++ b/Readme.md @@ -4,7 +4,8 @@ [![SWH](https://archive.softwareheritage.org/badge/origin/https://github.com/kermitt2/grobid_client_python/)](https://archive.softwareheritage.org/browse/origin/https://github.com/kermitt2/grobid_client_python/) [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) -A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobid) REST services that provides concurrent processing capabilities for PDF documents, reference strings, and patents. +A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobid) REST services that provides +concurrent processing capabilities for PDF documents, reference strings, and patents. ## 📋 Table of Contents @@ -13,8 +14,8 @@ A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobi - [Installation](#-installation) - [Quick Start](#-quick-start) - [Usage](#-usage) - - [Command Line Interface](#command-line-interface) - - [Python Library](#python-library) + - [Command Line Interface](#command-line-interface) + - [Python Library](#python-library) - [Configuration](#-configuration) - [Services](#-services) - [Testing](#-testing) @@ -31,15 +32,17 @@ A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobi - **Coordinate Extraction**: Optional PDF coordinate extraction for precise element positioning - **Sentence Segmentation**: Layout-aware sentence segmentation capabilities - **JSON Output**: Convert TEI XML output to structured JSON format with CORD-19-like structure +- **Markdown Output**: Convert TEI XML output to clean Markdown format with structured sections ## 📋 Prerequisites - **Python**: 3.8 - 3.13 (tested versions) - **GROBID Server**: A running GROBID service instance - - Local installation: [GROBID Documentation](http://grobid.readthedocs.io/) - - Docker: `docker run -t --rm 
-p 8070:8070 lfoppiano/grobid:0.8.2` - - Default server: `http://localhost:8070` - - Online demo: https://lfoppiano-grobid.hf.space (usage limits apply), more details [here](https://grobid.readthedocs.io/en/latest/getting_started/#using-grobid-from-the-cloud). + - Local installation: [GROBID Documentation](http://grobid.readthedocs.io/) + - Docker: `docker run -t --rm -p 8070:8070 lfoppiano/grobid:0.8.2` + - Default server: `http://localhost:8070` + - Online demo: https://lfoppiano-grobid.hf.space (usage limits apply), more + details [here](https://grobid.readthedocs.io/en/latest/getting_started/#using-grobid-from-the-cloud). > [!IMPORTANT] @@ -51,16 +54,19 @@ A simple, efficient Python client for [GROBID](https://github.com/kermitt2/grobi Choose one of the following installation methods: ### PyPI (Recommended) + ```bash pip install grobid-client-python ``` ### Development Version + ```bash pip install git+https://github.com/kermitt2/grobid_client_python.git ``` ### Local Development + ```bash git clone https://github.com/kermitt2/grobid_client_python cd grobid_client_python @@ -70,6 +76,7 @@ pip install -e . 
## ⚡ Quick Start ### Command Line + ```bash # Process PDFs in a directory grobid_client --input ./pdfs --output ./output processFulltextDocument @@ -79,6 +86,7 @@ grobid_client --server https://your-grobid-server.com --input ./pdfs processFull ``` ### Python Library + ```python from grobid_client.grobid_client import GrobidClient @@ -135,6 +143,7 @@ grobid_client [OPTIONS] SERVICE | `--segmentSentences` | Segment sentences with coordinates | | `--flavor` | Processing flavor for fulltext extraction | | `--json` | Convert TEI output to JSON format | +| `--markdown` | Convert TEI output to Markdown format | #### Examples @@ -149,6 +158,9 @@ grobid_client --input ~/pdfs --output ~/tei --n 20 --teiCoordinates processFullt # Process with JSON output grobid_client --input ~/pdfs --output ~/results --json processFulltextDocument +# Process with Markdown output +grobid_client --input ~/pdfs --output ~/results --markdown processFulltextDocument + # Process citations with custom server grobid_client --server https://grobid.example.com --input ~/citations.txt processCitationList @@ -204,6 +216,14 @@ client.process( json_output=True ) +# Process with Markdown output +client.process( + service="processFulltextDocument", + input_path="/path/to/pdfs", + output_path="/path/to/output", + markdown_output=True +) + # Process citation lists client.process( service="processCitationList", @@ -214,17 +234,25 @@ client.process( ## ⚙️ Configuration -Configuration can be provided via a JSON file. When using the CLI, the `--server` argument overrides the config file settings. +Configuration can be provided via a JSON file. When using the CLI, the `--server` argument overrides the config file +settings. 
### Default Configuration ```json { - "grobid_server": "http://localhost:8070", - "batch_size": 1000, - "sleep_time": 5, - "timeout": 60, - "coordinates": ["persName", "figure", "ref", "biblStruct", "formula", "s"] + "grobid_server": "http://localhost:8070", + "batch_size": 1000, + "sleep_time": 5, + "timeout": 60, + "coordinates": [ + "persName", + "figure", + "ref", + "biblStruct", + "formula", + "s" + ] } ``` @@ -314,6 +342,7 @@ The config file can include logging settings: ## 🔬 Services ### Fulltext Document Processing + Extracts complete document structure including headers, body text, figures, tables, and references. ```bash @@ -336,11 +365,16 @@ When using the `--json` flag, the client converts TEI XML output to a structured "level": "paragraph", "biblio": { "title": "Document Title", - "authors": ["Author 1", "Author 2"], + "authors": [ + "Author 1", + "Author 2" + ], "doi": "10.1000/example", "publication_date": "2023-01-01", "journal": "Journal Name", - "abstract": [...] + "abstract": [ + ... + ] }, "body_text": [ { @@ -365,8 +399,16 @@ When using the `--json` flag, the client converts TEI XML output to a structured "label": "Table 1", "head": "Sample Data", "content": { - "headers": ["Header 1", "Header 2"], - "rows": [["Value 1", "Value 2"]], + "headers": [ + "Header 1", + "Header 2" + ], + "rows": [ + [ + "Value 1", + "Value 2" + ] + ], "metadata": { "row_count": 1, "column_count": 2, @@ -399,9 +441,91 @@ client.process( ``` > [!NOTE] -> When using `--json`, the `--force` flag only checks for existing TEI files. If a TEI file is rewritten (due to `--force`), the corresponding JSON file is automatically rewritten as well. +> When using `--json`, the `--force` flag only checks for existing TEI files. If a TEI file is rewritten (due to +`--force`), the corresponding JSON file is automatically rewritten as well. + +### Markdown Output Format + +When using the `--markdown` flag, the client converts TEI XML output to a clean, readable Markdown format. 
This +provides: + +- **Structured Sections**: Title, Authors, Affiliations, Publication Date, Fulltext, Annex, and References +- **Clean Formatting**: Human-readable format suitable for documentation and sharing +- **Preserved Content**: All text content with proper section organization +- **Reference Formatting**: Bibliographic references in a readable format + +#### Markdown Structure + +The generated Markdown follows this structure: + +```markdown +# Document Title + +## Authors + +- Author Name 1 +- Author Name 2 + +## Affiliations + +- Affiliation 1 +- Affiliation 2 + +## Publication Date + +January 1, 2023 + +## Fulltext + +### Introduction + +Content of the introduction section... + +### Methods + +Content of the methods section... + +## Annex + +### Acknowledgements + +Acknowledgement text... + +### Competing Interests + +Competing interests statement... + +## References + +**[1]** Paper Title. *Author Name*. *Journal Name* (2023). +**[2]** Another Paper. *Author et al.*. *Conference* (2022). +``` + +#### Usage Examples + +```bash +# Generate both TEI and Markdown outputs +grobid_client --input pdfs/ --output results/ --markdown processFulltextDocument + +# Markdown output with coordinates and sentence segmentation +grobid_client --input pdfs/ --output results/ --markdown --teiCoordinates --segmentSentences processFulltextDocument +``` + +```python +# Python library usage +client.process( + service="processFulltextDocument", + input_path="/path/to/pdfs", + output_path="/path/to/output", + markdown_output=True +) +``` + +> [!NOTE] +> When using `--markdown`, the `--force` flag only checks for existing TEI files. If a TEI file is rewritten (due to `--force`), the corresponding Markdown file is automatically rewritten as well. ### Header Document Processing + Extracts only document metadata (title, authors, abstract, etc.). 
```bash @@ -409,6 +533,7 @@ grobid_client --input pdfs/ --output headers/ processHeaderDocument ``` ### Reference Processing + Extracts and structures bibliographic references from documents. ```bash @@ -416,6 +541,7 @@ grobid_client --input pdfs/ --output refs/ processReferences ``` ### Citation List Processing + Parses raw citation strings from text files. ```bash @@ -458,6 +584,7 @@ pytest -v ### Continuous Integration Tests are automatically run via GitHub Actions on: + - Push to main branch - Pull requests - Multiple Python versions (3.8-3.13) @@ -480,7 +607,7 @@ Benchmark results for processing **136 PDFs** (3,443 pages total, ~25 pages per ### Additional Benchmarks - **Header processing**: 3.74s for 136 PDFs (36 PDF/s) with n=10 -- **Reference extraction**: 26.9s for 136 PDFs (5.1 PDF/s) with n=10 +- **Reference extraction**: 26.9s for 136 PDFs (5.1 PDF/s) with n=10 - **Citation parsing**: 4.3s for 3,500 citations (814 citations/s) with n=10 ## 🛠️ Development @@ -530,7 +657,8 @@ bump-my-version bump patch ## 📄 License -Distributed under the [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0). See `LICENSE` for more information. +Distributed under the [Apache 2.0 License](http://www.apache.org/licenses/LICENSE-2.0). See `LICENSE` for more +information. 
class TEI2MarkdownConverter:
    """Convert GROBID TEI XML documents to Markdown.

    The generated document is assembled, in this order, from: the title, the
    header authors, the header affiliations, the publication date, the
    abstract, the body sections, the annex (non-reference content of
    ``<back>``) and the bibliography.
    """

    # ``unit`` values on <biblScope> that denote a volume number.
    _VOLUME_UNITS = ("vol", "volume")

    # <back> div types that are treated as body content and left out of the annex.
    _BODY_LIKE_DIV_TYPES = ("methods", "results", "discussion", "introduction")

    # (input pattern, output pattern) pairs for plain ISO dates, most precise
    # first. Partial dates keep their precision instead of gaining a made-up
    # month or day.
    _ISO_DATE_FORMATS = (
        ("%Y-%m-%d", "%B %d, %Y"),
        ("%Y-%m", "%B %Y"),
        ("%Y", "%Y"),
    )

    def convert_tei_file(self, tei_file: Union[str, Path, BinaryIO]) -> Optional[str]:
        """Convert a TEI file to Markdown format.

        Args:
            tei_file: Path to a TEI file, or an open file-like object.

        Returns:
            Markdown content as string, or None if the document has no
            ``<TEI>`` root or the conversion fails for any reason (the error
            is logged, never raised).
        """
        try:
            if isinstance(tei_file, (str, Path)):
                # Context manager so the handle is closed even if read() fails.
                with open(tei_file, 'r', encoding='utf-8') as handle:
                    content = handle.read()
            else:
                content = tei_file.read()

            soup = BeautifulSoup(content, 'xml')

            if soup.TEI is None:
                logger.warning("The TEI file is not well-formed or empty. Skipping the file.")
                return None

            markdown_sections = []

            title = self._extract_title(soup)
            if title:
                markdown_sections.append(f"# {title}\n")

            authors = self._extract_authors(soup)
            if authors:
                markdown_sections.extend(f"{author}\n" for author in authors)
                markdown_sections.append("\n")

            affiliations = self._extract_affiliations(soup)
            if affiliations:
                markdown_sections.append(", ".join(affiliations) + "\n\n")

            pub_date = self._extract_publication_date(soup)
            if pub_date:
                markdown_sections.append(f"Published on {pub_date}\n\n")

            abstract = self._extract_abstract(soup)
            if abstract:
                markdown_sections.append(abstract)
                markdown_sections.append("\n\n")

            fulltext = self._extract_fulltext(soup)
            if fulltext:
                markdown_sections.append(fulltext)
                markdown_sections.append("\n")

            # Acknowledgements, competing interests, etc.
            annex = self._extract_annex(soup)
            if annex:
                markdown_sections.append(annex)
                markdown_sections.append("\n")

            references = self._extract_references(soup)
            if references:
                markdown_sections.append("## References\n")
                markdown_sections.append(references)
                markdown_sections.append("\n")

            return "".join(markdown_sections)

        except Exception as e:
            # Conversion is best-effort for callers processing batches: report
            # the failure and signal it with None rather than aborting the run.
            logger.error("Error converting TEI to Markdown: %s", e)
            return None

    def _extract_title(self, soup: "BeautifulSoup") -> Optional[str]:
        """Return the main article title, or None when the TEI has none."""
        title_node = soup.find("title", attrs={"type": "main", "level": "a"})
        if title_node is None:
            return None
        return title_node.get_text().strip()

    def _extract_authors(self, soup: "BeautifulSoup") -> List[str]:
        """Return the author names found in ``<teiHeader>``.

        Only the header is searched so that authors of cited works are not
        picked up. Authors without forename and surname are skipped.
        """
        tei_header = soup.find("teiHeader")
        if tei_header is None:
            return []

        authors = []
        for author in tei_header.find_all("author"):
            name_nodes = (author.find('forename'), author.find('surname'))
            parts = [node.get_text().strip() for node in name_nodes if node is not None]
            author_name = " ".join(parts).strip()
            if author_name:
                authors.append(author_name)
        return authors

    def _extract_affiliations(self, soup: "BeautifulSoup") -> List[str]:
        """Return the text of every ``<affiliation>`` in ``<teiHeader>``.

        Only the header is searched so that affiliations attached to cited
        works are not picked up.
        """
        tei_header = soup.find("teiHeader")
        if tei_header is None:
            return []

        texts = (node.get_text().strip() for node in tei_header.find_all("affiliation"))
        return [text for text in texts if text]

    def _extract_publication_date(self, soup: "BeautifulSoup") -> Optional[str]:
        """Return the publication date in human-readable form.

        The value is read from the ``when`` attribute of the first
        ``<date type="published">``; None is returned when it is absent.
        """
        pub_date = soup.find("date", attrs={"type": "published"})
        if pub_date is None:
            return None
        iso_date = pub_date.attrs.get("when")
        if not iso_date:
            return None
        return self._format_iso_date(iso_date)

    @staticmethod
    def _format_iso_date(iso_date: str) -> str:
        """Render a date string for display, preserving its precision.

        ``YYYY-MM-DD`` becomes ``"Month DD, YYYY"``, ``YYYY-MM`` becomes
        ``"Month YYYY"`` and ``YYYY`` stays a bare year. Anything else is
        handed to ``dateparser``; if that cannot parse it either, the input is
        returned unchanged.
        """
        from datetime import datetime

        value = iso_date.strip()
        for input_pattern, output_pattern in TEI2MarkdownConverter._ISO_DATE_FORMATS:
            try:
                return datetime.strptime(value, input_pattern).strftime(output_pattern)
            except ValueError:
                continue

        # Free-form date: best effort, never fatal for the conversion.
        try:
            parsed_date = dateparser.parse(value)
        except Exception as e:
            logger.debug("Could not parse publication date %r: %s", iso_date, e)
            parsed_date = None
        if parsed_date:
            return parsed_date.strftime("%B %d, %Y")
        return iso_date

    def _extract_abstract(self, soup: "BeautifulSoup") -> str:
        """Return the abstract paragraphs separated by blank lines ("" if none)."""
        abstract = soup.find("abstract")
        if abstract is None:
            return ""

        abstract_paragraphs = []
        for p in abstract.find_all("p"):
            cleaned_text = self._process_paragraph(p).strip()
            # Drop empty paragraphs and paragraphs that are a lone period.
            if cleaned_text and cleaned_text != ".":
                abstract_paragraphs.append(cleaned_text)

        return "\n\n".join(abstract_paragraphs)

    def _extract_fulltext(self, soup: "BeautifulSoup") -> str:
        """Return the body as Markdown: a ``###`` heading per titled div
        followed by that div's paragraphs.

        Every div at any depth is visited once, and each div contributes only
        its own heading and the paragraphs whose nearest enclosing div it is,
        so nested divs do not cause paragraphs to be emitted twice.
        """
        body = soup.find("body")
        if body is None:
            return ""

        fulltext_sections = []
        for div in body.find_all("div"):
            # Direct child only: a heading of a nested div or of a figure is
            # not the title of this section.
            head = div.find("head", recursive=False)
            if head is not None:
                section_title = head.get_text().strip()
                if section_title:
                    fulltext_sections.append(f"### {section_title}\n")

            for p in div.find_all("p"):
                if p.find_parent("div") is not div:
                    continue  # belongs to a nested div, emitted on its own turn
                paragraph_text = self._process_paragraph(p)
                if paragraph_text.strip():
                    fulltext_sections.append(f"{paragraph_text}\n\n")

        return "".join(fulltext_sections)

    def _extract_annex(self, soup: "BeautifulSoup") -> str:
        """Return the annex: everything in ``<back>`` except the references
        and divs whose type marks them as body content.
        """
        back = soup.find("back")
        if back is None:
            return ""

        annex_sections = []
        for child in back.children:
            name = getattr(child, 'name', None)
            if not name:
                continue  # bare text between elements

            if name == "div":
                # The references div is rendered by _extract_references.
                if child.get("type") == "references":
                    continue
                if child.get("type", "").lower() in self._BODY_LIKE_DIV_TYPES:
                    continue
                self._process_div_and_nested_divs(child, annex_sections)
            elif name == "p":
                paragraph_text = self._process_paragraph(child)
                if paragraph_text.strip():
                    annex_sections.append(f"{paragraph_text}\n\n")
            elif name != "listBibl":
                # Any other element (notes, ...) contributes its plain text;
                # listBibl is skipped because references are handled separately.
                text_content = child.get_text().strip()
                if text_content:
                    annex_sections.append(f"{text_content}\n\n")

        return "".join(annex_sections)

    def _process_div_and_nested_divs(self, div: "Tag", annex_sections: list) -> None:
        """Append the Markdown for ``div`` and, recursively, its nested divs
        to ``annex_sections`` (mutated in place).

        For each div the heading comes first, then its direct paragraphs, then
        its nested divs.
        """
        # Direct child only, so a wrapper div does not borrow the heading of
        # the div it wraps (which would then be printed twice).
        head = div.find("head", recursive=False)
        if head is not None:
            heading = head.get_text().strip()
            if heading:
                annex_sections.append(f"### {heading}\n\n")

        for child in div.children:
            if getattr(child, 'name', None) == "p":
                paragraph_text = self._process_paragraph(child)
                if paragraph_text.strip():
                    annex_sections.append(f"{paragraph_text}\n\n")

        for child in div.children:
            if getattr(child, 'name', None) == "div":
                self._process_div_and_nested_divs(child, annex_sections)

    def _extract_references(self, soup: "BeautifulSoup") -> str:
        """Return the bibliography, one formatted reference per line.

        References are read from ``<back>/<div type="references">/<listBibl>``;
        "" is returned when any of these levels is missing.
        """
        back = soup.find("back")
        if back is None:
            return ""

        references_div = back.find("div", attrs={"type": "references"})
        if references_div is None:
            return ""

        list_bibl = references_div.find("listBibl")
        if list_bibl is None:
            return ""

        references = []
        for number, bibl_struct in enumerate(list_bibl.find_all("biblStruct"), 1):
            ref_text = self._format_reference(bibl_struct, number)
            if ref_text:
                references.append(ref_text)

        return "\n".join(references)

    def _process_paragraph(self, p_element: "Tag") -> str:
        """Convert a paragraph element to Markdown text.

        Text nodes and generic inline elements (including ``<ref>``) are kept
        as plain text, figures become an italic caption built from their
        ``<figDesc>``, and tables are rendered with _table_to_markdown.
        """
        text_parts = []

        for element in p_element.children:
            if isinstance(element, NavigableString):
                text_parts.append(str(element))
            elif element.name == "figure":
                fig_desc = element.find("figDesc")
                if fig_desc is not None:
                    text_parts.append(f"\n*Figure: {fig_desc.get_text().strip()}*\n")
            elif element.name == "table":
                table_md = self._table_to_markdown(element)
                if table_md:
                    text_parts.append(f"\n{table_md}\n")
            else:
                text_parts.append(element.get_text())

        return "".join(text_parts).strip()

    def _table_to_markdown(self, table_element: "Tag") -> str:
        """Convert a TEI table (``<row>``/``<cell>``) to a Markdown pipe table.

        The first row is used as the header and is followed by the delimiter
        row that Markdown requires. Whitespace inside a cell is collapsed and
        literal pipes are escaped so cell content cannot break the table.
        Returns "" when the table has no cells.
        """
        markdown_lines = []

        for row in table_element.find_all("row"):
            cells = [
                " ".join(cell.get_text().split()).replace("|", "\\|")
                for cell in row.find_all("cell")
            ]
            if not cells:
                continue
            markdown_lines.append("| " + " | ".join(cells) + " |")
            if len(markdown_lines) == 1:
                markdown_lines.append("| " + " | ".join("---" for _ in cells) + " |")

        return "\n".join(markdown_lines)

    def _format_reference(self, bibl_struct: "Tag", ref_num: int) -> str:
        """Format one ``<biblStruct>`` as a single Markdown line.

        The line is made of, in order: the bold reference number, the title,
        the authors (italic), the venue (italic), the publication details
        (year, volume, issue, pages), then identifiers and links. When nothing
        structured could be extracted, the raw reference text is used instead.
        A trailing period is added if missing.
        """
        # Reference identifier always comes first.
        reference_components = [f"**[{ref_num}]**"]

        ref_data = self._extract_bibliographic_data(bibl_struct)

        if ref_data.get('title'):
            reference_components.append(ref_data['title'])

        if ref_data.get('authors'):
            author_text = self._format_authors(ref_data['authors'])
            reference_components.append(f"*{author_text}*")

        if ref_data.get('venue'):
            reference_components.append(f"*{ref_data['venue']}*")

        publication_details = self._build_publication_details(ref_data)
        if publication_details:
            reference_components.append(publication_details)

        reference_components.extend(self._build_identifiers_and_links(ref_data))

        # Only the reference number so far: fall back to the raw reference.
        if len(reference_components) == 1:
            raw_reference = self._extract_raw_reference(bibl_struct)
            if raw_reference:
                reference_components.append(raw_reference)

        formatted_reference = " ".join(reference_components)
        if not formatted_reference.endswith('.'):
            formatted_reference += "."

        return formatted_reference

    def _extract_bibliographic_data(self, bibl_struct: "Tag") -> dict:
        """Collect the bibliographic fields of a ``<biblStruct>`` into a dict.

        Keys: ``title``, ``authors`` (list of dicts), ``venue``, ``year``,
        ``volume``, ``issue``, ``pages``, ``identifiers`` (type -> value),
        ``urls`` (list) and ``raw_text``. Fields that are not found keep their
        empty default.
        """
        bib_data = {
            'title': None,
            'authors': [],
            'venue': None,
            'year': None,
            'volume': None,
            'issue': None,
            'pages': None,
            'identifiers': {},
            'urls': [],
            'raw_text': None
        }

        # Article-level information.
        analytic = bibl_struct.find("analytic")
        if analytic is not None:
            self._process_analytic_section(analytic, bib_data)

        # Journal/book-level information.
        monogr = bibl_struct.find("monogr")
        if monogr is not None:
            self._process_monograph_section(monogr, bib_data)

        series = bibl_struct.find("series")
        if series is not None:
            self._process_series_section(series, bib_data)

        self._extract_identifiers(bibl_struct, bib_data)
        self._extract_urls(bibl_struct, bib_data)

        return bib_data

    def _process_analytic_section(self, analytic: "Tag", bib_data: dict) -> None:
        """Fill ``title`` and ``authors`` from the article-level section."""
        title = analytic.find("title", level="a")
        if title is not None:
            title_text = title.get_text().strip()
            if title_text:
                bib_data['title'] = title_text

        for author in analytic.find_all("author"):
            author_info = self._extract_author_info(author)
            if author_info:
                bib_data['authors'].append(author_info)

    def _process_monograph_section(self, monogr: "Tag", bib_data: dict) -> None:
        """Fill venue and imprint details from the publication-level section.

        The monograph title and authors are used only when the analytic
        section did not already provide them.
        """
        if not bib_data['title']:
            title = monogr.find("title")
            if title is not None:
                title_text = title.get_text().strip()
                if title_text:
                    bib_data['title'] = title_text

        journal = monogr.find("title", level="j")
        if journal is not None:
            journal_text = journal.get_text().strip()
            if journal_text:
                bib_data['venue'] = journal_text

        if not bib_data['authors']:
            for author in monogr.find_all("author"):
                author_info = self._extract_author_info(author)
                if author_info:
                    bib_data['authors'].append(author_info)

        imprint = monogr.find("imprint")
        if imprint is not None:
            self._process_imprint_section(imprint, bib_data)

    def _process_series_section(self, series: "Tag", bib_data: dict) -> None:
        """Add the series title to ``venue``: appended in parentheses when a
        venue is already set, used as the venue otherwise.
        """
        series_title = series.find("title", level="s")
        if series_title is None:
            return
        series_text = series_title.get_text().strip()
        if not series_text:
            return

        if bib_data['venue']:
            bib_data['venue'] += f" ({series_text})"
        else:
            bib_data['venue'] = series_text

    def _process_imprint_section(self, imprint: "Tag", bib_data: dict) -> None:
        """Fill ``year``, ``volume``, ``issue`` and ``pages`` from ``<imprint>``.

        The date is taken from the ``when`` attribute when present, falling
        back to the element text. Page ranges are understood both as a single
        ``<biblScope unit="page" from=".." to="..">`` and as two consecutive
        elements carrying ``from`` and ``to`` respectively.
        """
        date = imprint.find("date")
        if date is not None:
            raw_date = (date.get("when") or date.get_text()).strip()
            bib_data['year'] = self._extract_year(raw_date)

        for bibl_scope in imprint.find_all("biblScope"):
            unit = bibl_scope.get("unit", "").lower()
            text = bibl_scope.get_text().strip()

            if unit in self._VOLUME_UNITS:
                if text:
                    bib_data['volume'] = text
            elif unit == "issue":
                if text:
                    bib_data['issue'] = text
            elif unit == "page":
                first_page = (bibl_scope.get("from") or "").strip()
                last_page = (bibl_scope.get("to") or "").strip()

                if first_page and last_page:
                    # Whole range on one element (its text is usually empty).
                    if first_page == last_page:
                        bib_data['pages'] = first_page
                    else:
                        bib_data['pages'] = f"{first_page}-{last_page}"
                elif first_page:
                    # Start of a range; a later "to" element completes it.
                    bib_data['pages'] = f"{text or first_page}-"
                elif last_page:
                    if bib_data.get('pages'):
                        bib_data['pages'] += text or last_page
                    else:
                        bib_data['pages'] = text or last_page
                elif text:
                    bib_data['pages'] = text

    def _extract_author_info(self, author: "Tag") -> Optional[dict]:
        """Return ``{'forename': ..., 'surname': ...}`` for an ``<author>``.

        Names are looked up inside ``<persName>`` when that wrapper exists,
        otherwise anywhere in the author element. A key is present only if the
        corresponding element exists; None is returned when neither does.
        """
        name_scope = author.find("persName")
        if name_scope is None:
            name_scope = author

        author_info = {}
        for key in ('forename', 'surname'):
            node = name_scope.find(key)
            if node is not None:
                author_info[key] = node.get_text().strip()

        return author_info if author_info else None

    def _extract_identifiers(self, bibl_struct: "Tag", bib_data: dict) -> None:
        """Store every typed ``<idno>`` of the reference in
        ``bib_data['identifiers']``, keyed by lower-cased type.

        The search covers the whole ``<biblStruct>``, nested sections
        included; when a type occurs more than once the last one in document
        order wins.
        """
        for idno in bibl_struct.find_all("idno"):
            id_type = idno.get("type", "").lower()
            id_value = idno.get_text().strip()
            if id_type and id_value:
                bib_data['identifiers'][id_type] = id_value

    def _extract_urls(self, bibl_struct: "Tag", bib_data: dict) -> None:
        """Append the ``target`` of every ``<ptr>`` of the reference to
        ``bib_data['urls']``, each URL once, in document order.

        The search covers the whole ``<biblStruct>``, so nested sections need
        no second pass (a second pass would list their links twice).
        """
        seen = set(bib_data['urls'])
        for ptr in bibl_struct.find_all("ptr"):
            target = (ptr.get("target") or "").strip()
            if target and target not in seen:
                seen.add(target)
                bib_data['urls'].append(target)

    def _extract_year(self, date_text: str) -> str:
        """Return the first 4-digit year (19xx/20xx) found in ``date_text``,
        or the stripped input when there is none.
        """
        import re

        year_match = re.search(r'\b(19|20)\d{2}\b', date_text)
        if year_match:
            return year_match.group()
        return date_text.strip()

    def _format_authors(self, authors: list) -> str:
        """Format author dicts for display.

        One author is shown as is, two are joined with "and", and three or
        more are shortened to "<first author> et al.". Returns "" when no
        author has a name.
        """
        formatted_authors = []
        for author in authors:
            parts = [author[key] for key in ('forename', 'surname') if key in author]
            if parts:
                formatted_authors.append(" ".join(parts))

        if not formatted_authors:
            return ""
        if len(formatted_authors) == 1:
            return formatted_authors[0]
        if len(formatted_authors) == 2:
            return f"{formatted_authors[0]} and {formatted_authors[1]}"
        return f"{formatted_authors[0]} et al."

    def _build_publication_details(self, ref_data: dict) -> str:
        """Build the "(year) volume (issue) pp. pages" string.

        Missing parts are simply left out; "" is returned when none is set.
        """
        details = []

        if ref_data.get('year'):
            details.append(f"({ref_data['year']})")

        if ref_data.get('volume'):
            details.append(ref_data['volume'])

        if ref_data.get('issue'):
            details.append(f"({ref_data['issue']})")

        if ref_data.get('pages'):
            details.append(f"pp. {ref_data['pages']}")

        return " ".join(details)

    def _build_identifiers_and_links(self, ref_data: dict) -> list:
        """Return the display strings for identifiers and links.

        The DOI comes first as a resolvable URL, followed by PMID/PMCID/ISBN/
        ISSN as "TYPE: value" (other identifier types are not shown), followed
        by the URLs as Markdown links labelled with their domain.
        """
        identifiers_and_links = []
        identifiers = ref_data['identifiers']

        doi = identifiers.get('doi')
        if doi:
            # A DOI that already is a URL must not get a second resolver prefix.
            if doi.lower().startswith(('http://', 'https://')):
                identifiers_and_links.append(doi)
            else:
                identifiers_and_links.append(f"https://doi.org/{doi}")

        for id_type, id_value in identifiers.items():
            if id_type != 'doi' and id_type.lower() in ('pmid', 'pmcid', 'isbn', 'issn'):
                identifiers_and_links.append(f"{id_type.upper()}: {id_value}")

        for url in ref_data['urls']:
            label = url
            if url.startswith(('http://', 'https://')):
                # Domain only, for a cleaner link text; keep the URL if empty.
                label = url.split('//', 1)[1].split('/')[0] or url
            identifiers_and_links.append(f"[{label}]({url})")

        return identifiers_and_links

    def _extract_raw_reference(self, bibl_struct: "Tag") -> Optional[str]:
        """Return the raw reference text as a fallback.

        The ``<note type="raw_reference">`` is preferred. Otherwise the whole
        text of the element is used, without a leading "[n]" marker and with
        whitespace collapsed; None is returned when that text is 20
        characters or shorter.
        """
        import re

        raw_ref = bibl_struct.find("note", attrs={"type": "raw_reference"})
        if raw_ref is not None:
            raw_text = raw_ref.get_text().strip()
            if raw_text:
                return raw_text

        raw_text = bibl_struct.get_text().strip()
        raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)
        raw_text = re.sub(r'\s+', ' ', raw_text)

        return raw_text if len(raw_text) > 20 else None


# Backwards compatible top-level function
def convert_tei_file_to_markdown(tei_file: Union[str, Path, BinaryIO]) -> Optional[str]:
    """Convert a TEI file to Markdown format.

    Args:
        tei_file: Path to TEI file or file-like object

    Returns:
        Markdown content as string, or None if conversion fails
    """
    converter = TEI2MarkdownConverter()
    return converter.convert_tei_file(tei_file)
self.logger.info(f"{len(input_files)} files to process in current batch") @@ -493,6 +498,27 @@ def process_batch( except Exception as e: self.logger.error(f"Failed to convert TEI to JSON for {filename}: {str(e)}") + # Check if Markdown output is needed but Markdown file doesn't exist + if markdown_output: + markdown_filename = filename.replace('.grobid.tei.xml', '.md') + # Expand ~ to home directory before checking file existence + markdown_filename_expanded = os.path.expanduser(markdown_filename) + if not os.path.isfile(markdown_filename_expanded): + self.logger.info(f"Markdown file {markdown_filename} does not exist, generating Markdown from existing TEI...") + try: + from .format.TEI2Markdown import TEI2MarkdownConverter + converter = TEI2MarkdownConverter() + markdown_data = converter.convert_tei_file(filename) + + if markdown_data: + with open(markdown_filename_expanded, 'w', encoding='utf8') as markdown_file: + markdown_file.write(markdown_data) + self.logger.debug(f"Successfully created Markdown file: {markdown_filename_expanded}") + else: + self.logger.warning(f"Failed to convert TEI to Markdown for {filename}") + except Exception as e: + self.logger.error(f"Failed to convert TEI to Markdown for {filename}: {str(e)}") + continue selected_process = self.process_pdf @@ -564,6 +590,25 @@ def process_batch( self.logger.warning(f"Failed to convert TEI to JSON for {filename}") except Exception as e: self.logger.error(f"Failed to convert TEI to JSON for {filename}: {str(e)}") + + # Convert to Markdown if requested + if markdown_output: + try: + from .format.TEI2Markdown import TEI2MarkdownConverter + converter = TEI2MarkdownConverter() + markdown_data = converter.convert_tei_file(filename) + + if markdown_data: + markdown_filename = filename.replace('.grobid.tei.xml', '.md') + # Always write Markdown file when TEI is written (respects --force behavior) + markdown_filename_expanded = os.path.expanduser(markdown_filename) + with open(markdown_filename_expanded, 'w', 
encoding='utf8') as markdown_file: + markdown_file.write(markdown_data) + self.logger.debug(f"Successfully wrote Markdown file: {markdown_filename_expanded}") + else: + self.logger.warning(f"Failed to convert TEI to Markdown for {filename}") + except Exception as e: + self.logger.error(f"Failed to convert TEI to Markdown for {filename}: {str(e)}") except OSError as e: self.logger.error(f"Failed to write TEI XML file {filename}: {str(e)}") @@ -832,6 +877,11 @@ def main(): action="store_true", help="Convert TEI output to JSON format using the TEI2LossyJSON converter", ) + parser.add_argument( + "--markdown", + action="store_true", + help="Convert TEI output to Markdown format", + ) args = parser.parse_args() @@ -840,6 +890,7 @@ def main(): output_path = args.output flavor = args.flavor json_output = args.json + markdown_output = args.markdown # Initialize n with default value n = 10 @@ -909,7 +960,8 @@ def main(): force=force, verbose=verbose, flavor=flavor, - json_output=json_output + json_output=json_output, + markdown_output=markdown_output ) except Exception as e: logger.error(f"Processing failed: {str(e)}") diff --git a/resources/test_out/test_1/0f318087-b211-4ace-8ac9-3d9372a73c1c.tei.xml b/resources/test_out/test_1/0f318087-b211-4ace-8ac9-3d9372a73c1c.tei.xml deleted file mode 100644 index 4c48e2f..0000000 --- a/resources/test_out/test_1/0f318087-b211-4ace-8ac9-3d9372a73c1c.tei.xml +++ /dev/null @@ -1,1094 +0,0 @@ - - - - - - Clinical Characteristics, Electrophysiology, and Skin Biopsy of 38 Peripheral Neuropathy Cases with Small Fiber Involvement of Various Etiologies - - - Ovid Technologies (Wolters Kluwer Health) -

Copyright Ovid Technologies (Wolters Kluwer Health)

-
- 2017-07 -
- - - - - BoSun - - - Li-ZhiLiu - - - Yi-FanLi - - - Zhao-HuiChen - - - LiLing - - - FeiYang - - - FangCui - - - DrXu-ShengHuang - - - - Department of Neurology - Chinese People's Liberation Army General Hospital -
- 100853 - Beijing - China -
-
-
- - - Department of Neurology, Chinese People's Liberation Army General Hospital -
- No. 28 Fuxing Road - 100853 - Beijing - China -
-
-
- Clinical Characteristics, Electrophysiology, and Skin Biopsy of 38 Peripheral Neuropathy Cases with Small Fiber Involvement of Various Etiologies -
- - Chinese Medical Journal - Chinese Medical Journal - 0366-6999 - - Ovid Technologies (Wolters Kluwer Health) - 130 - 14 - - - - - 10.4103/0366-6999.209897 - Received: 28-02-2017 -
-
-
- - - - GROBID - A machine learning software for extracting information from scholarly documents - - - - - - - - Impaired Glucose Tolerance - Intraepidermal Nerve Fiber Density - Metabolic Syndrome - Nerve Conduction Studies - Small Fiber Neuropathy - - - -

Peripheral neuropathy (PN) can be categorized according to the nerves involved. Pure small fiber neuropathy (SFN) affects only the small fiber nerves, pure large fiber neuropathy affects only large fiber nerves, and mixed small and large fiber neuropathy affects both types of nerves. [1] Small diameter, thinly myelinated Aδ and unmyelinated C fibers are primarily affected by SFN, resulting in sensory and/or autonomic symptoms such as allodynia, hyperalgesia, palpitations, and hyper/hypohidrosis. [1,2] Traditional nerve conduction studies (NCS) measure the integrity and function of large fibers independent of small fiber function. Thus, the evaluation of small fiber function has been challenging until the introduction of skin biopsies assessing intraepidermal nerve fiber density (IENFD). [3] Recently, other techniques such as quantitative sensory testing, contact heat evoked potentials, as well as questionnaires and scales (e.g., the SFN and Symptom

-
-
-
- - - - - - - - - - -

Inventory Questionnaire [SIQ]), have been applied to evaluate small fiber damage in patients. [1,4-6] However, skin biopsies are still considered the single most reliable and necessary test in the diagnosis of SFN. [1,7] Various etiologies have been shown to be associated with SFN. [8,9] Therefore, obtaining a thorough medical history to determine possible causative factors should be carried out in patients suspected of having SFN. In general, impaired glucose tolerance (IGT) is the most prevalent etiology resulting in SFN. [8] In addition to IGT, diabetes, dyslipidemia, connective tissue diseases, HIV infection, hepatitis C and other hepatic disorders, nutritional diseases, hyper/hypothyroidism, paraneoplastic syndromes, and neurotoxic agents such as oxaliplatin, paclitaxel, and vincristine [1,10] have all been shown to result in SFN. Thorough screening in patients could help identify the etiology; however, no definitive etiologies are easily identifiable in some patients, who are regarded as having idiopathic neuropathy. [8] The occurrence of SFN can precede the occurrence of the primary disease, and a routine follow-up should be performed to identify the etiology. [11-14] This study aimed to investigate the clinical features, electrophysiological parameters, and IENFD in patients with small fiber PN resulting from a variety of etiologies, and to compare the severity of disease among idiopathic PN, IGT-related PN, and MS-related PN.

-
Methods
-
Ethical approval

The study was approved by the Ethics Committee of the Chinese People's Liberation Army (PLA) General Hospital, and all patients gave written informed consent before the study.

-
Patients

Sixty-eight consecutive patients with small fiber PN presenting to the department of neurology in the Chinese PLA General Hospital from December 20, 2013, to May 31, 2016, were enrolled. All patients presented with sensory or autonomic symptoms, which included disturbance of pinprick sensation and temperature sensation, allodynia, hyperalgesia, numbness, coldness, tightness, burning sensation, tingling sensation, bedsheet intolerance, restless legs, sensation of walking on sand or pebbles, muscle cramps, palpitations, hot flushes, hyper/hypohidrosis, skin discoloration, difficulty in urination, constipation/ diarrhea, sexual dysfunction, or sicca syndrome. A detailed medical history was taken from all patients, and a variety of laboratory tests were performed to screen for underlying etiologies. We identified 38 suitable patients with idiopathic PN (n = 17), IGT-related PN (n = 12), and metabolic syndrome (MS)-related PN (n = 9). Patients were divided into three groups according to the above etiologies. Patients included in the study were 22 males and 16 females, with a mean age of 54 ± 14 years.

-
Etiology screening

A detailed medical history was taken, including a history of smoking and alcohol consumption, renal disorders, nutritional diseases, hyper/hypothyroidism, hematological disorders, malignancy, connective tissue diseases, infection with hepatitis C and other hepatic disorders, paraneoplastic syndromes, a medication history, a history of neurotoxic agents, and the presence of a family history of any neuropathy.

Laboratory tests to determine disease etiology included a complete blood count, erythrocyte sedimentation rate, C-reactive protein, renal and hepatic function, lipid profile, fasting glucose, glycosylated hemoglobin, oral glucose tolerance test, folate and Vitamin B 12 , thyroid function test, antinuclear antibody, anti-extractable nuclear antigens antibody, serum protein electrophoresis, and tumor markers. Diagnosis of impaired fasting glucose, IGT, and diabetes was based on comparison with the diagnostic criteria published by the World Health Organization. [15]

-
Clinical characteristics

Sensory and autonomic symptoms were recorded using the 13-item SFN-SIQ, which assessed factors including overly-sensitive leg skin, burning feet, bedsheet intolerance, restless legs at night, changes in sweating patterns, presence of diarrhea or constipation, urinary tract problems (including hesitation and incontinence), dry eyes, dry mouth, dizziness when standing up, palpitations, and hot flushes. A 4-point Likert scale (0: never present; 1: sometimes; 2: often; and 3: always present) was used to determine symptom severity. [5,6] A neurological examination was also performed by an experienced neurologist and included examinations for muscle strength, pinprick and vibration sensation, and deep tendon reflexes. Pinprick and vibration perception tests were conducted using a disposable safety needle and a 128 Hz tuning fork, respectively. As previously described, the neurologist was blinded to the clinical symptoms. [16]

Nerve conduction studies

Skin temperature was maintained at or above 32°C during the examination. NCS were performed using surface electrodes on the tibial, peroneal, and sural nerves in both lower limbs using the Keypoint electromyography (EMG) system (Medoc Ltd., Ramat Yishai, Israel). Motor conduction velocities (MCVs), proximal and distal compound muscle action potentials (CMAPs) of the tibial and peroneal nerves, as well as sensory conduction velocities and sensory nerve action potential of the sural nerve were measured. Results were compared with reference values utilized by the EMG laboratory of the Chinese PLA General Hospital.

-
Assessment of intraepidermal nerve fiber density

Skin biopsies were conducted using a 3-mm diameter skin biopsy punch (Acuderm Inc., Fort Lauderdale, USA) approximately 10 cm above the lateral malleolus under local anesthesia. Specimens were acquired from the side with the most severe symptoms. The right side was selected if the clinical presentation was identical on both sides. Specimens were immunostained using rabbit polyclonal antibodies to human protein gene product 9.5 (Chemicon International Inc., Temecula, USA) as previously described. [3,17] IENFs were counted in at least three sections as previously described, by two independent researchers blind to the clinical data. [18] A computerized imaging system, Image Pro Plus 6.0 (Media Cybernetics, Silver Spring, USA), was used to measure epidermal length. Thus, the average IENFD (number of fibers/mm) was calculated. An IENFD below the fifth percentile of the worldwide normative reference value was considered abnormal. [19]

Statistical analysis

SPSS version 19.0 (IBM, Armonk, USA) was applied to perform statistical analysis. Partial correlations, which were controlled for age and gender, were analyzed to determine relationships among the IENFD and visual analog scale (VAS), SFN-SIQ, and nerve conduction parameters. Analysis of variance (ANOVA) was conducted to compare the differences among idiopathic PN, IGT-related PN, and MS-related PN groups. Post hoc analyses (Least Significant Difference and Student-Newman-Keuls) were performed if significant differences were identified in the ANOVA. Kruskal-Wallis test was conducted if data were not normally distributed.

-
Results
-
Clinical characteristics

All patients presented with a disturbance in pinprick sensation, temperature sensation, and/or allodynia or hyperalgesia, which was in accordance with the clinical manifestations of SFN.

Mean VAS score of all patients included was 4.66 ± 3.56. Mean VAS scores of all groups are presented in Table 1. No significant differences in VAS scores were revealed among the three groups (χ² = 2.102, P = 0.350). Mean SFN-SIQ score of all patients included was 5.74 ± 2.42. Mean SFN-SIQ scores for all groups are also presented in Table 1. The mean SFN-SIQ score was significantly different among the three groups (F = 14.433, P < 0.001). The mean SFN-SIQ score was significantly increased (P < 0.001) in the MS-related PN group compared to the IGT-related PN group. The mean SFN-SIQ score was significantly increased (P < 0.001) in the MS-related PN group compared to the idiopathic PN group.

-
Nerve conduction study and skin biopsy analysis

NCS was abnormal in 17 patients and normal in 21 patients. According to the international normative reference value, IENFD parameters were abnormal in 15 patients and normal in 23 patients. Eight patients were diagnosed with pure SFN, seven patients were diagnosed with mixed small and large fiber neuropathy, 10 patients were diagnosed with pure large fiber neuropathy, and 13 patients did not display any abnormalities in NCS and skin biopsies.

In the IGT-related PN group, three patients were diagnosed with mixed small and large fiber neuropathy, six patients were diagnosed with pure large fiber neuropathy, and three patients did not display any abnormalities in NCS and skin biopsies.

In the MS-related PN group, three patients were diagnosed with pure SFN, four patients were diagnosed with mixed small and large fiber neuropathy, one patient was diagnosed with pure large fiber neuropathy, and one patient did not display any abnormalities in NCS and skin biopsies.

In the idiopathic PN group, five patients were diagnosed with pure SFN, three patients were diagnosed with pure large fiber neuropathy, and nine patients did not display any abnormalities in NCS and skin biopsies.

-
Relationships among intraepidermal nerve fiber density, visual analog scale, small fiber neuropathy and symptom inventory questionnaire, and nerve conduction studies

The partial correlation coefficient between the IENFD and SFN-SIQ was r = −0.668 (P < 0.001), indicating a moderate correlation. The partial correlation coefficient between the IENFD and MCV of the tibial nerve was r = 0.372 (P = 0.025), indicating low correlation. The partial correlation coefficient between the IENFD and proximal and distal CMAP of the tibial nerve was r = 0.383 (P = 0.021) and r = 0.358 (P = 0.032), respectively, indicating a low correlation. The partial correlation coefficient between the IENFD and MCV of the peroneal nerve was r = 0.399 (P = 0.016), indicating a low correlation.

-
Comparison of parameters among different groups

IENFD parameters were significantly different among all three groups (χ² = 9.901, P = 0.007). IENFD was significantly decreased (χ² = 23.000, P = 0.003) in the MS-related PN group compared to the idiopathic PN group. IENFD was also decreased (χ² = 27.000, P = 0.058) in the MS-related PN group compared to the IGT-related PN group, however, with no significant difference. IENFD was also decreased (χ² = 64.000, P = 0.097) in the IGT-related PN group compared to the idiopathic PN group, again with no significant difference [Table 2].

The MCV of the tibial nerve was significantly different among all three groups (χ² = 8.172, P = 0.017). The MCV of the tibial nerve was also significantly decreased (χ² = 29.503, P < 0.009) in the MS-related PN group compared to the idiopathic PN group. The proximal and distal CMAPs of the tibial nerve were significantly different among all three groups (F = 4.336, P = 0.021 and F = 3.262, P = 0.049, respectively). The proximal CMAP of the tibial nerve was significantly decreased (P = 0.017) in the IGT-related PN group compared to the idiopathic PN group. The proximal CMAP of the tibial nerve was significantly decreased (P = 0.022) in the MS-related PN group compared to the idiopathic PN group. Furthermore, the distal CMAP of the tibial nerve was significantly decreased (P = 0.035) in the IGT-related PN group compared to the idiopathic PN group. The distal CMAP of the tibial nerve was significantly decreased (P = 0.049) in the MS-related PN group compared to the idiopathic PN group [Table 2].

-
Discussion

This study identified that IENFD weakly correlated with the MCV and proximal and distal CMAPs of the tibial nerve. IENFD also weakly correlated with the MCV of the peroneal nerve. However, a conflicting study has previously demonstrated that IENFD does not relate to NCS parameters in patients with IGT and early diabetes, [20] indicating that the severity of small and large fiber damage may not coincide. Such findings may be accounted for by the fact that patients in the study were in the initial stages of IGT or diabetes-associated neuropathy, suggesting they may not have extensive nerve damage. The initial stages of IGT or diabetes-associated neuropathy involve small fiber damage, eventually progressing to large fiber involvement. [20] The aforementioned report enrolled only 14 patients, six with IGT and eight with early-stage diabetes. The small number of patients in that study may have led to a certain degree of outcome bias. In another previously published study, [21] a low to moderate correlation was identified between IENFD and the electrophysiological parameters in diabetic patients, which was in accordance with the findings in our study. One study [21] has indicated IENFD correlated with the various parameters of NCS, which is in accordance with our study. Furthermore, another study [22] found NCS remained within the normal range in type 2 diabetes, but there was a small significant decline after 5 years; IENFD changed from normal to abnormal after 5-year follow-up. This suggests that during early stages of IGT/diabetes-associated neuropathy, only IENFD are involved, and IENFD does not correlate initially with NCS parameters. Nevertheless, as the disease progresses, IENFD and electrophysiological parameters correlate, suggesting advancement in the degree of nerve damage. A recent study indicated that IENFD was able to serve as a marker of the course of diabetic neuropathy. 
[23] This study also demonstrated that IENFD significantly decreased in the MS-related group compared to the idiopathic PN group. The mean IENFD in the MS-related group was lower than that of the IGT-related group, although was not significantly different. The mean IENFD in the IGT-related group was lower than that of the idiopathic PN group, again with no significant difference. The MCV of the tibial nerve was significantly decreased in the MS-related PN group compared to the idiopathic PN group, and the proximal and distal CMAP of the tibial nerve was significantly different between all three groups. In a study focusing on IGT and diabetes-associated neuropathy, IENFD and electrophysiological parameters were reduced in diabetics compared to IGT-related PN patients, although not significantly. [20] The small number of included patients in that study may account for such findings. In our study, the number of included patients was comparably larger in comparison to other previously published reports, with significant differences in IENFD and NCS parameters between IGT/MS-related and idiopathic PN groups. This suggests that improving the number of patients in such a study might increase the power of the statistical findings associated with clinical parameters. The IENFD and various NCS parameters in the MS-related PN group were decreased compared to the IGT-related PN group and idiopathic PN groups. Another study also indicated distal leg IENFD was significantly reduced in both MS and diabetic groups. [24] This may be caused by several factors. First, obesity, hypertension, and in particular, hyperlipidemia are crucial risk factors for small fiber damage. [21,25,26] Indeed, MS is more prevalent in patients with severe PN than with mild/moderate PN, [25] supporting our finding that IENFD in the MS-related PN group was lower. 
Second, in the MS-related PN group, six patients were diagnosed with diabetes mellitus, and three were diagnosed with IGT, which may partially explain the lower IENFD in the MS-related PN group compared with the IGT-related PN group. In a recent study, dynamic worsening was present in relation to changes in glucose tolerance status, which was in accordance with our findings. [27] Moreover, the IENFD and electrophysiological parameters in the idiopathic PN group were the least affected. The occurrence of SFN can precede the presentation of primary disease, with approximately 35-50% of idiopathic SFN cases being confirmed as IGT-related SFN. [11-14,28] Dyslipidemia has also been identified in patients with idiopathic PN in routine follow-up. [26] The idiopathic PN observed in patients in our study may precede IGT or dyslipidemia; thus, small and large fiber damage in the idiopathic PN group was less severe than in the IGT- and MS-related PN groups. Therefore, a routine follow-up to search for etiologies associated with idiopathic PN is of vital importance.

It is important to mention that a limitation of this study was the relatively small number of patients. Large-sample studies are warranted, and the relationship between dyslipidemia and PN should be evaluated in further studies.

In conclusion, the IENFD of patients included in our study weakly correlated with certain electrophysiological parameters. Peripheral nerves, including both small and large fibers, were more severely involved in MS-related PN than in idiopathic PN. All possible auxiliary examinations and routine follow-up should be conducted to search for underlying etiologies in idiopathic PN patients.

:Motor conduction velocity of tibial nerve; T.PCMAP: Proximal compound muscle action potential of tibial nerve; T.DCMAP: Distal compound muscle action potential of tibial nerve; PERI.MCV: Motor conduction velocity of peroneal nerve; P.PCMAP: Proximal compound muscle action potential of peroneal nerve; P.DCMAP (mV): Distal compound muscle action potential of peroneal nerve; SURA.SCV: Sensory conduction velocity of sural nerve; S.SNAP: Sensory nerve action potential of sural nerve; IGT: Impaired glucose tolerance; MS: Metabolic syndrome; PN: Peripheral neuropathy; SD: Standard deviation; NCS: Nerve conduction studies.
-
Table 1: VAS and SFN-SIQ scores of different types of peripheral neuropathy patients with small fiber involvement
Items | IGT-related PN (n = 12) | MS-related PN (n = 9) | Idiopathic PN (n = 17)
VAS | 3.75 ± 3.91 | 6.00 ± 3.91 | 4.59 ± 3.08
SFN-SIQ | 5.33 ± 1.67 | 8.56 ± 2.51 | 4.53 ± 1.55
Data are shown as mean ± SD. VAS: Visual analog scale; SFN-SIQ: The Small Fiber Neuropathy and Symptom Inventory Questionnaire; IGT: Impaired glucose tolerance; MS: Metabolic syndrome; PN: Peripheral neuropathy; SD: Standard deviation.
-
Table 2: IENFD and NCS parameters among different types of peripheral neuropathy patients with small fiber involvement
Parameters | IGT-related PN (n = 12) | MS-related PN (n = 9) | Idiopathic PN (n = 17) | F or χ² | P
Age (years) | 56.8 ± 12.7 | 60.2 ± 15.6 | 49.6 ± 14.2 | 1.924* | 0.161
Gender (male), n | 7 | 5 | 10 | 0.424 | 0.809
IENFD (number of fibers/mm) | 4.66 ± 5.25 | 2.57 ± 2.60 | 6.25 ± 2.51 | 9.901 | 0.007
TIBI.MCV (m/s) | 40.48 ± 13.35 | 36.34 ± 15.68 | 47.44 ± 6.09 | 8.172 | 0.017
T.PCMAP (mV) | 5.26 ± 3.39 | 5.03 ± 5.24 | 10.31 ± 6.42 | 4.336* | 0.021
T.DCMAP (mV) | 7.41 ± 4.91 | 7.32 ± 7.09 | 12.72 ± 6.97 | 3.262* | 0.049
PERI.MCV (m/s) | 42.24 ± 13.78 | 35.47 ± 20.24 | 47.98 ± 5.24 | 2.926 | 0.232
P.PCMAP (mV) | 3.54 ± 2.64 | 4.10 ± 3.73 | 5.79 ± 2.84 | 2.189* | 0.127
P.DCMAP (mV) | 4.15 ± 3.20 | 4.86 ± 3.92 | 6.81 ± 2.27 | 3.024* | 0.061
SURA.SCV (m/s) | 34.43 ± 25.60 | 35.53 ± 27.02 | 46.48 ± 18.17 | 1.792 | 0.408
S.SNAP (µV) | 4.68 ± 5.25 | 7.33 ± 8.30 | 9.59 ± 9.40 | 1.314* | 0.282
Data are shown as n or mean ± SD. *F value. IENFD: Intraepidermal nerve fiber density; TIBI.MCV
- - -
-
Financial support and sponsorship
-
Conflicts of interest

There are no conflicts of interest.

-
- - - - - - Small fiber neuropathy: A common and important clinical disorder - - EHoitsma - - - JPReulen - - - MDe Baets - - - MDrent - - - FSpaans - - - CGFaber - - 10.1016/j.jns.2004.08.012.2 - doi: 10.1097/01. wco.0000093103.34793.5a - - - Curr Opin Neurol - - 227 - - - - - J Neurol Sci - - - - - Cutaneous innervation in sensory neuropathies: Evaluation by skin biopsy - - BGMccarthy - - - S-THsieh - - - AStocks - - - PHauer - - - CMacko - - - DRCornblath - - - JWGriffin - - - JCMcarthur - - 10.1212/wnl.45.10.1848 - doi: 10.1212/ WNL.45.10.1848 - - - Neurology - Neurology - 0028-3878 - 1526-632X - - 45 - 10 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - Contact Heat-Evoked Potential Stimulation for the Evaluation of Small Nerve Fiber Function - - HenriKParson - - - VanTNguyen - - - Michael-AngeloOrciga - - - AmandaLBoyd - - - CarolinaMCasellini - - - AaronIVinik - - 10.1089/dia.2012.0202 - - - Diabetes Technology & Therapeutics - Diabetes Technology & Therapeutics - 1520-9156 - 1557-8593 - - 15 - 2 - - - Mary Ann Liebert Inc - - - - - - - Small fibers, large impact: Quality of life in small-fiber neuropathy - - MayienneBakkers - - - CatharinaGFaber - - - JannekeG JHoeijmakers - - - GiuseppeLauria - - - IngemarS JMerkies - - 10.1002/mus.23910 - - - Muscle & Nerve - Muscle Nerve - 0148-639X - - 49 - 3 - - - Wiley - - - - - - - Intraepidermal nerve fiber density and its application in sarcoidosis - - MBakkers - - - IS JMerkies - - - GLauria - - - GDevigili - - - PPenza - - - RLombardi - - - MC EHermans - - - SIVan Nes - - - MDe Baets - - - CGFaber - - 10.1212/wnl.0b013e3181bacf05 - doi: 10.1212/ WNL.0b013e3181bacf05 - - - Neurology - Neurology - 0028-3878 - 1526-632X - - 73 - 14 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - The diagnostic criteria for small fibre neuropathy: from symptoms to neuropathology - - GDevigili - - - VTugnoli - - - PPenza - - - FCamozzi - - - RLombardi - - - GMelli - - - LBroglio - - - EGranieri - - - GLauria 
- - 10.1093/brain/awn093 - - - Brain - Brain - 0006-8950 - 1460-2156 - - 131 - 7 - - - Oxford University Press (OUP) - - - Pt 7 - - - - - Increased Prevalence of Impaired Glucose Tolerance in Patients With Painful Sensory Neuropathy - - JRSingleton - - - AGSmith - - - MBBromberg - - 10.2337/diacare.24.8.1448 - doi: 10.2337/ diacare.24.8.1448 - - - Diabetes Care - Diabetes Care - 0149-5992 - 1935-5548 - - 24 - 8 - - - American Diabetes Association - - - - - - - The clinical approach to small fibre neuropathy and painful channelopathy - - AndreasCThemistocleous - - - JuanDRamirez - - - JordiSerra - - - DavidL HBennett - - 10.1136/practneurol-2013-000758 - - - Practical Neurology - Pract Neurol - 1474-7758 - 1474-7766 - - 14 - 6 - - - BMJ - - - - - - - Chemotherapy-induced peripheral neuropathy: Current status and progress - - JamieRBrewer - - - GladysMorrison - - - MEileenDolan - - - GiniFFleming - - 10.1016/j.ygyno.2015.11.011 - doi: 10.1016/j. ygyno.2015.11.011 - - - Gynecologic Oncology - Gynecologic Oncology - 0090-8258 - - 140 - 1 - - - Elsevier BV - - - - - - - Painful sensory polyneuropathy associated with impaired glucose tolerance - - JRobinsonSingleton - - - AGordonSmith - - - MarkBBromberg - - 10.1002/mus.1136 - - - Muscle & Nerve - Muscle Nerve - 0148-639X - 1097-4598 - - 24 - 9 - - - Wiley - - - - - - - The spectrum of neuropathy in diabetes and impaired glucose tolerance - - CJSumner - - - SSheth - - - JWGriffin - - - DRCornblath - - - MPolydefkis - - 10.1212/wnl.60.1.108 - - - Neurology - Neurology - 0028-3878 - 1526-632X - - 60 - 1 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - The Diagnostic Yield of a Standardized Approach to Idiopathic Sensory-Predominant Neuropathy - - AGordonSmith - - - JRobinsonSingleton - - 10.1001/archinte.164.9.1021 - - - Archives of Internal Medicine - Arch Intern Med - 0003-9926 - - 164 - 9 - 1021 - - American Medical Association (AMA) - - - - - - - The frequency of undiagnosed diabetes and impaired glucose 
tolerance in patients with idiopathic sensory neuropathy - - StevenPNovella - - - SilvioEInzucchi - - - JonathanMGoldstein - - 10.1002/mus.1137 - doi: 10.1002/ mus.1137 - - - Muscle & Nerve - Muscle Nerve - 0148-639X - 1097-4598 - - 24 - 9 - - - Wiley - - - - - - - Child mental health and psychosocial development. - 10.1037/e400972004-001 - - - American Psychological Association (APA) - - - - - - - DeJong's the Neurologic Examination - - AFHaerer - - - - Lippincott Williams & Wilkins - 107 - - - - - - - Intraepidermal nerve fiber density in patients with painful sensory neuropathy - - NRHolland - - - AStocks - - - PHauer - - - DRCornblath - - - JWGriffin - - - JCMcarthur - - 10.1212/wnl.48.3.708 - doi: 12.2316/WNL.32.7.122 - - - Neurology - Neurology - 0028-3878 - 1526-632X - - 48 - 3 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - The innervation of human epidermis - - WRKennedy - - - GWendelschafer-Crabb - - 10.1016/0022-510X - - - J Neurol Sci - - 115 - 93 - 90223 - - - - - - - - Intraepidermal nerve fiber density at the distal leg: a worldwide normative reference study - - GiuseppeLauria - - - MayienneBakkers - - - ChristophSchmitz - - - RaffaellaLombardi - - - PaolaPenza - - - GraziaDevigili - - - AGordonSmith - - - Sung-TsiehHsieh - - - SveinIMellgren - - - ThirugnanamUmapathi - - - DanZiegler - - - CatharinaGFaber - - - IngemarS JMerkies - - 10.1111/j.1529-8027.2010.00271.x - - - Journal of the Peripheral Nervous System - 1085-9489 - - 15 - 3 - - - Wiley - - - - - - - Epidermal nerve innervation in impaired glucose tolerance and diabetes-associated neuropathy - - AGSmith - - - PRamachandran - - - STripp - - - JRSingleton - - 10.1212/wnl.57.9.1701 - doi: 10.1212/ WNL.57.9.1701 - - - Neurology - Neurology - 0028-3878 - 1526-632X - - 57 - 9 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - Intraepidermal nerve fiber density and nerve conduction study parameters correlate with clinical staging of diabetic polyneuropathy - - AikoArimura 
- - - TakahisaDeguchi - - - KazuhiroSugimoto - - - TadashiUto - - - TomonoriNakamura - - - YumikoArimura - - - KimiyoshiArimura - - - SorokuYagihashi - - - YoshihikoNishio - - - HiroshiTakashima - - 10.1016/j.diabres.2012.09.026 - - - Diabetes Research and Clinical Practice - Diabetes Research and Clinical Practice - 0168-8227 - - 99 - 1 - - - Elsevier BV - - - - - - - Small and large fiber neuropathy in those with type 1 and type 2 diabetes: a 5-year follow-up study - - SisselLøseth - - - ErikVStålberg - - - SigurdLindal - - - EdelOlsen - - - RolfJorde - - - SveinIMellgren - - 10.1111/jns.12154 - - - Journal of the Peripheral Nervous System - J Peripher Nerv Syst - 1085-9489 - - 21 - 1 - - - Wiley - - - - - - - Intraepidermal nerve-fibre density as a biomarker of the course of neuropathy in patients with Type 2 diabetes mellitus - - SDivisova - - - EVlckova - - - ISrotova - - - SKincova - - - MSkorna - - - LDusek - - - PDubovy - - - JBednarik - - 10.1111/dme.12890 - - - Diabetic Medicine - Diabet. Med. - 0742-3071 - - 33 - 5 - - - Wiley - - - - - - - Supervised exercise improves cutaneous reinnervation capacity in metabolic syndrome patients - - JRobinsonSingleton - - - RobinLMarcus - - - MargaretKLessard - - - JustinEJackson - - - AGordonSmith - - 10.1002/ana.24310 - - - Annals of Neurology - Ann Neurol. 
- 0364-5134 - - 77 - 1 - - - Wiley - - - - - - - Metabolic Syndrome in Small Fiber Sensory Neuropathy - - LanZhou - - - JianboLi - - - DanielOntaneda - - - JoshuaSperling - - 10.1097/cnd.0b013e3182196e3c - - - Journal of Clinical Neuromuscular Disease - Journal of Clinical Neuromuscular Disease - 1522-0443 - - 12 - 4 - - - Ovid Technologies (Wolters Kluwer Health) - - - - - - - Idiopathic neuropathy patients are at high risk for metabolic syndrome - - AGordonSmith - - - KristiRose - - - JRobinsonSingleton - - 10.1016/j.jns.2008.06.005 - - - Journal of the Neurological Sciences - Journal of the Neurological Sciences - 0022-510X - - 273 - 1-2 - - - Elsevier BV - - - - - - - Corneal Confocal Microscopy Identifies Small-Fiber Neuropathy in Subjects With Impaired Glucose Tolerance Who Develop Type 2 Diabetes - - ShazliAzmi - - - MaryamFerdousi - - - IoannisNPetropoulos - - - GeorgiosPonirakis - - - UazmanAlam - - - HassanFadavi - - - OmarAsghar - - - AndrewMarshall - - - AndrewJAtkinson - - - WendyJones - - - AndrewJ MBoulton - - - MitraTavakoli - - - MariaJeziorska - - - RayazAMalik - - 10.2337/dc14-2733 - - - Diabetes Care - Dia Care - 0149-5992 - 1935-5548 - - 38 - 8 - - - American Diabetes Association - - - - - - - Glucose tolerance testing in the evaluation of idiopathic neuropathy - - JRSingleton - - - AGSmith - - - MBBromberg - - 10.1016/j.annpneurol.1998.08.03 - - - Ann Neurol - - 44 - 477 - - - - - - -
-
-
-
diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 4805c75..0dbe48a 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,5 +1,5 @@ """ -Unit tests for TEI to JSON conversion functionality. +Unit tests for TEI to JSON and TEI to Markdown conversion functionality. """ import os import tempfile @@ -10,7 +10,7 @@ class TestTEIConversions: - """Test cases for TEI to JSON conversions.""" + """Test cases for TEI to JSON and Markdown conversions.""" def setup_method(self): """Set up test fixtures.""" @@ -135,6 +135,107 @@ def test_json_conversion_with_nonexistent_file(self): # It's acceptable to raise an exception for nonexistent files assert True, "Exception is acceptable for nonexistent files" + @patch('grobid_client.grobid_client.GrobidClient._test_server_connection') + @patch('grobid_client.grobid_client.GrobidClient._configure_logging') + def test_markdown_conversion_with_existing_tei_file(self, mock_configure_logging, mock_test_server): + """Test Markdown conversion when TEI file exists but Markdown doesn't.""" + mock_test_server.return_value = (True, 200) + + client = GrobidClient(check_server=False) + client.logger = Mock() + + # Create a temporary TEI file for testing + with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as tei_file: + tei_file.write(self.sample_tei_content) + tei_path = tei_file.name + + try: + # Test actual conversion + from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter + converter = TEI2MarkdownConverter() + markdown_data = converter.convert_tei_file(tei_path) + + # Verify the conversion result + assert markdown_data is not None, "Markdown conversion should not return None" + assert isinstance(markdown_data, str), "Markdown conversion should return a string" + assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content" + + # Check that the converted content contains expected elements + assert '#' in markdown_data or 'Sample Document 
Title' in markdown_data, "Markdown should contain title" + + finally: + # Clean up temporary file + os.unlink(tei_path) + + def test_markdown_conversion_with_empty_tei(self): + """Test Markdown conversion with empty TEI content.""" + + # Test with empty TEI content + empty_tei = """ + +""" + + # Create a temporary TEI file with empty content + with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as tei_file: + tei_file.write(empty_tei) + tei_path = tei_file.name + + try: + # Test actual conversion + from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter + converter = TEI2MarkdownConverter() + markdown_data = converter.convert_tei_file(tei_path) + + # Verify that conversion still produces some content even with empty TEI + assert markdown_data is not None, "Even empty TEI should produce some markdown content" + assert isinstance(markdown_data, str), "Result should be a string" + + finally: + # Clean up temporary file + os.unlink(tei_path) + + @patch('grobid_client.grobid_client.GrobidClient._test_server_connection') + @patch('grobid_client.grobid_client.GrobidClient._configure_logging') + def test_both_conversions_same_tei_file(self, mock_configure_logging, mock_test_server): + """Test both JSON and Markdown conversions for the same TEI file.""" + mock_test_server.return_value = (True, 200) + + client = GrobidClient(check_server=False) + client.logger = Mock() + + # Create a temporary TEI file for testing + with tempfile.NamedTemporaryFile(mode='w', suffix='.tei.xml', delete=False) as tei_file: + tei_file.write(self.sample_tei_content) + tei_path = tei_file.name + + try: + # Test JSON conversion + from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter + json_converter = TEI2LossyJSONConverter() + json_data = json_converter.convert_tei_file(tei_path, stream=False) + + # Test Markdown conversion + from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter + md_converter = TEI2MarkdownConverter() + 
markdown_data = md_converter.convert_tei_file(tei_path) + + # Verify both conversions produced valid results + assert json_data is not None, "JSON conversion should not return None" + assert isinstance(json_data, dict), "JSON conversion should return a dictionary" + + assert markdown_data is not None, "Markdown conversion should not return None" + assert isinstance(markdown_data, str), "Markdown conversion should return a string" + assert len(markdown_data) > 0, "Markdown should have content" + + # Both conversions should be from the same source, so they should extract similar information + if 'biblio' in json_data and 'title' in json_data['biblio']: + title = json_data['biblio']['title'] + # The title should appear in the markdown output + assert title in markdown_data or 'Sample Document Title' in markdown_data, "Title should appear in markdown" + + finally: + # Clean up temporary file + os.unlink(tei_path) def test_process_batch_with_json_output(self): """Test process_batch method with JSON output functionality using real TEI resources.""" @@ -233,6 +334,31 @@ def test_real_tei_json_conversion_integration(self): assert 'offset_end' in ref, "Reference should have offset_end" assert ref['offset_start'] < ref['offset_end'], "offset_start should be less than offset_end" + def test_markdown_conversion_with_real_tei_file(self): + """Test Markdown conversion with real TEI file from test resources.""" + + # Use the actual TEI file from test resources + tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml') + + # Verify the test TEI file exists + assert os.path.exists(tei_file), f"Test TEI file should exist at {tei_file}" + + # Test actual conversion + from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter + converter = TEI2MarkdownConverter() + markdown_data = converter.convert_tei_file(tei_file) + + # Verify the conversion result + assert markdown_data is not None, "Markdown conversion should not return None" + assert 
isinstance(markdown_data, str), "Markdown conversion should return a string" + assert len(markdown_data) > 0, "Markdown conversion should produce non-empty content" + + # Check that the converted content contains expected elements from real TEI + assert '#' in markdown_data, "Markdown should contain headers" + assert 'Multi-contact functional electrical stimulation' in markdown_data, "Markdown should contain the paper title" + + # Check for author information + assert 'De Marchis' in markdown_data or 'Cristiano' in markdown_data, "Markdown should contain author information" def test_reference_offset_issues_with_known_cases(self): """Test TEI to JSON conversion for XML files with known reference offset issues.""" diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py index 143faa6..98adb42 100644 --- a/tests/test_grobid_client.py +++ b/tests/test_grobid_client.py @@ -504,7 +504,8 @@ def test_process_batch_empty_input_files(self, mock_configure_logging, mock_test force=True, verbose=False, flavor=None, - json_output=False + json_output=False, + markdown_output=False ) assert result == (0, 0, 0) # No files processed, no errors, no skipped