#!/usr/bin/env python3
"""Convert HTML read from stdin into CSV rows of the tables it contains.

The HTML is written to a temporary ``.html`` file, converted with Docling,
exported to Markdown, and every Markdown pipe table found in that export is
flattened into a single CSV stream written to stdout. Failures are reported
on stderr with a ``DOCLING_ERROR:`` prefix and an exit status of 1.
"""

import csv
import re
import sys
import tempfile
from io import StringIO
from pathlib import Path

# One cell of a Markdown delimiter row: hyphens with optional alignment
# colons, e.g. ``---``, ``:---``, ``---:`` or ``:---:``.
_SEPARATOR_CELL = re.compile(r":?-+:?")


def _is_separator_row(cells: list[str]) -> bool:
    """Return True when every cell is a delimiter cell such as ``:---:``."""
    return bool(cells) and all(
        _SEPARATOR_CELL.fullmatch(cell) is not None for cell in cells
    )


def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract pipe tables from Markdown text.

    A table line is a non-blank line that starts and ends with ``|``.
    Consecutive table lines form one table; any other non-blank line ends
    the current table. Blank lines are ignored and do not end a table.

    Args:
        markdown_text: Markdown text to scan.

    Returns:
        A list of tables; each table is a list of rows and each row is a
        list of whitespace-stripped cell strings. Header/body delimiter rows
        (``|---|:--:|``) are omitted.
    """
    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # The Markdown export can include blank lines inside table
            # regions, so an empty line must not close the current table.
            continue

        is_table_line = stripped.startswith("|") and stripped.endswith("|")
        if not is_table_line:
            if current_table:
                tables.append(current_table)
                current_table = []
            continue

        # Drop exactly one bounding pipe on each side. ``str.strip("|")``
        # would also swallow the pipes that delimit empty edge cells
        # (``|| b |``), silently shifting columns.
        cells = [cell.strip() for cell in stripped[1:-1].split("|")]

        # Only genuine delimiter rows are skipped. Testing for a leading
        # "-" alone would discard data rows such as ``| -1 | -2 |`` and
        # would miss alignment rows such as ``|:---|---:|``.
        if _is_separator_row(cells):
            continue

        current_table.append(cells)

    if current_table:
        tables.append(current_table)

    return tables


def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render every pipe table found in ``markdown_text`` as one CSV string.

    Rows of all tables are written back to back, in document order, with no
    separator between tables.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        LF-terminated CSV text with surrounding whitespace stripped, or an
        empty string when no table is found.
    """
    tables = parse_markdown_tables(markdown_text)
    if not tables:
        return ""

    buffer = StringIO()
    # Force LF line endings to avoid double-spacing on Windows consumers.
    # Cells come from ``str.splitlines()`` output, so they cannot contain
    # CR or LF themselves and no further newline normalization is needed.
    writer = csv.writer(buffer, lineterminator="\n")
    for table in tables:
        writer.writerows(table)

    return buffer.getvalue().strip()


def read_input_from_stdin() -> str:
    """Copy all of stdin into a temporary ``.html`` file and return its path.

    The file is created with ``delete=False``; the caller is responsible for
    removing it. If writing fails, the partially written file is removed
    before the exception propagates, because the caller never receives the
    path and therefore could not clean it up.

    Returns:
        Path of the temporary file holding the stdin contents as UTF-8.
    """
    html_text = sys.stdin.read()
    tmp_file = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".html", delete=False
    )
    try:
        # Closing flushes buffered data, so it stays inside the ``try``.
        with tmp_file:
            tmp_file.write(html_text)
    except BaseException:
        Path(tmp_file.name).unlink(missing_ok=True)
        raise
    return tmp_file.name


def main() -> int:
    """Run the stdin-HTML to stdout-CSV conversion.

    Returns:
        0 on success; 1 if any step raised, after writing
        ``DOCLING_ERROR: <message>`` to stderr.
    """
    temp_path = None
    try:
        # Imported here so that a missing or broken Docling installation is
        # reported through the same DOCLING_ERROR channel as other failures,
        # and so the parsing helpers stay importable without Docling.
        from docling.document_converter import DocumentConverter

        temp_path = read_input_from_stdin()
        converter = DocumentConverter()
        result = converter.convert(temp_path)
        markdown_output = result.document.export_to_markdown()
        sys.stdout.write(normalize_tables_to_csv(markdown_output))
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path is not None:
            Path(temp_path).unlink(missing_ok=True)


if __name__ == "__main__":
    raise SystemExit(main())