#!/usr/bin/env python3
"""Extract tables from a document with Docling.

The document is converted with Docling, exported to markdown, and the result
is written to stdout either unchanged (``markdown``), as HTML tables
(``html``), or as CSV rows (``csv``). Failures are reported on stderr with a
``DOCLING_ERROR:`` prefix and a non-zero exit status.
"""

import argparse
import csv
import re
import sys
import tempfile
from io import StringIO
from pathlib import Path

# One cell of a markdown delimiter row: hyphens with optional alignment colons
# (``---``, ``:---``, ``---:``, ``:---:``).
_SEPARATOR_CELL = re.compile(r":?-+:?")

# Opening tag for every emitted table. The downstream Java extractor expects
# tables marked with a ``data-extraction`` attribute.
# NOTE(review): the attribute value "true" is assumed -- confirm it against the
# selector the Java extractor actually uses.
_TABLE_OPEN_TAG = '<table data-extraction="true">'
_TABLE_CLOSE_TAG = "</table>"


def _is_table_line(stripped: str) -> bool:
    """Return True if an already-stripped line looks like ``| ... |``."""
    return stripped.startswith("|") and stripped.endswith("|")


def _split_cells(stripped: str) -> list[str]:
    """Split an already-stripped ``| a | b |`` line into trimmed cell texts."""
    return [cell.strip() for cell in stripped.strip("|").split("|")]


def _is_separator_row(cells: list[str]) -> bool:
    """Return True if every cell is a delimiter cell such as ``---`` or ``:---:``.

    Matching the whole cell (instead of only its first character) keeps data
    rows like ``| -1 | -2 |`` from being mistaken for the header separator.
    """
    return all(_SEPARATOR_CELL.fullmatch(cell) for cell in cells)


def normalize_tables_to_html(markdown_text: str) -> str:
    """Convert the markdown tables in *markdown_text* to HTML tables.

    Only table content is emitted; every non-table line is dropped. Each run
    of consecutive table lines becomes one ``<table>`` element with one
    ``<tr>`` per data row and one ``<td>`` per cell. Delimiter rows are
    skipped.

    Args:
        markdown_text: Markdown as exported by Docling.

    Returns:
        The HTML tables joined with newlines, or ``""`` if there are none.
    """
    # NOTE(review): unlike parse_markdown_tables, a blank line ends the current
    # table here. Kept as-is; confirm whether the two should agree.
    # NOTE(review): cell text is inserted verbatim, without HTML escaping.
    # Confirm whether the markdown export already escapes HTML before adding
    # html.escape, to avoid double-escaping.
    out_lines: list[str] = []
    in_table = False

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not _is_table_line(stripped):
            if in_table:
                out_lines.append(_TABLE_CLOSE_TAG)
                in_table = False
            continue

        if not in_table:
            out_lines.append(_TABLE_OPEN_TAG)
            in_table = True

        cells = _split_cells(stripped)
        if _is_separator_row(cells):
            continue
        row = "".join(f"<td>{cell}</td>" for cell in cells)
        out_lines.append(f"<tr>{row}</tr>")

    if in_table:
        out_lines.append(_TABLE_CLOSE_TAG)

    return "\n".join(out_lines).strip()


def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Parse the markdown tables in *markdown_text* into lists of rows.

    Args:
        markdown_text: Markdown as exported by Docling.

    Returns:
        One entry per table; each table is a list of rows and each row a list
        of trimmed cell strings. Delimiter rows are omitted.
    """
    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Docling markdown can include blank lines inside table regions.
            # Do not break the current table because of empty lines.
            continue

        if not _is_table_line(stripped):
            # Any other text ends the table being collected.
            if current_table:
                tables.append(current_table)
                current_table = []
            continue

        cells = _split_cells(stripped)
        if _is_separator_row(cells):
            continue
        current_table.append(cells)

    if current_table:
        tables.append(current_table)

    return tables


def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render the markdown tables in *markdown_text* as CSV text.

    Rows of all tables are written one after another with no separator
    between tables.

    Args:
        markdown_text: Markdown as exported by Docling.

    Returns:
        LF-terminated CSV without a trailing newline, or ``""`` if the input
        contains no tables.
    """
    tables = parse_markdown_tables(markdown_text)
    if not tables:
        return ""

    buffer = StringIO()
    # Force LF line endings to avoid double-spacing on Windows consumers.
    writer = csv.writer(buffer, lineterminator="\n")
    for table in tables:
        writer.writerows(table)

    csv_text = buffer.getvalue().strip()
    # Defensive: normalise any remaining CR/CRLF to LF.
    return csv_text.replace("\r\n", "\n").replace("\r", "\n")


def read_input(args: argparse.Namespace) -> str:
    """Resolve the conversion source to a filesystem path.

    ``--input-file`` takes precedence over ``--stdin``. When reading from
    stdin the text is stored in a new ``.html`` temporary file that the
    caller is responsible for deleting.

    Args:
        args: Parsed arguments providing ``input_file`` and ``stdin``.

    Returns:
        The absolute path of the input file, or the temporary file's path.

    Raises:
        ValueError: If neither ``--input-file`` nor ``--stdin`` was given.
    """
    if args.input_file:
        return str(Path(args.input_file).resolve())

    if args.stdin:
        html_text = sys.stdin.read()
        tmp_file = tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", suffix=".html", delete=False
        )
        try:
            with tmp_file:
                tmp_file.write(html_text)
        except BaseException:
            # delete=False means nobody else will clean up a half-written
            # file, so remove it before propagating the error.
            Path(tmp_file.name).unlink(missing_ok=True)
            raise
        return tmp_file.name

    raise ValueError("Either --input-file or --stdin must be provided")


def main() -> int:
    """Run the command-line interface.

    Returns:
        ``0`` on success; ``1`` if any step fails, in which case a
        ``DOCLING_ERROR: ...`` line is written to stderr.
    """
    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
    parser.add_argument("--input-file", help="Path to input file")
    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
    parser.add_argument(
        "--output",
        choices=["markdown", "html", "csv"],
        default="markdown",
        help="Output format",
    )
    args = parser.parse_args()

    temp_path = None
    try:
        source = read_input(args)
        # read_input only creates a temporary file when it actually reads
        # stdin, i.e. when no --input-file was given. Never mark the user's
        # own input file for deletion when both flags are passed.
        if args.stdin and not args.input_file:
            temp_path = source

        # Imported lazily so that --help and argument errors do not require
        # the Docling dependency, and so that an import failure is reported
        # through the DOCLING_ERROR channel below.
        from docling.document_converter import DocumentConverter

        converter = DocumentConverter()
        result = converter.convert(source)
        markdown_output = result.document.export_to_markdown()

        if args.output == "markdown":
            output_payload = markdown_output
        elif args.output == "csv":
            output_payload = normalize_tables_to_csv(markdown_output)
        else:
            output_payload = normalize_tables_to_html(markdown_output)

        sys.stdout.write(output_payload)
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path:
            Path(temp_path).unlink(missing_ok=True)


if __name__ == "__main__":
    raise SystemExit(main())