#!/usr/bin/env python3
import argparse
import csv
import sys
import tempfile
from pathlib import Path
from io import StringIO
from docling.document_converter import DocumentConverter
def normalize_tables_to_html(markdown_text: str) -> str:
    """Render the pipe tables found in markdown text as HTML tables.

    Docling returns markdown; the downstream Java extractor expects HTML
    tables marked with a ``data-extraction`` attribute.

    Only table content is emitted: a line counts as a table row when, after
    stripping surrounding whitespace, it both starts and ends with ``|``.
    Any other line (including a blank one) closes the table currently being
    written and is otherwise dropped. Every row is rendered with ``<td>``
    cells; header rows are not distinguished.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        The HTML tables joined with newlines, or ``""`` when the text
        contains no table rows.
    """
    # NOTE(review): the attribute *value* is an assumption -- only the
    # attribute name is documented here. Confirm it against the Java extractor.
    table_open = '<table data-extraction="true">'
    table_close = "</table>"

    out_lines: list[str] = []
    in_table = False
    for line in markdown_text.splitlines():
        stripped = line.strip()
        if stripped.startswith("|") and stripped.endswith("|"):
            if not in_table:
                out_lines.append(table_open)
                in_table = True
            cells = [cell.strip() for cell in stripped.strip("|").split("|")]
            # A delimiter row consists solely of dashes with optional
            # alignment colons (---, :---, ---:, :---:). Testing only for a
            # leading "-" would also discard data rows of negative numbers.
            if all(set(cell.strip(":")) == {"-"} for cell in cells):
                continue
            # NOTE(review): cell text is inserted verbatim (no HTML escaping);
            # confirm whether the markdown export already escapes it.
            row = "".join(f"<td>{cell}</td>" for cell in cells)
            out_lines.append(f"<tr>{row}</tr>")
        elif in_table:
            out_lines.append(table_close)
            in_table = False
    if in_table:
        # The text ended while a table was still open.
        out_lines.append(table_close)
    return "\n".join(out_lines).strip()
def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract every pipe table from markdown text as rows of cell strings.

    A line counts as a table row when, after stripping surrounding
    whitespace, it both starts and ends with ``|``. Blank lines are ignored
    and do not end a table; any other non-table line does. Delimiter rows
    (``|---|:---:|``) are skipped, so the first row of each table is its
    header row when the table has one.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        One entry per table, each a list of rows, each row a list of
        whitespace-stripped cell strings. Tables left without any data row
        are omitted.
    """
    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []
    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Docling markdown can include blank lines inside table regions.
            # Do not break the current table because of empty lines.
            continue
        if not (stripped.startswith("|") and stripped.endswith("|")):
            # Ordinary text terminates the table being collected, if any.
            if current_table:
                tables.append(current_table)
                current_table = []
            continue
        cells = [cell.strip() for cell in stripped.strip("|").split("|")]
        # A delimiter row consists solely of dashes with optional alignment
        # colons (---, :---, ---:, :---:). Testing only for a leading "-"
        # would also discard data rows of negative numbers such as "-1".
        if all(set(cell.strip(":")) == {"-"} for cell in cells):
            continue
        current_table.append(cells)
    if current_table:
        tables.append(current_table)
    return tables
def normalize_tables_to_csv(markdown_text: str) -> str:
    """Serialize all markdown pipe tables in the text as one CSV document.

    The rows of every table returned by ``parse_markdown_tables`` are written
    back to back, with no separator between tables.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        LF-terminated CSV text with surrounding whitespace stripped, or
        ``""`` when no table was found.
    """
    parsed = parse_markdown_tables(markdown_text)
    if len(parsed) == 0:
        return ""

    sink = StringIO()
    # An explicit "\n" terminator keeps Windows consumers from seeing
    # double-spaced rows.
    csv.writer(sink, lineterminator="\n").writerows(
        record for table in parsed for record in table
    )

    text = sink.getvalue().strip()
    text = text.replace("\r\n", "\n")
    return text.replace("\r", "\n")
def read_input(args: argparse.Namespace) -> str:
    """Resolve the command-line input to a filesystem path.

    ``--input-file`` takes precedence over ``--stdin`` when both are set.

    Args:
        args: Parsed arguments providing ``input_file`` and ``stdin``.

    Returns:
        The resolved absolute path of ``args.input_file``, or the path of a
        newly created temporary ``.html`` file holding the text read from
        stdin. The caller is responsible for deleting that temporary file.

    Raises:
        ValueError: If neither input option was provided. A
            ``UnicodeDecodeError`` (a ``ValueError`` subclass) is raised when
            stdin does not contain valid UTF-8.
    """
    if args.input_file:
        return str(Path(args.input_file).resolve())
    if args.stdin:
        # --stdin is documented as UTF-8, but text-mode sys.stdin decodes
        # with the locale encoding. Decode the raw bytes explicitly whenever
        # the underlying binary stream is available.
        raw_stream = getattr(sys.stdin, "buffer", None)
        if raw_stream is not None:
            html_text = raw_stream.read().decode("utf-8")
            # Mirror the newline normalisation text-mode reading performs.
            html_text = html_text.replace("\r\n", "\n").replace("\r", "\n")
        else:
            # stdin was replaced by an object without a binary layer.
            html_text = sys.stdin.read()
        tmp_file = tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", suffix=".html", delete=False
        )
        try:
            with tmp_file:
                tmp_file.write(html_text)
        except BaseException:
            # delete=False means nothing else removes a half-written file.
            Path(tmp_file.name).unlink(missing_ok=True)
            raise
        return tmp_file.name
    raise ValueError("Either --input-file or --stdin must be provided")
def main() -> int:
    """Run the command-line interface: convert the input and print tables.

    The input is converted with Docling's ``DocumentConverter`` and exported
    to markdown; depending on ``--output`` that markdown is written to stdout
    as is, or converted to CSV or HTML tables first.

    Returns:
        ``0`` on success, ``1`` when any exception occurred; in that case a
        ``DOCLING_ERROR: ...`` line is written to stderr. Argument parsing
        errors are raised by ``argparse`` before the error handling starts.
    """
    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
    parser.add_argument("--input-file", help="Path to input file")
    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
    parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
    args = parser.parse_args()
    temp_path = None
    try:
        source = read_input(args)
        # read_input() prefers --input-file over --stdin. Only a file that was
        # actually created from stdin may be deleted afterwards; otherwise
        # passing both options would remove the user's own input file.
        if args.stdin and not args.input_file:
            temp_path = source
        converter = DocumentConverter()
        result = converter.convert(source)
        markdown_output = result.document.export_to_markdown()
        if args.output == "markdown":
            output_payload = markdown_output
        elif args.output == "csv":
            output_payload = normalize_tables_to_csv(markdown_output)
        else:
            output_payload = normalize_tables_to_html(markdown_output)
        sys.stdout.write(output_payload)
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report every failure in one parseable line.
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path:
            Path(temp_path).unlink(missing_ok=True)
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())