#!/usr/bin/env python3
# UV-APPS / UV_SAIC_DES
import argparse
import csv
import sys
import tempfile
from pathlib import Path
from io import StringIO

from docling.document_converter import DocumentConverter


def normalize_tables_to_html(markdown_text: str) -> str:
    """Convert the pipe tables found in Markdown text into HTML tables.

    Docling returns markdown; the downstream Java extractor expects HTML
    tables marked with the ``data-extraction`` attribute. Every run of
    consecutive lines that start and end with ``|`` becomes one
    ``<table data-extraction="docling">`` element; any other line (including
    a blank one) closes the table currently being built. Non-table text is
    dropped from the output.

    Args:
        markdown_text: Markdown document text.

    Returns:
        The generated table markup joined with newlines, or an empty string
        when the text contains no table lines.
    """
    out_lines: list[str] = []
    in_table = False

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not (stripped.startswith("|") and stripped.endswith("|")):
            if in_table:
                out_lines.append("</table>")
                in_table = False
            continue

        if not in_table:
            out_lines.append('<table data-extraction="docling">')
            in_table = True

        cells = [cell.strip() for cell in stripped.strip("|").split("|")]

        # A delimiter row is made only of dashes, optionally with alignment
        # colons (e.g. "---", ":---", ":---:"). Checking just for a leading
        # "-" would also discard real data such as a row of negative numbers
        # and would let colon-aligned delimiters through as data.
        if all("-" in cell and not cell.strip(":-") for cell in cells):
            continue

        # NOTE(review): cell text is inserted verbatim; confirm whether the
        # markdown producer already HTML-escapes cell content before adding
        # escaping here, to avoid double-escaping.
        row = "".join(f"<td>{cell}</td>" for cell in cells)
        out_lines.append(f"<tr>{row}</tr>")

    if in_table:
        out_lines.append("</table>")

    return "\n".join(out_lines).strip()


def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract the pipe tables of a Markdown document as nested lists.

    A table line is one that, after trimming, starts and ends with ``|``.
    Consecutive table lines form one table; blank lines are ignored and do
    not end a table, while any other non-table line does. Delimiter rows
    (the ``|---|---|`` line under the header) are not included.

    Args:
        markdown_text: Markdown document text.

    Returns:
        A list of tables, each a list of rows, each a list of trimmed cell
        strings. Tables that end up with no rows are omitted.
    """
    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Docling markdown can include blank lines inside table regions.
            # Do not break the current table because of empty lines.
            continue

        if not (stripped.startswith("|") and stripped.endswith("|")):
            if current_table:
                tables.append(current_table)
                current_table = []
            continue

        cells = [cell.strip() for cell in stripped.strip("|").split("|")]

        # A delimiter row is made only of dashes, optionally with alignment
        # colons (e.g. "---", ":---", ":---:"). Checking just for a leading
        # "-" would also discard real data such as a row of negative numbers
        # and would let colon-aligned delimiters through as data.
        if all("-" in cell and not cell.strip(":-") for cell in cells):
            continue

        current_table.append(cells)

    if current_table:
        tables.append(current_table)

    return tables


def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render every Markdown pipe table in the text as CSV.

    The rows of all tables returned by ``parse_markdown_tables`` are written
    one after another into a single CSV stream, with no separator between
    tables.

    Args:
        markdown_text: Markdown document text.

    Returns:
        LF-terminated CSV text with surrounding whitespace trimmed, or an
        empty string when no table was found.
    """
    parsed = parse_markdown_tables(markdown_text)
    if not parsed:
        return ""

    sink = StringIO()
    # An explicit LF terminator keeps Windows consumers from seeing
    # double-spaced output.
    csv.writer(sink, lineterminator="\n").writerows(
        row for table in parsed for row in table
    )

    trimmed = sink.getvalue().strip()
    without_crlf = trimmed.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")


def read_input(args: argparse.Namespace) -> str:
    """Resolve the command-line input to a path on disk.

    ``--input-file`` takes precedence over ``--stdin``. For stdin, the
    payload is spooled into a temporary ``.html`` file that is NOT deleted
    here; the caller owns it and must remove it.

    Args:
        args: Parsed arguments exposing ``input_file`` and ``stdin``.

    Returns:
        The absolute path of the input file, or the path of the temporary
        file holding the stdin payload.

    Raises:
        ValueError: If neither ``--input-file`` nor ``--stdin`` was given.
    """
    if args.input_file:
        return str(Path(args.input_file).resolve())

    if args.stdin:
        # The CLI contract is "UTF-8 text on stdin". Copy the raw bytes
        # rather than decoding with the interpreter's default stdin encoding,
        # which is not UTF-8 on every platform and would corrupt non-ASCII
        # input.
        binary_stdin = getattr(sys.stdin, "buffer", None)
        if binary_stdin is not None:
            payload = binary_stdin.read()
        else:
            # stdin was replaced by a text-only stream; fall back to text.
            payload = sys.stdin.read().encode("utf-8")

        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".html", delete=False)
        try:
            with tmp_file:
                tmp_file.write(payload)
        except BaseException:
            # The caller only learns the path on success, so clean up here
            # when the write fails.
            Path(tmp_file.name).unlink(missing_ok=True)
            raise
        return tmp_file.name

    raise ValueError("Either --input-file or --stdin must be provided")


def main() -> int:
    """Run the command-line tool: convert the input with Docling and print it.

    The document is converted and exported to Markdown; depending on
    ``--output`` the Markdown is written to stdout as-is, or reduced to its
    tables rendered as CSV or HTML.

    Returns:
        0 on success. On any exception raised during processing, writes a
        ``DOCLING_ERROR: ...`` line to stderr and returns 1.
    """
    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
    parser.add_argument("--input-file", help="Path to input file")
    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
    parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
    args = parser.parse_args()

    temp_path = None
    try:
        source = read_input(args)
        # read_input() prefers --input-file over --stdin. Only a path that
        # really came from stdin is a temporary file we are allowed to
        # delete; otherwise passing both flags would remove the user's file.
        if args.stdin and not args.input_file:
            temp_path = source
        converter = DocumentConverter()
        result = converter.convert(source)
        markdown_output = result.document.export_to_markdown()
        if args.output == "markdown":
            output_payload = markdown_output
        elif args.output == "csv":
            output_payload = normalize_tables_to_csv(markdown_output)
        else:
            output_payload = normalize_tables_to_html(markdown_output)
        sys.stdout.write(output_payload)
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report a single tagged line on stderr and
        # signal failure through the exit status.
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path:
            Path(temp_path).unlink(missing_ok=True)


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())