| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- #!/usr/bin/env python3
- import argparse
- import csv
- import sys
- import tempfile
- from pathlib import Path
- from io import StringIO
- from docling.document_converter import DocumentConverter
def normalize_tables_to_html(markdown_text: str) -> str:
    """Convert the pipe tables found in Markdown text into HTML tables.

    Docling returns Markdown; the downstream Java extractor expects HTML
    tables marked with a ``data-extraction`` attribute. Only table rows are
    emitted: every non-table line (blank lines included) is dropped and
    merely closes the table that is currently open.

    Args:
        markdown_text: Markdown that may contain pipe-delimited tables.

    Returns:
        The generated ``<table>`` elements joined by newlines, or an empty
        string when the input contains no table lines. Every row, header
        included, is rendered as ``<tr>`` with ``<td>`` cells.
    """

    def is_delimiter_cell(cell: str) -> bool:
        # A delimiter cell is one or more hyphens with an optional alignment
        # colon on either side (``---``, ``:---``, ``---:``, ``:---:``).
        # Checking only for a leading "-" would also swallow data such as
        # negative numbers and would miss the colon-prefixed forms.
        dashes = cell.removeprefix(":").removesuffix(":")
        return bool(dashes) and not dashes.strip("-")

    out_lines: list[str] = []
    in_table = False
    for line in markdown_text.splitlines():
        stripped = line.strip()
        if stripped.startswith("|") and stripped.endswith("|"):
            if not in_table:
                out_lines.append('<table data-extraction="docling">')
                in_table = True
            cells = [cell.strip() for cell in stripped.strip("|").split("|")]
            if all(is_delimiter_cell(cell) for cell in cells):
                # Header/body delimiter row carries no data.
                continue
            # NOTE(review): cell text is inserted verbatim (no HTML escaping);
            # confirm that is what the downstream extractor expects.
            row = "".join(f"<td>{cell}</td>" for cell in cells)
            out_lines.append(f"<tr>{row}</tr>")
        elif in_table:
            out_lines.append("</table>")
            in_table = False
    if in_table:
        # Input ended while a table was still open.
        out_lines.append("</table>")
    return "\n".join(out_lines).strip()
def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract the pipe-delimited tables from Markdown text.

    A table line is one that, once stripped, starts and ends with ``|``.
    Consecutive table lines form one table; any non-blank, non-table line
    ends the current table. Blank lines are ignored and do not end a table.

    Args:
        markdown_text: Markdown that may contain pipe-delimited tables.

    Returns:
        A list of tables, each a list of rows, each a list of stripped cell
        strings. Header/body delimiter rows (``---``, ``:---:`` ...) are
        omitted. Returns an empty list when no table rows are found.
    """

    def is_delimiter_cell(cell: str) -> bool:
        # A delimiter cell is one or more hyphens with an optional alignment
        # colon on either side. Checking only for a leading "-" would also
        # discard data rows such as "| -1 | -2 |" and would keep ":---" rows.
        dashes = cell.removeprefix(":").removesuffix(":")
        return bool(dashes) and not dashes.strip("-")

    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []
    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Docling markdown can include blank lines inside table regions.
            # Do not break the current table because of empty lines.
            continue
        if not (stripped.startswith("|") and stripped.endswith("|")):
            # Ordinary text terminates the table collected so far.
            if current_table:
                tables.append(current_table)
                current_table = []
            continue
        cells = [cell.strip() for cell in stripped.strip("|").split("|")]
        if all(is_delimiter_cell(cell) for cell in cells):
            continue
        current_table.append(cells)
    if current_table:
        tables.append(current_table)
    return tables
def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render every Markdown pipe table in the text as CSV.

    The rows of all tables are written one after another into a single CSV
    stream; no separator is emitted between tables.

    Args:
        markdown_text: Markdown that may contain pipe-delimited tables.

    Returns:
        LF-terminated CSV text with surrounding whitespace stripped, or an
        empty string when the input contains no tables.
    """
    parsed = parse_markdown_tables(markdown_text)
    if not parsed:
        return ""

    sink = StringIO()
    # Force LF line endings to avoid double-spacing on Windows consumers.
    csv.writer(sink, lineterminator="\n").writerows(
        row for table in parsed for row in table
    )

    text = sink.getvalue().strip()
    # Normalize any remaining CRLF / CR to LF, in that order.
    for ending in ("\r\n", "\r"):
        text = text.replace(ending, "\n")
    return text
def read_input(args: argparse.Namespace) -> str:
    """Resolve the conversion source to a filesystem path.

    ``--input-file`` takes precedence over ``--stdin`` when both are set.

    Args:
        args: Parsed CLI arguments; reads ``input_file`` and ``stdin``.

    Returns:
        The absolute path of ``args.input_file``, or the path of a new
        ``.html`` temporary file holding the stdin payload. The temporary
        file is created with ``delete=False``; the caller must remove it.

    Raises:
        ValueError: If neither ``input_file`` nor ``stdin`` is set.
    """
    if args.input_file:
        return str(Path(args.input_file).resolve())
    if args.stdin:
        # The CLI contract is "UTF-8 on stdin". Copy the raw bytes instead of
        # decoding with the locale encoding (which is not UTF-8 on every
        # platform) and re-encoding them.
        payload = sys.stdin.buffer.read()
        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".html", delete=False)
        try:
            with tmp_file:
                tmp_file.write(payload)
        except BaseException:
            # The caller never learns the path if we fail here, so clean up
            # ourselves rather than leak the file.
            Path(tmp_file.name).unlink(missing_ok=True)
            raise
        return tmp_file.name
    raise ValueError("Either --input-file or --stdin must be provided")
def main() -> int:
    """Run the CLI: convert the input with Docling and print the tables.

    Parses ``--input-file`` / ``--stdin`` / ``--output``, converts the source
    with ``DocumentConverter``, exports it to Markdown and writes the result
    to stdout as Markdown, CSV or HTML depending on ``--output``.

    Returns:
        0 on success; 1 if any exception occurred, in which case a
        ``DOCLING_ERROR: ...`` line is written to stderr.
    """
    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
    parser.add_argument("--input-file", help="Path to input file")
    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
    parser.add_argument(
        "--output",
        choices=["markdown", "html", "csv"],
        default="markdown",
        help="Output format",
    )
    args = parser.parse_args()

    temp_path = None
    try:
        source = read_input(args)
        # read_input() prefers --input-file over --stdin, so the returned
        # path is a temporary file we own only when no input file was given.
        # Without this guard, passing both flags would delete the user's
        # own input file in the finally block below.
        if args.stdin and not args.input_file:
            temp_path = source

        converter = DocumentConverter()
        result = converter.convert(source)
        markdown_output = result.document.export_to_markdown()

        if args.output == "markdown":
            output_payload = markdown_output
        elif args.output == "csv":
            output_payload = normalize_tables_to_csv(markdown_output)
        else:
            output_payload = normalize_tables_to_html(markdown_output)

        sys.stdout.write(output_payload)
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report on stderr and signal failure via the
        # exit status instead of a traceback.
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path:
            Path(temp_path).unlink(missing_ok=True)
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|