|
|
@@ -1,44 +1,12 @@
|
|
|
#!/usr/bin/env python3
|
|
|
-import argparse
|
|
|
import csv
|
|
|
import sys
|
|
|
import tempfile
|
|
|
-from pathlib import Path
|
|
|
from io import StringIO
|
|
|
+from pathlib import Path
|
|
|
|
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
|
|
-
|
|
|
-def normalize_tables_to_html(markdown_text: str) -> str:
|
|
|
- # Docling returns markdown; downstream Java extractor expects HTML tables
|
|
|
- # marked with data-extraction attribute.
|
|
|
- lines = markdown_text.splitlines()
|
|
|
- out_lines = []
|
|
|
- in_table = False
|
|
|
-
|
|
|
- for line in lines:
|
|
|
- if "|" in line and line.strip().startswith("|") and line.strip().endswith("|"):
|
|
|
- if not in_table:
|
|
|
- out_lines.append('<table data-extraction="docling">')
|
|
|
- in_table = True
|
|
|
-
|
|
|
- cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
|
|
|
- if all(cell.startswith("-") for cell in cells):
|
|
|
- continue
|
|
|
-
|
|
|
- row = "".join(f"<td>{cell}</td>" for cell in cells)
|
|
|
- out_lines.append(f"<tr>{row}</tr>")
|
|
|
- else:
|
|
|
- if in_table:
|
|
|
- out_lines.append("</table>")
|
|
|
- in_table = False
|
|
|
-
|
|
|
- if in_table:
|
|
|
- out_lines.append("</table>")
|
|
|
-
|
|
|
- return "\n".join(out_lines).strip()
|
|
|
-
|
|
|
-
|
|
|
def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
|
|
|
tables: list[list[list[str]]] = []
|
|
|
current_table: list[list[str]] = []
|
|
|
@@ -82,40 +50,22 @@ def normalize_tables_to_csv(markdown_text: str) -> str:
|
|
|
return csv_text.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
-def read_input(args: argparse.Namespace) -> str:
|
|
|
- if args.input_file:
|
|
|
- return str(Path(args.input_file).resolve())
|
|
|
-
|
|
|
- if args.stdin:
|
|
|
- html_text = sys.stdin.read()
|
|
|
- with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
|
|
|
- tmp_file.write(html_text)
|
|
|
- return tmp_file.name
|
|
|
-
|
|
|
- raise ValueError("Either --input-file or --stdin must be provided")
|
|
|
+def read_input_from_stdin() -> str:
|
|
|
+ html_text = sys.stdin.read()
|
|
|
+ with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
|
|
|
+ tmp_file.write(html_text)
|
|
|
+ return tmp_file.name
|
|
|
|
|
|
|
|
|
def main() -> int:
|
|
|
- parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
|
|
|
- parser.add_argument("--input-file", help="Path to input file")
|
|
|
- parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
|
|
|
- parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
|
|
|
- args = parser.parse_args()
|
|
|
-
|
|
|
temp_path = None
|
|
|
try:
|
|
|
- source = read_input(args)
|
|
|
- if args.stdin:
|
|
|
- temp_path = source
|
|
|
+ source = read_input_from_stdin()
|
|
|
+ temp_path = source
|
|
|
converter = DocumentConverter()
|
|
|
result = converter.convert(source)
|
|
|
markdown_output = result.document.export_to_markdown()
|
|
|
- if args.output == "markdown":
|
|
|
- output_payload = markdown_output
|
|
|
- elif args.output == "csv":
|
|
|
- output_payload = normalize_tables_to_csv(markdown_output)
|
|
|
- else:
|
|
|
- output_payload = normalize_tables_to_html(markdown_output)
|
|
|
+ output_payload = normalize_tables_to_csv(markdown_output)
|
|
|
sys.stdout.write(output_payload)
|
|
|
return 0
|
|
|
except Exception as exc: # pylint: disable=broad-except
|