| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- #!/usr/bin/env python3
- import csv
- import sys
- import tempfile
- from io import StringIO
- from pathlib import Path
- from docling.document_converter import DocumentConverter
def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract pipe-delimited tables from Markdown text.

    A line counts as a table row when, after trimming surrounding
    whitespace, it both starts and ends with ``|``. Consecutive rows form
    one table; any non-blank line that is not a row ends the current table.
    Blank lines are skipped and never end a table. Header delimiter rows
    (every cell made only of dashes, optionally with alignment colons such
    as ``:---:``) are dropped.

    Args:
        markdown_text: Markdown source to scan.

    Returns:
        One entry per table; each table is a list of rows and each row is a
        list of whitespace-trimmed cell strings.
    """

    def is_delimiter_cell(cell: str) -> bool:
        # One or more dashes with an optional colon on either side. A plain
        # ``startswith("-")`` check would also match data such as "-1".
        dashes = cell.removeprefix(":").removesuffix(":")
        return bool(dashes) and not dashes.strip("-")

    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Docling markdown can include blank lines inside table regions.
            # Do not break the current table because of empty lines.
            continue

        if not (stripped.startswith("|") and stripped.endswith("|")):
            # Any other non-blank line terminates the table being collected.
            if current_table:
                tables.append(current_table)
                current_table = []
            continue

        # Remove exactly one border pipe per side so that empty leading or
        # trailing cells ("| a ||") are preserved instead of collapsed.
        cells = [cell.strip() for cell in stripped[1:-1].split("|")]
        if all(is_delimiter_cell(cell) for cell in cells):
            continue
        current_table.append(cells)

    if current_table:
        tables.append(current_table)
    return tables
def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render every Markdown table found in *markdown_text* as CSV text.

    Rows of all tables are written one after another, with no separator
    between tables. The result uses LF line endings and has surrounding
    whitespace trimmed.

    Args:
        markdown_text: Markdown source containing zero or more tables.

    Returns:
        The CSV text, or an empty string when no table is found.
    """
    parsed = parse_markdown_tables(markdown_text)
    if not parsed:
        return ""

    sink = StringIO()
    # Force LF line endings to avoid double-spacing on Windows consumers.
    csv.writer(sink, lineterminator="\n").writerows(
        row for table in parsed for row in table
    )

    trimmed = sink.getvalue().strip()
    # Normalise any remaining CRLF / CR sequences to LF.
    return trimmed.replace("\r\n", "\n").replace("\r", "\n")
def read_input_from_stdin() -> str:
    """Copy all of standard input into a temporary ``.html`` file.

    The file is created with ``delete=False`` so it outlives this call; the
    caller is responsible for removing it once it is no longer needed.

    Returns:
        The filesystem path of the temporary file, written as UTF-8.

    Raises:
        Any exception raised while writing or closing the file is
        re-raised after the partially written file has been removed.
    """
    html_text = sys.stdin.read()
    tmp_file = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".html", delete=False
    )
    try:
        with tmp_file:
            tmp_file.write(html_text)
    except BaseException:
        # The caller never receives the path when this function fails, so
        # remove the file here (after it has been closed) to avoid a leak.
        Path(tmp_file.name).unlink(missing_ok=True)
        raise
    return tmp_file.name
def main() -> int:
    """Convert HTML from stdin to CSV on stdout via Docling.

    The input is staged in a temporary file by ``read_input_from_stdin``,
    converted with ``DocumentConverter``, exported to Markdown, and the
    Markdown tables are written to stdout as CSV. The temporary file is
    removed on every exit path.

    Returns:
        0 on success; 1 when any exception occurs, after a
        ``DOCLING_ERROR: ...`` line has been written to stderr.
    """
    html_path = None
    try:
        html_path = read_input_from_stdin()
        conversion = DocumentConverter().convert(html_path)
        csv_payload = normalize_tables_to_csv(
            conversion.document.export_to_markdown()
        )
        sys.stdout.write(csv_payload)
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report the failure and signal it via exit code.
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        # Only set once the temp file was actually created.
        if html_path:
            Path(html_path).unlink(missing_ok=True)
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|