# docling_extract.py (2.4 KB)
  1. #!/usr/bin/env python3
  2. import csv
  3. import sys
  4. import tempfile
  5. from io import StringIO
  6. from pathlib import Path
  7. from docling.document_converter import DocumentConverter
  8. def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
  9. tables: list[list[list[str]]] = []
  10. current_table: list[list[str]] = []
  11. for line in markdown_text.splitlines():
  12. stripped = line.strip()
  13. if not stripped:
  14. # Docling markdown can include blank lines inside table regions.
  15. # Do not break the current table because of empty lines.
  16. continue
  17. is_table_line = "|" in stripped and stripped.startswith("|") and stripped.endswith("|")
  18. if not is_table_line:
  19. if current_table:
  20. tables.append(current_table)
  21. current_table = []
  22. continue
  23. cells = [cell.strip() for cell in stripped.strip("|").split("|")]
  24. if all(cell.startswith("-") for cell in cells):
  25. continue
  26. current_table.append(cells)
  27. if current_table:
  28. tables.append(current_table)
  29. return tables
  30. def normalize_tables_to_csv(markdown_text: str) -> str:
  31. tables = parse_markdown_tables(markdown_text)
  32. if not tables:
  33. return ""
  34. buffer = StringIO()
  35. # Force LF line endings to avoid double-spacing on Windows consumers.
  36. writer = csv.writer(buffer, lineterminator="\n")
  37. for table in tables:
  38. for row in table:
  39. writer.writerow(row)
  40. csv_text = buffer.getvalue().strip()
  41. return csv_text.replace("\r\n", "\n").replace("\r", "\n")
  42. def read_input_from_stdin() -> str:
  43. html_text = sys.stdin.read()
  44. with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
  45. tmp_file.write(html_text)
  46. return tmp_file.name
  47. def main() -> int:
  48. temp_path = None
  49. try:
  50. source = read_input_from_stdin()
  51. temp_path = source
  52. converter = DocumentConverter()
  53. result = converter.convert(source)
  54. markdown_output = result.document.export_to_markdown()
  55. output_payload = normalize_tables_to_csv(markdown_output)
  56. sys.stdout.write(output_payload)
  57. return 0
  58. except Exception as exc: # pylint: disable=broad-except
  59. sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
  60. return 1
  61. finally:
  62. if temp_path:
  63. Path(temp_path).unlink(missing_ok=True)
  64. if __name__ == "__main__":
  65. raise SystemExit(main())