docling_extract.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. #!/usr/bin/env python3
  2. import argparse
  3. import csv
  4. import sys
  5. import tempfile
  6. from io import StringIO
  7. from pathlib import Path
  8. from docling.document_converter import DocumentConverter
  9. def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
  10. tables: list[list[list[str]]] = []
  11. current_table: list[list[str]] = []
  12. for line in markdown_text.splitlines():
  13. stripped = line.strip()
  14. if not stripped:
  15. # Docling markdown can include blank lines inside table regions.
  16. # Do not break the current table because of empty lines.
  17. continue
  18. is_table_line = "|" in stripped and stripped.startswith("|") and stripped.endswith("|")
  19. if not is_table_line:
  20. if current_table:
  21. tables.append(current_table)
  22. current_table = []
  23. continue
  24. cells = [cell.strip() for cell in stripped.strip("|").split("|")]
  25. if all(cell.startswith("-") for cell in cells):
  26. continue
  27. current_table.append(cells)
  28. if current_table:
  29. tables.append(current_table)
  30. return tables
  31. def normalize_tables_to_csv(markdown_text: str) -> str:
  32. tables = parse_markdown_tables(markdown_text)
  33. if not tables:
  34. return ""
  35. buffer = StringIO()
  36. # Force LF line endings to avoid double-spacing on Windows consumers.
  37. writer = csv.writer(buffer, lineterminator="\n")
  38. for table in tables:
  39. for row in table:
  40. writer.writerow(row)
  41. csv_text = buffer.getvalue().strip()
  42. return csv_text.replace("\r\n", "\n").replace("\r", "\n")
  43. def read_input_from_stdin() -> str:
  44. html_text = sys.stdin.read()
  45. with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
  46. tmp_file.write(html_text)
  47. return tmp_file.name
  48. def main() -> int:
  49. parser = argparse.ArgumentParser(description="Extract tables with Docling")
  50. parser.add_argument("--output", choices=["csv", "markdown"], default="csv", help="Output format")
  51. args = parser.parse_args()
  52. temp_path = None
  53. try:
  54. source = read_input_from_stdin()
  55. temp_path = source
  56. converter = DocumentConverter()
  57. result = converter.convert(source)
  58. markdown_output = result.document.export_to_markdown()
  59. if args.output == "markdown":
  60. output_payload = markdown_output
  61. else:
  62. output_payload = normalize_tables_to_csv(markdown_output)
  63. sys.stdout.write(output_payload)
  64. return 0
  65. except Exception as exc: # pylint: disable=broad-except
  66. sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
  67. return 1
  68. finally:
  69. if temp_path:
  70. Path(temp_path).unlink(missing_ok=True)
  71. if __name__ == "__main__":
  72. raise SystemExit(main())