docling_extract.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. #!/usr/bin/env python3
  2. import argparse
  3. import csv
  4. import sys
  5. import tempfile
  6. from pathlib import Path
  7. from io import StringIO
  8. from docling.document_converter import DocumentConverter
  9. def normalize_tables_to_html(markdown_text: str) -> str:
  10. # Docling returns markdown; downstream Java extractor expects HTML tables
  11. # marked with data-extraction attribute.
  12. lines = markdown_text.splitlines()
  13. out_lines = []
  14. in_table = False
  15. for line in lines:
  16. if "|" in line and line.strip().startswith("|") and line.strip().endswith("|"):
  17. if not in_table:
  18. out_lines.append('<table data-extraction="docling">')
  19. in_table = True
  20. cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
  21. if all(cell.startswith("-") for cell in cells):
  22. continue
  23. row = "".join(f"<td>{cell}</td>" for cell in cells)
  24. out_lines.append(f"<tr>{row}</tr>")
  25. else:
  26. if in_table:
  27. out_lines.append("</table>")
  28. in_table = False
  29. if in_table:
  30. out_lines.append("</table>")
  31. return "\n".join(out_lines).strip()
  32. def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
  33. tables: list[list[list[str]]] = []
  34. current_table: list[list[str]] = []
  35. for line in markdown_text.splitlines():
  36. stripped = line.strip()
  37. is_table_line = "|" in stripped and stripped.startswith("|") and stripped.endswith("|")
  38. if not is_table_line:
  39. if current_table:
  40. tables.append(current_table)
  41. current_table = []
  42. continue
  43. cells = [cell.strip() for cell in stripped.strip("|").split("|")]
  44. if all(cell.startswith("-") for cell in cells):
  45. continue
  46. current_table.append(cells)
  47. if current_table:
  48. tables.append(current_table)
  49. return tables
  50. def normalize_tables_to_csv(markdown_text: str) -> str:
  51. tables = parse_markdown_tables(markdown_text)
  52. if not tables:
  53. return ""
  54. buffer = StringIO()
  55. writer = csv.writer(buffer)
  56. for index, table in enumerate(tables):
  57. for row in table:
  58. writer.writerow(row)
  59. if index < len(tables) - 1:
  60. writer.writerow([])
  61. return buffer.getvalue().strip()
  62. def read_input(args: argparse.Namespace) -> str:
  63. if args.input_file:
  64. return str(Path(args.input_file).resolve())
  65. if args.stdin:
  66. html_text = sys.stdin.read()
  67. with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
  68. tmp_file.write(html_text)
  69. return tmp_file.name
  70. raise ValueError("Either --input-file or --stdin must be provided")
  71. def main() -> int:
  72. parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
  73. parser.add_argument("--input-file", help="Path to input file")
  74. parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
  75. parser.add_argument("--output", choices=["html", "csv"], default="html", help="Output format")
  76. args = parser.parse_args()
  77. temp_path = None
  78. try:
  79. source = read_input(args)
  80. if args.stdin:
  81. temp_path = source
  82. converter = DocumentConverter()
  83. result = converter.convert(source)
  84. markdown_output = result.document.export_to_markdown()
  85. if args.output == "csv":
  86. output_payload = normalize_tables_to_csv(markdown_output)
  87. else:
  88. output_payload = normalize_tables_to_html(markdown_output)
  89. sys.stdout.write(output_payload)
  90. return 0
  91. except Exception as exc: # pylint: disable=broad-except
  92. sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
  93. return 1
  94. finally:
  95. if temp_path:
  96. Path(temp_path).unlink(missing_ok=True)
  97. if __name__ == "__main__":
  98. raise SystemExit(main())