atsachlaris 1 день назад
Родитель
Коммит
a7c5103252
2 изменённых файла: 16 добавлений и 94 удаления
  1. 9 59
      scripts/docling_extract.py
  2. 7 35
      src/main/java/es/uv/saic/service/DoclingPythonClient.java

+ 9 - 59
scripts/docling_extract.py

@@ -1,44 +1,12 @@
 #!/usr/bin/env python3
-import argparse
 import csv
 import sys
 import tempfile
-from pathlib import Path
 from io import StringIO
+from pathlib import Path
 
 from docling.document_converter import DocumentConverter
 
-
-def normalize_tables_to_html(markdown_text: str) -> str:
-    # Docling returns markdown; downstream Java extractor expects HTML tables
-    # marked with data-extraction attribute.
-    lines = markdown_text.splitlines()
-    out_lines = []
-    in_table = False
-
-    for line in lines:
-        if "|" in line and line.strip().startswith("|") and line.strip().endswith("|"):
-            if not in_table:
-                out_lines.append('<table data-extraction="docling">')
-                in_table = True
-
-            cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
-            if all(cell.startswith("-") for cell in cells):
-                continue
-
-            row = "".join(f"<td>{cell}</td>" for cell in cells)
-            out_lines.append(f"<tr>{row}</tr>")
-        else:
-            if in_table:
-                out_lines.append("</table>")
-                in_table = False
-
-    if in_table:
-        out_lines.append("</table>")
-
-    return "\n".join(out_lines).strip()
-
-
 def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
     tables: list[list[list[str]]] = []
     current_table: list[list[str]] = []
@@ -82,40 +50,22 @@ def normalize_tables_to_csv(markdown_text: str) -> str:
     return csv_text.replace("\r\n", "\n").replace("\r", "\n")
 
 
-def read_input(args: argparse.Namespace) -> str:
-    if args.input_file:
-        return str(Path(args.input_file).resolve())
-
-    if args.stdin:
-        html_text = sys.stdin.read()
-        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
-            tmp_file.write(html_text)
-            return tmp_file.name
-
-    raise ValueError("Either --input-file or --stdin must be provided")
+def read_input_from_stdin() -> str:
+    html_text = sys.stdin.read()
+    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
+        tmp_file.write(html_text)
+        return tmp_file.name
 
 
 def main() -> int:
-    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
-    parser.add_argument("--input-file", help="Path to input file")
-    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
-    parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
-    args = parser.parse_args()
-
     temp_path = None
     try:
-        source = read_input(args)
-        if args.stdin:
-            temp_path = source
+        source = read_input_from_stdin()
+        temp_path = source
         converter = DocumentConverter()
         result = converter.convert(source)
         markdown_output = result.document.export_to_markdown()
-        if args.output == "markdown":
-            output_payload = markdown_output
-        elif args.output == "csv":
-            output_payload = normalize_tables_to_csv(markdown_output)
-        else:
-            output_payload = normalize_tables_to_html(markdown_output)
+        output_payload = normalize_tables_to_csv(markdown_output)
         sys.stdout.write(output_payload)
         return 0
     except Exception as exc:  # pylint: disable=broad-except

+ 7 - 35
src/main/java/es/uv/saic/service/DoclingPythonClient.java

@@ -6,7 +6,6 @@ import org.springframework.stereotype.Service;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
@@ -27,30 +26,19 @@ public class DoclingPythonClient {
             throw new IllegalStateException("Docling extractor is disabled");
         }
 
-        Path tempFile = null;
         try {
             List<String> command = new ArrayList<>();
             command.add(properties.getPythonCommand());
             command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
-            command.add("--output");
-            command.add("csv");
-
-            Process process;
-            if (request.hasFile()) {
-                tempFile = createTempInputFile(request);
-                command.add("--input-file");
-                command.add(tempFile.toString());
-                process = new ProcessBuilder(command).start();
-            } else if (request.hasRawHtml()) {
-                command.add("--stdin");
-                process = new ProcessBuilder(command).start();
-                process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
-                process.getOutputStream().flush();
-                process.getOutputStream().close();
-            } else {
-                throw new IllegalArgumentException("Unsupported extraction request: no input provided");
+            if (!request.hasRawHtml()) {
+                throw new IllegalArgumentException("Unsupported extraction request: no HTML payload provided");
             }
 
+            Process process = new ProcessBuilder(command).start();
+            process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
+            process.getOutputStream().flush();
+            process.getOutputStream().close();
+
             CompletableFuture<String> stdoutFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getInputStream()));
             CompletableFuture<String> stderrFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getErrorStream()));
 
@@ -74,14 +62,6 @@ public class DoclingPythonClient {
             throw new IllegalStateException("Docling extraction interrupted", e);
         } catch (ExecutionException e) {
             throw new IllegalStateException("Unable to read Docling extraction output", e);
-        } finally {
-            if (tempFile != null) {
-                try {
-                    Files.deleteIfExists(tempFile);
-                } catch (IOException ignored) {
-                    // Best effort cleanup.
-                }
-            }
         }
     }
 
@@ -93,12 +73,4 @@ public class DoclingPythonClient {
         }
     }
 
-    private static Path createTempInputFile(ExtractionRequest request) throws IOException {
-        String suffix = request.fileName() != null && request.fileName().contains(".")
-                ? request.fileName().substring(request.fileName().lastIndexOf('.'))
-                : ".bin";
-        Path tempPath = Files.createTempFile("docling-input-", suffix);
-        Files.write(tempPath, request.fileBytes());
-        return tempPath;
-    }
 }