1 روز پیش · a7c5103252
--- a/scripts/docling_extract.py
+++ b/scripts/docling_extract.py
@@ -1,44 +1,12 @@
 
				 #!/usr/bin/env python3
			
 
				-import argparse
			
 
				 import csv
			
 
				 import sys
			
 
				 import tempfile
			
 
				-from pathlib import Path
			
 
				 from io import StringIO
			
 
				+from pathlib import Path
			
 
				 
			
 
				 from docling.document_converter import DocumentConverter
			
 
				 
			
 
				-
			
 
				-def normalize_tables_to_html(markdown_text: str) -> str:
			
 
				-    # Docling returns markdown; downstream Java extractor expects HTML tables
			
 
				-    # marked with data-extraction attribute.
			
 
				-    lines = markdown_text.splitlines()
			
 
				-    out_lines = []
			
 
				-    in_table = False
			
 
				-
			
 
				-    for line in lines:
			
 
				-        if "|" in line and line.strip().startswith("|") and line.strip().endswith("|"):
			
 
				-            if not in_table:
			
 
				-                out_lines.append('<table data-extraction="docling">')
			
 
				-                in_table = True
			
 
				-
			
 
				-            cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
			
 
				-            if all(cell.startswith("-") for cell in cells):
			
 
				-                continue
			
 
				-
			
 
				-            row = "".join(f"<td>{cell}</td>" for cell in cells)
			
 
				-            out_lines.append(f"<tr>{row}</tr>")
			
 
				-        else:
			
 
				-            if in_table:
			
 
				-                out_lines.append("</table>")
			
 
				-                in_table = False
			
 
				-
			
 
				-    if in_table:
			
 
				-        out_lines.append("</table>")
			
 
				-
			
 
				-    return "\n".join(out_lines).strip()
			
 
				-
			
 
				-
			
 
				 def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
			
 
				     tables: list[list[list[str]]] = []
			
 
				     current_table: list[list[str]] = []
			
@@ -82,40 +50,22 @@ def normalize_tables_to_csv(markdown_text: str) -> str:
 
				     return csv_text.replace("\r\n", "\n").replace("\r", "\n")
			
 
				 
			
 
				 
			
 
				-def read_input(args: argparse.Namespace) -> str:
			
 
				-    if args.input_file:
			
 
				-        return str(Path(args.input_file).resolve())
			
 
				-
			
 
				-    if args.stdin:
			
 
				-        html_text = sys.stdin.read()
			
 
				-        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
			
 
				-            tmp_file.write(html_text)
			
 
				-            return tmp_file.name
			
 
				-
			
 
				-    raise ValueError("Either --input-file or --stdin must be provided")
			
 
				+def read_input_from_stdin() -> str:
			
 
				+    html_text = sys.stdin.read()
			
 
				+    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", suffix=".html", delete=False) as tmp_file:
			
 
				+        tmp_file.write(html_text)
			
 
				+        return tmp_file.name
			
 
				 
			
 
				 
			
 
				 def main() -> int:
			
 
				-    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
			
 
				-    parser.add_argument("--input-file", help="Path to input file")
			
 
				-    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
			
 
				-    parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
			
 
				-    args = parser.parse_args()
			
 
				-
			
 
				     temp_path = None
			
 
				     try:
			
 
				-        source = read_input(args)
			
 
				-        if args.stdin:
			
 
				-            temp_path = source
			
 
				+        source = read_input_from_stdin()
			
 
				+        temp_path = source
			
 
				         converter = DocumentConverter()
			
 
				         result = converter.convert(source)
			
 
				         markdown_output = result.document.export_to_markdown()
			
 
				-        if args.output == "markdown":
			
 
				-            output_payload = markdown_output
			
 
				-        elif args.output == "csv":
			
 
				-            output_payload = normalize_tables_to_csv(markdown_output)
			
 
				-        else:
			
 
				-            output_payload = normalize_tables_to_html(markdown_output)
			
 
				+        output_payload = normalize_tables_to_csv(markdown_output)
			
 
				         sys.stdout.write(output_payload)
			
 
				         return 0
			
 
				     except Exception as exc:  # pylint: disable=broad-except
			
--- a/src/main/java/es/uv/saic/service/DoclingPythonClient.java
+++ b/src/main/java/es/uv/saic/service/DoclingPythonClient.java
@@ -6,7 +6,6 @@ import org.springframework.stereotype.Service;
 
				 import java.io.IOException;
			
 
				 import java.io.InputStream;
			
 
				 import java.nio.charset.StandardCharsets;
			
 
				-import java.nio.file.Files;
			
 
				 import java.nio.file.Path;
			
 
				 import java.util.ArrayList;
			
 
				 import java.util.List;
			
@@ -27,30 +26,19 @@ public class DoclingPythonClient {
 
				             throw new IllegalStateException("Docling extractor is disabled");
			
 
				         }
			
 
				 
			
 
				-        Path tempFile = null;
			
 
				         try {
			
 
				             List<String> command = new ArrayList<>();
			
 
				             command.add(properties.getPythonCommand());
			
 
				             command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
			
 
				-            command.add("--output");
			
 
				-            command.add("csv");
			
 
				-
			
 
				-            Process process;
			
 
				-            if (request.hasFile()) {
			
 
				-                tempFile = createTempInputFile(request);
			
 
				-                command.add("--input-file");
			
 
				-                command.add(tempFile.toString());
			
 
				-                process = new ProcessBuilder(command).start();
			
 
				-            } else if (request.hasRawHtml()) {
			
 
				-                command.add("--stdin");
			
 
				-                process = new ProcessBuilder(command).start();
			
 
				-                process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
			
 
				-                process.getOutputStream().flush();
			
 
				-                process.getOutputStream().close();
			
 
				-            } else {
			
 
				-                throw new IllegalArgumentException("Unsupported extraction request: no input provided");
			
 
				+            if (!request.hasRawHtml()) {
			
 
				+                throw new IllegalArgumentException("Unsupported extraction request: no HTML payload provided");
			
 
				             }
			
 
				 
			
 
				+            Process process = new ProcessBuilder(command).start();
			
 
				+            process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
			
 
				+            process.getOutputStream().flush();
			
 
				+            process.getOutputStream().close();
			
 
				+
			
 
				             CompletableFuture<String> stdoutFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getInputStream()));
			
 
				             CompletableFuture<String> stderrFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getErrorStream()));
			
 
				 
			
@@ -74,14 +62,6 @@ public class DoclingPythonClient {
 
				             throw new IllegalStateException("Docling extraction interrupted", e);
			
 
				         } catch (ExecutionException e) {
			
 
				             throw new IllegalStateException("Unable to read Docling extraction output", e);
			
 
				-        } finally {
			
 
				-            if (tempFile != null) {
			
 
				-                try {
			
 
				-                    Files.deleteIfExists(tempFile);
			
 
				-                } catch (IOException ignored) {
			
 
				-                    // Best effort cleanup.
			
 
				-                }
			
 
				-            }
			
 
				         }
			
 
				     }
			
 
				 
			
@@ -93,12 +73,4 @@ public class DoclingPythonClient {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    private static Path createTempInputFile(ExtractionRequest request) throws IOException {
			
 
				-        String suffix = request.fileName() != null && request.fileName().contains(".")
			
 
				-                ? request.fileName().substring(request.fileName().lastIndexOf('.'))
			
 
				-                : ".bin";
			
 
				-        Path tempPath = Files.createTempFile("docling-input-", suffix);
			
 
				-        Files.write(tempPath, request.fileBytes());
			
 
				-        return tempPath;
			
 
				-    }
			
 
				 }