|
|
@@ -0,0 +1,102 @@
|
|
|
+package es.uv.saic.service;
|
|
|
+
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.io.InputStream;
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
|
+import java.nio.file.Files;
|
|
|
+import java.nio.file.Path;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+import java.util.concurrent.CompletableFuture;
|
|
|
+import java.util.concurrent.ExecutionException;
|
|
|
+import java.util.concurrent.TimeUnit;
|
|
|
+
|
|
|
+@Service
|
|
|
+public class DoclingPythonClient {
|
|
|
+ private final DoclingProperties properties;
|
|
|
+
|
|
|
+ public DoclingPythonClient(DoclingProperties properties) {
|
|
|
+ this.properties = properties;
|
|
|
+ }
|
|
|
+
|
|
|
+ public String extractHtmlTables(ExtractionRequest request) {
|
|
|
+ if (!properties.isEnabled()) {
|
|
|
+ throw new IllegalStateException("Docling extractor is disabled");
|
|
|
+ }
|
|
|
+
|
|
|
+ Path tempFile = null;
|
|
|
+ try {
|
|
|
+ List<String> command = new ArrayList<>();
|
|
|
+ command.add(properties.getPythonCommand());
|
|
|
+ command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
|
|
|
+
|
|
|
+ Process process;
|
|
|
+ if (request.hasFile()) {
|
|
|
+ tempFile = createTempInputFile(request);
|
|
|
+ command.add("--input-file");
|
|
|
+ command.add(tempFile.toString());
|
|
|
+ process = new ProcessBuilder(command).start();
|
|
|
+ } else if (request.hasRawHtml()) {
|
|
|
+ command.add("--stdin");
|
|
|
+ process = new ProcessBuilder(command).start();
|
|
|
+ process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
|
|
|
+ process.getOutputStream().flush();
|
|
|
+ process.getOutputStream().close();
|
|
|
+ } else {
|
|
|
+ throw new IllegalArgumentException("Unsupported extraction request: no input provided");
|
|
|
+ }
|
|
|
+
|
|
|
+ CompletableFuture<String> stdoutFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getInputStream()));
|
|
|
+ CompletableFuture<String> stderrFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getErrorStream()));
|
|
|
+
|
|
|
+ boolean completed = process.waitFor(properties.getTimeoutMs(), TimeUnit.MILLISECONDS);
|
|
|
+ if (!completed) {
|
|
|
+ process.destroyForcibly();
|
|
|
+ throw new IllegalStateException("Docling extraction timed out after " + properties.getTimeoutMs() + "ms");
|
|
|
+ }
|
|
|
+
|
|
|
+ String stdout = stdoutFuture.get();
|
|
|
+ String stderr = stderrFuture.get();
|
|
|
+ if (process.exitValue() != 0) {
|
|
|
+ String message = StringUtils.abbreviate(StringUtils.defaultString(stderr), 1000);
|
|
|
+ throw new IllegalStateException("Docling extraction failed with exit code " + process.exitValue() + ": " + message);
|
|
|
+ }
|
|
|
+ return stdout;
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new IllegalStateException("Unable to run Docling python command", e);
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ throw new IllegalStateException("Docling extraction interrupted", e);
|
|
|
+ } catch (ExecutionException e) {
|
|
|
+ throw new IllegalStateException("Unable to read Docling extraction output", e);
|
|
|
+ } finally {
|
|
|
+ if (tempFile != null) {
|
|
|
+ try {
|
|
|
+ Files.deleteIfExists(tempFile);
|
|
|
+ } catch (IOException ignored) {
|
|
|
+ // Best effort cleanup.
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String readAsString(InputStream stream) {
|
|
|
+ try {
|
|
|
+ return new String(stream.readAllBytes(), StandardCharsets.UTF_8);
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new IllegalStateException("Unable to read process stream", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static Path createTempInputFile(ExtractionRequest request) throws IOException {
|
|
|
+ String suffix = request.fileName() != null && request.fileName().contains(".")
|
|
|
+ ? request.fileName().substring(request.fileName().lastIndexOf('.'))
|
|
|
+ : ".bin";
|
|
|
+ Path tempPath = Files.createTempFile("docling-input-", suffix);
|
|
|
+ Files.write(tempPath, request.fileBytes());
|
|
|
+ return tempPath;
|
|
|
+ }
|
|
|
+}
|