atsachlaris 1 день назад
Родитель
Сommit
6842a4337a

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+docling

+ 124 - 0
scripts/docling_extract.py

@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import sys
+import tempfile
+from pathlib import Path
+from io import StringIO
+
+from docling.document_converter import DocumentConverter
+
+
def normalize_tables_to_html(markdown_text: str) -> str:
    """Convert the pipe tables found in Markdown text into HTML tables.

    Docling returns Markdown, while the downstream Java extractor expects HTML
    tables marked with a ``data-extraction`` attribute. Consecutive lines that
    start and end with ``|`` form one table; any other line closes the current
    table and is left out of the result.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        The generated ``<table data-extraction="docling">`` markup with one
        tag or row per line, or an empty string when no table line is found.
    """
    import html
    import re

    # A delimiter cell holds only dashes with optional alignment colons
    # (``---``, ``:---``, ``---:``, ``:---:``). Matching the whole cell keeps
    # data rows such as ``| -1 | -2 |`` from being discarded as a delimiter.
    delimiter_cell = re.compile(r":?-+:?")

    out_lines = []
    in_table = False

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not (stripped.startswith("|") and stripped.endswith("|")):
            if in_table:
                out_lines.append("</table>")
                in_table = False
            continue

        if not in_table:
            out_lines.append('<table data-extraction="docling">')
            in_table = True

        # Drop exactly one border pipe per side so empty edge cells survive.
        cells = [cell.strip() for cell in stripped[1:-1].split("|")]
        if all(delimiter_cell.fullmatch(cell) for cell in cells):
            continue

        # Unescape first so text that already carries entities is not escaped
        # twice, then escape so ``<`` and ``&`` cannot break the markup.
        row = "".join(
            f"<td>{html.escape(html.unescape(cell), quote=False)}</td>"
            for cell in cells
        )
        out_lines.append(f"<tr>{row}</tr>")

    if in_table:
        out_lines.append("</table>")

    return "\n".join(out_lines).strip()
+
+
def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
    """Extract every pipe table from Markdown text as rows of cell strings.

    A table is a run of consecutive lines that start and end with ``|``; any
    other line ends the current table. Delimiter rows (``|---|:---:|``) are
    skipped, so each table holds only its header and data rows.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        One entry per table, each a list of rows, each row a list of
        whitespace-trimmed cell strings. Empty when no table is present.
    """
    import re

    # A delimiter cell holds only dashes with optional alignment colons.
    # Matching the whole cell keeps rows such as ``| -1 | -2 |`` as data.
    delimiter_cell = re.compile(r":?-+:?")

    tables: list[list[list[str]]] = []
    current_table: list[list[str]] = []

    for line in markdown_text.splitlines():
        stripped = line.strip()
        if not (stripped.startswith("|") and stripped.endswith("|")):
            # A non-table line terminates the table being collected, if any.
            if current_table:
                tables.append(current_table)
                current_table = []
            continue

        # Drop exactly one border pipe per side so empty edge cells survive.
        cells = [cell.strip() for cell in stripped[1:-1].split("|")]
        if all(delimiter_cell.fullmatch(cell) for cell in cells):
            continue
        current_table.append(cells)

    if current_table:
        tables.append(current_table)

    return tables
+
+
def normalize_tables_to_csv(markdown_text: str) -> str:
    """Render the Markdown pipe tables in *markdown_text* as CSV text.

    Tables are written one after another and separated by a single empty CSV
    row. Surrounding whitespace is stripped from the final text.

    Args:
        markdown_text: Markdown text that may contain pipe tables.

    Returns:
        The CSV text, or an empty string when no table is found.
    """
    parsed = parse_markdown_tables(markdown_text)
    if len(parsed) == 0:
        return ""

    sink = StringIO()
    csv_out = csv.writer(sink)
    is_first = True
    for rows in parsed:
        if not is_first:
            # An empty row marks the boundary between two tables.
            csv_out.writerow([])
        is_first = False
        csv_out.writerows(rows)

    return sink.getvalue().strip()
+
+
def read_input(args: argparse.Namespace) -> str:
    """Resolve the parsed CLI arguments to a filesystem path to convert.

    ``--input-file`` takes precedence over ``--stdin``. With ``--stdin`` the
    payload is copied into a temporary ``.html`` file that the caller is
    responsible for deleting.

    Args:
        args: Parsed arguments exposing ``input_file`` and ``stdin``.

    Returns:
        The absolute path of the input file, or the path of the temporary
        file holding the stdin payload.

    Raises:
        ValueError: If neither ``--input-file`` nor ``--stdin`` was given.
    """
    if args.input_file:
        return str(Path(args.input_file).resolve())

    if args.stdin:
        # Copy raw bytes: stdin is documented as UTF-8, and decoding it with
        # the locale encoding (text-mode ``sys.stdin``) could corrupt it.
        payload = sys.stdin.buffer.read()
        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".html", delete=False)
        try:
            with tmp_file:
                tmp_file.write(payload)
        except BaseException:
            # delete=False means nobody else removes the file if the write fails.
            Path(tmp_file.name).unlink(missing_ok=True)
            raise
        return tmp_file.name

    raise ValueError("Either --input-file or --stdin must be provided")
+
+
def main() -> int:
    """Run the command-line entry point.

    Parses the arguments, converts the input with Docling, and writes the
    extracted tables to stdout as HTML (default) or CSV.

    Returns:
        ``0`` on success. ``1`` when any step fails; the error is then written
        to stderr prefixed with ``DOCLING_ERROR:``.
    """
    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
    parser.add_argument("--input-file", help="Path to input file")
    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
    parser.add_argument("--output", choices=["html", "csv"], default="html", help="Output format")
    args = parser.parse_args()

    # Emit UTF-8 regardless of the locale; the invoking process is expected to
    # decode this script's output as UTF-8.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8")

    temp_path = None
    try:
        source = read_input(args)
        # read_input prefers --input-file over --stdin, so the path is only a
        # disposable temp file when it really came from stdin. Without this
        # check, passing both flags would delete the caller's input file.
        if args.stdin and not args.input_file:
            temp_path = source
        converter = DocumentConverter()
        result = converter.convert(source)
        markdown_output = result.document.export_to_markdown()
        if args.output == "csv":
            output_payload = normalize_tables_to_csv(markdown_output)
        else:
            output_payload = normalize_tables_to_html(markdown_output)
        sys.stdout.write(output_payload)
        return 0
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report the failure and signal it via exit code.
        sys.stderr.write(f"DOCLING_ERROR: {exc}\n")
        return 1
    finally:
        if temp_path:
            Path(temp_path).unlink(missing_ok=True)


if __name__ == "__main__":
    raise SystemExit(main())

+ 43 - 0
src/main/java/es/uv/saic/service/DoclingProperties.java

@@ -0,0 +1,43 @@
+package es.uv.saic.service;
+
+import org.springframework.boot.context.properties.ConfigurationProperties;
+
+@ConfigurationProperties(prefix = "extractor.docling")
+public class DoclingProperties {
+    private boolean enabled = false;
+    private String pythonCommand = "python";
+    private String scriptPath = "scripts/docling_extract.py";
+    private long timeoutMs = 30000;
+
+    public boolean isEnabled() {
+        return enabled;
+    }
+
+    public void setEnabled(boolean enabled) {
+        this.enabled = enabled;
+    }
+
+    public String getPythonCommand() {
+        return pythonCommand;
+    }
+
+    public void setPythonCommand(String pythonCommand) {
+        this.pythonCommand = pythonCommand;
+    }
+
+    public String getScriptPath() {
+        return scriptPath;
+    }
+
+    public void setScriptPath(String scriptPath) {
+        this.scriptPath = scriptPath;
+    }
+
+    public long getTimeoutMs() {
+        return timeoutMs;
+    }
+
+    public void setTimeoutMs(long timeoutMs) {
+        this.timeoutMs = timeoutMs;
+    }
+}

+ 102 - 0
src/main/java/es/uv/saic/service/DoclingPythonClient.java

@@ -0,0 +1,102 @@
+package es.uv.saic.service;
+
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+@Service
+public class DoclingPythonClient {
+    private final DoclingProperties properties;
+
+    public DoclingPythonClient(DoclingProperties properties) {
+        this.properties = properties;
+    }
+
+    public String extractHtmlTables(ExtractionRequest request) {
+        if (!properties.isEnabled()) {
+            throw new IllegalStateException("Docling extractor is disabled");
+        }
+
+        Path tempFile = null;
+        try {
+            List<String> command = new ArrayList<>();
+            command.add(properties.getPythonCommand());
+            command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
+
+            Process process;
+            if (request.hasFile()) {
+                tempFile = createTempInputFile(request);
+                command.add("--input-file");
+                command.add(tempFile.toString());
+                process = new ProcessBuilder(command).start();
+            } else if (request.hasRawHtml()) {
+                command.add("--stdin");
+                process = new ProcessBuilder(command).start();
+                process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
+                process.getOutputStream().flush();
+                process.getOutputStream().close();
+            } else {
+                throw new IllegalArgumentException("Unsupported extraction request: no input provided");
+            }
+
+            CompletableFuture<String> stdoutFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getInputStream()));
+            CompletableFuture<String> stderrFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getErrorStream()));
+
+            boolean completed = process.waitFor(properties.getTimeoutMs(), TimeUnit.MILLISECONDS);
+            if (!completed) {
+                process.destroyForcibly();
+                throw new IllegalStateException("Docling extraction timed out after " + properties.getTimeoutMs() + "ms");
+            }
+
+            String stdout = stdoutFuture.get();
+            String stderr = stderrFuture.get();
+            if (process.exitValue() != 0) {
+                String message = StringUtils.abbreviate(StringUtils.defaultString(stderr), 1000);
+                throw new IllegalStateException("Docling extraction failed with exit code " + process.exitValue() + ": " + message);
+            }
+            return stdout;
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to run Docling python command", e);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("Docling extraction interrupted", e);
+        } catch (ExecutionException e) {
+            throw new IllegalStateException("Unable to read Docling extraction output", e);
+        } finally {
+            if (tempFile != null) {
+                try {
+                    Files.deleteIfExists(tempFile);
+                } catch (IOException ignored) {
+                    // Best effort cleanup.
+                }
+            }
+        }
+    }
+
+    private static String readAsString(InputStream stream) {
+        try {
+            return new String(stream.readAllBytes(), StandardCharsets.UTF_8);
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to read process stream", e);
+        }
+    }
+
+    private static Path createTempInputFile(ExtractionRequest request) throws IOException {
+        String suffix = request.fileName() != null && request.fileName().contains(".")
+                ? request.fileName().substring(request.fileName().lastIndexOf('.'))
+                : ".bin";
+        Path tempPath = Files.createTempFile("docling-input-", suffix);
+        Files.write(tempPath, request.fileBytes());
+        return tempPath;
+    }
+}

+ 60 - 0
src/main/java/es/uv/saic/service/DoclingTableExtractor.java

@@ -0,0 +1,60 @@
+package es.uv.saic.service;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.stereotype.Service;
+
+@Service
+public class DoclingTableExtractor implements TableExtractor {
+    private final DoclingPythonClient doclingPythonClient;
+    private final HtmlToCsvExtractor htmlToCsvExtractor;
+    private final DoclingProperties doclingProperties;
+
+    public DoclingTableExtractor(
+            DoclingPythonClient doclingPythonClient,
+            HtmlToCsvExtractor htmlToCsvExtractor,
+            DoclingProperties doclingProperties
+    ) {
+        this.doclingPythonClient = doclingPythonClient;
+        this.htmlToCsvExtractor = htmlToCsvExtractor;
+        this.doclingProperties = doclingProperties;
+    }
+
+    @Override
+    public boolean supports(ExtractionRequest request) {
+        return doclingProperties.isEnabled() && request.hasRawHtml();
+    }
+
+    @Override
+    public String extractTablesToCsv(ExtractionRequest request) {
+        String filteredHtml = keepOnlyDataExtractionTables(request.rawHtml());
+        if (StringUtils.isBlank(filteredHtml)) {
+            return "";
+        }
+
+        String normalizedHtml = doclingPythonClient.extractHtmlTables(ExtractionRequest.fromHtml(filteredHtml));
+        if (StringUtils.isBlank(normalizedHtml)) {
+            return "";
+        }
+        return htmlToCsvExtractor.extractTablesToCsv(normalizedHtml);
+    }
+
+    private static String keepOnlyDataExtractionTables(String html) {
+        if (StringUtils.isBlank(html)) {
+            return "";
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements tables = doc.select("table[data-extraction]");
+        if (tables.isEmpty()) {
+            return "";
+        }
+
+        Element body = new Element("body");
+        tables.forEach(table -> body.appendChild(table.clone()));
+        return body.html();
+    }
+}

+ 29 - 0
src/main/java/es/uv/saic/service/ExtractionRequest.java

@@ -0,0 +1,29 @@
+package es.uv.saic.service;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Arrays;
+
+public record ExtractionRequest(
+        String rawHtml,
+        byte[] fileBytes,
+        String fileName,
+        String contentType
+) {
+    public static ExtractionRequest fromHtml(String rawHtml) {
+        return new ExtractionRequest(rawHtml, null, null, "text/html");
+    }
+
+    public static ExtractionRequest fromFile(byte[] fileBytes, String fileName, String contentType) {
+        byte[] safeBytes = fileBytes == null ? null : Arrays.copyOf(fileBytes, fileBytes.length);
+        return new ExtractionRequest(null, safeBytes, fileName, contentType);
+    }
+
+    public boolean hasRawHtml() {
+        return StringUtils.isNotBlank(rawHtml);
+    }
+
+    public boolean hasFile() {
+        return fileBytes != null && fileBytes.length > 0;
+    }
+}

+ 7 - 0
src/main/java/es/uv/saic/service/TableExtractor.java

@@ -0,0 +1,7 @@
+package es.uv.saic.service;
+
+public interface TableExtractor {
+    boolean supports(ExtractionRequest request);
+
+    String extractTablesToCsv(ExtractionRequest request);
+}

+ 30 - 0
src/test/java/es/uv/saic/service/DoclingPythonClientTest.java

@@ -0,0 +1,30 @@
+package es.uv.saic.service;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+class DoclingPythonClientTest {
+
+    @Test
+    void extractHtmlTables_throwsWhenDisabled() {
+        DoclingProperties properties = new DoclingProperties();
+        properties.setEnabled(false);
+
+        DoclingPythonClient client = new DoclingPythonClient(properties);
+
+        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
+    }
+
+    @Test
+    void extractHtmlTables_throwsOnInvalidPythonCommand() {
+        DoclingProperties properties = new DoclingProperties();
+        properties.setEnabled(true);
+        properties.setPythonCommand("python-command-that-does-not-exist");
+        properties.setScriptPath("scripts/docling_extract.py");
+
+        DoclingPythonClient client = new DoclingPythonClient(properties);
+
+        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
+    }
+}

+ 41 - 0
src/test/java/es/uv/saic/service/DoclingTableExtractorTest.java

@@ -0,0 +1,41 @@
+package es.uv.saic.service;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.ArgumentMatchers.argThat;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+class DoclingTableExtractorTest {
+
+    @Test
+    void extractTablesToCsv_sendsOnlyDataExtractionTablesToDocling() {
+        DoclingPythonClient pythonClient = mock(DoclingPythonClient.class);
+        HtmlToCsvExtractor htmlToCsvExtractor = mock(HtmlToCsvExtractor.class);
+        DoclingProperties properties = new DoclingProperties();
+        properties.setEnabled(true);
+        DoclingTableExtractor extractor = new DoclingTableExtractor(pythonClient, htmlToCsvExtractor, properties);
+
+        String html = """
+                <html><body>
+                <table><tr><td>ignore</td></tr></table>
+                <table data-extraction="a"><tr><td>keep</td></tr></table>
+                </body></html>
+                """;
+
+        when(pythonClient.extractHtmlTables(argThat(req ->
+                req.hasRawHtml() &&
+                        req.rawHtml().contains("data-extraction") &&
+                        !req.rawHtml().contains("<table><tr><td>ignore")
+        ))).thenReturn("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>");
+        when(htmlToCsvExtractor.extractTablesToCsv("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>"))
+                .thenReturn("csv");
+
+        String result = extractor.extractTablesToCsv(ExtractionRequest.fromHtml(html));
+
+        assertEquals("csv", result);
+        verify(pythonClient).extractHtmlTables(argThat(req -> req.hasRawHtml() && !req.hasFile()));
+    }
+}

+ 69 - 0
src/test/java/es/uv/saic/service/EnhancementServiceRoutingTest.java

@@ -0,0 +1,69 @@
+package es.uv.saic.service;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.ai.chat.client.ChatClient;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.argThat;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+class EnhancementServiceRoutingTest {
+
+    private HtmlToCsvExtractor htmlToCsvExtractor;
+    private DoclingTableExtractor doclingTableExtractor;
+    private EnhancementService enhancementService;
+
+    @BeforeEach
+    void setUp() {
+        ChatClient.Builder builder = mock(ChatClient.Builder.class);
+        ChatClient chatClient = mock(ChatClient.class);
+        when(builder.build()).thenReturn(chatClient);
+
+        htmlToCsvExtractor = mock(HtmlToCsvExtractor.class);
+        doclingTableExtractor = mock(DoclingTableExtractor.class);
+        enhancementService = new EnhancementService(builder, htmlToCsvExtractor, doclingTableExtractor);
+    }
+
+    @Test
+    void extractForChatEndpoint_usesDoclingWhenSupported() {
+        ExtractionRequest request = ExtractionRequest.fromHtml("<table data-extraction='x'></table>");
+        when(doclingTableExtractor.supports(request)).thenReturn(true);
+        when(doclingTableExtractor.extractTablesToCsv(request)).thenReturn("csv-docling");
+
+        String result = enhancementService.extractForChatEndpoint(request);
+
+        assertEquals("csv-docling", result);
+        verify(doclingTableExtractor).extractTablesToCsv(request);
+    }
+
+    @Test
+    void extractForChatEndpoint_fallsBackToHtmlWhenDoclingNotSupported() {
+        ExtractionRequest request = ExtractionRequest.fromHtml("<html>no tables</html>");
+        when(doclingTableExtractor.supports(request)).thenReturn(false);
+        when(htmlToCsvExtractor.extractTablesToCsv(request)).thenReturn("csv-html");
+
+        String result = enhancementService.extractForChatEndpoint(request);
+
+        assertEquals("csv-html", result);
+        verify(htmlToCsvExtractor).extractTablesToCsv(request);
+    }
+
+    @Test
+    void extractForFileEndpoint_routesAsHtmlStringToDocling() {
+        byte[] fileBytes = "<table data-extraction='x'><tr><td>a</td></tr></table>".getBytes(StandardCharsets.UTF_8);
+        ExtractionRequest request = ExtractionRequest.fromFile(fileBytes, "input.html", "text/html");
+        when(doclingTableExtractor.supports(any(ExtractionRequest.class))).thenReturn(true);
+        when(doclingTableExtractor.extractTablesToCsv(any(ExtractionRequest.class))).thenReturn("csv-docling");
+
+        String result = enhancementService.extractForFileEndpoint(request);
+
+        assertEquals("csv-docling", result);
+        verify(doclingTableExtractor).extractTablesToCsv(argThat(r -> r.hasRawHtml() && !r.hasFile()));
+    }
+}