1 день назад · 6842a4337a
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1 @@
 
				+docling
			
--- a/scripts/docling_extract.py
+++ b/scripts/docling_extract.py
@@ -0,0 +1,124 @@
 
				+#!/usr/bin/env python3
			
 
				+import argparse
			
 
				+import csv
			
 
				+import sys
			
 
				+import tempfile
			
 
				+from pathlib import Path
			
 
				+from io import StringIO
			
 
				+
			
 
				+from docling.document_converter import DocumentConverter
			
 
				+
			
 
				+
			
 
				def normalize_tables_to_html(markdown_text: str) -> str:
				    """Convert every Markdown pipe table in *markdown_text* to an HTML table.

				    Docling returns Markdown, while the downstream Java extractor expects
				    HTML tables marked with a ``data-extraction`` attribute. Each run of
				    consecutive lines that start and end with ``|`` becomes one
				    ``<table data-extraction="docling">`` element; every other line is
				    discarded.

				    Args:
				        markdown_text: Markdown document that may contain pipe tables.

				    Returns:
				        The generated tables joined by newlines, or ``""`` when the input
				        contains no table lines.
				    """
				    import re
				    from html import escape, unescape

				    # A delimiter cell is dashes with optional alignment colons (---, :--, --:,
				    # :-:). Matching the whole cell keeps data such as "-1" from being
				    # mistaken for a delimiter and dropped.
				    delimiter_cell = re.compile(r":?-+:?")

				    out_lines: list[str] = []
				    in_table = False

				    for line in markdown_text.splitlines():
				        stripped = line.strip()
				        if not (stripped.startswith("|") and stripped.endswith("|")):
				            if in_table:
				                out_lines.append("</table>")
				                in_table = False
				            continue

				        if not in_table:
				            out_lines.append('<table data-extraction="docling">')
				            in_table = True

				        # Remove exactly one framing pipe per side so empty edge cells survive.
				        cells = [cell.strip() for cell in stripped[1:-1].split("|")]
				        if all(delimiter_cell.fullmatch(cell) for cell in cells):
				            continue

				        # Cell text is document content, not markup: decode any entities the
				        # Markdown already carries, then escape once so "<" and "&" cannot
				        # break or inject into the generated HTML.
				        row = "".join(
				            f"<td>{escape(unescape(cell), quote=False)}</td>" for cell in cells
				        )
				        out_lines.append(f"<tr>{row}</tr>")

				    if in_table:
				        out_lines.append("</table>")

				    return "\n".join(out_lines).strip()
			
 
				+
			
 
				+
			
 
				def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
				    """Extract the rows of every Markdown pipe table in *markdown_text*.

				    A table is a run of consecutive lines that start and end with ``|``.
				    The delimiter row between header and body is dropped; header and data
				    rows are returned alike, with surrounding whitespace stripped from
				    each cell.

				    Args:
				        markdown_text: Markdown document that may contain pipe tables.

				    Returns:
				        One entry per table, each a list of rows, each row a list of cell
				        strings. Empty when no table is found.
				    """
				    import re

				    # A delimiter cell is dashes with optional alignment colons (---, :--, --:,
				    # :-:). Matching the whole cell keeps data such as "-1" from being
				    # mistaken for a delimiter and dropped.
				    delimiter_cell = re.compile(r":?-+:?")

				    tables: list[list[list[str]]] = []
				    current_table: list[list[str]] = []

				    for line in markdown_text.splitlines():
				        stripped = line.strip()
				        if not (stripped.startswith("|") and stripped.endswith("|")):
				            # Any non-table line terminates the table being collected.
				            if current_table:
				                tables.append(current_table)
				                current_table = []
				            continue

				        # Remove exactly one framing pipe per side so empty edge cells survive.
				        cells = [cell.strip() for cell in stripped[1:-1].split("|")]
				        if all(delimiter_cell.fullmatch(cell) for cell in cells):
				            continue
				        current_table.append(cells)

				    if current_table:
				        tables.append(current_table)

				    return tables
			
 
				+
			
 
				+
			
 
				def normalize_tables_to_csv(markdown_text: str) -> str:
				    """Render every Markdown pipe table in *markdown_text* as CSV text.

				    Tables are written in document order with one empty CSV record between
				    consecutive tables. Leading and trailing whitespace is stripped from
				    the final text.

				    Args:
				        markdown_text: Markdown document that may contain pipe tables.

				    Returns:
				        The CSV text, or ``""`` when no table is found.
				    """
				    parsed = parse_markdown_tables(markdown_text)
				    if len(parsed) == 0:
				        return ""

				    sink = StringIO()
				    csv_writer = csv.writer(sink)
				    for position, rows in enumerate(parsed):
				        # Emit the separator before every table except the first one.
				        if position > 0:
				            csv_writer.writerow([])
				        csv_writer.writerows(rows)

				    return sink.getvalue().strip()
			
 
				+
			
 
				+
			
 
				def read_input(args: argparse.Namespace) -> str:
				    """Return a filesystem path holding the document to convert.

				    ``--input-file`` takes precedence over ``--stdin``. For ``--stdin`` the
				    raw bytes are copied unchanged into a new ``.html`` temporary file; the
				    caller owns that file and must delete it.

				    Args:
				        args: Parsed command-line arguments providing ``input_file`` and
				            ``stdin``.

				    Returns:
				        The resolved input-file path, or the path of the temporary file
				        created from stdin.

				    Raises:
				        ValueError: If neither ``--input-file`` nor ``--stdin`` was given.
				        OSError: If the temporary file cannot be written.
				    """
				    if args.input_file:
				        return str(Path(args.input_file).resolve())

				    if args.stdin:
				        # The caller sends UTF-8 bytes. Copying them verbatim avoids decoding
				        # with the platform locale (e.g. cp1252 on Windows), which would
				        # corrupt non-ASCII text.
				        payload = sys.stdin.buffer.read()
				        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".html", delete=False)
				        try:
				            with tmp_file:
				                tmp_file.write(payload)
				        except OSError:
				            # Nobody else knows this path yet, so clean up before failing.
				            Path(tmp_file.name).unlink(missing_ok=True)
				            raise
				        return tmp_file.name

				    raise ValueError("Either --input-file or --stdin must be provided")
			
 
				+
			
 
				+
			
 
				def main() -> int:
				    """Run the command-line extraction and return the process exit code.

				    Converts the input with Docling, exports it to Markdown and writes the
				    tables found there to stdout as HTML (default) or CSV.

				    Returns:
				        ``0`` on success; ``1`` after writing a ``DOCLING_ERROR:`` line to
				        stderr when anything fails.
				    """
				    parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
				    parser.add_argument("--input-file", help="Path to input file")
				    parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
				    parser.add_argument("--output", choices=["html", "csv"], default="html", help="Output format")
				    args = parser.parse_args()

				    temp_path = None
				    try:
				        source = read_input(args)
				        # read_input prefers --input-file, so a temporary file exists only
				        # when stdin was actually used. Without the second check, passing
				        # both flags would delete the caller's own input file below.
				        if args.stdin and not args.input_file:
				            temp_path = source

				        converter = DocumentConverter()
				        result = converter.convert(source)
				        markdown_output = result.document.export_to_markdown()
				        if args.output == "csv":
				            output_payload = normalize_tables_to_csv(markdown_output)
				        else:
				            output_payload = normalize_tables_to_html(markdown_output)

				        # The Java caller decodes stdout as UTF-8, so bypass the
				        # locale-dependent text layer.
				        sys.stdout.flush()
				        sys.stdout.buffer.write(output_payload.encode("utf-8"))
				        sys.stdout.buffer.flush()
				        return 0
				    except Exception as exc:  # pylint: disable=broad-except
				        # Top-level boundary: report the failure on stderr and signal it
				        # through the exit code instead of a traceback.
				        sys.stderr.flush()
				        sys.stderr.buffer.write(f"DOCLING_ERROR: {exc}\n".encode("utf-8", errors="replace"))
				        sys.stderr.buffer.flush()
				        return 1
				    finally:
				        if temp_path:
				            Path(temp_path).unlink(missing_ok=True)
			
 
				+
			
 
				+
			
 
				# Script entry point: propagate main()'s return value as the exit status.
				if __name__ == "__main__":
				    sys.exit(main())
			
--- a/src/main/java/es/uv/saic/service/DoclingProperties.java
+++ b/src/main/java/es/uv/saic/service/DoclingProperties.java
@@ -0,0 +1,43 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.springframework.boot.context.properties.ConfigurationProperties;
			
 
				+
			
 
				+@ConfigurationProperties(prefix = "extractor.docling")
			
 
				+public class DoclingProperties {
			
 
				+    private boolean enabled = false;
			
 
				+    private String pythonCommand = "python";
			
 
				+    private String scriptPath = "scripts/docling_extract.py";
			
 
				+    private long timeoutMs = 30000;
			
 
				+
			
 
				+    public boolean isEnabled() {
			
 
				+        return enabled;
			
 
				+    }
			
 
				+
			
 
				+    public void setEnabled(boolean enabled) {
			
 
				+        this.enabled = enabled;
			
 
				+    }
			
 
				+
			
 
				+    public String getPythonCommand() {
			
 
				+        return pythonCommand;
			
 
				+    }
			
 
				+
			
 
				+    public void setPythonCommand(String pythonCommand) {
			
 
				+        this.pythonCommand = pythonCommand;
			
 
				+    }
			
 
				+
			
 
				+    public String getScriptPath() {
			
 
				+        return scriptPath;
			
 
				+    }
			
 
				+
			
 
				+    public void setScriptPath(String scriptPath) {
			
 
				+        this.scriptPath = scriptPath;
			
 
				+    }
			
 
				+
			
 
				+    public long getTimeoutMs() {
			
 
				+        return timeoutMs;
			
 
				+    }
			
 
				+
			
 
				+    public void setTimeoutMs(long timeoutMs) {
			
 
				+        this.timeoutMs = timeoutMs;
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/es/uv/saic/service/DoclingPythonClient.java
+++ b/src/main/java/es/uv/saic/service/DoclingPythonClient.java
@@ -0,0 +1,102 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.apache.commons.lang3.StringUtils;
			
 
				+import org.springframework.stereotype.Service;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.io.InputStream;
			
 
				+import java.nio.charset.StandardCharsets;
			
 
				+import java.nio.file.Files;
			
 
				+import java.nio.file.Path;
			
 
				+import java.util.ArrayList;
			
 
				+import java.util.List;
			
 
				+import java.util.concurrent.CompletableFuture;
			
 
				+import java.util.concurrent.ExecutionException;
			
 
				+import java.util.concurrent.TimeUnit;
			
 
				+
			
 
				+@Service
			
 
				+public class DoclingPythonClient {
			
 
				+    private final DoclingProperties properties;
			
 
				+
			
 
				+    public DoclingPythonClient(DoclingProperties properties) {
			
 
				+        this.properties = properties;
			
 
				+    }
			
 
				+
			
 
				+    public String extractHtmlTables(ExtractionRequest request) {
			
 
				+        if (!properties.isEnabled()) {
			
 
				+            throw new IllegalStateException("Docling extractor is disabled");
			
 
				+        }
			
 
				+
			
 
				+        Path tempFile = null;
			
 
				+        try {
			
 
				+            List<String> command = new ArrayList<>();
			
 
				+            command.add(properties.getPythonCommand());
			
 
				+            command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
			
 
				+
			
 
				+            Process process;
			
 
				+            if (request.hasFile()) {
			
 
				+                tempFile = createTempInputFile(request);
			
 
				+                command.add("--input-file");
			
 
				+                command.add(tempFile.toString());
			
 
				+                process = new ProcessBuilder(command).start();
			
 
				+            } else if (request.hasRawHtml()) {
			
 
				+                command.add("--stdin");
			
 
				+                process = new ProcessBuilder(command).start();
			
 
				+                process.getOutputStream().write(request.rawHtml().getBytes(StandardCharsets.UTF_8));
			
 
				+                process.getOutputStream().flush();
			
 
				+                process.getOutputStream().close();
			
 
				+            } else {
			
 
				+                throw new IllegalArgumentException("Unsupported extraction request: no input provided");
			
 
				+            }
			
 
				+
			
 
				+            CompletableFuture<String> stdoutFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getInputStream()));
			
 
				+            CompletableFuture<String> stderrFuture = CompletableFuture.supplyAsync(() -> readAsString(process.getErrorStream()));
			
 
				+
			
 
				+            boolean completed = process.waitFor(properties.getTimeoutMs(), TimeUnit.MILLISECONDS);
			
 
				+            if (!completed) {
			
 
				+                process.destroyForcibly();
			
 
				+                throw new IllegalStateException("Docling extraction timed out after " + properties.getTimeoutMs() + "ms");
			
 
				+            }
			
 
				+
			
 
				+            String stdout = stdoutFuture.get();
			
 
				+            String stderr = stderrFuture.get();
			
 
				+            if (process.exitValue() != 0) {
			
 
				+                String message = StringUtils.abbreviate(StringUtils.defaultString(stderr), 1000);
			
 
				+                throw new IllegalStateException("Docling extraction failed with exit code " + process.exitValue() + ": " + message);
			
 
				+            }
			
 
				+            return stdout;
			
 
				+        } catch (IOException e) {
			
 
				+            throw new IllegalStateException("Unable to run Docling python command", e);
			
 
				+        } catch (InterruptedException e) {
			
 
				+            Thread.currentThread().interrupt();
			
 
				+            throw new IllegalStateException("Docling extraction interrupted", e);
			
 
				+        } catch (ExecutionException e) {
			
 
				+            throw new IllegalStateException("Unable to read Docling extraction output", e);
			
 
				+        } finally {
			
 
				+            if (tempFile != null) {
			
 
				+                try {
			
 
				+                    Files.deleteIfExists(tempFile);
			
 
				+                } catch (IOException ignored) {
			
 
				+                    // Best effort cleanup.
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    private static String readAsString(InputStream stream) {
			
 
				+        try {
			
 
				+            return new String(stream.readAllBytes(), StandardCharsets.UTF_8);
			
 
				+        } catch (IOException e) {
			
 
				+            throw new IllegalStateException("Unable to read process stream", e);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    private static Path createTempInputFile(ExtractionRequest request) throws IOException {
			
 
				+        String suffix = request.fileName() != null && request.fileName().contains(".")
			
 
				+                ? request.fileName().substring(request.fileName().lastIndexOf('.'))
			
 
				+                : ".bin";
			
 
				+        Path tempPath = Files.createTempFile("docling-input-", suffix);
			
 
				+        Files.write(tempPath, request.fileBytes());
			
 
				+        return tempPath;
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/es/uv/saic/service/DoclingTableExtractor.java
+++ b/src/main/java/es/uv/saic/service/DoclingTableExtractor.java
@@ -0,0 +1,60 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.apache.commons.lang3.StringUtils;
			
 
				+import org.jsoup.Jsoup;
			
 
				+import org.jsoup.nodes.Document;
			
 
				+import org.jsoup.nodes.Element;
			
 
				+import org.jsoup.select.Elements;
			
 
				+import org.springframework.stereotype.Service;
			
 
				+
			
 
				+@Service
			
 
				+public class DoclingTableExtractor implements TableExtractor {
			
 
				+    private final DoclingPythonClient doclingPythonClient;
			
 
				+    private final HtmlToCsvExtractor htmlToCsvExtractor;
			
 
				+    private final DoclingProperties doclingProperties;
			
 
				+
			
 
				+    public DoclingTableExtractor(
			
 
				+            DoclingPythonClient doclingPythonClient,
			
 
				+            HtmlToCsvExtractor htmlToCsvExtractor,
			
 
				+            DoclingProperties doclingProperties
			
 
				+    ) {
			
 
				+        this.doclingPythonClient = doclingPythonClient;
			
 
				+        this.htmlToCsvExtractor = htmlToCsvExtractor;
			
 
				+        this.doclingProperties = doclingProperties;
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public boolean supports(ExtractionRequest request) {
			
 
				+        return doclingProperties.isEnabled() && request.hasRawHtml();
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public String extractTablesToCsv(ExtractionRequest request) {
			
 
				+        String filteredHtml = keepOnlyDataExtractionTables(request.rawHtml());
			
 
				+        if (StringUtils.isBlank(filteredHtml)) {
			
 
				+            return "";
			
 
				+        }
			
 
				+
			
 
				+        String normalizedHtml = doclingPythonClient.extractHtmlTables(ExtractionRequest.fromHtml(filteredHtml));
			
 
				+        if (StringUtils.isBlank(normalizedHtml)) {
			
 
				+            return "";
			
 
				+        }
			
 
				+        return htmlToCsvExtractor.extractTablesToCsv(normalizedHtml);
			
 
				+    }
			
 
				+
			
 
				+    private static String keepOnlyDataExtractionTables(String html) {
			
 
				+        if (StringUtils.isBlank(html)) {
			
 
				+            return "";
			
 
				+        }
			
 
				+
			
 
				+        Document doc = Jsoup.parse(html);
			
 
				+        Elements tables = doc.select("table[data-extraction]");
			
 
				+        if (tables.isEmpty()) {
			
 
				+            return "";
			
 
				+        }
			
 
				+
			
 
				+        Element body = new Element("body");
			
 
				+        tables.forEach(table -> body.appendChild(table.clone()));
			
 
				+        return body.html();
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/es/uv/saic/service/ExtractionRequest.java
+++ b/src/main/java/es/uv/saic/service/ExtractionRequest.java
@@ -0,0 +1,29 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.apache.commons.lang3.StringUtils;
			
 
				+
			
 
				+import java.util.Arrays;
			
 
				+
			
 
				+public record ExtractionRequest(
			
 
				+        String rawHtml,
			
 
				+        byte[] fileBytes,
			
 
				+        String fileName,
			
 
				+        String contentType
			
 
				+) {
			
 
				+    public static ExtractionRequest fromHtml(String rawHtml) {
			
 
				+        return new ExtractionRequest(rawHtml, null, null, "text/html");
			
 
				+    }
			
 
				+
			
 
				+    public static ExtractionRequest fromFile(byte[] fileBytes, String fileName, String contentType) {
			
 
				+        byte[] safeBytes = fileBytes == null ? null : Arrays.copyOf(fileBytes, fileBytes.length);
			
 
				+        return new ExtractionRequest(null, safeBytes, fileName, contentType);
			
 
				+    }
			
 
				+
			
 
				+    public boolean hasRawHtml() {
			
 
				+        return StringUtils.isNotBlank(rawHtml);
			
 
				+    }
			
 
				+
			
 
				+    public boolean hasFile() {
			
 
				+        return fileBytes != null && fileBytes.length > 0;
			
 
				+    }
			
 
				+}
			
--- a/src/main/java/es/uv/saic/service/TableExtractor.java
+++ b/src/main/java/es/uv/saic/service/TableExtractor.java
@@ -0,0 +1,7 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+public interface TableExtractor {
			
 
				+    boolean supports(ExtractionRequest request);
			
 
				+
			
 
				+    String extractTablesToCsv(ExtractionRequest request);
			
 
				+}
			
--- a/src/test/java/es/uv/saic/service/DoclingPythonClientTest.java
+++ b/src/test/java/es/uv/saic/service/DoclingPythonClientTest.java
@@ -0,0 +1,30 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.junit.jupiter.api.Test;
			
 
				+
			
 
				+import static org.junit.jupiter.api.Assertions.assertThrows;
			
 
				+
			
 
				+class DoclingPythonClientTest {
			
 
				+
			
 
				+    @Test
			
 
				+    void extractHtmlTables_throwsWhenDisabled() {
			
 
				+        DoclingProperties properties = new DoclingProperties();
			
 
				+        properties.setEnabled(false);
			
 
				+
			
 
				+        DoclingPythonClient client = new DoclingPythonClient(properties);
			
 
				+
			
 
				+        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
			
 
				+    }
			
 
				+
			
 
				+    @Test
			
 
				+    void extractHtmlTables_throwsOnInvalidPythonCommand() {
			
 
				+        DoclingProperties properties = new DoclingProperties();
			
 
				+        properties.setEnabled(true);
			
 
				+        properties.setPythonCommand("python-command-that-does-not-exist");
			
 
				+        properties.setScriptPath("scripts/docling_extract.py");
			
 
				+
			
 
				+        DoclingPythonClient client = new DoclingPythonClient(properties);
			
 
				+
			
 
				+        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
			
 
				+    }
			
 
				+}
			
--- a/src/test/java/es/uv/saic/service/DoclingTableExtractorTest.java
+++ b/src/test/java/es/uv/saic/service/DoclingTableExtractorTest.java
@@ -0,0 +1,41 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.junit.jupiter.api.Test;
			
 
				+
			
 
				+import static org.junit.jupiter.api.Assertions.assertEquals;
			
 
				+import static org.mockito.ArgumentMatchers.argThat;
			
 
				+import static org.mockito.Mockito.mock;
			
 
				+import static org.mockito.Mockito.verify;
			
 
				+import static org.mockito.Mockito.when;
			
 
				+
			
 
				+class DoclingTableExtractorTest {
			
 
				+
			
 
				+    @Test
			
 
				+    void extractTablesToCsv_sendsOnlyDataExtractionTablesToDocling() {
			
 
				+        DoclingPythonClient pythonClient = mock(DoclingPythonClient.class);
			
 
				+        HtmlToCsvExtractor htmlToCsvExtractor = mock(HtmlToCsvExtractor.class);
			
 
				+        DoclingProperties properties = new DoclingProperties();
			
 
				+        properties.setEnabled(true);
			
 
				+        DoclingTableExtractor extractor = new DoclingTableExtractor(pythonClient, htmlToCsvExtractor, properties);
			
 
				+
			
 
				+        String html = """
			
 
				+                <html><body>
			
 
				+                <table><tr><td>ignore</td></tr></table>
			
 
				+                <table data-extraction="a"><tr><td>keep</td></tr></table>
			
 
				+                </body></html>
			
 
				+                """;
			
 
				+
			
 
				+        when(pythonClient.extractHtmlTables(argThat(req ->
			
 
				+                req.hasRawHtml() &&
			
 
				+                        req.rawHtml().contains("data-extraction") &&
			
 
				+                        !req.rawHtml().contains("<table><tr><td>ignore")
			
 
				+        ))).thenReturn("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>");
			
 
				+        when(htmlToCsvExtractor.extractTablesToCsv("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>"))
			
 
				+                .thenReturn("csv");
			
 
				+
			
 
				+        String result = extractor.extractTablesToCsv(ExtractionRequest.fromHtml(html));
			
 
				+
			
 
				+        assertEquals("csv", result);
			
 
				+        verify(pythonClient).extractHtmlTables(argThat(req -> req.hasRawHtml() && !req.hasFile()));
			
 
				+    }
			
 
				+}
			
--- a/src/test/java/es/uv/saic/service/EnhancementServiceRoutingTest.java
+++ b/src/test/java/es/uv/saic/service/EnhancementServiceRoutingTest.java
@@ -0,0 +1,69 @@
 
				+package es.uv.saic.service;
			
 
				+
			
 
				+import org.junit.jupiter.api.BeforeEach;
			
 
				+import org.junit.jupiter.api.Test;
			
 
				+import org.springframework.ai.chat.client.ChatClient;
			
 
				+
			
 
				+import java.nio.charset.StandardCharsets;
			
 
				+
			
 
				+import static org.junit.jupiter.api.Assertions.assertEquals;
			
 
				+import static org.mockito.ArgumentMatchers.any;
			
 
				+import static org.mockito.ArgumentMatchers.argThat;
			
 
				+import static org.mockito.Mockito.mock;
			
 
				+import static org.mockito.Mockito.verify;
			
 
				+import static org.mockito.Mockito.when;
			
 
				+
			
 
				+class EnhancementServiceRoutingTest {
			
 
				+
			
 
				+    private HtmlToCsvExtractor htmlToCsvExtractor;
			
 
				+    private DoclingTableExtractor doclingTableExtractor;
			
 
				+    private EnhancementService enhancementService;
			
 
				+
			
 
				+    @BeforeEach
			
 
				+    void setUp() {
			
 
				+        ChatClient.Builder builder = mock(ChatClient.Builder.class);
			
 
				+        ChatClient chatClient = mock(ChatClient.class);
			
 
				+        when(builder.build()).thenReturn(chatClient);
			
 
				+
			
 
				+        htmlToCsvExtractor = mock(HtmlToCsvExtractor.class);
			
 
				+        doclingTableExtractor = mock(DoclingTableExtractor.class);
			
 
				+        enhancementService = new EnhancementService(builder, htmlToCsvExtractor, doclingTableExtractor);
			
 
				+    }
			
 
				+
			
 
				+    @Test
			
 
				+    void extractForChatEndpoint_usesDoclingWhenSupported() {
			
 
				+        ExtractionRequest request = ExtractionRequest.fromHtml("<table data-extraction='x'></table>");
			
 
				+        when(doclingTableExtractor.supports(request)).thenReturn(true);
			
 
				+        when(doclingTableExtractor.extractTablesToCsv(request)).thenReturn("csv-docling");
			
 
				+
			
 
				+        String result = enhancementService.extractForChatEndpoint(request);
			
 
				+
			
 
				+        assertEquals("csv-docling", result);
			
 
				+        verify(doclingTableExtractor).extractTablesToCsv(request);
			
 
				+    }
			
 
				+
			
 
				+    @Test
			
 
				+    void extractForChatEndpoint_fallsBackToHtmlWhenDoclingNotSupported() {
			
 
				+        ExtractionRequest request = ExtractionRequest.fromHtml("<html>no tables</html>");
			
 
				+        when(doclingTableExtractor.supports(request)).thenReturn(false);
			
 
				+        when(htmlToCsvExtractor.extractTablesToCsv(request)).thenReturn("csv-html");
			
 
				+
			
 
				+        String result = enhancementService.extractForChatEndpoint(request);
			
 
				+
			
 
				+        assertEquals("csv-html", result);
			
 
				+        verify(htmlToCsvExtractor).extractTablesToCsv(request);
			
 
				+    }
			
 
				+
			
 
				+    @Test
			
 
				+    void extractForFileEndpoint_routesAsHtmlStringToDocling() {
			
 
				+        byte[] fileBytes = "<table data-extraction='x'><tr><td>a</td></tr></table>".getBytes(StandardCharsets.UTF_8);
			
 
				+        ExtractionRequest request = ExtractionRequest.fromFile(fileBytes, "input.html", "text/html");
			
 
				+        when(doclingTableExtractor.supports(any(ExtractionRequest.class))).thenReturn(true);
			
 
				+        when(doclingTableExtractor.extractTablesToCsv(any(ExtractionRequest.class))).thenReturn("csv-docling");
			
 
				+
			
 
				+        String result = enhancementService.extractForFileEndpoint(request);
			
 
				+
			
 
				+        assertEquals("csv-docling", result);
			
 
				+        verify(doclingTableExtractor).extractTablesToCsv(argThat(r -> r.hasRawHtml() && !r.hasFile()));
			
 
				+    }
			
 
				+}