Ver Fonte

Use docling to convert to csv

atsachlaris há 1 dia atrás
pai
commit
b79193969d

+ 2 - 0
src/main/java/es/uv/saic/UvSaicDesApplication.java

@@ -2,8 +2,10 @@ package es.uv.saic;
 
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.boot.context.properties.ConfigurationPropertiesScan;
 
 @SpringBootApplication
+@ConfigurationPropertiesScan
 public class UvSaicDesApplication {
 
     public static void main(String[] args) {

+ 36 - 2
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -5,25 +5,59 @@ import lombok.SneakyThrows;
 import org.springframework.ai.chat.client.ChatClient;
 import org.springframework.stereotype.Service;
 
+import java.nio.charset.StandardCharsets;
+
 import static es.uv.saic.service.SystemPrompt.SYSTEM_INSTRUCTIONS;
 
 @Service
 public class EnhancementService {
     private final ChatClient chatClient;
     private final HtmlToCsvExtractor htmlToCsvExtractor;
+    private final DoclingTableExtractor doclingTableExtractor;
     private final ObjectMapper objectMapper = new ObjectMapper();
 
     public EnhancementService(
             ChatClient.Builder chatClientBuilder,
-            HtmlToCsvExtractor htmlToCsvExtractor
+            HtmlToCsvExtractor htmlToCsvExtractor,
+            DoclingTableExtractor doclingTableExtractor
     ) {
         this.chatClient = chatClientBuilder.build();
         this.htmlToCsvExtractor = htmlToCsvExtractor;
+        this.doclingTableExtractor = doclingTableExtractor;
     }
 
     @SneakyThrows
     public String ask(String html) {
-        String asCsv = htmlToCsvExtractor.extractTablesToCsv(html);
+        return askHtml(html);
+    }
+
+    /**
+     * Answers a request that arrives as raw HTML.
+     * <p>
+     * The markup is wrapped in an {@code ExtractionRequest}, converted to CSV through
+     * {@code extractForChatEndpoint} and the CSV is then handed to {@code askLlm}.
+     *
+     * @param html the HTML payload to extract from
+     * @return the value returned by {@code askLlm} for the extracted CSV
+     */
+    @SneakyThrows
+    public String askHtml(String html) {
+        ExtractionRequest htmlRequest = ExtractionRequest.fromHtml(html);
+        return askLlm(extractForChatEndpoint(htmlRequest));
+    }
+
+    /**
+     * Answers a request that arrives as an uploaded file.
+     * <p>
+     * The upload is wrapped in an {@code ExtractionRequest}, converted to CSV through
+     * {@code extractForFileEndpoint} and the CSV is then handed to {@code askLlm}.
+     *
+     * @param fileBytes   content of the uploaded file
+     * @param fileName    name supplied with the upload
+     * @param contentType content type supplied with the upload
+     * @return the value returned by {@code askLlm} for the extracted CSV
+     */
+    public String askFile(byte[] fileBytes, String fileName, String contentType) {
+        ExtractionRequest fileRequest = ExtractionRequest.fromFile(fileBytes, fileName, contentType);
+        return askLlm(extractForFileEndpoint(fileRequest));
+    }
+
+    /**
+     * Picks an extractor for the request and returns its CSV output.
+     * <p>
+     * The Docling extractor is used whenever it reports support for the request;
+     * otherwise the HTML extractor handles it.
+     *
+     * @param request the extraction request to convert
+     * @return the CSV text produced by the chosen extractor
+     */
+    String extractForChatEndpoint(ExtractionRequest request) {
+        return doclingTableExtractor.supports(request)
+                ? doclingTableExtractor.extractTablesToCsv(request)
+                : htmlToCsvExtractor.extractTablesToCsv(request);
+    }
+
+    /**
+     * Extracts CSV for an uploaded file.
+     * <p>
+     * The file request is offered to the Docling extractor as-is first, so the original
+     * request (rather than a lossy UTF-8 decoding of its bytes) is what Docling inspects.
+     * Only when Docling does not support it are the bytes decoded as UTF-8 and routed
+     * through the HTML path of {@code extractForChatEndpoint}.
+     *
+     * @param request the extraction request describing the upload
+     * @return the CSV text produced by the chosen extractor
+     */
+    String extractForFileEndpoint(ExtractionRequest request) {
+        if (doclingTableExtractor.supports(request)) {
+            return doclingTableExtractor.extractTablesToCsv(request);
+        }
+        // Fallback: treat the upload as an HTML document; a request without a file becomes empty HTML.
+        String htmlFallback = request.hasFile() ? new String(request.fileBytes(), StandardCharsets.UTF_8) : "";
+        return extractForChatEndpoint(ExtractionRequest.fromHtml(htmlFallback));
+    }
+
+    @SneakyThrows
+    private String askLlm(String asCsv) {
 
         return chatClient.prompt()
                 .system(SYSTEM_INSTRUCTIONS)

+ 19 - 1
src/main/java/es/uv/saic/service/HtmlToCsvExtractor.java

@@ -18,7 +18,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 @Service
-public class HtmlToCsvExtractor {
+public class HtmlToCsvExtractor implements TableExtractor {
     private static final Pattern NUMERIC_CODE_PREFIX = Pattern.compile("^\\s*(\\d+)\\s*[-.)]\\s*(.*)$");
     private static final Pattern LETTER_CODE_PREFIX = Pattern.compile("^\\s*([a-zA-Z])\\s*[-.)]\\s*(.*)$");
     private static final List<String> CSV_HEADER = List.of(
@@ -28,6 +28,16 @@ public class HtmlToCsvExtractor {
             "table", "thead", "tbody", "tr", "td", "th", "strong"
     );
 
+    /**
+     * Reports whether this extractor can handle the request.
+     *
+     * @param request the extraction request to inspect
+     * @return {@code true} only when the request carries raw HTML and
+     *         {@code hasExtractableTables} accepts that HTML
+     */
+    @Override
+    public boolean supports(ExtractionRequest request) {
+        if (!request.hasRawHtml()) {
+            return false;
+        }
+        return hasExtractableTables(request.rawHtml());
+    }
+
+    @Override
+    public String extractTablesToCsv(ExtractionRequest request) {
+        return extractTablesToCsv(request.rawHtml());
+    }
+
     public String extractTablesToCsv(String html) {
         if (StringUtils.isEmpty(html)) {
             return "";
@@ -52,6 +62,14 @@ public class HtmlToCsvExtractor {
         return String.join("\n\n", csvTables).trim();
     }
 
+    /**
+     * Checks whether the HTML contains at least one {@code <table>} element that
+     * carries a {@code data-extraction} attribute.
+     *
+     * @param html the markup to inspect; blank input yields {@code false} without parsing
+     * @return {@code true} when the parsed document matches {@code table[data-extraction]}
+     */
+    public boolean hasExtractableTables(String html) {
+        return !StringUtils.isBlank(html)
+                && !Jsoup.parse(html).select("table[data-extraction]").isEmpty();
+    }
+
     // Clears the attributes of every element matched by the "*" selector on the given table.
     private static void clearAllAttributes(Element cleanTable) {
         for (Element element : cleanTable.select("*")) {
             element.clearAttributes();
         }
     }

+ 2 - 5
src/main/java/es/uv/saic/web/EnhancementController.java

@@ -12,8 +12,6 @@ import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 import org.springframework.web.server.ResponseStatusException;
 
-import java.nio.charset.StandardCharsets;
-
 import static org.springframework.http.HttpStatus.BAD_REQUEST;
 
 @RestController
@@ -30,7 +28,7 @@ public class EnhancementController {
 
     @PostMapping("chat")
     public String chat(@RequestBody String html) {
-        return enhancementService.ask(html);
+        return enhancementService.askHtml(html);
     }
 
     @PostMapping(value = "chat/file", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
@@ -40,8 +38,7 @@ public class EnhancementController {
         }
 
         try {
-            String html = new String(file.getBytes(), StandardCharsets.UTF_8);
-            return enhancementService.ask(html);
+            return enhancementService.askFile(file.getBytes(), file.getOriginalFilename(), file.getContentType());
         } catch (Exception e) {
             throw new ResponseStatusException(BAD_REQUEST, "Unable to read uploaded file", e);
         }

+ 7 - 1
src/main/resources/application.properties

@@ -19,4 +19,10 @@ spring.messages.encoding=UTF-8
 # Datasource config
 spring.datasource.driver-class-name=org.postgresql.Driver
 spring.datasource.username=postgres
-spring.sql.init.platform=postgres
+spring.sql.init.platform=postgres
+
+# Docling extractor
+extractor.docling.enabled=true
+# Resolve the interpreter from PATH by default: a developer-specific absolute path
+# breaks every other machine. Override per environment (environment variable,
+# command-line argument or a profile-specific properties file) when needed.
+extractor.docling.python-command=python
+extractor.docling.script-path=scripts/docling_extract.py
+extractor.docling.timeout-ms=30000