Jelajahi Sumber

Keep only tables with data-extraction attribute

atsachlaris 5 hari lalu
induk
melakukan
69c07beb84

File diff ditekan karena terlalu besar
+ 0 - 0
requests/service.http


+ 23 - 5
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -7,9 +7,12 @@ import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.springframework.ai.chat.client.ChatClient;
 import org.springframework.stereotype.Service;
 
+import static es.uv.saic.service.SystemPrompt.SYSTEM_INSTRUCTIONS;
+
 @Service
 public class EnhancementService {
 
@@ -23,28 +26,43 @@ public class EnhancementService {
     @SneakyThrows
     public String ask(String message) {
         return chatClient.prompt()
+                .system(SYSTEM_INSTRUCTIONS)
                 .user(normalizeMessage(message))
                 .call()
                 .content();
     }
 
     private String normalizeMessage(String message) throws JsonProcessingException {
-        String cleanHtml = removeHtmlAttributes(message);
+        String cleanHtml = extractStructuredTables(message);
 
         return objectMapper.writeValueAsString(cleanHtml);
     }
 
-    public String removeHtmlAttributes(String html) {
+    public String extractStructuredTables(String html) {
         if (StringUtils.isEmpty(html)) {
             return "";
         }
 
         Document doc = Jsoup.parse(html);
 
-        for (Element el : doc.getAllElements()) {
-            el.clearAttributes();
+        Elements tables = doc.select("table[data-extraction]");
+
+        Document cleanDoc = Document.createShell("");
+
+        for (Element table : tables) {
+            Element cleanTable = table.clone();
+
+            cleanTable.select("strong, span, p").unwrap();
+
+            for (Element el : cleanTable.getAllElements()) {
+                el.clearAttributes();
+            }
+
+            cleanDoc.body().appendChild(cleanTable);
         }
 
-        return doc.body().html();
+        return cleanDoc.body().html()
+                .replaceAll(">\\s+<", "><")   // remove spaces between tags
+                .trim();
     }
 }

+ 12 - 0
src/main/java/es/uv/saic/service/SystemPrompt.java

@@ -0,0 +1,12 @@
+package es.uv.saic.service;
+
+public class SystemPrompt {
+
+    private SystemPrompt() {}
+
+    public static String SYSTEM_INSTRUCTIONS = """
+            You are a professor reviewing documents.
+            
+            Your mission is to summarize key findings and point out areas of improvement.
+            """;
+}

Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini