Browse Source

Explicitly keep only table tags

atsachlaris 4 ngày trước
mục cha
commit
4f085e028b
1 tập tin đã thay đổi với 15 bổ sung và 3 xóa
  1. 15 3
      src/main/java/es/uv/saic/service/EnhancementService.java

+ 15 - 3
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -11,6 +11,8 @@ import org.jsoup.select.Elements;
 import org.springframework.ai.chat.client.ChatClient;
 import org.springframework.stereotype.Service;
 
+import java.util.Set;
+
 import static es.uv.saic.service.SystemPrompt.SYSTEM_INSTRUCTIONS;
 
 @Service
@@ -18,6 +20,9 @@ public class EnhancementService {
 
     private final ChatClient chatClient;
     private final ObjectMapper objectMapper = new ObjectMapper();
+    private final Set<String> allowedTags = Set.of(
+            "table", "thead", "tbody", "tr", "td", "th"
+    );
 
     public EnhancementService(ChatClient.Builder chatClientBuilder) {
         this.chatClient = chatClientBuilder.build();
@@ -50,11 +55,18 @@ public class EnhancementService {
         Document cleanDoc = Document.createShell("");
         cleanDoc.outputSettings().prettyPrint(false);
 
+
         for (Element table : tables) {
             Element cleanTable = table.clone();
 
-            cleanTable.select("strong, span, p").unwrap();
+            //keep only allowed tags
+            cleanTable.select("*").forEach(el -> {
+                if (!allowedTags.contains(el.tagName())) {
+                    el.unwrap(); // remove tag but keep content
+                }
+            });
 
+            // Remove all attributes
             cleanTable.select("*").forEach(Element::clearAttributes);
 
             cleanDoc.body().appendChild(cleanTable);
@@ -65,8 +77,8 @@ public class EnhancementService {
                 .replace("\r", "")
                 .replace("\t", "")
                 .replace("&nbsp;", "")
-                .replaceAll(">\\s+<", "><")
-                .replaceAll("<!--.*?-->", "")
+                .replaceAll(">\\s+<", "><") // remove whitespace between tags
+                .replaceAll("<!--.*?-->", "") // remove comments
                 .replaceAll("\\s{2,}", " ") // collapse multiple spaces
                 .trim();
     }