Sfoglia il codice sorgente

Convert html tables to csv

atsachlaris 13 ore fa
parent
commit
545ecd5c30

+ 7 - 0
pom.xml

@@ -78,6 +78,13 @@
             <version>1.22.1</version>
             <scope>compile</scope>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-csv</artifactId>
+            <version>1.14.1</version>
+            <scope>compile</scope>
+        </dependency>
     </dependencies>
 
     <build>

+ 1 - 1
requests/service.http

@@ -65,7 +65,7 @@ Content-Type: application/json
   data-mce-style="color: #f00000; font-size: 12pt; font-family: verdana, geneva, sans-serif;"><em><strong>1. EVIDENCIAS</strong></em></span>
   </p>
   <hr class="mceEditable" contenteditable="true">
-  <table data-extraction class="mceEditable"
+  <table class="mceEditable"
   style="border-collapse: collapse; width: 297mm; border-width: 1px; border-spacing: 0px; border-color: rgb(149, 165, 166); margin-left: 0px; margin-right: auto;"
   border="1" width="680" cellspacing="0" cellpadding="8"
   data-mce-style="border-collapse: collapse; width: 297mm; border-width: 1px; border-spacing: 0px; border-color: rgb(149, 165, 166); margin-left: 0px; margin-right: auto;"

+ 10 - 61
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -1,86 +1,35 @@
 package es.uv.saic.service;
 
-import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import lombok.SneakyThrows;
-import org.apache.commons.lang3.StringUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 import org.springframework.ai.chat.client.ChatClient;
 import org.springframework.stereotype.Service;
 
-import java.util.Set;
-
 import static es.uv.saic.service.SystemPrompt.SYSTEM_INSTRUCTIONS;
 
 @Service
 public class EnhancementService {
-
     private final ChatClient chatClient;
+    private final HtmlToCsvExtractor htmlToCsvExtractor;
     private final ObjectMapper objectMapper = new ObjectMapper();
-    private final Set<String> allowedTags = Set.of(
-            "table", "thead", "tbody", "tr", "td", "th"
-    );
 
+    /**
+     * Builds the chat client from the supplied builder and stores the HTML-to-CSV extractor.
+     *
+     * @param chatClientBuilder  builder whose {@code build()} result becomes this service's chat client
+     * @param htmlToCsvExtractor extractor kept for converting incoming HTML tables to CSV
+     */
-    public EnhancementService(ChatClient.Builder chatClientBuilder) {
+    public EnhancementService(
+            ChatClient.Builder chatClientBuilder,
+            HtmlToCsvExtractor htmlToCsvExtractor
+    ) {
         this.chatClient = chatClientBuilder.build();
+        this.htmlToCsvExtractor = htmlToCsvExtractor;
     }
 
+    /**
+     * Converts the tables found in the given HTML to CSV and asks the chat model about them.
+     *
+     * <p>The user message is a fixed Spanish prefix followed by the CSV serialized with
+     * {@code ObjectMapper.writeValueAsString}; the system message is {@code SYSTEM_INSTRUCTIONS}.
+     * Checked exceptions from the serialization are rethrown unchecked via {@code @SneakyThrows}.
+     *
+     * <p>NOTE(review): serializing the CSV as a JSON string presumably escapes its line breaks
+     * and quotes -- confirm this is the intended shape of the prompt.
+     *
+     * @param html HTML handed to {@code htmlToCsvExtractor.extractTablesToCsv}
+     * @return the content of the chat model's response
+     */
     @SneakyThrows
-    public String ask(String message) {
+    public String ask(String html) {
+        String asCsv = htmlToCsvExtractor.extractTablesToCsv(html);
+
         return chatClient.prompt()
                 .system(SYSTEM_INSTRUCTIONS)
-                .user(normalizeMessage(message))
+                .user("Aquí tienes las tablas: " + objectMapper.writeValueAsString(asCsv))
                 .call()
                 .content();
     }
 
-    private String normalizeMessage(String message) throws JsonProcessingException {
-        String cleanHtml = extractStructuredTables(message);
-
-        return objectMapper.writeValueAsString(cleanHtml);
-    }
-
-    public String extractStructuredTables(String html) {
-        if (StringUtils.isEmpty(html)) {
-            return "";
-        }
-
-        Document doc = Jsoup.parse(html);
-
-        Elements tables = doc.select("table[data-extraction]");
-
-        Document cleanDoc = Document.createShell("");
-        cleanDoc.outputSettings().prettyPrint(false);
-
-
-        for (Element table : tables) {
-            Element cleanTable = table.clone();
-
-            //keep only allowed tags
-            cleanTable.select("*").forEach(el -> {
-                if (!allowedTags.contains(el.tagName())) {
-                    el.unwrap(); // remove tag but keep content
-                }
-            });
-
-            // Remove all attributes
-            cleanTable.select("*").forEach(Element::clearAttributes);
-
-            cleanDoc.body().appendChild(cleanTable);
-        }
-
-        return cleanDoc.body().html()
-                .replace("\n", "")
-                .replace("\r", "")
-                .replace("\t", "")
-                .replace("&nbsp;", "")
-                .replaceAll(">\\s+<", "><") // remove whitespace between tags
-                .replaceAll("<!--.*?-->", "") // remove comments
-                .replaceAll("\\s{2,}", " ") // collapse multiple spaces
-                .trim();
-    }
-
 }

+ 137 - 0
src/main/java/es/uv/saic/service/HtmlToCsvExtractor.java

@@ -0,0 +1,137 @@
+package es.uv.saic.service;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVPrinter;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+@Service
+public class HtmlToCsvExtractor {
+    private static final Pattern NUMERIC_ITEM_PREFIX = Pattern.compile("^\\s*(\\d+)\\s*[-.)]\\s*(.*)$");
+    private static final Pattern LETTER_ITEM_PREFIX = Pattern.compile("^\\s*([a-zA-Z])\\s*[-.)]\\s*(.*)$");
+    private static final List<String> CSV_HEADER = List.of(
+            "Código", "Ítem", "Puntuación"
+    );
+    private final Set<String> allowedTags = Set.of(
+            "table", "thead", "tbody", "tr", "td", "th", "strong"
+    );
+
+    public String extractTablesToCsv(String html) {
+        if (StringUtils.isEmpty(html)) {
+            return "";
+        }
+
+        Document doc = Jsoup.parse(html);
+        Elements tables = doc.select("table[data-extraction]");
+
+        List<String> csvTables = new ArrayList<>();
+        for (Element table : tables) {
+            Element cleanTable = table.clone();
+
+            keepOnlyAllowedTags(cleanTable);
+            clearAllAttributes(cleanTable);
+
+            String csvTable = toCsv(cleanTable);
+            if (StringUtils.isNotBlank(csvTable)) {
+                csvTables.add(csvTable);
+            }
+        }
+
+        return String.join("\n\n", csvTables).trim();
+    }
+
+    private static void clearAllAttributes(Element cleanTable) {
+        cleanTable.select("*").forEach(Element::clearAttributes);
+    }
+
+    private void keepOnlyAllowedTags(Element cleanTable) {
+        cleanTable.select("*").forEach(el -> {
+            if (!allowedTags.contains(el.tagName())) {
+                el.unwrap(); // remove tag but keep content
+            }
+        });
+    }
+
+    private String toCsv(Element table) {
+        try (StringWriter writer = new StringWriter();
+             CSVPrinter printer = new CSVPrinter(writer, CSVFormat.DEFAULT)) {
+            final String[] lastNumericItem = {null};
+            final int[] rowIndex = {0};
+            table.select("tr").forEach(row -> {
+                List<String> cells = row.select("th,td").stream()
+                        .map(cell -> cell.text()
+                                .replace("\n", " ")
+                                .replace("\r", " ")
+                                .replace("\t", " ")
+                                .trim())
+                        .toList();
+                if (!cells.isEmpty()) {
+                    try {
+                        if (rowIndex[0] == 0) {
+                            printer.printRecord(CSV_HEADER);
+                            rowIndex[0]++;
+                            return;
+                        }
+
+                        ParsedFirstColumn parsed = parseFirstColumn(cells.getFirst(), lastNumericItem[0]);
+                        if (StringUtils.isNotBlank(parsed.numericItem())) {
+                            lastNumericItem[0] = parsed.numericItem();
+                        }
+
+                        List<String> rowWithItem = new ArrayList<>();
+                        rowWithItem.add(parsed.item());
+                        rowWithItem.add(parsed.description());
+                        rowWithItem.add(cells.size() > 1 ? cells.get(1) : "");
+                        printer.printRecord(rowWithItem);
+                    } catch (IOException e) {
+                        throw new IllegalStateException("Unable to write CSV row", e);
+                    }
+                    rowIndex[0]++;
+                }
+            });
+            printer.flush();
+            return writer.toString().trim();
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to generate CSV", e);
+        }
+    }
+
+    private ParsedFirstColumn parseFirstColumn(String value, String lastNumericItem) {
+        Matcher numericMatcher = NUMERIC_ITEM_PREFIX.matcher(value);
+        if (numericMatcher.matches()) {
+            return new ParsedFirstColumn(
+                    numericMatcher.group(1),
+                    numericMatcher.group(1),
+                    StringUtils.defaultString(numericMatcher.group(2)).trim()
+            );
+        }
+
+        Matcher letterMatcher = LETTER_ITEM_PREFIX.matcher(value);
+        if (letterMatcher.matches()) {
+            String letter = letterMatcher.group(1).toLowerCase();
+            String item = StringUtils.isNotBlank(lastNumericItem) ? lastNumericItem + letter : letter;
+            return new ParsedFirstColumn(
+                    item,
+                    null,
+                    StringUtils.defaultString(letterMatcher.group(2)).trim()
+            );
+        }
+
+        return new ParsedFirstColumn("", null, value);
+    }
+
+    private record ParsedFirstColumn(String item, String numericItem, String description) {
+    }
+}

+ 18 - 4
src/main/java/es/uv/saic/service/SystemPrompt.java

@@ -5,8 +5,22 @@ public class SystemPrompt {
+    // Private constructor: this class cannot be instantiated from outside.
     private SystemPrompt() {}
 
+    /**
+     * System message passed to the chat client by {@code EnhancementService.ask}.
+     *
+     * NOTE(review): the field is public and static but not final, so any code can reassign it;
+     * consider declaring it {@code final}.
+     */
     public static String SYSTEM_INSTRUCTIONS = """
-            Your are a professor reviewing documents.
-            
-            Your mission is to summarize key findings and point out areas of improvement.
-            """;
+    Eres un analista experto en calidad académica universitaria especializado en evaluación de titulaciones (ANECA).
+    
+    Debes generar comentarios formales, institucionales y prudentes.
+    
+    El campo Nivel se determinará en función de la puntuación de la siguiente manera:
+    - Crítico: puntuación menor de 2.5
+    - Advertencia: puntuación entre 2.5 y menor de 3
+    - OK: puntuación igual o mayor a 3
+    
+    Formato de salida:
+    
+    - Código
+    - Colectivo
+    - Puntuación
+    - Nivel
+    - Comentario
+
+    """;
 }

+ 2 - 2
src/main/java/es/uv/saic/web/EnhancementController.java

@@ -21,8 +21,8 @@ public class EnhancementController {
     }
 
+    /**
+     * POST endpoint mapped to {@code "chat"}: passes the raw request body to
+     * {@code enhancementService.ask} and returns its result unchanged.
+     *
+     * @param html request body, forwarded as-is to the service
+     * @return the string returned by {@code enhancementService.ask}
+     */
     @PostMapping("chat")
-    public String chat(@RequestBody String message) {
-        return enhancementService.ask(message);
+    public String chat(@RequestBody String html) {
+        return enhancementService.ask(html);
     }
 
 }