|
|
@@ -0,0 +1,137 @@
|
|
|
+package es.uv.saic.service;
|
|
|
+
|
|
|
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
+
|
|
|
+@Service
|
|
|
+public class HtmlToCsvExtractor {
|
|
|
+ private static final Pattern NUMERIC_ITEM_PREFIX = Pattern.compile("^\\s*(\\d+)\\s*[-.)]\\s*(.*)$");
|
|
|
+ private static final Pattern LETTER_ITEM_PREFIX = Pattern.compile("^\\s*([a-zA-Z])\\s*[-.)]\\s*(.*)$");
|
|
|
+ private static final List<String> CSV_HEADER = List.of(
|
|
|
+ "Código", "Ítem", "Puntuación"
|
|
|
+ );
|
|
|
+ private final Set<String> allowedTags = Set.of(
|
|
|
+ "table", "thead", "tbody", "tr", "td", "th", "strong"
|
|
|
+ );
|
|
|
+
|
|
|
+ public String extractTablesToCsv(String html) {
|
|
|
+ if (StringUtils.isEmpty(html)) {
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ Document doc = Jsoup.parse(html);
|
|
|
+ Elements tables = doc.select("table[data-extraction]");
|
|
|
+
|
|
|
+ List<String> csvTables = new ArrayList<>();
|
|
|
+ for (Element table : tables) {
|
|
|
+ Element cleanTable = table.clone();
|
|
|
+
|
|
|
+ keepOnlyAllowedTags(cleanTable);
|
|
|
+ clearAllAttributes(cleanTable);
|
|
|
+
|
|
|
+ String csvTable = toCsv(cleanTable);
|
|
|
+ if (StringUtils.isNotBlank(csvTable)) {
|
|
|
+ csvTables.add(csvTable);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return String.join("\n\n", csvTables).trim();
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void clearAllAttributes(Element cleanTable) {
|
|
|
+ cleanTable.select("*").forEach(Element::clearAttributes);
|
|
|
+ }
|
|
|
+
|
|
|
+ private void keepOnlyAllowedTags(Element cleanTable) {
|
|
|
+ cleanTable.select("*").forEach(el -> {
|
|
|
+ if (!allowedTags.contains(el.tagName())) {
|
|
|
+ el.unwrap(); // remove tag but keep content
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ private String toCsv(Element table) {
|
|
|
+ try (StringWriter writer = new StringWriter();
|
|
|
+ CSVPrinter printer = new CSVPrinter(writer, CSVFormat.DEFAULT)) {
|
|
|
+ final String[] lastNumericItem = {null};
|
|
|
+ final int[] rowIndex = {0};
|
|
|
+ table.select("tr").forEach(row -> {
|
|
|
+ List<String> cells = row.select("th,td").stream()
|
|
|
+ .map(cell -> cell.text()
|
|
|
+ .replace("\n", " ")
|
|
|
+ .replace("\r", " ")
|
|
|
+ .replace("\t", " ")
|
|
|
+ .trim())
|
|
|
+ .toList();
|
|
|
+ if (!cells.isEmpty()) {
|
|
|
+ try {
|
|
|
+ if (rowIndex[0] == 0) {
|
|
|
+ printer.printRecord(CSV_HEADER);
|
|
|
+ rowIndex[0]++;
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ ParsedFirstColumn parsed = parseFirstColumn(cells.getFirst(), lastNumericItem[0]);
|
|
|
+ if (StringUtils.isNotBlank(parsed.numericItem())) {
|
|
|
+ lastNumericItem[0] = parsed.numericItem();
|
|
|
+ }
|
|
|
+
|
|
|
+ List<String> rowWithItem = new ArrayList<>();
|
|
|
+ rowWithItem.add(parsed.item());
|
|
|
+ rowWithItem.add(parsed.description());
|
|
|
+ rowWithItem.add(cells.size() > 1 ? cells.get(1) : "");
|
|
|
+ printer.printRecord(rowWithItem);
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new IllegalStateException("Unable to write CSV row", e);
|
|
|
+ }
|
|
|
+ rowIndex[0]++;
|
|
|
+ }
|
|
|
+ });
|
|
|
+ printer.flush();
|
|
|
+ return writer.toString().trim();
|
|
|
+ } catch (IOException e) {
|
|
|
+ throw new IllegalStateException("Unable to generate CSV", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private ParsedFirstColumn parseFirstColumn(String value, String lastNumericItem) {
|
|
|
+ Matcher numericMatcher = NUMERIC_ITEM_PREFIX.matcher(value);
|
|
|
+ if (numericMatcher.matches()) {
|
|
|
+ return new ParsedFirstColumn(
|
|
|
+ numericMatcher.group(1),
|
|
|
+ numericMatcher.group(1),
|
|
|
+ StringUtils.defaultString(numericMatcher.group(2)).trim()
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ Matcher letterMatcher = LETTER_ITEM_PREFIX.matcher(value);
|
|
|
+ if (letterMatcher.matches()) {
|
|
|
+ String letter = letterMatcher.group(1).toLowerCase();
|
|
|
+ String item = StringUtils.isNotBlank(lastNumericItem) ? lastNumericItem + letter : letter;
|
|
|
+ return new ParsedFirstColumn(
|
|
|
+ item,
|
|
|
+ null,
|
|
|
+ StringUtils.defaultString(letterMatcher.group(2)).trim()
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ return new ParsedFirstColumn("", null, value);
|
|
|
+ }
|
|
|
+
|
|
|
+ private record ParsedFirstColumn(String item, String numericItem, String description) {
|
|
|
+ }
|
|
|
+}
|