Просмотр исходного кода

Remove empty spaces and comments

atsachlaris 6 дней назад
Родитель
Commit
d192d3c28f
1 изменённый файл: 19 добавлений и 8 удалений
  1. 19 8
      src/main/java/es/uv/saic/service/EnhancementService.java

+ 19 - 8
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -45,26 +45,37 @@ public class EnhancementService {
 
         Document doc = Jsoup.parse(html);
 
+        removeCodeComments(doc);
+
         Elements tables = doc.select("table[data-extraction]");
 
         Document cleanDoc = Document.createShell("");
+        cleanDoc.outputSettings().prettyPrint(false);
 
         for (Element table : tables) {
             Element cleanTable = table.clone();
 
             cleanTable.select("strong, span, p").unwrap();
 
-            for (Element el : cleanTable.getAllElements()) {
-                el.clearAttributes();
-            }
+            cleanTable.select("*").forEach(Element::clearAttributes);
+
+            normalizeEmptySpaces(cleanTable);
 
             cleanDoc.body().appendChild(cleanTable);
         }
 
-        return cleanDoc.body().html()
-                .replace(" ", "")
-                .replaceAll("<!--.*?-->", "")
-                .replaceAll(">\\s+<", "><") // remove spaces between tags
-                .trim();
+        return cleanDoc.body().html().trim();
+    }
+
+    /**
+     * Replaces non-breaking spaces (U+00A0) with regular spaces in every text
+     * node of the given table, including text nested inside rows and cells.
+     *
+     * @param cleanTable the table element whose text nodes are rewritten in place
+     */
+    private static void normalizeEmptySpaces(Element cleanTable) {
+        // textNodes() only returns an element's direct children, so visit the
+        // table itself and every descendant element to reach the cell text.
+        cleanTable.select("*").forEach(el ->
+                el.textNodes().forEach(t ->
+                        t.text(t.text().replace("\u00A0", " "))
+                )
+        );
+    }
+
+    /**
+     * Removes every HTML comment node from the given document, in place.
+     *
+     * @param doc the parsed document to strip comments from
+     */
+    private static void removeCodeComments(Document doc) {
+        for (Element el : doc.select("*")) {
+            // childNodes() is an unmodifiable list, so removeIf() on it throws
+            // UnsupportedOperationException; detach comments via Node.remove().
+            // Walk backwards so removals do not shift the indexes still to visit.
+            for (int i = el.childNodeSize() - 1; i >= 0; i--) {
+                if ("#comment".equals(el.childNode(i).nodeName())) {
+                    el.childNode(i).remove();
+                }
+            }
+        }
+    }
 }