Procházet zdrojové kódy

Remove empty spaces and comments

atsachlaris před 6 dny
rodič
revize
d192d3c28f

+ 19 - 8
src/main/java/es/uv/saic/service/EnhancementService.java

@@ -45,26 +45,37 @@ public class EnhancementService {
 
 
         Document doc = Jsoup.parse(html);
         Document doc = Jsoup.parse(html);
 
 
+        removeCodeComments(doc);
+
         Elements tables = doc.select("table[data-extraction]");
         Elements tables = doc.select("table[data-extraction]");
 
 
         Document cleanDoc = Document.createShell("");
         Document cleanDoc = Document.createShell("");
+        cleanDoc.outputSettings().prettyPrint(false);
 
 
         for (Element table : tables) {
         for (Element table : tables) {
             Element cleanTable = table.clone();
             Element cleanTable = table.clone();
 
 
             cleanTable.select("strong, span, p").unwrap();
             cleanTable.select("strong, span, p").unwrap();
 
 
-            for (Element el : cleanTable.getAllElements()) {
-                el.clearAttributes();
-            }
+            cleanTable.select("*").forEach(Element::clearAttributes);
+
+            normalizeEmptySpaces(cleanTable);
 
 
             cleanDoc.body().appendChild(cleanTable);
             cleanDoc.body().appendChild(cleanTable);
         }
         }
 
 
-        return cleanDoc.body().html()
-                .replace(" ", "")
-                .replaceAll("<!--.*?-->", "")
-                .replaceAll(">\\s+<", "><") // remove spaces between tags
-                .trim();
+        return cleanDoc.body().html().trim();
+    }
+
+    /**
+     * Replaces non-breaking spaces (U+00A0) with regular spaces in every text node
+     * of the given table, including text nested inside rows and cells.
+     *
+     * <p>{@code Element.textNodes()} only returns an element's direct text children,
+     * so each descendant element (and the table itself) is visited in turn.
+     *
+     * @param cleanTable the table whose text is normalized; modified in place
+     */
+    private static void normalizeEmptySpaces(Element cleanTable) {
+        for (Element element : cleanTable.getAllElements()) {
+            element.textNodes().forEach(t ->
+                    t.text(t.text().replace("\u00A0", " "))
+            );
+        }
+    }
+
+    /**
+     * Removes every HTML comment node from the given document.
+     *
+     * <p>{@code Node.childNodes()} exposes an unmodifiable list, so calling
+     * {@code removeIf} on it throws {@link UnsupportedOperationException}; each
+     * comment is therefore detached through {@code Node.remove()} instead.
+     *
+     * @param doc the parsed document to strip comments from; modified in place
+     */
+    private static void removeCodeComments(Document doc) {
+        for (Element element : doc.getAllElements()) {
+            // Walk backwards so removing a child does not shift the indices still to visit.
+            for (int i = element.childNodeSize() - 1; i >= 0; i--) {
+                if ("#comment".equals(element.childNode(i).nodeName())) {
+                    element.childNode(i).remove();
+                }
+            }
+        }
     }
     }
 }
 }