|
|
@@ -45,8 +45,6 @@ public class EnhancementService {
|
|
|
|
|
|
Document doc = Jsoup.parse(html);
|
|
|
|
|
|
- removeCodeComments(doc);
|
|
|
-
|
|
|
Elements tables = doc.select("table[data-extraction]");
|
|
|
|
|
|
Document cleanDoc = Document.createShell("");
|
|
|
@@ -59,23 +57,18 @@ public class EnhancementService {
|
|
|
|
|
|
cleanTable.select("*").forEach(Element::clearAttributes);
|
|
|
|
|
|
- normalizeEmptySpaces(cleanTable);
|
|
|
-
|
|
|
cleanDoc.body().appendChild(cleanTable);
|
|
|
}
|
|
|
|
|
|
- return cleanDoc.body().html().trim();
|
|
|
+ return cleanDoc.body().html()
|
|
|
+ .replace("\n", "")
|
|
|
+ .replace("\r", "")
|
|
|
+ .replace("\t", "")
|
|
|
+ .replace(" ", "")
|
|
|
+ .replaceAll(">\\s+<", "><")
|
|
|
+ .replaceAll("<!--.*?-->", "")
|
|
|
+ .replaceAll("\\s{2,}", " ") // collapse multiple spaces
|
|
|
+ .trim();
|
|
|
}
|
|
|
|
|
|
- private static void normalizeEmptySpaces(Element cleanTable) {
|
|
|
- cleanTable.textNodes().forEach(t ->
|
|
|
- t.text(t.text().replace("\u00A0", " "))
|
|
|
- );
|
|
|
- }
|
|
|
-
|
|
|
- private static void removeCodeComments(Document doc) {
|
|
|
- doc.select("*").forEach(node ->
|
|
|
- node.childNodes().removeIf(n -> n.nodeName().equals("#comment"))
|
|
|
- );
|
|
|
- }
|
|
|
}
|