|
@@ -45,26 +45,37 @@ public class EnhancementService {
|
|
|
|
|
|
|
|
Document doc = Jsoup.parse(html);
|
|
Document doc = Jsoup.parse(html);
|
|
|
|
|
|
|
|
|
|
+ removeCodeComments(doc);
|
|
|
|
|
+
|
|
|
Elements tables = doc.select("table[data-extraction]");
|
|
Elements tables = doc.select("table[data-extraction]");
|
|
|
|
|
|
|
|
Document cleanDoc = Document.createShell("");
|
|
Document cleanDoc = Document.createShell("");
|
|
|
|
|
+ cleanDoc.outputSettings().prettyPrint(false);
|
|
|
|
|
|
|
|
for (Element table : tables) {
|
|
for (Element table : tables) {
|
|
|
Element cleanTable = table.clone();
|
|
Element cleanTable = table.clone();
|
|
|
|
|
|
|
|
cleanTable.select("strong, span, p").unwrap();
|
|
cleanTable.select("strong, span, p").unwrap();
|
|
|
|
|
|
|
|
- for (Element el : cleanTable.getAllElements()) {
|
|
|
|
|
- el.clearAttributes();
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ cleanTable.select("*").forEach(Element::clearAttributes);
|
|
|
|
|
+
|
|
|
|
|
+ normalizeEmptySpaces(cleanTable);
|
|
|
|
|
|
|
|
cleanDoc.body().appendChild(cleanTable);
|
|
cleanDoc.body().appendChild(cleanTable);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- return cleanDoc.body().html()
|
|
|
|
|
- .replace(" ", "")
|
|
|
|
|
- .replaceAll("<!--.*?-->", "")
|
|
|
|
|
- .replaceAll(">\\s+<", "><") // remove spaces between tags
|
|
|
|
|
- .trim();
|
|
|
|
|
|
|
+ return cleanDoc.body().html().trim();
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static void normalizeEmptySpaces(Element cleanTable) {
|
|
|
|
|
+ cleanTable.textNodes().forEach(t ->
|
|
|
|
|
+ t.text(t.text().replace("\u00A0", " "))
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static void removeCodeComments(Document doc) {
|
|
|
|
|
+ doc.select("*").forEach(node ->
|
|
|
|
|
+ node.childNodes().removeIf(n -> n.nodeName().equals("#comment"))
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|