|
@@ -1,5 +1,9 @@
|
|
|
-package es.uv.saic.service;
|
|
|
|
|
|
|
+package es.uv.saic.extractor.docling;
|
|
|
|
|
|
|
|
|
|
+import es.uv.saic.service.ExtractionRequest;
|
|
|
|
|
+import es.uv.saic.extractor.TableExtractor;
|
|
|
|
|
+import groovy.util.logging.Slf4j;
|
|
|
|
|
+import lombok.RequiredArgsConstructor;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Document;
|
|
@@ -7,19 +11,14 @@ import org.jsoup.nodes.Element;
|
|
|
import org.jsoup.select.Elements;
|
|
import org.jsoup.select.Elements;
|
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
|
|
+@lombok.extern.slf4j.Slf4j
|
|
|
@Service
|
|
@Service
|
|
|
|
|
+@RequiredArgsConstructor
|
|
|
|
|
+@Slf4j
|
|
|
public class DoclingTableExtractor implements TableExtractor {
|
|
public class DoclingTableExtractor implements TableExtractor {
|
|
|
private final DoclingPythonClient doclingPythonClient;
|
|
private final DoclingPythonClient doclingPythonClient;
|
|
|
private final DoclingProperties doclingProperties;
|
|
private final DoclingProperties doclingProperties;
|
|
|
|
|
|
|
|
- public DoclingTableExtractor(
|
|
|
|
|
- DoclingPythonClient doclingPythonClient,
|
|
|
|
|
- DoclingProperties doclingProperties
|
|
|
|
|
- ) {
|
|
|
|
|
- this.doclingPythonClient = doclingPythonClient;
|
|
|
|
|
- this.doclingProperties = doclingProperties;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
@Override
|
|
@Override
|
|
|
public boolean supports(ExtractionRequest request) {
|
|
public boolean supports(ExtractionRequest request) {
|
|
|
return doclingProperties.isEnabled() && request.hasRawHtml();
|
|
return doclingProperties.isEnabled() && request.hasRawHtml();
|
|
@@ -27,6 +26,7 @@ public class DoclingTableExtractor implements TableExtractor {
|
|
|
|
|
|
|
|
@Override
|
|
@Override
|
|
|
public String extractTablesToCsv(ExtractionRequest request) {
|
|
public String extractTablesToCsv(ExtractionRequest request) {
|
|
|
|
|
+ log.info("Extracting tables to CSV using DoclingPythonClient");
|
|
|
String filteredHtml = keepOnlyDataExtractionTables(request.rawHtml());
|
|
String filteredHtml = keepOnlyDataExtractionTables(request.rawHtml());
|
|
|
if (StringUtils.isBlank(filteredHtml)) {
|
|
if (StringUtils.isBlank(filteredHtml)) {
|
|
|
return "";
|
|
return "";
|