Ver Fonte

Use docling to convert to csv

atsachlaris há 1 dia atrás
pai
commit
b2fb0de8a3

+ 13 - 7
scripts/docling_extract.py

@@ -45,6 +45,10 @@ def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
 
     for line in markdown_text.splitlines():
         stripped = line.strip()
+        if not stripped:
+            # Docling markdown can include blank lines inside table regions.
+            # Do not break the current table because of empty lines.
+            continue
         is_table_line = "|" in stripped and stripped.startswith("|") and stripped.endswith("|")
         if not is_table_line:
             if current_table:
@@ -69,13 +73,13 @@ def normalize_tables_to_csv(markdown_text: str) -> str:
         return ""
 
     buffer = StringIO()
-    writer = csv.writer(buffer)
-    for index, table in enumerate(tables):
+    # Force LF line endings to avoid double-spacing on Windows consumers.
+    writer = csv.writer(buffer, lineterminator="\n")
+    for table in tables:
         for row in table:
             writer.writerow(row)
-        if index < len(tables) - 1:
-            writer.writerow([])
-    return buffer.getvalue().strip()
+    csv_text = buffer.getvalue().strip()
+    return csv_text.replace("\r\n", "\n").replace("\r", "\n")
 
 
 def read_input(args: argparse.Namespace) -> str:
@@ -95,7 +99,7 @@ def main() -> int:
     parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
     parser.add_argument("--input-file", help="Path to input file")
     parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
-    parser.add_argument("--output", choices=["html", "csv"], default="html", help="Output format")
+    parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
     args = parser.parse_args()
 
     temp_path = None
@@ -106,7 +110,9 @@ def main() -> int:
         converter = DocumentConverter()
         result = converter.convert(source)
         markdown_output = result.document.export_to_markdown()
-        if args.output == "csv":
+        if args.output == "markdown":
+            output_payload = markdown_output
+        elif args.output == "csv":
             output_payload = normalize_tables_to_csv(markdown_output)
         else:
             output_payload = normalize_tables_to_html(markdown_output)

+ 3 - 1
src/main/java/es/uv/saic/service/DoclingPythonClient.java

@@ -22,7 +22,7 @@ public class DoclingPythonClient {
         this.properties = properties;
     }
 
-    public String extractHtmlTables(ExtractionRequest request) {
+    public String extractTablesAsCsv(ExtractionRequest request) {
         if (!properties.isEnabled()) {
             throw new IllegalStateException("Docling extractor is disabled");
         }
@@ -32,6 +32,8 @@ public class DoclingPythonClient {
             List<String> command = new ArrayList<>();
             command.add(properties.getPythonCommand());
             command.add(Path.of(properties.getScriptPath()).toAbsolutePath().toString());
+            command.add("--output");
+            command.add("csv");
 
             Process process;
             if (request.hasFile()) {

+ 3 - 6
src/main/java/es/uv/saic/service/DoclingTableExtractor.java

@@ -10,16 +10,13 @@ import org.springframework.stereotype.Service;
 @Service
 public class DoclingTableExtractor implements TableExtractor {
     private final DoclingPythonClient doclingPythonClient;
-    private final HtmlToCsvExtractor htmlToCsvExtractor;
     private final DoclingProperties doclingProperties;
 
     public DoclingTableExtractor(
             DoclingPythonClient doclingPythonClient,
-            HtmlToCsvExtractor htmlToCsvExtractor,
             DoclingProperties doclingProperties
     ) {
         this.doclingPythonClient = doclingPythonClient;
-        this.htmlToCsvExtractor = htmlToCsvExtractor;
         this.doclingProperties = doclingProperties;
     }
 
@@ -35,11 +32,11 @@ public class DoclingTableExtractor implements TableExtractor {
             return "";
         }
 
-        String normalizedHtml = doclingPythonClient.extractHtmlTables(ExtractionRequest.fromHtml(filteredHtml));
-        if (StringUtils.isBlank(normalizedHtml)) {
+        String csvOutput = doclingPythonClient.extractTablesAsCsv(ExtractionRequest.fromHtml(filteredHtml));
+        if (StringUtils.isBlank(csvOutput)) {
             return "";
         }
-        return htmlToCsvExtractor.extractTablesToCsv(normalizedHtml);
+        return csvOutput;
     }
 
     private static String keepOnlyDataExtractionTables(String html) {

+ 2 - 2
src/test/java/es/uv/saic/service/DoclingPythonClientTest.java

@@ -13,7 +13,7 @@ class DoclingPythonClientTest {
 
         DoclingPythonClient client = new DoclingPythonClient(properties);
 
-        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
+        assertThrows(IllegalStateException.class, () -> client.extractTablesAsCsv(ExtractionRequest.fromHtml("<html/>")));
     }
 
     @Test
@@ -25,6 +25,6 @@ class DoclingPythonClientTest {
 
         DoclingPythonClient client = new DoclingPythonClient(properties);
 
-        assertThrows(IllegalStateException.class, () -> client.extractHtmlTables(ExtractionRequest.fromHtml("<html/>")));
+        assertThrows(IllegalStateException.class, () -> client.extractTablesAsCsv(ExtractionRequest.fromHtml("<html/>")));
     }
 }

+ 5 - 8
src/test/java/es/uv/saic/service/DoclingTableExtractorTest.java

@@ -13,10 +13,9 @@ class DoclingTableExtractorTest {
     @Test
     void extractTablesToCsv_sendsOnlyDataExtractionTablesToDocling() {
         DoclingPythonClient pythonClient = mock(DoclingPythonClient.class);
-        HtmlToCsvExtractor htmlToCsvExtractor = mock(HtmlToCsvExtractor.class);
         DoclingProperties properties = new DoclingProperties();
         properties.setEnabled(true);
-        DoclingTableExtractor extractor = new DoclingTableExtractor(pythonClient, htmlToCsvExtractor, properties);
+        DoclingTableExtractor extractor = new DoclingTableExtractor(pythonClient, properties);
 
         String html = """
                 <html><body>
@@ -25,17 +24,15 @@ class DoclingTableExtractorTest {
                 </body></html>
                 """;
 
-        when(pythonClient.extractHtmlTables(argThat(req ->
+        when(pythonClient.extractTablesAsCsv(argThat(req ->
                 req.hasRawHtml() &&
                         req.rawHtml().contains("data-extraction") &&
                         !req.rawHtml().contains("<table><tr><td>ignore")
-        ))).thenReturn("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>");
-        when(htmlToCsvExtractor.extractTablesToCsv("<table data-extraction='docling'><tr><td>k</td><td>1</td></tr></table>"))
-                .thenReturn("csv");
+        ))).thenReturn("k,1");
 
         String result = extractor.extractTablesToCsv(ExtractionRequest.fromHtml(html));
 
-        assertEquals("csv", result);
-        verify(pythonClient).extractHtmlTables(argThat(req -> req.hasRawHtml() && !req.hasFile()));
+        assertEquals("k,1", result);
+        verify(pythonClient).extractTablesAsCsv(argThat(req -> req.hasRawHtml() && !req.hasFile()));
     }
 }