|
|
@@ -45,6 +45,10 @@ def parse_markdown_tables(markdown_text: str) -> list[list[list[str]]]:
|
|
|
|
|
|
for line in markdown_text.splitlines():
|
|
|
stripped = line.strip()
|
|
|
+ if not stripped:
|
|
|
+ # Docling markdown can include blank lines inside table regions.
|
|
|
+ # Do not break the current table because of empty lines.
|
|
|
+ continue
|
|
|
is_table_line = "|" in stripped and stripped.startswith("|") and stripped.endswith("|")
|
|
|
if not is_table_line:
|
|
|
if current_table:
|
|
|
@@ -69,13 +73,13 @@ def normalize_tables_to_csv(markdown_text: str) -> str:
|
|
|
return ""
|
|
|
|
|
|
buffer = StringIO()
|
|
|
- writer = csv.writer(buffer)
|
|
|
- for index, table in enumerate(tables):
|
|
|
+ # Force LF line endings to avoid double-spacing on Windows consumers.
|
|
|
+ writer = csv.writer(buffer, lineterminator="\n")
|
|
|
+ for table in tables:
|
|
|
for row in table:
|
|
|
writer.writerow(row)
|
|
|
- if index < len(tables) - 1:
|
|
|
- writer.writerow([])
|
|
|
- return buffer.getvalue().strip()
|
|
|
+ csv_text = buffer.getvalue().strip()
|
|
|
+ return csv_text.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
def read_input(args: argparse.Namespace) -> str:
|
|
|
@@ -95,7 +99,7 @@ def main() -> int:
|
|
|
parser = argparse.ArgumentParser(description="Extract HTML tables using Docling")
|
|
|
parser.add_argument("--input-file", help="Path to input file")
|
|
|
parser.add_argument("--stdin", action="store_true", help="Read UTF-8 text from stdin")
|
|
|
- parser.add_argument("--output", choices=["html", "csv"], default="html", help="Output format")
|
|
|
+ parser.add_argument("--output", choices=["markdown", "html", "csv"], default="markdown", help="Output format")
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
temp_path = None
|
|
|
@@ -106,7 +110,9 @@ def main() -> int:
|
|
|
converter = DocumentConverter()
|
|
|
result = converter.convert(source)
|
|
|
markdown_output = result.document.export_to_markdown()
|
|
|
- if args.output == "csv":
|
|
|
+ if args.output == "markdown":
|
|
|
+ output_payload = markdown_output
|
|
|
+ elif args.output == "csv":
|
|
|
output_payload = normalize_tables_to_csv(markdown_output)
|
|
|
else:
|
|
|
output_payload = normalize_tables_to_html(markdown_output)
|