1
0

增加docling作为解析器

This commit is contained in:
2026-02-15 23:17:41 +08:00
parent 4324699a3d
commit 5b362686e2
7 changed files with 106 additions and 20 deletions

View File

@@ -64,6 +64,7 @@ def main() -> None:
if file_type == "docx":
parsers = [
("docling", docx_parser.parse_docx_with_docling),
("pypandoc-binary", docx_parser.parse_docx_with_pypandoc),
("MarkItDown", docx_parser.parse_docx_with_markitdown),
("python-docx", docx_parser.parse_docx_with_python_docx),
@@ -71,18 +72,21 @@ def main() -> None:
]
elif file_type == "pptx":
parsers = [
("docling", pptx_parser.parse_pptx_with_docling),
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
]
elif file_type == "xlsx":
parsers = [
("docling", xlsx_parser.parse_xlsx_with_docling),
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
("pandas", xlsx_parser.parse_xlsx_with_pandas),
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
]
else:
parsers = [
("docling", pdf_parser.parse_pdf_with_docling),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("pypdf", pdf_parser.parse_pdf_with_pypdf),