1
0

增加unstructured处理策略

This commit is contained in:
2026-02-17 20:12:26 +08:00
parent 856700fbe0
commit c693e23888
7 changed files with 603 additions and 730 deletions

View File

@@ -65,6 +65,7 @@ def main() -> None:
if file_type == "docx":
parsers = [
("docling", docx_parser.parse_docx_with_docling),
("unstructured", docx_parser.parse_docx_with_unstructured),
("pypandoc-binary", docx_parser.parse_docx_with_pypandoc),
("MarkItDown", docx_parser.parse_docx_with_markitdown),
("python-docx", docx_parser.parse_docx_with_python_docx),
@@ -73,6 +74,7 @@ def main() -> None:
elif file_type == "pptx":
parsers = [
("docling", pptx_parser.parse_pptx_with_docling),
("unstructured", pptx_parser.parse_pptx_with_unstructured),
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
@@ -80,6 +82,7 @@ def main() -> None:
elif file_type == "xlsx":
parsers = [
("docling", xlsx_parser.parse_xlsx_with_docling),
("unstructured", xlsx_parser.parse_xlsx_with_unstructured),
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
("pandas", xlsx_parser.parse_xlsx_with_pandas),
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
@@ -87,8 +90,8 @@ def main() -> None:
else:
parsers = [
("docling", pdf_parser.parse_pdf_with_docling),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]