增加unstructured处理策略
This commit is contained in:
@@ -65,6 +65,7 @@ def main() -> None:
|
||||
if file_type == "docx":
|
||||
parsers = [
|
||||
("docling", docx_parser.parse_docx_with_docling),
|
||||
+ ("unstructured", docx_parser.parse_docx_with_unstructured),
|
||||
("pypandoc-binary", docx_parser.parse_docx_with_pypandoc),
|
||||
("MarkItDown", docx_parser.parse_docx_with_markitdown),
|
||||
("python-docx", docx_parser.parse_docx_with_python_docx),
|
||||
@@ -73,6 +74,7 @@ def main() -> None:
|
||||
elif file_type == "pptx":
|
||||
parsers = [
|
||||
("docling", pptx_parser.parse_pptx_with_docling),
|
||||
+ ("unstructured", pptx_parser.parse_pptx_with_unstructured),
|
||||
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
|
||||
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
|
||||
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
|
||||
@@ -80,6 +82,7 @@ def main() -> None:
|
||||
elif file_type == "xlsx":
|
||||
parsers = [
|
||||
("docling", xlsx_parser.parse_xlsx_with_docling),
|
||||
+ ("unstructured", xlsx_parser.parse_xlsx_with_unstructured),
|
||||
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
|
||||
("pandas", xlsx_parser.parse_xlsx_with_pandas),
|
||||
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
|
||||
@@ -87,8 +90,8 @@ def main() -> None:
|
||||
else:
|
||||
parsers = [
|
||||
("docling", pdf_parser.parse_pdf_with_docling),
|
||||
- ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("unstructured", pdf_parser.parse_pdf_with_unstructured),
|
||||
+ ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("pypdf", pdf_parser.parse_pdf_with_pypdf),
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user