1
0

修复 bug:将对 docx / pdf / pptx / xlsx 模块的导入与调用改为 docx_parser / pdf_parser / pptx_parser / xlsx_parser

This commit is contained in:
2026-02-15 19:53:31 +08:00
parent b022ac736b
commit f30ea08805
6 changed files with 139 additions and 97 deletions

View File

@@ -6,10 +6,10 @@ import os
import sys
import common
import docx
import pdf
import pptx
import xlsx
import docx_parser
import pdf_parser
import pptx_parser
import xlsx_parser
def main() -> None:
@@ -64,27 +64,27 @@ def main() -> None:
if file_type == "docx":
parsers = [
("MarkItDown", docx.parse_docx_with_markitdown),
("python-docx", docx.parse_docx_with_python_docx),
("XML 原生解析", docx.parse_docx_with_xml),
("MarkItDown", docx_parser.parse_docx_with_markitdown),
("python-docx", docx_parser.parse_docx_with_python_docx),
("XML 原生解析", docx_parser.parse_docx_with_xml),
]
elif file_type == "pptx":
parsers = [
("MarkItDown", pptx.parse_pptx_with_markitdown),
("python-pptx", pptx.parse_pptx_with_python_pptx),
("XML 原生解析", pptx.parse_pptx_with_xml),
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
]
elif file_type == "xlsx":
parsers = [
("MarkItDown", xlsx.parse_xlsx_with_markitdown),
("pandas", xlsx.parse_xlsx_with_pandas),
("XML 原生解析", xlsx.parse_xlsx_with_xml),
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
("pandas", xlsx_parser.parse_xlsx_with_pandas),
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
]
else:
parsers = [
("MarkItDown", pdf.parse_pdf_with_markitdown),
("unstructured", pdf.parse_pdf_with_unstructured),
("pypdf", pdf.parse_pdf_with_pypdf),
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("unstructured", pdf_parser.parse_pdf_with_unstructured),
("pypdf", pdf_parser.parse_pdf_with_pypdf),
]
failures = []