"""DOCX 文件阅读器,支持多种解析方法。""" import os from typing import List, Optional, Tuple from scripts.readers.base import BaseReader from scripts.utils import is_valid_docx from . import docling from . import unstructured from . import markitdown from . import pypandoc from . import python_docx from . import native_xml PARSERS = [ ("docling", docling.parse), ("unstructured", unstructured.parse), ("pypandoc-binary", pypandoc.parse), ("MarkItDown", markitdown.parse), ("python-docx", python_docx.parse), ("XML 原生解析", native_xml.parse), ] class DocxReader(BaseReader): """DOCX 文件阅读器""" @property def supported_extensions(self) -> List[str]: return [".docx"] def supports(self, file_path: str) -> bool: return file_path.lower().endswith('.docx') def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: failures = [] # 检查文件是否存在 if not os.path.exists(file_path): return None, ["文件不存在"] # 验证文件格式 if not is_valid_docx(file_path): return None, ["不是有效的 DOCX 文件"] content = None for parser_name, parser_func in PARSERS: content, error = parser_func(file_path) if content is not None: return content, failures else: failures.append(f"- {parser_name}: {error}") return None, failures