"""XLSX 文件阅读器,支持多种解析方法。""" import os from typing import List, Optional, Tuple from scripts.readers.base import BaseReader from scripts.utils import is_valid_xlsx from . import docling from . import unstructured from . import markitdown from . import pandas from . import native_xml PARSERS = [ ("docling", docling.parse), ("unstructured", unstructured.parse), ("MarkItDown", markitdown.parse), ("pandas", pandas.parse), ("XML 原生解析", native_xml.parse), ] class XlsxReader(BaseReader): """XLSX 文件阅读器""" def supports(self, file_path: str) -> bool: return file_path.lower().endswith('.xlsx') def parse(self, file_path: str) -> Tuple[Optional[str], List[str]]: failures = [] # 检查文件是否存在 if not os.path.exists(file_path): return None, ["文件不存在"] # 验证文件格式 if not is_valid_xlsx(file_path): return None, ["不是有效的 XLSX 文件"] content = None for parser_name, parser_func in PARSERS: content, error = parser_func(file_path) if content is not None: return content, failures else: failures.append(f"- {parser_name}: {error}") return None, failures