"""Unified configuration class that centralizes every configuration item."""


class Config:
    """Unified configuration, exposed as plain class-level constants.

    Nothing here is computed; read the values directly as class attributes,
    e.g. ``Config.DOWNLOAD_TIMEOUT``. The list and dict values are ordinary
    mutable objects shared through the class, so copy them before modifying.
    """

    # --- Encoding detection ---
    # Fallback encodings, tried in this order when chardet detection fails.
    FALLBACK_ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin-1']

    # --- HTML download ---
    # Download timeout, in seconds.
    DOWNLOAD_TIMEOUT = 30
    # HTTP User-Agent identifier.
    USER_AGENT = "lyxy-document/0.1.0"

    # --- Logging ---
    # Log level; defaults to ERROR only, so log output does not interfere
    # with the Markdown output.
    LOG_LEVEL = "ERROR"

    # --- Dependencies, organized by file type and then by platform ---
    # Each platform entry holds a Python version requirement ("python";
    # None means use the default) and a list of requirement strings
    # ("dependencies"). "default" is the entry present for every file type.
    # NOTE(review): the "Darwin-x86_64" key looks like
    # "<platform.system()>-<platform.machine()>" -- confirm against the
    # code that performs the lookup.
    DEPENDENCIES = {
        "pdf": {
            "default": {
                "python": None,
                "dependencies": [
                    "docling",
                    "unstructured[pdf]",
                    "markitdown[pdf]",
                    "pypdf",
                    "markdownify",
                    "chardet",
                ],
            },
            # NOTE(review): compared with "default", this entry omits
            # "unstructured[pdf]" and adds pins (docling, docling-parse,
            # numpy<2, Python 3.12). The reason is not stated in this
            # file -- presumably Intel-macOS compatibility; confirm
            # before changing any of the pins.
            "Darwin-x86_64": {
                "python": "3.12",
                "dependencies": [
                    "docling==2.40.0",
                    "docling-parse==4.0.0",
                    "numpy<2",
                    "markitdown[pdf]",
                    "pypdf",
                    "markdownify",
                    "chardet",
                ],
            },
        },
        "docx": {
            "default": {
                "python": None,
                "dependencies": [
                    "docling",
                    "unstructured[docx]",
                    "markitdown[docx]",
                    "pypandoc-binary",
                    "python-docx",
                    "markdownify",
                    "chardet",
                ],
            },
        },
        "xlsx": {
            "default": {
                "python": None,
                "dependencies": [
                    "docling",
                    "unstructured[xlsx]",
                    "markitdown[xlsx]",
                    "pandas",
                    "tabulate",
                    "chardet",
                ],
            },
        },
        "pptx": {
            "default": {
                "python": None,
                "dependencies": [
                    "docling",
                    "unstructured[pptx]",
                    "markitdown[pptx]",
                    "python-pptx",
                    "markdownify",
                    "chardet",
                ],
            },
        },
        "html": {
            "default": {
                "python": None,
                "dependencies": [
                    "trafilatura",
                    "domscribe",
                    "markitdown",
                    "html2text",
                    "beautifulsoup4",
                    "httpx",
                    "chardet",
                    "pyppeteer",
                    "selenium",
                ],
            },
        },
    }