refactor: 重新梳理 DEPENDENCIES 版本和 python 版本

- default.python 全部改为 None(使用默认 python)
- 所有依赖都指定版本约束(截至 2026-03-17 的最新版;Darwin-x86_64 下的 numpy<2、pandas<3.0.0 为上限约束)
- 为 unstructured[...]、domscribe 等未指定版本的依赖添加版本
- 更新 markdownify、pypandoc-binary、tabulate、trafilatura、html2text、chardet、xlrd 等依赖版本
- html 的 selenium 降级到 4.25.0,以解决 urllib3 依赖冲突
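- 为 html 新增 Darwin-x86_64 配置(python 3.12,依赖列表与 default 相同,不含 docling);pdf/docx/xlsx/pptx/xls/ppt 已有的 Darwin-x86_64 配置同步更新依赖版本(其中使用 docling 的 pdf/docx/xlsx/pptx/ppt 保持 python 3.12 + docling 2.40.0 + docling-parse 4.0.0 + numpy<2)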
- 更新测试期望 python_ver 为 None
This commit is contained in:
2026-03-17 13:15:00 +08:00
parent 89ffc88082
commit 5cc347589b
4 changed files with 165 additions and 49 deletions

View File

@@ -24,13 +24,13 @@ class Config:
DEPENDENCIES = {
"pdf": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"docling==2.80.0",
"unstructured[pdf]",
"unstructured[pdf]==0.21.5",
"markitdown[pdf]==0.1.5",
"pypdf==6.9.0",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
},
"Darwin-x86_64": {
@@ -41,20 +41,20 @@ DEPENDENCIES = {
"numpy<2",
"markitdown[pdf]==0.1.5",
"pypdf==6.9.0",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
}
},
"docx": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"docling==2.80.0",
"unstructured[docx]",
"unstructured[docx]==0.21.5",
"markitdown[docx]==0.1.5",
"pypandoc-binary==1.13",
"pypandoc-binary==1.17",
"python-docx==1.2.0",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
},
"Darwin-x86_64": {
@@ -64,21 +64,21 @@ DEPENDENCIES = {
"docling-parse==4.0.0",
"numpy<2",
"markitdown[docx]==0.1.5",
"pypandoc-binary==1.13",
"pypandoc-binary==1.17",
"python-docx==1.2.0",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
}
},
"xlsx": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"docling==2.80.0",
"unstructured[xlsx]",
"unstructured[xlsx]==0.21.5",
"markitdown[xlsx]==0.1.5",
"pandas==3.0.1",
"tabulate==0.9.0",
"tabulate==0.10.0",
"openpyxl==3.1.5"
]
},
@@ -90,20 +90,20 @@ DEPENDENCIES = {
"numpy<2",
"markitdown[xlsx]==0.1.5",
"pandas<3.0.0",
"tabulate==0.9.0",
"tabulate==0.10.0",
"openpyxl==3.1.5"
]
}
},
"pptx": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"docling==2.80.0",
"unstructured[pptx]",
"unstructured[pptx]==0.21.5",
"markitdown[pptx]==0.1.5",
"python-pptx==1.0.2",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
},
"Darwin-x86_64": {
@@ -114,21 +114,35 @@ DEPENDENCIES = {
"numpy<2",
"markitdown[pptx]==0.1.5",
"python-pptx==1.0.2",
"markdownify==0.13.1"
"markdownify==1.2.2"
]
}
},
"html": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"trafilatura==1.12.2",
"domscribe",
"trafilatura==2.0.0",
"domscribe==0.1.3",
"markitdown==0.1.5",
"html2text==2024.2.26",
"html2text==2025.4.15",
"beautifulsoup4==4.14.3",
"httpx==0.28.1",
"chardet==5.2.0",
"chardet==7.1.0",
"pyppeteer==2.0.0",
"selenium==4.25.0"
]
},
"Darwin-x86_64": {
"python": "3.12",
"dependencies": [
"trafilatura==2.0.0",
"domscribe==0.1.3",
"markitdown==0.1.5",
"html2text==2025.4.15",
"beautifulsoup4==4.14.3",
"httpx==0.28.1",
"chardet==7.1.0",
"pyppeteer==2.0.0",
"selenium==4.25.0"
]
@@ -136,13 +150,13 @@ DEPENDENCIES = {
},
"xls": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"unstructured[xlsx]",
"unstructured[xlsx]==0.21.5",
"markitdown[xls]==0.1.5",
"pandas==3.0.1",
"tabulate==0.9.0",
"xlrd==2.0.1",
"tabulate==0.10.0",
"xlrd==2.0.2",
"olefile==0.47"
]
},
@@ -151,8 +165,8 @@ DEPENDENCIES = {
"dependencies": [
"markitdown[xls]==0.1.5",
"pandas<3.0.0",
"tabulate==0.9.0",
"xlrd==2.0.1",
"tabulate==0.10.0",
"xlrd==2.0.2",
"olefile==0.47",
"openpyxl==3.1.5"
]
@@ -160,19 +174,19 @@ DEPENDENCIES = {
},
"doc": {
"default": {
"python": "3.12",
"python": None,
"dependencies": []
}
},
"ppt": {
"default": {
"python": "3.12",
"python": None,
"dependencies": [
"docling==2.80.0",
"unstructured[pptx]",
"unstructured[pptx]==0.21.5",
"markitdown[pptx]==0.1.5",
"python-pptx==1.0.2",
"markdownify==0.13.1",
"markdownify==1.2.2",
"olefile==0.47"
]
},
@@ -184,7 +198,7 @@ DEPENDENCIES = {
"numpy<2",
"markitdown[pptx]==0.1.5",
"python-pptx==1.0.2",
"markdownify==0.13.1",
"markdownify==1.2.2",
"olefile==0.47"
]
}