Files
lyxy-document/scripts/lyxy_document_reader.py
lanyuanxiaoyao 15b63800a8 refactor: 将核心代码迁移到 scripts 目录
- 创建 scripts/ 目录作为核心代码根目录
- 移动 core/, readers/, utils/ 到 scripts/ 下
- 移动 config.py, lyxy_document_reader.py 到 scripts/
- 移动 encoding_detection.py 到 scripts/utils/
- 更新 pyproject.toml 中的入口点路径和 pytest 配置
- 更新所有内部导入语句为 scripts.* 模块
- 更新 README.md 目录结构说明
- 更新 openspec/config.yaml 添加目录结构说明
- 删除无用的 main.py

此变更使项目结构更清晰,便于区分核心代码与测试、文档等支撑文件。
2026-03-08 17:41:03 +08:00

101 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""文档解析器命令行交互模块,提供命令行接口。支持 DOCX、PPTX、XLSX、PDF、HTML 和 URL。"""
import argparse
import logging
import os
import sys
import warnings
# 抑制第三方库的进度条和日志,仅保留解析结果输出
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TQDM_DISABLE"] = "1"
warnings.filterwarnings("ignore")
# 配置日志系统,只输出 ERROR 级别
logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')
# 设置第三方库日志等级
logging.getLogger('docling').setLevel(logging.ERROR)
logging.getLogger('unstructured').setLevel(logging.ERROR)
from scripts.core import (
FileDetectionError,
ReaderNotFoundError,
output_result,
parse_input,
process_content,
)
from scripts.readers import READERS
def main() -> None:
parser = argparse.ArgumentParser(
description="将 DOCX、PPTX、XLSX、PDF、HTML 文件或 URL 解析为 Markdown"
)
parser.add_argument("input_path", help="DOCX、PPTX、XLSX、PDF、HTML 文件或 URL")
parser.add_argument(
"-n",
"--context",
type=int,
default=2,
help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数"
)
group.add_argument(
"-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数"
)
group.add_argument(
"-t",
"--titles",
action="store_true",
help="返回解析后的 markdown 文档的标题行1-6级",
)
group.add_argument(
"-tc",
"--title-content",
help="指定标题名称,输出该标题及其下级内容(不包含#号)",
)
group.add_argument(
"-s",
"--search",
help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)",
)
args = parser.parse_args()
# 实例化所有 readers
readers = [ReaderCls() for ReaderCls in READERS]
try:
content, failures = parse_input(args.input_path, readers)
except FileDetectionError as e:
print(f"错误: {e}")
sys.exit(1)
except ReaderNotFoundError as e:
print(f"错误: {e}")
sys.exit(1)
if content is None:
print("所有解析方法均失败:")
for failure in failures:
print(failure)
sys.exit(1)
# 处理内容
content = process_content(content)
# 输出结果
output_result(content, args)
if __name__ == "__main__":
main()