1
0

整合代码

This commit is contained in:
2026-02-15 20:25:28 +08:00
parent f30ea08805
commit f167aa2111
5 changed files with 28 additions and 96 deletions

View File

@@ -7,10 +7,24 @@ import zipfile
from typing import List, Optional, Tuple
# Inline Markdown image syntax: "![alt](target)" with a non-empty target.
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# A Markdown link "[text](target)" whose target contains one of the listed
# media/document file extensions, optionally followed by a quoted title.
# The pattern carries its own "^" and "$" anchors.
MEDIA_LINK_PATTERN = re.compile(
r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$'
)
# Text consisting solely of an RGB triple such as "R:255 G:0 B:12".
RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$")
def parse_with_markitdown(
    file_path: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Parse a file with the MarkItDown library.

    Args:
        file_path: Path of the document handed to ``MarkItDown.convert``.

    Returns:
        A ``(content, error)`` pair. On success ``content`` is the converted
        text and ``error`` is ``None``. Otherwise ``content`` is ``None`` and
        ``error`` is a message describing why: the converted text was blank,
        an ``ImportError`` was raised, or any other exception occurred.
    """
    try:
        # Imported lazily so a missing optional dependency becomes an error
        # message rather than an import-time failure of this module.
        from markitdown import MarkItDown

        converted = MarkItDown().convert(file_path)
        text = converted.text_content
        if text.strip():
            return text, None
        return None, "文档为空"
    except ImportError:
        return None, "MarkItDown 库未安装"
    except Exception as e:
        return None, f"MarkItDown 解析失败: {str(e)}"
def build_markdown_table(rows_data: List[List[str]]) -> str:
@@ -119,43 +133,6 @@ def remove_markdown_images(markdown_text: str) -> str:
return IMAGE_PATTERN.sub("", markdown_text)
def filter_markdown_content(content: str) -> str:
    """Filter Markdown content, keeping text, tables, lists and basic formatting.

    Each line is examined on its own. A line is dropped when, after stripping
    surrounding whitespace, it is empty, is a single-line HTML comment, starts
    with image syntax (``![``), contains an ``<img`` or ``</img>`` tag, or
    matches ``MEDIA_LINK_PATTERN`` or ``RGB_COLOR_PATTERN``. Surviving lines
    have ``<span>`` wrappers removed (inner text kept) and whitespace runs
    collapsed to a single space.

    Args:
        content: Markdown text to clean.

    Returns:
        The surviving lines joined with ``"\\n"``.
    """
    kept = []
    for raw_line in content.split("\n"):
        text = raw_line.strip()
        is_noise = (
            not text
            or (text.startswith("<!--") and text.endswith("-->"))
            or text.startswith("![")
            or "<img" in text
            or "</img>" in text
            or MEDIA_LINK_PATTERN.match(text)
            or RGB_COLOR_PATTERN.match(text)
        )
        if is_noise:
            continue
        # Two passes on purpose: styled spans first, then any remaining span,
        # so one level of nesting is unwrapped as well.
        unwrapped = re.sub(
            r'<span[^>]*style="[^"]*"[^>]*>(.*?)</span>', r"\1", raw_line
        )
        unwrapped = re.sub(r"<span[^>]*>(.*?)</span>", r"\1", unwrapped)
        collapsed = re.sub(r"\s+", " ", unwrapped).strip()
        if collapsed:
            kept.append(collapsed)
    return "\n".join(kept)
def get_heading_level(line: str) -> int:
"""获取 Markdown 行的标题级别1-6非标题返回 0"""
stripped = line.lstrip()