整合代码
This commit is contained in:
@@ -7,10 +7,24 @@ import zipfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
# Inline Markdown image syntax: ![alt](target).
IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)")

# A whole line consisting of one Markdown link whose target contains one of
# the listed media/document extensions, optionally followed by a quoted title.
MEDIA_LINK_PATTERN = re.compile(
    r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$'
)

# A whole line of the form "R:<digits> G:<digits> B:<digits>".
RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$")
|
||||
|
||||
|
||||
def parse_with_markitdown(
    file_path: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Convert a file to Markdown text using the MarkItDown library.

    Args:
        file_path: Path of the document passed to ``MarkItDown().convert``.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, message)`` where the
        message says whether the library is missing, the conversion raised,
        or the converted text was empty / whitespace-only.
    """
    # Guard only the import with ImportError, so that an ImportError raised
    # later, during conversion, is reported as a parse failure instead of
    # being mistaken for "library not installed".
    try:
        from markitdown import MarkItDown
    except ImportError:
        return None, "MarkItDown 库未安装"

    try:
        text = MarkItDown().convert(file_path).text_content
    except Exception as e:
        return None, f"MarkItDown 解析失败: {e}"

    # A missing (None) text payload is treated the same as empty text.
    if not text or not text.strip():
        return None, "文档为空"
    return text, None
|
||||
|
||||
|
||||
def build_markdown_table(rows_data: List[List[str]]) -> str:
|
||||
@@ -119,43 +133,6 @@ def remove_markdown_images(markdown_text: str) -> str:
|
||||
return IMAGE_PATTERN.sub("", markdown_text)
|
||||
|
||||
|
||||
def filter_markdown_content(content: str) -> str:
    """Filter Markdown content, keeping text, tables, lists and basic formatting.

    A line is dropped when, after stripping, it is empty, is a single-line
    HTML comment, starts with image syntax (``![``), contains ``<img`` or
    ``</img>``, or matches ``MEDIA_LINK_PATTERN`` / ``RGB_COLOR_PATTERN``.
    In the remaining lines ``<span>`` wrappers are removed (their inner text
    is kept) and every run of whitespace is collapsed to a single space.

    Args:
        content: Markdown text; lines are separated on ``"\\n"``.

    Returns:
        The surviving lines joined with ``"\\n"``.
    """
    # Loop-invariant patterns, compiled once instead of once per line.
    # The two span passes are intentionally separate: styled spans are
    # unwrapped first, then whatever spans remain.
    styled_span = re.compile(r'<span[^>]*style="[^"]*"[^>]*>(.*?)</span>')
    any_span = re.compile(r"<span[^>]*>(.*?)</span>")
    whitespace_run = re.compile(r"\s+")

    filtered_lines = []
    for line in content.split("\n"):
        stripped = line.strip()

        if not stripped:
            continue
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            continue
        # "![" already covers the "![]" prefix as well.
        if stripped.startswith("!["):
            continue
        if "<img" in stripped or "</img>" in stripped:
            continue
        if MEDIA_LINK_PATTERN.match(stripped):
            continue
        if RGB_COLOR_PATTERN.match(stripped):
            continue

        line = styled_span.sub(r"\1", line)
        line = any_span.sub(r"\1", line)
        line = whitespace_run.sub(" ", line).strip()

        # Unwrapping spans can leave nothing behind; skip such lines.
        if line:
            filtered_lines.append(line)

    return "\n".join(filtered_lines)
|
||||
|
||||
|
||||
def get_heading_level(line: str) -> int:
|
||||
"""获取 Markdown 行的标题级别(1-6),非标题返回 0"""
|
||||
stripped = line.lstrip()
|
||||
|
||||
@@ -5,23 +5,12 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from common import build_markdown_table, safe_open_zip
|
||||
from common import build_markdown_table, parse_with_markitdown, safe_open_zip
|
||||
|
||||
|
||||
def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a DOCX file with the MarkItDown library.

    Delegates to the shared ``parse_with_markitdown`` helper instead of
    repeating the import / convert / empty-check sequence inline.

    Args:
        file_path: Path of the DOCX file to convert.

    Returns:
        The helper's result: ``(text, None)`` on success or
        ``(None, error_message)`` on failure.
    """
    return parse_with_markitdown(file_path)
|
||||
|
||||
|
||||
def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -3,21 +3,12 @@
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from common import parse_with_markitdown
|
||||
|
||||
|
||||
def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PDF file with the MarkItDown library.

    Delegates to the shared ``parse_with_markitdown`` helper instead of
    repeating the import / convert / empty-check sequence inline.

    Args:
        file_path: Path of the PDF file to convert.

    Returns:
        The helper's result: ``(text, None)`` on success or
        ``(None, error_message)`` on failure.
    """
    return parse_with_markitdown(file_path)
|
||||
|
||||
|
||||
def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
@@ -6,28 +6,12 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
from common import build_markdown_table, filter_markdown_content, flush_list_stack
|
||||
from common import build_markdown_table, flush_list_stack, parse_with_markitdown
|
||||
|
||||
|
||||
def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PPTX file with the MarkItDown library and filter the result.

    Conversion is delegated to the shared ``parse_with_markitdown`` helper;
    the converted text is then passed through ``filter_markdown_content``.

    Args:
        file_path: Path of the PPTX file to convert.

    Returns:
        ``(filtered_text, None)`` on success, or ``(None, error_message)``
        when conversion fails or nothing is left after filtering.
    """
    content, error = parse_with_markitdown(file_path)
    if content is None:
        # Conversion failed or produced empty text; pass the message through.
        return None, error

    # NOTE(review): the trailing plain delegation in the original suggests
    # filtering may be meant to go away — confirm before removing this step.
    filtered_content = filter_markdown_content(content)
    if not filtered_content.strip():
        return None, "过滤后文档为空"

    return filtered_content, None
|
||||
|
||||
|
||||
def extract_formatted_text_pptx(runs: List[Any]) -> str:
|
||||
|
||||
@@ -5,21 +5,12 @@ import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from common import parse_with_markitdown
|
||||
|
||||
|
||||
def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file with the MarkItDown library.

    Delegates to the shared ``parse_with_markitdown`` helper instead of
    repeating the import / convert / empty-check sequence inline.

    Args:
        file_path: Path of the XLSX file to convert.

    Returns:
        The helper's result: ``(text, None)`` on success or
        ``(None, error_message)`` on failure.
    """
    return parse_with_markitdown(file_path)
|
||||
|
||||
|
||||
def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
|
||||
Reference in New Issue
Block a user