diff --git a/temp/scripts/common.py b/temp/scripts/common.py index f65fabb..10cf99f 100644 --- a/temp/scripts/common.py +++ b/temp/scripts/common.py @@ -7,10 +7,24 @@ import zipfile from typing import List, Optional, Tuple IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)") -MEDIA_LINK_PATTERN = re.compile( - r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$' -) -RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$") + + +def parse_with_markitdown( + file_path: str, +) -> Tuple[Optional[str], Optional[str]]: + """使用 MarkItDown 库解析文件""" + try: + from markitdown import MarkItDown + + md = MarkItDown() + result = md.convert(file_path) + if not result.text_content.strip(): + return None, "文档为空" + return result.text_content, None + except ImportError: + return None, "MarkItDown 库未安装" + except Exception as e: + return None, f"MarkItDown 解析失败: {str(e)}" def build_markdown_table(rows_data: List[List[str]]) -> str: @@ -119,43 +133,6 @@ def remove_markdown_images(markdown_text: str) -> str: return IMAGE_PATTERN.sub("", markdown_text) -def filter_markdown_content(content: str) -> str: - """过滤 markdown 内容,保留文本、表格、列表和基本格式""" - lines = content.split("\n") - filtered_lines = [] - - for line in lines: - stripped = line.strip() - - if not stripped: - continue - - if stripped.startswith(""): - continue - - if stripped.startswith("![") or stripped.startswith("![]"): - continue - - if "" in stripped: - continue - - if MEDIA_LINK_PATTERN.match(stripped): - continue - - if RGB_COLOR_PATTERN.match(stripped): - continue - - line = re.sub(r']*style="[^"]*"[^>]*>(.*?)', r"\1", line) - line = re.sub(r"]*>(.*?)", r"\1", line) - - line = re.sub(r"\s+", " ", line).strip() - - if line: - filtered_lines.append(line) - - return "\n".join(filtered_lines) - - def get_heading_level(line: str) -> int: """获取 Markdown 行的标题级别(1-6),非标题返回 0""" stripped = line.lstrip() diff --git a/temp/scripts/docx_parser.py b/temp/scripts/docx_parser.py index cdf31b8..bcfb6f9 100644 --- a/temp/scripts/docx_parser.py +++ b/temp/scripts/docx_parser.py @@ -5,23 +5,12 @@ import xml.etree.ElementTree as ET import zipfile from typing import Any, List, Optional, Tuple -from common import build_markdown_table, safe_open_zip +from common import build_markdown_table, parse_with_markitdown, safe_open_zip def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: """使用 MarkItDown 库解析 DOCX 文件""" - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - return result.text_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" + return parse_with_markitdown(file_path) def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/temp/scripts/pdf_parser.py b/temp/scripts/pdf_parser.py index e593656..234f422 100644 --- a/temp/scripts/pdf_parser.py +++ b/temp/scripts/pdf_parser.py @@ -3,21 +3,12 @@ from typing import Optional, Tuple +from common import parse_with_markitdown + def parse_pdf_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: """使用 MarkItDown 库解析 PDF 文件""" - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - return result.text_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" + return parse_with_markitdown(file_path) def parse_pdf_with_unstructured(file_path: str) -> Tuple[Optional[str], Optional[str]]: diff --git a/temp/scripts/pptx_parser.py b/temp/scripts/pptx_parser.py index e8373b2..b03506e 100644 --- a/temp/scripts/pptx_parser.py +++ b/temp/scripts/pptx_parser.py @@ -6,28 +6,12 @@ import xml.etree.ElementTree as ET import zipfile from typing import Any, List, Optional, Tuple -from common import build_markdown_table, filter_markdown_content, flush_list_stack +from common import build_markdown_table, flush_list_stack, parse_with_markitdown def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: """使用 MarkItDown 库解析 PPTX 文件""" - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - - filtered_content = filter_markdown_content(result.text_content) - if not filtered_content.strip(): - return None, "过滤后文档为空" - - return filtered_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" + return parse_with_markitdown(file_path) def extract_formatted_text_pptx(runs: List[Any]) -> str: diff --git a/temp/scripts/xlsx_parser.py b/temp/scripts/xlsx_parser.py index 877c9d8..778fb72 100644 --- a/temp/scripts/xlsx_parser.py +++ b/temp/scripts/xlsx_parser.py @@ -5,21 +5,12 @@ import xml.etree.ElementTree as ET import zipfile from typing import List, Optional, Tuple +from common import parse_with_markitdown + def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: """使用 MarkItDown 库解析 XLSX 文件""" - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - return result.text_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" + return parse_with_markitdown(file_path) def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]]: