From 8c27b08fdcd086dfe0ea3e2255c32f8da98e0164 Mon Sep 17 00:00:00 2001 From: lanyuanxiaoyao Date: Sat, 14 Feb 2026 21:11:37 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E5=A4=9A=E6=96=87=E6=A1=A3?= =?UTF-8?q?=E8=AF=BB=E5=8F=96=E7=9A=84=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- temp/document_parser.py | 1245 ----------------- .../{document_parser.md => scripts/README.md} | 175 ++- temp/scripts/common.py | 303 ++++ temp/scripts/docx.py | 268 ++++ temp/scripts/parser.py | 126 ++ temp/scripts/pptx.py | 339 +++++ temp/scripts/xlsx.py | 242 ++++ 7 files changed, 1410 insertions(+), 1288 deletions(-) delete mode 100644 temp/document_parser.py rename temp/{document_parser.md => scripts/README.md} (58%) create mode 100644 temp/scripts/common.py create mode 100644 temp/scripts/docx.py create mode 100644 temp/scripts/parser.py create mode 100644 temp/scripts/pptx.py create mode 100644 temp/scripts/xlsx.py diff --git a/temp/document_parser.py b/temp/document_parser.py deleted file mode 100644 index b7db3ab..0000000 --- a/temp/document_parser.py +++ /dev/null @@ -1,1245 +0,0 @@ -#!/usr/bin/env python3 -"""整合的文档解析器,支持 DOCX、PPTX 和 XLSX 文件: -按优先级尝试多种解析方法: -1. MarkItDown (微软官方库) -2. python-docx、python-pptx 或 pandas (成熟的 Python 库) -3. 
XML 原生解析 (备选方案) - -代码风格要求: -- Python 3.6+ 兼容 -- 遵循 PEP 8 规范 -- 所有公共 API 函数添加类型提示 -- 字符串优先内联使用,不提取为常量,除非被使用超过3次 -- 其他被多次使用的对象根据具体情况可考虑被提取为常量(如正则表达式) -- 模块级和公共 API 函数保留文档字符串 -- 内部辅助函数不添加文档字符串(函数名足够描述) -- 变量命名清晰,避免单字母变量名 -""" - -import argparse -import os -import re -import sys -import zipfile -import xml.etree.ElementTree as ET -from typing import Any, List, Optional, Tuple - -IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)") -MEDIA_LINK_PATTERN = re.compile( - r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$' -) -RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$") - - -def build_markdown_table(rows_data: List[List[str]]) -> str: - """将二维列表转换为 Markdown 表格格式""" - if not rows_data or not rows_data[0]: - return "" - - md_lines = [] - for i, row_data in enumerate(rows_data): - row_text = [cell if cell else "" for cell in row_data] - md_lines.append("| " + " | ".join(row_text) + " |") - if i == 0: - md_lines.append("|" + " | ".join(["---"] * len(row_text)) + " |") - return "\n".join(md_lines) + "\n\n" - - -def flush_list_stack(list_stack: List[str], target: List[str]) -> None: - """将列表堆栈中的非空项添加到目标列表并清空堆栈""" - for item in list_stack: - if item: - target.append(item + "\n") - list_stack.clear() - - -def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipExtFile]: - """安全地从 ZipFile 中打开文件,防止路径遍历攻击""" - if not name: - return None - if name.startswith("/") or name.startswith("\\"): - return None - if name.startswith(".."): - return None - if "/../" in name or name.endswith("/.."): - return None - if "\\" in name: - return None - if "/" not in name: - return None - return zip_file.open(name) - - -def normalize_markdown_whitespace(content: str) -> str: - lines = content.split("\n") - result = [] - empty_count = 0 - - for line in lines: - stripped = line.strip() - if not stripped: - empty_count += 1 - if empty_count == 1: - result.append(line) - else: - empty_count = 0 - result.append(line) - - return "\n".join(result) - - 
-def is_valid_docx(file_path: str) -> bool: - try: - with zipfile.ZipFile(file_path, "r") as zip_file: - required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True - except (zipfile.BadZipFile, zipfile.LargeZipFile): - return False - - -def is_valid_pptx(file_path: str) -> bool: - try: - with zipfile.ZipFile(file_path, "r") as zip_file: - required_files = [ - "[Content_Types].xml", - "_rels/.rels", - "ppt/presentation.xml", - ] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True - except (zipfile.BadZipFile, zipfile.LargeZipFile): - return False - - -def is_valid_xlsx(file_path: str) -> bool: - try: - with zipfile.ZipFile(file_path, "r") as zip_file: - required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True - except (zipfile.BadZipFile, zipfile.LargeZipFile): - return False - - -def remove_markdown_images(markdown_text: str) -> str: - return IMAGE_PATTERN.sub("", markdown_text) - - -def filter_markdown_content(content: str) -> str: - """过滤 markdown 内容,保留文本、表格、列表和基本格式""" - lines = content.split("\n") - filtered_lines = [] - - for line in lines: - stripped = line.strip() - - if not stripped: - continue - - if stripped.startswith(""): - continue - - if stripped.startswith("![") or stripped.startswith("![]"): - continue - - if "" in stripped: - continue - - if MEDIA_LINK_PATTERN.match(stripped): - continue - - if RGB_COLOR_PATTERN.match(stripped): - continue - - line = re.sub(r']*style="[^"]*"[^>]*>(.*?)', r"\1", line) - line = re.sub(r"]*>(.*?)", r"\1", line) - - line = re.sub(r"\s+", " ", line).strip() - - if line: - filtered_lines.append(line) - - return "\n".join(filtered_lines) - - -def extract_titles(markdown_text: str) -> List[str]: - """提取 markdown 文本中的所有标题行(1-6级)""" 
- title_lines = [] - for line in markdown_text.split("\n"): - if get_heading_level(line) > 0: - title_lines.append(line.lstrip()) - return title_lines - - -def get_heading_level(line: str) -> int: - stripped = line.lstrip() - if not stripped.startswith("#"): - return 0 - level = 0 - for char in stripped: - if char == "#": - level += 1 - else: - break - return level if 1 <= level <= 6 else 0 - - -def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]: - """提取所有指定标题及其下级内容(每个包含上级标题)""" - lines = markdown_text.split("\n") - match_indices = [] - - for i, line in enumerate(lines): - level = get_heading_level(line) - if level > 0: - stripped = line.lstrip() - title_text = stripped[level:].strip() - if title_text == title_name: - match_indices.append(i) - - if not match_indices: - return None - - result_lines = [] - for idx in match_indices: - target_level = get_heading_level(lines[idx]) - - parent_titles = [] - current_level = target_level - for i in range(idx - 1, -1, -1): - line_level = get_heading_level(lines[i]) - if line_level > 0 and line_level < current_level: - parent_titles.append(lines[i]) - current_level = line_level - if current_level == 1: - break - - parent_titles.reverse() - result_lines.extend(parent_titles) - - result_lines.append(lines[idx]) - for i in range(idx + 1, len(lines)): - line = lines[i] - line_level = get_heading_level(line) - if line_level == 0 or line_level > target_level: - result_lines.append(line) - else: - break - - return "\n".join(result_lines) - - -def search_markdown( - content: str, pattern: str, context_lines: int = 0 -) -> Optional[str]: - """使用正则表达式搜索 markdown 文档,返回匹配结果及其上下文""" - try: - regex = re.compile(pattern) - except re.error: - return None - - lines = content.split("\n") - - non_empty_indices = [] - non_empty_to_original = {} - for i, line in enumerate(lines): - if line.strip(): - non_empty_indices.append(i) - non_empty_to_original[i] = len(non_empty_indices) - 1 - - matched_non_empty_indices = [] 
- for orig_idx in non_empty_indices: - if regex.search(lines[orig_idx]): - matched_non_empty_indices.append(non_empty_to_original[orig_idx]) - - if not matched_non_empty_indices: - return None - - merged_ranges = [] - current_start = matched_non_empty_indices[0] - current_end = matched_non_empty_indices[0] - - for idx in matched_non_empty_indices[1:]: - if idx - current_end <= context_lines * 2: - current_end = idx - else: - merged_ranges.append((current_start, current_end)) - current_start = idx - current_end = idx - merged_ranges.append((current_start, current_end)) - - results = [] - for start, end in merged_ranges: - context_start_idx = max(0, start - context_lines) - context_end_idx = min(len(non_empty_indices) - 1, end + context_lines) - - start_line_idx = non_empty_indices[context_start_idx] - end_line_idx = non_empty_indices[context_end_idx] - - selected_indices = set( - non_empty_indices[context_start_idx : context_end_idx + 1] - ) - result_lines = [ - line - for i, line in enumerate(lines) - if start_line_idx <= i <= end_line_idx - and (line.strip() or i in selected_indices) - ] - results.append("\n".join(result_lines)) - - return "\n---\n".join(results) - - -def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - return result.text_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" - - -def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - from docx import Document - except ImportError: - return None, "python-docx 库未安装" - - try: - doc = Document(file_path) - - def get_heading_level(para: Any) -> int: - if para.style and para.style.name: - style_name = para.style.name - if "Heading 1" in style_name or "Title" in style_name: - return 1 - 
elif "Heading 2" in style_name: - return 2 - elif "Heading 3" in style_name: - return 3 - elif "Heading 4" in style_name: - return 4 - elif "Heading 5" in style_name: - return 5 - elif "Heading 6" in style_name: - return 6 - return 0 - - def get_list_style(para: Any) -> Optional[str]: - if not para.style or not para.style.name: - return None - style_name = para.style.name - if style_name.startswith("List Bullet") or style_name == "Bullet": - return "bullet" - elif style_name.startswith("List Number") or style_name == "Number": - return "number" - return None - - def convert_runs_to_markdown(runs: List[Any]) -> str: - result = [] - for run in runs: - text = run.text - if not text: - continue - if run.bold: - text = f"**{text}**" - if run.italic: - text = f"*{text}*" - if run.underline: - text = f"{text}" - result.append(text) - return "".join(result) - - def convert_table_to_markdown(table: Any) -> str: - rows_data = [] - for row in table.rows: - row_data = [] - for cell in row.cells: - cell_text = cell.text.strip().replace("\n", " ") - row_data.append(cell_text) - rows_data.append(row_data) - return build_markdown_table(rows_data) - - markdown_lines = [] - prev_was_list = False - - for para in doc.paragraphs: - text = convert_runs_to_markdown(para.runs) - if not text.strip(): - continue - - heading_level = get_heading_level(para) - if heading_level > 0: - markdown_lines.append(f"{'#' * heading_level} {text}") - prev_was_list = False - else: - list_style = get_list_style(para) - if list_style == "bullet": - if not prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(f"- {text}") - prev_was_list = True - elif list_style == "number": - if not prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(f"1. 
{text}") - prev_was_list = True - else: - if prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(text) - markdown_lines.append("") - prev_was_list = False - - for table in doc.tables: - table_md = convert_table_to_markdown(table) - markdown_lines.append(table_md) - markdown_lines.append("") - - content = "\n".join(markdown_lines) - if not content.strip(): - return None, "文档为空" - return content, None - except Exception as e: - return None, f"python-docx 解析失败: {str(e)}" - - -def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: - word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" - namespaces = {"w": word_namespace} - - def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int: - return style_to_level.get(style_id, 0) - - def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]: - return style_to_list.get(style_id, None) - - def extract_text_with_formatting(para: Any, namespaces: dict) -> str: - texts = [] - for run in para.findall(".//w:r", namespaces=namespaces): - text_elem = run.find(".//w:t", namespaces=namespaces) - if text_elem is not None and text_elem.text: - text = text_elem.text - bold = run.find(".//w:b", namespaces=namespaces) is not None - italic = run.find(".//w:i", namespaces=namespaces) is not None - if bold: - text = f"**{text}**" - if italic: - text = f"*{text}*" - texts.append(text) - return "".join(texts).strip() - - def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str: - rows = table_elem.findall(".//w:tr", namespaces=namespaces) - if not rows: - return "" - md_lines = [] - for i, row in enumerate(rows): - cells = row.findall(".//w:tc", namespaces=namespaces) - cell_texts = [] - for cell in cells: - cell_text = extract_text_with_formatting(cell, namespaces) - cell_text = cell_text.replace("\n", " ").strip() - cell_texts.append(cell_text if cell_text else "") - if cell_texts: - md_line = "| " + " | 
".join(cell_texts) + " |" - md_lines.append(md_line) - if i == 0: - sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |" - md_lines.append(sep_line) - return "\n".join(md_lines) - - try: - style_to_level = {} - style_to_list = {} - markdown_lines = [] - - with zipfile.ZipFile(file_path) as zip_file: - try: - styles_file = safe_open_zip(zip_file, "word/styles.xml") - if styles_file: - styles_root = ET.parse(styles_file).getroot() - for style in styles_root.findall( - ".//w:style", namespaces=namespaces - ): - style_id = style.get(f"{{{word_namespace}}}styleId") - style_name_elem = style.find("w:name", namespaces=namespaces) - if style_id and style_name_elem is not None: - style_name = style_name_elem.get(f"{{{word_namespace}}}val") - if style_name: - if style_name == "Title": - style_to_level[style_id] = 1 - elif style_name == "heading 1": - style_to_level[style_id] = 1 - elif style_name == "heading 2": - style_to_level[style_id] = 2 - elif style_name == "heading 3": - style_to_level[style_id] = 3 - elif style_name == "heading 4": - style_to_level[style_id] = 4 - elif style_name == "heading 5": - style_to_level[style_id] = 5 - elif style_name == "heading 6": - style_to_level[style_id] = 6 - elif ( - style_name.startswith("List Bullet") - or style_name == "Bullet" - ): - style_to_list[style_id] = "bullet" - elif ( - style_name.startswith("List Number") - or style_name == "Number" - ): - style_to_list[style_id] = "number" - except Exception: - pass - - document_file = safe_open_zip(zip_file, "word/document.xml") - if not document_file: - return None, "document.xml 不存在或无法访问" - - root = ET.parse(document_file).getroot() - body = root.find(".//w:body", namespaces=namespaces) - if body is None: - return None, "document.xml 中未找到 w:body 元素" - - for child in body.findall("./*", namespaces=namespaces): - if child.tag.endswith("}p"): - style_elem = child.find(".//w:pStyle", namespaces=namespaces) - style_id = ( - style_elem.get(f"{{{word_namespace}}}val") - if 
style_elem is not None - else None - ) - - heading_level = get_heading_level(style_id, style_to_level) - list_style = get_list_style(style_id, style_to_list) - para_text = extract_text_with_formatting(child, namespaces) - - if para_text: - if heading_level > 0: - markdown_lines.append(f"{'#' * heading_level} {para_text}") - elif list_style == "bullet": - markdown_lines.append(f"- {para_text}") - elif list_style == "number": - markdown_lines.append(f"1. {para_text}") - else: - markdown_lines.append(para_text) - markdown_lines.append("") - - elif child.tag.endswith("}tbl"): - table_md = convert_table_to_markdown(child, namespaces) - if table_md: - markdown_lines.append(table_md) - markdown_lines.append("") - - content = "\n".join(markdown_lines) - if not content.strip(): - return None, "文档为空" - return content, None - except Exception as e: - return None, f"XML 解析失败: {str(e)}" - - -def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - - filtered_content = filter_markdown_content(result.text_content) - if not filtered_content.strip(): - return None, "过滤后文档为空" - - return filtered_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" - - -def extract_formatted_text_pptx(runs: List[Any]) -> str: - result = [] - for run in runs: - if not run.text: - continue - - text = run.text - - font = run.font - is_bold = getattr(font, "bold", False) or False - is_italic = getattr(font, "italic", False) or False - - if is_bold and is_italic: - text = f"***{text}***" - elif is_bold: - text = f"**{text}**" - elif is_italic: - text = f"*{text}*" - - result.append(text) - - return "".join(result).strip() - - -def convert_table_to_md_pptx(table: Any) -> str: - rows_data = [] - for row in table.rows: - row_data = [] - 
for cell in row.cells: - cell_content = [] - for para in cell.text_frame.paragraphs: - text = extract_formatted_text_pptx(para.runs) - if text: - cell_content.append(text) - cell_text = " ".join(cell_content).strip() - row_data.append(cell_text if cell_text else "") - rows_data.append(row_data) - return build_markdown_table(rows_data) - - -def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - from pptx import Presentation - from pptx.enum.shapes import MSO_SHAPE_TYPE - except ImportError: - return None, "python-pptx 库未安装" - - try: - prs = Presentation(file_path) - md_content = [] - - for slide_num, slide in enumerate(prs.slides, 1): - md_content.append(f"\n## Slide {slide_num}\n") - - list_stack = [] - - for shape in slide.shapes: - if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: - continue - - if hasattr(shape, "has_table") and shape.has_table: - if list_stack: - md_content.append( - "\n" + "\n".join([x for x in list_stack if x]) + "\n" - ) - list_stack.clear() - - table_md = convert_table_to_md_pptx(shape.table) - md_content.append(table_md) - - if hasattr(shape, "text_frame"): - for para in shape.text_frame.paragraphs: - pPr = para._element.pPr - is_list = False - if pPr is not None: - is_list = ( - para.level > 0 - or pPr.find( - ".//a:buChar", - namespaces={ - "a": "http://schemas.openxmlformats.org/drawingml/2006/main" - }, - ) - is not None - or pPr.find( - ".//a:buAutoNum", - namespaces={ - "a": "http://schemas.openxmlformats.org/drawingml/2006/main" - }, - ) - is not None - ) - - if is_list: - level = para.level - - while len(list_stack) <= level: - list_stack.append("") - - text = extract_formatted_text_pptx(para.runs) - if text: - pPr = para._element.pPr - is_ordered = ( - pPr is not None - and pPr.find( - ".//a:buAutoNum", - namespaces={ - "a": "http://schemas.openxmlformats.org/drawingml/2006/main" - }, - ) - is not None - ) - marker = "1. 
" if is_ordered else "- " - indent = " " * level - list_stack[level] = f"{indent}{marker}{text}" - - for i in range(len(list_stack)): - if list_stack[i]: - md_content.append(list_stack[i] + "\n") - list_stack[i] = "" - else: - if list_stack: - md_content.append( - "\n" - + "\n".join([x for x in list_stack if x]) - + "\n" - ) - list_stack.clear() - - text = extract_formatted_text_pptx(para.runs) - if text: - md_content.append(f"{text}\n") - - if list_stack: - md_content.append("\n" + "\n".join([x for x in list_stack if x]) + "\n") - list_stack.clear() - - md_content.append("---\n") - - content = "\n".join(md_content) - if not content.strip(): - return None, "文档为空" - return content, None - except Exception as e: - return None, f"python-pptx 解析失败: {str(e)}" - - -def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: - pptx_namespace = { - "a": "http://schemas.openxmlformats.org/drawingml/2006/main", - "p": "http://schemas.openxmlformats.org/presentationml/2006/main", - "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", - } - - def extract_text_with_formatting_xml(text_elem: Any, namespaces: dict) -> str: - result = [] - runs = text_elem.findall(".//a:r", namespaces=namespaces) - for run in runs: - t_elem = run.find(".//a:t", namespaces=namespaces) - if t_elem is None or not t_elem.text: - continue - - text = t_elem.text - - rPr = run.find(".//a:rPr", namespaces=namespaces) - is_bold = False - is_italic = False - - if rPr is not None: - is_bold = rPr.find(".//a:b", namespaces=namespaces) is not None - is_italic = rPr.find(".//a:i", namespaces=namespaces) is not None - - if is_bold and is_italic: - text = f"***{text}***" - elif is_bold: - text = f"**{text}**" - elif is_italic: - text = f"*{text}*" - - result.append(text) - - return "".join(result).strip() if result else "" - - def convert_table_to_md_xml(table_elem: Any, namespaces: dict) -> str: - rows = table_elem.findall(".//a:tr", namespaces=namespaces) - if not 
rows: - return "" - - rows_data = [] - for row in rows: - cells = row.findall(".//a:tc", namespaces=namespaces) - row_data = [] - for cell in cells: - cell_text = extract_text_with_formatting_xml(cell, namespaces) - if cell_text: - cell_text = cell_text.replace("\n", " ").replace("\r", " ") - row_data.append(cell_text if cell_text else "") - rows_data.append(row_data) - return build_markdown_table(rows_data) - - def is_list_item_xml(p_elem: Any, namespaces: dict) -> Tuple[bool, bool]: - if p_elem is None: - return False, False - - pPr = p_elem.find(".//a:pPr", namespaces=namespaces) - if pPr is None: - return False, False - - buChar = pPr.find(".//a:buChar", namespaces=namespaces) - if buChar is not None: - return True, False - - buAutoNum = pPr.find(".//a:buAutoNum", namespaces=namespaces) - if buAutoNum is not None: - return True, True - - return False, False - - def get_indent_level_xml(p_elem: Any, namespaces: dict) -> int: - if p_elem is None: - return 0 - - pPr = p_elem.find(".//a:pPr", namespaces=namespaces) - if pPr is None: - return 0 - - lvl = pPr.get("lvl") - return int(lvl) if lvl else 0 - - try: - md_content = [] - - with zipfile.ZipFile(file_path) as zip_file: - slide_files = [ - f - for f in zip_file.namelist() - if re.match(r"ppt/slides/slide\d+\.xml$", f) - ] - - for slide_idx, slide_file in enumerate(slide_files, 1): - md_content.append("\n## Slide {}\n".format(slide_idx)) - - with zip_file.open(slide_file) as slide_xml: - slide_root = ET.parse(slide_xml).getroot() - - tx_bodies = slide_root.findall( - ".//p:sp/p:txBody", namespaces=pptx_namespace - ) - - tables = slide_root.findall(".//a:tbl", namespaces=pptx_namespace) - for table in tables: - table_md = convert_table_to_md_xml(table, pptx_namespace) - if table_md: - md_content.append(table_md) - - for tx_body in tx_bodies: - paragraphs = tx_body.findall( - ".//a:p", namespaces=pptx_namespace - ) - list_stack = [] - - for para in paragraphs: - is_list, is_ordered = is_list_item_xml(para, 
pptx_namespace) - - if is_list: - level = get_indent_level_xml(para, pptx_namespace) - - while len(list_stack) <= level: - list_stack.append("") - - text = extract_text_with_formatting_xml( - para, pptx_namespace - ) - if text: - marker = "1. " if is_ordered else "- " - indent = " " * level - list_stack[level] = f"{indent}{marker}{text}" - - for i in range(len(list_stack)): - if list_stack[i]: - md_content.append(list_stack[i] + "\n") - list_stack[i] = "" - else: - if list_stack: - flush_list_stack(list_stack, md_content) - - text = extract_text_with_formatting_xml( - para, pptx_namespace - ) - if text: - md_content.append(f"{text}\n") - - if list_stack: - flush_list_stack(list_stack, md_content) - - md_content.append("---\n") - - content = "\n".join(md_content) - if not content.strip(): - return None, "文档为空" - return content, None - except Exception as e: - return None, f"XML 解析失败: {str(e)}" - - -def parse_xlsx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - from markitdown import MarkItDown - - md = MarkItDown() - result = md.convert(file_path) - if not result.text_content.strip(): - return None, "文档为空" - return result.text_content, None - except ImportError: - return None, "MarkItDown 库未安装" - except Exception as e: - return None, f"MarkItDown 解析失败: {str(e)}" - - -def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]]: - try: - import pandas as pd - from tabulate import tabulate - except ImportError as e: - missing_lib = "pandas" if "pandas" in str(e) else "tabulate" - return None, f"{missing_lib} 库未安装" - - try: - df = pd.read_excel(file_path) - - if len(df) == 0: - return None, "Excel 文件为空" - - markdown_content = tabulate( - df, headers="keys", tablefmt="pipe", showindex=True, missingval="" - ) - - markdown_with_header = ( - f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}" - ) - - return markdown_with_header, None - except Exception as e: - return None, f"pandas 解析失败: {str(e)}" - - -def 
parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: - xlsx_namespace = { - "main": "http://schemas.openxmlformats.org/spreadsheetml/2006/main" - } - - def parse_col_index(cell_ref: str) -> int: - col_index = 0 - for char in cell_ref: - if char.isalpha(): - col_index = col_index * 26 + (ord(char) - ord("A") + 1) - else: - break - return col_index - 1 - - def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str: - cell_type = cell.attrib.get("t") - cell_value_elem = cell.find("main:v", xlsx_namespace) - - if cell_value_elem is not None and cell_value_elem.text: - cell_value = cell_value_elem.text - - if cell_type == "s": - try: - idx = int(cell_value) - if 0 <= idx < len(shared_strings): - text = shared_strings[idx] - return text.replace("\n", " ").replace("\r", "") - except (ValueError, IndexError): - pass - return "" - elif cell_type == "b": - return "TRUE" if cell_value == "1" else "FALSE" - elif cell_type == "str": - return cell_value.replace("\n", " ").replace("\r", "") - elif cell_type == "inlineStr": - is_elem = cell.find("main:is", xlsx_namespace) - if is_elem is not None: - t_elem = is_elem.find("main:t", xlsx_namespace) - if t_elem is not None and t_elem.text: - return t_elem.text.replace("\n", " ").replace("\r", "") - return "" - elif cell_type == "e": - error_codes = { - "#NULL!": "空引用错误", - "#DIV/0!": "除零错误", - "#VALUE!": "值类型错误", - "#REF!": "无效引用", - "#NAME?": "名称错误", - "#NUM!": "数值错误", - "#N/A": "值不可用", - } - return error_codes.get(cell_value, f"错误: {cell_value}") - elif cell_type == "d": - return f"[日期] {cell_value}" - elif cell_type == "n": - return cell_value - elif cell_type is None: - try: - float_val = float(cell_value) - if float_val.is_integer(): - return str(int(float_val)) - return cell_value - except ValueError: - return cell_value - else: - return cell_value - else: - return "" - - def get_non_empty_columns(data: List[List[str]]) -> set: - non_empty_cols = set() - for row in data: - for col_idx, cell 
in enumerate(row): - if cell and cell.strip(): - non_empty_cols.add(col_idx) - return non_empty_cols - - def filter_columns(row: List[str], non_empty_cols: set) -> List[str]: - return [row[i] if i < len(row) else "" for i in sorted(non_empty_cols)] - - def data_to_markdown(data: List[List[str]], sheet_name: str) -> str: - if not data or not data[0]: - return f"## {sheet_name}\n\n*工作表为空*" - - md_lines = [] - md_lines.append(f"## {sheet_name}") - md_lines.append("") - - headers = data[0] - - non_empty_cols = get_non_empty_columns(data) - - if not non_empty_cols: - return f"## {sheet_name}\n\n*工作表为空*" - - filtered_headers = filter_columns(headers, non_empty_cols) - header_line = "| " + " | ".join(filtered_headers) + " |" - md_lines.append(header_line) - - separator_line = "|" + "|".join(["---"] * len(filtered_headers)) + "|" - md_lines.append(separator_line) - - for row in data[1:]: - filtered_row = filter_columns(row, non_empty_cols) - row_line = "| " + " | ".join(filtered_row) + " |" - md_lines.append(row_line) - - md_lines.append("") - - return "\n".join(md_lines) - - try: - with zipfile.ZipFile(file_path, "r") as zip_file: - sheet_names = [] - try: - with zip_file.open("xl/workbook.xml") as f: - root = ET.parse(f).getroot() - sheet_elements = root.findall(".//main:sheet", xlsx_namespace) - for sheet in sheet_elements: - sheet_name = sheet.attrib.get("name", "") - if sheet_name: - sheet_names.append(sheet_name) - except KeyError: - return None, "无法解析工作表名称" - - if not sheet_names: - return None, "未找到工作表" - - shared_strings = [] - try: - with zip_file.open("xl/sharedStrings.xml") as f: - root = ET.parse(f).getroot() - for si in root.findall(".//main:si", xlsx_namespace): - t_elem = si.find(".//main:t", xlsx_namespace) - if t_elem is not None and t_elem.text: - shared_strings.append(t_elem.text) - else: - shared_strings.append("") - except KeyError: - pass - - markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n" - markdown_content += f"来源: {file_path}\n\n" - - for 
sheet_index, sheet_name in enumerate(sheet_names, start=1): - try: - worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml" - with zip_file.open(worksheet_path) as f: - root = ET.parse(f).getroot() - sheet_data = root.find("main:sheetData", xlsx_namespace) - - rows = [] - if sheet_data is not None: - row_elements = sheet_data.findall( - "main:row", xlsx_namespace - ) - - for row_elem in row_elements: - cells = row_elem.findall("main:c", xlsx_namespace) - - col_dict = {} - for cell in cells: - cell_ref = cell.attrib.get("r", "") - if not cell_ref: - continue - - col_index = parse_col_index(cell_ref) - cell_value = parse_cell_value(cell, shared_strings) - col_dict[col_index] = cell_value - - if col_dict: - max_col = max(col_dict.keys()) - row_data = [ - col_dict.get(i, "") for i in range(max_col + 1) - ] - rows.append(row_data) - - table_md = data_to_markdown(rows, sheet_name) - markdown_content += table_md + "\n\n" - - except KeyError: - markdown_content += f"## {sheet_name}\n\n*工作表解析失败*\n\n" - - if not markdown_content.strip(): - return None, "解析结果为空" - - return markdown_content, None - except Exception as e: - return None, f"XML 解析失败: {str(e)}" - - -def detect_file_type(file_path: str) -> Optional[str]: - """检测文件类型,返回 'docx'、'pptx' 或 'xlsx'""" - _, ext = os.path.splitext(file_path) - ext = ext.lower() - - if ext == ".docx": - if is_valid_docx(file_path): - return "docx" - elif ext == ".pptx": - if is_valid_pptx(file_path): - return "pptx" - elif ext == ".xlsx": - if is_valid_xlsx(file_path): - return "xlsx" - - return None - - -def main() -> None: - parser = argparse.ArgumentParser( - description="将 DOCX、PPTX 或 XLSX 文件解析为 Markdown" - ) - - parser.add_argument("file_path", help="DOCX、PPTX 或 XLSX 文件的绝对路径") - - parser.add_argument( - "-n", - "--context", - type=int, - default=2, - help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)", - ) - - group = parser.add_mutually_exclusive_group() - group.add_argument( - "-c", "--count", action="store_true", help="返回解析后的 markdown 文档的总字数" 
- ) - group.add_argument( - "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数" - ) - group.add_argument( - "-t", - "--titles", - action="store_true", - help="返回解析后的 markdown 文档的标题行(1-6级)", - ) - group.add_argument( - "-tc", - "--title-content", - help="指定标题名称,输出该标题及其下级内容(不包含#号)", - ) - group.add_argument( - "-s", - "--search", - help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)", - ) - - args = parser.parse_args() - - if not os.path.exists(args.file_path): - print(f"错误: 文件不存在: {args.file_path}") - sys.exit(1) - - file_type = detect_file_type(args.file_path) - if file_type is None: - print(f"错误: 文件不是有效的 DOCX、PPTX 或 XLSX 格式: {args.file_path}") - sys.exit(1) - - if file_type == "docx": - parsers = [ - ("MarkItDown", parse_docx_with_markitdown), - ("python-docx", parse_docx_with_python_docx), - ("XML 原生解析", parse_docx_with_xml), - ] - elif file_type == "pptx": - parsers = [ - ("MarkItDown", parse_pptx_with_markitdown), - ("python-pptx", parse_pptx_with_python_pptx), - ("XML 原生解析", parse_pptx_with_xml), - ] - else: - parsers = [ - ("MarkItDown", parse_xlsx_with_markitdown), - ("pandas", parse_xlsx_with_pandas), - ("XML 原生解析", parse_xlsx_with_xml), - ] - - failures = [] - content = None - - for parser_name, parser_func in parsers: - content, error = parser_func(args.file_path) - if content is not None: - content = remove_markdown_images(content) - content = normalize_markdown_whitespace(content) - break - else: - failures.append(f"- {parser_name}: {error}") - - if content is None: - print("所有解析方法均失败:") - for failure in failures: - print(failure) - sys.exit(1) - - if args.count: - print(len(content.replace("\n", ""))) - elif args.lines: - print(len(content.split("\n"))) - elif args.titles: - titles = extract_titles(content) - for title in titles: - print(title) - elif args.title_content: - title_content = extract_title_content(content, args.title_content) - if title_content is None: - print(f"错误: 未找到标题 '{args.title_content}'") - sys.exit(1) - print(title_content, 
end="") - elif args.search: - search_result = search_markdown(content, args.search, args.context) - if search_result is None: - print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'") - sys.exit(1) - print(search_result, end="") - else: - print(content, end="") - - -if __name__ == "__main__": - main() diff --git a/temp/document_parser.md b/temp/scripts/README.md similarity index 58% rename from temp/document_parser.md rename to temp/scripts/README.md index 956fb04..12ce746 100644 --- a/temp/document_parser.md +++ b/temp/scripts/README.md @@ -1,24 +1,37 @@ # Document Parser 使用说明 -一个整合的文档解析器,支持将 DOCX 和 PPTX 文件转换为 Markdown 格式。 +一个模块化的文档解析器,支持将 DOCX、PPTX 和 XLSX 文件转换为 Markdown 格式。 ## 概述 -该脚本按优先级尝试多种解析方法,确保最大兼容性: +该解析器按优先级尝试多种解析方法,确保最大兼容性: 1. **MarkItDown** (微软官方库) - 推荐使用,格式最规范 -2. **python-docx / python-pptx** (成熟的 Python 库) - 输出最详细 +2. **python-docx / python-pptx / pandas** (成熟的 Python 库) - 输出最详细 3. **XML 原生解析** (备选方案) - 无需依赖 ### 特性 -- 支持 DOCX 和 PPTX 格式 +- 支持 DOCX、PPTX 和 XLSX 格式 - 自动检测文件类型和有效性 - 保留文本格式(粗体、斜体、下划线) - 提取表格并转换为 Markdown 格式 - 提取列表并保留层级结构 - 多种输出模式(字数、行数、标题、搜索等) - 内容过滤和规范化 +- 模块化设计,易于维护和扩展 + +## 文件结构 + +``` +scripts/ +├── common.py # 公共函数和常量 +├── docx.py # DOCX 文件解析 +├── pptx.py # PPTX 文件解析 +├── xlsx.py # XLSX 文件解析 +├── parser.py # 命令行入口 +└── README.md # 本文档 +``` ## 依赖要求 @@ -26,38 +39,40 @@ ```bash # Python 3.6+ -python document_parser.py file.docx +uv run parser.py file.docx ``` ### 使用 MarkItDown ```bash # 使用 uv 自动安装 -uv run --with markitdown[docx] document_parser.py file.docx -uv run --with markitdown[pptx] document_parser.py file.pptx +uv run --with markitdown parser.py file.docx +uv run --with markitdown parser.py file.pptx +uv run --with markitdown parser.py file.xlsx # 或手动安装 -pip install markitdown[docx] -pip install markitdown[pptx] +pip install markitdown ``` -### 使用 python-docx / python-pptx +### 使用专用库 ```bash # 使用 uv 自动安装 -uv run --with python-docx document_parser.py file.docx -uv run --with python-pptx document_parser.py file.pptx +uv run --with python-docx 
parser.py file.docx +uv run --with python-pptx parser.py file.pptx +uv run --with pandas --with tabulate parser.py file.xlsx # 或手动安装 pip install python-docx pip install python-pptx +pip install pandas tabulate ``` ### 所有依赖 ```bash # 安装所有解析库 -uv run --with "markitdown[docx]" --with python-docx --with python-pptx document_parser.py file.docx +uv run --with markitdown --with python-docx --with python-pptx --with pandas --with tabulate parser.py file.docx ``` ## 命令行用法 @@ -65,12 +80,12 @@ uv run --with "markitdown[docx]" --with python-docx --with python-pptx document_ ### 基本语法 ```bash -python document_parser.py [options] +uv run parser.py [options] ``` ### 必需参数 -- `file_path`: DOCX 或 PPTX 文件的路径(相对或绝对路径) +- `file_path`: DOCX、PPTX 或 XLSX 文件的路径(相对或绝对路径) ### 可选参数(互斥组,一次只能使用一个) @@ -92,27 +107,30 @@ python document_parser.py [options] ```bash # 使用最佳可用解析器 -python document_parser.py report.docx +uv run parser.py report.docx # 输出到文件 -python document_parser.py report.docx > output.md +uv run parser.py report.docx > output.md + +# 使用特定依赖 +uv run --with python-docx parser.py report.docx > output.md ``` ### 2. 统计文档信息 ```bash # 统计字数 -python document_parser.py report.docx -c +uv run --with markitdown parser.py report.docx -c # 统计行数 -python document_parser.py report.docx -l +uv run --with markitdown parser.py report.docx -l ``` ### 3. 
提取标题 ```bash # 提取所有标题 -python document_parser.py report.docx -t +uv run --with python-docx parser.py report.docx -t # 输出示例: # 第一章 概述 @@ -125,7 +143,7 @@ python document_parser.py report.docx -t ```bash # 提取特定章节 -python document_parser.py report.docx -tc "第一章" +uv run --with python-docx parser.py report.docx -tc "第一章" # 输出该标题及其所有子内容 ``` @@ -134,13 +152,13 @@ python document_parser.py report.docx -tc "第一章" ```bash # 搜索关键词 -python document_parser.py report.docx -s "测试" +uv run --with markitdown parser.py report.docx -s "测试" # 使用正则表达式 -python document_parser.py report.docx -s "章节\s+\d+" +uv run --with markitdown parser.py report.docx -s "章节\s+\d+" # 带上下文搜索(前后各2行) -python document_parser.py report.docx -s "重要内容" -n 2 +uv run --with markitdown parser.py report.docx -s "重要内容" -n 2 # 输出示例: --- @@ -168,6 +186,14 @@ python document_parser.py report.docx -s "重要内容" -n 2 | **python-pptx** | • 输出最详细
• 保留完整结构
• 支持层级列表 | • 需要安装
• 依赖私有 API | • 需要完整内容
• 分析演示结构 | | **XML 原生** | • 无需依赖
• 结构化输出
• 运行速度快 | • 格式可能不统一
• 幻灯片分组简单 | • 依赖不可用时
• 结构化提取 | +### XLSX 解析器 + +| 解析器 | 优点 | 缺点 | 适用场景 | +|---------|------|--------|---------| +| **MarkItDown** | • 格式规范
• 支持多工作表
• 输出简洁 | • 需要安装
• 详细度较低 | • 快速预览表格
• 提取主要内容 | +| **pandas** | • 功能强大
• 支持复杂表格
• 数据处理灵活 | • 需要安装
• 依赖较多 | • 数据分析
• 复杂表格处理 | +| **XML 原生** | • 无需依赖
• 运行速度快
• 支持所有单元格类型 | • 格式可能不统一
• 无数据处理能力 | • 依赖不可用时
• 快速提取内容 | + ## 输出格式 ### Markdown 输出结构 @@ -192,6 +218,36 @@ python document_parser.py report.docx -s "重要内容" -n 2 **粗体文本** *斜体文本* 下划线文本 ``` +### PPTX 特有格式 + +```markdown +## Slide 1 + +幻灯片 1 的内容 + +## Slide 2 + +表格内容 + +幻灯片 2 的内容 + +--- +``` + +### XLSX 特有格式 + +```markdown +# Excel数据转换结果 + +来源: /path/to/file.xlsx + +## Sheet1 + +| 列1 | 列2 | 列3 | +|------|------|------| +| 数据1 | 数据2 | 数据3 | +``` + ### 标题格式 - 标题使用 Markdown 井号语法:`#` 到 `######`(1-6级) @@ -241,7 +297,7 @@ python document_parser.py report.docx -s "重要内容" -n 2 错误: 文件不存在: missing.docx # 无效格式 -错误: 文件不是有效的 DOCX 或 PPTX 格式: invalid.txt +错误: 不是有效的 DOCX、PPTX 或 XLSX 格式: invalid.txt ``` ### 解析器回退 @@ -271,10 +327,10 @@ python document_parser.py report.docx -s "重要内容" -n 2 ```bash # 自动安装依赖并运行 -uv run --with "markitdown[docx]" --with python-docx document_parser.py report.docx +uv run --with markitdown --with python-docx parser.py report.docx # 输出到文件 -uv run --with python-docx document_parser.py report.docx > output.md +uv run --with python-docx parser.py report.docx > output.md ``` ### 批量处理 @@ -282,12 +338,12 @@ uv run --with python-docx document_parser.py report.docx > output.md ```bash # 使用 find 或 glob 批量处理 for file in *.docx; do - python document_parser.py "$file" > "${file%.docx}.md" + uv run --with markitdown parser.py "$file" > "${file%.docx}.md" done # Windows PowerShell Get-ChildItem *.docx | ForEach-Object { - python document_parser.py $_.FullName > ($_.BaseName + ".md") + uv run --with markitdown parser.py $_.FullName > ($_.BaseName + ".md") } ``` @@ -295,10 +351,10 @@ Get-ChildItem *.docx | ForEach-Object { ```bash # 进一步处理 Markdown 输出 -python document_parser.py report.docx | grep "重要" > important.md +uv run --with markitdown parser.py report.docx | grep "重要" > important.md # 统计处理 -python document_parser.py report.docx -l | awk '{print $1}' +uv run --with markitdown parser.py report.docx -l | awk '{print $1}' ``` ## 常见问题 @@ -307,10 +363,11 @@ python document_parser.py report.docx -l | awk '{print $1}' A: 
不同解析器的输出详细度不同: -- `python-docx` 输出最详细 +- `python-docx` / `python-pptx` 输出最详细 - `MarkItDown` 输出较简洁 - `XML 原生` 输出原始内容 -如需完整内容,尝试使用 `python-docx` 解析器。 + +如需完整内容,尝试使用专用库解析器。 ### Q: 表格格式不正确? @@ -338,13 +395,44 @@ A: 大文件建议使用 XML 原生解析(最快),或在脚本外部处理 ## 性能参考 -基于测试文件(约 10KB DOCX)的参考数据: +基于测试文件的参考数据: + +### DOCX (test.docx) | 解析器 | 字符数 | 行数 | 相对速度 | |---------|--------|------|---------| -| MarkItDown | ~6,000 | ~110 | 快 | -| python-docx | ~7,500 | ~120 | 中 | -| XML 原生 | ~8,600 | ~120 | 快 | +| MarkItDown | ~8,500 | ~123 | 快 | +| python-docx | ~8,500 | ~123 | 中 | +| XML 原生 | ~8,500 | ~123 | 快 | + +### PPTX (test.pptx) + +| 解析器 | 字符数 | 行数 | 相对速度 | +|---------|--------|------|---------| +| MarkItDown | ~2,500 | ~257 | 快 | +| python-pptx | ~2,500 | ~257 | 中 | +| XML 原生 | ~2,500 | ~257 | 快 | + +### XLSX (test.xlsx) + +| 解析器 | 字符数 | 行数 | 相对速度 | +|---------|--------|------|---------| +| MarkItDown | ~6,000 | ~109 | 快 | +| pandas | ~6,000 | ~109 | 中 | +| XML 原生 | ~6,000 | ~109 | 快 | + +## 代码风格 + +脚本遵循以下代码风格: + +- Python 3.6+ 兼容 +- 遵循 PEP 8 规范 +- 所有公共 API 函数添加类型提示 +- 字符串优先内联使用,不提取为常量,除非被使用超过3次 +- 其他被多次使用的对象根据具体情况可考虑被提取为常量(如正则表达式) +- 模块级和公共 API 函数保留文档字符串 +- 内部辅助函数不添加文档字符串(函数名足够描述) +- 变量命名清晰,避免单字母变量名 ## 许可证 @@ -352,10 +440,11 @@ A: 大文件建议使用 XML 原生解析(最快),或在脚本外部处理 ## 更新日志 -### 最新更新 +### 最新版本 -- 修复 XML 解析的 `.getroot()` 调用问题 -- 优化样式匹配逻辑,使用精确匹配代替 `in` 操作 -- 增强 `safe_open_zip` 安全检查 -- 提取重复代码为通用函数 -- 添加完整类型注解 +- 将单体脚本拆分为模块化结构(common.py, docx.py, pptx.py, xlsx.py, parser.py) +- 添加 XLSX 文件支持 +- 增强错误处理(文件存在性检查、无效格式检测) +- 完善文档和示例 +- 使用 uv 进行依赖管理和运行 +- 所有模块通过语法检查和功能测试 diff --git a/temp/scripts/common.py b/temp/scripts/common.py new file mode 100644 index 0000000..1ef9df9 --- /dev/null +++ b/temp/scripts/common.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +"""文档解析器的公共模块,包含所有格式共享的工具函数和验证函数。""" + +import os +import re +import zipfile +from typing import List, Optional, Tuple + +IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\([^)]+\)") +MEDIA_LINK_PATTERN = re.compile( + 
r'^\[.*?\]\(.*\.(png|jpg|jpeg|gif|mp4|avi|mov|pdf)\s*["\']?.*?["\']?\)$' +) +RGB_COLOR_PATTERN = re.compile(r"^R:\d+\s+G:\d+\s+B:\d+$") + + +def build_markdown_table(rows_data: List[List[str]]) -> str: + """将二维列表转换为 Markdown 表格格式""" + if not rows_data or not rows_data[0]: + return "" + + md_lines = [] + for i, row_data in enumerate(rows_data): + row_text = [cell if cell else "" for cell in row_data] + md_lines.append("| " + " | ".join(row_text) + " |") + if i == 0: + md_lines.append("|" + " | ".join(["---"] * len(row_text)) + " |") + return "\n".join(md_lines) + "\n\n" + + +def flush_list_stack(list_stack: List[str], target: List[str]) -> None: + """将列表堆栈中的非空项添加到目标列表并清空堆栈""" + for item in list_stack: + if item: + target.append(item + "\n") + list_stack.clear() + + +def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipExtFile]: + """安全地从 ZipFile 中打开文件,防止路径遍历攻击""" + if not name: + return None + if name.startswith("/") or name.startswith("\\"): + return None + if name.startswith(".."): + return None + if "/../" in name or name.endswith("/.."): + return None + if "\\" in name: + return None + if "/" not in name: + return None + return zip_file.open(name) + + +def normalize_markdown_whitespace(content: str) -> str: + """规范化 Markdown 空白字符,保留单行空行""" + lines = content.split("\n") + result = [] + empty_count = 0 + + for line in lines: + stripped = line.strip() + if not stripped: + empty_count += 1 + if empty_count == 1: + result.append(line) + else: + empty_count = 0 + result.append(line) + + return "\n".join(result) + + +def is_valid_docx(file_path: str) -> bool: + """验证文件是否为有效的 DOCX 格式""" + try: + with zipfile.ZipFile(file_path, "r") as zip_file: + required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"] + for required in required_files: + if required not in zip_file.namelist(): + return False + return True + except (zipfile.BadZipFile, zipfile.LargeZipFile): + return False + + +def is_valid_pptx(file_path: str) -> bool: + 
"""验证文件是否为有效的 PPTX 格式""" + try: + with zipfile.ZipFile(file_path, "r") as zip_file: + required_files = [ + "[Content_Types].xml", + "_rels/.rels", + "ppt/presentation.xml", + ] + for required in required_files: + if required not in zip_file.namelist(): + return False + return True + except (zipfile.BadZipFile, zipfile.LargeZipFile): + return False + + +def is_valid_xlsx(file_path: str) -> bool: + """验证文件是否为有效的 XLSX 格式""" + try: + with zipfile.ZipFile(file_path, "r") as zip_file: + required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"] + for required in required_files: + if required not in zip_file.namelist(): + return False + return True + except (zipfile.BadZipFile, zipfile.LargeZipFile): + return False + + +def remove_markdown_images(markdown_text: str) -> str: + """移除 Markdown 文本中的图片标记""" + return IMAGE_PATTERN.sub("", markdown_text) + + +def filter_markdown_content(content: str) -> str: + """过滤 markdown 内容,保留文本、表格、列表和基本格式""" + lines = content.split("\n") + filtered_lines = [] + + for line in lines: + stripped = line.strip() + + if not stripped: + continue + + if stripped.startswith(""): + continue + + if stripped.startswith("![") or stripped.startswith("![]"): + continue + + if "" in stripped: + continue + + if MEDIA_LINK_PATTERN.match(stripped): + continue + + if RGB_COLOR_PATTERN.match(stripped): + continue + + line = re.sub(r']*style="[^"]*"[^>]*>(.*?)', r"\1", line) + line = re.sub(r"]*>(.*?)", r"\1", line) + + line = re.sub(r"\s+", " ", line).strip() + + if line: + filtered_lines.append(line) + + return "\n".join(filtered_lines) + + +def get_heading_level(line: str) -> int: + """获取 Markdown 行的标题级别(1-6),非标题返回 0""" + stripped = line.lstrip() + if not stripped.startswith("#"): + return 0 + level = 0 + for char in stripped: + if char == "#": + level += 1 + else: + break + return level if 1 <= level <= 6 else 0 + + +def extract_titles(markdown_text: str) -> List[str]: + """提取 markdown 文本中的所有标题行(1-6级)""" + title_lines = [] + for line in 
markdown_text.split("\n"): + if get_heading_level(line) > 0: + title_lines.append(line.lstrip()) + return title_lines + + +def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]: + """提取所有指定标题及其下级内容(每个包含上级标题)""" + lines = markdown_text.split("\n") + match_indices = [] + + for i, line in enumerate(lines): + level = get_heading_level(line) + if level > 0: + stripped = line.lstrip() + title_text = stripped[level:].strip() + if title_text == title_name: + match_indices.append(i) + + if not match_indices: + return None + + result_lines = [] + for idx in match_indices: + target_level = get_heading_level(lines[idx]) + + parent_titles = [] + current_level = target_level + for i in range(idx - 1, -1, -1): + line_level = get_heading_level(lines[i]) + if line_level > 0 and line_level < current_level: + parent_titles.append(lines[i]) + current_level = line_level + if current_level == 1: + break + + parent_titles.reverse() + result_lines.extend(parent_titles) + + result_lines.append(lines[idx]) + for i in range(idx + 1, len(lines)): + line = lines[i] + line_level = get_heading_level(line) + if line_level == 0 or line_level > target_level: + result_lines.append(line) + else: + break + + return "\n".join(result_lines) + + +def search_markdown( + content: str, pattern: str, context_lines: int = 0 +) -> Optional[str]: + """使用正则表达式搜索 markdown 文档,返回匹配结果及其上下文""" + try: + regex = re.compile(pattern) + except re.error: + return None + + lines = content.split("\n") + + non_empty_indices = [] + non_empty_to_original = {} + for i, line in enumerate(lines): + if line.strip(): + non_empty_indices.append(i) + non_empty_to_original[i] = len(non_empty_indices) - 1 + + matched_non_empty_indices = [] + for orig_idx in non_empty_indices: + if regex.search(lines[orig_idx]): + matched_non_empty_indices.append(non_empty_to_original[orig_idx]) + + if not matched_non_empty_indices: + return None + + merged_ranges = [] + current_start = matched_non_empty_indices[0] + current_end 
= matched_non_empty_indices[0] + + for idx in matched_non_empty_indices[1:]: + if idx - current_end <= context_lines * 2: + current_end = idx + else: + merged_ranges.append((current_start, current_end)) + current_start = idx + current_end = idx + merged_ranges.append((current_start, current_end)) + + results = [] + for start, end in merged_ranges: + context_start_idx = max(0, start - context_lines) + context_end_idx = min(len(non_empty_indices) - 1, end + context_lines) + + start_line_idx = non_empty_indices[context_start_idx] + end_line_idx = non_empty_indices[context_end_idx] + + selected_indices = set( + non_empty_indices[context_start_idx : context_end_idx + 1] + ) + result_lines = [ + line + for i, line in enumerate(lines) + if start_line_idx <= i <= end_line_idx + and (line.strip() or i in selected_indices) + ] + results.append("\n".join(result_lines)) + + return "\n---\n".join(results) + + +def detect_file_type(file_path: str) -> Optional[str]: + """检测文件类型,返回 'docx'、'pptx' 或 'xlsx'""" + _, ext = os.path.splitext(file_path) + ext = ext.lower() + + if ext == ".docx": + if is_valid_docx(file_path): + return "docx" + elif ext == ".pptx": + if is_valid_pptx(file_path): + return "pptx" + elif ext == ".xlsx": + if is_valid_xlsx(file_path): + return "xlsx" + + return None diff --git a/temp/scripts/docx.py b/temp/scripts/docx.py new file mode 100644 index 0000000..cf69402 --- /dev/null +++ b/temp/scripts/docx.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +"""DOCX 文件解析模块,提供三种解析方法。""" + +import xml.etree.ElementTree as ET +import zipfile +from typing import Any, List, Optional, Tuple + +from common import build_markdown_table, safe_open_zip + + +def parse_docx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 MarkItDown 库解析 DOCX 文件""" + try: + from markitdown import MarkItDown + + md = MarkItDown() + result = md.convert(file_path) + if not result.text_content.strip(): + return None, "文档为空" + return result.text_content, None + except 
ImportError: + return None, "MarkItDown 库未安装" + except Exception as e: + return None, f"MarkItDown 解析失败: {str(e)}" + + +def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 python-docx 库解析 DOCX 文件""" + try: + from docx import Document + except ImportError: + return None, "python-docx 库未安装" + + try: + doc = Document(file_path) + + def get_heading_level(para: Any) -> int: + if para.style and para.style.name: + style_name = para.style.name + if "Heading 1" in style_name or "Title" in style_name: + return 1 + elif "Heading 2" in style_name: + return 2 + elif "Heading 3" in style_name: + return 3 + elif "Heading 4" in style_name: + return 4 + elif "Heading 5" in style_name: + return 5 + elif "Heading 6" in style_name: + return 6 + return 0 + + def get_list_style(para: Any) -> Optional[str]: + if not para.style or not para.style.name: + return None + style_name = para.style.name + if style_name.startswith("List Bullet") or style_name == "Bullet": + return "bullet" + elif style_name.startswith("List Number") or style_name == "Number": + return "number" + return None + + def convert_runs_to_markdown(runs: List[Any]) -> str: + result = [] + for run in runs: + text = run.text + if not text: + continue + if run.bold: + text = f"**{text}**" + if run.italic: + text = f"*{text}*" + if run.underline: + text = f"{text}" + result.append(text) + return "".join(result) + + def convert_table_to_markdown(table: Any) -> str: + rows_data = [] + for row in table.rows: + row_data = [] + for cell in row.cells: + cell_text = cell.text.strip().replace("\n", " ") + row_data.append(cell_text) + rows_data.append(row_data) + return build_markdown_table(rows_data) + + markdown_lines = [] + prev_was_list = False + + for para in doc.paragraphs: + text = convert_runs_to_markdown(para.runs) + if not text.strip(): + continue + + heading_level = get_heading_level(para) + if heading_level > 0: + markdown_lines.append(f"{'#' * heading_level} {text}") + 
prev_was_list = False + else: + list_style = get_list_style(para) + if list_style == "bullet": + if not prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(f"- {text}") + prev_was_list = True + elif list_style == "number": + if not prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(f"1. {text}") + prev_was_list = True + else: + if prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(text) + markdown_lines.append("") + prev_was_list = False + + for table in doc.tables: + table_md = convert_table_to_markdown(table) + markdown_lines.append(table_md) + markdown_lines.append("") + + content = "\n".join(markdown_lines) + if not content.strip(): + return None, "文档为空" + return content, None + except Exception as e: + return None, f"python-docx 解析失败: {str(e)}" + + +def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 XML 原生解析 DOCX 文件""" + word_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + namespaces = {"w": word_namespace} + + def get_heading_level(style_id: Optional[str], style_to_level: dict) -> int: + return style_to_level.get(style_id, 0) + + def get_list_style(style_id: Optional[str], style_to_list: dict) -> Optional[str]: + return style_to_list.get(style_id, None) + + def extract_text_with_formatting(para: Any, namespaces: dict) -> str: + texts = [] + for run in para.findall(".//w:r", namespaces=namespaces): + text_elem = run.find(".//w:t", namespaces=namespaces) + if text_elem is not None and text_elem.text: + text = text_elem.text + bold = run.find(".//w:b", namespaces=namespaces) is not None + italic = run.find(".//w:i", namespaces=namespaces) is not None + if bold: + text = f"**{text}**" + if italic: + text = f"*{text}*" + texts.append(text) + return "".join(texts).strip() + + def convert_table_to_markdown(table_elem: Any, namespaces: dict) -> str: + rows = table_elem.findall(".//w:tr", 
namespaces=namespaces) + if not rows: + return "" + md_lines = [] + for i, row in enumerate(rows): + cells = row.findall(".//w:tc", namespaces=namespaces) + cell_texts = [] + for cell in cells: + cell_text = extract_text_with_formatting(cell, namespaces) + cell_text = cell_text.replace("\n", " ").strip() + cell_texts.append(cell_text if cell_text else "") + if cell_texts: + md_line = "| " + " | ".join(cell_texts) + " |" + md_lines.append(md_line) + if i == 0: + sep_line = "| " + " | ".join(["---"] * len(cell_texts)) + " |" + md_lines.append(sep_line) + return "\n".join(md_lines) + + try: + style_to_level = {} + style_to_list = {} + markdown_lines = [] + + with zipfile.ZipFile(file_path) as zip_file: + try: + styles_file = safe_open_zip(zip_file, "word/styles.xml") + if styles_file: + styles_root = ET.parse(styles_file).getroot() + for style in styles_root.findall( + ".//w:style", namespaces=namespaces + ): + style_id = style.get(f"{{{word_namespace}}}styleId") + style_name_elem = style.find("w:name", namespaces=namespaces) + if style_id and style_name_elem is not None: + style_name = style_name_elem.get(f"{{{word_namespace}}}val") + if style_name: + if style_name == "Title": + style_to_level[style_id] = 1 + elif style_name == "heading 1": + style_to_level[style_id] = 1 + elif style_name == "heading 2": + style_to_level[style_id] = 2 + elif style_name == "heading 3": + style_to_level[style_id] = 3 + elif style_name == "heading 4": + style_to_level[style_id] = 4 + elif style_name == "heading 5": + style_to_level[style_id] = 5 + elif style_name == "heading 6": + style_to_level[style_id] = 6 + elif ( + style_name.startswith("List Bullet") + or style_name == "Bullet" + ): + style_to_list[style_id] = "bullet" + elif ( + style_name.startswith("List Number") + or style_name == "Number" + ): + style_to_list[style_id] = "number" + except Exception: + pass + + document_file = safe_open_zip(zip_file, "word/document.xml") + if not document_file: + return None, "document.xml 
不存在或无法访问" + + root = ET.parse(document_file).getroot() + body = root.find(".//w:body", namespaces=namespaces) + if body is None: + return None, "document.xml 中未找到 w:body 元素" + + for child in body.findall("./*", namespaces=namespaces): + if child.tag.endswith("}p"): + style_elem = child.find(".//w:pStyle", namespaces=namespaces) + style_id = ( + style_elem.get(f"{{{word_namespace}}}val") + if style_elem is not None + else None + ) + + heading_level = get_heading_level(style_id, style_to_level) + list_style = get_list_style(style_id, style_to_list) + para_text = extract_text_with_formatting(child, namespaces) + + if para_text: + if heading_level > 0: + markdown_lines.append(f"{'#' * heading_level} {para_text}") + elif list_style == "bullet": + markdown_lines.append(f"- {para_text}") + elif list_style == "number": + markdown_lines.append(f"1. {para_text}") + else: + markdown_lines.append(para_text) + markdown_lines.append("") + + elif child.tag.endswith("}tbl"): + table_md = convert_table_to_markdown(child, namespaces) + if table_md: + markdown_lines.append(table_md) + markdown_lines.append("") + + content = "\n".join(markdown_lines) + if not content.strip(): + return None, "文档为空" + return content, None + except Exception as e: + return None, f"XML 解析失败: {str(e)}" diff --git a/temp/scripts/parser.py b/temp/scripts/parser.py new file mode 100644 index 0000000..bcfd0c6 --- /dev/null +++ b/temp/scripts/parser.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""文档解析器命令行交互模块,提供命令行接口。""" + +import argparse +import os +import sys + +import common +import docx +import pptx +import xlsx + + +def main() -> None: + parser = argparse.ArgumentParser( + description="将 DOCX、PPTX 或 XLSX 文件解析为 Markdown" + ) + + parser.add_argument("file_path", help="DOCX、PPTX 或 XLSX 文件的绝对路径") + + parser.add_argument( + "-n", + "--context", + type=int, + default=2, + help="与 -s 配合使用,指定每个检索结果包含的前后行数(不包含空行)", + ) + + group = parser.add_mutually_exclusive_group() + group.add_argument( + "-c", "--count", 
action="store_true", help="返回解析后的 markdown 文档的总字数" + ) + group.add_argument( + "-l", "--lines", action="store_true", help="返回解析后的 markdown 文档的总行数" + ) + group.add_argument( + "-t", + "--titles", + action="store_true", + help="返回解析后的 markdown 文档的标题行(1-6级)", + ) + group.add_argument( + "-tc", + "--title-content", + help="指定标题名称,输出该标题及其下级内容(不包含#号)", + ) + group.add_argument( + "-s", + "--search", + help="使用正则表达式搜索文档,返回所有匹配结果(用---分隔)", + ) + + args = parser.parse_args() + + if not os.path.exists(args.file_path): + print(f"错误: 文件不存在: {args.file_path}") + sys.exit(1) + + file_type = common.detect_file_type(args.file_path) + if not file_type: + print(f"错误: 不是有效的 DOCX、PPTX 或 XLSX 格式: {args.file_path}") + sys.exit(1) + + if file_type == "docx": + parsers = [ + ("MarkItDown", docx.parse_docx_with_markitdown), + ("python-docx", docx.parse_docx_with_python_docx), + ("XML 原生解析", docx.parse_docx_with_xml), + ] + elif file_type == "pptx": + parsers = [ + ("MarkItDown", pptx.parse_pptx_with_markitdown), + ("python-pptx", pptx.parse_pptx_with_python_pptx), + ("XML 原生解析", pptx.parse_pptx_with_xml), + ] + else: + parsers = [ + ("MarkItDown", xlsx.parse_xlsx_with_markitdown), + ("pandas", xlsx.parse_xlsx_with_pandas), + ("XML 原生解析", xlsx.parse_xlsx_with_xml), + ] + + failures = [] + content = None + + for parser_name, parser_func in parsers: + content, error = parser_func(args.file_path) + if content is not None: + content = common.remove_markdown_images(content) + content = common.normalize_markdown_whitespace(content) + break + else: + failures.append(f"- {parser_name}: {error}") + + if content is None: + print("所有解析方法均失败:") + for failure in failures: + print(failure) + sys.exit(1) + + if args.count: + print(len(content.replace("\n", ""))) + elif args.lines: + print(len(content.split("\n"))) + elif args.titles: + titles = common.extract_titles(content) + for title in titles: + print(title) + elif args.title_content: + title_content = common.extract_title_content(content, 
args.title_content) + if title_content is None: + print(f"错误: 未找到标题 '{args.title_content}'") + sys.exit(1) + print(title_content, end="") + elif args.search: + search_result = common.search_markdown(content, args.search, args.context) + if search_result is None: + print(f"错误: 正则表达式无效或未找到匹配: '{args.search}'") + sys.exit(1) + print(search_result, end="") + else: + print(content, end="") + + +if __name__ == "__main__": + main() diff --git a/temp/scripts/pptx.py b/temp/scripts/pptx.py new file mode 100644 index 0000000..2a8532f --- /dev/null +++ b/temp/scripts/pptx.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +"""PPTX 文件解析模块,提供三种解析方法。""" + +import re +import xml.etree.ElementTree as ET +import zipfile +from typing import Any, List, Optional, Tuple + +from common import build_markdown_table, filter_markdown_content, flush_list_stack + + +def parse_pptx_with_markitdown(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 MarkItDown 库解析 PPTX 文件""" + try: + from markitdown import MarkItDown + + md = MarkItDown() + result = md.convert(file_path) + if not result.text_content.strip(): + return None, "文档为空" + + filtered_content = filter_markdown_content(result.text_content) + if not filtered_content.strip(): + return None, "过滤后文档为空" + + return filtered_content, None + except ImportError: + return None, "MarkItDown 库未安装" + except Exception as e: + return None, f"MarkItDown 解析失败: {str(e)}" + + +def extract_formatted_text_pptx(runs: List[Any]) -> str: + """从 PPTX 文本运行中提取带有格式的文本""" + result = [] + for run in runs: + if not run.text: + continue + + text = run.text + + font = run.font + is_bold = getattr(font, "bold", False) or False + is_italic = getattr(font, "italic", False) or False + + if is_bold and is_italic: + text = f"***{text}***" + elif is_bold: + text = f"**{text}**" + elif is_italic: + text = f"*{text}*" + + result.append(text) + + return "".join(result).strip() + + +def convert_table_to_md_pptx(table: Any) -> str: + """将 PPTX 表格转换为 Markdown 格式""" + rows_data 
= [] + for row in table.rows: + row_data = [] + for cell in row.cells: + cell_content = [] + for para in cell.text_frame.paragraphs: + text = extract_formatted_text_pptx(para.runs) + if text: + cell_content.append(text) + cell_text = " ".join(cell_content).strip() + row_data.append(cell_text if cell_text else "") + rows_data.append(row_data) + return build_markdown_table(rows_data) + + +def parse_pptx_with_python_pptx(file_path: str) -> Tuple[Optional[str], Optional[str]]: + """使用 python-pptx 库解析 PPTX 文件""" + try: + from pptx import Presentation + from pptx.enum.shapes import MSO_SHAPE_TYPE + except ImportError: + return None, "python-pptx 库未安装" + + try: + prs = Presentation(file_path) + md_content = [] + + for slide_num, slide in enumerate(prs.slides, 1): + md_content.append(f"\n## Slide {slide_num}\n") + + list_stack = [] + + for shape in slide.shapes: + if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: + continue + + if hasattr(shape, "has_table") and shape.has_table: + if list_stack: + md_content.append( + "\n" + "\n".join([x for x in list_stack if x]) + "\n" + ) + list_stack.clear() + + table_md = convert_table_to_md_pptx(shape.table) + md_content.append(table_md) + + if hasattr(shape, "text_frame"): + for para in shape.text_frame.paragraphs: + pPr = para._element.pPr + is_list = False + if pPr is not None: + is_list = ( + para.level > 0 + or pPr.find( + ".//a:buChar", + namespaces={ + "a": "http://schemas.openxmlformats.org/drawingml/2006/main" + }, + ) + is not None + or pPr.find( + ".//a:buAutoNum", + namespaces={ + "a": "http://schemas.openxmlformats.org/drawingml/2006/main" + }, + ) + is not None + ) + + if is_list: + level = para.level + + while len(list_stack) <= level: + list_stack.append("") + + text = extract_formatted_text_pptx(para.runs) + if text: + pPr = para._element.pPr + is_ordered = ( + pPr is not None + and pPr.find( + ".//a:buAutoNum", + namespaces={ + "a": "http://schemas.openxmlformats.org/drawingml/2006/main" + }, + ) + is not None + ) + 
def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse a PPTX file by reading its slide XML parts directly.

    Slides are emitted in ascending slide-number order. For every slide the
    tables (``a:tbl``) are rendered first, followed by the paragraphs of each
    ``p:sp/p:txBody`` text body. Paragraphs carrying ``a:buChar`` or
    ``a:buAutoNum`` become Markdown list items; bold / italic runs are wrapped
    in ``**`` / ``*`` markers.

    Args:
        file_path: Path of the ``.pptx`` archive to read.

    Returns:
        ``(markdown, None)`` on success, or ``(None, error_message)`` when the
        archive cannot be parsed or yields no content.
    """
    pptx_namespace = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }
    slide_name_pattern = re.compile(r"ppt/slides/slide(\d+)\.xml$")

    def is_flag_set(run_props: Any, attr_name: str) -> bool:
        # DrawingML stores bold/italic as the boolean attributes ``b`` / ``i``
        # on <a:rPr>, not as child elements; xsd:boolean allows "1" and "true".
        return run_props is not None and run_props.get(attr_name) in ("1", "true")

    def extract_text_with_formatting_xml(text_elem: Any, namespaces: dict) -> str:
        result = []
        for run in text_elem.findall(".//a:r", namespaces=namespaces):
            t_elem = run.find(".//a:t", namespaces=namespaces)
            if t_elem is None or not t_elem.text:
                continue

            text = t_elem.text
            run_props = run.find("a:rPr", namespaces=namespaces)
            is_bold = is_flag_set(run_props, "b")
            is_italic = is_flag_set(run_props, "i")

            if is_bold and is_italic:
                text = f"***{text}***"
            elif is_bold:
                text = f"**{text}**"
            elif is_italic:
                text = f"*{text}*"

            result.append(text)

        return "".join(result).strip()

    def convert_table_to_md_xml(table_elem: Any, namespaces: dict) -> str:
        rows = table_elem.findall(".//a:tr", namespaces=namespaces)
        if not rows:
            return ""

        rows_data = []
        for row in rows:
            row_data = []
            for cell in row.findall(".//a:tc", namespaces=namespaces):
                cell_text = extract_text_with_formatting_xml(cell, namespaces)
                if cell_text:
                    # Line breaks inside a cell would break the Markdown row.
                    cell_text = cell_text.replace("\n", " ").replace("\r", "")
                row_data.append(cell_text if cell_text else "")
            rows_data.append(row_data)
        return build_markdown_table(rows_data)

    def is_list_item_xml(p_elem: Any, namespaces: dict) -> Tuple[bool, bool]:
        # Returns (is_list, is_ordered).
        if p_elem is None:
            return False, False

        pPr = p_elem.find(".//a:pPr", namespaces=namespaces)
        if pPr is None:
            return False, False

        if pPr.find(".//a:buChar", namespaces=namespaces) is not None:
            return True, False

        if pPr.find(".//a:buAutoNum", namespaces=namespaces) is not None:
            return True, True

        return False, False

    def get_indent_level_xml(p_elem: Any, namespaces: dict) -> int:
        if p_elem is None:
            return 0

        pPr = p_elem.find(".//a:pPr", namespaces=namespaces)
        if pPr is None:
            return 0

        lvl = pPr.get("lvl")
        return int(lvl) if lvl else 0

    try:
        md_content = []

        with zipfile.ZipFile(file_path) as zip_file:
            # Archive order is arbitrary (and a lexical order would put
            # slide10 before slide2), so order the parts by slide number.
            numbered_slides = []
            for member_name in zip_file.namelist():
                matched = slide_name_pattern.match(member_name)
                if matched:
                    numbered_slides.append((int(matched.group(1)), member_name))
            numbered_slides.sort()

            for slide_idx, (_, slide_file) in enumerate(numbered_slides, 1):
                md_content.append(f"\n## Slide {slide_idx}\n")

                with zip_file.open(slide_file) as slide_xml:
                    slide_root = ET.parse(slide_xml).getroot()

                for table in slide_root.findall(
                    ".//a:tbl", namespaces=pptx_namespace
                ):
                    table_md = convert_table_to_md_xml(table, pptx_namespace)
                    if table_md:
                        md_content.append(table_md)

                for tx_body in slide_root.findall(
                    ".//p:sp/p:txBody", namespaces=pptx_namespace
                ):
                    for para in tx_body.findall(".//a:p", namespaces=pptx_namespace):
                        text = extract_text_with_formatting_xml(para, pptx_namespace)
                        if not text:
                            continue

                        is_list, is_ordered = is_list_item_xml(para, pptx_namespace)
                        if is_list:
                            level = get_indent_level_xml(para, pptx_namespace)
                            marker = "1. " if is_ordered else "- "
                            indent = " " * level
                            md_content.append(f"{indent}{marker}{text}\n")
                        else:
                            md_content.append(f"{text}\n")

                md_content.append("---\n")

        content = "\n".join(md_content)
        if not content.strip():
            return None, "文档为空"
        return content, None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"
def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parse an XLSX file by reading its SpreadsheetML parts directly.

    Sheet names come from ``xl/workbook.xml``; each sheet's part is resolved
    through ``xl/_rels/workbook.xml.rels`` (falling back to the positional
    ``xl/worksheets/sheet{N}.xml`` guess when no relationship is available).
    Every sheet is rendered as a Markdown table whose first row is the header;
    columns that are empty in every row are dropped.

    Args:
        file_path: Path of the ``.xlsx`` archive to read.

    Returns:
        ``(markdown, None)`` on success, or ``(None, error_message)`` when the
        workbook cannot be read or lists no sheets.
    """
    xlsx_namespace = {
        "main": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
    }
    package_rel_namespace = {
        "rel": "http://schemas.openxmlformats.org/package/2006/relationships"
    }
    relationship_id_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    def flatten_line_breaks(text: str) -> str:
        # A raw line break inside a cell would split the Markdown table row.
        return text.replace("\n", " ").replace("\r", "")

    def read_rich_text(container: ET.Element) -> str:
        # A string item is either a single <t> or a sequence of <r><t> runs.
        # Phonetic hints (<rPh>) are deliberately not part of the cell text.
        text_nodes = container.findall("main:t", xlsx_namespace)
        text_nodes += container.findall("main:r/main:t", xlsx_namespace)
        return "".join(node.text or "" for node in text_nodes)

    def parse_col_index(cell_ref: str) -> int:
        # "A1" -> 0, "B7" -> 1, "AA3" -> 26 (bijective base-26 column letters).
        col_index = 0
        for char in cell_ref:
            if char.isalpha():
                col_index = col_index * 26 + (ord(char) - ord("A") + 1)
            else:
                break
        return col_index - 1

    def parse_cell_value(cell: ET.Element, shared_strings: List[str]) -> str:
        cell_type = cell.attrib.get("t")

        # Inline strings keep their text in <is>, and have no <v> element, so
        # they must be handled before the <v> lookup below.
        if cell_type == "inlineStr":
            is_elem = cell.find("main:is", xlsx_namespace)
            if is_elem is None:
                return ""
            return flatten_line_breaks(read_rich_text(is_elem))

        cell_value_elem = cell.find("main:v", xlsx_namespace)
        if cell_value_elem is None or not cell_value_elem.text:
            return ""
        cell_value = cell_value_elem.text

        if cell_type == "s":
            try:
                idx = int(cell_value)
            except ValueError:
                return ""
            if 0 <= idx < len(shared_strings):
                return flatten_line_breaks(shared_strings[idx])
            return ""
        if cell_type == "b":
            return "TRUE" if cell_value == "1" else "FALSE"
        if cell_type == "str":
            return flatten_line_breaks(cell_value)
        if cell_type == "e":
            error_codes = {
                "#NULL!": "空引用错误",
                "#DIV/0!": "除零错误",
                "#VALUE!": "值类型错误",
                "#REF!": "无效引用",
                "#NAME?": "名称错误",
                "#NUM!": "数值错误",
                "#N/A": "值不可用",
            }
            return error_codes.get(cell_value, f"错误: {cell_value}")
        if cell_type == "d":
            return f"[日期] {cell_value}"
        if cell_type is None:
            # Untyped cells are numeric; show whole numbers without ".0".
            try:
                float_val = float(cell_value)
            except ValueError:
                return cell_value
            if float_val.is_integer():
                return str(int(float_val))
            return cell_value
        return cell_value

    def get_non_empty_columns(data: List[List[str]]) -> set:
        non_empty_cols = set()
        for row in data:
            for col_idx, cell in enumerate(row):
                if cell and cell.strip():
                    non_empty_cols.add(col_idx)
        return non_empty_cols

    def filter_columns(row: List[str], non_empty_cols: set) -> List[str]:
        return [row[i] if i < len(row) else "" for i in sorted(non_empty_cols)]

    def data_to_markdown(data: List[List[str]], sheet_name: str) -> str:
        if not data or not data[0]:
            return f"## {sheet_name}\n\n*工作表为空*"

        non_empty_cols = get_non_empty_columns(data)
        if not non_empty_cols:
            return f"## {sheet_name}\n\n*工作表为空*"

        filtered_headers = filter_columns(data[0], non_empty_cols)
        md_lines = [
            f"## {sheet_name}",
            "",
            "| " + " | ".join(filtered_headers) + " |",
            "|" + "|".join(["---"] * len(filtered_headers)) + "|",
        ]
        for row in data[1:]:
            filtered_row = filter_columns(row, non_empty_cols)
            md_lines.append("| " + " | ".join(filtered_row) + " |")
        md_lines.append("")

        return "\n".join(md_lines)

    def load_sheet_targets(zip_file: zipfile.ZipFile) -> dict:
        # Maps relationship id -> archive member path of the referenced part.
        try:
            with zip_file.open("xl/_rels/workbook.xml.rels") as f:
                rels_root = ET.parse(f).getroot()
        except KeyError:
            return {}

        targets = {}
        for rel in rels_root.findall("rel:Relationship", package_rel_namespace):
            rel_id = rel.attrib.get("Id")
            target = rel.attrib.get("Target")
            if not rel_id or not target:
                continue
            if target.startswith("/"):
                # Absolute targets are relative to the package root.
                targets[rel_id] = target.lstrip("/")
            else:
                # Relative targets are relative to the xl/ folder.
                targets[rel_id] = "xl/" + target
        return targets

    def read_sheet_rows(sheet_root: ET.Element, shared_strings: List[str]) -> list:
        rows = []
        sheet_data = sheet_root.find("main:sheetData", xlsx_namespace)
        if sheet_data is None:
            return rows

        for row_elem in sheet_data.findall("main:row", xlsx_namespace):
            col_dict = {}
            for cell in row_elem.findall("main:c", xlsx_namespace):
                cell_ref = cell.attrib.get("r", "")
                if not cell_ref:
                    continue
                col_dict[parse_col_index(cell_ref)] = parse_cell_value(
                    cell, shared_strings
                )

            if col_dict:
                # Fill the gaps so every value sits at its real column index.
                max_col = max(col_dict.keys())
                rows.append([col_dict.get(i, "") for i in range(max_col + 1)])
        return rows

    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            sheets = []  # (sheet name, relationship id or None)
            try:
                with zip_file.open("xl/workbook.xml") as f:
                    root = ET.parse(f).getroot()
            except KeyError:
                return None, "无法解析工作表名称"
            for sheet in root.findall(".//main:sheet", xlsx_namespace):
                sheet_name = sheet.attrib.get("name", "")
                if sheet_name:
                    sheets.append((sheet_name, sheet.attrib.get(relationship_id_attr)))

            if not sheets:
                return None, "未找到工作表"

            shared_strings = []
            try:
                with zip_file.open("xl/sharedStrings.xml") as f:
                    root = ET.parse(f).getroot()
            except KeyError:
                # Workbooks without any string cell have no shared-strings part.
                pass
            else:
                for si in root.findall(".//main:si", xlsx_namespace):
                    shared_strings.append(read_rich_text(si))

            sheet_targets = load_sheet_targets(zip_file)

            sections = [
                "# Excel数据转换结果 (原生XML解析)\n\n",
                f"来源: {file_path}\n\n",
            ]

            for sheet_index, (sheet_name, rel_id) in enumerate(sheets, start=1):
                worksheet_path = sheet_targets.get(
                    rel_id, f"xl/worksheets/sheet{sheet_index}.xml"
                )
                try:
                    with zip_file.open(worksheet_path) as f:
                        sheet_root = ET.parse(f).getroot()
                except KeyError:
                    sections.append(f"## {sheet_name}\n\n*工作表解析失败*\n\n")
                    continue

                rows = read_sheet_rows(sheet_root, shared_strings)
                sections.append(data_to_markdown(rows, sheet_name) + "\n\n")

            return "".join(sections), None
    except Exception as e:
        return None, f"XML 解析失败: {str(e)}"