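修复bug：修正文档解析脚本中的多处问题

- common.py：safe_open_zip 不再拒绝不含 "/" 的文件名（例如 [Content_Types].xml）；is_valid_docx / is_valid_pptx / is_valid_xlsx 只调用一次 namelist()；get_heading_level 要求 "#" 之后是空格或行尾；extract_title_content 在多个匹配结果之间插入 "---" 分隔线；search_markdown 不再丢弃结果范围内的空行
- docx_parser.py：python-docx 解析按样式名精确匹配标题级别，并按文档正文顺序遍历段落和表格；XML 解析对样式名的匹配不区分大小写
- parser.py：改为导入 docx_parser / pdf_parser / pptx_parser / xlsx_parser 模块
- pptx_parser.py：XML 解析按幻灯片编号的数值顺序排序
- xlsx_parser.py：pandas 解析读取所有工作表；XML 解析通过 xl/_rels/workbook.xml.rels 解析工作表路径；输出中不再包含 "来源" 行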

2026-02-15 19:53:31 +08:00
parent b022ac736b
commit f30ea08805
6 changed files with 139 additions and 97 deletions
--- a/temp/scripts/common.py
+++ b/temp/scripts/common.py
@@ -39,16 +39,12 @@ def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipE
    """安全地从 ZipFile 中打开文件，防止路径遍历攻击"""
    if not name:
        return None
-    if name.startswith("/") or name.startswith("\\"):
-        return None
-    if name.startswith(".."):
+    if name.startswith("/") or name.startswith(".."):
        return None
    if "/../" in name or name.endswith("/.."):
        return None
    if "\\" in name:
        return None
-    if "/" not in name:
-        return None
    return zip_file.open(name)


@@ -75,11 +71,9 @@ def is_valid_docx(file_path: str) -> bool:
    """验证文件是否为有效的 DOCX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
+            names = set(zip_file.namelist())
            required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
-            for required in required_files:
-                if required not in zip_file.namelist():
-                    return False
-        return True
+            return all(r in names for r in required_files)
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False

@@ -88,15 +82,13 @@ def is_valid_pptx(file_path: str) -> bool:
    """验证文件是否为有效的 PPTX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
+            names = set(zip_file.namelist())
            required_files = [
                "[Content_Types].xml",
                "_rels/.rels",
                "ppt/presentation.xml",
            ]
-            for required in required_files:
-                if required not in zip_file.namelist():
-                    return False
-        return True
+            return all(r in names for r in required_files)
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False

@@ -105,11 +97,9 @@ def is_valid_xlsx(file_path: str) -> bool:
    """验证文件是否为有效的 XLSX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
+            names = set(zip_file.namelist())
            required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
-            for required in required_files:
-                if required not in zip_file.namelist():
-                    return False
-        return True
+            return all(r in names for r in required_files)
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False

@@ -177,7 +167,13 @@ def get_heading_level(line: str) -> int:
            level += 1
        else:
            break
-    return level if 1 <= level <= 6 else 0
+    if not (1 <= level <= 6):
+        return 0
+    if len(stripped) == level:
+        return level
+    if stripped[level] != " ":
+        return 0
+    return level


 def extract_titles(markdown_text: str) -> List[str]:
@@ -206,7 +202,10 @@ def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
        return None

    result_lines = []
-    for idx in match_indices:
+    for match_num, idx in enumerate(match_indices):
+        if match_num > 0:
+            result_lines.append("\n---\n")
+
        target_level = get_heading_level(lines[idx])

        parent_titles = []
@@ -288,7 +287,6 @@ def search_markdown(
            line
            for i, line in enumerate(lines)
            if start_line_idx <= i <= end_line_idx
-            and (line.strip() or i in selected_indices)
        ]
        results.append("\n".join(result_lines))

--- a/temp/scripts/docx_parser.py
+++ b/temp/scripts/docx_parser.py
@@ -37,17 +37,19 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
        def get_heading_level(para: Any) -> int:
            if para.style and para.style.name:
                style_name = para.style.name
-                if "Heading 1" in style_name or "Title" in style_name:
+                if style_name == "Title":
                    return 1
-                elif "Heading 2" in style_name:
+                elif style_name == "Heading 1":
+                    return 1
+                elif style_name == "Heading 2":
                    return 2
-                elif "Heading 3" in style_name:
+                elif style_name == "Heading 3":
                    return 3
-                elif "Heading 4" in style_name:
+                elif style_name == "Heading 4":
                    return 4
-                elif "Heading 5" in style_name:
+                elif style_name == "Heading 5":
                    return 5
-                elif "Heading 6" in style_name:
+                elif style_name == "Heading 6":
                    return 6
            return 0

@@ -89,7 +91,12 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
        markdown_lines = []
        prev_was_list = False

-        for para in doc.paragraphs:
+        from docx.table import Table as DocxTable
+        from docx.text.paragraph import Paragraph
+
+        for element in doc.element.body:
+            if element.tag.endswith('}p'):
+                para = Paragraph(element, doc)
                text = convert_runs_to_markdown(para.runs)
                if not text.strip():
                    continue
@@ -117,10 +124,13 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
                        markdown_lines.append("")
                        prev_was_list = False

-        for table in doc.tables:
+            elif element.tag.endswith('}tbl'):
+                table = DocxTable(element, doc)
                table_md = convert_table_to_markdown(table)
+                if table_md:
                    markdown_lines.append(table_md)
                    markdown_lines.append("")
+                prev_was_list = False

        content = "\n".join(markdown_lines)
        if not content.strip():
@@ -194,28 +204,29 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                        if style_id and style_name_elem is not None:
                            style_name = style_name_elem.get(f"{{{word_namespace}}}val")
                            if style_name:
-                                if style_name == "Title":
+                                style_name_lower = style_name.lower()
+                                if style_name_lower == "title":
                                    style_to_level[style_id] = 1
-                                elif style_name == "heading 1":
+                                elif style_name_lower == "heading 1":
                                    style_to_level[style_id] = 1
-                                elif style_name == "heading 2":
+                                elif style_name_lower == "heading 2":
                                    style_to_level[style_id] = 2
-                                elif style_name == "heading 3":
+                                elif style_name_lower == "heading 3":
                                    style_to_level[style_id] = 3
-                                elif style_name == "heading 4":
+                                elif style_name_lower == "heading 4":
                                    style_to_level[style_id] = 4
-                                elif style_name == "heading 5":
+                                elif style_name_lower == "heading 5":
                                    style_to_level[style_id] = 5
-                                elif style_name == "heading 6":
+                                elif style_name_lower == "heading 6":
                                    style_to_level[style_id] = 6
                                elif (
-                                    style_name.startswith("List Bullet")
-                                    or style_name == "Bullet"
+                                    style_name_lower.startswith("list bullet")
+                                    or style_name_lower == "bullet"
                                ):
                                    style_to_list[style_id] = "bullet"
                                elif (
-                                    style_name.startswith("List Number")
-                                    or style_name == "Number"
+                                    style_name_lower.startswith("list number")
+                                    or style_name_lower == "number"
                                ):
                                    style_to_list[style_id] = "number"
            except Exception:
--- a/temp/scripts/parser.py
+++ b/temp/scripts/parser.py
@@ -6,10 +6,10 @@ import os
 import sys

 import common
-import docx
-import pdf
-import pptx
-import xlsx
+import docx_parser
+import pdf_parser
+import pptx_parser
+import xlsx_parser


 def main() -> None:
@@ -64,27 +64,27 @@ def main() -> None:

    if file_type == "docx":
        parsers = [
-            ("MarkItDown", docx.parse_docx_with_markitdown),
-            ("python-docx", docx.parse_docx_with_python_docx),
-            ("XML 原生解析", docx.parse_docx_with_xml),
+            ("MarkItDown", docx_parser.parse_docx_with_markitdown),
+            ("python-docx", docx_parser.parse_docx_with_python_docx),
+            ("XML 原生解析", docx_parser.parse_docx_with_xml),
        ]
    elif file_type == "pptx":
        parsers = [
-            ("MarkItDown", pptx.parse_pptx_with_markitdown),
-            ("python-pptx", pptx.parse_pptx_with_python_pptx),
-            ("XML 原生解析", pptx.parse_pptx_with_xml),
+            ("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
+            ("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
+            ("XML 原生解析", pptx_parser.parse_pptx_with_xml),
        ]
    elif file_type == "xlsx":
        parsers = [
-            ("MarkItDown", xlsx.parse_xlsx_with_markitdown),
-            ("pandas", xlsx.parse_xlsx_with_pandas),
-            ("XML 原生解析", xlsx.parse_xlsx_with_xml),
+            ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
+            ("pandas", xlsx_parser.parse_xlsx_with_pandas),
+            ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
        ]
    else:
        parsers = [
-            ("MarkItDown", pdf.parse_pdf_with_markitdown),
-            ("unstructured", pdf.parse_pdf_with_unstructured),
-            ("pypdf", pdf.parse_pdf_with_pypdf),
+            ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
+            ("unstructured", pdf_parser.parse_pdf_with_unstructured),
+            ("pypdf", pdf_parser.parse_pdf_with_pypdf),
        ]

    failures = []
--- a/temp/scripts/pdf_parser.py
+++ b/temp/scripts/pdf_parser.py
--- a/temp/scripts/pptx_parser.py
+++ b/temp/scripts/pptx_parser.py
@@ -272,6 +272,9 @@ def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                for f in zip_file.namelist()
                if re.match(r"ppt/slides/slide\d+\.xml$", f)
            ]
+            slide_files.sort(
+                key=lambda f: int(re.search(r"slide(\d+)\.xml$", f).group(1))
+            )

            for slide_idx, slide_file in enumerate(slide_files, 1):
                md_content.append("\n## Slide {}\n".format(slide_idx))
--- a/temp/scripts/xlsx_parser.py
+++ b/temp/scripts/xlsx_parser.py
@@ -32,20 +32,25 @@ def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]
        return None, f"{missing_lib} 库未安装"

    try:
-        df = pd.read_excel(file_path)
+        sheets = pd.read_excel(file_path, sheet_name=None)

+        markdown_parts = []
+        for sheet_name, df in sheets.items():
            if len(df) == 0:
-            return None, "Excel 文件为空"
+                markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
+                continue

-        markdown_content = tabulate(
+            table_md = tabulate(
                df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
            )
+            markdown_parts.append(f"## {sheet_name}\n\n{table_md}")

-        markdown_with_header = (
-            f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}"
-        )
+        if not markdown_parts:
+            return None, "Excel 文件为空"

-        return markdown_with_header, None
+        markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
+
+        return markdown_content, None
    except Exception as e:
        return None, f"pandas 解析失败: {str(e)}"

@@ -165,20 +170,37 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            sheet_names = []
+            sheet_rids = []
            try:
                with zip_file.open("xl/workbook.xml") as f:
                    root = ET.parse(f).getroot()
+                    rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
                    sheet_elements = root.findall(".//main:sheet", xlsx_namespace)
                    for sheet in sheet_elements:
                        sheet_name = sheet.attrib.get("name", "")
+                        rid = sheet.attrib.get(f"{{{rel_ns}}}id", "")
                        if sheet_name:
                            sheet_names.append(sheet_name)
+                            sheet_rids.append(rid)
            except KeyError:
                return None, "无法解析工作表名称"

            if not sheet_names:
                return None, "未找到工作表"

+            rid_to_target = {}
+            try:
+                rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
+                with zip_file.open("xl/_rels/workbook.xml.rels") as f:
+                    rels_root = ET.parse(f).getroot()
+                    for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
+                        rid = rel.attrib.get("Id", "")
+                        target = rel.attrib.get("Target", "")
+                        if rid and target:
+                            rid_to_target[rid] = target
+            except KeyError:
+                pass
+
            shared_strings = []
            try:
                with zip_file.open("xl/sharedStrings.xml") as f:
@@ -193,11 +215,19 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                pass

            markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n"
-            markdown_content += f"来源: {file_path}\n\n"

-            for sheet_index, sheet_name in enumerate(sheet_names, start=1):
+            for sheet_index, sheet_name in enumerate(sheet_names):
+                rid = sheet_rids[sheet_index] if sheet_index < len(sheet_rids) else ""
+                target = rid_to_target.get(rid, "")
+                if target:
+                    if target.startswith("/"):
+                        worksheet_path = target.lstrip("/")
+                    else:
+                        worksheet_path = f"xl/{target}"
+                else:
+                    worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml"
+
                try:
-                    worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml"
                    with zip_file.open(worksheet_path) as f:
                        root = ET.parse(f).getroot()
                        sheet_data = root.find("main:sheetData", xlsx_namespace)