修复bug

2026-02-15 19:53:31 +08:00
parent b022ac736b
commit f30ea08805
6 changed files with 139 additions and 97 deletions
--- a/temp/scripts/common.py
+++ b/temp/scripts/common.py
@@ -23,7 +23,7 @@ def build_markdown_table(rows_data: List[List[str]]) -> str:
        row_text = [cell if cell else "" for cell in row_data]
        md_lines.append("| " + " | ".join(row_text) + " |")
        if i == 0:
-            md_lines.append("|" + " | ".join(["---"] * len(row_text)) + " |")
+            md_lines.append("| " + " | ".join(["---"] * len(row_text)) + " |")
    return "\n".join(md_lines) + "\n\n"
@@ -39,16 +39,12 @@ def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipE
    """安全地从 ZipFile 中打开文件，防止路径遍历攻击"""
    if not name:
        return None
-    if name.startswith("/") or name.startswith("\\"):
+    if name.startswith("/") or name.startswith(".."):
        return None
    if name.startswith(".."):
        return None
    if "/../" in name or name.endswith("/.."):
        return None
    if "\\" in name:
        return None
    if "/" not in name:
        return None
    return zip_file.open(name)
@@ -75,11 +71,9 @@ def is_valid_docx(file_path: str) -> bool:
    """验证文件是否为有效的 DOCX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            names = set(zip_file.namelist())
            required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
-            for required in required_files:
+            return all(r in names for r in required_files)
                if required not in zip_file.namelist():
                    return False
        return True
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False
@@ -88,15 +82,13 @@ def is_valid_pptx(file_path: str) -> bool:
    """验证文件是否为有效的 PPTX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            names = set(zip_file.namelist())
            required_files = [
                "[Content_Types].xml",
                "_rels/.rels",
                "ppt/presentation.xml",
            ]
-            for required in required_files:
+            return all(r in names for r in required_files)
                if required not in zip_file.namelist():
                    return False
        return True
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False
@@ -105,11 +97,9 @@ def is_valid_xlsx(file_path: str) -> bool:
    """验证文件是否为有效的 XLSX 格式"""
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            names = set(zip_file.namelist())
            required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
-            for required in required_files:
+            return all(r in names for r in required_files)
                if required not in zip_file.namelist():
                    return False
        return True
    except (zipfile.BadZipFile, zipfile.LargeZipFile):
        return False
@@ -177,7 +167,13 @@ def get_heading_level(line: str) -> int:
            level += 1
        else:
            break
-    return level if 1 <= level <= 6 else 0
+    if not (1 <= level <= 6):
        return 0
    if len(stripped) == level:
        return level
    if stripped[level] != " ":
        return 0
    return level
 def extract_titles(markdown_text: str) -> List[str]:
@@ -206,7 +202,10 @@ def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
        return None
    result_lines = []
-    for idx in match_indices:
+    for match_num, idx in enumerate(match_indices):
        if match_num > 0:
            result_lines.append("\n---\n")
        target_level = get_heading_level(lines[idx])
        parent_titles = []
@@ -288,7 +287,6 @@ def search_markdown(
            line
            for i, line in enumerate(lines)
            if start_line_idx <= i <= end_line_idx
            and (line.strip() or i in selected_indices)
        ]
        results.append("\n".join(result_lines))
--- a/temp/scripts/docx_parser.py
+++ b/temp/scripts/docx_parser.py
@@ -37,17 +37,19 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
        def get_heading_level(para: Any) -> int:
            if para.style and para.style.name:
                style_name = para.style.name
-                if "Heading 1" in style_name or "Title" in style_name:
+                if style_name == "Title":
                    return 1
-                elif "Heading 2" in style_name:
+                elif style_name == "Heading 1":
                    return 1
                elif style_name == "Heading 2":
                    return 2
-                elif "Heading 3" in style_name:
+                elif style_name == "Heading 3":
                    return 3
-                elif "Heading 4" in style_name:
+                elif style_name == "Heading 4":
                    return 4
-                elif "Heading 5" in style_name:
+                elif style_name == "Heading 5":
                    return 5
-                elif "Heading 6" in style_name:
+                elif style_name == "Heading 6":
                    return 6
            return 0
@@ -89,38 +91,46 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
        markdown_lines = []
        prev_was_list = False
-        for para in doc.paragraphs:
+        from docx.table import Table as DocxTable
-            text = convert_runs_to_markdown(para.runs)
+        from docx.text.paragraph import Paragraph
            if not text.strip():
                continue
-            heading_level = get_heading_level(para)
+        for element in doc.element.body:
-            if heading_level > 0:
+            if element.tag.endswith('}p'):
-                markdown_lines.append(f"{'#' * heading_level} {text}")
+                para = Paragraph(element, doc)
-                prev_was_list = False
+                text = convert_runs_to_markdown(para.runs)
-            else:
+                if not text.strip():
-                list_style = get_list_style(para)
+                    continue
-                if list_style == "bullet":
+
-                    if not prev_was_list and markdown_lines:
+                heading_level = get_heading_level(para)
-                        markdown_lines.append("")
+                if heading_level > 0:
-                    markdown_lines.append(f"- {text}")
+                    markdown_lines.append(f"{'#' * heading_level} {text}")
                    prev_was_list = True
                elif list_style == "number":
                    if not prev_was_list and markdown_lines:
                        markdown_lines.append("")
                    markdown_lines.append(f"1. {text}")
                    prev_was_list = True
                else:
                    if prev_was_list and markdown_lines:
                        markdown_lines.append("")
                    markdown_lines.append(text)
                    markdown_lines.append("")
                    prev_was_list = False
                else:
                    list_style = get_list_style(para)
                    if list_style == "bullet":
                        if not prev_was_list and markdown_lines:
                            markdown_lines.append("")
                        markdown_lines.append(f"- {text}")
                        prev_was_list = True
                    elif list_style == "number":
                        if not prev_was_list and markdown_lines:
                            markdown_lines.append("")
                        markdown_lines.append(f"1. {text}")
                        prev_was_list = True
                    else:
                        if prev_was_list and markdown_lines:
                            markdown_lines.append("")
                        markdown_lines.append(text)
                        markdown_lines.append("")
                        prev_was_list = False
-        for table in doc.tables:
+            elif element.tag.endswith('}tbl'):
-            table_md = convert_table_to_markdown(table)
+                table = DocxTable(element, doc)
-            markdown_lines.append(table_md)
+                table_md = convert_table_to_markdown(table)
-            markdown_lines.append("")
+                if table_md:
                    markdown_lines.append(table_md)
                    markdown_lines.append("")
                prev_was_list = False
        content = "\n".join(markdown_lines)
        if not content.strip():
@@ -194,28 +204,29 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                        if style_id and style_name_elem is not None:
                            style_name = style_name_elem.get(f"{{{word_namespace}}}val")
                            if style_name:
-                                if style_name == "Title":
+                                style_name_lower = style_name.lower()
                                if style_name_lower == "title":
                                    style_to_level[style_id] = 1
-                                elif style_name == "heading 1":
+                                elif style_name_lower == "heading 1":
                                    style_to_level[style_id] = 1
-                                elif style_name == "heading 2":
+                                elif style_name_lower == "heading 2":
                                    style_to_level[style_id] = 2
-                                elif style_name == "heading 3":
+                                elif style_name_lower == "heading 3":
                                    style_to_level[style_id] = 3
-                                elif style_name == "heading 4":
+                                elif style_name_lower == "heading 4":
                                    style_to_level[style_id] = 4
-                                elif style_name == "heading 5":
+                                elif style_name_lower == "heading 5":
                                    style_to_level[style_id] = 5
-                                elif style_name == "heading 6":
+                                elif style_name_lower == "heading 6":
                                    style_to_level[style_id] = 6
                                elif (
-                                    style_name.startswith("List Bullet")
+                                    style_name_lower.startswith("list bullet")
-                                    or style_name == "Bullet"
+                                    or style_name_lower == "bullet"
                                ):
                                    style_to_list[style_id] = "bullet"
                                elif (
-                                    style_name.startswith("List Number")
+                                    style_name_lower.startswith("list number")
-                                    or style_name == "Number"
+                                    or style_name_lower == "number"
                                ):
                                    style_to_list[style_id] = "number"
            except Exception:
--- a/temp/scripts/parser.py
+++ b/temp/scripts/parser.py
@@ -6,10 +6,10 @@ import os
 import sys
 import common
-import docx
+import docx_parser
-import pdf
+import pdf_parser
-import pptx
+import pptx_parser
-import xlsx
+import xlsx_parser
 def main() -> None:
@@ -64,27 +64,27 @@ def main() -> None:
    if file_type == "docx":
        parsers = [
-            ("MarkItDown", docx.parse_docx_with_markitdown),
+            ("MarkItDown", docx_parser.parse_docx_with_markitdown),
-            ("python-docx", docx.parse_docx_with_python_docx),
+            ("python-docx", docx_parser.parse_docx_with_python_docx),
-            ("XML 原生解析", docx.parse_docx_with_xml),
+            ("XML 原生解析", docx_parser.parse_docx_with_xml),
        ]
    elif file_type == "pptx":
        parsers = [
-            ("MarkItDown", pptx.parse_pptx_with_markitdown),
+            ("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
-            ("python-pptx", pptx.parse_pptx_with_python_pptx),
+            ("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
-            ("XML 原生解析", pptx.parse_pptx_with_xml),
+            ("XML 原生解析", pptx_parser.parse_pptx_with_xml),
        ]
    elif file_type == "xlsx":
        parsers = [
-            ("MarkItDown", xlsx.parse_xlsx_with_markitdown),
+            ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
-            ("pandas", xlsx.parse_xlsx_with_pandas),
+            ("pandas", xlsx_parser.parse_xlsx_with_pandas),
-            ("XML 原生解析", xlsx.parse_xlsx_with_xml),
+            ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
        ]
    else:
        parsers = [
-            ("MarkItDown", pdf.parse_pdf_with_markitdown),
+            ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
-            ("unstructured", pdf.parse_pdf_with_unstructured),
+            ("unstructured", pdf_parser.parse_pdf_with_unstructured),
-            ("pypdf", pdf.parse_pdf_with_pypdf),
+            ("pypdf", pdf_parser.parse_pdf_with_pypdf),
        ]
    failures = []
--- a/temp/scripts/pdf_parser.py
+++ b/temp/scripts/pdf_parser.py
--- a/temp/scripts/pptx_parser.py
+++ b/temp/scripts/pptx_parser.py
@@ -272,6 +272,9 @@ def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                for f in zip_file.namelist()
                if re.match(r"ppt/slides/slide\d+\.xml$", f)
            ]
            slide_files.sort(
                key=lambda f: int(re.search(r"slide(\d+)\.xml$", f).group(1))
            )
            for slide_idx, slide_file in enumerate(slide_files, 1):
                md_content.append("\n## Slide {}\n".format(slide_idx))
--- a/temp/scripts/xlsx_parser.py
+++ b/temp/scripts/xlsx_parser.py
@@ -32,20 +32,25 @@ def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]
        return None, f"{missing_lib} 库未安装"
    try:
-        df = pd.read_excel(file_path)
+        sheets = pd.read_excel(file_path, sheet_name=None)
-        if len(df) == 0:
+        markdown_parts = []
        for sheet_name, df in sheets.items():
            if len(df) == 0:
                markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
                continue
            table_md = tabulate(
                df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
            )
            markdown_parts.append(f"## {sheet_name}\n\n{table_md}")
        if not markdown_parts:
            return None, "Excel 文件为空"
-        markdown_content = tabulate(
+        markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
            df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
        )
-        markdown_with_header = (
+        return markdown_content, None
            f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}"
        )
        return markdown_with_header, None
    except Exception as e:
        return None, f"pandas 解析失败: {str(e)}"
@@ -150,7 +155,7 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
        header_line = "| " + " | ".join(filtered_headers) + " |"
        md_lines.append(header_line)
-        separator_line = "|" + "|".join(["---"] * len(filtered_headers)) + "|"
+        separator_line = "| " + " | ".join(["---"] * len(filtered_headers)) + " |"
        md_lines.append(separator_line)
        for row in data[1:]:
@@ -165,20 +170,37 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
    try:
        with zipfile.ZipFile(file_path, "r") as zip_file:
            sheet_names = []
            sheet_rids = []
            try:
                with zip_file.open("xl/workbook.xml") as f:
                    root = ET.parse(f).getroot()
                    rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
                    sheet_elements = root.findall(".//main:sheet", xlsx_namespace)
                    for sheet in sheet_elements:
                        sheet_name = sheet.attrib.get("name", "")
                        rid = sheet.attrib.get(f"{{{rel_ns}}}id", "")
                        if sheet_name:
                            sheet_names.append(sheet_name)
                            sheet_rids.append(rid)
            except KeyError:
                return None, "无法解析工作表名称"
            if not sheet_names:
                return None, "未找到工作表"
            rid_to_target = {}
            try:
                rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
                with zip_file.open("xl/_rels/workbook.xml.rels") as f:
                    rels_root = ET.parse(f).getroot()
                    for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
                        rid = rel.attrib.get("Id", "")
                        target = rel.attrib.get("Target", "")
                        if rid and target:
                            rid_to_target[rid] = target
            except KeyError:
                pass
            shared_strings = []
            try:
                with zip_file.open("xl/sharedStrings.xml") as f:
@@ -193,11 +215,19 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
                pass
            markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n"
            markdown_content += f"来源: {file_path}\n\n"
-            for sheet_index, sheet_name in enumerate(sheet_names, start=1):
+            for sheet_index, sheet_name in enumerate(sheet_names):
                rid = sheet_rids[sheet_index] if sheet_index < len(sheet_rids) else ""
                target = rid_to_target.get(rid, "")
                if target:
                    if target.startswith("/"):
                        worksheet_path = target.lstrip("/")
                    else:
                        worksheet_path = f"xl/{target}"
                else:
                    worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml"
                try:
                    worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml"
                    with zip_file.open(worksheet_path) as f:
                        root = ET.parse(f).getroot()
                        sheet_data = root.find("main:sheetData", xlsx_namespace)