diff --git a/temp/scripts/common.py b/temp/scripts/common.py index 73f101d..f65fabb 100644 --- a/temp/scripts/common.py +++ b/temp/scripts/common.py @@ -23,7 +23,7 @@ def build_markdown_table(rows_data: List[List[str]]) -> str: row_text = [cell if cell else "" for cell in row_data] md_lines.append("| " + " | ".join(row_text) + " |") if i == 0: - md_lines.append("|" + " | ".join(["---"] * len(row_text)) + " |") + md_lines.append("| " + " | ".join(["---"] * len(row_text)) + " |") return "\n".join(md_lines) + "\n\n" @@ -39,16 +39,12 @@ def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipE """安全地从 ZipFile 中打开文件,防止路径遍历攻击""" if not name: return None - if name.startswith("/") or name.startswith("\\"): - return None - if name.startswith(".."): + if name.startswith("/") or name.startswith(".."): return None if "/../" in name or name.endswith("/.."): return None if "\\" in name: return None - if "/" not in name: - return None return zip_file.open(name) @@ -75,11 +71,9 @@ def is_valid_docx(file_path: str) -> bool: """验证文件是否为有效的 DOCX 格式""" try: with zipfile.ZipFile(file_path, "r") as zip_file: + names = set(zip_file.namelist()) required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True + return all(r in names for r in required_files) except (zipfile.BadZipFile, zipfile.LargeZipFile): return False @@ -88,15 +82,13 @@ def is_valid_pptx(file_path: str) -> bool: """验证文件是否为有效的 PPTX 格式""" try: with zipfile.ZipFile(file_path, "r") as zip_file: + names = set(zip_file.namelist()) required_files = [ "[Content_Types].xml", "_rels/.rels", "ppt/presentation.xml", ] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True + return all(r in names for r in required_files) except (zipfile.BadZipFile, zipfile.LargeZipFile): return False @@ -105,11 +97,9 @@ def is_valid_xlsx(file_path: str) -> bool: """验证文件是否为有效的 XLSX 格式""" try: with zipfile.ZipFile(file_path, "r") as zip_file: + names = set(zip_file.namelist()) required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"] - for required in required_files: - if required not in zip_file.namelist(): - return False - return True + return all(r in names for r in required_files) except (zipfile.BadZipFile, zipfile.LargeZipFile): return False @@ -177,7 +167,13 @@ def get_heading_level(line: str) -> int: level += 1 else: break - return level if 1 <= level <= 6 else 0 + if not (1 <= level <= 6): + return 0 + if len(stripped) == level: + return level + if stripped[level] != " ": + return 0 + return level def extract_titles(markdown_text: str) -> List[str]: @@ -206,7 +202,10 @@ def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]: return None result_lines = [] - for idx in match_indices: + for match_num, idx in enumerate(match_indices): + if match_num > 0: + result_lines.append("\n---\n") + target_level = get_heading_level(lines[idx]) parent_titles = [] @@ -288,7 +287,6 @@ def search_markdown( line for i, line in enumerate(lines) if start_line_idx <= i <= end_line_idx - and (line.strip() or i in selected_indices) ] results.append("\n".join(result_lines)) diff --git a/temp/scripts/docx.py b/temp/scripts/docx_parser.py similarity index 76% rename from temp/scripts/docx.py rename to temp/scripts/docx_parser.py index cf69402..cdf31b8 100644 --- a/temp/scripts/docx.py +++ b/temp/scripts/docx_parser.py @@ -37,17 +37,19 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional def get_heading_level(para: Any) -> int: if para.style and para.style.name: style_name = para.style.name - if "Heading 1" in style_name or "Title" in style_name: + if style_name == "Title": return 1 - elif "Heading 2" in style_name: + elif style_name == "Heading 1": + return 1 + elif style_name == "Heading 2": return 2 - elif "Heading 3" in style_name: + elif style_name == "Heading 3": return 3 - elif "Heading 4" in style_name: + elif style_name == "Heading 4": return 4 - elif "Heading 5" in style_name: + elif style_name == "Heading 5": return 5 - elif "Heading 6" in style_name: + elif style_name == "Heading 6": return 6 return 0 @@ -89,38 +91,46 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional markdown_lines = [] prev_was_list = False - for para in doc.paragraphs: - text = convert_runs_to_markdown(para.runs) - if not text.strip(): - continue + from docx.table import Table as DocxTable + from docx.text.paragraph import Paragraph - heading_level = get_heading_level(para) - if heading_level > 0: - markdown_lines.append(f"{'#' * heading_level} {text}") - prev_was_list = False - else: - list_style = get_list_style(para) - if list_style == "bullet": - if not prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(f"- {text}") - prev_was_list = True - elif list_style == "number": - if not prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(f"1. {text}") - prev_was_list = True - else: - if prev_was_list and markdown_lines: - markdown_lines.append("") - markdown_lines.append(text) - markdown_lines.append("") + for element in doc.element.body: + if element.tag.endswith('}p'): + para = Paragraph(element, doc) + text = convert_runs_to_markdown(para.runs) + if not text.strip(): + continue + + heading_level = get_heading_level(para) + if heading_level > 0: + markdown_lines.append(f"{'#' * heading_level} {text}") prev_was_list = False + else: + list_style = get_list_style(para) + if list_style == "bullet": + if not prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(f"- {text}") + prev_was_list = True + elif list_style == "number": + if not prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(f"1. {text}") + prev_was_list = True + else: + if prev_was_list and markdown_lines: + markdown_lines.append("") + markdown_lines.append(text) + markdown_lines.append("") + prev_was_list = False - for table in doc.tables: - table_md = convert_table_to_markdown(table) - markdown_lines.append(table_md) - markdown_lines.append("") + elif element.tag.endswith('}tbl'): + table = DocxTable(element, doc) + table_md = convert_table_to_markdown(table) + if table_md: + markdown_lines.append(table_md) + markdown_lines.append("") + prev_was_list = False content = "\n".join(markdown_lines) if not content.strip(): @@ -194,28 +204,29 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: if style_id and style_name_elem is not None: style_name = style_name_elem.get(f"{{{word_namespace}}}val") if style_name: - if style_name == "Title": + style_name_lower = style_name.lower() + if style_name_lower == "title": style_to_level[style_id] = 1 - elif style_name == "heading 1": + elif style_name_lower == "heading 1": style_to_level[style_id] = 1 - elif style_name == "heading 2": + elif style_name_lower == "heading 2": style_to_level[style_id] = 2 - elif style_name == "heading 3": + elif style_name_lower == "heading 3": style_to_level[style_id] = 3 - elif style_name == "heading 4": + elif style_name_lower == "heading 4": style_to_level[style_id] = 4 - elif style_name == "heading 5": + elif style_name_lower == "heading 5": style_to_level[style_id] = 5 - elif style_name == "heading 6": + elif style_name_lower == "heading 6": style_to_level[style_id] = 6 elif ( - style_name.startswith("List Bullet") - or style_name == "Bullet" + style_name_lower.startswith("list bullet") + or style_name_lower == "bullet" ): style_to_list[style_id] = "bullet" elif ( - style_name.startswith("List Number") - or style_name == "Number" + style_name_lower.startswith("list number") + or style_name_lower == "number" ): style_to_list[style_id] = "number" except Exception: diff --git a/temp/scripts/parser.py b/temp/scripts/parser.py index a8a6393..0eda960 100644 --- a/temp/scripts/parser.py +++ b/temp/scripts/parser.py @@ -6,10 +6,10 @@ import os import sys import common -import docx -import pdf -import pptx -import xlsx +import docx_parser +import pdf_parser +import pptx_parser +import xlsx_parser def main() -> None: @@ -64,27 +64,27 @@ def main() -> None: if file_type == "docx": parsers = [ - ("MarkItDown", docx.parse_docx_with_markitdown), - ("python-docx", docx.parse_docx_with_python_docx), - ("XML 原生解析", docx.parse_docx_with_xml), + ("MarkItDown", docx_parser.parse_docx_with_markitdown), + ("python-docx", docx_parser.parse_docx_with_python_docx), + ("XML 原生解析", docx_parser.parse_docx_with_xml), ] elif file_type == "pptx": parsers = [ - ("MarkItDown", pptx.parse_pptx_with_markitdown), - ("python-pptx", pptx.parse_pptx_with_python_pptx), - ("XML 原生解析", pptx.parse_pptx_with_xml), + ("MarkItDown", pptx_parser.parse_pptx_with_markitdown), + ("python-pptx", pptx_parser.parse_pptx_with_python_pptx), + ("XML 原生解析", pptx_parser.parse_pptx_with_xml), ] elif file_type == "xlsx": parsers = [ - ("MarkItDown", xlsx.parse_xlsx_with_markitdown), - ("pandas", xlsx.parse_xlsx_with_pandas), - ("XML 原生解析", xlsx.parse_xlsx_with_xml), + ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown), + ("pandas", xlsx_parser.parse_xlsx_with_pandas), + ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml), ] else: parsers = [ - ("MarkItDown", pdf.parse_pdf_with_markitdown), - ("unstructured", pdf.parse_pdf_with_unstructured), - ("pypdf", pdf.parse_pdf_with_pypdf), + ("MarkItDown", pdf_parser.parse_pdf_with_markitdown), + ("unstructured", pdf_parser.parse_pdf_with_unstructured), + ("pypdf", pdf_parser.parse_pdf_with_pypdf), ] failures = [] diff --git a/temp/scripts/pdf.py b/temp/scripts/pdf_parser.py similarity index 100% rename from temp/scripts/pdf.py rename to temp/scripts/pdf_parser.py diff --git a/temp/scripts/pptx.py b/temp/scripts/pptx_parser.py similarity index 99% rename from temp/scripts/pptx.py rename to temp/scripts/pptx_parser.py index 2a8532f..e8373b2 100644 --- a/temp/scripts/pptx.py +++ b/temp/scripts/pptx_parser.py @@ -272,6 +272,9 @@ def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: for f in zip_file.namelist() if re.match(r"ppt/slides/slide\d+\.xml$", f) ] + slide_files.sort( + key=lambda f: int(re.search(r"slide(\d+)\.xml$", f).group(1)) + ) for slide_idx, slide_file in enumerate(slide_files, 1): md_content.append("\n## Slide {}\n".format(slide_idx)) diff --git a/temp/scripts/xlsx.py b/temp/scripts/xlsx_parser.py similarity index 81% rename from temp/scripts/xlsx.py rename to temp/scripts/xlsx_parser.py index 8896a59..877c9d8 100644 --- a/temp/scripts/xlsx.py +++ b/temp/scripts/xlsx_parser.py @@ -32,20 +32,25 @@ def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str] return None, f"{missing_lib} 库未安装" try: - df = pd.read_excel(file_path) + sheets = pd.read_excel(file_path, sheet_name=None) - if len(df) == 0: + markdown_parts = [] + for sheet_name, df in sheets.items(): + if len(df) == 0: + markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*") + continue + + table_md = tabulate( + df, headers="keys", tablefmt="pipe", showindex=True, missingval="" + ) + markdown_parts.append(f"## {sheet_name}\n\n{table_md}") + + if not markdown_parts: return None, "Excel 文件为空" - markdown_content = tabulate( - df, headers="keys", tablefmt="pipe", showindex=True, missingval="" - ) + markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts) - markdown_with_header = ( - f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}" - ) - - return markdown_with_header, None + return markdown_content, None except Exception as e: return None, f"pandas 解析失败: {str(e)}" @@ -150,7 +155,7 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: header_line = "| " + " | ".join(filtered_headers) + " |" md_lines.append(header_line) - separator_line = "|" + "|".join(["---"] * len(filtered_headers)) + "|" + separator_line = "| " + " | ".join(["---"] * len(filtered_headers)) + " |" md_lines.append(separator_line) for row in data[1:]: @@ -165,20 +170,37 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: try: with zipfile.ZipFile(file_path, "r") as zip_file: sheet_names = [] + sheet_rids = [] try: with zip_file.open("xl/workbook.xml") as f: root = ET.parse(f).getroot() + rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" sheet_elements = root.findall(".//main:sheet", xlsx_namespace) for sheet in sheet_elements: sheet_name = sheet.attrib.get("name", "") + rid = sheet.attrib.get(f"{{{rel_ns}}}id", "") if sheet_name: sheet_names.append(sheet_name) + sheet_rids.append(rid) except KeyError: return None, "无法解析工作表名称" if not sheet_names: return None, "未找到工作表" + rid_to_target = {} + try: + rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships" + with zip_file.open("xl/_rels/workbook.xml.rels") as f: + rels_root = ET.parse(f).getroot() + for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"): + rid = rel.attrib.get("Id", "") + target = rel.attrib.get("Target", "") + if rid and target: + rid_to_target[rid] = target + except KeyError: + pass + shared_strings = [] try: with zip_file.open("xl/sharedStrings.xml") as f: @@ -193,11 +215,19 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]: pass markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n" - markdown_content += f"来源: {file_path}\n\n" - for sheet_index, sheet_name in enumerate(sheet_names, start=1): + for sheet_index, sheet_name in enumerate(sheet_names): + rid = sheet_rids[sheet_index] if sheet_index < len(sheet_rids) else "" + target = rid_to_target.get(rid, "") + if target: + if target.startswith("/"): + worksheet_path = target.lstrip("/") + else: + worksheet_path = f"xl/{target}" + else: + worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml" + try: - worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml" with zip_file.open(worksheet_path) as f: root = ET.parse(f).getroot() sheet_data = root.find("main:sheetData", xlsx_namespace)