1
0

修复bug

This commit is contained in:
2026-02-15 19:53:31 +08:00
parent b022ac736b
commit f30ea08805
6 changed files with 139 additions and 97 deletions

View File

@@ -39,16 +39,12 @@ def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipE
"""安全地从 ZipFile 中打开文件,防止路径遍历攻击""" """安全地从 ZipFile 中打开文件,防止路径遍历攻击"""
if not name: if not name:
return None return None
if name.startswith("/") or name.startswith("\\"): if name.startswith("/") or name.startswith(".."):
return None
if name.startswith(".."):
return None return None
if "/../" in name or name.endswith("/.."): if "/../" in name or name.endswith("/.."):
return None return None
if "\\" in name: if "\\" in name:
return None return None
if "/" not in name:
return None
return zip_file.open(name) return zip_file.open(name)
@@ -75,11 +71,9 @@ def is_valid_docx(file_path: str) -> bool:
"""验证文件是否为有效的 DOCX 格式""" """验证文件是否为有效的 DOCX 格式"""
try: try:
with zipfile.ZipFile(file_path, "r") as zip_file: with zipfile.ZipFile(file_path, "r") as zip_file:
names = set(zip_file.namelist())
required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"] required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
for required in required_files: return all(r in names for r in required_files)
if required not in zip_file.namelist():
return False
return True
except (zipfile.BadZipFile, zipfile.LargeZipFile): except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False return False
@@ -88,15 +82,13 @@ def is_valid_pptx(file_path: str) -> bool:
"""验证文件是否为有效的 PPTX 格式""" """验证文件是否为有效的 PPTX 格式"""
try: try:
with zipfile.ZipFile(file_path, "r") as zip_file: with zipfile.ZipFile(file_path, "r") as zip_file:
names = set(zip_file.namelist())
required_files = [ required_files = [
"[Content_Types].xml", "[Content_Types].xml",
"_rels/.rels", "_rels/.rels",
"ppt/presentation.xml", "ppt/presentation.xml",
] ]
for required in required_files: return all(r in names for r in required_files)
if required not in zip_file.namelist():
return False
return True
except (zipfile.BadZipFile, zipfile.LargeZipFile): except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False return False
@@ -105,11 +97,9 @@ def is_valid_xlsx(file_path: str) -> bool:
"""验证文件是否为有效的 XLSX 格式""" """验证文件是否为有效的 XLSX 格式"""
try: try:
with zipfile.ZipFile(file_path, "r") as zip_file: with zipfile.ZipFile(file_path, "r") as zip_file:
names = set(zip_file.namelist())
required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"] required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
for required in required_files: return all(r in names for r in required_files)
if required not in zip_file.namelist():
return False
return True
except (zipfile.BadZipFile, zipfile.LargeZipFile): except (zipfile.BadZipFile, zipfile.LargeZipFile):
return False return False
@@ -177,7 +167,13 @@ def get_heading_level(line: str) -> int:
level += 1 level += 1
else: else:
break break
return level if 1 <= level <= 6 else 0 if not (1 <= level <= 6):
return 0
if len(stripped) == level:
return level
if stripped[level] != " ":
return 0
return level
def extract_titles(markdown_text: str) -> List[str]: def extract_titles(markdown_text: str) -> List[str]:
@@ -206,7 +202,10 @@ def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
return None return None
result_lines = [] result_lines = []
for idx in match_indices: for match_num, idx in enumerate(match_indices):
if match_num > 0:
result_lines.append("\n---\n")
target_level = get_heading_level(lines[idx]) target_level = get_heading_level(lines[idx])
parent_titles = [] parent_titles = []
@@ -288,7 +287,6 @@ def search_markdown(
line line
for i, line in enumerate(lines) for i, line in enumerate(lines)
if start_line_idx <= i <= end_line_idx if start_line_idx <= i <= end_line_idx
and (line.strip() or i in selected_indices)
] ]
results.append("\n".join(result_lines)) results.append("\n".join(result_lines))

View File

@@ -37,17 +37,19 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
def get_heading_level(para: Any) -> int: def get_heading_level(para: Any) -> int:
if para.style and para.style.name: if para.style and para.style.name:
style_name = para.style.name style_name = para.style.name
if "Heading 1" in style_name or "Title" in style_name: if style_name == "Title":
return 1 return 1
elif "Heading 2" in style_name: elif style_name == "Heading 1":
return 1
elif style_name == "Heading 2":
return 2 return 2
elif "Heading 3" in style_name: elif style_name == "Heading 3":
return 3 return 3
elif "Heading 4" in style_name: elif style_name == "Heading 4":
return 4 return 4
elif "Heading 5" in style_name: elif style_name == "Heading 5":
return 5 return 5
elif "Heading 6" in style_name: elif style_name == "Heading 6":
return 6 return 6
return 0 return 0
@@ -89,7 +91,12 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
markdown_lines = [] markdown_lines = []
prev_was_list = False prev_was_list = False
for para in doc.paragraphs: from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
for element in doc.element.body:
if element.tag.endswith('}p'):
para = Paragraph(element, doc)
text = convert_runs_to_markdown(para.runs) text = convert_runs_to_markdown(para.runs)
if not text.strip(): if not text.strip():
continue continue
@@ -117,10 +124,13 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
markdown_lines.append("") markdown_lines.append("")
prev_was_list = False prev_was_list = False
for table in doc.tables: elif element.tag.endswith('}tbl'):
table = DocxTable(element, doc)
table_md = convert_table_to_markdown(table) table_md = convert_table_to_markdown(table)
if table_md:
markdown_lines.append(table_md) markdown_lines.append(table_md)
markdown_lines.append("") markdown_lines.append("")
prev_was_list = False
content = "\n".join(markdown_lines) content = "\n".join(markdown_lines)
if not content.strip(): if not content.strip():
@@ -194,28 +204,29 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
if style_id and style_name_elem is not None: if style_id and style_name_elem is not None:
style_name = style_name_elem.get(f"{{{word_namespace}}}val") style_name = style_name_elem.get(f"{{{word_namespace}}}val")
if style_name: if style_name:
if style_name == "Title": style_name_lower = style_name.lower()
if style_name_lower == "title":
style_to_level[style_id] = 1 style_to_level[style_id] = 1
elif style_name == "heading 1": elif style_name_lower == "heading 1":
style_to_level[style_id] = 1 style_to_level[style_id] = 1
elif style_name == "heading 2": elif style_name_lower == "heading 2":
style_to_level[style_id] = 2 style_to_level[style_id] = 2
elif style_name == "heading 3": elif style_name_lower == "heading 3":
style_to_level[style_id] = 3 style_to_level[style_id] = 3
elif style_name == "heading 4": elif style_name_lower == "heading 4":
style_to_level[style_id] = 4 style_to_level[style_id] = 4
elif style_name == "heading 5": elif style_name_lower == "heading 5":
style_to_level[style_id] = 5 style_to_level[style_id] = 5
elif style_name == "heading 6": elif style_name_lower == "heading 6":
style_to_level[style_id] = 6 style_to_level[style_id] = 6
elif ( elif (
style_name.startswith("List Bullet") style_name_lower.startswith("list bullet")
or style_name == "Bullet" or style_name_lower == "bullet"
): ):
style_to_list[style_id] = "bullet" style_to_list[style_id] = "bullet"
elif ( elif (
style_name.startswith("List Number") style_name_lower.startswith("list number")
or style_name == "Number" or style_name_lower == "number"
): ):
style_to_list[style_id] = "number" style_to_list[style_id] = "number"
except Exception: except Exception:

View File

@@ -6,10 +6,10 @@ import os
import sys import sys
import common import common
import docx import docx_parser
import pdf import pdf_parser
import pptx import pptx_parser
import xlsx import xlsx_parser
def main() -> None: def main() -> None:
@@ -64,27 +64,27 @@ def main() -> None:
if file_type == "docx": if file_type == "docx":
parsers = [ parsers = [
("MarkItDown", docx.parse_docx_with_markitdown), ("MarkItDown", docx_parser.parse_docx_with_markitdown),
("python-docx", docx.parse_docx_with_python_docx), ("python-docx", docx_parser.parse_docx_with_python_docx),
("XML 原生解析", docx.parse_docx_with_xml), ("XML 原生解析", docx_parser.parse_docx_with_xml),
] ]
elif file_type == "pptx": elif file_type == "pptx":
parsers = [ parsers = [
("MarkItDown", pptx.parse_pptx_with_markitdown), ("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
("python-pptx", pptx.parse_pptx_with_python_pptx), ("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
("XML 原生解析", pptx.parse_pptx_with_xml), ("XML 原生解析", pptx_parser.parse_pptx_with_xml),
] ]
elif file_type == "xlsx": elif file_type == "xlsx":
parsers = [ parsers = [
("MarkItDown", xlsx.parse_xlsx_with_markitdown), ("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
("pandas", xlsx.parse_xlsx_with_pandas), ("pandas", xlsx_parser.parse_xlsx_with_pandas),
("XML 原生解析", xlsx.parse_xlsx_with_xml), ("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
] ]
else: else:
parsers = [ parsers = [
("MarkItDown", pdf.parse_pdf_with_markitdown), ("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
("unstructured", pdf.parse_pdf_with_unstructured), ("unstructured", pdf_parser.parse_pdf_with_unstructured),
("pypdf", pdf.parse_pdf_with_pypdf), ("pypdf", pdf_parser.parse_pdf_with_pypdf),
] ]
failures = [] failures = []

View File

@@ -272,6 +272,9 @@ def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
for f in zip_file.namelist() for f in zip_file.namelist()
if re.match(r"ppt/slides/slide\d+\.xml$", f) if re.match(r"ppt/slides/slide\d+\.xml$", f)
] ]
slide_files.sort(
key=lambda f: int(re.search(r"slide(\d+)\.xml$", f).group(1))
)
for slide_idx, slide_file in enumerate(slide_files, 1): for slide_idx, slide_file in enumerate(slide_files, 1):
md_content.append("\n## Slide {}\n".format(slide_idx)) md_content.append("\n## Slide {}\n".format(slide_idx))

View File

@@ -32,20 +32,25 @@ def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]
return None, f"{missing_lib} 库未安装" return None, f"{missing_lib} 库未安装"
try: try:
df = pd.read_excel(file_path) sheets = pd.read_excel(file_path, sheet_name=None)
markdown_parts = []
for sheet_name, df in sheets.items():
if len(df) == 0: if len(df) == 0:
return None, "Excel 文件为空" markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
continue
markdown_content = tabulate( table_md = tabulate(
df, headers="keys", tablefmt="pipe", showindex=True, missingval="" df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
) )
markdown_parts.append(f"## {sheet_name}\n\n{table_md}")
markdown_with_header = ( if not markdown_parts:
f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}" return None, "Excel 文件为空"
)
return markdown_with_header, None markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
return markdown_content, None
except Exception as e: except Exception as e:
return None, f"pandas 解析失败: {str(e)}" return None, f"pandas 解析失败: {str(e)}"
@@ -165,20 +170,37 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
try: try:
with zipfile.ZipFile(file_path, "r") as zip_file: with zipfile.ZipFile(file_path, "r") as zip_file:
sheet_names = [] sheet_names = []
sheet_rids = []
try: try:
with zip_file.open("xl/workbook.xml") as f: with zip_file.open("xl/workbook.xml") as f:
root = ET.parse(f).getroot() root = ET.parse(f).getroot()
rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
sheet_elements = root.findall(".//main:sheet", xlsx_namespace) sheet_elements = root.findall(".//main:sheet", xlsx_namespace)
for sheet in sheet_elements: for sheet in sheet_elements:
sheet_name = sheet.attrib.get("name", "") sheet_name = sheet.attrib.get("name", "")
rid = sheet.attrib.get(f"{{{rel_ns}}}id", "")
if sheet_name: if sheet_name:
sheet_names.append(sheet_name) sheet_names.append(sheet_name)
sheet_rids.append(rid)
except KeyError: except KeyError:
return None, "无法解析工作表名称" return None, "无法解析工作表名称"
if not sheet_names: if not sheet_names:
return None, "未找到工作表" return None, "未找到工作表"
rid_to_target = {}
try:
rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
with zip_file.open("xl/_rels/workbook.xml.rels") as f:
rels_root = ET.parse(f).getroot()
for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
rid = rel.attrib.get("Id", "")
target = rel.attrib.get("Target", "")
if rid and target:
rid_to_target[rid] = target
except KeyError:
pass
shared_strings = [] shared_strings = []
try: try:
with zip_file.open("xl/sharedStrings.xml") as f: with zip_file.open("xl/sharedStrings.xml") as f:
@@ -193,11 +215,19 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
pass pass
markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n" markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n"
markdown_content += f"来源: {file_path}\n\n"
for sheet_index, sheet_name in enumerate(sheet_names, start=1): for sheet_index, sheet_name in enumerate(sheet_names):
rid = sheet_rids[sheet_index] if sheet_index < len(sheet_rids) else ""
target = rid_to_target.get(rid, "")
if target:
if target.startswith("/"):
worksheet_path = target.lstrip("/")
else:
worksheet_path = f"xl/{target}"
else:
worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml"
try: try:
worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml"
with zip_file.open(worksheet_path) as f: with zip_file.open(worksheet_path) as f:
root = ET.parse(f).getroot() root = ET.parse(f).getroot()
sheet_data = root.find("main:sheetData", xlsx_namespace) sheet_data = root.find("main:sheetData", xlsx_namespace)