修复bug
This commit is contained in:
@@ -39,16 +39,12 @@ def safe_open_zip(zip_file: zipfile.ZipFile, name: str) -> Optional[zipfile.ZipE
|
||||
"""安全地从 ZipFile 中打开文件,防止路径遍历攻击"""
|
||||
if not name:
|
||||
return None
|
||||
if name.startswith("/") or name.startswith("\\"):
|
||||
return None
|
||||
if name.startswith(".."):
|
||||
if name.startswith("/") or name.startswith(".."):
|
||||
return None
|
||||
if "/../" in name or name.endswith("/.."):
|
||||
return None
|
||||
if "\\" in name:
|
||||
return None
|
||||
if "/" not in name:
|
||||
return None
|
||||
return zip_file.open(name)
|
||||
|
||||
|
||||
@@ -75,11 +71,9 @@ def is_valid_docx(file_path: str) -> bool:
|
||||
"""验证文件是否为有效的 DOCX 格式"""
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, "r") as zip_file:
|
||||
names = set(zip_file.namelist())
|
||||
required_files = ["[Content_Types].xml", "_rels/.rels", "word/document.xml"]
|
||||
for required in required_files:
|
||||
if required not in zip_file.namelist():
|
||||
return False
|
||||
return True
|
||||
return all(r in names for r in required_files)
|
||||
except (zipfile.BadZipFile, zipfile.LargeZipFile):
|
||||
return False
|
||||
|
||||
@@ -88,15 +82,13 @@ def is_valid_pptx(file_path: str) -> bool:
|
||||
"""验证文件是否为有效的 PPTX 格式"""
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, "r") as zip_file:
|
||||
names = set(zip_file.namelist())
|
||||
required_files = [
|
||||
"[Content_Types].xml",
|
||||
"_rels/.rels",
|
||||
"ppt/presentation.xml",
|
||||
]
|
||||
for required in required_files:
|
||||
if required not in zip_file.namelist():
|
||||
return False
|
||||
return True
|
||||
return all(r in names for r in required_files)
|
||||
except (zipfile.BadZipFile, zipfile.LargeZipFile):
|
||||
return False
|
||||
|
||||
@@ -105,11 +97,9 @@ def is_valid_xlsx(file_path: str) -> bool:
|
||||
"""验证文件是否为有效的 XLSX 格式"""
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, "r") as zip_file:
|
||||
names = set(zip_file.namelist())
|
||||
required_files = ["[Content_Types].xml", "_rels/.rels", "xl/workbook.xml"]
|
||||
for required in required_files:
|
||||
if required not in zip_file.namelist():
|
||||
return False
|
||||
return True
|
||||
return all(r in names for r in required_files)
|
||||
except (zipfile.BadZipFile, zipfile.LargeZipFile):
|
||||
return False
|
||||
|
||||
@@ -177,7 +167,13 @@ def get_heading_level(line: str) -> int:
|
||||
level += 1
|
||||
else:
|
||||
break
|
||||
return level if 1 <= level <= 6 else 0
|
||||
if not (1 <= level <= 6):
|
||||
return 0
|
||||
if len(stripped) == level:
|
||||
return level
|
||||
if stripped[level] != " ":
|
||||
return 0
|
||||
return level
|
||||
|
||||
|
||||
def extract_titles(markdown_text: str) -> List[str]:
|
||||
@@ -206,7 +202,10 @@ def extract_title_content(markdown_text: str, title_name: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
result_lines = []
|
||||
for idx in match_indices:
|
||||
for match_num, idx in enumerate(match_indices):
|
||||
if match_num > 0:
|
||||
result_lines.append("\n---\n")
|
||||
|
||||
target_level = get_heading_level(lines[idx])
|
||||
|
||||
parent_titles = []
|
||||
@@ -288,7 +287,6 @@ def search_markdown(
|
||||
line
|
||||
for i, line in enumerate(lines)
|
||||
if start_line_idx <= i <= end_line_idx
|
||||
and (line.strip() or i in selected_indices)
|
||||
]
|
||||
results.append("\n".join(result_lines))
|
||||
|
||||
|
||||
@@ -37,17 +37,19 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
|
||||
def get_heading_level(para: Any) -> int:
|
||||
if para.style and para.style.name:
|
||||
style_name = para.style.name
|
||||
if "Heading 1" in style_name or "Title" in style_name:
|
||||
if style_name == "Title":
|
||||
return 1
|
||||
elif "Heading 2" in style_name:
|
||||
elif style_name == "Heading 1":
|
||||
return 1
|
||||
elif style_name == "Heading 2":
|
||||
return 2
|
||||
elif "Heading 3" in style_name:
|
||||
elif style_name == "Heading 3":
|
||||
return 3
|
||||
elif "Heading 4" in style_name:
|
||||
elif style_name == "Heading 4":
|
||||
return 4
|
||||
elif "Heading 5" in style_name:
|
||||
elif style_name == "Heading 5":
|
||||
return 5
|
||||
elif "Heading 6" in style_name:
|
||||
elif style_name == "Heading 6":
|
||||
return 6
|
||||
return 0
|
||||
|
||||
@@ -89,7 +91,12 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
|
||||
markdown_lines = []
|
||||
prev_was_list = False
|
||||
|
||||
for para in doc.paragraphs:
|
||||
from docx.table import Table as DocxTable
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
for element in doc.element.body:
|
||||
if element.tag.endswith('}p'):
|
||||
para = Paragraph(element, doc)
|
||||
text = convert_runs_to_markdown(para.runs)
|
||||
if not text.strip():
|
||||
continue
|
||||
@@ -117,10 +124,13 @@ def parse_docx_with_python_docx(file_path: str) -> Tuple[Optional[str], Optional
|
||||
markdown_lines.append("")
|
||||
prev_was_list = False
|
||||
|
||||
for table in doc.tables:
|
||||
elif element.tag.endswith('}tbl'):
|
||||
table = DocxTable(element, doc)
|
||||
table_md = convert_table_to_markdown(table)
|
||||
if table_md:
|
||||
markdown_lines.append(table_md)
|
||||
markdown_lines.append("")
|
||||
prev_was_list = False
|
||||
|
||||
content = "\n".join(markdown_lines)
|
||||
if not content.strip():
|
||||
@@ -194,28 +204,29 @@ def parse_docx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
if style_id and style_name_elem is not None:
|
||||
style_name = style_name_elem.get(f"{{{word_namespace}}}val")
|
||||
if style_name:
|
||||
if style_name == "Title":
|
||||
style_name_lower = style_name.lower()
|
||||
if style_name_lower == "title":
|
||||
style_to_level[style_id] = 1
|
||||
elif style_name == "heading 1":
|
||||
elif style_name_lower == "heading 1":
|
||||
style_to_level[style_id] = 1
|
||||
elif style_name == "heading 2":
|
||||
elif style_name_lower == "heading 2":
|
||||
style_to_level[style_id] = 2
|
||||
elif style_name == "heading 3":
|
||||
elif style_name_lower == "heading 3":
|
||||
style_to_level[style_id] = 3
|
||||
elif style_name == "heading 4":
|
||||
elif style_name_lower == "heading 4":
|
||||
style_to_level[style_id] = 4
|
||||
elif style_name == "heading 5":
|
||||
elif style_name_lower == "heading 5":
|
||||
style_to_level[style_id] = 5
|
||||
elif style_name == "heading 6":
|
||||
elif style_name_lower == "heading 6":
|
||||
style_to_level[style_id] = 6
|
||||
elif (
|
||||
style_name.startswith("List Bullet")
|
||||
or style_name == "Bullet"
|
||||
style_name_lower.startswith("list bullet")
|
||||
or style_name_lower == "bullet"
|
||||
):
|
||||
style_to_list[style_id] = "bullet"
|
||||
elif (
|
||||
style_name.startswith("List Number")
|
||||
or style_name == "Number"
|
||||
style_name_lower.startswith("list number")
|
||||
or style_name_lower == "number"
|
||||
):
|
||||
style_to_list[style_id] = "number"
|
||||
except Exception:
|
||||
@@ -6,10 +6,10 @@ import os
|
||||
import sys
|
||||
|
||||
import common
|
||||
import docx
|
||||
import pdf
|
||||
import pptx
|
||||
import xlsx
|
||||
import docx_parser
|
||||
import pdf_parser
|
||||
import pptx_parser
|
||||
import xlsx_parser
|
||||
|
||||
|
||||
def main() -> None:
|
||||
@@ -64,27 +64,27 @@ def main() -> None:
|
||||
|
||||
if file_type == "docx":
|
||||
parsers = [
|
||||
("MarkItDown", docx.parse_docx_with_markitdown),
|
||||
("python-docx", docx.parse_docx_with_python_docx),
|
||||
("XML 原生解析", docx.parse_docx_with_xml),
|
||||
("MarkItDown", docx_parser.parse_docx_with_markitdown),
|
||||
("python-docx", docx_parser.parse_docx_with_python_docx),
|
||||
("XML 原生解析", docx_parser.parse_docx_with_xml),
|
||||
]
|
||||
elif file_type == "pptx":
|
||||
parsers = [
|
||||
("MarkItDown", pptx.parse_pptx_with_markitdown),
|
||||
("python-pptx", pptx.parse_pptx_with_python_pptx),
|
||||
("XML 原生解析", pptx.parse_pptx_with_xml),
|
||||
("MarkItDown", pptx_parser.parse_pptx_with_markitdown),
|
||||
("python-pptx", pptx_parser.parse_pptx_with_python_pptx),
|
||||
("XML 原生解析", pptx_parser.parse_pptx_with_xml),
|
||||
]
|
||||
elif file_type == "xlsx":
|
||||
parsers = [
|
||||
("MarkItDown", xlsx.parse_xlsx_with_markitdown),
|
||||
("pandas", xlsx.parse_xlsx_with_pandas),
|
||||
("XML 原生解析", xlsx.parse_xlsx_with_xml),
|
||||
("MarkItDown", xlsx_parser.parse_xlsx_with_markitdown),
|
||||
("pandas", xlsx_parser.parse_xlsx_with_pandas),
|
||||
("XML 原生解析", xlsx_parser.parse_xlsx_with_xml),
|
||||
]
|
||||
else:
|
||||
parsers = [
|
||||
("MarkItDown", pdf.parse_pdf_with_markitdown),
|
||||
("unstructured", pdf.parse_pdf_with_unstructured),
|
||||
("pypdf", pdf.parse_pdf_with_pypdf),
|
||||
("MarkItDown", pdf_parser.parse_pdf_with_markitdown),
|
||||
("unstructured", pdf_parser.parse_pdf_with_unstructured),
|
||||
("pypdf", pdf_parser.parse_pdf_with_pypdf),
|
||||
]
|
||||
|
||||
failures = []
|
||||
|
||||
@@ -272,6 +272,9 @@ def parse_pptx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
for f in zip_file.namelist()
|
||||
if re.match(r"ppt/slides/slide\d+\.xml$", f)
|
||||
]
|
||||
slide_files.sort(
|
||||
key=lambda f: int(re.search(r"slide(\d+)\.xml$", f).group(1))
|
||||
)
|
||||
|
||||
for slide_idx, slide_file in enumerate(slide_files, 1):
|
||||
md_content.append("\n## Slide {}\n".format(slide_idx))
|
||||
@@ -32,20 +32,25 @@ def parse_xlsx_with_pandas(file_path: str) -> Tuple[Optional[str], Optional[str]
|
||||
return None, f"{missing_lib} 库未安装"
|
||||
|
||||
try:
|
||||
df = pd.read_excel(file_path)
|
||||
sheets = pd.read_excel(file_path, sheet_name=None)
|
||||
|
||||
markdown_parts = []
|
||||
for sheet_name, df in sheets.items():
|
||||
if len(df) == 0:
|
||||
return None, "Excel 文件为空"
|
||||
markdown_parts.append(f"## {sheet_name}\n\n*工作表为空*")
|
||||
continue
|
||||
|
||||
markdown_content = tabulate(
|
||||
table_md = tabulate(
|
||||
df, headers="keys", tablefmt="pipe", showindex=True, missingval=""
|
||||
)
|
||||
markdown_parts.append(f"## {sheet_name}\n\n{table_md}")
|
||||
|
||||
markdown_with_header = (
|
||||
f"# Excel数据转换结果\n\n来源: {file_path}\n\n{markdown_content}"
|
||||
)
|
||||
if not markdown_parts:
|
||||
return None, "Excel 文件为空"
|
||||
|
||||
return markdown_with_header, None
|
||||
markdown_content = "# Excel数据转换结果\n\n" + "\n\n".join(markdown_parts)
|
||||
|
||||
return markdown_content, None
|
||||
except Exception as e:
|
||||
return None, f"pandas 解析失败: {str(e)}"
|
||||
|
||||
@@ -165,20 +170,37 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, "r") as zip_file:
|
||||
sheet_names = []
|
||||
sheet_rids = []
|
||||
try:
|
||||
with zip_file.open("xl/workbook.xml") as f:
|
||||
root = ET.parse(f).getroot()
|
||||
rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
sheet_elements = root.findall(".//main:sheet", xlsx_namespace)
|
||||
for sheet in sheet_elements:
|
||||
sheet_name = sheet.attrib.get("name", "")
|
||||
rid = sheet.attrib.get(f"{{{rel_ns}}}id", "")
|
||||
if sheet_name:
|
||||
sheet_names.append(sheet_name)
|
||||
sheet_rids.append(rid)
|
||||
except KeyError:
|
||||
return None, "无法解析工作表名称"
|
||||
|
||||
if not sheet_names:
|
||||
return None, "未找到工作表"
|
||||
|
||||
rid_to_target = {}
|
||||
try:
|
||||
rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
|
||||
with zip_file.open("xl/_rels/workbook.xml.rels") as f:
|
||||
rels_root = ET.parse(f).getroot()
|
||||
for rel in rels_root.findall(f"{{{rels_ns}}}Relationship"):
|
||||
rid = rel.attrib.get("Id", "")
|
||||
target = rel.attrib.get("Target", "")
|
||||
if rid and target:
|
||||
rid_to_target[rid] = target
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
shared_strings = []
|
||||
try:
|
||||
with zip_file.open("xl/sharedStrings.xml") as f:
|
||||
@@ -193,11 +215,19 @@ def parse_xlsx_with_xml(file_path: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
pass
|
||||
|
||||
markdown_content = "# Excel数据转换结果 (原生XML解析)\n\n"
|
||||
markdown_content += f"来源: {file_path}\n\n"
|
||||
|
||||
for sheet_index, sheet_name in enumerate(sheet_names, start=1):
|
||||
for sheet_index, sheet_name in enumerate(sheet_names):
|
||||
rid = sheet_rids[sheet_index] if sheet_index < len(sheet_rids) else ""
|
||||
target = rid_to_target.get(rid, "")
|
||||
if target:
|
||||
if target.startswith("/"):
|
||||
worksheet_path = target.lstrip("/")
|
||||
else:
|
||||
worksheet_path = f"xl/{target}"
|
||||
else:
|
||||
worksheet_path = f"xl/worksheets/sheet{sheet_index + 1}.xml"
|
||||
|
||||
try:
|
||||
worksheet_path = f"xl/worksheets/sheet{sheet_index}.xml"
|
||||
with zip_file.open(worksheet_path) as f:
|
||||
root = ET.parse(f).getroot()
|
||||
sheet_data = root.find("main:sheetData", xlsx_namespace)
|
||||
Reference in New Issue
Block a user