#!/usr/bin/env python3
"""
智能修复图片链接脚本
自动匹配已重命名的图片文件,修复Markdown中的引用
"""

import json
import os
import re
import sys
from difflib import SequenceMatcher
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import unquote


class ImageLinkFixer:
    """Find and repair broken image references in exported Markdown files.

    The fixer works on "order class" directories located under
    ``<base_path>/data/订单班文档资料``.  For each of them it looks at the
    Markdown files in the ``notion文稿`` sub-folder and compares the images
    they reference with the files actually present in ``notion文稿/image``.
    """

    # Sub-folder of an order-class directory that holds the Markdown export.
    NOTION_DIR_NAME = "notion文稿"
    # Index file that may list "old name -> new name" pairs; never rewritten.
    INDEX_FILE_NAME = "图片索引.md"
    # JSON file in which rename mappings are persisted.
    MAPPING_FILE_NAME = "image_mapping.json"
    # Lower-cased suffixes of the files counted as images inside ``image/``.
    IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
    # Markdown image syntax: group 1 is the alt text, group 2 the target path.
    IMAGE_REF_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\)')
    # Every rewritten reference points into the image folder.
    PATH_PREFIX = "image/"

    def __init__(self, base_path: Path):
        """Store the project root and derive the order-class data folder."""
        self.base_path = base_path
        self.data_path = base_path / "data/订单班文档资料"
        self.mapping_cache = {}  # kept for callers; not read by this class

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _list_images(self, notion_dir: Path) -> Dict[str, Path]:
        """Map file name -> path for every image file in ``notion_dir/image``."""
        image_dir = notion_dir / "image"
        if not image_dir.exists():
            return {}
        return {
            img.name: img
            for img in image_dir.iterdir()
            if img.suffix.lower() in self.IMAGE_SUFFIXES
        }

    def _markdown_files(self, notion_dir: Path) -> List[Path]:
        """Return the Markdown files to process, sorted, without the index file."""
        return [
            md_file
            for md_file in sorted(notion_dir.glob("*.md"))
            if md_file.name != self.INDEX_FILE_NAME
        ]

    def _order_dirs(self) -> List[Path]:
        """Return all non-hidden sub-directories of the data folder, sorted."""
        return sorted(
            entry
            for entry in self.data_path.iterdir()
            if entry.is_dir() and not entry.name.startswith('.')
        )

    @staticmethod
    def _is_external(img_path: str) -> bool:
        """Tell whether a reference is a web URL rather than a local file."""
        # Require the full scheme so a local file such as "http_flow.png"
        # is not mistaken for an external link.
        return img_path.lower().startswith(('http://', 'https://'))

    @staticmethod
    def _referenced_name(img_path: str) -> str:
        """Extract the URL-decoded image file name from a reference path."""
        if img_path.startswith('image/'):
            name = img_path[6:]
        elif img_path.startswith('./image/'):
            name = img_path[8:]
        else:
            name = Path(img_path).name
        return unquote(name)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def find_best_match(self, target_name: str, available_files: List[str],
                        min_ratio: float = 0.3) -> Optional[str]:
        """Pick the file that most likely corresponds to ``target_name``.

        Three strategies are tried in order: an exact name match, a match on
        the name without its extension, and finally the candidate with the
        highest ``difflib.SequenceMatcher`` ratio (case-insensitive).

        Args:
            target_name: File name taken from the Markdown reference.
            available_files: Candidate file names.
            min_ratio: Similarity a fuzzy candidate must exceed to be accepted.

        Returns:
            The chosen file name, or ``None`` when nothing is similar enough.
            On equal ratios the earliest candidate in ``available_files`` wins.
        """
        if target_name in available_files:
            return target_name

        target_base = Path(target_name).stem
        for candidate in available_files:
            if Path(candidate).stem == target_base:
                return candidate

        # The file may have been renamed: fall back to string similarity.
        best_match = None
        best_ratio = max(min_ratio, 0.0)
        wanted = target_name.lower()
        for candidate in available_files:
            ratio = SequenceMatcher(None, wanted, candidate.lower()).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = candidate

        return best_match

    def load_image_mapping(self, order_dir: Path) -> Dict[str, str]:
        """Load the old-name -> new-name mapping of an order class.

        ``notion文稿/image_mapping.json`` is returned as parsed when it exists.
        Otherwise the mapping is built from ``notion文稿/图片索引.md``: every
        line containing exactly one ``->`` contributes one entry.  An empty
        dict is returned when neither file exists.
        """
        notion_dir = order_dir / self.NOTION_DIR_NAME
        mapping_file = notion_dir / self.MAPPING_FILE_NAME

        if mapping_file.exists():
            with open(mapping_file, 'r', encoding='utf-8') as f:
                return json.load(f)

        mapping: Dict[str, str] = {}
        index_file = notion_dir / self.INDEX_FILE_NAME
        if not index_file.exists():
            return mapping

        for line in index_file.read_text(encoding='utf-8').split('\n'):
            parts = line.split('->')
            if len(parts) == 2:
                mapping[parts[0].strip()] = parts[1].strip()

        return mapping

    def analyze_order_class(self, order_dir: Path) -> Dict:
        """Report the state of every image reference of one order class.

        Nothing is modified.  The returned dict contains:

        * ``name``: directory name of the order class
        * ``total_refs``: number of image references found (external included)
        * ``fixed_refs``: references whose mapped file exists
        * ``unmapped_refs``: references without a mapping for which
          ``find_best_match`` suggests a file
        * ``broken_refs``: references that cannot be resolved
        """
        result = {
            "name": order_dir.name,
            "total_refs": 0,
            "broken_refs": [],
            "fixed_refs": [],
            "unmapped_refs": [],
        }

        notion_dir = order_dir / self.NOTION_DIR_NAME
        if not notion_dir.exists():
            return result

        actual_images = self._list_images(notion_dir)
        # Sorted so that fuzzy-match ties resolve the same way on every run
        # and agree with fix_markdown_references.
        candidates = sorted(actual_images)
        mapping = self.load_image_mapping(order_dir)

        for md_file in self._markdown_files(notion_dir):
            content = md_file.read_text(encoding='utf-8')

            for match in self.IMAGE_REF_PATTERN.finditer(content):
                result["total_refs"] += 1
                img_path = match.group(2)

                if self._is_external(img_path):
                    continue

                img_name = self._referenced_name(img_path)
                if img_name in actual_images:
                    continue  # reference is fine

                broken_entry = {
                    "file": md_file.name,
                    "reference": img_name,
                    "line": match.group(0),
                }

                if img_name in mapping:
                    mapped_name = mapping[img_name]
                    if mapped_name in actual_images:
                        result["fixed_refs"].append({
                            "file": md_file.name,
                            "original": img_name,
                            "mapped": mapped_name,
                            "line": match.group(0),
                        })
                    else:
                        result["broken_refs"].append(broken_entry)
                    continue

                best_match = self.find_best_match(img_name, candidates)
                if best_match:
                    result["unmapped_refs"].append({
                        "file": md_file.name,
                        "original": img_name,
                        "suggested": best_match,
                        "line": match.group(0),
                    })
                else:
                    result["broken_refs"].append(broken_entry)

        return result

    def fix_markdown_references(self, order_dir: Path, auto_fix: bool = False) -> int:
        """Rewrite the image references of one order class in place.

        A reference is rewritten to ``image/<file name>`` when

        * its file exists but the path starts with ``./``,
        * the mapping names an existing replacement file, or
        * ``auto_fix`` is true and ``find_best_match`` finds a candidate
          (the new pair is then added to the mapping).

        Before a Markdown file is changed for the first time it is renamed
        to ``<name>.md.bak``; an existing backup is never overwritten.  With
        ``auto_fix`` a non-empty mapping is written to ``image_mapping.json``.

        Returns:
            The number of references that were rewritten.
        """
        fixed_count = 0
        notion_dir = order_dir / self.NOTION_DIR_NAME

        if not notion_dir.exists():
            return 0

        actual_images = self._list_images(notion_dir)
        candidates = sorted(actual_images)
        mapping = self.load_image_mapping(order_dir)
        prefix = self.PATH_PREFIX

        def replace_image_ref(match):
            """Return the replacement text for one ``![alt](path)`` match."""
            nonlocal fixed_count
            alt_text = match.group(1)
            img_path = match.group(2)

            if self._is_external(img_path):
                return match.group(0)

            img_name = self._referenced_name(img_path)

            # The file exists: only normalise away a leading "./".
            if img_name in actual_images:
                if img_path.startswith('./'):
                    fixed_count += 1
                    return f"![{alt_text}]({prefix}{img_name})"
                return match.group(0)

            # Known rename recorded in the mapping.
            if img_name in mapping:
                mapped_name = mapping[img_name]
                if mapped_name in actual_images:
                    fixed_count += 1
                    print(f" 映射修复: {img_name} → {mapped_name}")
                    return f"![{alt_text}]({prefix}{mapped_name})"

            # Fuzzy match, only when the caller opted in.
            if auto_fix:
                best_match = self.find_best_match(img_name, candidates)
                if best_match:
                    fixed_count += 1
                    print(f" 智能修复: {img_name} → {best_match}")
                    # Remember the decision so later runs reuse it.
                    mapping[img_name] = best_match
                    return f"![{alt_text}]({prefix}{best_match})"

            # Cannot be repaired: keep the reference and report it.
            print(f" ⚠️ 无法修复: {img_name}")
            return match.group(0)

        for md_file in self._markdown_files(notion_dir):
            original_content = md_file.read_text(encoding='utf-8')
            content = self.IMAGE_REF_PATTERN.sub(replace_image_ref, original_content)

            if content == original_content:
                continue

            # Keep the very first version of the file as the backup.
            backup_file = md_file.with_suffix('.md.bak')
            if not backup_file.exists():
                md_file.rename(backup_file)
            md_file.write_text(content, encoding='utf-8')

            print(f" ✅ 修复了 {md_file.name}")

        if mapping and auto_fix:
            mapping_file = notion_dir / self.MAPPING_FILE_NAME
            with open(mapping_file, 'w', encoding='utf-8') as f:
                json.dump(mapping, f, ensure_ascii=False, indent=2)
            print(f" 💾 保存映射关系到 image_mapping.json")

        return fixed_count

    def validate_all_orders(self) -> Dict:
        """Analyze every order class and print a summary for each.

        Returns:
            A dict mapping the order-class directory name to the result of
            ``analyze_order_class``.

        Raises:
            OSError: if the data folder cannot be listed (for example when it
                does not exist).
        """
        all_results = {}

        print("=" * 60)
        print("智能图片链接验证和修复系统")
        print("=" * 60)
        print()

        for order_dir in self._order_dirs():
            print(f"\n分析 {order_dir.name}...")
            result = self.analyze_order_class(order_dir)
            all_results[order_dir.name] = result

            broken = result["broken_refs"]
            unmapped = result["unmapped_refs"]
            fixed = result["fixed_refs"]

            # Only the first three entries of each list are shown.
            if broken:
                print(f" ❌ 损坏的引用: {len(broken)} 个")
                for ref in broken[:3]:
                    print(f" - {ref['file']}: {ref['reference']}")

            if unmapped:
                print(f" 🔍 可智能修复: {len(unmapped)} 个")
                for ref in unmapped[:3]:
                    print(f" - {ref['original']} → {ref['suggested']}")

            if fixed:
                print(f" ✅ 已映射修复: {len(fixed)} 个")

            if not broken and not unmapped:
                print(f" ✅ 所有链接正常")

        return all_results

    def fix_all_orders(self, order_classes: Optional[List[str]] = None,
                       auto_fix: bool = True):
        """Repair all order classes, or only the ones named.

        Args:
            order_classes: Directory names below the data folder.  ``None``
                or an empty list selects every non-hidden directory.  Names
                that do not exist are reported and skipped.
            auto_fix: Passed through to ``fix_markdown_references``.
        """
        if order_classes:
            dirs_to_process = []
            for name in order_classes:
                order_dir = self.data_path / name
                if order_dir.exists():
                    dirs_to_process.append(order_dir)
                else:
                    print(f"⚠️ 未找到订单班目录: {name}")
        else:
            dirs_to_process = self._order_dirs()

        print("=" * 60)
        print("开始智能修复图片链接")
        print("=" * 60)

        total_fixed = 0

        for order_dir in dirs_to_process:
            print(f"\n处理 {order_dir.name}...")
            fixed = self.fix_markdown_references(order_dir, auto_fix)
            total_fixed += fixed

            if fixed > 0:
                print(f" 📊 修复了 {fixed} 个引用")
            else:
                print(f" ✅ 无需修复")

        print("\n" + "=" * 60)
        print(f"✅ 完成!总计修复 {total_fixed} 个引用")
        print("=" * 60)


def main():
    """Command-line entry point.

    Supported invocations (first argument selects the mode):

    * ``validate``: report the link status without changing any file
    * ``fix``: repair every order class
    * ``fix a,b``: repair only the comma-separated order classes
    * no argument: validate, then ask for confirmation before repairing

    Any other first argument prints a usage message and exits with status 2.
    """
    base_path = Path("/Users/xiaoqi/Documents/Dev/Project/2025-09-08_n8nDEMO演示")
    fixer = ImageLinkFixer(base_path)

    args = sys.argv[1:]

    if not args:
        # Default mode: validate first, then ask whether to repair.
        results = fixer.validate_all_orders()

        total_broken = sum(len(r["broken_refs"]) for r in results.values())
        total_fixable = sum(len(r["unmapped_refs"]) for r in results.values())

        if total_broken > 0 or total_fixable > 0:
            print(f"\n发现 {total_broken} 个损坏的引用,{total_fixable} 个可智能修复")
            response = input("\n是否执行智能修复?(y/n): ")
            if response.strip().lower() == 'y':
                fixer.fix_all_orders(auto_fix=True)
        else:
            print("\n✅ 所有图片链接正常,无需修复")
        return

    command = args[0]
    if command == "validate":
        # Report only, never modify files.
        fixer.validate_all_orders()
    elif command == "fix":
        if len(args) > 1:
            # Ignore stray spaces and empty entries such as "a,,b".
            order_classes = [name.strip() for name in args[1].split(',') if name.strip()]
            fixer.fix_all_orders(order_classes, auto_fix=True)
        else:
            fixer.fix_all_orders(auto_fix=True)
    else:
        # An unknown command used to be ignored silently.
        print(f"未知命令: {command}")
        print("用法: validate | fix [订单班1,订单班2,...]")
        sys.exit(2)


if __name__ == "__main__":
    main()