370 lines
15 KiB
Python
370 lines
15 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
高级图片链接修复工具 - 使用多种策略恢复原始文件名映射
|
|||
|
|
"""
|
|||
|
|
import re
|
|||
|
|
import json
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, List, Optional, Tuple
|
|||
|
|
from difflib import SequenceMatcher
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from urllib.parse import unquote
|
|||
|
|
import shutil
|
|||
|
|
|
|||
|
|
class AdvancedImageRestoration:
    """Repair image links in Markdown notes whose image files were renamed.

    For every "order class" directory the tool looks at ``notion文稿/``:
    ``*.bak`` files supply the original image references, ``image/`` holds
    the files that actually exist, and ``*.md`` files are rewritten so that
    references to missing names point at the best matching existing file.

    Matching runs in three passes: same category and same number, same
    category by sorted position, and finally fuzzy name similarity.
    """

    # Extensions accepted as images when scanning the ``image`` directory.
    IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
    # Prefixes identifying references that must never be rewritten.
    REMOTE_PREFIXES = ('http://', 'https://')
    # Minimum SequenceMatcher ratio accepted by the fuzzy pass.
    FUZZY_THRESHOLD = 0.3

    # Markdown image: ![alt](path)
    _MD_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    # HTML image tag with a src that does not start with "http".
    _HTML_IMAGE_RE = re.compile(
        r'<img[^>]+src=["\']((?!http)[^"\']+)["\'][^>]*>'
    )

    _CHINESE_DIGITS = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9,
    }

    # Ordered (category, keywords) pairs; the first keyword hit wins.
    _CATEGORIES = (
        ('展示图', ('展示', 'display', 'show')),
        ('设计图', ('设计', 'design')),
        ('场景图', ('场景', 'scene')),
        ('分镜设计', ('分镜',)),
        ('章节图', ('一、', '二、', '三、', '四、', '五、',
                    '六、', '七、', '八、', '九、', '十、',
                    '项目概述', '市场分析', '品牌定位', '菜品服务', '选址与装修',
                    '人员管理', '营销与推广', '财务管理', '风险管理')),
        ('流程图', ('流程', 'flow', 'process')),
        ('架构图', ('架构', 'structure', 'architecture')),
        ('原型图', ('原型', 'prototype', 'mockup')),
        ('截图', ('截图', 'screenshot', 'capture')),
    )
    _DEFAULT_CATEGORY = '图片'

    def __init__(self, data_path: Optional[Path] = None):
        """Create the tool.

        Args:
            data_path: Directory containing one sub-directory per order
                class. Defaults to ``<script dir>/../data/订单班文档资料``.
        """
        if data_path is None:
            data_path = Path(__file__).parent.parent / "data" / "订单班文档资料"
        self.data_path = Path(data_path)
        self.total_fixed = 0
        self.total_unmapped = 0

    # ------------------------------------------------------------------
    # Parsing helpers
    # ------------------------------------------------------------------

    def extract_image_refs(self, content: str) -> List[Tuple[str, str, int]]:
        """Return every image reference found in ``content``.

        Each item is ``(full_reference_text, path, start_offset)``. Markdown
        references (``![alt](path)``) come first, followed by ``<img>`` tags
        whose ``src`` does not start with ``http``. Markdown references are
        returned even when they point at remote URLs; callers filter those.
        """
        refs = [
            (m.group(0), m.group(2), m.start())
            for m in self._MD_IMAGE_RE.finditer(content)
        ]
        refs.extend(
            (m.group(0), m.group(1), m.start())
            for m in self._HTML_IMAGE_RE.finditer(content)
        )
        return refs

    def normalize_path(self, path: str) -> str:
        """URL-decode ``path`` and reduce it to its final path component."""
        return unquote(path).rsplit('/', 1)[-1]

    def _is_remote(self, path: str) -> bool:
        """Tell whether ``path`` is an http(s) URL rather than a local file."""
        return path.strip().lower().startswith(self.REMOTE_PREFIXES)

    def _parse_chinese_number(self, text: str) -> Optional[int]:
        """Convert a Chinese numeral between 1 and 99 to an int.

        Returns None for anything that is not of the form ``X``, ``十``,
        ``十Y``, ``X十`` or ``X十Y``.
        """
        digits = self._CHINESE_DIGITS
        if '十' not in text:
            # Only a single digit character is meaningful without a ten mark.
            return digits.get(text)
        tens, _, ones = text.partition('十')
        tens_value = digits.get(tens) if tens else 1
        ones_value = digits.get(ones) if ones else 0
        if tens_value is None or ones_value is None:
            return None
        return tens_value * 10 + ones_value

    def extract_number(self, filename: str) -> Optional[int]:
        """Return the first number embedded in ``filename``, if any.

        Arabic digits take priority; otherwise the first run of Chinese
        numerals is interpreted. Returns None when no number can be read.
        """
        arabic = re.search(r'\d+', filename)
        if arabic:
            return int(arabic.group(0))
        chinese = re.search(r'[一二三四五六七八九十]+', filename)
        if chinese:
            return self._parse_chinese_number(chinese.group(0))
        return None

    def categorize_filename(self, filename: str) -> Tuple[str, Optional[int]]:
        """Classify ``filename`` by keyword and extract its sequence number.

        Returns ``(category, number)``; the category falls back to ``图片``
        when no keyword matches the lower-cased name without its extension.
        """
        stem = filename.rsplit('.', 1)[0].lower()
        number = self.extract_number(filename)
        for category, keywords in self._CATEGORIES:
            if any(keyword in stem for keyword in keywords):
                return category, number
        return self._DEFAULT_CATEGORY, number

    # ------------------------------------------------------------------
    # Mapping construction
    # ------------------------------------------------------------------

    @staticmethod
    def _read_text(path: Path) -> Optional[str]:
        """Read ``path`` as UTF-8; print a warning and return None on failure."""
        try:
            return path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as exc:
            print(f"⚠️ 无法读取 {path.name}: {exc}")
            return None

    def _collect_local_refs(self, files) -> set:
        """Collect normalized names of all local images referenced by ``files``."""
        names = set()
        for file_path in files:
            content = self._read_text(file_path)
            if content is None:
                continue
            for _full_ref, path, _pos in self.extract_image_refs(content):
                # Test the raw path: after normalization only the basename
                # is left and a URL can no longer be recognised.
                if self._is_remote(path):
                    continue
                normalized = self.normalize_path(path)
                if normalized:
                    names.add(normalized)
        return names

    @staticmethod
    def _order_key(item: Tuple[str, Optional[int]]) -> Tuple[bool, int]:
        """Sort numbered entries by number and put unnumbered ones last."""
        number = item[1]
        return (number is None, 0 if number is None else number)

    def _group_by_category(self, names) -> Dict[str, List[Tuple[str, Optional[int]]]]:
        """Group ``names`` by category, each group sorted by sequence number."""
        groups = defaultdict(list)
        for name in names:
            category, number = self.categorize_filename(name)
            groups[category].append((name, number))
        for entries in groups.values():
            entries.sort(key=self._order_key)
        return groups

    def build_intelligent_mapping(self, order_dir: Path) -> Dict[str, str]:
        """Map original image names to the files that exist now.

        Args:
            order_dir: Order-class directory containing ``notion文稿``.

        Returns:
            ``{original_name: existing_file_name}``. Names that still exist
            map to themselves. Empty when there is no ``image`` directory.
        """
        notion_dir = order_dir / "notion文稿"
        image_dir = notion_dir / "image"
        if not image_dir.exists():
            return {}

        print(f"\n📊 分析 {order_dir.name} 的图片文件...")

        # 1. Image files that really exist.
        actual_images = {
            entry.name: entry
            for entry in image_dir.glob("*")
            if entry.is_file() and entry.suffix.lower() in self.IMAGE_SUFFIXES
        }

        # 2. Names referenced by the backed-up (original) documents.
        original_refs = self._collect_local_refs(sorted(notion_dir.glob("*.bak")))

        # 3. References whose target file is missing. Sorted so that the
        #    positional and fuzzy passes give the same result on every run.
        refs_to_map = sorted(original_refs - set(actual_images))

        print(f" 📁 实际图片: {len(actual_images)}")
        print(f" 📝 原始引用: {len(original_refs)}")
        print(f" 🔍 需要映射: {len(refs_to_map)}")

        # Files still referenced under their own name are already accounted
        # for and must not be handed to a different broken reference.
        candidates = sorted(
            name for name in actual_images if name not in original_refs
        )

        originals_by_cat = self._group_by_category(refs_to_map)
        actuals_by_cat = self._group_by_category(candidates)

        mappings: Dict[str, str] = {}
        used_actuals = set()

        # Pass 1: same category and same sequence number.
        for category, originals in originals_by_cat.items():
            for orig_name, orig_num in originals:
                if orig_num is None:
                    continue
                for actual_name, actual_num in actuals_by_cat.get(category, ()):
                    if actual_num == orig_num and actual_name not in used_actuals:
                        mappings[orig_name] = actual_name
                        used_actuals.add(actual_name)
                        break

        # Pass 2: same category, paired by sorted position.
        for category, originals in originals_by_cat.items():
            pending = [name for name, _ in originals if name not in mappings]
            available = [
                name for name, _ in actuals_by_cat.get(category, ())
                if name not in used_actuals
            ]
            for orig_name, actual_name in zip(pending, available):
                mappings[orig_name] = actual_name
                used_actuals.add(actual_name)

        # Pass 3: fuzzy name similarity across categories.
        unmapped = [ref for ref in refs_to_map if ref not in mappings]
        unused = [name for name in candidates if name not in used_actuals]
        if unmapped and unused:
            print("\n 🔄 执行模糊匹配...")
            for orig in unmapped:
                if not unused:
                    break
                scored = [
                    (SequenceMatcher(None, orig.lower(), name.lower()).ratio(), name)
                    for name in unused
                ]
                # max() keeps the first of equally scored candidates.
                best_score, best_match = max(scored, key=lambda pair: pair[0])
                if best_score > self.FUZZY_THRESHOLD:
                    mappings[orig] = best_match
                    used_actuals.add(best_match)
                    unused.remove(best_match)

        # 4. Unchanged names map to themselves.
        for img_name in actual_images:
            if img_name in original_refs:
                mappings[img_name] = img_name

        print(f"\n✅ 建立了 {len(mappings)} 个映射关系")

        still_unmapped = [ref for ref in refs_to_map if ref not in mappings]
        self.total_unmapped += len(still_unmapped)
        if still_unmapped:
            print(f"⚠️ 仍有 {len(still_unmapped)} 个引用未映射:")
            for ref in still_unmapped[:5]:
                print(f" - {ref}")
            if len(still_unmapped) > 5:
                print(f" ... 还有 {len(still_unmapped) - 5} 个")

        return mappings

    # ------------------------------------------------------------------
    # Applying the mapping
    # ------------------------------------------------------------------

    def apply_mappings(self, order_dir: Path, mappings: Dict[str, str]) -> int:
        """Rewrite image references in the Markdown files of ``order_dir``.

        Args:
            order_dir: Order-class directory containing ``notion文稿``.
            mappings: ``{referenced_name: existing_file_name}``.

        Returns:
            Number of references that were rewritten.

        A ``.bak`` copy is created next to each modified file unless one
        already exists. ``图片索引.md`` and remote (http/https) references
        are left untouched.
        """
        notion_dir = order_dir / "notion文稿"
        fixed_count = 0

        for md_file in sorted(notion_dir.glob("*.md")):
            if md_file.name == "图片索引.md":
                continue

            original = self._read_text(md_file)
            if original is None:
                continue
            content = original

            refs = self.extract_image_refs(content)
            # Work from the end of the file so earlier offsets stay valid.
            for full_ref, path, pos in sorted(refs, key=lambda r: r[2], reverse=True):
                if self._is_remote(path):
                    continue
                normalized = self.normalize_path(path)
                new_name = mappings.get(normalized)
                if new_name is None or new_name == normalized:
                    continue

                new_path = f"image/{new_name}"
                md_match = self._MD_IMAGE_RE.fullmatch(full_ref)
                if md_match:
                    # Markdown form: keep the alt text, swap the target.
                    new_ref = f"![{md_match.group(1)}]({new_path})"
                else:
                    # HTML form: swap the src value inside the tag.
                    new_ref = full_ref.replace(path, new_path)

                content = content[:pos] + new_ref + content[pos + len(full_ref):]
                fixed_count += 1

            if content != original:
                backup = md_file.with_suffix('.bak')
                if not backup.exists():
                    shutil.copy(md_file, backup)
                md_file.write_text(content, encoding='utf-8')
                print(f" ✅ 修复了 {md_file.name}")

        return fixed_count

    def process_order_class(self, order_name: str):
        """Build, save and apply the image mapping for one order class.

        The mapping is written to ``notion文稿/image_mapping.json`` and the
        number of rewritten references is added to ``self.total_fixed``.
        """
        order_dir = self.data_path / order_name
        if not order_dir.exists():
            print(f"❌ 订单班 '{order_name}' 不存在")
            return

        print(f"\n{'='*50}")
        print(f"🔧 处理订单班: {order_name}")
        print(f"{'='*50}")

        mappings = self.build_intelligent_mapping(order_dir)
        if not mappings:
            print("ℹ️ 没有需要修复的图片引用")
            return

        mapping_file = order_dir / "notion文稿" / "image_mapping.json"
        with open(mapping_file, 'w', encoding='utf-8') as f:
            json.dump(mappings, f, ensure_ascii=False, indent=2)
        print(f"💾 保存映射到 {mapping_file.name}")

        fixed = self.apply_mappings(order_dir, mappings)
        self.total_fixed += fixed
        print(f"✅ 修复了 {fixed} 个图片引用")

    def run(self):
        """Interactive entry point: list order classes and process the choice."""
        print("\n🎯 高级图片链接修复工具")
        print("=" * 50)

        if not self.data_path.is_dir():
            print(f"❌ 数据目录不存在: {self.data_path}")
            return

        # Sorted so the numbers shown to the user are stable between runs.
        order_classes = sorted(
            d.name for d in self.data_path.iterdir() if d.is_dir()
        )

        print(f"\n找到 {len(order_classes)} 个订单班:")
        for i, name in enumerate(order_classes, 1):
            print(f" {i}. {name}")

        print("\n请选择处理方式:")
        print("1. 处理所有订单班")
        print("2. 选择特定订单班")
        print("0. 退出")

        choice = input("\n请输入选项 [0-2]: ").strip()

        if choice == '0':
            return
        elif choice == '1':
            for order in order_classes:
                self.process_order_class(order)
        elif choice == '2':
            print("\n请输入要处理的订单班编号(多个用逗号分隔):")
            indices = input("编号: ").strip().split(',')

            for idx_str in indices:
                # Only the conversion is guarded, so failures while
                # processing are not misreported as a bad number.
                try:
                    idx = int(idx_str.strip()) - 1
                except ValueError:
                    print(f"⚠️ 无效的编号: {idx_str}")
                    continue
                if 0 <= idx < len(order_classes):
                    self.process_order_class(order_classes[idx])
                else:
                    print(f"⚠️ 编号 {idx+1} 超出范围")
        else:
            print(f"⚠️ 无效的选项: {choice}")

        print(f"\n{'='*50}")
        print("📊 处理完成!")
        print(f" ✅ 总共修复: {self.total_fixed} 个图片引用")
        print(f"{'='*50}")
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: start the interactive repair session.
    AdvancedImageRestoration().run()
|