Files
Agent-n8n/scripts/validate_all_images.py

175 lines
6.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
验证所有图片链接是否有效
"""
import re
from pathlib import Path
from urllib.parse import unquote
from collections import defaultdict
class ImageValidator:
    """Check that image references in Markdown files point at existing images.

    Every sub-directory of ``data_path`` is treated as one "order class".
    Its Markdown files are read from ``<order class>/notion文稿`` and its
    images from ``<order class>/notion文稿/image``.

    Attributes:
        data_path: Directory whose sub-directories are the order classes.
        broken_links: Maps an order-class name to its list of broken-link
            records (dicts with ``file``, ``ref`` and ``expected`` keys).
        valid_links: Running count of references that resolved to a file.
        total_links: Running count of all references checked.
    """

    # Markdown image syntax ![alt](target); group 2 is the raw target text.
    _MD_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    # <img ... src="..."> tags; the lookahead skips any src starting with "http".
    _HTML_IMAGE_RE = re.compile(r'<img[^>]+src=["\']((?!http)[^"\']+)["\'][^>]*>')
    # Optional quoted title that Markdown allows after the target:
    # ![alt](path "title") -> group 1 is the path alone.
    _MD_TITLE_RE = re.compile(r'''^(.*?)\s+(?:"[^"]*"|'[^']*')$''')
    # Markdown targets with these schemes are remote and cannot be checked
    # against the local image directory.
    _REMOTE_PREFIXES = ('http://', 'https://')
    # File suffixes (lower-case) that count as images in the image directory.
    _IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png', '.gif', '.webp'})

    def __init__(self):
        self.data_path = Path(__file__).parent.parent / "data" / "订单班文档资料"
        self.broken_links = defaultdict(list)
        self.valid_links = 0
        self.total_links = 0

    @classmethod
    def _clean_md_target(cls, target: str) -> str:
        """Return a Markdown link target without padding or a trailing title."""
        target = target.strip()
        titled = cls._MD_TITLE_RE.match(target)
        return titled.group(1) if titled else target

    @classmethod
    def _is_remote(cls, target: str) -> bool:
        """Return True when *target* is an http(s) URL rather than a local path."""
        return target.lower().startswith(cls._REMOTE_PREFIXES)

    def extract_image_refs(self, content: str) -> list:
        """Return the local image references found in Markdown text.

        Both ``![alt](path)`` references and ``<img src="...">`` tags are
        collected; all Markdown matches come first, then all tag matches.
        Remote ``http(s)://`` Markdown targets are left out, matching the
        ``<img>`` pattern, which already skips sources starting with "http".
        An optional Markdown title and surrounding whitespace are removed
        from the target.

        Args:
            content: Full text of a Markdown document.

        Returns:
            A list of reference strings exactly as written (still URL-encoded).
        """
        refs = []
        for match in self._MD_IMAGE_RE.finditer(content):
            target = self._clean_md_target(match.group(2))
            if not self._is_remote(target):
                refs.append(target)
        refs.extend(match.group(1) for match in self._HTML_IMAGE_RE.finditer(content))
        return refs

    def normalize_path(self, path: str) -> str:
        """Return *path* URL-decoded and, for bare file names, under ``image/``.

        A leading ``./`` is dropped. A path that already starts with
        ``image/`` or contains any other directory component is returned
        unchanged; only a bare file name gets the ``image/`` prefix.
        """
        path = unquote(path)
        if path.startswith('./'):
            path = path[2:]
        if path.startswith('image/') or '/' in path:
            return path
        return f"image/{path}"

    def validate_order_class(self, order_dir: Path) -> dict:
        """Validate the image references of a single order class.

        Also updates ``total_links``, ``valid_links`` and ``broken_links``
        on the instance.

        Args:
            order_dir: Directory of the order class to check.

        Returns:
            A dict with keys ``name``, ``total``, ``valid``, ``broken`` and
            ``broken_files`` (a list of ``file``/``ref``/``expected`` dicts).
            All counts are zero when the ``notion文稿`` directory is missing.
        """
        notion_dir = order_dir / "notion文稿"
        image_dir = notion_dir / "image"
        stats = {
            "name": order_dir.name,
            "total": 0,
            "valid": 0,
            "broken": 0,
            "broken_files": []
        }
        if not notion_dir.is_dir():
            return stats

        # Names of the image files that really exist (top level of image/ only).
        actual_images = set()
        if image_dir.is_dir():
            actual_images = {
                entry.name
                for entry in image_dir.glob("*")
                if entry.is_file() and entry.suffix.lower() in self._IMAGE_SUFFIXES
            }

        # Sorted so the console output and the report are deterministic.
        for md_file in sorted(notion_dir.glob("*.md")):
            # The image index file is skipped, not validated.
            if md_file.name == "图片索引.md":
                continue
            # Only the read can fail here; keeping the try this narrow stops a
            # bug in link handling from being reported as an unreadable file.
            try:
                content = md_file.read_text(encoding='utf-8')
            except (OSError, UnicodeError) as e:
                print(f" ⚠️ 无法读取 {md_file.name}: {e}")
                continue

            for ref in self.extract_image_refs(content):
                stats["total"] += 1
                self.total_links += 1

                normalized = self.normalize_path(ref)
                # Paths under image/ are matched by file name only; any other
                # path is compared as-is and therefore never matches.
                if normalized.startswith('image/'):
                    img_name = normalized.split('/')[-1]
                else:
                    img_name = normalized

                if img_name in actual_images:
                    stats["valid"] += 1
                    self.valid_links += 1
                    continue

                record = {
                    "file": md_file.name,
                    "ref": ref,
                    "expected": img_name
                }
                stats["broken"] += 1
                stats["broken_files"].append(record)
                # Separate copy so the two collections do not alias each other.
                self.broken_links[order_dir.name].append(dict(record))
        return stats

    def _print_class_result(self, stats: dict) -> None:
        """Print the one-class result line(s), showing at most three broken links."""
        if stats["total"] == 0:
            print(f" {stats['name']}: 无图片引用")
            return
        if stats["broken"] == 0:
            print(f"{stats['name']}: 所有 {stats['total']} 个链接有效")
            return
        print(f"{stats['name']}: {stats['broken']}/{stats['total']} 个链接损坏")
        for broken in stats["broken_files"][:3]:
            print(f" - {broken['file']}: {broken['ref']}")
        if len(stats["broken_files"]) > 3:
            print(f" ... 还有 {len(stats['broken_files']) - 3}")

    def _print_summary(self, class_count: int) -> None:
        """Print the overall totals and the order classes that need fixing."""
        broken_total = self.total_links - self.valid_links
        print("\n" + "="*60)
        print("📊 总结")
        print("="*60)
        print(f" 检查订单班: {class_count}")
        print(f" 总链接数: {self.total_links}")
        print(f" 有效链接: {self.valid_links}")
        print(f" 损坏链接: {broken_total}")
        if broken_total == 0:
            print("\n🎉 所有图片链接都有效!")
            return
        # broken_total > 0 implies total_links > 0, so the division is safe.
        broken_percentage = (broken_total / self.total_links) * 100
        print(f"\n⚠️ {broken_percentage:.1f}% 的链接需要修复")
        print("\n需要修复的订单班:")
        for order_name, broken_refs in self.broken_links.items():
            print(f"{order_name}: {len(broken_refs)} 个损坏链接")

    def _write_report(self, report_file: Path, all_stats: list) -> None:
        """Write the per-class statistics and broken-link details to *report_file*."""
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write("图片链接验证报告\n")
            f.write("="*60 + "\n\n")
            for stats in all_stats:
                f.write(f"{stats['name']}:\n")
                f.write(f" 总链接: {stats['total']}\n")
                f.write(f" 有效: {stats['valid']}\n")
                f.write(f" 损坏: {stats['broken']}\n")
                if stats['broken_files']:
                    f.write(" 损坏详情:\n")
                    for broken in stats['broken_files']:
                        f.write(f" - {broken['file']}: {broken['ref']} -> {broken['expected']}\n")
                f.write("\n")

    def run(self):
        """Validate every order class, print the results and save a report.

        The report is written to ``image_validation_report.txt`` two levels
        above this file. If ``data_path`` is not a directory, a message is
        printed and nothing else happens.
        """
        print("\n" + "="*60)
        print("🔍 图片链接验证工具")
        print("="*60)

        # Without this guard iterdir() raises FileNotFoundError on a fresh checkout.
        if not self.data_path.is_dir():
            print(f"❌ 数据目录不存在: {self.data_path}")
            return

        order_classes = sorted(d for d in self.data_path.iterdir() if d.is_dir())
        all_stats = []
        for order_dir in order_classes:
            stats = self.validate_order_class(order_dir)
            all_stats.append(stats)
            self._print_class_result(stats)

        self._print_summary(len(order_classes))

        report_file = Path(__file__).parent.parent / "image_validation_report.txt"
        self._write_report(report_file, all_stats)
        print(f"\n📄 详细报告已保存到: {report_file}")
if __name__ == "__main__":
    # Script entry point: build a validator and run the full check.
    ImageValidator().run()