# teach_sys_Demo/deep_clean_modified.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

def deep_clean_markdown(content):
    """Strip strikethrough passages and bold markers from markdown text.

    Strikethrough spans (``~~text~~`` and the full-width ``～～text～～``) are
    removed together with their content. Bold markers (``**text**`` and
    ``__text__``) are removed but the wrapped text is kept. Punctuation,
    numbered list items and whitespace left dangling by those removals are
    then tidied up.

    Args:
        content: Markdown text to clean. Falsy values (``None``, ``""``)
            are returned unchanged.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or
        ``content`` itself when it is falsy.
    """
    if not content:
        return content

    # 1. Drop struck-through passages entirely (ASCII and full-width tildes).
    content = re.sub(r'~~[^~]*~~', '', content)
    content = re.sub(r'～～[^～]*～～', '', content)

    # 2. Unwrap bold markers but keep the emphasised text.
    content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content)
    content = re.sub(r'__([^_]+)__', r'\1', content)

    # 3. Repair punctuation that became adjacent once text between it vanished:
    #    doubled commas / enumeration commas, and a comma directly before a
    #    full stop.
    content = re.sub(r'，\s*，', '，', content)
    content = re.sub(r'、\s*、', '、', content)
    content = re.sub(r'，\s*。', '。', content)
    content = re.sub(r'、\s*。', '。', content)

    # A line must not start with a comma or enumeration comma.
    content = re.sub(r'^[，、]\s*', '', content, flags=re.MULTILINE)

    # 4. Remove numbered list items whose text was deleted. The line
    #    terminator is consumed as well so no stray blank line is left in the
    #    middle of a list; `[^\S\n]` is "any whitespace except newline" so the
    #    match never runs into the following line.
    content = re.sub(
        r'^\d+\.[^\S\n]*(?:\n|$)', '', content, flags=re.MULTILINE
    )

    # 5. Normalise whitespace last, so gaps opened by every step above are
    #    collapsed: runs of spaces become one space, and at most one blank
    #    line is kept between paragraphs.
    content = re.sub(r' {2,}', ' ', content)
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()

# Read the mock data file that holds the resume contents.
with open('src/mocks/resumeInterviewMock.js', 'r', encoding='utf-8') as f:
    content = f.read()

# Job positions whose "modified" resume text should be cleaned.
positions_to_clean = [
    "会展策划师",
    "会展讲解员",
    "活动执行",
    "活动策划师",
    "漫展策划师",
    "会展执行助理",
    "旅游规划师",
    "旅游计调专员",
    "景区运营专员",
    "文旅运营总监助理",
]

print("开始深度清理修改版简历内容...")
total_cleaned = 0

for position in positions_to_clean:
    # Locate the `modified: `...`` template literal that follows this
    # position's title inside the same object (no `}` in between).
    # The title is escaped so it is always matched literally.
    pattern = re.compile(
        rf'title:\s*["\']({re.escape(position)})["\'][^}}]*?modified:\s*`([^`]+)`',
        re.DOTALL,
    )

    # Rebuild the file text piecewise, swapping in the cleaned text at the
    # exact span of each matched template body. Splicing by span (instead of
    # searching for a re-created "modified: `...`" string) works regardless of
    # the whitespace after `modified:` and never touches unmatched entries.
    pieces = []
    cursor = 0
    for match in pattern.finditer(content):
        original_modified = match.group(2)

        # Count strikethrough spans (ASCII and full-width tildes).
        strikethrough_count = len(re.findall(r'~~[^~]*~~', original_modified))
        strikethrough_count += len(re.findall(r'～～[^～]*～～', original_modified))

        # Count bold spans.
        bold_count = len(re.findall(r'\*\*[^*]+\*\*', original_modified))
        bold_count += len(re.findall(r'__[^_]+__', original_modified))

        if strikethrough_count > 0 or bold_count > 0:
            cleaned_modified = deep_clean_markdown(original_modified)

            body_start, body_end = match.span(2)
            pieces.append(content[cursor:body_start])
            pieces.append(cleaned_modified)
            cursor = body_end

            print(f"\n✓ {position}")
            print(f"  - 删除了 {strikethrough_count} 处删除线")
            print(f"  - 清理了 {bold_count} 处加粗符号")
            total_cleaned += 1

    pieces.append(content[cursor:])
    content = ''.join(pieces)

# Write the cleaned text back to the same file.
with open('src/mocks/resumeInterviewMock.js', 'w', encoding='utf-8') as f:
    f.write(content)

print(f"\n✅ 深度清理完成！共处理了 {total_cleaned} 个岗位的修改版内容")

# Verify whether any strikethrough markup is left anywhere in the file
# (including entries outside the positions processed above).
remaining_strikethrough = len(re.findall(r'~~[^~]*~~', content))
remaining_strikethrough += len(re.findall(r'～～[^～]*～～', content))

if remaining_strikethrough > 0:
    print(f"\n⚠️ 警告：文件中仍有 {remaining_strikethrough} 处删除线符号")
    # Show each leftover with up to 50 characters of context on either side.
    for match in re.finditer(r'(~~[^~]*~~|～～[^～]*～～)', content):
        start = max(0, match.start() - 50)
        end = min(len(content), match.end() + 50)
        context = content[start:end]
        print(f"  位置: ...{context}...")
else:
    print("\n✅ 已确认：所有删除线符号都已清理完毕")