Files
teach_sys_Demo/deep_clean_modified.py

114 lines
4.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
def deep_clean_markdown(content):
    """Strip strikethrough spans and bold markers from markdown text.

    Processing steps, in order:

    1. Remove every ``~~text~~`` span together with its content.
    2. Unwrap ``**text**`` and ``__text__``, keeping the inner text.
    3. Tidy punctuation and whitespace left behind by the removals:
       duplicate ``,``/``、`` separators, a separator directly before
       ``。``, a separator at the start of a line, runs of spaces,
       empty numbered list items and runs of blank lines.

    Args:
        content: Markdown text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not content:
        return content

    # 1. Strikethrough: drop the markers *and* the struck-out text.
    content = re.sub(r'~~[^~]*~~', '', content)
    # NOTE(review): the source had a second "Chinese strikethrough" pattern
    # here, but its delimiter characters were lost and what remained
    # (`[^]*`) is not a valid regular expression. Restore it once the
    # intended delimiter is known.

    # 2. Bold: keep the text, drop the markers.
    content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content)
    content = re.sub(r'__([^_]+)__', r'\1', content)

    # 3. Punctuation left dangling by the deletions above.
    # `[^\S\n]` is "whitespace except newline": the fixes stay on one line
    # so that separate lines are never merged.
    # Collapse a run of separators (e.g. ",," or ",、") to the first one.
    content = re.sub(r'([,、])(?:[^\S\n]*[,、])+', r'\1', content)
    # A separator immediately before a full stop is redundant.
    content = re.sub(r'[,、][^\S\n]*。', '。', content)
    # A line must not start with a separator.
    content = re.sub(r'^[,、][^\S\n]*', '', content, flags=re.MULTILINE)

    # Collapse runs of spaces.
    content = re.sub(r' {2,}', ' ', content)

    # Remove numbered list items that are now empty, including their line
    # break, *before* collapsing blank lines so no gap is left behind.
    content = re.sub(
        r'^\d+\.[^\S\n]*(?:\n|\Z)', '', content, flags=re.MULTILINE
    )
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()
# Load the whole mock file into memory; the cleaning steps below operate
# on this string.
with open(
    'src/mocks/resumeInterviewMock.js', mode='r', encoding='utf-8'
) as mock_file:
    content = mock_file.read()
# Position titles whose "modified" text should be cleaned.
positions_to_clean = [
    "会展策划师",
    "会展讲解员",
    "活动执行",
    "活动策划师",
    "漫展策划师",
    "会展执行助理",
    "旅游规划师",
    "旅游计调专员",
    "景区运营专员",
    "文旅运营总监助理",
]
print("开始深度清理修改版简历内容...")

# Number of `modified` template literals that were rewritten.
total_cleaned = 0

for position in positions_to_clean:
    # Locate `title: "<position>" ... modified: `<text>``. The `[^}]*?`
    # bridge keeps the match inside the object literal that carries the
    # title. The title is escaped so it is always matched literally.
    pattern = re.compile(
        rf'title:\s*["\']({re.escape(position)})["\'][^}}]*?modified:\s*`([^`]+)`',
        re.DOTALL,
    )

    # Rebuild `content` by splicing each cleaned text over the exact span
    # that was matched. Splicing by position avoids re-creating the
    # surrounding "modified: `...`" text (whose spacing may differ from
    # what the regex accepted) and avoids touching identical text that
    # belongs to other entries.
    pieces = []
    cursor = 0
    for match in pattern.finditer(content):
        original_modified = match.group(2)

        # Count what is about to be removed, for the report below.
        # NOTE(review): the source also counted a second strikethrough
        # syntax, but its pattern (`[^]*`) was not a valid regular
        # expression and its delimiters could not be recovered.
        strikethrough_count = len(re.findall(r'~~[^~]*~~', original_modified))
        bold_count = len(re.findall(r'\*\*[^*]+\*\*', original_modified))
        bold_count += len(re.findall(r'__[^_]+__', original_modified))

        if strikethrough_count == 0 and bold_count == 0:
            continue

        cleaned_modified = deep_clean_markdown(original_modified)

        start, end = match.span(2)
        pieces.append(content[cursor:start])
        pieces.append(cleaned_modified)
        cursor = end

        print(f"\n{position}")
        print(f" - 删除了 {strikethrough_count} 处删除线")
        print(f" - 清理了 {bold_count} 处加粗符号")
        total_cleaned += 1

    if pieces:
        pieces.append(content[cursor:])
        content = "".join(pieces)
# Persist the cleaned text over the original mock file.
with open(
    'src/mocks/resumeInterviewMock.js', mode='w', encoding='utf-8'
) as out_file:
    out_file.write(content)

print(f"\n✅ 深度清理完成!共处理了 {total_cleaned} 个岗位的修改版内容")
# Verify that no `~~...~~` strikethrough spans remain anywhere in the file.
# NOTE(review): the source also searched for a second strikethrough syntax,
# but its pattern (`[^]*`) was not a valid regular expression and its
# delimiters could not be recovered.
leftover_matches = list(re.finditer(r'~~[^~]*~~', content))
remaining_strikethrough = len(leftover_matches)

if remaining_strikethrough > 0:
    print(f"\n⚠️ 警告:文件中仍有 {remaining_strikethrough} 处删除线符号")
    # Show up to 50 characters of context on each side of every leftover.
    for match in leftover_matches:
        start = max(0, match.start() - 50)
        end = min(len(content), match.end() + 50)
        context = content[start:end]
        print(f" 位置: ...{context}...")
else:
    print("\n✅ 已确认:所有删除线符号都已清理完毕")