Files
ALL-teach_sys/frontend_视觉设计/fix_interview_newlines.py

141 lines
5.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
from datetime import datetime
import shutil
def clean_newlines_in_text(text):
    """Normalize newline usage in a piece of text.

    Non-string values are handed back unchanged. For strings:

    * every literal backslash-n pair is turned into a real newline;
    * a run of three or more newlines (whitespace between them allowed)
      is collapsed to exactly two, i.e. one blank line;
    * leading and trailing whitespace, newlines included, is removed.
    """
    if isinstance(text, str):
        # Literal two-character "\n" sequences become real line breaks.
        unescaped = text.replace('\\n', '\n')
        # Squash 3+ consecutive line breaks down to a single blank line.
        collapsed = re.sub(r'\n\s*\n\s*\n+', '\n\n', unescaped)
        # strip() also takes care of any leading/trailing newlines.
        return collapsed.strip()
    return text
def process_interview_data(mock_path='src/mocks/resumeInterviewMock.js'):
    """Clean newline usage in the interview questions stored in a JS mock file.

    The file is expected to contain a ``const industries = [...];`` statement
    whose array literal is valid JSON. A timestamped backup copy is written
    next to the file, every ``question`` / ``answer`` string found under
    ``industries[*].questions[*].subQuestions[*]`` is passed through
    ``clean_newlines_in_text``, and the array is written back in place.

    Args:
        mock_path: Path of the JS mock file to rewrite. Defaults to the
            location the script has always used.

    Returns:
        The cleaned ``industries`` list, or ``None`` when the file contains
        no ``const industries = [...];`` statement (the file is then left
        untouched apart from the backup that was already created).

    Raises:
        OSError: If the file cannot be copied, read or written.
        json.JSONDecodeError: If the array literal is not valid JSON.
    """
    print("🚀 开始处理面试题中的换行符...")

    # Back up first so a bad rewrite can always be undone.
    backup_name = f"{mock_path}.backup_newlines_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    shutil.copy(mock_path, backup_name)
    print(f"📦 已创建备份: {backup_name}")

    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Non-greedy: the array literal ends at the first "];" in the file.
    industries_match = re.search(r'const industries = (\[.*?\]);', content, re.DOTALL)
    if not industries_match:
        print("❌ 未找到industries数据")
        return None

    industries = json.loads(industries_match.group(1))

    total_questions = 0
    processed_questions = 0  # number of fields whose text actually changed
    for industry in industries:
        for question_group in industry.get('questions', []):
            for sub_question in question_group.get('subQuestions', []):
                total_questions += 1
                for field in ('question', 'answer'):
                    if field not in sub_question:
                        continue
                    original_value = sub_question[field]
                    cleaned_value = clean_newlines_in_text(original_value)
                    if original_value != cleaned_value:
                        sub_question[field] = cleaned_value
                        processed_questions += 1

    print("📊 处理统计:")
    print(f" - 总面试题数: {total_questions}")
    print(f" - 处理的字段数: {processed_questions}")

    new_industries_str = json.dumps(industries, ensure_ascii=False, indent=2)

    # Splice the new JSON in by position instead of passing it to re.sub as a
    # replacement template: re.sub interprets backslash escapes in the
    # template, which would turn the JSON escape "\n" into a raw newline
    # inside string literals (and "\\" into a single backslash), leaving an
    # unparsable file. Only the array that was actually parsed is replaced.
    start, end = industries_match.span(1)
    new_content = content[:start] + new_industries_str + content[end:]

    with open(mock_path, 'w', encoding='utf-8') as f:
        f.write(new_content)
    print("✅ 已更新 resumeInterviewMock.js")
    return industries
def verify_cleaned_data(mock_path='src/mocks/resumeInterviewMock.js'):
    """Print a short report on the newline state of the JS mock file.

    Reports how many double-escaped newline sequences remain in the raw file
    text and, when the ``const industries = [...];`` array can be located,
    shows the length, the first 100 characters and the real newline count of
    the first sub-question's answer.

    Args:
        mock_path: Path of the JS mock file to inspect. Defaults to the
            location the script has always used.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the array literal is not valid JSON.
    """
    print("\n🔍 验证清理结果...")
    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # In the JSON-encoded file a leftover literal backslash-n in the data is
    # stored as two backslashes followed by "n"; a properly encoded newline
    # is a single backslash + "n" and is deliberately not counted here.
    escaped_newlines = content.count('\\\\n')
    print(f" - 剩余的 \\\\n 数量: {escaped_newlines}")

    industries_match = re.search(r'const industries = (\[.*?\]);', content, re.DOTALL)
    if industries_match:
        industries = json.loads(industries_match.group(1))

        # Walk down to the first answer, tolerating missing keys/empty lists.
        first_answer = None
        if industries:
            question_groups = industries[0].get('questions') or []
            if question_groups:
                sub_questions = question_groups[0].get('subQuestions') or []
                if sub_questions:
                    first_answer = sub_questions[0].get('answer')

        if isinstance(first_answer, str):
            print("\n📝 清理后的答案示例:")
            print(f" 长度: {len(first_answer)} 字符")
            print(f" 前100字符: {first_answer[:100]}...")
            # After json.loads the text holds real newline characters, so
            # count those rather than literal backslash-n pairs.
            real_newlines = first_answer.count('\n')
            print(f" 实际换行符数量: {real_newlines}")

    if escaped_newlines == 0:
        print("\n✅ 换行符清理完成!")
    else:
        print(f"\n⚠ 仍有 {escaped_newlines} 个转义换行符需要处理")
def main():
    """Run the newline clean-up followed by the verification report.

    Any exception raised by either step is caught here, at the script's
    top-level boundary, and reported as a one-line failure message.
    """
    try:
        # None means no industries array was found; nothing was rewritten,
        # so there is nothing to verify or to announce as completed.
        if process_interview_data() is None:
            return
        verify_cleaned_data()
        print("\n🎉 面试题换行符处理完成!")
        print("\n📝 主要改进:")
        # The backslashes below are intentionally shown literally.
        print(" - 清理了双重转义的换行符 (\\\\n → \\n)")
        print(" - 合并了多余的连续换行符")
        print(" - 删除了开头和结尾的多余换行符")
        print(" - 保持了文本的可读性和格式")
    except Exception as e:
        print(f"❌ 处理失败: {e}")


# Only run the clean-up when executed as a script, not when imported.
if __name__ == "__main__":
    main()