Files
online_sys/frontend_大健康/clean_all_interview_questions.py
KQL a7242f0c69 Initial commit: 教务系统在线平台
- 包含4个产业方向的前端项目:智能开发、智能制造、大健康、财经商贸
- 已清理node_modules、.yoyo等大文件,项目大小从2.6GB优化至631MB
- 配置完善的.gitignore文件

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-12 18:16:55 +08:00

210 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
完全清理并重写所有面试题数据为扁平结构
确保所有岗位都使用正确的 questions: [{ id, question, answer }] 格式
"""
import json
import re
import sys
from datetime import datetime
def load_health_resume_data():
    """Load the health-industry position/resume records from the JSON data file.

    Returns:
        The decoded JSON document, or ``None`` when the file cannot be opened
        or does not contain valid UTF-8 encoded JSON. The failure reason is
        printed to stdout.
    """
    try:
        with open('网页未导入数据/大健康产业/大健康岗位简历.json', 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable.
        # ValueError: base class of both json.JSONDecodeError and
        # UnicodeDecodeError, i.e. the file content is malformed.
        # Anything else is a programming error and is left to propagate.
        print(f"Error loading health resume data: {e}")
        return None
def parse_interview_content_to_flat_array(content):
    """Parse Markdown-style interview material into a flat list of Q&A dicts.

    The expected layout is a sequence of sections, each introduced by a
    level-one heading carrying a Chinese ordinal (``# 一、...``). Inside a
    section, questions are numbered (``1. ...``) and the answer follows a
    ``示例答案:`` marker, either on the marker line itself or on the lines
    after it.

    Args:
        content: Raw interview text. Falsy values yield an empty list.

    Returns:
        A list of ``{"id": "qN", "question": str, "answer": str}`` dicts.
        Ids are numbered consecutively across all sections, starting at
        ``q1``. Multi-line questions and answers are joined with single
        spaces. Blocks without any question text are skipped. Returns ``[]``
        when no section heading is found.
    """
    if not content:
        return []

    answer_marker = '示例答案:'

    # The heading pattern is anchored on a preceding newline, so give a
    # heading that sits on the very first line one to match against.
    if content.startswith('# '):
        content = '\n' + content
    sections = re.split(r'\n# ([一二三四五六七八九十]+、[^#\n]+)', content)
    if len(sections) < 2:
        return []

    questions = []
    # re.split with one capture group yields
    # [preamble, title, body, title, body, ...]; only the bodies are needed.
    for section_body in sections[2::2]:
        # (?:^|\n) lets the first numbered item match even though the
        # stripped body no longer starts with a newline; anchoring on "\n"
        # alone silently dropped the first question of every section.
        question_parts = re.split(r'(?:^|\n)\s*(\d+\.)\s+', section_body.strip())
        # Layout is [leading text, "1.", block, "2.", block, ...].
        for question_block in question_parts[2::2]:
            question_lines = []
            answer_lines = []
            in_answer = False
            for line in question_block.strip().split('\n'):
                line = line.strip()
                if line.startswith(answer_marker):
                    in_answer = True
                    # Keep answer text written on the marker line itself.
                    line = line[len(answer_marker):].strip()
                if not line:
                    continue
                if in_answer:
                    answer_lines.append(line)
                else:
                    question_lines.append(line)

            if question_lines:
                questions.append({
                    "id": f"q{len(questions) + 1}",
                    "question": " ".join(question_lines),
                    "answer": " ".join(answer_lines),
                })
    return questions
def _js_string(text):
    """Return *text* as a double-quoted JavaScript string literal."""
    # json.dumps escapes quotes, backslashes and control characters; the
    # previous hand-rolled escaping handled only '"' and newline, so a
    # backslash in the data produced a broken literal.
    return json.dumps(text, ensure_ascii=False)


def _questions_to_js(questions):
    """Render a flat question list as the text of a JavaScript array literal."""
    # NOTE(review): whitespace inside these literals is reproduced exactly as
    # it appears in the reviewed copy of this file, which had lost its
    # indentation - confirm against the repository version if the layout of
    # the generated file matters.
    entries = [
        ' {\n"id": %s,\n"question": %s,\n"answer": %s\n}' % (
            _js_string(q['id']),
            _js_string(q['question']),
            _js_string(q['answer']),
        )
        for q in questions
    ]
    return '[\n%s\n]' % ',\n'.join(entries)


def _rewrite_position_block(block_text, questions_js):
    """Replace the "questions" field of one position block with *questions_js*."""
    # Remove whatever "questions" field is already present: first the simple
    # case (no nested "]"), then the general one.
    cleaned_text = re.sub(
        r',?\s*"questions": \[[^\]]*?\](?:\s*,\s*)?\s*(?=\]|\})',
        '',
        block_text,
        flags=re.DOTALL,
    )
    cleaned_text = re.sub(
        r',?\s*"questions": \[[\s\S]*?\](?:\s*,\s*)?\s*(?=\]|\})',
        '',
        cleaned_text,
        flags=re.DOTALL,
    )

    new_field = ',\n "questions": ' + questions_js

    # Replacements are callables on purpose: passing the generated JS as a
    # replacement *template* would make re.sub reinterpret its backslashes
    # (escape sequences, group references) and can raise re.error.
    if '"requirements":' in cleaned_text:
        # Insert the new field right after the requirements array.
        return re.sub(
            r'("requirements": \[[^\]]*?\])',
            lambda m: m.group(1) + new_field,
            cleaned_text,
            flags=re.DOTALL,
        )
    # No requirements field: append before the closing bracket/brace.
    return re.sub(
        r'(\s+)(\]|\})\s*$',
        lambda m: new_field + m.group(1) + m.group(2),
        cleaned_text,
        flags=re.DOTALL,
    )


def clean_and_update_all_questions():
    """Rewrite every position's interview questions in the mock file as a flat list.

    Loads the position records via ``load_health_resume_data()``, parses each
    record's ``面试题内容`` with ``parse_interview_content_to_flat_array()``
    and, for every position that yields at least one question, replaces the
    ``"questions"`` field of the matching ``"title"`` entry in
    ``src/mocks/resumeInterviewMock.js``. Afterwards any remaining
    ``subQuestions`` structures are removed and the file is overwritten in
    place.

    Returns:
        ``True`` when the file was written, ``False`` when the source data
        could not be loaded or any exception occurred (the traceback is
        printed).
    """
    try:
        health_data = load_health_resume_data()
        if not health_data:
            print("Failed to load health resume data")
            return False

        # Map position name -> parsed flat question list.
        position_to_questions = {}
        for item in health_data:
            position_name = item.get('岗位名称', '')
            interview_content = item.get('面试题内容', '')
            if position_name and interview_content:
                position_to_questions[position_name] = (
                    parse_interview_content_to_flat_array(interview_content)
                )

        print(f"解析了 {len(position_to_questions)} 个岗位的面试题")

        with open('src/mocks/resumeInterviewMock.js', 'r', encoding='utf-8') as f:
            updated_content = f.read()

        update_count = 0
        for position_name, questions in position_to_questions.items():
            if not questions:
                continue

            questions_js = _questions_to_js(questions)

            # Match from this position's "title" up to (not including) the
            # next "title", the closing "];" of the array, or the next
            # top-level "const" declaration.
            position_pattern = rf'"title": "{re.escape(position_name)}"[\s\S]*?(?="title":|^\]\s*;\s*$|^const\s+)'

            new_content = re.sub(
                position_pattern,
                lambda m, js=questions_js: _rewrite_position_block(m.group(0), js),
                updated_content,
                flags=re.MULTILINE,
            )

            if new_content != updated_content:
                updated_content = new_content
                update_count += 1
                print(f"✅ 重写 {position_name} 的面试题 ({len(questions)} 个问题)")

        # Final sweep for leftovers of the old nested format: empty any
        # questions array that still contains subQuestions ...
        updated_content = re.sub(
            r'"questions": \[\s*\{[^}]*"subQuestions"[\s\S]*?\}\s*\]',
            '"questions": []',
            updated_content,
            flags=re.DOTALL,
        )
        # ... and drop any stand-alone subQuestions field.
        updated_content = re.sub(
            r'"subQuestions": \[[\s\S]*?\][\s,]*',
            '',
            updated_content,
            flags=re.DOTALL,
        )

        with open('src/mocks/resumeInterviewMock.js', 'w', encoding='utf-8') as f:
            f.write(updated_content)

        print(f"\n🎉 成功清理并重写 {update_count} 个岗位的面试题数据!")
        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure with a full
        # traceback and signal it through the return value.
        print(f"Error cleaning interview questions: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the cleanup, report the outcome on stdout and return the success flag."""
    print("开始完全清理并重写面试题数据...")

    succeeded = clean_and_update_all_questions()
    outcome_message = "面试题数据清理和重写完成!" if succeeded else "面试题数据清理失败!"
    print(outcome_message)

    return succeeded
if __name__ == "__main__":
    # main() returns the success flag; surface it as the process exit status
    # (0 on success, 1 on failure) so shells and CI can detect a failed run.
    sys.exit(0 if main() else 1)