Files
online_sys/frontend_大健康/extract_all_interview_questions.py
KQL a7242f0c69 Initial commit: 教务系统在线平台
- 包含4个产业方向的前端项目:智能开发、智能制造、大健康、财经商贸
- 已清理node_modules、.yoyo等大文件,项目大小从2.6GB优化至631MB
- 配置完善的.gitignore文件

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-12 18:16:55 +08:00

231 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
def parse_all_interview_questions(content):
    """Parse a whole interview-question document into categorised questions.

    The text is split on headings of the form ``# 一、Title`` (``#``, a space,
    one or more of the characters 一..十, ``、`` and the title text).  Each
    heading becomes one category whose questions are extracted with
    ``extract_questions_from_section``; question ids keep counting up across
    categories.  Text that precedes the first heading is ignored.  When the
    document has no such heading at all, the whole text is parsed as a single
    "综合面试题" category.

    Args:
        content: Raw interview-question text.

    Returns:
        A list of ``{"category": str, "questions": list}`` dicts in document
        order.  Categories that yield no questions are omitted, so the list
        may be empty.
    """
    # Drop the "判断题:" label wherever it appears (ASCII or full-width colon).
    content = re.sub(r'判断题[:\uff1a]\s*', '', content)

    # With one capture group, re.split returns
    # [text_before, title_1, body_1, title_2, body_2, ...].
    # "(?:^|\n)" also recognises a heading on the very first line, which has
    # no newline in front of it.
    sections = re.split(r'(?:^|\n)# ([一二三四五六七八九十]+、[^#\n]+)', content)

    if len(sections) == 1:
        # No category headings: treat the whole document as one category.
        found = extract_questions_from_section(content, 1)
        if not found:
            return []
        return [{"category": "综合面试题", "questions": found}]

    categories = []
    next_id = 1
    for title, body in zip(sections[1::2], sections[2::2]):
        found = extract_questions_from_section(body, next_id)
        next_id += len(found)
        if found:
            categories.append({
                "category": title.strip(),
                "questions": found,
            })
    return categories
# "N. question", then an explicit "答案:" / "示例答案:" label (ASCII or
# full-width colon); the answer runs until the next line starting with "N.".
_LABELLED_ANSWER_RE = re.compile(
    r'\n(\d+)\.\s*([^\n]+?)[\n\s]+'
    r'((?:示例)?答案[:\uff1a]\s*[^\n]+(?:\n(?!\d+\.).*)*)'
)
# "N. question", a blank line, an optional answer label on its own line,
# then the answer paragraph.
_BLANK_LINE_ANSWER_RE = re.compile(
    r'\n(\d+)\.\s*([^\n]+)\n\s*\n\s*((?:示例)?答案[:\uff1a])?\s*\n\s*'
    r'([^\n]+(?:\n(?!\d+\.|示例答案).*)*)'
)
# Leading "答案:" / "示例答案:" label to strip from an answer.
_ANSWER_LABEL_RE = re.compile(r'^(示例)?答案[:\uff1a]?\s*')
_QUESTION_LINE_RE = re.compile(r'^(\d+)\.\s*(.+)$')
_NUMBER_PREFIX_RE = re.compile(r'^\d+\.')


def _extract_questions_line_by_line(content, start_id):
    """Fallback parser: walk the text line by line, pairing each "N. question"
    line with the answer lines that follow it."""
    questions = []
    next_id = start_id
    current_question = None
    answer_parts = []
    in_answer = False

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        question_match = _QUESTION_LINE_RE.match(line)
        if question_match:
            # A new numbered question closes the previous one; a question
            # that collected no answer text is dropped.
            answer_text = ' '.join(answer_parts).strip()
            if current_question and answer_text:
                questions.append({
                    "id": f"q{next_id}",
                    "question": current_question,
                    "answer": answer_text,
                })
                next_id += 1
            current_question = question_match.group(2).strip()
            answer_parts = []
            in_answer = False
        elif '答案' in line:
            # Answer label; the answer may start on the same line.
            in_answer = True
            answer_part = _ANSWER_LABEL_RE.sub('', line).strip()
            if answer_part:
                answer_parts.append(answer_part)
        elif in_answer and line:
            if _NUMBER_PREFIX_RE.match(line):
                # A bare "N." line (number without question text) ends the answer.
                in_answer = False
            else:
                answer_parts.append(line)
        elif (current_question and not in_answer and line
              and not _NUMBER_PREFIX_RE.match(line)):
            # No explicit answer label: text after a question still counts.
            answer_parts.append(line)

    # Flush the last question.
    answer_text = ' '.join(answer_parts).strip()
    if current_question and answer_text:
        questions.append({
            "id": f"q{next_id}",
            "question": current_question,
            "answer": answer_text,
        })
    return questions


def extract_questions_from_section(content, start_id):
    """Extract numbered questions and their answers from a block of text.

    Three strategies are tried in order, and the first one that finds
    anything is used for the whole text:

    1. "N. question" followed by a labelled answer ("答案:" / "示例答案:").
    2. "N. question", a blank line, an optional label line, then the answer.
    3. A line-by-line walk that treats the lines after each "N. question"
       line as its answer.

    Args:
        content: Text of one section (or of the whole document).
        start_id: Number used for the first extracted question's id.

    Returns:
        A list of ``{"id": "q<N>", "question": str, "answer": str}`` dicts
        with consecutive ids starting at ``start_id``.  Questions without
        any answer text are skipped.
    """
    # Both regex patterns anchor a question on the newline in front of its
    # number, so give a question on the very first line one as well.
    text = content if content.startswith('\n') else '\n' + content

    matches = _LABELLED_ANSWER_RE.findall(text)
    if not matches:
        matches = [
            (number, question, answer)
            for number, question, _label, answer
            in _BLANK_LINE_ANSWER_RE.findall(text)
        ]
    if not matches:
        return _extract_questions_line_by_line(content, start_id)

    questions = []
    next_id = start_id
    for _number, question, answer in matches:
        question_text = question.strip()
        # Strip the answer label, then collapse runs of whitespace
        # (including line breaks) into single spaces.
        answer_text = _ANSWER_LABEL_RE.sub('', answer.strip()).strip()
        answer_text = re.sub(r'\s+', ' ', answer_text)
        if question_text and answer_text:
            questions.append({
                "id": f"q{next_id}",
                "question": question_text,
                "answer": answer_text,
            })
            next_id += 1
    return questions
_DEFAULT_HEALTH_DATA_PATH = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/网页未导入数据/大健康产业/大健康岗位简历.json'
_DEFAULT_MOCK_PATH = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/src/mocks/resumeInterviewMock.js'

# Job-group name as it appears in the data file -> entry id in the mock file.
_INDUSTRY_IDS = {
    '健康管理': 'health_1',
    '健康检查': 'health_2',
    '康复治疗': 'health_3',
    '慢性病管理': 'health_4',
    '轻医美': 'health_5',
    '心理健康': 'health_6',
    '社群运营': 'health_7',
    '药品供应链管理': 'health_8',
    '药品生产': 'health_9',
    '药品质量检测': 'health_10',
    '药物研发': 'health_11',
}

# An existing `, "questions": [` right after the positions array.
_OLD_QUESTIONS_RE = re.compile(r',\s*"questions":\s*(?=\[)')
# The `}` that closes a mock entry.
_ENTRY_END_RE = re.compile(r'\s*\}')


def _find_array_end(text, start):
    """Return the index just past the ``]`` closing the ``[`` at ``start``.

    Brackets inside double-quoted strings are ignored.  Returns -1 when the
    array is never closed.
    """
    depth = 0
    in_string = False
    escaped = False
    for pos in range(start, len(text)):
        char = text[pos]
        if in_string:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == '[':
            depth += 1
        elif char == ']':
            depth -= 1
            if depth == 0:
                return pos + 1
    return -1


def _set_questions(content, industry_id, questions_json):
    """Set the ``"questions"`` array of mock entry ``industry_id``.

    An entry is recognised as ``"id": "<industry_id>" ... "positions": [...]``
    optionally followed by ``, "questions": [...]`` and then the closing ``}``.
    Any existing questions array is replaced.  Returns ``(new_content, count)``
    where ``count`` is the number of entries rewritten.
    """
    head_re = re.compile(
        r'"id":\s*"' + re.escape(industry_id)
        + r'"[^{]*?"positions":\s*\[[^\]]*?\]'
    )
    pieces = []
    cursor = 0
    count = 0
    for head in head_re.finditer(content):
        if head.start() < cursor:
            continue  # lies inside a span that was already replaced
        resume = head.end()
        old = _OLD_QUESTIONS_RE.match(content, resume)
        if old:
            # Skip the whole old array.  It may contain nested arrays
            # (subQuestions), so brackets have to be balanced rather than
            # stopping at the first "]".
            resume = _find_array_end(content, old.end())
            if resume == -1:
                continue
        if not _ENTRY_END_RE.match(content, resume):
            continue  # unexpected layout: leave this entry untouched
        pieces.append(content[cursor:head.end()])
        # Plain concatenation, not an re.sub template: backslash escapes in
        # the JSON must reach the file unchanged.
        pieces.append(',\n "questions": ' + questions_json)
        cursor = resume
        count += 1
    pieces.append(content[cursor:])
    return ''.join(pieces), count


def main(data_path=_DEFAULT_HEALTH_DATA_PATH, mock_path=_DEFAULT_MOCK_PATH):
    """Rebuild the interview questions stored in the resume/interview mock file.

    Reads the job-resume data (a JSON list of dicts with the keys
    ``简历岗位群`` and ``面试题内容``), parses the interview text of the first
    usable record of each job group, and writes the result into the matching
    ``health_N`` entry of the mock file as its ``"questions"`` array.

    Args:
        data_path: Path of the JSON data file to read.
        mock_path: Path of the mock file, which is read and rewritten in place.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        health_data = json.load(f)
    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Job-group name -> question groups in the shape the front end expects.
    industry_questions_map = {}
    for item in health_data:
        industry = item.get('简历岗位群', '')
        interview_content = item.get('面试题内容', '')
        if not industry or not interview_content or industry in industry_questions_map:
            continue

        categories = [
            category
            for category in parse_all_interview_questions(interview_content)
            if category['questions']
        ]
        questions_array = [
            {
                "id": f"group_q{cat_id}",
                "question": category['category'],
                "subQuestions": category['questions'],
            }
            for cat_id, category in enumerate(categories, start=1)
        ]
        # A record that yields nothing is not stored, so a later record of
        # the same job group still gets a chance.
        if questions_array:
            industry_questions_map[industry] = questions_array
            total_questions = sum(len(q['subQuestions']) for q in questions_array)
            print(f"{industry}: 提取了 {len(questions_array)} 个分类,共 {total_questions} 个面试题")

    updates = 0
    for orig_name, industry_id in _INDUSTRY_IDS.items():
        questions = industry_questions_map.get(orig_name)
        if questions is None:
            continue
        questions_json = json.dumps(questions, ensure_ascii=False, indent=2)
        content, count = _set_questions(content, industry_id, questions_json)
        if count > 0:
            updates += 1

    with open(mock_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"\n✅ 完成!更新了 {updates} 个岗位群的完整面试题数据")
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()