Files
ALL-teach_sys/frontend_大健康/extract_complete_questions.py

251 lines
9.1 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
def extract_all_questions_from_content(content):
    """Extract every interview question from *content*, grouped by category.

    Category headings look like ``# 一、...`` (a ``# `` marker, a Chinese
    numeral, ``、`` and a title).  When at least one heading is present the
    text following each heading is parsed separately and any text before the
    first heading is ignored; otherwise the whole content is parsed as one
    category named ``综合面试题``.

    Args:
        content: Raw interview-question text.

    Returns:
        A list of ``{"category": str, "questions": list}`` dicts.  Categories
        that yield no questions are omitted, so the list may be empty.
    """
    # Drop the "判断题" label.  Both the half-width and the full-width colon
    # are accepted so no stray ":" is left at the start of the content.
    content = re.sub(r'判断题[::]?\s*', '', content)

    # With one capturing group, re.split returns
    # [prefix, title1, body1, title2, body2, ...].
    parts = re.split(r'# ([一二三四五六七八九十]+、[^\n]+)', content)

    if len(parts) == 1:
        # No category headings: treat everything as a single category.
        questions = extract_questions_from_text(content)
        if not questions:
            return []
        return [{"category": "综合面试题", "questions": questions}]

    grouped = []
    for title, body in zip(parts[1::2], parts[2::2]):
        questions = extract_questions_from_text(body)
        if questions:
            grouped.append({
                "category": title.strip(),
                "questions": questions,
            })
    return grouped
def _build_question_entry(question_id, question, answer_fragments):
    """Return a question dict, or None when the question or answer is missing."""
    if not question or not answer_fragments:
        return None
    answer = ' '.join(answer_fragments).strip()
    # Strip a leftover "答案" / "示例答案" label, then collapse whitespace.
    answer = re.sub(r'^(示例)?答案[::]?\s*', '', answer)
    answer = re.sub(r'\s+', ' ', answer)
    if not answer:
        return None
    return {
        "id": f"q{question_id}",
        "question": question,
        "answer": answer,
    }


def extract_questions_from_text(text):
    """Extract numbered questions and their answers from *text*.

    A question is a line of the form ``<number>. <text>``.  Its answer starts
    at a line containing ``示例答案`` or ``答案`` followed by a colon (half-width
    ``:`` or full-width ``:``) and continues over the following non-empty
    lines; a blank line ends the answer.  Questions without an answer are
    dropped.  Ids are assigned sequentially as ``q1``, ``q2``, ...

    If the line scan finds nothing, a multi-line regular expression is tried
    as a fallback; it tolerates blank lines between the answer label and the
    answer text.

    Args:
        text: Text of one category (or of the whole document).

    Returns:
        A list of ``{"id", "question", "answer"}`` dicts, possibly empty.
    """
    questions = []
    current_question = None
    current_answer = []
    in_answer_section = False

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        question_match = re.match(r'^(\d+)\.\s*(.+)$', line)
        if question_match:
            # A new question closes the previous one.
            entry = _build_question_entry(
                len(questions) + 1, current_question, current_answer
            )
            if entry:
                questions.append(entry)
            current_question = question_match.group(2).strip()
            current_answer = []
            in_answer_section = False
        elif line and (
            '示例答案' in line or '答案:' in line or '答案:' in line
        ):
            in_answer_section = True
            # The answer may start on the same line as its label.
            same_line = re.sub(
                r'^.*(示例)?答案[::]?\s*', '', line
            ).strip()
            if same_line:
                current_answer.append(same_line)
        elif in_answer_section and line:
            # Skip bare numbered lines and headings inside an answer.
            if not re.match(r'^(\d+)\.', line) and not line.startswith('#'):
                current_answer.append(line)
        elif not line and in_answer_section:
            # A blank line ends the current answer.
            in_answer_section = False

    # Flush the final question.
    entry = _build_question_entry(
        len(questions) + 1, current_question, current_answer
    )
    if entry:
        questions.append(entry)

    if not questions:
        # Fallback: question line, answer label on its own line, answer body
        # running until the next numbered line or heading.
        pattern = (
            r'(\d+)\.\s*([^\n]+)\s*\n\s*(?:示例)?答案[::]?\s*\n\s*'
            r'([^\n]+(?:\n(?!\d+\.|#)[^\n]*)*)'
        )
        for _number, raw_question, raw_answer in re.findall(pattern, text):
            question_text = raw_question.strip()
            answer_text = re.sub(r'\s+', ' ', raw_answer.strip())
            if question_text and answer_text:
                questions.append({
                    "id": f"q{len(questions) + 1}",
                    "question": question_text,
                    "answer": answer_text,
                })

    return questions
def _array_end(text, start):
    """Return the index just past the ``]`` closing the ``[`` at ``text[start]``.

    Brackets inside single- or double-quoted string literals are ignored.
    Returns -1 when the array is never closed.
    """
    depth = 0
    quote = None
    pos = start
    while pos < len(text):
        ch = text[pos]
        if quote is not None:
            if ch == '\\':
                pos += 2  # skip the escaped character
                continue
            if ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                return pos + 1
        pos += 1
    return -1


def _set_questions_field(content, industry_id, questions_json):
    """Insert or replace the "questions" field of the mock entry *industry_id*.

    The entry is located by its ``"id"`` followed (with no ``{`` in between)
    by a ``"positions"`` array.  An existing ``"questions"`` array directly
    after it is removed, including nested arrays, before the new one is
    written.  The entry is only touched when the closing ``}`` follows.

    Returns:
        ``(new_content, count)`` where *count* is the number of entries updated.
    """
    head_re = re.compile(
        rf'"id":\s*"{re.escape(industry_id)}"[^{{]*?"positions":\s*\[[^\]]*?\]'
    )
    old_field_re = re.compile(r',\s*"questions":\s*(?=\[)')
    closer_re = re.compile(r'\s*\}')
    # Spliced in as plain text: passing the JSON through an re.sub replacement
    # template would reinterpret its backslash escapes.
    field_text = ',\n "questions": ' + questions_json

    pieces = []
    cursor = 0
    count = 0
    for head in head_re.finditer(content):
        if head.start() < cursor:
            continue  # inside a region that was just replaced
        resume = head.end()
        old_field = old_field_re.match(content, resume)
        if old_field:
            old_end = _array_end(content, old_field.end())
            if old_end != -1 and closer_re.match(content, old_end):
                resume = old_end  # drop the old field entirely
        if not closer_re.match(content, resume):
            continue  # unexpected layout: leave this entry untouched
        pieces.append(content[cursor:head.end()])
        pieces.append(field_text)
        cursor = resume
        count += 1
    pieces.append(content[cursor:])
    return ''.join(pieces), count


def main():
    """Rebuild the interview questions in the front-end mock file.

    Reads the resume/interview JSON export, extracts every interview question
    per industry group (``简历岗位群``), merges categories with the same title
    while skipping questions already present, renumbers the questions per
    industry, and writes them into the ``"questions"`` field of the matching
    entry in ``resumeInterviewMock.js``.  A summary line is printed per
    industry and at the end.
    """
    source_path = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/网页未导入数据/大健康产业/大健康岗位简历.json'
    mock_path = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/src/mocks/resumeInterviewMock.js'

    with open(source_path, 'r', encoding='utf-8') as f:
        health_data = json.load(f)
    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Collect all interview questions for each industry group.
    industry_all_questions = {}
    for item in health_data:
        industry = item.get('简历岗位群', '')
        interview_content = item.get('面试题内容', '')
        if not (industry and interview_content):
            continue
        merged = industry_all_questions.setdefault(industry, [])
        for cat in extract_all_questions_from_content(interview_content):
            existing_cat = next(
                (c for c in merged if c['category'] == cat['category']), None
            )
            if existing_cat is None:
                merged.append(cat)
                continue
            # Same category seen before: add only questions not yet present.
            existing_questions = {q['question'] for q in existing_cat['questions']}
            for q in cat['questions']:
                if q['question'] not in existing_questions:
                    existing_cat['questions'].append(q)

    # Industry group name -> entry id used in the mock file.
    industry_mapping = {
        '健康管理': 'health_1',
        '健康检查': 'health_2',
        '康复治疗': 'health_3',
        '慢性病管理': 'health_4',
        '轻医美': 'health_5',
        '心理健康': 'health_6',
        '社群运营': 'health_7',
        '药品供应链管理': 'health_8',
        '药品生产': 'health_9',
        '药品质量检测': 'health_10',
        '药物研发': 'health_11'
    }

    updates = 0
    for orig_name, industry_id in industry_mapping.items():
        categories = industry_all_questions.get(orig_name)
        if not categories:
            continue

        # One group per category; question ids run continuously across groups.
        questions_array = []
        total_questions = 0
        for cat in categories:
            if not cat['questions']:
                continue
            renumbered_questions = [
                {
                    "id": f"q{total_questions + offset}",
                    "question": q['question'],
                    "answer": q['answer'],
                }
                for offset, q in enumerate(cat['questions'], 1)
            ]
            total_questions += len(renumbered_questions)
            questions_array.append({
                "id": f"group_q{len(questions_array) + 1}",
                "question": cat['category'],
                "subQuestions": renumbered_questions,
            })

        if not questions_array:
            continue

        print(f"{orig_name} ({industry_id}): {len(questions_array)} 个分类,共 {total_questions} 个面试题")
        questions_json = json.dumps(questions_array, ensure_ascii=False, indent=2)
        content, count = _set_questions_field(content, industry_id, questions_json)
        if count > 0:
            updates += 1

    with open(mock_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"\n✅ 完成!更新了 {updates} 个岗位群的完整面试题数据")
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()