#!/usr/bin/env python3 import json import re from datetime import datetime print("正在提取化工岗位简历中的所有面试题...") # 读取化工岗位简历数据 with open('网页未导入数据/化工产业/化工岗位简历.json', 'r', encoding='utf-8') as f: chemical_data = json.load(f) # 收集每个岗位群的所有面试题 job_group_all_questions = {} total_questions = 0 # 遍历所有岗位,提取完整的面试题内容 for position in chemical_data: job_group = position.get('简历岗位群', '') interview_content = position.get('面试题内容', '') if not job_group or not interview_content: continue # 初始化岗位群 if job_group not in job_group_all_questions: job_group_all_questions[job_group] = [] # 解析面试题内容 questions = [] # 分割内容为行 lines = interview_content.split('\n') current_question = None current_answer = [] in_answer = False for i, line in enumerate(lines): line = line.strip() # 跳过空行和标题行 if not line or line.startswith('#'): continue # 检测新的问题(以数字开头) if re.match(r'^[0-9]+[\.、]', line): # 保存之前的问答 if current_question and current_answer: answer_text = '\n'.join(current_answer).strip() if answer_text and not any(skip in answer_text for skip in ['![', 'image']): questions.append({ 'question': current_question, 'answer': answer_text }) # 开始新问题 current_question = line current_answer = [] in_answer = False # 检测答案标记 elif any(marker in line for marker in ['示例答案', '答案:', '正确答案', '正确选项', '答案是']): in_answer = True # 如果答案在同一行 if ':' in line or ':' in line: answer_part = line.split(':', 1)[-1].split(':', 1)[-1].strip() if answer_part: current_answer.append(answer_part) # 收集选项(选择题) elif current_question and re.match(r'^[A-D][\.、]', line): current_question += '\n' + line # 收集答案内容 elif in_answer and line: if not line.startswith('#'): current_answer.append(line) # 如果还没有明确的答案标记,但这可能是答案内容 elif current_question and not re.match(r'^[0-9]+[\.、]', line): # 检查是否可能是答案(在问题后面的非问题行) if i > 0 and not in_answer: # 如果上一行是问题,这行可能是答案 prev_line = lines[i-1].strip() if i > 0 else '' if re.match(r'^[0-9]+[\.、]', prev_line) or prev_line == current_question: in_answer = True current_answer.append(line) # 保存最后一个问答 if current_question and current_answer: answer_text = '\n'.join(current_answer).strip() if answer_text and not any(skip in answer_text for skip in ['![', 'image']): questions.append({ 'question': current_question, 'answer': answer_text }) # 将问题添加到岗位群(避免重复) for q in questions: # 清理问题和答案文本 q['question'] = q['question'].strip() q['answer'] = q['answer'].strip() # 检查是否重复 is_duplicate = False for existing in job_group_all_questions[job_group]: # 比较问题的前50个字符来判断是否重复 if existing['question'][:50] == q['question'][:50]: is_duplicate = True break if not is_duplicate and len(q['question']) > 5 and len(q['answer']) > 5: job_group_all_questions[job_group].append(q) total_questions += 1 # 输出统计信息 print(f"\n===== 提取完成 =====") print(f"总岗位群数: {len(job_group_all_questions)}") print(f"总面试题数: {total_questions}") print(f"\n各岗位群面试题数量:") for job_group, questions in sorted(job_group_all_questions.items()): print(f" {job_group}: {len(questions)} 道题") # 显示一些示例 print(f"\n===== 面试题示例 =====") for job_group in ['化工安全', '化工检验检测', '化工生产']: if job_group in job_group_all_questions: questions = job_group_all_questions[job_group] print(f"\n【{job_group}】共 {len(questions)} 道题,前2题:") for i, q in enumerate(questions[:2], 1): print(f" {i}. {q['question'][:60]}...") print(f" 答: {q['answer'][:60]}...") # 保存提取的所有面试题 with open('all_interview_questions_complete.json', 'w', encoding='utf-8') as f: json.dump(job_group_all_questions, f, ensure_ascii=False, indent=2) print(f"\n✅ 已保存到 all_interview_questions_complete.json") # 读取现有mock文件并更新 print(f"\n正在更新 resumeInterviewMock.js...") with open('src/mocks/resumeInterviewMock.js', 'r', encoding='utf-8') as f: content = f.read() # 备份 timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') backup_file = f'src/mocks/resumeInterviewMock.js.backup_complete_{timestamp}' with open(backup_file, 'w', encoding='utf-8') as f: f.write(content) print(f"已创建备份: {backup_file}") # 更新每个岗位群 updated_count = 0 for job_group, questions in job_group_all_questions.items(): if not questions: continue # 为每个问题创建正确的格式 formatted_questions = [] for i, q in enumerate(questions, 1): formatted_questions.append({ "id": f"q{i}", "question": q['question'], "answer": q['answer'] }) # 转换为JSON字符串 sub_questions_str = json.dumps(formatted_questions, ensure_ascii=False, indent=8) # 替换对应岗位群的subQuestions pattern = rf'("question"\s*:\s*"{re.escape(job_group)}岗位群面试题"\s*,\s*"subQuestions"\s*:\s*)\[[^\]]*\]' replacement = rf'\1{sub_questions_str}' new_content = re.sub(pattern, replacement, content, flags=re.DOTALL) if new_content != content: content = new_content updated_count += 1 print(f"✓ 已更新 {job_group}: {len(questions)} 道题") # 保存更新后的文件 if updated_count > 0: with open('src/mocks/resumeInterviewMock.js', 'w', encoding='utf-8') as f: f.write(content) print(f"\n✅ 成功更新了 {updated_count} 个岗位群,共 {total_questions} 道面试题") # 验证语法 import subprocess try: result = subprocess.run(['node', '-c', 'src/mocks/resumeInterviewMock.js'], capture_output=True, text=True, encoding='utf-8') if result.returncode == 0: print("✓ 语法检查通过") else: print(f"✗ 语法检查失败: {result.stderr}") # 恢复备份 with open(backup_file, 'r', encoding='utf-8') as f: backup_content = f.read() with open('src/mocks/resumeInterviewMock.js', 'w', encoding='utf-8') as f: f.write(backup_content) print("已从备份恢复") except Exception as e: print(f"错误: {e}")