#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract per-job-group interview questions from 土木水利岗位简历.json.

The script reads the resume data file, parses the free-text interview
question field of each record into a list of question/answer dicts, keeps
the first successfully parsed list for every job group, and writes the
resulting mapping to a JSON file.
"""

import json
import re

# Default locations used when the script is run without arguments.
DEFAULT_INPUT_PATH = '网页未导入数据/土木水利产业/土木水利岗位简历.json'
DEFAULT_OUTPUT_PATH = 'interview_questions_data.json'

# Placeholder stored when a question has no recognizable answer.
DEFAULT_ANSWER = "请根据实际情况回答"

# A new question section starts at a line beginning with "<digits>. ".
_SECTION_SPLIT_RE = re.compile(r'\n(?=\d+\.\s)')
# A question line: "<digits>." + whitespace + question text.
_QUESTION_RE = re.compile(r'^\d+\.\s+(.*)')
# An answer marker followed by an ASCII or full-width colon; group 1 is the
# text that follows the marker on the same line.
_ANSWER_RE = re.compile(r'(?:示例答案|答案)[:：]\s*(.*)')
# Fallback for un-numbered content: the first "问答题: ... 答案: ..." pair.
_FALLBACK_QA_RE = re.compile(
    r'问答题[:：]\s*(.+?)(?:示例答案|答案)[:：]\s*(.+?)(?:\n\n|\n\d+\.|\Z)',
    re.DOTALL,
)


def extract_interview_questions(input_path=DEFAULT_INPUT_PATH,
                                output_path=DEFAULT_OUTPUT_PATH):
    """Build and save the job-group -> interview-questions mapping.

    Args:
        input_path: Path of the JSON file to read. It is iterated as a
            sequence of records accessed with ``.get``; the keys read are
            '简历岗位群', '面试题' and '面试题内容'.
        output_path: Path of the JSON file the mapping is written to.

    Returns:
        A dict mapping each job group to the list of question dicts
        produced by ``parse_interview_content``. Only the first record of a
        job group that yields at least one question is used.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    job_group_questions = {}
    for item in data:
        job_group = item.get('简历岗位群')
        interview_content = item.get('面试题内容')
        # Skip incomplete records and job groups that already have questions.
        if not job_group or not interview_content:
            continue
        if job_group in job_group_questions:
            continue

        # The interview title (or the job group as a fallback) is embedded
        # in each generated question id.
        title = item.get('面试题') or job_group
        questions = parse_interview_content(interview_content, title)
        if questions:
            job_group_questions[job_group] = questions

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(job_group_questions, f, ensure_ascii=False, indent=2)

    print(f"成功提取 {len(job_group_questions)} 个岗位群的面试题数据")
    return job_group_questions


def _parse_section(lines):
    """Return ``(question_text, answer_text)`` found in one section's lines."""
    question_text = ""
    answer_text = ""
    index = 0
    while index < len(lines):
        line = lines[index]
        index += 1

        question_match = _QUESTION_RE.match(line)
        if question_match:
            question_text = question_match.group(1).strip()
            continue

        answer_match = _ANSWER_RE.search(line)
        if not answer_match:
            continue

        # Take the text after the answer marker itself (not after the first
        # colon on the line), then fold in the following non-blank lines.
        parts = [answer_match.group(1).strip()]
        while (index < len(lines)
               and lines[index].strip()
               and not _QUESTION_RE.match(lines[index])):
            parts.append(lines[index].strip())
            # Advancing the index keeps consumed continuation lines from
            # being re-scanned as new answer markers.
            index += 1
        # Dropping empty parts avoids a leading space when the marker line
        # carries no answer text of its own.
        answer_text = " ".join(part for part in parts if part)

    return question_text, answer_text


def parse_interview_content(content, title):
    """Parse free-text interview content into structured question dicts.

    The content is split into sections at lines that start with
    "<digits>. ". In each section the numbered line supplies the question,
    and a line containing '示例答案' or '答案' followed by an ASCII or
    full-width colon supplies the answer, extended by the non-blank lines
    that follow it. If no numbered question is found, the first
    "问答题: ... 答案: ..." pair in the content is used instead.

    Args:
        content: Raw interview text.
        title: Label embedded in every generated id (``q_<title>_<n>``).

    Returns:
        A list of dicts with keys "id", "question" and "answer"; empty when
        nothing could be parsed. Questions without an answer get
        ``DEFAULT_ANSWER``.
    """
    questions = []

    for section in _SECTION_SPLIT_RE.split(content):
        section = section.strip()
        if not section:
            continue

        question_text, answer_text = _parse_section(section.split('\n'))
        if question_text:
            questions.append({
                "id": f"q_{title}_{len(questions) + 1}",
                "question": question_text,
                "answer": answer_text or DEFAULT_ANSWER,
            })

    if not questions:
        # The pattern itself requires '问答题', so no separate keyword
        # pre-check is needed before searching.
        match = _FALLBACK_QA_RE.search(content)
        if match:
            questions.append({
                "id": f"q_{title}_1",
                "question": match.group(1).strip(),
                "answer": match.group(2).strip(),
            })

    return questions


if __name__ == "__main__":
    job_group_questions = extract_interview_questions()

    # Print the number of questions extracted for each job group.
    for group, questions in job_group_questions.items():
        print(f"{group}: {len(questions)} 道题")