Files
ALL-teach_sys/frontend_土木水利/extract_interview_questions.py

99 lines
3.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从土木水利岗位简历.json中提取各岗位群的面试题数据
"""
import json
import re
def extract_interview_questions():
    """Collect parsed interview questions for each job group.

    Loads the resume JSON file, and for every record that names a job
    group and carries interview-question text, parses that text with
    ``parse_interview_content``. Only the first record per job group that
    yields a non-empty parse result is kept. The mapping is written to
    ``interview_questions_data.json`` and a summary line is printed.

    Returns:
        dict: job group name -> list of parsed question dicts.
    """
    source_path = '网页未导入数据/土木水利产业/土木水利岗位简历.json'
    with open(source_path, 'r', encoding='utf-8') as source_file:
        records = json.load(source_file)

    job_group_questions = {}
    for record in records:
        group = record.get('简历岗位群')
        body = record.get('面试题内容')

        # Skip records without usable data, and groups already filled in.
        if not group or not body or group in job_group_questions:
            continue

        # The question-set title falls back to the group name when absent.
        heading = record.get('面试题') or group
        parsed = parse_interview_content(body, heading)
        if parsed:
            job_group_questions[group] = parsed

    with open('interview_questions_data.json', 'w', encoding='utf-8') as target_file:
        json.dump(job_group_questions, target_file, ensure_ascii=False, indent=2)

    print(f"成功提取 {len(job_group_questions)} 个岗位群的面试题数据")
    return job_group_questions
def parse_interview_content(content, title):
    """Parse raw interview-question text into a list of question dicts.

    The text is split into sections at every line that starts with a
    number followed by a dot and whitespace (``"1. "``, ``"2. "``, ...).
    Within a section, the numbered line supplies the question and a line
    containing ``示例答案`` or ``答案`` followed by a colon (ASCII ``:`` or
    full-width U+FF1A) starts the answer; the following non-blank,
    non-numbered lines are appended to the answer, separated by spaces.

    If no numbered question is found and the text mentions one of the
    question-type headings, the first ``问答题 ... 答案 ...`` pair is
    extracted as a single question instead.

    Args:
        content: Raw interview-question text. Empty or ``None`` yields ``[]``.
        title: Label embedded in each generated id (``q_<title>_<n>``).

    Returns:
        list[dict]: dicts with the keys ``"id"``, ``"question"`` and
        ``"answer"``. A question without an answer gets a placeholder
        answer string.
    """
    if not content:
        return []

    # The colon after a marker may be ASCII or full-width. The full-width
    # form is written as an escape so it cannot be lost when the file is
    # re-encoded or copied through tools that mangle non-ASCII punctuation.
    colon = r'[:\uff1a]'
    numbered_line = re.compile(r'^\d+\.\s')
    answer_marker = re.compile(r'(?:示例答案|答案)' + colon + r'\s*')

    questions = []
    question_id = 1

    # Split before each numbered line; the lookahead keeps the number
    # attached to the section it introduces.
    for section in re.split(r'\n(?=\d+\.\s)', content):
        section = section.strip()
        if not section:
            continue

        lines = section.split('\n')
        question_text = ""
        answer_parts = []

        for i, line in enumerate(lines):
            if numbered_line.match(line):
                question_text = re.sub(r'^\d+\.\s+', '', line).strip()
                continue

            marker = answer_marker.search(line)
            if marker is None:
                continue

            # Take everything after the marker. Splitting on an empty
            # separator, as the previous code did, raises ValueError.
            # A later marker line replaces an earlier answer.
            answer_parts = [line[marker.end():].strip()]
            for follow in lines[i + 1:]:
                if not follow.strip() or numbered_line.match(follow):
                    break
                answer_parts.append(follow.strip())

        if question_text:
            # Drop empty pieces so an answer that starts on the line after
            # the marker does not get a leading space.
            answer_text = " ".join(part for part in answer_parts if part)
            questions.append({
                "id": f"q_{title}_{question_id}",
                "question": question_text,
                "answer": answer_text or "请根据实际情况回答"
            })
            question_id += 1

    # Fallback: nothing was numbered, so try to lift the first complete
    # question/answer pair out of free-form text.
    if not questions and any(
        heading in content for heading in ('问答题', '选择题', '填空题')
    ):
        match = re.search(
            r'问答题' + colon + r'\s*(.+?)(?:示例答案|答案)' + colon
            + r'\s*(.+?)(?:\n\n|\n\d+\.|\Z)',
            content,
            re.DOTALL,
        )
        if match:
            questions.append({
                "id": f"q_{title}_1",
                "question": match.group(1).strip(),
                "answer": match.group(2).strip()
            })

    return questions
if __name__ == "__main__":
    # Run the extraction, then report how many questions each job group got.
    extracted = extract_interview_questions()
    for group_name, group_questions in extracted.items():
        question_count = len(group_questions)
        print(f"{group_name}: {question_count} 道题")