99 lines
3.5 KiB
Python
99 lines
3.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
从土木水利岗位简历.json中提取各岗位群的面试题数据
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
def extract_interview_questions(
    input_path='网页未导入数据/土木水利产业/土木水利岗位简历.json',
    output_path='interview_questions_data.json',
):
    """Build and save one structured interview-question list per job group.

    Loads JSON from *input_path* and iterates over its items (each item is
    accessed with ``.get``, so a list of dicts is expected). For every item
    the ``简历岗位群`` (job group), ``面试题`` (interview title) and
    ``面试题内容`` (raw interview text) keys are read. The raw text is parsed
    with ``parse_interview_content`` and the first non-empty result for each
    job group is kept; later items of an already-filled group are ignored.

    The resulting mapping is written to *output_path* as UTF-8 JSON
    (non-ASCII kept as-is, 2-space indent) and a one-line summary is printed.

    Args:
        input_path: Path of the JSON file to read. Defaults to the location
            this script has always used.
        output_path: Path of the JSON file to write. Defaults to the location
            this script has always used.

    Returns:
        A dict mapping each job-group name to its list of question dicts.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    job_group_questions = {}
    for item in data:
        job_group = item.get('简历岗位群')
        interview_content = item.get('面试题内容')

        # Skip items without a group or content, and groups already filled.
        if not job_group or not interview_content or job_group in job_group_questions:
            continue

        # The title only feeds the generated question ids; fall back to the
        # group name when the item has no title.
        title = item.get('面试题') or job_group
        questions = parse_interview_content(interview_content, title)
        if questions:
            job_group_questions[job_group] = questions

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(job_group_questions, f, ensure_ascii=False, indent=2)

    print(f"成功提取 {len(job_group_questions)} 个岗位群的面试题数据")
    return job_group_questions
|
|||
|
|
|
|||
|
|
# A new question starts at a newline followed by "<digits>. ".
_SECTION_BREAK = re.compile(r'\n(?=\d+\.\s)')
# Numbering prefix of a question line, e.g. "12. ".
_QUESTION_PREFIX = re.compile(r'^\d+\.\s+')
# "答案" / "示例答案" followed by an ASCII or full-width (U+FF1A) colon.
# The full-width colon is spelled as an escape so it survives editors or
# tools that normalise punctuation.
_ANSWER_MARKER = re.compile(r'(?:示例)?答案[:\uff1a]\s*')
# Fallback for unnumbered text: "问答题: <question> 答案: <answer>".
_FALLBACK_QA = re.compile(
    r'问答题[:\uff1a]\s*(.+?)(?:示例答案|答案)[:\uff1a]\s*(.+?)(?:\n\n|\n\d+\.|\Z)',
    re.DOTALL,
)
_DEFAULT_ANSWER = "请根据实际情况回答"


def _collect_answer(lines):
    """Return the answer introduced by the first answer marker in *lines*.

    The answer is the text after the marker plus every following non-blank
    line, joined with single spaces; collection stops at the first blank
    line. Returns "" when no marker is present or the answer is empty.
    """
    for index, line in enumerate(lines):
        marker = _ANSWER_MARKER.search(line)
        if marker is None:
            continue
        parts = [line[marker.end():].strip()]
        for follow in lines[index + 1:]:
            text = follow.strip()
            if not text:
                break
            parts.append(text)
        # Drop empty pieces so a marker on its own line does not produce a
        # leading space. The first marker wins: continuation lines that
        # happen to contain "答案:" must not overwrite what was collected.
        return " ".join(part for part in parts if part)
    return ""


def parse_interview_content(content, title):
    """Parse raw interview text into a list of question dicts.

    The text is split into sections at every line that starts with
    ``<digits>. ``. A section contributes a question when its first line
    carries that numbering prefix and has text after it; the answer is taken
    from the first ``答案``/``示例答案`` marker (ASCII or full-width colon) in
    the rest of the section. Sections without a numbered first line (for
    example a heading before question 1) are skipped.

    If no numbered question is found, a single ``问答题: ... 答案: ...`` pair
    is extracted from the whole text as a fallback.

    Args:
        content: Raw interview text. ``None`` or "" yields an empty list.
        title: Label embedded in each generated id (``q_<title>_<n>``).

    Returns:
        A list of dicts with the keys ``id``, ``question`` and ``answer``.
        Questions without an answer get a fixed placeholder answer. The list
        is empty when nothing could be parsed.
    """
    if not content:
        return []

    questions = []
    for section in _SECTION_BREAK.split(content):
        lines = section.strip().split('\n')
        # Only the first line of a section can carry the numbering prefix,
        # because the text was split in front of every such line.
        prefix = _QUESTION_PREFIX.match(lines[0])
        if prefix is None:
            continue
        question_text = lines[0][prefix.end():].strip()
        if not question_text:
            continue

        questions.append({
            "id": f"q_{title}_{len(questions) + 1}",
            "question": question_text,
            "answer": _collect_answer(lines[1:]) or _DEFAULT_ANSWER,
        })

    if not questions:
        # No numbered questions: try to lift one question/answer pair out of
        # free-form text.
        match = _FALLBACK_QA.search(content)
        if match:
            questions.append({
                "id": f"q_{title}_1",
                "question": match.group(1).strip(),
                "answer": match.group(2).strip(),
            })

    return questions
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: run the extraction with its default paths, then
    # print how many questions were kept for each job group.
    summary = extract_interview_questions()

    for group in summary:
        questions = summary[group]
        print(f"{group}: {len(questions)} 道题")
|