ALL-teach_sys/frontend_土木水利/extract_interview_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从土木水利岗位简历.json中提取各岗位群的面试题数据
"""

import json
import re

def extract_interview_questions(
    input_path='网页未导入数据/土木水利产业/土木水利岗位简历.json',
    output_path='interview_questions_data.json',
):
    """Build a job-group -> interview-questions mapping and save it as JSON.

    The input JSON is loaded and iterated; each element is read with
    ``.get`` for the keys ``简历岗位群`` (job group), ``面试题`` (title) and
    ``面试题内容`` (raw question text).  NOTE(review): this assumes the file
    holds a list of dicts -- confirm against the real dataset.

    Only the first entry of each job group that yields at least one parsed
    question is kept; later entries for an already-recorded group are ignored.

    Args:
        input_path: Path of the source JSON file.  Defaults to the path the
            script has always read from.
        output_path: Path of the JSON file to write.  Defaults to the path
            the script has always written to.

    Returns:
        dict mapping each job-group name to its list of question dicts, as
        produced by ``parse_interview_content``.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    job_group_questions = {}

    for item in data:
        job_group = item.get('简历岗位群')
        interview_content = item.get('面试题内容')

        # Skip entries without a group or content, and groups that already
        # have questions recorded.
        if not job_group or not interview_content or job_group in job_group_questions:
            continue

        # The interview title is preferred for question ids; the job-group
        # name is the fallback when the title is missing or empty.
        questions = parse_interview_content(
            interview_content, item.get('面试题') or job_group
        )
        if questions:
            job_group_questions[job_group] = questions

    # ensure_ascii=False keeps the Chinese text human-readable in the output.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(job_group_questions, f, ensure_ascii=False, indent=2)

    print(f"成功提取 {len(job_group_questions)} 个岗位群的面试题数据")
    return job_group_questions

def parse_interview_content(content, title):
    """Parse raw interview text into a list of question dicts.

    The text is split into sections at every newline that is followed by a
    numbered heading (``<digits>.`` plus whitespace).  In each section the
    numbered line supplies the question.  A line containing an answer marker
    (``示例答案`` or ``答案`` followed by a full-width or ASCII colon) supplies
    the answer: the text after the marker plus every following line up to the
    next blank line, joined with single spaces.

    If no numbered question is found, a fallback regex tries to extract one
    ``问答题`` question/answer pair from the whole text.

    Args:
        content: Raw interview text (str).
        title: Label embedded in each question id.

    Returns:
        list of dicts with keys ``id`` (``q_<title>_<n>``, n starting at 1),
        ``question`` and ``answer``.  A question without an answer gets the
        placeholder answer ``请根据实际情况回答``.  The list is empty when
        nothing could be parsed.
    """
    question_prefix = re.compile(r'^\d+\.\s+')
    # Accept both colon forms, matching what the fallback regex below accepts.
    answer_marker = re.compile(r'(?:示例)?答案[：:]')

    questions = []

    for section in re.split(r'\n(?=\d+\.\s)', content):
        lines = section.strip().split('\n')
        question_text = ""
        answer_text = ""

        index = 0
        while index < len(lines):
            line = lines[index]
            index += 1

            if question_prefix.match(line):
                question_text = question_prefix.sub('', line).strip()
                continue

            marker = answer_marker.search(line)
            if marker is None:
                continue

            # Take the text after the marker itself, so a label that precedes
            # the marker on the same line does not leak into the answer.
            parts = [line[marker.end():].strip()]

            # Consume continuation lines up to the next blank line.  Advancing
            # ``index`` here keeps them from being re-scanned as new answer
            # markers, which would overwrite the answer collected so far.
            while index < len(lines) and lines[index].strip():
                parts.append(lines[index].strip())
                index += 1

            # Dropping empty parts avoids a leading space when the answer
            # starts on the line after the marker.
            answer_text = " ".join(part for part in parts if part)

        if question_text:
            questions.append({
                "id": f"q_{title}_{len(questions) + 1}",
                "question": question_text,
                "answer": answer_text or "请根据实际情况回答"
            })

    # Fallback: no numbered question was found, so try to pull a single
    # question/answer pair out of the whole text.  The pattern requires the
    # literal 问答题, so no separate keyword pre-check is needed.
    if not questions:
        match = re.search(
            r'问答题[：:]\s*(.+?)(?:示例答案|答案)[：:]\s*(.+?)(?:\n\n|\n\d+\.|\Z)',
            content,
            re.DOTALL,
        )
        if match:
            questions.append({
                "id": f"q_{title}_1",
                "question": match.group(1).strip(),
                "answer": match.group(2).strip()
            })

    return questions

if __name__ == "__main__":
    job_group_questions = extract_interview_questions()

    # Report how many questions were extracted for each job group.
    for group in job_group_questions:
        questions = job_group_questions[group]
        print(f"{group}: {len(questions)} 道题")