# ALL-teach_sys/frontend_能源/extract_interview_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re
from collections import defaultdict
import datetime
import shutil

def parse_interview_questions(content):
    """
    Parse raw interview text into a list of question/answer records.

    The text is split into paragraphs on blank lines. Paragraphs starting
    with ``#`` are treated as headings and skipped. A paragraph that starts
    with ``示例答案`` or ``答案`` is the answer to the most recently seen
    question; any other paragraph that starts with ``<digits>.`` or contains
    a full-width question mark (``？``) opens a new question. A question is
    emitted only once its answer paragraph has been seen.

    Args:
        content: Raw interview text. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of dicts with the keys ``id`` (``q1``, ``q2``, ...),
        ``question``, ``answer``, ``difficulty`` (always ``'中等'``) and
        ``tags`` (always an empty list). Ids are assigned when a question is
        detected, so a question that never receives an answer leaves a gap
        in the numbering.
    """
    if not content:
        return []

    questions = []
    question_id = 1
    current_question = None

    # Split on blank lines. The pattern also accepts CRLF line endings and
    # separator lines that contain only whitespace, which a literal '\n\n'
    # split would miss.
    for section in re.split(r'\n\s*\n', content):
        section = section.strip()

        # Skip empty paragraphs and heading lines (those starting with '#').
        if not section or section.startswith('#'):
            continue

        # Answers must be recognised BEFORE questions: an answer paragraph
        # may itself contain '？', and would otherwise be mistaken for a new
        # question and overwrite the question it belongs to.
        if section.startswith(('示例答案', '答案')):
            if current_question is not None:
                answer_text = re.sub(r'^(示例答案|答案)[：:]?\s*', '', section).strip()
                current_question['answer'] = answer_text
                questions.append(current_question)
                current_question = None
            # An answer with no pending question is ignored.
            continue

        # A question usually starts with "<number>." or contains "？".
        if re.match(r'^\d+\.', section) or '？' in section:
            question_text = re.sub(r'^\d+\.\s*', '', section).strip()
            current_question = {
                'id': f'q{question_id}',
                'question': question_text,
                'answer': '',
                'difficulty': '中等',
                'tags': []
            }
            question_id += 1

    return questions

def extract_and_update_interview_questions(
    jobs_file="网页未导入数据/能源产业/能源岗位简历.json",
    mock_file="src/mocks/resumeInterviewMock.js",
    max_questions=5,
):
    """
    Extract interview questions from the jobs JSON file and patch the mock file.

    For every job-group name (``简历岗位群``) the first job whose
    ``面试题内容`` parses into at least one question supplies that group's
    questions. The ``const industries = [...];`` literal in the mock file is
    then parsed as JSON, and every entry in ``questions`` of an industry
    whose ``name`` equals a job-group name gets its ``subQuestions``
    replaced. The mock file is backed up (timestamped copy next to it) and
    rewritten only when at least one entry was actually changed.

    Args:
        jobs_file: Path of the JSON file holding the list of job records.
        mock_file: Path of the JS mock file containing the ``industries``
            array literal.
        max_questions: Maximum number of questions kept per job group;
            ``None`` keeps all of them.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the jobs file, or the ``industries``
            literal in the mock file, is not valid JSON.
    """
    with open(jobs_file, 'r', encoding='utf-8') as f:
        energy_jobs = json.load(f)

    # First job per group that yields any questions wins.
    interview_questions = {}
    for job in energy_jobs:
        group_name = job.get("简历岗位群", "")
        raw_questions = job.get("面试题内容")
        # Skip jobs without a group, without (or with null/empty) interview
        # text, and groups that already have their questions.
        if not group_name or not raw_questions or group_name in interview_questions:
            continue
        questions = parse_interview_questions(raw_questions)
        if questions:
            interview_questions[group_name] = questions[:max_questions]

    with open(mock_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Locate the industries array literal; group(1) is the JSON array itself.
    industries_match = re.search(r'const industries = (\[[\s\S]*?\]);', content)
    if not industries_match:
        print("❌ 未找到industries数组")
        return

    industries = json.loads(industries_match.group(1))

    # Replace subQuestions for every industry that has extracted questions.
    changed = False
    for industry in industries:
        group_name = industry.get("name", "")
        if group_name not in interview_questions:
            continue
        for question_obj in industry.get("questions", []):
            question_obj["subQuestions"] = interview_questions[group_name]
            changed = True

    if not changed:
        # Nothing to apply: leave the mock file untouched and do not litter
        # the directory with a backup of an unmodified file.
        print("⚠️ 没有可更新的岗位群面试题，文件未修改")
        return

    # Splice the re-serialised array back in place of the original literal.
    new_industries_str = json.dumps(industries, ensure_ascii=False, indent=2)
    new_content = (
        content[:industries_match.start(1)]
        + new_industries_str
        + content[industries_match.end(1):]
    )

    # Back up only now that we know the file is about to be modified.
    backup_path = f"{mock_file}.backup_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
    shutil.copy(mock_file, backup_path)
    print(f"✅ 已备份文件到：{backup_path}")

    with open(mock_file, 'w', encoding='utf-8') as f:
        f.write(new_content)

    print("✅ 成功更新面试题数据")
    print("\n📊 更新的岗位群面试题：")
    for group_name, questions in interview_questions.items():
        print(f"   - {group_name}: {len(questions)}个问题")

# Script entry point: run the extraction and mock-file update only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    extract_and_update_interview_questions()