# ALL-teach_sys/frontend_能源/extract_all_interview_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re
from collections import defaultdict
import datetime
import shutil

def parse_interview_questions(content):
    """Parse interview-question text into question/answer records.

    The text is scanned line by line (each line is stripped first):

    * A line of the form ``<digits>. <text>`` starts a new question. Trailing
      ``?`` / ``？`` characters are removed from the question text.
    * A line containing ``示例答案``, ``答案：`` or ``答案:`` starts the answer of
      the current question; any text after the marker on the same line is the
      first answer line.
    * While inside an answer, every non-empty line is appended to the answer,
      except lines starting with ``#`` (headings), which are skipped.

    Questions that end up without any answer text are dropped. Ids are
    assigned from a counter that advances for every question line seen, so
    dropped questions leave gaps in the ``q<N>`` sequence.

    Args:
        content: Raw interview-question text. ``None`` or an empty string
            yields an empty list.

    Returns:
        A list of dicts with the keys ``'id'``, ``'question'`` and
        ``'answer'``, in the order the questions appear in ``content``.
    """
    if not content:
        return []

    # Compiled once here instead of on every line of the loop.
    question_re = re.compile(r'^(\d+)\.\s+(.+)$')
    answer_prefix_re = re.compile(r'^.*?(示例答案|答案)[：:]?\s*')
    answer_markers = ('示例答案', '答案：', '答案:')

    questions = []

    def store(question, answer_lines):
        """Append ``question`` to the result if it has non-empty answer text."""
        if question is None:
            return
        answer_text = '\n'.join(answer_lines).strip()
        if answer_text:
            question['answer'] = answer_text
            questions.append(question)

    current_question = None
    current_answer = []
    question_id = 1
    in_answer = False

    for raw_line in content.split('\n'):
        line = raw_line.strip()

        question_match = question_re.match(line)
        if question_match:
            # A new question closes the previous one. The answer buffer is
            # always reset so text can never leak from one question to the next.
            store(current_question, current_answer)
            current_answer = []
            current_question = {
                'id': f'q{question_id}',
                'question': question_match.group(2).rstrip('？?'),
                'answer': ''
            }
            question_id += 1
            in_answer = False
            continue

        if any(marker in line for marker in answer_markers):
            if current_question is None:
                # An answer marker before any question has nothing to attach
                # to; ignore it rather than crediting it to a later question.
                continue
            in_answer = True
            # The answer may start on the marker line itself.
            answer_in_line = answer_prefix_re.sub('', line).strip()
            if answer_in_line:
                current_answer.append(answer_in_line)
            continue

        # in_answer is only ever True while a question is open.
        if in_answer and line and not line.startswith('#'):
            current_answer.append(line)

    # Flush the last pending question.
    store(current_question, current_answer)

    return questions

def _replace_sub_questions(content, group_name, questions):
    """Return ``content`` with one group's ``subQuestions`` array replaced.

    Looks for ``"name": "<group_name>"``, then the first ``"questions":`` after
    it, then the first ``"subQuestions": [...]`` after that, and swaps that
    array for ``questions`` serialized as JSON.

    Args:
        content: Full text of the mock file.
        group_name: Job-group name to look up; matched literally.
        questions: List of question dicts to serialize.

    Returns:
        The updated text, or ``None`` if no matching ``subQuestions`` array
        was found for ``group_name``.
    """
    # re.escape keeps regex metacharacters in the group name (parentheses,
    # "+", "." ...) from being interpreted as pattern syntax.
    pattern = re.compile(
        r'"name": "' + re.escape(group_name) + r'"'
        r'[\s\S]*?"questions":[\s\S]*?'
        r'("subQuestions":\s*\[[^\]]*\])'
    )
    match = pattern.search(content)
    if match is None:
        return None

    serialized = json.dumps(questions, ensure_ascii=False, indent=8)
    # Shift every non-blank line right by a further 8 spaces.
    serialized = '\n'.join(
        '        ' + line if line.strip() else line
        for line in serialized.split('\n')
    )
    replacement = f'"subQuestions": {serialized}'

    # Splice by span so only this exact occurrence is touched, even if the
    # same text happens to appear elsewhere in the file.
    return content[:match.start(1)] + replacement + content[match.end(1):]


def extract_and_update_all_interview_questions(
    data_file="网页未导入数据/能源产业/能源岗位简历.json",
    mock_file="src/mocks/resumeInterviewMock.js",
):
    """Extract interview questions from the job JSON and patch the mock file.

    For each job group (``简历岗位群``) the first job entry whose ``面试题内容``
    yields at least one parsed question supplies that group's questions. A
    timestamped backup copy of ``mock_file`` is made, then each group's
    ``subQuestions`` array in the file is replaced with the parsed questions.
    Nothing is written if the file has no ``const industries = [...];`` array.
    Progress and a summary are printed to stdout.

    Args:
        data_file: Path of the JSON file holding the list of job entries.
        mock_file: Path of the JavaScript mock file to update in place.
    """
    with open(data_file, 'r', encoding='utf-8') as f:
        energy_jobs = json.load(f)

    # Group name -> parsed questions, taken from the first job that has any.
    interview_questions = {}

    for job in energy_jobs:
        group_name = job.get("简历岗位群", "")
        if not group_name or group_name in interview_questions:
            continue
        raw_questions = job.get("面试题内容")
        if not raw_questions:
            # Missing, null or empty content: nothing to parse for this job.
            continue
        questions = parse_interview_questions(raw_questions)
        if questions:
            interview_questions[group_name] = questions
            print(f"✅ {group_name}: 提取了 {len(questions)} 个问题")

    # Back up the mock file before touching it.
    backup_path = f"{mock_file}.backup_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
    shutil.copy(mock_file, backup_path)
    print(f"\n✅ 已备份文件到：{backup_path}")

    with open(mock_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Only proceed when the file contains the expected industries array.
    if not re.search(r'const industries = (\[[\s\S]*?\]);', content):
        print("❌ 未找到industries数组")
        return

    # The file is JavaScript rather than strict JSON, so each group is
    # patched textually instead of parsing the whole structure.
    new_content = content
    for group_name, questions in interview_questions.items():
        updated = _replace_sub_questions(new_content, group_name, questions)
        if updated is None:
            # Report groups that could not be located instead of silently
            # skipping them.
            print(f"⚠️ 未在mock文件中找到岗位群的subQuestions：{group_name}")
            continue
        new_content = updated

    with open(mock_file, 'w', encoding='utf-8') as f:
        f.write(new_content)

    print("\n✅ 成功更新所有面试题数据")
    print("\n📊 更新统计：")
    total_questions = 0
    for group_name, questions in interview_questions.items():
        total_questions += len(questions)
        print(f"   - {group_name}: {len(questions)}个问题")
    print(f"\n📈 总计：{len(interview_questions)}个岗位群，{total_questions}个面试题")

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract_and_update_all_interview_questions()