ALL-teach_sys/frontend_大健康/extract_all_interview_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re

def parse_all_interview_questions(content):
    """Parse interview-question text into categories of question/answer pairs.

    The text is split on Markdown-style level-1 headers whose title starts
    with a Chinese numeral followed by "、" (e.g. "# 一、..."). Each header
    starts a category; the text up to the next header is handed to
    ``extract_questions_from_section``. Question ids are numbered
    consecutively across all categories, starting at 1.

    If no such header exists, the whole text is parsed as a single category
    named "综合面试题".

    Args:
        content: Raw interview-question text.

    Returns:
        A list of ``{"category": <title>, "questions": [...]}`` dicts.
        Categories that yield no questions are omitted. Text that precedes
        the first header (when at least one header exists) is not parsed.
    """
    # Drop every "判断题：" prefix (and the whitespace after it).
    content = re.sub(r'判断题：\s*', '', content)

    # Split on category headers. The header may sit on the very first line of
    # the text, so accept either start-of-string or a preceding newline; with
    # one capture group re.split returns [preamble, title1, body1, title2, ...].
    sections = re.split(r'(?:^|\n)# ([一二三四五六七八九十]+、[^#\n]+)', content)

    all_questions = []

    if len(sections) > 1:
        # Categorised text: walk the (title, body) pairs.
        question_id = 1
        for raw_title, section_content in zip(sections[1::2], sections[2::2]):
            category_questions = extract_questions_from_section(section_content, question_id)
            question_id += len(category_questions)

            if category_questions:
                all_questions.append({
                    "category": raw_title.strip(),
                    "questions": category_questions
                })
    else:
        # No category headers: treat the whole text as one generic category.
        category_questions = extract_questions_from_section(content, 1)
        if category_questions:
            all_questions.append({
                "category": "综合面试题",
                "questions": category_questions
            })

    return all_questions

def extract_questions_from_section(content, start_id):
    """Extract numbered question/answer pairs from a block of text.

    Three strategies are tried in order, and the first one that produces any
    match is used for the whole block:

    1. Regex: a line "N. question" followed (after whitespace only) by an
       answer introduced with "答案" or "示例答案" and a colon.
    2. Regex: a line "N. question", a blank line, an optional answer label,
       then the answer paragraph.
    3. A line-by-line scan that treats every "N. text" line as a question and
       the following non-numbered lines as its answer.

    In the regex strategies an answer extends until the next line that starts
    with "N." (strategy 2 also stops at a line starting with "示例答案").

    Args:
        content: Text of one section.
        start_id: Integer used for the first extracted question's id.

    Returns:
        A list of ``{"id": "q<N>", "question": str, "answer": str}`` dicts
        with consecutive ids starting at ``start_id``. Pairs with an empty
        question or answer are skipped.
    """
    # Anchor on line starts with ^ + MULTILINE so that a question on the very
    # first line of `content` (no preceding newline) is matched as well.
    pattern1 = r'^(\d+)\.\s*([^\n]+?)[\n\s]+((?:示例)?答案[：:]\s*[^\n]+(?:\n(?!\d+\.).*)*)'
    pattern2 = r'^(\d+)\.\s*([^\n]+)\n\s*\n\s*((?:示例)?答案[：:])?\s*\n\s*([^\n]+(?:\n(?!\d+\.|示例答案).*)*)'

    matches = re.findall(pattern1, content, re.MULTILINE)
    if not matches:
        # Strategy 2 has an extra group for the optional label; drop it so
        # both strategies yield (number, question, answer) tuples.
        matches = [
            (m[0], m[1], m[3])
            for m in re.findall(pattern2, content, re.MULTILINE)
        ]

    if not matches:
        return _questions_from_lines(content, start_id)

    questions = []
    for _number, raw_question, raw_answer in matches:
        # Strip a leading answer label, then collapse all whitespace runs
        # (including newlines) into single spaces.
        answer_text = re.sub(r'^(示例)?答案[：:]?\s*', '', raw_answer.strip()).strip()
        answer_text = re.sub(r'\s+', ' ', answer_text)
        _append_question(questions, start_id, raw_question.strip(), answer_text)
    return questions


def _append_question(questions, start_id, question_text, answer_text):
    """Append one record when both texts are non-empty, keeping ids consecutive."""
    if question_text and answer_text:
        questions.append({
            # One id per appended record, so the next id follows the list size.
            "id": f"q{start_id + len(questions)}",
            "question": question_text,
            "answer": answer_text
        })


def _questions_from_lines(content, start_id):
    """Fallback scan: pair each "N. text" line with the lines that follow it."""
    questions = []
    current_question = None
    current_answer = []
    in_answer = False

    for raw_line in content.split('\n'):
        line = raw_line.strip()

        question_match = re.match(r'^(\d+)\.\s*(.+)$', line)
        if question_match:
            # A new question closes the previous one.
            _append_question(questions, start_id, current_question,
                             ' '.join(current_answer).strip())
            current_question = question_match.group(2).strip()
            current_answer = []
            in_answer = False

        elif '答案' in line:
            # Answer marker; the answer text may share the line with the label.
            in_answer = True
            answer_part = re.sub(r'^(示例)?答案[：:]?\s*', '', line).strip()
            if answer_part:
                current_answer.append(answer_part)

        elif in_answer and line:
            # A bare "N." line (no text after it) ends the current answer.
            if not re.match(r'^\d+\.', line):
                current_answer.append(line)
            else:
                in_answer = False

        # Without an explicit answer marker, text after a question still
        # counts as its answer.
        elif current_question and not in_answer and line and not re.match(r'^\d+\.', line):
            current_answer.append(line)

    # Close the last open question.
    _append_question(questions, start_id, current_question,
                     ' '.join(current_answer).strip())
    return questions

def main(
    health_data_path='/Users/apple/Documents/cursor/教务系统/frontend_大健康/网页未导入数据/大健康产业/大健康岗位简历.json',
    mock_path='/Users/apple/Documents/cursor/教务系统/frontend_大健康/src/mocks/resumeInterviewMock.js',
):
    """Inject parsed interview questions into the resume/interview mock file.

    Reads the résumé JSON, parses the "面试题内容" text of the first usable
    entry for each "简历岗位群" (job group), and writes the result as a
    "questions" array into the matching entry of the mock file. Entries are
    located by their "id" (``health_1`` ... ``health_11``) followed by a
    "positions" array.

    Args:
        health_data_path: JSON file holding a list of records with the keys
            "简历岗位群" and "面试题内容".
        mock_path: Mock file that is read, patched and overwritten in place.
    """
    with open(health_data_path, 'r', encoding='utf-8') as f:
        health_data = json.load(f)

    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Job-group name -> question groups in the structure written to the mock.
    industry_questions_map = {}

    for item in health_data:
        industry = item.get('简历岗位群', '')
        interview_content = item.get('面试题内容', '')

        # Skip incomplete records and job groups that already have questions.
        if not (industry and interview_content) or industry in industry_questions_map:
            continue

        all_categories = parse_all_interview_questions(interview_content)

        # One group per non-empty category, numbered from 1.
        non_empty = [c for c in all_categories if c['questions']]
        questions_array = [
            {
                "id": f"group_q{cat_id}",
                "question": category_data['category'],
                "subQuestions": category_data['questions']
            }
            for cat_id, category_data in enumerate(non_empty, start=1)
        ]

        if questions_array:
            industry_questions_map[industry] = questions_array
            total_questions = sum(len(q['subQuestions']) for q in questions_array)
            print(f"✓ {industry}: 提取了 {len(questions_array)} 个分类，共 {total_questions} 个面试题")

    # Job-group name -> entry id used in the mock file.
    industry_mapping = {
        '健康管理': 'health_1',
        '健康检查': 'health_2',
        '康复治疗': 'health_3',
        '慢性病管理': 'health_4',
        '轻医美': 'health_5',
        '心理健康': 'health_6',
        '社群运营': 'health_7',
        '药品供应链管理': 'health_8',
        '药品生产': 'health_9',
        '药品质量检测': 'health_10',
        '药物研发': 'health_11'
    }

    # Closing brace of the entry, shared by both patterns below.
    entry_tail = r'(\s*\})'

    updates = 0
    for orig_name, industry_id in industry_mapping.items():
        questions = industry_questions_map.get(orig_name)
        if questions is None:
            continue

        questions_json = json.dumps(questions, ensure_ascii=False, indent=2)

        # From the entry's "id" up to the end of its "positions" array.
        entry_head = (
            rf'("id":\s*"{re.escape(industry_id)}"[^{{]*?'
            r'"positions":\s*\[[^\]]*?\])'
        )

        # Step 1: remove an existing "questions" array.
        # NOTE: `[^\]]*?` cannot cross a `]`, so only an array without nested
        # arrays is removed. A "questions" value that contains nested lists
        # (such as the "subQuestions" written below) is left in place, and
        # step 2 then finds nothing to update for that entry.
        strip_old = entry_head + r',\s*"questions":\s*\[[^\]]*?\]' + entry_tail
        content = re.sub(
            strip_old,
            lambda m: m.group(1) + m.group(2),
            content,
            flags=re.DOTALL,
        )

        # Step 2: insert the new array right after "positions".
        # A callable replacement is used on purpose: a replacement *string*
        # would have its backslashes re-interpreted by `re`, mangling the
        # escapes json.dumps emits (\\, \n, \uXXXX ...) or raising re.error.
        insertion = f',\n    "questions": {questions_json}'
        new_content, count = re.subn(
            entry_head + entry_tail,
            lambda m: m.group(1) + insertion + m.group(2),
            content,
            flags=re.DOTALL,
        )
        if count > 0:
            content = new_content
            updates += 1

    with open(mock_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"\n✅ 完成！更新了 {updates} 个岗位群的完整面试题数据")

# Run the extraction and mock update only when executed as a script.
if __name__ == "__main__":
    main()