ALL-teach_sys/frontend_大健康/extract_complete_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re

def extract_all_questions_from_content(content):
    """Extract every interview question from *content*, grouped by category.

    The text is first stripped of "判断题：" style prefixes, then split on
    headings of the form "# <Chinese numeral>、<title>". Each heading's body is
    handed to ``extract_questions_from_text``; headings that yield no
    questions are omitted. Text that precedes the first heading is not
    scanned. When no heading is present at all, the whole text is parsed and
    filed under the single category "综合面试题".

    Returns a list of ``{"category": str, "questions": list}`` dicts, which is
    empty when nothing could be extracted.
    """
    cleaned = re.sub(r'判断题[：:]?\s*', '', content)

    # With one capture group, re.split yields
    # [preamble, title1, body1, title2, body2, ...].
    pieces = re.split(r'# ([一二三四五六七八九十]+、[^\n]+)', cleaned)

    # No category heading: treat everything as one general category.
    if len(pieces) <= 1:
        found = extract_questions_from_text(cleaned)
        if not found:
            return []
        return [{
            "category": "综合面试题",
            "questions": found
        }]

    grouped = []
    # Pair every captured title with the body text that follows it.
    for title, body in zip(pieces[1::2], pieces[2::2]):
        found = extract_questions_from_text(body)
        if not found:
            continue
        grouped.append({
            "category": title.strip(),
            "questions": found
        })
    return grouped

# A line such as "3. Some question" opens a new question.
_QUESTION_LINE_RE = re.compile(r'^(\d+)\.\s*(.+)$')
# Any line that starts with "<digits>." (even with nothing after the dot).
_NUMBERED_PREFIX_RE = re.compile(r'^(\d+)\.')
# Leftover "答案：" / "示例答案：" label at the very start of a joined answer.
_ANSWER_LABEL_RE = re.compile(r'^(示例)?答案[：:]?\s*')
# Everything up to and including the FIRST answer marker on a line. The
# alternatives mirror the marker detection in the main loop ("示例答案",
# "答案：", "答案:"). The lazy ``.*?`` matters: a greedy prefix would run to
# the LAST "答案" on the line and cut off answers that contain that word.
_ANSWER_MARKER_RE = re.compile(r'^.*?(?:示例答案[：:]?|答案[：:])\s*')
# Fallback: question line, then a line holding only the answer label, then
# answer lines up to the next numbered line or '#' heading.
_FALLBACK_QA_RE = re.compile(
    r'(\d+)\.\s*([^\n]+)\s*\n\s*(?:示例)?答案[：:]?\s*\n\s*([^\n]+(?:\n(?!\d+\.|#)[^\n]*)*)',
    re.MULTILINE,
)


def _append_question(questions, question, answer_parts):
    """Append a finished question to *questions* if it has a non-empty answer."""
    if not question or not answer_parts:
        return
    answer_text = ' '.join(answer_parts).strip()
    answer_text = _ANSWER_LABEL_RE.sub('', answer_text)
    answer_text = re.sub(r'\s+', ' ', answer_text)
    if answer_text:
        questions.append({
            # Ids are sequential over the questions actually kept.
            "id": f"q{len(questions) + 1}",
            "question": question,
            "answer": answer_text
        })


def extract_questions_from_text(text):
    """Extract numbered questions and their answers from *text*.

    The text is scanned line by line:

    * a line of the form "<digits>. <text>" starts a new question;
    * a line containing "示例答案", "答案：" or "答案:" starts that question's
      answer; any text after the marker on the same line is kept;
    * following non-empty lines are appended to the answer until a blank
      line, a numbered line or a '#' heading is reached.

    Questions without an answer are dropped. If this pass finds nothing, a
    regex fallback handles the layout where the answer label sits alone on its
    own line without a colon.

    Returns a list of ``{"id": "q<N>", "question": str, "answer": str}`` dicts
    with N counting from 1; answer whitespace is collapsed to single spaces.
    """
    questions = []

    current_question = None
    current_answer = []
    in_answer_section = False

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        question_match = _QUESTION_LINE_RE.match(line)

        if question_match:
            # A new question closes the previous one.
            _append_question(questions, current_question, current_answer)
            current_question = question_match.group(2).strip()
            current_answer = []
            in_answer_section = False

        elif line and ('示例答案' in line or '答案：' in line or '答案:' in line):
            in_answer_section = True
            # The answer may begin on the marker line itself.
            answer_on_same_line = _ANSWER_MARKER_RE.sub('', line).strip()
            if answer_on_same_line:
                current_answer.append(answer_on_same_line)

        elif in_answer_section and line:
            # Skip bare numbering and headings that slipped into the answer.
            if not _NUMBERED_PREFIX_RE.match(line) and not line.startswith('#'):
                current_answer.append(line)

        elif not line and in_answer_section:
            # A blank line ends the current answer.
            in_answer_section = False

    # The last question has no following question line to close it.
    _append_question(questions, current_question, current_answer)

    if not questions:
        for match in _FALLBACK_QA_RE.findall(text):
            question_text = match[1].strip()
            answer_text = re.sub(r'\s+', ' ', match[2].strip())

            if question_text and answer_text:
                questions.append({
                    "id": f"q{len(questions) + 1}",
                    "question": question_text,
                    "answer": answer_text
                })

    return questions

def main():
    """Rebuild the interview-question data in the resume/interview mock file.

    Steps:

    1. Load the source JSON and the mock JS file from hard-coded absolute
       paths (the JSON is iterated as a sequence of dict-like records).
    2. For every record with both '简历岗位群' and '面试题内容', extract its
       questions and merge them per industry, combining categories that
       share a title and skipping question texts already present.
    3. For each industry in the fixed name -> id mapping, renumber the
       questions, remove an existing flat "questions" array from that
       industry's entry and insert the new one after its "positions" array.
    4. Write the mock file back and print a summary.

    NOTE(review): both regexes stop at the first ']' after "questions"/
    "positions", so an entry that already holds nested "subQuestions" arrays
    does not appear to match on a second run — confirm before re-running.
    """
    data_path = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/网页未导入数据/大健康产业/大健康岗位简历.json'
    mock_path = '/Users/apple/Documents/cursor/教务系统/frontend_大健康/src/mocks/resumeInterviewMock.js'

    with open(data_path, 'r', encoding='utf-8') as f:
        health_data = json.load(f)

    with open(mock_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # industry name -> list of {"category": ..., "questions": [...]}
    industry_all_questions = {}

    for item in health_data:
        industry = item.get('简历岗位群', '')
        interview_content = item.get('面试题内容', '')
        if not (industry and interview_content):
            continue

        merged = industry_all_questions.setdefault(industry, [])

        for cat in extract_all_questions_from_content(interview_content):
            existing_cat = next(
                (known for known in merged if known['category'] == cat['category']),
                None,
            )
            if existing_cat is None:
                merged.append(cat)
                continue

            # Same category seen before: add only question texts not yet present.
            existing_questions = {q['question'] for q in existing_cat['questions']}
            for q in cat['questions']:
                if q['question'] not in existing_questions:
                    existing_cat['questions'].append(q)

    # Industry names as they appear in the data -> ids used in the mock file.
    industry_mapping = {
        '健康管理': 'health_1',
        '健康检查': 'health_2',
        '康复治疗': 'health_3',
        '慢性病管理': 'health_4',
        '轻医美': 'health_5',
        '心理健康': 'health_6',
        '社群运营': 'health_7',
        '药品供应链管理': 'health_8',
        '药品生产': 'health_9',
        '药品质量检测': 'health_10',
        '药物研发': 'health_11'
    }

    updates = 0
    for orig_name, industry_id in industry_mapping.items():
        if orig_name not in industry_all_questions:
            continue

        questions_array = []
        total_questions = 0

        for cat in industry_all_questions[orig_name]:
            if not cat['questions']:
                continue

            # Question ids run continuously across all categories of an industry.
            renumbered_questions = [
                {
                    "id": f"q{number}",
                    "question": q['question'],
                    "answer": q['answer']
                }
                for number, q in enumerate(cat['questions'], total_questions + 1)
            ]
            total_questions += len(renumbered_questions)

            questions_array.append({
                "id": f"group_q{len(questions_array) + 1}",
                "question": cat['category'],
                "subQuestions": renumbered_questions
            })

        if not questions_array:
            continue

        print(f"✓ {orig_name} ({industry_id}): {len(questions_array)} 个分类，共 {total_questions} 个面试题")

        questions_json = json.dumps(questions_array, ensure_ascii=False, indent=2)

        # Drop the old "questions" field of this industry's entry, if present.
        pattern1 = rf'("id":\s*"{industry_id}"[^{{]*?"positions":\s*\[[^\]]*?\]),\s*"questions":\s*\[[^\]]*?\](\s*\}})'
        content = re.sub(pattern1, r'\1\2', content, flags=re.DOTALL)

        # Insert the new "questions" field right after "positions".
        pattern2 = rf'("id":\s*"{industry_id}"[^{{]*?"positions":\s*\[[^\]]*?\])(\s*\}})'

        def insert_questions(match, questions_json=questions_json):
            # A callable replacement inserts the JSON verbatim. Passing it as
            # a replacement *string* would make re re-interpret backslashes in
            # the data (e.g. "\\" -> "\", "\n" -> newline, "\u001f" -> error).
            return f'{match.group(1)},\n    "questions": {questions_json}{match.group(2)}'

        new_content, count = re.subn(pattern2, insert_questions, content, flags=re.DOTALL)
        if count > 0:
            content = new_content
            updates += 1

    with open(mock_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"\n✅ 完成！更新了 {updates} 个岗位群的完整面试题数据")

if __name__ == "__main__":
    main()