# online_sys/frontend_大健康/clean_all_interview_questions.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
完全清理并重写所有面试题数据为扁平结构
确保所有岗位都使用正确的 questions: [{ id, question, answer }] 格式
"""

import json
import re
import sys
from datetime import datetime

def load_health_resume_data():
    """Read and parse the health-industry resume JSON file.

    Returns:
        The decoded JSON value on success, or ``None`` when the file cannot
        be opened or parsed (the error is printed to stdout).
    """
    source_path = '网页未导入数据/大健康产业/大健康岗位简历.json'
    try:
        with open(source_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"Error loading health resume data: {e}")
        return None
    else:
        return parsed

def _split_question_and_answer(question_block, answer_marker='示例答案：'):
    """Split one numbered item into ``(question_text, answer_text)``.

    Lines before the first line starting with ``answer_marker`` form the
    question; everything after it forms the answer. Text that follows the
    marker on the same line is kept as the start of the answer. Non-empty
    lines are stripped and joined with single spaces.
    """
    question_lines = []
    answer_lines = []
    in_answer = False

    for raw_line in question_block.split('\n'):
        line = raw_line.strip()
        if line.startswith(answer_marker):
            in_answer = True
            # Keep an answer written on the marker line itself.
            line = line[len(answer_marker):].strip()
        if not line:
            continue
        if in_answer:
            answer_lines.append(line)
        else:
            question_lines.append(line)

    return ' '.join(question_lines), ' '.join(answer_lines)


def parse_interview_content_to_flat_array(content):
    """Parse interview-question markdown into a flat list of Q&A dicts.

    The text is split into sections on headings of the form
    ``# <Chinese numeral>、<title>`` and each section is split into items on
    ``<number>.`` markers at the start of a line. Items from all sections are
    numbered consecutively.

    Args:
        content: The markdown text; may be empty or ``None``.

    Returns:
        A list of ``{"id": "q<N>", "question": str, "answer": str}`` dicts.
        Empty when ``content`` is falsy or contains no matching heading.
        Items with no question text are skipped.
    """
    if not content:
        return []

    # The heading pattern needs a leading newline, so add one when the very
    # first line is already a heading.
    if content.startswith('# '):
        content = '\n' + content
    sections = re.split(r'\n# ([一二三四五六七八九十]+、[^#\n]+)', content)

    if len(sections) < 2:
        return []

    questions = []
    # re.split with one capture group yields [preamble, title, body, title, body, ...];
    # only the bodies (indices 2, 4, ...) are needed.
    for section_body in sections[2::2]:
        # Same trick as for headings: the stripped body usually starts directly
        # with "1. ...", which the pattern can only match after a newline.
        # Without this the first question of every section would be lost.
        question_parts = re.split(r'\n\s*(\d+\.)\s+', '\n' + section_body.strip())

        # Layout is [lead-in, "1.", block, "2.", block, ...].
        for question_block in question_parts[2::2]:
            question_text, answer_text = _split_question_and_answer(question_block.strip())
            if question_text:
                questions.append({
                    "id": f"q{len(questions) + 1}",
                    "question": question_text,
                    "answer": answer_text
                })

    return questions

def _render_questions_js(questions):
    """Render parsed questions as the array literal stored under ``"questions"``.

    Each value is serialized with ``json.dumps`` so that quotes, backslashes
    and control characters become valid string-literal escapes; non-ASCII
    text is written as-is. Indentation matches the layout used in the mock file.
    """
    rendered = []
    for q in questions:
        rendered.append(
            '      {\n'
            '        "id": %s,\n'
            '        "question": %s,\n'
            '        "answer": %s\n'
            '      }' % tuple(
                json.dumps(q[key], ensure_ascii=False)
                for key in ('id', 'question', 'answer')
            )
        )
    return '[\n%s\n    ]' % ',\n'.join(rendered)


def _rewrite_position_block(position_text, questions_js):
    """Return ``position_text`` with its ``"questions"`` field replaced by ``questions_js``.

    Any existing ``"questions"`` array is removed first. The new field is then
    inserted after the ``"requirements"`` array when the text contains one,
    otherwise before a closing ``]``/``}`` at the very end of the text.
    """
    # Drop the existing questions field, whatever its shape: first the simple
    # case without nested brackets, then a lazy catch-all.
    cleaned_text = re.sub(r',?\s*"questions": \[[^\]]*?\](?:\s*,\s*)?\s*(?=\]|\})', '', position_text, flags=re.DOTALL)
    cleaned_text = re.sub(r',?\s*"questions": \[[\s\S]*?\](?:\s*,\s*)?\s*(?=\]|\})', '', cleaned_text, flags=re.DOTALL)

    questions_field = ',\n    "questions": ' + questions_js

    # Replacements are callables on purpose: a replacement *string* would have
    # its backslashes interpreted as template escapes, corrupting or rejecting
    # question text that contains backslash sequences.
    if '"requirements":' in cleaned_text:
        return re.sub(
            r'("requirements": \[[^\]]*?\])',
            lambda m: m.group(1) + questions_field,
            cleaned_text,
            flags=re.DOTALL
        )

    # No requirements field: append before the trailing closing bracket/brace.
    return re.sub(
        r'(\s+)(\]|\})\s*$',
        lambda m: questions_field + m.group(1) + m.group(2),
        cleaned_text,
        flags=re.DOTALL
    )


def clean_and_update_all_questions():
    """Rewrite the interview questions of every position in the mock file.

    Loads the resume data, parses each entry's interview content into flat
    ``{id, question, answer}`` items, replaces the ``"questions"`` field of
    the matching position in ``src/mocks/resumeInterviewMock.js``, strips
    leftover ``subQuestions`` structures, and writes the file back in place.

    Returns:
        ``True`` when the file was written, ``False`` when the resume data
        could not be loaded or any exception occurred (the error and a
        traceback are printed).
    """
    try:
        # Load the source data.
        health_data = load_health_resume_data()
        if not health_data:
            print("Failed to load health resume data")
            return False

        # Map each position name to its parsed question list.
        position_to_questions = {}
        for item in health_data:
            position_name = item.get('岗位名称', '')
            interview_content = item.get('面试题内容', '')

            if position_name and interview_content:
                questions = parse_interview_content_to_flat_array(interview_content)
                position_to_questions[position_name] = questions

        print(f"解析了 {len(position_to_questions)} 个岗位的面试题")

        # Read the current mock file.
        with open('src/mocks/resumeInterviewMock.js', 'r', encoding='utf-8') as f:
            content = f.read()

        updated_content = content
        update_count = 0

        for position_name, questions in position_to_questions.items():
            # Positions whose content yielded no questions are left untouched.
            if not questions:
                continue

            questions_js = _render_questions_js(questions)

            # Match from this position's "title" up to the next "title", the
            # closing "];" of the array, or the next top-level "const".
            position_pattern = rf'"title": "{re.escape(position_name)}"[\s\S]*?(?="title":|^\]\s*;\s*$|^const\s+)'

            new_content = re.sub(
                position_pattern,
                lambda match: _rewrite_position_block(match.group(0), questions_js),
                updated_content,
                flags=re.MULTILINE
            )

            if new_content != updated_content:
                updated_content = new_content
                update_count += 1
                print(f"✅ 重写 {position_name} 的面试题 ({len(questions)} 个问题)")

        # Final sweep for old-format leftovers: empty any questions array whose
        # first object mentions subQuestions ...
        updated_content = re.sub(
            r'"questions": \[\s*\{[^}]*"subQuestions"[\s\S]*?\}\s*\]',
            '"questions": []',
            updated_content,
            flags=re.DOTALL
        )

        # ... then remove any remaining stand-alone subQuestions arrays.
        updated_content = re.sub(
            r'"subQuestions": \[[\s\S]*?\][\s,]*',
            '',
            updated_content,
            flags=re.DOTALL
        )

        # Write the result back over the mock file.
        with open('src/mocks/resumeInterviewMock.js', 'w', encoding='utf-8') as f:
            f.write(updated_content)

        print(f"\n🎉 成功清理并重写 {update_count} 个岗位的面试题数据！")
        return True

    except Exception as e:
        # Top-level boundary of the script: report everything and signal failure.
        print(f"Error cleaning interview questions: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    """Run the clean-up, print the outcome, and return the success flag."""
    print("开始完全清理并重写面试题数据...")

    succeeded = clean_and_update_all_questions()

    # One status line, chosen by the result of the run.
    outcome = "面试题数据清理和重写完成！" if succeeded else "面试题数据清理失败！"
    print(outcome)

    return succeeded

if __name__ == "__main__":
    # main() returns a success flag; turn it into the process exit status so
    # shells and CI can detect a failed run (0 on success, 1 on failure).
    sys.exit(0 if main() else 1)