#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re
from typing import List, Dict, Any

def parse_interview_content(content: str) -> List[Dict[str, Any]]:
    """Parse interview-question text into question/answer records.

    The text is expected to be divided into sections by heading lines of the
    form ``# 一、...`` (one or more Chinese numerals followed by ``、``).
    Anything before the first heading is ignored, as is the title text that
    follows the numeral on each heading line.  Within a section:

    * a line starting with ``<digits>.`` opens a new question;
    * a line starting with ``示例答案：`` or ``答案：`` starts its answer;
    * later lines are appended to the answer (space-separated) unless they
      start with ``选择题：``, ``填空题：`` or an option marker ``A.``-``D.``.

    Questions that never receive a non-empty answer are dropped.

    Args:
        content: Raw interview text.  Empty or ``None`` yields an empty list.

    Returns:
        A list of dicts with keys ``"id"`` (``"q_1"``, ``"q_2"``, ... numbered
        consecutively across all sections), ``"question"`` and ``"answer"``.
    """
    questions: List[Dict[str, Any]] = []
    if not content:
        return questions

    # MULTILINE ``^`` also matches a heading on the very first line of the
    # text, and ``+`` accepts multi-character numerals such as ``十一``.
    section_heading = re.compile(r'^# [一二三四五六七八九十]+、', re.MULTILINE)
    question_start = re.compile(r'^\d+\.\s*')
    answer_prefixes = ('示例答案：', '答案：')
    skipped_prefixes = ('选择题：', '填空题：', 'A.', 'B.', 'C.', 'D.')

    def save(question: str, answer_parts: List[str]) -> None:
        """Append the record if both the question and its answer are non-empty."""
        answer = " ".join(answer_parts)
        if question and answer:
            questions.append({
                "id": f"q_{len(questions) + 1}",
                "question": question.strip(),
                "answer": answer.strip(),
            })

    # Element 0 is whatever precedes the first heading; it holds no questions.
    for section in section_heading.split(content)[1:]:
        current_question = ""
        answer_parts: List[str] = []
        collecting_answer = False

        # The first element is the remainder of the heading line (the section
        # title).  Split *before* stripping so an empty title does not cause
        # the first question line to be mistaken for the title.
        for raw_line in section.split('\n')[1:]:
            line = raw_line.strip()
            if not line:
                continue

            numbered = question_start.match(line)
            if numbered:
                # A new numbered line closes the previous question.
                save(current_question, answer_parts)
                current_question = line[numbered.end():]
                answer_parts = []
                collecting_answer = False
            elif line.startswith(answer_prefixes):
                # Strip only the leading marker; the same characters may
                # legitimately appear later in the answer text.
                prefix = next(p for p in answer_prefixes if line.startswith(p))
                answer_parts = [line[len(prefix):].strip()]
                collecting_answer = True
            elif collecting_answer and not line.startswith(skipped_prefixes):
                answer_parts.append(line)

        # Save the last question of the section.
        save(current_question, answer_parts)

    return questions

def main():
    """Convert the interview questions in the résumé JSON file and print them.

    Reads the position list from a fixed path, keeps the first interview
    title/content seen for each position group, parses each group's content
    with ``parse_interview_content``, keeps at most four questions per group,
    labels each question with a difficulty and a tag chosen by its index, and
    prints the result as indented JSON.
    """
    source_path = '网页未导入数据/土木水利产业/土木水利岗位简历.json'
    with open(source_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Group by position group; only the first record of each group is kept.
    interview_groups = {}
    for position in data:
        group_name = position['简历岗位群']
        entry = {
            'title': position['面试题'],
            'content': position['面试题内容'],
        }
        interview_groups.setdefault(group_name, entry)

    # Lookup tables indexed by the question's position within its group.
    difficulty_levels = ['基础', '中等', '中高', '高级']
    bim_tags = ['BIM基础', 'BIM建模', 'BIM协作']
    default_tags = ['房地产', '经纪业务', '客户服务']

    converted_questions = []
    for group_name, group_data in interview_groups.items():
        # Only the first four questions are kept to fit the UI.
        questions = parse_interview_content(group_data['content'])[:4]

        for i, q in enumerate(questions):
            tag_pool = bim_tags if 'BIM' in group_name else default_tags
            q['difficulty'] = difficulty_levels[min(i, 3)]
            # The fourth question reuses the last tag (index clamped to 2).
            q['tags'] = tag_pool[min(i, 2)]

        converted_questions.append({
            'group_name': group_name,
            'title': group_data['title'],
            'questions': questions
        })

    # Emit the converted data.
    print("转换后的面试题数据：")
    print(json.dumps(converted_questions, ensure_ascii=False, indent=2))

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()