jiaowu-test/update_resume_data.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re

# Load the previously extracted resume data; the rest of the script iterates
# it as a mapping of position name -> record.
with open('extracted_resume_data.json', mode='r', encoding='utf-8') as source_file:
    extracted_data = json.loads(source_file.read())

# A list marker is "<digits>." at the start of a line (optionally indented).
# Anchoring to the line start keeps decimals and version numbers that appear
# inside an item ("Vue 3.2", "1.5倍") from being mistaken for the next marker.
_ITEM_MARKER_RE = re.compile(r'^[ \t]*\d+\.[ \t]*', re.MULTILINE)


def _split_numbered_items(text):
    """Split a numbered-list block into cleaned, non-empty item strings."""
    # Element 0 is whatever precedes the first marker; it is never an item.
    raw_items = _ITEM_MARKER_RE.split(text)[1:]
    # Strip the edges, then join wrapped lines of one item into a single line.
    cleaned = (item.strip().replace('\n', '') for item in raw_items)
    # Filter AFTER cleaning so whitespace-only items do not survive as "".
    return [item for item in cleaned if item]


def _first_group(pattern, content):
    """Return the stripped first capture group of *pattern* in *content*, or ""."""
    match = re.search(pattern, content)
    return match.group(1).strip() if match else ""


def parse_resume_content(content):
    """Parse resume text into project experience, skills and a personal summary.

    The text is expected to contain labelled single-line fields
    (``项目名称：``, ``实习岗位：``, ``实习时间：``, ``实习单位：``) and headed
    sections (``岗位职责``/``项目职责``, ``核心能力``, ``复合能力``,
    ``个人总结``/``个人评价``). The duties and skills sections are numbered
    lists whose items each start on their own line as ``1. ...``, ``2. ...``.
    Several items written on one physical line are not split.

    Args:
        content: Resume text. ``None`` or an empty string yields an
            all-empty result.

    Returns:
        A dict with the keys:
          * ``project_experience``: dict with ``project_name``, ``position``,
            ``time_period``, ``company`` (stripped strings, "" when missing)
            and ``description`` (duty items joined by newlines).
          * ``core_skills``: list of item strings.
          * ``compound_skills``: list of item strings.
          * ``personal_summary``: stripped text after the summary heading.
    """
    # Normalise line endings so the "\n"-based section terminators below also
    # work on CRLF / CR input.
    content = (content or '').replace('\r\n', '\n').replace('\r', '\n')

    # Duties: the numbered block ends at the next "#" heading, a blank line,
    # or the end of the text.
    duties_match = re.search(
        r'(?:岗位职责|项目职责)[：\s]*\n((?:\d+\..+?(?=\n#|\n\n|\Z))+)',
        content,
        re.DOTALL,
    )
    description = (
        '\n'.join(_split_numbered_items(duties_match.group(1)))
        if duties_match
        else ""
    )

    # Core skills: same terminators as duties; double spaces are collapsed once.
    core_match = re.search(
        r'核心能力[：\s]*\n((?:\d+\..+?(?=\n#|\n\n|\Z))+)',
        content,
        re.DOTALL,
    )
    core_skills = []
    if core_match:
        core_skills = [
            skill.replace('  ', ' ')
            for skill in _split_numbered_items(core_match.group(1))
        ]

    # Compound skills: blank lines do NOT end this section; only the next "#"
    # heading or the end of the text does.
    compound_match = re.search(
        r'复合能力[：\s]*\n((?:\d+\..+?(?=\n#|\Z))+)',
        content,
        re.DOTALL,
    )
    compound_skills = (
        _split_numbered_items(compound_match.group(1)) if compound_match else []
    )

    # Personal summary: everything after the heading up to the end of the text.
    summary_match = re.search(
        r'(?:个人总结|个人评价)[：\s]*\n(.+?)(?:\Z)',
        content,
        re.DOTALL,
    )
    personal_summary = summary_match.group(1).strip() if summary_match else ""

    return {
        'project_experience': {
            'project_name': _first_group(r'项目名称：(.+?)(?:\n|$)', content),
            'position': _first_group(r'实习岗位：(.+?)(?:\n|$)', content),
            'time_period': _first_group(r'实习时间：(.+?)(?:\n|$)', content),
            'company': _first_group(r'实习单位：(.+?)(?:\n|$)', content),
            'description': description,
        },
        'core_skills': core_skills,
        'compound_skills': compound_skills,
        'personal_summary': personal_summary,
    }

# Build one update record per position whose entry carries resume text.
updates = []
for position_name, entry in extracted_data.items():
    resume_text = entry.get('简历内容', '')
    if not resume_text:
        # Nothing to parse for this position; leave it out of the output.
        continue
    updates.append({
        'position': position_name,
        'studentInfo': parse_resume_content(resume_text),
    })

# Persist the records as JSON for later processing.
with open('resume_updates.json', mode='w', encoding='utf-8') as out_file:
    json.dump(updates, out_file, ensure_ascii=False, indent=2)

print(f"成功处理 {len(updates)} 个岗位的数据")
print("已保存到 resume_updates.json")

# Show the first record (truncated to 1000 characters) as a sample.
if updates:
    print("\n示例数据（第一个岗位）：")
    sample_text = json.dumps(updates[0], ensure_ascii=False, indent=2)
    print(sample_text[:1000])