Files
ALL-teach_sys/frontend_土木水利/extract_interview_questions.py
KQL cd2e307402 初始化12个产业教务系统项目
主要内容:
- 包含12个产业的完整教务系统前端代码
- 智能启动脚本 (start-industry.sh)
- 可视化产业导航页面 (index.html)
- 项目文档 (README.md)

优化内容:
- 删除所有node_modules和.yoyo文件夹,从7.5GB减少到2.7GB
- 添加.gitignore文件避免上传不必要的文件
- 自动依赖管理和智能启动系统

产业列表:
1. 文旅产业 (5150)
2. 智能制造 (5151)
3. 智能开发 (5152)
4. 财经商贸 (5153)
5. 视觉设计 (5154)
6. 交通物流 (5155)
7. 大健康 (5156)
8. 土木水利 (5157)
9. 食品产业 (5158)
10. 化工产业 (5159)
11. 能源产业 (5160)
12. 环保产业 (5161)

🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-24 14:14:14 +08:00

99 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从土木水利岗位简历.json中提取各岗位群的面试题数据
"""
import json
import re
def extract_interview_questions(
    input_path='网页未导入数据/土木水利产业/土木水利岗位简历.json',
    output_path='interview_questions_data.json',
):
    """Extract one structured interview-question list per job group.

    Loads the resume JSON file, and for each job group keeps the questions
    parsed from the first record whose interview content yields at least one
    question. The resulting mapping is written to ``output_path`` as
    indented UTF-8 JSON and a one-line summary is printed.

    Args:
        input_path: Path of the source JSON file. Defaults to the path that
            was previously hard-coded, so existing callers are unaffected.
        output_path: Path of the JSON file to write. Defaults to the file
            name that was previously hard-coded.

    Returns:
        A dict mapping each job-group name to the list of question dicts
        returned by ``parse_interview_content``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    # NOTE(review): assumes the top-level JSON value is a list of dict
    # records carrying the three keys read below -- confirm against the data.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Mapping of job group -> parsed interview questions.
    job_group_questions = {}
    for item in data:
        job_group = item.get('简历岗位群')
        interview_title = item.get('面试题')
        interview_content = item.get('面试题内容')

        if not job_group or not interview_content:
            continue
        # A group is only recorded once it has produced questions, so a later
        # record is still tried when an earlier one parsed to nothing.
        if job_group in job_group_questions:
            continue

        # Fall back to the job-group name when the record has no title.
        questions = parse_interview_content(
            interview_content, interview_title or job_group
        )
        if questions:
            job_group_questions[job_group] = questions

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(job_group_questions, f, ensure_ascii=False, indent=2)

    print(f"成功提取 {len(job_group_questions)} 个岗位群的面试题数据")
    return job_group_questions
def parse_interview_content(content, title):
    """Parse raw interview-question text into a list of question dicts.

    The text is split into sections at every line that starts with
    ``<digits>. ``. Within a section, the numbered line is the question and
    the text after the first ``答案:`` marker (which also covers
    ``示例答案:``; full-width or ASCII colon) is the answer, continued over
    the following lines until a blank line or another numbered line.

    If no numbered question is found but the text mentions one of the known
    question-type headings, the first ``问答题: ... 答案: ...`` pair is
    extracted as a single question instead.

    Args:
        content: Raw interview text. Empty or non-string input yields ``[]``.
        title: Label embedded in each generated question id.

    Returns:
        A list of dicts with the keys ``id`` (``q_<title>_<n>``),
        ``question`` and ``answer``. A numbered question without an answer
        gets a fixed placeholder answer.
    """
    questions = []
    if not isinstance(content, str) or not content.strip():
        return questions

    question_start = re.compile(r'^\d+\.\s')
    # The full-width colon is written as an escape so that the literal cannot
    # be silently lost again by tools that strip "ambiguous" Unicode; the
    # previous code ended up splitting on an empty separator, which raises
    # ValueError for every answer line.
    answer_marker = re.compile(r'答案[:\uff1a]')

    question_id = 1
    for section in re.split(r'\n(?=\d+\.\s)', content):
        lines = section.strip().split('\n')
        question_text = ""
        answer_text = ""

        for i, line in enumerate(lines):
            if question_start.match(line):
                question_text = re.sub(r'^\d+\.\s+', '', line).strip()
                continue

            marker = answer_marker.search(line)
            if marker is None:
                continue

            # Text after the marker, plus continuation lines up to the first
            # blank line or numbered line.
            answer_parts = [line[marker.end():].strip()]
            for following in lines[i + 1:]:
                if not following.strip() or question_start.match(following):
                    break
                answer_parts.append(following.strip())
            # Dropping empty parts avoids a leading space when the answer
            # starts on the line after the marker.
            answer_text = " ".join(part for part in answer_parts if part)
            # Stop here so a later line that merely contains the marker text
            # cannot overwrite the answer already collected.
            break

        if question_text:
            questions.append({
                "id": f"q_{title}_{question_id}",
                "question": question_text,
                "answer": answer_text or "请根据实际情况回答"
            })
            question_id += 1

    # Fallback: no numbered question was recognised, so try to lift the first
    # complete question/answer pair out of the text.
    if not questions and any(
        heading in content for heading in ('问答题', '选择题', '填空题')
    ):
        match = re.search(
            r'问答题[:\uff1a]\s*(.+?)(?:示例答案|答案)[:\uff1a]\s*(.+?)'
            r'(?:\n\n|\n\d+\.|\Z)',
            content,
            re.DOTALL,
        )
        if match:
            questions.append({
                "id": f"q_{title}_1",
                "question": match.group(1).strip(),
                "answer": match.group(2).strip()
            })

    return questions
if __name__ == "__main__":
    # Run the extraction, then report how many questions each job group has.
    extracted = extract_interview_questions()
    for group_name in extracted:
        question_count = len(extracted[group_name])
        print(f"{group_name}: {question_count} 道题")