Files
ALL-teach_sys/frontend_土木水利/remove_duplicate_titles.py

220 lines
7.8 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
def extract_sections(content):
    """Split a project-case document into overview, process and key points.

    Lookup order:

    1. The standard layout, i.e. the headings ``# 一、项目概述``,
       ``# 二、项目整体流程介绍`` and ``# 三、项目案例关键技术点``. Only the
       text *below* each heading is returned, so callers can render their
       own titles without duplicating them.
    2. If none of those headings yields text, the document is split on
       level-1/level-2 Markdown headings and each section is classified by
       keyword. The overview drops the section's first (heading) line; the
       process and key-point sections are returned whole.
    3. If there is still no overview, the first 500 characters are used
       (suffixed with ``...`` when the document is longer). If there is
       neither process nor key-point text, whatever follows the overview
       becomes the process.

    Args:
        content: Markdown text of the case. May be empty or ``None``.

    Returns:
        A tuple ``(overview, process, key_points)`` of stripped strings.
        A part that could not be found is the empty string.
    """
    overview = ""
    process = ""
    key_points = ""

    if not content:
        return overview, process, key_points

    # Standard layout. The lookaheads use ``#+`` so that a heading written
    # as ``## 二、`` does not leave a stray ``#`` at the end of the previous
    # section.
    overview_match = re.search(
        r'#\s*一、\s*项目概述(.*?)(?=#+\s*二、|$)', content, re.DOTALL
    )
    if overview_match:
        overview = overview_match.group(1).strip()

    process_match = re.search(
        r'#\s*二、\s*项目整体流程介绍(.*?)(?=#+\s*三、|$)', content, re.DOTALL
    )
    if process_match:
        process = process_match.group(1).strip()

    keypoints_match = re.search(
        r'#\s*三、\s*项目案例关键技术点(.*?)$', content, re.DOTALL
    )
    if keypoints_match:
        key_points = keypoints_match.group(1).strip()

    if overview or process or key_points:
        return overview, process, key_points

    # NOTE(review): the reviewed source had lost its indentation; everything
    # below is assumed to belong to the "no standard heading found" branch.
    # Confirm against the original file.

    # Keyword-based classification of generic Markdown sections. The checks
    # form one if/elif chain on purpose: a section is assigned to at most
    # one part, and a part that is already filled lets the section fall
    # through to the next candidate.
    for section in re.split(r'^#{1,2}\s+', content, flags=re.MULTILINE):
        if not overview and any(
            word in section for word in ('概述', '背景', '简介')
        ):
            # Drop the heading line; keep the whole section if it is a
            # single line.
            _, newline, body = section.partition('\n')
            overview = (body if newline else section).strip()
        elif not process and any(
            word in section for word in ('流程', '步骤', '实施', '方法')
        ):
            process = section.strip()
        elif not key_points and any(
            word in section for word in ('关键', '要点', '技术', '成果')
        ):
            key_points = section.strip()

    # Last resort: use the head of the document as the overview.
    prefix_chars = 500
    overview_is_prefix = False
    if not overview:
        overview = content[:prefix_chars].strip()
        if len(content) > prefix_chars:
            overview += "..."
        overview_is_prefix = True

    if not process and not key_points:
        if overview_is_prefix:
            # The overview consumed exactly the first ``prefix_chars``
            # characters, so the rest starts right after them. Slicing by
            # len(overview) would be wrong because of the strip() and the
            # appended "...".
            remaining = content[prefix_chars:]
        else:
            # The overview was cut out of the middle of the document; the
            # remaining content is whatever follows it.
            _, found, remaining = content.partition(overview)
            if not found:
                remaining = content
        remaining = remaining.strip()
        if remaining:
            process = remaining

    return overview, process, key_points
def _position_level(position_name):
    """Return the level label for a position title, based on title keywords."""
    if '助理' in position_name or '实习' in position_name:
        return "实习生岗"
    if '经理' in position_name or '主管' in position_name:
        return "储备干部岗"
    return "技术骨干岗"


def _render_mock_js(projects_list, projects_detail):
    """Render the JavaScript mock module with both datasets embedded as JSON."""
    list_json = json.dumps(projects_list, ensure_ascii=False, indent=2)
    detail_json = json.dumps(projects_detail, ensure_ascii=False, indent=2)
    # The JavaScript text is kept flush-left inside the literals so the
    # generated file matches the template exactly.
    return "".join([
        """// 项目库Mock数据
export const getMockProjectsList = (params = {}) => {
const { search = "", page = 1, pageSize = 10 } = params;
// 完整项目列表数据
const projects = """,
        list_json,
        ";\n\n",
        """ // 根据搜索条件过滤
let filteredProjects = projects;
if (search) {
filteredProjects = projects.filter(project =>
project.name.toLowerCase().includes(search.toLowerCase()) ||
project.description.toLowerCase().includes(search.toLowerCase())
);
}
// 分页处理
const startIndex = (page - 1) * pageSize;
const endIndex = startIndex + pageSize;
const paginatedProjects = filteredProjects.slice(startIndex, endIndex);
return {
success: true,
data: paginatedProjects,
total: filteredProjects.length,
page: page,
pageSize: pageSize
};
};
// 获取项目详情
export const getMockProjectDetail = (id) => {
// 直接根据ID返回对应项目的详情
const projects = """,
        detail_json,
        """;\n
const project = projects.find(p => p.id === parseInt(id));
if (project) {
return {
success: true,
data: project
};
} else {
return {
success: false,
message: "项目不存在"
};
}
};
""",
    ])


def remove_duplicate_titles(
    input_path='网页未导入数据/土木水利产业/土木水利项目案例.json',
    output_path='src/mocks/projectLibraryMock.js',
    category_separator=None,
):
    """Convert the project-case JSON into the front-end mock data module.

    Each record of the input JSON array is turned into a list entry and a
    detail entry. The detail text is split into overview / process / key
    points with ``extract_sections``, which returns the section bodies
    without their headings, so the front end does not show titles twice.
    Both datasets are written into one JavaScript module.

    Args:
        input_path: JSON file holding the list of case records. The keys
            read are ``案例名称``, ``所属垂直方向``, ``项目案例内容``,
            ``对应单元名称(垂直能力课)`` and ``对应个人简历名称``.
        output_path: JavaScript file to (over)write.
        category_separator: Optional separator; when given, the category is
            the part of the direction before its first occurrence. When
            omitted, the category equals the direction.

    Returns:
        None. Progress for the first three records and a summary are
        printed to stdout.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        civil_data = json.load(f)

    print(f"开始处理项目详情数据,移除重复标题,共{len(civil_data)}个项目")

    projects_list = []
    projects_detail = []

    for idx, item in enumerate(civil_data, 1):
        # ``or`` (rather than a .get default) also covers keys that are
        # present but null/empty in the JSON.
        project_name = item.get('案例名称') or ''
        direction = item.get('所属垂直方向') or '综合项目'
        content = item.get('项目案例内容') or ''
        units = item.get('对应单元名称(垂直能力课)') or ''
        positions = item.get('对应个人简历名称') or ''

        # Comma-separated position titles; blank entries caused by stray
        # commas are dropped.
        position_list = [
            name.strip() for name in positions.split(',') if name.strip()
        ]
        position_detail_list = [
            {"level": _position_level(name), "position": name}
            for name in position_list
        ]

        # The first listed unit wins; fall back to the direction.
        unit_name = units.split(',')[0].strip() if units else direction

        # The reviewed source split the direction on an empty string:
        # ``'' in direction`` is always true and ``str.split('')`` raises
        # ValueError ("empty separator"), so every record crashed here.
        # NOTE(review): the intended separator cannot be recovered from the
        # source; pass it via ``category_separator`` once confirmed.
        if category_separator:
            category = direction.split(category_separator)[0]
        else:
            category = direction

        projects_list.append({
            "id": idx,
            "name": project_name,
            "description": direction,
            "positions": position_list,
            "unit": unit_name,
            "direction": direction,
            "category": category,
        })

        # Section bodies only; no headings are added back.
        overview, process, key_points = extract_sections(content)

        # Placeholder text for any part the document does not provide.
        if not overview:
            overview = f"{project_name}{direction}领域的重要实践项目,通过本项目的实施,学生能够掌握相关的专业技能和实践经验。"
        if not process:
            process = f"本项目按照标准的{direction}流程进行实施,包括需求分析、方案设计、实施执行、测试验收等关键环节。"
        if not key_points:
            # Real newlines: the previous ``\\n`` put a literal backslash-n
            # into the text instead of line breaks.
            key_points = (
                f"1. 掌握{direction}的核心技术\n"
                "2. 熟悉项目实施的完整流程\n"
                "3. 培养解决实际问题的能力\n"
                "4. 提升团队协作和沟通能力"
            )

        projects_detail.append({
            "id": idx,
            "name": project_name,
            "positions": position_detail_list,
            "unit": unit_name,
            "overview": overview,
            "process": process,
            "keyPoints": key_points,
        })

        # Show the first few records so the result can be checked by eye.
        if idx <= 3:
            print(f"\n项目{idx}: {project_name}")
            print(f" - overview长度: {len(overview)}")
            print(f" - process长度: {len(process)}")
            print(f" - keyPoints长度: {len(key_points)}")
            # The start of ``process`` makes a duplicated title easy to spot.
            print(f" - process开头: {process[:100]}...")

    output = _render_mock_js(projects_list, projects_detail)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(output)

    print("\n✅ 项目详情数据已处理完成")
    print(" - 已移除重复的标题")
    print(" - process和keyPoints字段现在直接包含内容没有多余的标题")
# Script entry point: regenerate the mock data module only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    remove_duplicate_titles()