67 lines
2.8 KiB
Python
67 lines
2.8 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
def extract_full_sections(content):
    """Split one project case write-up into its three top-level sections.

    The text is searched for the Markdown headings ``# 一、项目概述``
    (overview), ``# 二、项目整体流程介绍`` (overall process) and
    ``# 三、项目案例关键技术点`` (key technical points). Each heading must
    be followed by a blank line for its section to be recognised.

    Args:
        content: Full text of one project case. ``None`` or an empty
            string is treated as a document without any sections.

    Returns:
        A ``(overview, process, keypoints)`` tuple of stripped strings.
        A section that cannot be located is returned as ``""``. The
        overview is only found when a line starting with ``# 二、`` follows
        it, and the process section only when a line starting with
        ``# 三、`` follows it; the key-points section runs to the end of
        the text.
    """
    # re.search raises TypeError on None (e.g. a JSON null); report
    # "no sections" instead of crashing the whole run.
    if not content:
        return "", "", ""

    def _grab(pattern):
        # DOTALL lets the lazy group span multiple lines of section body.
        found = re.search(pattern, content, re.DOTALL)
        return found.group(1).strip() if found else ""

    # Overview: from the section-one heading up to the section-two heading.
    overview = _grab(r'# 一、项目概述\s*\n\n(.*?)(?=\n# 二、)')

    # Overall process: from the section-two heading up to section three.
    process = _grab(r'# 二、项目整体流程介绍\s*\n\n(.*?)(?=\n# 三、)')

    # Key technical points: from the section-three heading to the end.
    keypoints = _grab(r'# 三、项目案例关键技术点\s*\n\n(.*?)$')

    return overview, process, keypoints
|
|||
|
|
|
|||
|
|
def main(input_path='网页未导入数据/化工产业/化工项目案例.json',
         output_path='complete_project_data.json'):
    """Extract the sections of every project case and save them as JSON.

    Loads the project records from ``input_path``, splits each record's
    case content with ``extract_full_sections``, counts the process and
    key-point sub-headings, prints a short report per project, writes the
    collected results to ``output_path`` and finally prints the totals.

    Args:
        input_path: JSON file to read. It is expected to hold an array of
            objects, each with the keys ``'案例名称'`` (case name) and
            ``'项目案例内容'`` (case content). Defaults to the original
            hard-coded location.
        output_path: JSON file to write the extracted data to. Defaults
            to the original hard-coded file name.

    Raises:
        KeyError: If a record lacks one of the two required keys.
    """
    # Load the raw data.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract the full content of every project; ids are 1-based.
    complete_projects = []
    for project_id, project in enumerate(data, start=1):
        name = project['案例名称']
        content = project['项目案例内容']
        overview, process, keypoints = extract_full_sections(content)

        # Count process steps and key points by their "###" sub-headings.
        process_count = len(re.findall(r'### 流程[一二三四五六七八九十]+：', process))
        keypoints_count = len(re.findall(r'### ([一二三四五六七八九十]+)', keypoints))

        print(f"项目 {project_id}: {name}")
        print(f" 概述长度: {len(overview)} 字符")
        print(f" 流程部分长度: {len(process)} 字符 ({process_count} 个流程)")
        print(f" 技术点长度: {len(keypoints)} 字符 ({keypoints_count} 个技术点)")

        complete_projects.append({
            'id': project_id,
            'name': name,
            'overview': overview,
            'process': process,
            'keypoints': keypoints,
            'process_count': process_count,
            'keypoints_count': keypoints_count,
        })

    # Save the extracted data; ensure_ascii=False keeps the Chinese text
    # readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(complete_projects, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 已提取 {len(complete_projects)} 个项目的完整数据到 {output_path}")

    # Show overall statistics.
    total_processes = sum(p['process_count'] for p in complete_projects)
    total_keypoints = sum(p['keypoints_count'] for p in complete_projects)
    print(f"总计: {total_processes} 个流程, {total_keypoints} 个技术点")
|
|||
|
|
|
|||
|
|
# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|