#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Split project-case write-ups into their three top-level sections.

Reads a JSON list of project cases, extracts the overview, process and
key-technical-points sections from each case's body text, and writes the
result to a new JSON file.
"""

import json
import re

# Default locations used when the script is run without arguments.
DEFAULT_INPUT_PATH = '网页未导入数据/化工产业/化工项目案例.json'
DEFAULT_OUTPUT_PATH = 'extracted_project_data.json'

# Section patterns, compiled once.
#
# * ``\s*\n`` after the heading accepts one *or more* line breaks, so a
#   heading directly followed by its body (no blank line) is still found.
# * The look-ahead uses ``^`` (MULTILINE) rather than a literal ``\n`` so it
#   still fires when the section is empty and the preceding newline has
#   already been consumed by ``\s*\n``; otherwise the lazy capture would run
#   on to the end of the document.
_SECTION_FLAGS = re.DOTALL | re.MULTILINE
_OVERVIEW_RE = re.compile(
    r'# 一、项目概述\s*\n(.*?)(?=^# 二、|\Z)', _SECTION_FLAGS)
_PROCESS_RE = re.compile(
    r'# 二、项目整体流程介绍\s*\n(.*?)(?=^# 三、|\Z)', _SECTION_FLAGS)
_KEYPOINTS_RE = re.compile(
    r'# 三、项目案例关键技术点\s*\n(.*)\Z', _SECTION_FLAGS)


def _capture(pattern, content):
    """Return the stripped first group of *pattern* in *content*, or ""."""
    match = pattern.search(content)
    return match.group(1).strip() if match else ""


def extract_sections(content):
    """Extract the three top-level sections from a project-case body.

    Args:
        content: Full text of the case, expected to contain the headings
            ``# 一、项目概述``, ``# 二、项目整体流程介绍`` and
            ``# 三、项目案例关键技术点``.

    Returns:
        A tuple ``(overview, process, keypoints)`` of stripped strings.
        A section whose heading is missing, or whose body is empty, is
        returned as ``""``. The key-points section runs to the end of the
        text.
    """
    overview = _capture(_OVERVIEW_RE, content)
    process = _capture(_PROCESS_RE, content)
    keypoints = _capture(_KEYPOINTS_RE, content)
    return overview, process, keypoints


def main(input_path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH):
    """Extract sections for every project and save them as JSON.

    Args:
        input_path: JSON file holding a list of objects with the keys
            ``案例名称`` and ``项目案例内容``.
        output_path: Destination JSON file for the extracted records.
    """
    # Load the raw project list.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Build one complete record per project; ids are 1-based positions.
    complete_projects = []
    for project_id, project in enumerate(data, start=1):
        name = project['案例名称']
        overview, process, keypoints = extract_sections(project['项目案例内容'])

        complete_projects.append({
            'id': project_id,
            'name': name,
            'overview': overview,
            'process': process,
            'keypoints': keypoints,
        })

        # Per-project summary so empty extractions are easy to spot.
        print(f"项目 {project_id}: {name}")
        print(f" 概述长度: {len(overview)}")
        print(f" 流程长度: {len(process)}")
        print(f" 技术点长度: {len(keypoints)}")

    # Persist the extracted data, keeping non-ASCII text readable.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(complete_projects, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 已提取 {len(complete_projects)} 个项目的完整数据到 {output_path}")


if __name__ == "__main__":
    main()