初始化多多畅职企业内推平台项目
功能特性: - 3D地球动画与中国地图可视化 - 省份/城市/企业搜索功能 - 308家企业数据展示 - 响应式设计(PC端和移动端) - 企业详情页面与业务板块展示 - 官网新闻轮播图 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
117
scripts/checkCSV.js
Normal file
117
scripts/checkCSV.js
Normal file
@@ -0,0 +1,117 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// CSV文件路径
|
||||
const COMPANY_CSV = path.join(__dirname, '..', '公司介绍.csv');
|
||||
|
||||
/**
 * Parse CSV text into an array of row objects keyed by the header row
 * (same logic as the parser in convertCSV.js).
 *
 * Handles double-quoted fields, `""` escapes inside quotes, commas and line
 * breaks inside quoted fields, and `\n`, `\r` or `\r\n` record terminators.
 * Every field is trimmed. The first non-blank record becomes the header;
 * later records are kept only when their field count equals the header's,
 * otherwise they are dropped without notice.
 *
 * @param {string} content - Raw CSV text.
 * @returns {Object[]} One object per accepted data record.
 */
function parseCSV(content) {
  const data = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;

  // Finish the record being accumulated. Shared by the end-of-line and
  // end-of-input paths so both apply exactly the same rules.
  const flushRow = () => {
    // Blank line: nothing was accumulated.
    if (!currentField && currentRow.length === 0) return;

    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    }

    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (inQuotes) {
      // Commas and line breaks are plain data while inside quotes.
      currentField += char;
    } else if (char === ',') {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if (char === '\n' || char === '\r') {
      flushRow();
      // Consume the "\n" of a "\r\n" pair so it is not seen as a blank line.
      if (char === '\r' && nextChar === '\n') {
        i++;
      }
    } else {
      currentField += char;
    }
  }

  // Last record when the input does not end with a line break.
  flushRow();

  return data;
}
|
||||
|
||||
// Load the company CSV and parse it into row objects.
console.log('读取CSV文件...\n');
const content = fs.readFileSync(COMPANY_CSV, 'utf-8');
const data = parseCSV(content);

console.log(`✅ 解析到 ${data.length} 条数据记录\n`);

// Group row numbers by company name (either header spelling is accepted).
const companyNames = new Map();
data.forEach((row, index) => {
  const name = row['企业名称'] || row['✅企业名称'];
  if (!name) return;

  // +2: the header occupies row 1 and `index` is zero-based.
  // NOTE(review): this counts parsed records, so it can differ from the
  // physical file line when parseCSV dropped earlier rows.
  const rowNumber = index + 2;
  const seenAt = companyNames.get(name);
  if (seenAt) {
    seenAt.push(rowNumber);
  } else {
    companyNames.set(name, [rowNumber]);
  }
});

console.log(`📊 唯一企业数量: ${companyNames.size}\n`);

// Names that occur on more than one row.
const duplicates = Array.from(companyNames, ([name, indices]) => ({ name, indices }))
  .filter((entry) => entry.indices.length > 1);

if (duplicates.length === 0) {
  console.log('✅ 没有发现重复的企业名称\n');
} else {
  console.log(`⚠️ 发现 ${duplicates.length} 个重复的企业名称:\n`);
  for (const { name, indices } of duplicates) {
    console.log(` "${name}" 出现 ${indices.length} 次,在数据行: ${indices.join(', ')}`);
  }
}

// Print the first 50 unique company names.
console.log('\n📝 前50个企业名称:');
const names = Array.from(companyNames.keys());
const preview = names.slice(0, 50);
for (let position = 0; position < preview.length; position++) {
  console.log(` ${position + 1}. ${preview[position]}`);
}

if (names.length > 50) {
  console.log(` ... 还有 ${names.length - 50} 个企业`);
}

console.log(`\n总计: ${names.length} 家企业`);
|
||||
13
scripts/checkLines783_788.js
Normal file
13
scripts/checkLines783_788.js
Normal file
@@ -0,0 +1,13 @@
|
||||
const fs = require('fs');
const path = require('path');

// Diagnostic script: dump physical lines 783-788 of the company CSV.
// The CSV is resolved relative to this script (one directory up), like the
// other scripts in this folder, so it works from any working directory.
const CSV_PATH = path.join(__dirname, '..', '公司介绍.csv');
const FIRST_LINE = 783;
const LAST_LINE = 788;

const content = fs.readFileSync(CSV_PATH, 'utf-8');
const lines = content.split('\n');

console.log('第783-788行内容分析:\n');

for (let lineNo = FIRST_LINE; lineNo <= LAST_LINE; lineNo++) {
  const line = lines[lineNo - 1];

  // A shorter file has no such line; report it instead of throwing on
  // `undefined.length`.
  if (line === undefined) {
    console.log(`\n第${lineNo}行: (文件只有 ${lines.length} 行)`);
    continue;
  }

  console.log(`\n第${lineNo}行 (长度: ${line.length}):`);
  console.log(line.substring(0, 200) + '...');
  console.log('末尾100字符:', line.substring(line.length - 100));
}
|
||||
170
scripts/checkMissingData.js
Normal file
170
scripts/checkMissingData.js
Normal file
@@ -0,0 +1,170 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// CSV文件路径
|
||||
const BASE_DIR = path.join(__dirname, '..');
|
||||
const COMPANY_CSV = path.join(BASE_DIR, '公司介绍.csv');
|
||||
const IMAGE_CSV = path.join(BASE_DIR, '企业图片.csv');
|
||||
const BUSINESS_CSV = path.join(BASE_DIR, '企业业务板块和内推岗位.csv');
|
||||
|
||||
/**
 * Parse CSV text into an array of row objects keyed by the header row.
 *
 * Handles double-quoted fields, `""` escapes inside quotes, commas and line
 * breaks inside quoted fields, and `\n`, `\r` or `\r\n` record terminators.
 * Every field is trimmed. The first non-blank record becomes the header;
 * later records are kept only when their field count equals the header's,
 * otherwise they are dropped without notice.
 *
 * @param {string} content - Raw CSV text.
 * @returns {Object[]} One object per accepted data record.
 */
function parseCSV(content) {
  const data = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;

  // Finish the record being accumulated. Shared by the end-of-line and
  // end-of-input paths so both apply exactly the same rules.
  const flushRow = () => {
    // Blank line: nothing was accumulated.
    if (!currentField && currentRow.length === 0) return;

    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    }

    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (inQuotes) {
      // Commas and line breaks are plain data while inside quotes.
      currentField += char;
    } else if (char === ',') {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if (char === '\n' || char === '\r') {
      flushRow();
      // Consume the "\n" of a "\r\n" pair so it is not seen as a blank line.
      if (char === '\r' && nextChar === '\n') {
        i++;
      }
    } else {
      currentField += char;
    }
  }

  // Last record when the input does not end with a line break.
  flushRow();

  return data;
}
|
||||
|
||||
/**
 * Extract the headquarters city from a region description.
 * Example: "总部:苏州市 分公司:大连市" -> "苏州市".
 *
 * @param {string} region - Region text containing a "总部:" label.
 * @returns {string} City name ending in "市" (autonomous regions and special
 *   administrative regions are left as written), or '' when none is found.
 */
function extractCity(region) {
  if (!region) return '';

  // Accept the ASCII colon and the full-width colon (U+FF1A) after the label.
  const match = region.match(/总部[:\uFF1A]\s*([^\n]+)/);
  if (!match) return '';

  // Strip the province prefix BEFORE cutting at whitespace, so that
  // "江苏 无锡市" yields "无锡市" instead of an empty name.
  const withoutProvince = match[1]
    .trim()
    .replace(/^(江苏|浙江|广东|山东|河北|河南|四川|湖北|湖南|安徽|福建|陕西|辽宁)\s*/, '');

  // Keep only the first token; anything after it (e.g. "分公司:...") is
  // not part of the headquarters city.
  const firstToken = withoutProvince.split(/\s+/)[0];

  // Only a province was given: report "no city" rather than a bare "市".
  if (!firstToken) return '';

  const isSpecialRegion = firstToken.includes('自治') || firstToken.includes('特别行政区');
  if (firstToken.endsWith('市') || isSpecialRegion) {
    return firstToken;
  }
  return `${firstToken}市`;
}
|
||||
|
||||
console.log('读取CSV文件...\n');

// Parse the three input CSV files.
const companyData = parseCSV(fs.readFileSync(COMPANY_CSV, 'utf-8'));
const imageData = parseCSV(fs.readFileSync(IMAGE_CSV, 'utf-8'));
const businessData = parseCSV(fs.readFileSync(BUSINESS_CSV, 'utf-8'));

console.log(`✅ 读取 ${companyData.length} 条公司介绍数据`);
console.log(`✅ 读取 ${imageData.length} 条企业图片数据`);
console.log(`✅ 读取 ${businessData.length} 条业务板块数据\n`);

// One entry per company name; the first occurrence wins.
const companiesMap = new Map();

for (const row of companyData) {
  const name = row['企业名称'] || row['✅企业名称'];
  if (!name || companiesMap.has(name)) continue;

  const region = row['地区'] || row['✅地区'] || '';
  companiesMap.set(name, {
    name,
    city: extractCity(region),
    hasImage: false,
    hasSegment: false,
  });
}

// Flag companies that have at least one image row.
for (const row of imageData) {
  const company = companiesMap.get(row['企业名称']);
  if (company) {
    company.hasImage = true;
  }
}

// Flag companies that have at least one business-segment row.
for (const row of businessData) {
  const company = companiesMap.get(row['企业名称'] || row['✅企业名称']);
  if (company) {
    company.hasSegment = true;
  }
}

// Collect the names of companies missing each kind of data.
const noImage = [];
const noSegment = [];
const noCity = [];

for (const [name, company] of companiesMap) {
  if (!company.hasImage) noImage.push(name);
  if (!company.hasSegment) noSegment.push(name);
  if (!company.city) noCity.push(name);
}

console.log('========== 数据质量检查结果 ==========\n');

// Print one numbered section; prints nothing when the list is empty.
function printMissing(title, names) {
  if (names.length === 0) return;
  console.log(`❌ ${title} (${names.length}家):`);
  names.forEach((name, index) => {
    console.log(` ${index + 1}. ${name}`);
  });
  console.log('');
}

printMissing('缺少图片的企业', noImage);
printMissing('缺少业务板块的企业', noSegment);
printMissing('缺少城市信息的企业', noCity);

if (noImage.length + noSegment.length + noCity.length === 0) {
  console.log('✅ 所有企业数据完整!');
}
|
||||
391
scripts/convertCSV.js
Normal file
391
scripts/convertCSV.js
Normal file
@@ -0,0 +1,391 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* CSV数据转换脚本
|
||||
* 将三个CSV文件合并转换为JavaScript数据格式
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// CSV文件路径
|
||||
const BASE_DIR = path.join(__dirname, '..');
|
||||
const COMPANY_CSV = path.join(BASE_DIR, '公司介绍.csv');
|
||||
const IMAGE_CSV = path.join(BASE_DIR, '企业图片.csv');
|
||||
const BUSINESS_CSV = path.join(BASE_DIR, '企业业务板块和内推岗位.csv');
|
||||
const OUTPUT_FILE = path.join(BASE_DIR, 'js', 'data.js');
|
||||
|
||||
/**
 * Parse CSV text into an array of row objects keyed by the header row.
 *
 * Handles double-quoted fields, `""` escapes inside quotes, commas and line
 * breaks inside quoted fields, and `\n`, `\r` or `\r\n` record terminators.
 * Every field is trimmed. The first non-blank record becomes the header;
 * later records are kept only when their field count equals the header's,
 * otherwise they are dropped without notice.
 *
 * @param {string} content - Raw CSV text.
 * @returns {Object[]} One object per accepted data record.
 */
function parseCSV(content) {
  const data = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;

  // Finish the record being accumulated. Shared by the end-of-line and
  // end-of-input paths so both apply exactly the same rules.
  const flushRow = () => {
    // Blank line: nothing was accumulated.
    if (!currentField && currentRow.length === 0) return;

    currentRow.push(currentField.trim());

    if (!headers) {
      // The first record is the header row.
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      // Only records whose field count matches the header are kept.
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    }

    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        currentField += '"';
        i++;
      } else {
        // Opening or closing quote.
        inQuotes = !inQuotes;
      }
    } else if (inQuotes) {
      // Commas and line breaks are plain data while inside quotes.
      currentField += char;
    } else if (char === ',') {
      // Field separator.
      currentRow.push(currentField.trim());
      currentField = '';
    } else if (char === '\n' || char === '\r') {
      // End of record (outside quotes).
      flushRow();
      // Consume the "\n" of a "\r\n" pair so it is not seen as a blank line.
      if (char === '\r' && nextChar === '\n') {
        i++;
      }
    } else {
      currentField += char;
    }
  }

  // Last record when the input does not end with a line break.
  flushRow();

  return data;
}
|
||||
|
||||
/**
 * Extract the headquarters city from a region description.
 * Example: "总部:苏州市 分公司:大连市" -> "苏州市".
 *
 * @param {string} region - Region text containing a "总部:" label.
 * @returns {string} City name ending in "市" (autonomous regions and special
 *   administrative regions are left as written), or '' when none is found.
 */
function extractCity(region) {
  if (!region) return '';

  // Accept the ASCII colon and the full-width colon (U+FF1A) after the label.
  const match = region.match(/总部[:\uFF1A]\s*([^\n]+)/);
  if (!match) return '';

  // Strip the province prefix (e.g. "江苏无锡市" -> "无锡市") BEFORE cutting
  // at whitespace, so that "江苏 无锡市" yields "无锡市" instead of an empty
  // name.
  const withoutProvince = match[1]
    .trim()
    .replace(/^(江苏|浙江|广东|山东|河北|河南|四川|湖北|湖南|安徽|福建|陕西|辽宁)\s*/, '');

  // Keep only the first token; anything after it (e.g. "分公司:...") is
  // not part of the headquarters city.
  const firstToken = withoutProvince.split(/\s+/)[0];

  // Only a province was given: report "no city" rather than a bare "市".
  if (!firstToken) return '';

  // Make sure ordinary city names end in "市".
  const isSpecialRegion = firstToken.includes('自治') || firstToken.includes('特别行政区');
  if (firstToken.endsWith('市') || isSpecialRegion) {
    return firstToken;
  }
  return `${firstToken}市`;
}
|
||||
|
||||
/**
 * Build a company's short name by removing common legal-form suffixes while
 * keeping the core name intact (the result is never truncated).
 *
 * @param {string} fullName - Registered company name.
 * @returns {string} Name without the trailing legal-form suffix; the
 *   original name when stripping would leave nothing; '' for empty input.
 */
function generateShortName(fullName) {
  if (!fullName) return '';

  const name = fullName
    .replace(/有限责任公司$/, '')
    .replace(/股份有限公司$/, '')
    .replace(/有限公司$/, '')
    // Remove only a PARENTHESISED "(集团)" suffix, in ASCII or full-width
    // parentheses. A bare trailing "集团" is part of the core name
    // (e.g. "恒力集团") and must be kept.
    .replace(/[(\uFF08]集团[)\uFF09]$/, '')
    .trim();

  return name || fullName;
}
|
||||
|
||||
/**
 * Build a company ID from its position in the list (simplified scheme).
 *
 * @param {string} name - Company name; not used by this implementation.
 * @param {number} index - Zero-based position of the company.
 * @returns {string} "company" followed by the 1-based position, left-padded
 *   with zeros to at least three digits.
 */
function generateId(name, index) {
  const sequence = String(index + 1);
  const padded = sequence.padStart(3, '0');
  return 'company' + padded;
}
|
||||
|
||||
/**
 * Split a comma-separated tag string into a list of tags.
 * Example: "民营企业, 大型企业" -> ["民营企业", "大型企业"].
 *
 * @param {string} tagString - Tags separated by ASCII commas.
 * @returns {string[]} Trimmed, non-empty tags in their original order.
 */
function processTags(tagString) {
  if (!tagString) {
    return [];
  }

  const tags = [];
  for (const piece of tagString.split(',')) {
    const tag = piece.trim();
    if (tag) {
      tags.push(tag);
    }
  }
  return tags;
}
|
||||
|
||||
/**
 * City -> province lookup table.
 *
 * Written as province -> cities and inverted once, so each province name is
 * spelled in a single place. The four municipalities map to themselves.
 * Only the provinces listed here are covered; more can be appended.
 */
const cityToProvince = Object.fromEntries(
  [
    ['江苏省', [
      '南京市', '苏州市', '无锡市', '常州市', '徐州市', '南通市', '连云港市',
      '淮安市', '盐城市', '扬州市', '镇江市', '泰州市', '宿迁市',
    ]],
    ['浙江省', [
      '杭州市', '宁波市', '温州市', '嘉兴市', '湖州市', '绍兴市', '金华市',
      '衢州市', '舟山市', '台州市', '丽水市',
    ]],
    ['广东省', [
      '广州市', '深圳市', '珠海市', '汕头市', '佛山市', '韶关市', '湛江市',
      '肇庆市', '江门市', '茂名市', '惠州市', '梅州市', '汕尾市', '河源市',
      '阳江市', '清远市', '东莞市', '中山市', '潮州市', '揭阳市', '云浮市',
    ]],
    // Municipalities: the city is its own province-level unit.
    ['上海市', ['上海市']],
    ['北京市', ['北京市']],
    ['天津市', ['天津市']],
    ['重庆市', ['重庆市']],
    ['山东省', [
      '济南市', '青岛市', '淄博市', '枣庄市', '东营市', '烟台市', '潍坊市',
      '济宁市', '泰安市', '威海市', '日照市', '临沂市', '德州市', '聊城市',
      '滨州市', '菏泽市',
    ]],
    ['四川省', [
      '成都市', '自贡市', '攀枝花市', '泸州市', '德阳市', '绵阳市', '广元市',
      '遂宁市', '内江市', '乐山市', '南充市', '眉山市', '宜宾市', '广安市',
      '达州市', '雅安市', '巴中市', '资阳市',
    ]],
    ['安徽省', [
      '合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市',
      '安庆市', '黄山市', '滁州市', '阜阳市', '宿州市', '六安市', '亳州市',
      '池州市', '宣城市',
    ]],
    ['河北省', [
      '石家庄市', '唐山市', '秦皇岛市', '邯郸市', '邢台市', '保定市',
      '张家口市', '承德市', '沧州市', '廊坊市', '衡水市',
    ]],
    ['湖北省', [
      '武汉市', '黄石市', '十堰市', '宜昌市', '襄阳市', '鄂州市', '荆门市',
      '孝感市', '荆州市', '黄冈市', '咸宁市', '随州市',
    ]],
    ['湖南省', [
      '长沙市', '株洲市', '湘潭市', '衡阳市', '邵阳市', '岳阳市', '常德市',
      '张家界市', '益阳市', '郴州市', '永州市', '怀化市', '娄底市',
    ]],
    ['福建省', [
      '福州市', '厦门市', '莆田市', '三明市', '泉州市', '漳州市', '南平市',
      '龙岩市', '宁德市',
    ]],
    ['河南省', [
      '郑州市', '开封市', '洛阳市', '平顶山市', '安阳市', '鹤壁市', '新乡市',
      '焦作市', '濮阳市', '许昌市', '漯河市', '三门峡市', '南阳市', '商丘市',
      '信阳市', '周口市', '驻马店市',
    ]],
    ['陕西省', [
      '西安市', '铜川市', '宝鸡市', '咸阳市', '渭南市', '延安市', '汉中市',
      '榆林市', '安康市', '商洛市',
    ]],
    ['辽宁省', [
      '沈阳市', '大连市', '鞍山市', '抚顺市', '本溪市', '丹东市', '锦州市',
      '营口市', '阜新市', '辽阳市', '盘锦市', '铁岭市', '朝阳市', '葫芦岛市',
    ]],
  ].flatMap(([province, cities]) => cities.map((city) => [city, province])),
);
|
||||
|
||||
console.log('开始读取CSV文件...\n');

// Read and parse the three CSV inputs.
const companyData = parseCSV(fs.readFileSync(COMPANY_CSV, 'utf-8'));
const imageData = parseCSV(fs.readFileSync(IMAGE_CSV, 'utf-8'));
const businessData = parseCSV(fs.readFileSync(BUSINESS_CSV, 'utf-8'));

console.log(`✅ 读取 ${companyData.length} 条公司介绍数据`);
console.log(`✅ 读取 ${imageData.length} 条企业图片数据`);
console.log(`✅ 读取 ${businessData.length} 条业务板块数据\n`);

// Companies keyed by name (de-duplicated), plus the cities and provinces seen.
const companiesMap = new Map();
const citySet = new Set();
const provinceSet = new Set();

// Step 1: basic company info. The first row seen for a name wins.
for (const row of companyData) {
  const name = row['企业名称'] || row['✅企业名称'];
  if (!name || companiesMap.has(name)) continue;

  const region = row['地区'] || row['✅地区'] || '';
  const city = extractCity(region);

  if (city) {
    citySet.add(city);
    const province = cityToProvince[city];
    if (province) {
      provinceSet.add(province);
    }
  }

  // Property order is kept because it determines the generated JSON layout.
  companiesMap.set(name, {
    id: generateId(name, companiesMap.size),
    city,
    name,
    shortName: generateShortName(name),
    tags: processTags(row['企业类型'] || row['✅企业类型'] || ''),
    intro: row['企业简介'] || row['✅企业简介'] || '',
    reason: row['推荐理由'] || row['✅推荐理由'] || '',
    region,
    cover: '',
    gallery: [],
    segments: [],
  });
}

console.log(`✅ 去重后企业数量: ${companiesMap.size}\n`);

// Step 2: attach images; the first image of a company becomes its cover.
for (const row of imageData) {
  const name = row['企业名称'];
  const imagePath = row['图片路径'];
  const company = name && imagePath ? companiesMap.get(name) : undefined;
  if (!company) continue;

  company.gallery.push(imagePath);
  if (!company.cover) {
    company.cover = imagePath;
  }
}

console.log(`✅ 已合并企业图片数据\n`);

// Step 3: attach business segments and their comma-separated job lists.
for (const row of businessData) {
  const name = row['企业名称'] || row['✅企业名称'];
  const segmentName = row['业务板块/主要业务'] || row['✅业务板块/主要业务'] || '';
  const jobsString = row['关联岗位'] || row['✅关联岗位'] || '';
  const company = name && segmentName ? companiesMap.get(name) : undefined;
  if (!company) continue;

  const jobs = jobsString
    .split(',')
    .map((job) => job.trim())
    .filter((job) => job);

  company.segments.push({ name: segmentName, jobs });
}

console.log(`✅ 已合并业务板块和岗位数据\n`);

// Flatten to an array ordered by city (zh-CN collation).
const companiesArray = [...companiesMap.values()];
companiesArray.sort((a, b) => a.city.localeCompare(b.city, 'zh-CN'));

console.log('数据统计:');
console.log(` 企业总数: ${companiesArray.length}`);
console.log(` 覆盖城市: ${citySet.size}`);
console.log(` 覆盖省份: ${provinceSet.size}\n`);

// City list (for config.js).
console.log('涉及城市列表:');
const citiesArray = [...citySet].sort((a, b) => a.localeCompare(b, 'zh-CN'));
console.log(citiesArray.join(', '));
console.log('');

// Province list (for config.js).
console.log('涉及省份列表:');
const provincesArray = [...provinceSet].sort((a, b) => a.localeCompare(b, 'zh-CN'));
console.log(provincesArray.join(', '));
console.log('');

// Source text of the generated ES module.
const jsContent = `/* ===================================
企业数据 - 从CSV自动生成
生成时间: ${new Date().toLocaleString('zh-CN')}
企业数量: ${companiesArray.length}
=================================== */

export const companiesData = ${JSON.stringify(companiesArray, null, 4)};

// 城市列表 (共${citiesArray.length}个)
export const activeCities = ${JSON.stringify(citiesArray, null, 4)};

// 省份列表 (共${provincesArray.length}个)
export const activeProvinces = ${JSON.stringify(provincesArray, null, 4)};
`;

// Write the module to disk.
fs.writeFileSync(OUTPUT_FILE, jsContent, 'utf-8');

console.log(`✅ 数据已成功写入: ${OUTPUT_FILE}`);
console.log(`✅ 文件大小: ${(fs.statSync(OUTPUT_FILE).size / 1024).toFixed(2)} KB\n`);

// Data-quality summary.
const noImageCount = companiesArray.filter((company) => company.gallery.length === 0).length;
const noSegmentCount = companiesArray.filter((company) => company.segments.length === 0).length;
const noCityCount = companiesArray.filter((company) => !company.city).length;

console.log('数据质量检查:');
console.log(` 缺少图片的企业: ${noImageCount}`);
console.log(` 缺少业务板块的企业: ${noSegmentCount}`);
console.log(` 缺少城市信息的企业: ${noCityCount}`);

console.log('\n✨ 转换完成!');
|
||||
280
scripts/convertJSON.js
Normal file
280
scripts/convertJSON.js
Normal file
@@ -0,0 +1,280 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// JSON文件路径
|
||||
const BASE_DIR = path.join(__dirname, '..');
|
||||
const COMPANY_JSON = path.join(BASE_DIR, '公司介绍.json');
|
||||
const IMAGE_JSON = path.join(BASE_DIR, '企业图片.json');
|
||||
const BUSINESS_JSON = path.join(BASE_DIR, '企业业务板块和内推岗位.json');
|
||||
|
||||
/**
 * Extract the headquarters city from a region description such as
 * "总部:苏州市 分公司:大连市".
 *
 * @param {string} region - Region text containing a "总部:" label.
 * @returns {string} City name ending in "市" (autonomous regions and special
 *   administrative regions are left as written), or '' when none is found.
 */
function extractCity(region) {
  if (!region) return '';

  // Accept the ASCII colon and the full-width colon (U+FF1A) after the label.
  const match = region.match(/总部[:\uFF1A]\s*([^\n]+)/);
  if (!match) return '';

  const raw = match[1].trim();
  const MUNICIPALITIES = ['重庆', '天津', '上海', '北京'];

  // Optional province / municipality prefix, e.g. "江苏 苏州市" or "上海 上海市".
  const prefix = raw.match(
    /^(江苏|浙江|广东|山东|河北|河南|四川|湖北|湖南|安徽|福建|陕西|辽宁|重庆|天津|上海|北京)\s*/
  );
  const rest = prefix ? raw.slice(prefix[0].length) : raw;

  // Keep only the first token; anything after it (e.g. "分公司:...") is
  // not part of the headquarters city.
  let city = rest.split(/\s+/)[0];

  if (city === '' || city.startsWith('市')) {
    // Nothing usable follows the prefix. A municipality ("上海", "上海市")
    // is itself the city; a bare province name carries no city at all.
    if (!prefix || !MUNICIPALITIES.includes(prefix[1])) return '';
    city = prefix[1];
  }

  // Append "市" unless already present or this is a special region.
  if (!city.endsWith('市') && !city.includes('自治') && !city.includes('特别行政区')) {
    city += '市';
  }

  return city;
}
|
||||
|
||||
console.log('开始读取JSON文件...\n');

// Read the three JSON inputs.
const companyData = JSON.parse(fs.readFileSync(COMPANY_JSON, 'utf-8'));
const imageDataObj = JSON.parse(fs.readFileSync(IMAGE_JSON, 'utf-8'));
const businessDataObj = JSON.parse(fs.readFileSync(BUSINESS_JSON, 'utf-8'));

console.log(`✅ 读取 ${companyData.length} 条公司介绍数据`);
console.log(`✅ 读取 ${Object.keys(imageDataObj).length} 家企业图片数据`);
console.log(`✅ 读取 ${Object.keys(businessDataObj).length} 家企业业务板块数据\n`);

// City -> province lookup used for the province statistics. Defined once
// here instead of being rebuilt for every company.
// NOTE(review): cities missing from this table are not counted toward any
// province.
const CITY_TO_PROVINCE = {
  '苏州市': '江苏省', '南京市': '江苏省', '无锡市': '江苏省', '常州市': '江苏省',
  '南通市': '江苏省', '徐州市': '江苏省', '盐城市': '江苏省', '扬州市': '江苏省',
  '镇江市': '江苏省', '泰州市': '江苏省', '连云港市': '江苏省', '淮安市': '江苏省',
  '宿迁市': '江苏省', '常熟市': '江苏省',

  '杭州市': '浙江省', '宁波市': '浙江省', '温州市': '浙江省', '绍兴市': '浙江省',
  '台州市': '浙江省',

  '广州市': '广东省', '深圳市': '广东省', '东莞市': '广东省', '佛山市': '广东省',
  '惠州市': '广东省', '珠海市': '广东省',

  '合肥市': '安徽省', '芜湖市': '安徽省', '安庆市': '安徽省', '滁州市': '安徽省',
  '池州市': '安徽省', '铜陵市': '安徽省', '六安市': '安徽省',

  '石家庄市': '河北省', '唐山市': '河北省', '保定市': '河北省', '廊坊市': '河北省',
  '衡水市': '河北省',

  '长沙市': '湖南省', '岳阳市': '湖南省', '衡阳市': '湖南省',

  '武汉市': '湖北省',

  '福州市': '福建省', '厦门市': '福建省', '泉州市': '福建省', '莆田市': '福建省',
  '三明市': '福建省',

  '北京市': '北京市',
  '上海市': '上海市',
  '天津市': '天津市',
};

// Companies keyed by name; the first row seen for a name wins.
const companiesMap = new Map();

companyData.forEach((row) => {
  const name = row['✅企业名称'];
  if (!name || companiesMap.has(name)) return;

  const region = row['✅地区'] || '';
  const city = extractCity(region);
  const type = row['✅企业类型'] || '';
  const description = row['✅企业简介'] || '';

  // Short name: drop the legal-form suffix; a trailing "集团" is kept.
  const shortName = name
    .replace(/股份有限公司$/, '')
    .replace(/有限责任公司$/, '')
    .replace(/有限公司$/, '');

  // Tags come from the comma-separated company type.
  const tags = type.split(',').map((t) => t.trim()).filter((t) => t);

  // Intro is the first 100 characters of the description.
  const intro = description.length > 100 ? description.substring(0, 100) + '...' : description;

  companiesMap.set(name, {
    name,
    shortName,
    type,
    tags,
    region,
    city,
    description,
    intro,
    reason: row['✅推荐理由'] || '', // field name the front end expects
    gallery: [], // field name the front end expects
    cover: '', // set while merging images
    segments: [],
  });
});

console.log(`✅ 去重后企业数量: ${companiesMap.size}\n`);

// Merge images: de-duplicated gallery, first image becomes the cover.
Object.keys(imageDataObj).forEach((companyName) => {
  if (!companiesMap.has(companyName)) return;

  const company = companiesMap.get(companyName);
  const images = imageDataObj[companyName];
  if (!Array.isArray(images)) return;

  images.forEach((imageUrl) => {
    if (imageUrl && !company.gallery.includes(imageUrl)) {
      company.gallery.push(imageUrl);
    }
  });

  if (company.gallery.length > 0) {
    company.cover = company.gallery[0];
  }
});

console.log('✅ 已合并企业图片数据\n');

// Merge business segments and their jobs.
Object.keys(businessDataObj).forEach((companyName) => {
  if (!companiesMap.has(companyName)) return;

  const company = companiesMap.get(companyName);
  const segments = businessDataObj[companyName];
  if (!Array.isArray(segments)) return;

  segments.forEach((item) => {
    const segment = item['业务板块'] || '';
    const positions = item['关联岗位'] || [];

    if (segment || positions.length > 0) {
      company.segments.push({
        name: segment, // front end expects `name`
        jobs: positions, // front end expects `jobs`
      });
    }
  });
});

console.log('✅ 已合并业务板块和岗位数据\n');

const companies = Array.from(companiesMap.values());

// Pin specific Suzhou companies: 恒力 and 亨通 must come before 阿特斯.
const suzhouPriority = {
  '恒力集团有限公司': 1,
  '亨通集团有限公司': 2,
  '阿特斯阳光电力集团股份有限公司': 3,
};

companies.sort((a, b) => {
  const aPriority = suzhouPriority[a.name];
  const bPriority = suzhouPriority[b.name];

  // Both pinned: order by priority.
  if (aPriority && bPriority) {
    return aPriority - bPriority;
  }

  // Only one pinned: it goes first.
  if (aPriority) return -1;
  if (bPriority) return 1;

  // Neither pinned: keep the existing relative order.
  return 0;
});

// Statistics: distinct cities and the provinces they map to.
const cities = new Set();
const provinces = new Set();

companies.forEach((company) => {
  if (!company.city) return;

  cities.add(company.city);

  const province = CITY_TO_PROVINCE[company.city];
  if (province) {
    provinces.add(province);
  }
});

// Sort once; the same lists feed both the console report and the output file.
const sortedCities = Array.from(cities).sort();
const sortedProvinces = Array.from(provinces).sort();

console.log('数据统计:');
console.log(` 企业总数: ${companies.length}`);
console.log(` 覆盖城市: ${cities.size}`);
console.log(` 覆盖省份: ${provinces.size}\n`);

console.log('涉及城市列表:');
console.log(sortedCities.join(', ') + '\n');

console.log('涉及省份列表:');
console.log(sortedProvinces.join(', ') + '\n');

// Source text of the generated data.js.
const dataJsContent = `// 自动生成的数据文件
// 生成时间: ${new Date().toLocaleString('zh-CN')}

const companies = ${JSON.stringify(companies, null, 2)};

const cities = ${JSON.stringify(sortedCities, null, 2)};

const provinces = ${JSON.stringify(sortedProvinces, null, 2)};

// 为ES6模块导出添加别名
const activeCities = cities;
const activeProvinces = provinces;
const companiesData = companies;

// ES6模块导出
export { companies, cities, provinces, activeCities, activeProvinces, companiesData };

// CommonJS导出(向后兼容)
if (typeof module !== 'undefined' && module.exports) {
module.exports = { companies, cities, provinces, activeCities, activeProvinces, companiesData };
}
`;

const outputPath = path.join(BASE_DIR, 'js', 'data.js');
fs.writeFileSync(outputPath, dataJsContent, 'utf-8');

const fileSize = (fs.statSync(outputPath).size / 1024).toFixed(2);
console.log(`✅ 数据已成功写入: ${outputPath}`);
console.log(`✅ 文件大小: ${fileSize} KB\n`);

// Data-quality check.
const noImage = companies.filter((c) => c.gallery.length === 0);
const noSegment = companies.filter((c) => c.segments.length === 0);
const noCity = companies.filter((c) => !c.city);

console.log('数据质量检查:');
console.log(` 缺少图片的企业: ${noImage.length}`);
console.log(` 缺少业务板块的企业: ${noSegment.length}`);
console.log(` 缺少城市信息的企业: ${noCity.length}\n`);

// Names are listed only when there are at most 10 of them.
if (noImage.length > 0 && noImage.length <= 10) {
  console.log('缺少图片的企业:');
  noImage.forEach((c) => console.log(` - ${c.name}`));
  console.log('');
}

if (noSegment.length > 0 && noSegment.length <= 10) {
  console.log('缺少业务板块的企业:');
  noSegment.forEach((c) => console.log(` - ${c.name}`));
  console.log('');
}

if (noCity.length > 0 && noCity.length <= 10) {
  console.log('缺少城市信息的企业:');
  noCity.forEach((c) => console.log(` - ${c.name}`));
  console.log('');
}

console.log('✨ 转换完成!');
|
||||
157
scripts/csvErrorReport.js
Normal file
157
scripts/csvErrorReport.js
Normal file
@@ -0,0 +1,157 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Parse CSV text and report the records that had to be skipped.
 *
 * Parsing rules: double-quoted fields, `""` escapes inside quotes, commas and
 * line breaks inside quoted fields, `\n`, `\r` or `\r\n` record terminators,
 * every field trimmed. The first non-blank record is the header. A later
 * record is accepted when its field count equals the header's; otherwise it
 * is added to `skipped`.
 *
 * @param {string} content - Raw CSV text.
 * @returns {{data: Object[], skipped: Object[]}} `data` holds the accepted
 *   rows keyed by header. Each `skipped` entry has `lineNum` (physical file
 *   line on which the record starts), `expectedFields`, `actualFields`,
 *   `companyName` (best guess, '未知' when none is found) and `firstField`
 *   (first 80 characters of the first field).
 */
function parseCSV(content) {
  const data = [];
  const skipped = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;
  // Physical line being read. Line breaks inside quoted fields are counted
  // too, so the number matches what an editor shows for the file.
  let lineNum = 1;
  // Physical line on which the record being accumulated started.
  let rowStartLine = 1;

  // Best-effort company name: the first field that looks like a company
  // name, cut at the first legal-form suffix. Commas, semicolons (ASCII or
  // full-width) and "。" end a candidate name.
  const guessCompanyName = (fields) => {
    for (const field of fields) {
      if (field.includes('有限公司') || field.includes('股份') || field.includes('集团')) {
        const match = field.match(
          /([^,\uFF0C。;\uFF1B]+?(有限公司|股份有限公司|集团有限公司|科技有限公司))/
        );
        if (match) return match[1];
      }
    }
    return '未知';
  };

  // Finish the record being accumulated. Shared by the end-of-line and
  // end-of-input paths so a header at end of input is treated as a header,
  // not as a malformed row.
  const flushRow = () => {
    // Blank line: nothing was accumulated.
    if (!currentField && currentRow.length === 0) return;

    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    } else {
      skipped.push({
        lineNum: rowStartLine,
        expectedFields: headers.length,
        actualFields: currentRow.length,
        companyName: guessCompanyName(currentRow),
        firstField: currentRow[0] ? currentRow[0].substring(0, 80) : '',
      });
    }

    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (inQuotes) {
      // Line break inside a quoted field: data, but still a new file line.
      // "\r\n" is counted once, on the "\n".
      if (char === '\n' || (char === '\r' && nextChar !== '\n')) {
        lineNum++;
      }
      currentField += char;
    } else if (char === ',') {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if (char === '\n' || char === '\r') {
      flushRow();
      // Consume the "\n" of a "\r\n" pair so it counts as one line break.
      if (char === '\r' && nextChar === '\n') {
        i++;
      }
      lineNum++;
      rowStartLine = lineNum;
    } else {
      currentField += char;
    }
  }

  // Last record when the input does not end with a line break.
  flushRow();

  return { data, skipped };
}
|
||||
|
||||
// Entry point: parse 公司介绍.csv (one directory above this script) and print
// a report of the rows whose field count does not match the header.
console.log('正在分析公司介绍.csv文件...\n');

const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const result = parseCSV(content);

console.log('========== 解析结果统计 ==========');
console.log(`✅ 成功解析: ${result.data.length} 条记录`);
console.log(`❌ 格式错误被跳过: ${result.skipped.length} 条记录`);
console.log(`📊 总计: ${result.data.length + result.skipped.length} 条\n`);

if (result.skipped.length > 0) {
  console.log('========== 格式错误的记录列表 ==========\n');

  // Group the skipped rows by guessed company name, keeping first-seen order.
  const grouped = new Map();
  result.skipped.forEach(item => {
    if (!grouped.has(item.companyName)) {
      grouped.set(item.companyName, []);
    }
    grouped.get(item.companyName).push(item);
  });

  [...grouped.entries()].forEach(([companyName, items], index) => {
    console.log(`${index + 1}. 企业名称: ${companyName}`);
    console.log(` 错误记录数: ${items.length}`);
    // Report the header width the parser actually saw rather than a fixed 5.
    console.log(` 期望字段数: ${items[0].expectedFields}`);
    console.log(` 实际字段数: ${items[0].actualFields}`);
    console.log(` 首个字段内容: ${items[0].firstField}...`);
    console.log('');
  });

  console.log('\n========== 修复建议 ==========');
  console.log('1. 检查CSV文件中每条企业记录是否有完整的5个字段:');
  console.log(' - ✅企业名称');
  console.log(' - ✅企业类型');
  console.log(' - ✅地区');
  console.log(' - ✅企业简介');
  console.log(' - ✅推荐理由');
  console.log('\n2. 确保每条记录的推荐理由字段末尾有换行符');
  console.log('\n3. 确保所有字段内容如果包含逗号、引号或换行符,必须用双引号包裹');
  console.log('\n4. 特别检查上述列表中的企业记录');
}

// Companies that are expected in the CSV but are known not to parse.
console.log('\n========== 特别关注 ==========');
console.log('以下企业应该在CSV中但未被成功解析:');
console.log('- 江苏恒瑞医药股份有限公司');
console.log('- 宿迁阿特斯阳光能源科技有限公司');
console.log('\n建议: 检查这些企业所在行的前一条记录是否缺少换行符');
|
||||
105
scripts/debugParse.js
Normal file
105
scripts/debugParse.js
Normal file
@@ -0,0 +1,105 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Debug variant of the CSV parser: parses `content` and logs the header, every
 * row whose field count differs from the header's, and the outcome of the
 * final row when the text does not end with a line break.
 *
 * Double quotes delimit fields that may contain commas or newlines; `""`
 * inside a quoted field is an escaped quote. Fields are trimmed.
 *
 * @param {string} content - Raw CSV text.
 * @returns {Object[]} One object per well-formed row, keyed by header name.
 */
function parseCSV(content) {
  const data = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;

  // Records the header row and logs it.
  const adoptHeaders = (fields) => {
    headers = fields;
    console.log('表头字段数:', headers.length);
    console.log('表头:', headers);
  };

  // Builds a header-keyed object from one row of fields.
  const toRecord = (fields) => {
    const row = {};
    headers.forEach((header, index) => {
      row[header] = fields[index];
    });
    return row;
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if ((char === '\n' || char === '\r') && !inQuotes) {
      if (currentField || currentRow.length > 0) {
        currentRow.push(currentField.trim());

        if (!headers) {
          adoptHeaders(currentRow);
        } else if (currentRow.length === headers.length) {
          data.push(toRecord(currentRow));
        } else {
          console.log(`⚠️ 跳过字段数不匹配的行: 期望${headers.length}个字段,实际${currentRow.length}个`);
          console.log(' 字段内容:', currentRow.map((f, idx) => `[${idx}] ${f.substring(0, 30)}...`).join(', '));
        }

        currentRow = [];
        currentField = '';
      }
      if (char === '\r' && nextChar === '\n') {
        i++; // treat CRLF as a single line break
      }
    } else {
      currentField += char;
    }
  }

  // Last row when the text does not end with a line break.
  if (currentField || currentRow.length > 0) {
    currentRow.push(currentField.trim());
    console.log('\n最后一行字段数:', currentRow.length);
    if (!headers) {
      // The only row of the file is the header, not a failed record.
      adoptHeaders(currentRow);
    } else if (currentRow.length === headers.length) {
      data.push(toRecord(currentRow));
      console.log('✅ 最后一行被成功解析');
    } else {
      console.log('❌ 最后一行字段数不匹配,被跳过');
      console.log(' 期望字段数:', headers.length);
      console.log(' 实际字段数:', currentRow.length);
      console.log(' 第1个字段:', currentRow[0].substring(0, 50));
    }
  }

  return data;
}
|
||||
|
||||
// Entry point: parse 公司介绍.csv, list the last ten parsed companies and
// check whether two specific companies made it through the parser.
const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const data = parseCSV(content);

// The name column is looked up under either of two header spellings.
const companyNameOf = (row) => row['企业名称'] || row['✅企业名称'];

console.log('\n解析结果统计:');
console.log('总记录数:', data.length);

console.log('\n最后10家企业:');
// Clamp the start so the numbering stays 1-based with fewer than 10 records.
const tailStart = Math.max(data.length - 10, 0);
data.slice(tailStart).forEach((row, index) => {
  console.log(` ${tailStart + index + 1}. ${companyNameOf(row)}`);
});

// Look for the two companies of interest.
const hengrui = data.find(row => {
  const name = companyNameOf(row);
  return name && name.includes('江苏恒瑞医药');
});

const ates = data.find(row => {
  const name = companyNameOf(row);
  return name && name.includes('宿迁阿特斯');
});

console.log('\n查找结果:');
console.log('江苏恒瑞医药:', hengrui ? '✅ 找到' : '❌ 未找到');
console.log('宿迁阿特斯:', ates ? '✅ 找到' : '❌ 未找到');
|
||||
104
scripts/deleteCompanies.js
Normal file
104
scripts/deleteCompanies.js
Normal file
@@ -0,0 +1,104 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Companies to remove; each name is compared with the first column of a row.
// Frozen so the shared list cannot be modified by accident.
const companiesToDelete = Object.freeze([
  '福建省长乐市金磊纺织有限公司',
  '福建顺大运动品有限公司',
  '福建鑫海冶金有限公司',
  '河北财商商贸有限公司',
  '湖南德元顺生物科技有限公司',
  '湖南福陆特科技发展有限公司',
  '湖南航祥机电科技有限公司',
  '湖南中邦恒盛医药有限公司',
  '石家庄北国人百集团有限责任公司',
  '皙悦(天津)文旅产业发展有限公司'
]);

// CSV files to clean, given relative to this script's directory.
const files = Object.freeze([
  '../公司介绍.csv',
  '../企业业务板块和内推岗位.csv',
  '../企业图片.csv'
]);
|
||||
|
||||
/**
 * Splits one CSV record into its fields.
 *
 * Commas outside double quotes separate fields. A double quote toggles quoted
 * mode, and `""` inside a quoted field produces one literal quote. Field text
 * is returned as-is (not trimmed); quote delimiters are removed.
 *
 * @param {string} line - A single CSV record.
 * @returns {string[]} The fields; always at least one element.
 */
function parseCSVLine(line) {
  const fields = [];
  let buffer = '';
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];

    if (ch === '"') {
      if (quoted && line[pos + 1] === '"') {
        // Escaped quote: emit one quote and skip both characters.
        buffer += '"';
        pos += 2;
        continue;
      }
      quoted = !quoted;
    } else if (ch === ',' && !quoted) {
      fields.push(buffer);
      buffer = '';
    } else {
      buffer += ch;
    }
    pos++;
  }

  fields.push(buffer);
  return fields;
}
|
||||
|
||||
// Rewrite each CSV file in place without the rows of the listed companies.
files.forEach(file => {
  const filePath = path.join(__dirname, file);

  if (!fs.existsSync(filePath)) {
    console.log(`⚠️ 文件不存在: ${file}`);
    return;
  }

  console.log(`\n📄 处理文件: ${file}`);

  const content = fs.readFileSync(filePath, 'utf-8');

  // Split into records on line breaks that are outside double quotes, so a
  // quoted field containing newlines stays inside its record. An escaped
  // quote ("") toggles the flag twice and leaves the state unchanged.
  const records = [];
  let recordStart = 0;
  let inQuotes = false;
  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    if (char === '"') {
      inQuotes = !inQuotes;
    } else if (char === '\n' && !inQuotes) {
      records.push(content.slice(recordStart, i));
      recordStart = i + 1;
    }
  }
  records.push(content.slice(recordStart));

  // Keep the header row; drop a trailing CR so the output, which is joined
  // with '\n', does not mix line endings.
  const header = records[0].replace(/\r$/, '');

  const keptRecords = [];
  let deletedCount = 0;

  for (const rawRecord of records.slice(1)) {
    const record = rawRecord.trim();
    if (!record) continue; // skip empty records

    // The first column holds the company name.
    const companyName = parseCSVLine(record)[0].trim();

    if (companiesToDelete.includes(companyName)) {
      console.log(` ❌ 删除: ${companyName}`);
      deletedCount++;
    } else {
      keptRecords.push(record);
    }
  }

  // Write the filtered content back over the original file.
  const newContent = [header, ...keptRecords].join('\n');
  fs.writeFileSync(filePath, newContent, 'utf-8');

  console.log(` ✅ 完成: 删除了 ${deletedCount} 家企业`);
});

console.log('\n\n🎉 所有文件处理完成!');
|
||||
198
scripts/fixCSV.js
Normal file
198
scripts/fixCSV.js
Normal file
@@ -0,0 +1,198 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Flattens CSV text into one list of trimmed field values, ignoring row
 * boundaries.
 *
 * A field terminated by a comma is always kept, even when empty. A field
 * terminated by an unquoted line break or by the end of input is kept only
 * when it is non-blank. Double quotes delimit fields that may contain commas
 * or newlines; `""` inside a quoted field is an escaped quote.
 *
 * @param {string} content - Raw CSV text.
 * @returns {string[]} Every collected field in reading order.
 */
function parseCSVToRawFields(content) {
  const allFields = [];
  let pending = '';
  let quoted = false;

  // Stores the pending field if it has non-blank text, then clears it.
  const emitIfNotBlank = () => {
    const value = pending.trim();
    if (value) {
      allFields.push(value);
    }
    pending = '';
  };

  for (let pos = 0; pos < content.length; pos++) {
    const ch = content[pos];
    const following = content[pos + 1];

    if (ch === '"') {
      if (quoted && following === '"') {
        pending += '"';
        pos++;
      } else {
        quoted = !quoted;
      }
    } else if (!quoted && ch === ',') {
      allFields.push(pending.trim());
      pending = '';
    } else if (!quoted && (ch === '\n' || ch === '\r')) {
      emitIfNotBlank();
      if (ch === '\r' && following === '\n') {
        pos++; // treat CRLF as a single line break
      }
    } else {
      pending += ch;
    }
  }

  emitIfNotBlank();
  return allFields;
}
|
||||
|
||||
/**
 * Heuristic: does this field look like a company name?
 *
 * True when the text contains '有限公司' or '股份公司', or ends with '集团'.
 * (The longer suffixes '股份有限公司', '集团有限公司' and '科技有限公司' all
 * contain '有限公司', so they need no separate check.)
 *
 * @param {string} field - Field text; may be empty or undefined.
 * @returns {boolean} Always a strict boolean, `false` for empty input.
 */
function looksLikeCompanyName(field) {
  if (!field) {
    return false;
  }
  return field.includes('有限公司') || field.includes('股份公司') || field.endsWith('集团');
}
|
||||
|
||||
/**
 * Heuristic: does this field look like a company-type value?
 *
 * True when the text contains any of the known type labels.
 *
 * @param {string} field - Field text; may be empty or undefined.
 * @returns {boolean} Always a strict boolean, `false` for empty input.
 */
function looksLikeCompanyType(field) {
  if (!field) {
    return false;
  }
  const typeLabels = ['国有企业', '民营企业', '外资企业', '中外合资', '大型企业', '中小企业'];
  return typeLabels.some(label => field.includes(label));
}
|
||||
|
||||
/**
 * Heuristic: does this field look like region information?
 *
 * True when the text contains '总部:' or '分公司:' anywhere (which also
 * covers text that starts with '总部:').
 *
 * @param {string} field - Field text; may be empty or undefined.
 * @returns {boolean} Always a strict boolean, `false` for empty input.
 */
function looksLikeRegion(field) {
  if (!field) {
    return false;
  }
  return field.includes('总部:') || field.includes('分公司:');
}
|
||||
|
||||
/**
 * Regroups a flat list of CSV fields into 5-field records.
 *
 * The first five fields are taken as the header. In the remainder, a field
 * that looks like a company name starts a record; the following fields fill
 * company type, region, introduction and recommendation in that order. A slot
 * whose candidate field does not fit (or does not exist because the input
 * ended) is filled with ''. A region-like field found between records is
 * attached to the previous record's region slot; other stray fields are
 * dropped.
 *
 * @param {string[]} allFields - Flat field list, header fields first.
 * @returns {{headers: string[], records: string[][]}} Header fields and the
 *   rebuilt records, each exactly five fields long.
 */
function reconstructRecords(allFields) {
  const FIELD_COUNT = 5;
  const headers = allFields.slice(0, FIELD_COUNT);
  const dataFields = allFields.slice(FIELD_COUNT);
  const records = [];

  let i = 0;

  // Consumes dataFields[i] when it exists and `accept` approves it; otherwise
  // yields '' without consuming anything. Padding also at the end of the input
  // keeps a trailing, partly filled record instead of dropping it.
  const takeIf = (accept) => {
    if (i < dataFields.length && accept(dataFields[i])) {
      const value = dataFields[i];
      i++;
      return value;
    }
    return '';
  };
  const isNotCompanyName = (field) => !looksLikeCompanyName(field);

  while (i < dataFields.length) {
    const field = dataFields[i];

    if (looksLikeCompanyName(field)) {
      i++;
      const companyType = takeIf(looksLikeCompanyType);
      const region = takeIf(looksLikeRegion);
      const introduction = takeIf(isNotCompanyName); // usually long text
      const recommendation = takeIf(isNotCompanyName); // usually long text
      records.push([field, companyType, region, introduction, recommendation]);
      continue;
    }

    // Not a record start: a region-like field probably belongs to the
    // previous record, so append it to that record's region slot.
    if (records.length > 0 && looksLikeRegion(field)) {
      const lastRecord = records[records.length - 1];
      lastRecord[2] = lastRecord[2] ? `${lastRecord[2]}\n${field}` : field;
    }
    i++;
  }

  return { headers, records };
}
|
||||
|
||||
/**
 * Formats one value for output as a CSV field.
 *
 * A value containing a comma, double quote, LF or CR is wrapped in double
 * quotes with every embedded quote doubled; other values are returned
 * unchanged. Empty/falsy values become ''.
 *
 * @param {string} field - Field text.
 * @returns {string} The CSV-safe representation.
 */
function escapeCSVField(field) {
  if (!field) return '';

  const needsQuoting = /[",\n\r]/.test(field);
  return needsQuoting ? `"${field.replace(/"/g, '""')}"` : field;
}
|
||||
|
||||
// Entry point: rebuild 公司介绍.csv into 5-field records and write the result
// to 公司介绍_fixed.csv next to it. The original file is not modified.
console.log('正在修复CSV文件...\n');

const csvPath = path.join(__dirname, '..', '公司介绍.csv');
const content = fs.readFileSync(csvPath, 'utf-8');

console.log('步骤1: 解析CSV为原始字段...');
const allFields = parseCSVToRawFields(content);
console.log(` 提取了 ${allFields.length} 个字段\n`);

console.log('步骤2: 重构为5字段记录...');
const { headers, records } = reconstructRecords(allFields);
console.log(` 表头: ${headers.join(', ')}`);
console.log(` 成功重构 ${records.length} 条记录\n`);

console.log('步骤3: 生成修复后的CSV内容...');
// Header line first, then one line per rebuilt record.
const lines = [
  headers.map(escapeCSVField).join(','),
  ...records.map(record => record.map(escapeCSVField).join(','))
];
const fixedContent = lines.join('\n');

// Write to a new file so the result can be reviewed before replacing anything.
const fixedPath = path.join(__dirname, '..', '公司介绍_fixed.csv');
fs.writeFileSync(fixedPath, fixedContent, 'utf-8');

// The header fields are not data, so leave them out of the theoretical count.
const theoreticalCount = Math.max(allFields.length - headers.length, 0) / 5;

console.log(`✅ 修复完成!`);
console.log(` 原始记录数: ${theoreticalCount} (理论值)`);
console.log(` 修复后记录数: ${records.length}`);
console.log(` 输出文件: 公司介绍_fixed.csv\n`);

console.log('请检查修复后的文件,如果正确,可以替换原文件。');
|
||||
208
scripts/fixCSV_v2.js
Normal file
208
scripts/fixCSV_v2.js
Normal file
@@ -0,0 +1,208 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Parses CSV text and separates rows that match the header width from rows
 * that do not.
 *
 * The first non-empty row is the header (row number 0). Every later non-empty
 * row gets the next row number and goes to `goodRecords` when its field count
 * equals the header's, otherwise to `skippedRows`. Double quotes delimit
 * fields that may contain commas or newlines; `""` inside a quoted field is an
 * escaped quote. Fields are trimmed.
 *
 * @param {string} content - Raw CSV text.
 * @returns {{headers: (string[]|null), goodRecords: Object[], skippedRows: Object[]}}
 *   `headers` is null only when no row was found; record entries have the
 *   shape `{ rowNum, fields }`.
 */
function parseCSVWithSkipped(content) {
  const goodRecords = [];
  const skippedRows = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;
  let rowNum = 0;

  // Finishes the row being built and files it as header, good or skipped row.
  const closeRow = () => {
    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      goodRecords.push({ rowNum, fields: currentRow });
    } else {
      skippedRows.push({ rowNum, fields: currentRow });
    }

    currentRow = [];
    currentField = '';
    rowNum++;
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if ((char === '\n' || char === '\r') && !inQuotes) {
      if (currentField || currentRow.length > 0) {
        closeRow();
      }
      if (char === '\r' && nextChar === '\n') {
        i++; // treat CRLF as a single line break
      }
    } else {
      currentField += char;
    }
  }

  // Last row when the text does not end with a line break.
  if (currentField || currentRow.length > 0) {
    closeRow();
  }

  return { headers, goodRecords, skippedRows };
}
|
||||
|
||||
/**
 * Merges well-formed records with repaired versions of the skipped rows, in
 * row-number order.
 *
 * Repair rules for a skipped row:
 *  - too few fields: append the fields of directly following skipped rows
 *    (consecutive row numbers) until the header width is reached; keep the
 *    result, cut to the header width if it overshoots; if it stays too short,
 *    log a warning and drop the row;
 *  - too many fields: keep the first header-width fields as one record and
 *    queue the rest as a new skipped row right after it;
 *  - exactly the header width (possible for a queued remainder): keep as is.
 *
 * The arrays passed in are not modified.
 *
 * @param {Object[]} goodRecords - `{ rowNum, fields }` rows with a correct field count.
 * @param {Object[]} skippedRows - `{ rowNum, fields }` rows with a wrong field count.
 * @param {string[]} headers - Header fields; their count is the target width.
 * @returns {string[][]} The combined records as field arrays.
 */
function fixSkippedRows(goodRecords, skippedRows, headers) {
  const width = headers.length;
  const allRecords = [];
  // Work on a copy: remainders are spliced into this queue, and the caller
  // still reports skippedRows.length afterwards.
  const pending = [...skippedRows];
  let goodIndex = 0;
  let skipIndex = 0;

  while (goodIndex < goodRecords.length || skipIndex < pending.length) {
    const goodComesFirst =
      goodIndex < goodRecords.length &&
      (skipIndex >= pending.length || goodRecords[goodIndex].rowNum < pending[skipIndex].rowNum);

    if (goodComesFirst) {
      allRecords.push(goodRecords[goodIndex].fields);
      goodIndex++;
      continue;
    }

    const skipped = pending[skipIndex];

    if (skipped.fields.length < width) {
      // Too few fields: borrow from the skipped rows that directly follow.
      const combined = [...skipped.fields];
      let nextSkipIndex = skipIndex + 1;

      while (combined.length < width && nextSkipIndex < pending.length) {
        const nextSkipped = pending[nextSkipIndex];
        // NOTE(review): a queued remainder has a fractional rowNum, so this
        // equality never holds for it and it cannot borrow from later rows.
        // Kept as in the original; confirm that is intended.
        if (nextSkipped.rowNum !== skipped.rowNum + (nextSkipIndex - skipIndex)) {
          break;
        }
        combined.push(...nextSkipped.fields);
        nextSkipIndex++;
      }

      if (combined.length >= width) {
        // Exact fit, or overshoot cut down to the header width.
        allRecords.push(combined.length === width ? combined : combined.slice(0, width));
        skipIndex = nextSkipIndex;
      } else {
        console.log(`⚠️ 无法修复第${skipped.rowNum}行: 字段数${combined.length}/${headers.length}`);
        skipIndex++;
      }
    } else if (skipped.fields.length > width) {
      // Too many fields: the row probably swallowed part of the next record.
      allRecords.push(skipped.fields.slice(0, width));

      const remaining = skipped.fields.slice(width);
      if (remaining.length > 0) {
        // Queue the rest right after this row; the half step keeps it ordered
        // before the next real row.
        pending.splice(skipIndex + 1, 0, {
          rowNum: skipped.rowNum + 0.5,
          fields: remaining
        });
      }
      skipIndex++;
    } else {
      allRecords.push(skipped.fields);
      skipIndex++;
    }
  }

  return allRecords;
}
|
||||
|
||||
/**
 * Formats one value for output as a CSV field.
 *
 * A value containing a comma, double quote, LF or CR is wrapped in double
 * quotes with every embedded quote doubled; other values are returned
 * unchanged. Empty/falsy values become ''.
 *
 * @param {string} field - Field text.
 * @returns {string} The CSV-safe representation.
 */
function escapeCSVField(field) {
  if (!field) return '';

  if (!/[",\n\r]/.test(field)) {
    return field;
  }
  return `"${field.replace(/"/g, '""')}"`;
}
|
||||
|
||||
// Entry point: repair 公司介绍.csv by merging/splitting rows with a wrong field
// count, write the result to 公司介绍_fixed_v2.csv and re-parse it to verify.
console.log('正在修复CSV文件...\n');

const csvPath = path.join(__dirname, '..', '公司介绍.csv');
const content = fs.readFileSync(csvPath, 'utf-8');

console.log('步骤1: 解析CSV,区分完整记录和跳过的行...');
const { headers, goodRecords, skippedRows } = parseCSVWithSkipped(content);
if (!headers) {
  // Fail with a clear message instead of a TypeError on headers.join below.
  throw new Error(`No header row found in ${csvPath}`);
}
// Captured before the repair step so the summary reports the count as parsed.
const originalSkippedCount = skippedRows.length;
console.log(` 表头: ${headers.join(', ')}`);
console.log(` 完整记录: ${goodRecords.length}`);
console.log(` 跳过的行: ${originalSkippedCount}\n`);

console.log('步骤2: 尝试修复跳过的行...');
const allRecords = fixSkippedRows(goodRecords, skippedRows, headers);
console.log(` 修复后总记录数: ${allRecords.length}\n`);

console.log('步骤3: 生成修复后的CSV内容...');
const lines = [
  headers.map(escapeCSVField).join(','),
  ...allRecords.map(record => record.map(escapeCSVField).join(','))
];
const fixedContent = lines.join('\n');

// Write to a new file so the result can be reviewed before replacing anything.
const fixedPath = path.join(__dirname, '..', '公司介绍_fixed_v2.csv');
fs.writeFileSync(fixedPath, fixedContent, 'utf-8');

console.log(`✅ 修复完成!`);
console.log(` 原始完整记录: ${goodRecords.length}`);
console.log(` 原始跳过记录: ${originalSkippedCount}`);
console.log(` 修复后总记录: ${allRecords.length}`);
console.log(` 输出文件: 公司介绍_fixed_v2.csv\n`);

// Verification: the repaired file should parse without any skipped row.
console.log('正在验证修复后的文件...');
const verifyContent = fs.readFileSync(fixedPath, 'utf-8');
const verifyResult = parseCSVWithSkipped(verifyContent);
console.log(` 验证结果:`);
console.log(` - 完整记录: ${verifyResult.goodRecords.length}`);
console.log(` - 跳过的行: ${verifyResult.skippedRows.length}`);

if (verifyResult.skippedRows.length === 0) {
  console.log('\n✅ 修复成功!所有记录都符合格式要求。');
  console.log('请检查修复后的文件内容,如果正确,可以替换原文件。');
} else {
  console.log(`\n⚠️ 还有 ${verifyResult.skippedRows.length} 行需要手动处理。`);
}
|
||||
123
scripts/locateErrors.js
Normal file
123
scripts/locateErrors.js
Normal file
@@ -0,0 +1,123 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
/**
 * Parses CSV text and reports the physical line on which each malformed row
 * starts.
 *
 * The first non-empty row is the header. Later rows whose field count equals
 * the header's are returned in `data`; all others are described in `errors`.
 * Double quotes delimit fields that may contain commas or newlines; `""`
 * inside a quoted field is an escaped quote. Fields are trimmed. Line breaks
 * inside quoted fields and blank lines are counted, so `lineNum` matches what
 * a text editor shows.
 *
 * @param {string} content - Raw CSV text.
 * @returns {{data: Object[], errors: Object[]}} `data` holds header-keyed
 *   rows; each error has `lineNum` (1-based), `expectedFields`, `actualFields`
 *   and `fields` (each value cut to 50 characters).
 */
function parseCSVWithLineNumbers(content) {
  const data = [];
  const errors = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;
  let lineNum = 1; // physical line currently being read
  let rowStartLine = 1; // physical line on which the current row began
  let rowOpen = false;

  // Finishes the row being built and files it as header, record or error.
  const closeRow = () => {
    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    } else {
      errors.push({
        lineNum: rowStartLine,
        expectedFields: headers.length,
        actualFields: currentRow.length,
        fields: currentRow.map(f => f.substring(0, 50))
      });
    }

    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if ((char === '\n' || char === '\r') && !inQuotes) {
      if (currentField || currentRow.length > 0) {
        closeRow();
      }
      if (char === '\r' && nextChar === '\n') {
        i++; // treat CRLF as a single line break
      }
      lineNum++;
      rowOpen = false;
      continue;
    }

    if (!rowOpen) {
      rowOpen = true;
      rowStartLine = lineNum;
    }

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      currentRow.push(currentField.trim());
      currentField = '';
    } else {
      // A line break reaching this branch is inside a quoted field: it stays
      // in the field but still advances the physical line counter.
      if (char === '\n' || (char === '\r' && nextChar !== '\n')) {
        lineNum++;
      }
      currentField += char;
    }
  }

  // Last row when the text does not end with a line break.
  if (currentField || currentRow.length > 0) {
    closeRow();
  }

  return { data, errors };
}
|
||||
|
||||
// Entry point: parse 公司介绍.csv and print where each malformed row is, first
// in detail and then as a compact list ordered by line number.
console.log('正在定位CSV格式错误的行号...\n');

const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const result = parseCSVWithLineNumbers(content);

console.log('========== 错误记录详细位置 ==========\n');
console.log(`总共发现 ${result.errors.length} 条格式错误的记录\n`);

result.errors.forEach((error, index) => {
  const firstField = error.fields[0] || '';

  console.log(`${index + 1}. 行号: ${error.lineNum}`);
  console.log(` 期望字段数: ${error.expectedFields}, 实际字段数: ${error.actualFields}`);
  console.log(` 第1个字段: ${error.fields[0] || '(空)'}...`);
  if (error.fields[1]) {
    console.log(` 第2个字段: ${error.fields[1]}...`);
  }

  // Flag the two companies this investigation is about.
  if (firstField.includes('江苏恒瑞医药') || firstField.includes('宿迁阿特斯')) {
    console.log(` ⚠️ 这是用户新添加的企业!`);
  }
  console.log('');
});

console.log('\n========== 按行号排序的错误行列表 ==========');
// Sort a copy so the parser's result is not reordered in place.
[...result.errors]
  .sort((a, b) => a.lineNum - b.lineNum)
  .forEach(error => {
    const firstField = error.fields[0] || '';
    let label = '(无法识别企业名称)';
    if (firstField.includes('有限公司') || firstField.includes('股份')) {
      const match = firstField.match(/([^,,。;]+?(有限公司|股份有限公司|集团))/);
      label = match ? match[1] : '(无法提取)';
    }
    console.log(`第 ${error.lineNum} 行: ${label}`);
  });
|
||||
30
scripts/testLastLines.js
Normal file
30
scripts/testLastLines.js
Normal file
@@ -0,0 +1,30 @@
|
||||
const fs = require('fs');
|
||||
|
||||
// Entry point: show the tail of 公司介绍.csv and locate two companies by a
// plain text search over its lines.
const path = require('path');

// Resolve against this script's directory so the working directory does not matter.
const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const lines = content.split('\n');

console.log('CSV文件总行数:', lines.length);
console.log('\n最后10行:');
// The final element is left out, as before (it is empty when the file ends
// with a newline). Indexes are clamped for short files and shown 1-based, the
// same way the search results below are.
const tailStart = Math.max(lines.length - 11, 0);
const tailEnd = Math.max(lines.length - 1, 0);
lines.slice(tailStart, tailEnd).forEach((line, index) => {
  const lineNum = tailStart + index + 1;
  console.log(`第${lineNum}行 (前100字符): ${line.substring(0, 100)}...`);
});

// Report the first line that mentions each company of interest.
['江苏恒瑞医药', '宿迁阿特斯'].forEach(keyword => {
  const foundIndex = lines.findIndex(line => line.includes(keyword));
  if (foundIndex >= 0) {
    console.log(`\n✅ 找到${keyword},在第`, foundIndex + 1, '行');
    console.log('内容:', lines[foundIndex].substring(0, 150));
  } else {
    console.log(`\n❌ 未找到${keyword}`);
  }
});
|
||||
Reference in New Issue
Block a user