158 lines
5.9 KiB
JavaScript
158 lines
5.9 KiB
JavaScript
|
|
const fs = require('fs');
|
||
|
|
const path = require('path');
|
||
|
|
|
||
|
|
/**
 * Parses CSV text into header-keyed records and collects malformed rows.
 *
 * Parsing rules:
 * - The first non-blank record supplies the header names.
 * - Fields may be wrapped in double quotes; inside quotes, commas and line
 *   breaks are literal and `""` is an escaped quote character.
 * - `\n`, `\r` and `\r\n` all terminate a record when outside quotes.
 * - Every field is trimmed of surrounding whitespace.
 * - Blank lines are ignored.
 * - A leading UTF-8 byte-order mark is dropped so it does not become part of
 *   the first header name.
 *
 * A record whose field count differs from the header's is not added to
 * `data`; it is described in `skipped` instead.
 *
 * @param {string} content - Raw CSV text.
 * @returns {{
 *   data: Array<Object<string, string>>,
 *   skipped: Array<{
 *     lineNum: number,
 *     expectedFields: number,
 *     actualFields: number,
 *     companyName: string,
 *     firstField: string
 *   }>
 * }} `data` holds well-formed records keyed by header name. Each `skipped`
 *   entry reports the 1-based physical line on which the bad record starts,
 *   the header/actual field counts, a best-effort company name ('未知' when
 *   none is found) and the first 80 characters of the record's first field.
 */
function parseCSV(content) {
  // Defined once so the pattern is not rebuilt for every malformed record.
  const companyNamePattern = /([^,,。;]+?(有限公司|股份有限公司|集团有限公司|科技有限公司))/;

  const data = [];
  const skipped = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;
  // Physical line counter; also advanced by line breaks inside quoted fields
  // so reported positions match what an editor shows.
  let lineNum = 1;
  let recordStartLine = 1;

  // Best-effort lookup of a company name among the fields of a bad record.
  const findCompanyName = (fields) => {
    for (const field of fields) {
      const looksLikeCompany =
        field.includes('有限公司') || field.includes('股份') || field.includes('集团');
      if (looksLikeCompany) {
        const match = field.match(companyNamePattern);
        if (match) {
          return match[1];
        }
      }
    }
    return '未知';
  };

  // Finalises the record being accumulated: header, data row, or skipped row.
  const commitRecord = () => {
    if (!currentField && currentRow.length === 0) {
      return; // blank line: nothing to record
    }
    currentRow.push(currentField.trim());

    if (!headers) {
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    } else {
      skipped.push({
        lineNum: recordStartLine,
        expectedFields: headers.length,
        actualFields: currentRow.length,
        companyName: findCompanyName(currentRow),
        firstField: currentRow[0] ? currentRow[0].substring(0, 80) : ''
      });
    }

    currentRow = [];
    currentField = '';
  };

  // Reading a file as UTF-8 keeps a leading BOM; skip it.
  const start = content.charCodeAt(0) === 0xfeff ? 1 : 0;

  for (let i = start; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (inQuotes) {
      // Line breaks inside quotes belong to the field but still advance the
      // physical line counter; a \r\n pair is counted once, on the \n.
      if (char === '\n' || (char === '\r' && nextChar !== '\n')) {
        lineNum++;
      }
      currentField += char;
    } else if (char === ',') {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if (char === '\n' || char === '\r') {
      commitRecord();
      if (char === '\r' && nextChar === '\n') {
        i++; // consume the \n of a \r\n pair
      }
      lineNum++;
      recordStartLine = lineNum;
    } else {
      currentField += char;
    }
  }

  // Last record when the text does not end with a line break. If no header
  // has been seen yet, this record becomes the header rather than an error.
  commitRecord();

  return { data, skipped };
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
// Script body: parse ../公司介绍.csv (relative to this file) and print a
// diagnostic report of how many records parsed and which were skipped.
// ---------------------------------------------------------------------------
console.log('正在分析公司介绍.csv文件...\n');

const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const result = parseCSV(content);

// Summary counts.
console.log('========== 解析结果统计 ==========');
console.log(`✅ 成功解析: ${result.data.length} 条记录`);
console.log(`❌ 格式错误被跳过: ${result.skipped.length} 条记录`);
console.log(`📊 总计: ${result.data.length + result.skipped.length} 条\n`);

if (result.skipped.length > 0) {
  console.log('========== 格式错误的记录列表 ==========\n');

  // Group skipped records by the company name extracted for them.
  const grouped = {};
  result.skipped.forEach(item => {
    if (!grouped[item.companyName]) {
      grouped[item.companyName] = [];
    }
    grouped[item.companyName].push(item);
  });

  // One entry per company; details come from the first record of each group.
  Object.keys(grouped).forEach((companyName, index) => {
    const items = grouped[companyName];
    console.log(`${index + 1}. 企业名称: ${companyName}`);
    console.log(` 错误记录数: ${items.length}`);
    // Report the header's real field count instead of a hard-coded number.
    console.log(` 期望字段数: ${items[0].expectedFields}`);
    console.log(` 实际字段数: ${items[0].actualFields}`);
    console.log(` 首个字段内容: ${items[0].firstField}...`);
    console.log('');
  });

  // Static remediation advice for whoever maintains the CSV.
  console.log('\n========== 修复建议 ==========');
  console.log('1. 检查CSV文件中每条企业记录是否有完整的5个字段:');
  console.log(' - ✅企业名称');
  console.log(' - ✅企业类型');
  console.log(' - ✅地区');
  console.log(' - ✅企业简介');
  console.log(' - ✅推荐理由');
  console.log('\n2. 确保每条记录的推荐理由字段末尾有换行符');
  console.log('\n3. 确保所有字段内容如果包含逗号、引号或换行符,必须用双引号包裹');
  console.log('\n4. 特别检查上述列表中的企业记录');
}

// NOTE(review): this list is hard-coded and printed unconditionally; it is
// not derived from the parse result above — confirm that is intended.
console.log('\n========== 特别关注 ==========');
console.log('以下企业应该在CSV中但未被成功解析:');
console.log('- 江苏恒瑞医药股份有限公司');
console.log('- 宿迁阿特斯阳光能源科技有限公司');
console.log('\n建议: 检查这些企业所在行的前一条记录是否缺少换行符');
|