Files
DDCZ/scripts/csvErrorReport.js

158 lines
5.9 KiB
JavaScript
Raw Normal View History

const fs = require('fs');
const path = require('path');
/**
 * Parse CSV text into header-keyed records, collecting malformed rows separately.
 *
 * The first non-empty row is used as the header. Every later row whose field
 * count equals the header's becomes an object in `data`; every other row is
 * described in `skipped` so it can be reported to the user.
 *
 * Parsing rules:
 * - Fields are separated by `,`; rows end at `\n`, `\r` or `\r\n`.
 * - Double quotes toggle quoted mode, in which commas and line breaks are
 *   literal; `""` inside a quoted field yields one `"`.
 * - Every field is trimmed, including quoted ones.
 * - Completely empty lines are ignored.
 *
 * @param {string} content - Raw CSV text.
 * @returns {{data: Object[], skipped: Object[]}} `data` holds the well-formed
 *   rows keyed by header name. Each `skipped` entry has `lineNum` (1-based
 *   physical line on which the row starts), `expectedFields`, `actualFields`,
 *   `companyName` (best-effort extraction, `'未知'` when none is found) and
 *   `firstField` (first 80 characters of the row's first field).
 */
function parseCSV(content) {
  // Matches the shortest run of non-separator characters ending in a
  // "...有限公司" style suffix. Every alternative contains "有限公司", so a
  // separate substring pre-check is unnecessary.
  const companyNamePattern = /([^,,。;]+?(有限公司|股份有限公司|集团有限公司|科技有限公司))/;

  const data = [];
  const skipped = [];
  let headers = null;
  let currentRow = [];
  let currentField = '';
  let inQuotes = false;
  // Physical line currently being read; counts line breaks inside quotes too.
  let lineNum = 1;
  // Physical line on which the row being accumulated started.
  let rowStartLine = 1;

  // Returns the first company name found in any field, or '未知'.
  const extractCompanyName = (fields) => {
    for (const field of fields) {
      const match = field.match(companyNamePattern);
      if (match) {
        return match[1];
      }
    }
    return '未知';
  };

  // Finishes the current row: header, valid record, or skipped entry.
  const commitRow = () => {
    currentRow.push(currentField.trim());
    if (!headers) {
      // Applies to the final row as well, so a header-only file without a
      // trailing newline is not misreported as a malformed record.
      headers = currentRow;
    } else if (currentRow.length === headers.length) {
      const row = {};
      headers.forEach((header, index) => {
        row[header] = currentRow[index];
      });
      data.push(row);
    } else {
      skipped.push({
        lineNum: rowStartLine,
        expectedFields: headers.length,
        actualFields: currentRow.length,
        companyName: extractCompanyName(currentRow),
        firstField: currentRow[0] ? currentRow[0].substring(0, 80) : ''
      });
    }
    currentRow = [];
    currentField = '';
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const nextChar = content[i + 1];

    if (char === '"') {
      if (inQuotes && nextChar === '"') {
        // Escaped quote inside a quoted field.
        currentField += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      currentRow.push(currentField.trim());
      currentField = '';
    } else if ((char === '\n' || char === '\r') && !inQuotes) {
      if (currentField || currentRow.length > 0) {
        commitRow();
      }
      // Treat CRLF as a single line break.
      if (char === '\r' && nextChar === '\n') {
        i++;
      }
      lineNum++;
      rowStartLine = lineNum;
    } else {
      // A line break here can only occur inside quotes: keep it as field
      // content, but still advance the physical line counter (once per CRLF)
      // so later rows are reported on the line where they really are.
      if (char === '\n' || (char === '\r' && nextChar !== '\n')) {
        lineNum++;
      }
      currentField += char;
    }
  }

  // Final row when the input does not end with a line break.
  if (currentField || currentRow.length > 0) {
    commitRow();
  }

  return { data, skipped };
}
// Report script: parses ../公司介绍.csv (relative to this file) with parseCSV
// and prints parse statistics, the malformed rows grouped by company name,
// and repair suggestions to stdout.
console.log('正在分析公司介绍.csv文件...\n');
const content = fs.readFileSync(path.join(__dirname, '..', '公司介绍.csv'), 'utf-8');
const result = parseCSV(content);

console.log('========== 解析结果统计 ==========');
console.log(`✅ 成功解析: ${result.data.length} 条记录`);
console.log(`❌ 格式错误被跳过: ${result.skipped.length} 条记录`);
console.log(`📊 总计: ${result.data.length + result.skipped.length}\n`);

if (result.skipped.length > 0) {
  console.log('========== 格式错误的记录列表 ==========\n');

  // Group skipped rows by extracted company name. A Map keeps first-seen
  // order and cannot collide with object prototype keys.
  const grouped = new Map();
  for (const item of result.skipped) {
    if (!grouped.has(item.companyName)) {
      grouped.set(item.companyName, []);
    }
    grouped.get(item.companyName).push(item);
  }

  let position = 0;
  for (const [companyName, items] of grouped) {
    position += 1;
    console.log(`${position}. 企业名称: ${companyName}`);
    console.log(` 错误记录数: ${items.length}`);
    // Use the header width recorded by the parser instead of a fixed number,
    // so the report stays correct if the CSV's column count changes.
    console.log(` 期望字段数: ${items[0].expectedFields}`);
    // Only the first row of each group is shown as a sample.
    console.log(` 实际字段数: ${items[0].actualFields}`);
    console.log(` 首个字段内容: ${items[0].firstField}...`);
    console.log('');
  }

  console.log('\n========== 修复建议 ==========');
  console.log('1. 检查CSV文件中每条企业记录是否有完整的5个字段:');
  console.log(' - ✅企业名称');
  console.log(' - ✅企业类型');
  console.log(' - ✅地区');
  console.log(' - ✅企业简介');
  console.log(' - ✅推荐理由');
  console.log('\n2. 确保每条记录的推荐理由字段末尾有换行符');
  console.log('\n3. 确保所有字段内容如果包含逗号、引号或换行符,必须用双引号包裹');
  console.log('\n4. 特别检查上述列表中的企业记录');
}

// Print the fixed list of companies that should be given special attention.
// NOTE(review): this list is hard-coded and is printed regardless of the
// parse result — confirm it still matches the current CSV.
console.log('\n========== 特别关注 ==========');
console.log('以下企业应该在CSV中但未被成功解析:');
console.log('- 江苏恒瑞医药股份有限公司');
console.log('- 宿迁阿特斯阳光能源科技有限公司');
console.log('\n建议: 检查这些企业所在行的前一条记录是否缺少换行符');