Files
DDCZ/scripts/checkCSV.js

118 lines
3.4 KiB
JavaScript
Raw Permalink Normal View History

const fs = require('fs');
const path = require('path');
// Path of the CSV file to check: '公司介绍.csv' in the parent directory of this script's folder.
const COMPANY_CSV = path.join(__dirname, '..', '公司介绍.csv');
// CSV parser. Per the original note, it uses the same logic as the parser in convertCSV.js.
/**
 * Parse CSV text into an array of plain objects keyed by the header row.
 *
 * Behaviour:
 * - The first non-empty row becomes the header; every later row is turned into
 *   an object mapping header name -> cell value.
 * - Cells are trimmed. A double quote toggles "quoted" mode; inside quoted mode
 *   a doubled quote ("") yields one literal quote, and commas / line breaks are
 *   kept as part of the cell.
 * - Rows are terminated by \n, \r or \r\n. Empty lines are skipped.
 * - Rows whose cell count differs from the header's are dropped silently.
 *
 * @param {string} content - Raw CSV text.
 * @returns {Object[]} One object per accepted data row, in input order.
 */
function parseCSV(content) {
  const records = [];
  let columns = null; // header cells, set by the first completed row
  let cells = []; // cells collected so far for the row being read
  let cell = ''; // text of the cell being read
  let quoted = false; // true while inside a double-quoted section

  // Close the pending cell and row: the first row becomes the header, later
  // rows are kept only when their width matches the header's.
  const commitRow = () => {
    cells.push(cell.trim());
    if (columns === null) {
      columns = cells;
    } else if (cells.length === columns.length) {
      const record = {};
      columns.forEach((column, position) => {
        record[column] = cells[position];
      });
      records.push(record);
    }
    cells = [];
    cell = '';
  };

  let pos = 0;
  while (pos < content.length) {
    const ch = content[pos];
    const lookahead = content[pos + 1];

    if (ch === '"') {
      if (quoted && lookahead === '"') {
        // Escaped quote inside a quoted section: emit one quote, skip its twin.
        cell += '"';
        pos += 1;
      } else {
        quoted = !quoted;
      }
    } else if (quoted) {
      // Delimiters and line breaks are ordinary text while quoted.
      cell += ch;
    } else if (ch === ',') {
      cells.push(cell.trim());
      cell = '';
    } else if (ch === '\n' || ch === '\r') {
      // Only commit when something was read, so blank lines are ignored.
      if (cell !== '' || cells.length > 0) {
        commitRow();
      }
      // Treat \r\n as a single line break.
      if (ch === '\r' && lookahead === '\n') {
        pos += 1;
      }
    } else {
      cell += ch;
    }

    pos += 1;
  }

  // Input that does not end with a line break still has a pending row.
  if (cell !== '' || cells.length > 0) {
    commitRow();
  }

  return records;
}
// Load the CSV file and parse it into row objects.
console.log('读取CSV文件...\n');
const content = fs.readFileSync(COMPANY_CSV, 'utf-8');
const data = parseCSV(content);
console.log(`✅ 解析到 ${data.length} 条数据记录\n`);

// Group row numbers by company name (column '企业名称', falling back to '✅企业名称').
// Rows with no name in either column are left out of the statistics.
const companyNames = new Map();
for (const [offset, row] of data.entries()) {
  const name = row['企业名称'] || row['✅企业名称'];
  if (!name) {
    continue;
  }
  // +2: offsets are zero-based and row 1 is the header.
  // NOTE(review): this matches the physical file line only if the parser skipped
  // no rows (blank / wrong width) and no quoted field spans several lines.
  const rowNumber = offset + 2;
  const seenRows = companyNames.get(name);
  if (seenRows === undefined) {
    companyNames.set(name, [rowNumber]);
  } else {
    seenRows.push(rowNumber);
  }
}
console.log(`📊 唯一企业数量: ${companyNames.size}\n`);

// Names that occur on more than one row, in first-seen order.
const duplicates = Array.from(companyNames)
  .filter(([, indices]) => indices.length > 1)
  .map(([name, indices]) => ({ name, indices }));

if (duplicates.length === 0) {
  console.log('✅ 没有发现重复的企业名称\n');
} else {
  console.log(`⚠️ 发现 ${duplicates.length} 个重复的企业名称:\n`);
  for (const { name, indices } of duplicates) {
    console.log(` "${name}" 出现 ${indices.length} 次,在数据行: ${indices.join(', ')}`);
  }
}

// Print a preview of at most the first 50 unique names, then the total.
console.log('\n📝 前50个企业名称:');
const names = [...companyNames.keys()];
for (const [position, name] of names.slice(0, 50).entries()) {
  console.log(` ${position + 1}. ${name}`);
}
if (names.length > 50) {
  console.log(` ... 还有 ${names.length - 50} 个企业`);
}
console.log(`\n总计: ${names.length} 家企业`);