Files
DDCZ/scripts/checkMissingData.js

171 lines
4.9 KiB
JavaScript
Raw Permalink Normal View History

const fs = require('fs');
const path = require('path');
// Locations of the CSV inputs. All of them live in the parent directory of
// the directory that contains this script.
const BASE_DIR = path.join(__dirname, '..');
const inBaseDir = (fileName) => path.join(BASE_DIR, fileName);
const COMPANY_CSV = inBaseDir('公司介绍.csv');
const IMAGE_CSV = inBaseDir('企业图片.csv');
const BUSINESS_CSV = inBaseDir('企业业务板块和内推岗位.csv');
/**
 * Parses CSV text into an array of plain objects keyed by the header row.
 *
 * Behaviour:
 * - The first non-empty line is taken as the header row.
 * - Fields may be wrapped in double quotes; inside quotes, commas and line
 *   breaks are kept literally and a doubled quote ("") yields one quote.
 * - "\n", "\r" and "\r\n" all terminate a row when outside quotes.
 * - Every field (headers included) is trimmed.
 * - Empty lines are skipped, and data rows whose field count differs from
 *   the header's are silently dropped.
 *
 * @param {string} content - Full text of the CSV file.
 * @returns {Object[]} One object per accepted data row.
 */
function parseCSV(content) {
  const records = [];
  let headers = null;
  let cells = [];
  let buffer = '';
  let quoted = false;

  // Builds a record from the row currently held in `cells`.
  const buildRecord = () => {
    const record = {};
    for (const [position, header] of headers.entries()) {
      record[header] = cells[position];
    }
    return record;
  };

  let pos = 0;
  while (pos < content.length) {
    const ch = content[pos];
    const lookahead = content[pos + 1];
    pos += 1;

    if (ch === '"') {
      if (quoted && lookahead === '"') {
        // Escaped quote inside a quoted field: emit one quote, skip its twin.
        buffer += '"';
        pos += 1;
      } else {
        quoted = !quoted;
      }
      continue;
    }

    // Inside quotes every other character, separators included, is literal.
    if (quoted) {
      buffer += ch;
      continue;
    }

    if (ch === ',') {
      cells.push(buffer.trim());
      buffer = '';
      continue;
    }

    if (ch !== '\n' && ch !== '\r') {
      buffer += ch;
      continue;
    }

    // Line terminator outside quotes: close the row unless the line is empty.
    if (buffer !== '' || cells.length > 0) {
      cells.push(buffer.trim());
      if (headers === null) {
        headers = cells;
      } else if (cells.length === headers.length) {
        records.push(buildRecord());
      }
      cells = [];
      buffer = '';
    }

    // Treat "\r\n" as a single terminator.
    if (ch === '\r' && lookahead === '\n') {
      pos += 1;
    }
  }

  // Final row when the text does not end with a line terminator.
  if (buffer !== '' || cells.length > 0) {
    cells.push(buffer.trim());
    if (headers !== null && cells.length === headers.length) {
      records.push(buildRecord());
    }
  }

  return records;
}
/**
 * Extracts the headquarters city from a free-form region string.
 *
 * Looks for a "总部:" marker (ASCII or full-width colon) and takes the token
 * that follows it, up to the next space or newline. A leading province name
 * from a fixed list is stripped, and "市" is appended unless the name already
 * ends with it or contains "自治" / "特别行政区".
 *
 * @param {string} region - Raw region text, e.g. "总部: 江苏南京".
 * @returns {string} The normalised city name, or '' when no marker is found
 *   or nothing is left after stripping the province.
 */
function extractCity(region) {
  if (!region) return '';

  const match = region.match(/总部[:：]\s*([^ \n]+)/);
  if (!match) return '';

  const city = match[1]
    .trim()
    .replace(/^(江苏|浙江|广东|山东|河北|河南|四川|湖北|湖南|安徽|福建|陕西|辽宁)\s*/, '');

  // The captured token was only a province (e.g. "总部: 江苏 南京" captures
  // just "江苏"). Report "no city" rather than returning the bare suffix "市",
  // which is truthy and would hide the gap from the missing-city check.
  if (!city) return '';

  if (city.endsWith('市') || city.includes('自治') || city.includes('特别行政区')) {
    return city;
  }
  return `${city}市`;
}
// Load and parse the three CSV inputs (in this order), then report how many
// data rows each one produced.
console.log('读取CSV文件...\n');
const [companyData, imageData, businessData] = [COMPANY_CSV, IMAGE_CSV, BUSINESS_CSV].map(
  (csvPath) => parseCSV(fs.readFileSync(csvPath, 'utf-8')),
);
console.log(`✅ 读取 ${companyData.length} 条公司介绍数据`);
console.log(`✅ 读取 ${imageData.length} 条企业图片数据`);
console.log(`✅ 读取 ${businessData.length} 条业务板块数据\n`);
// Index companies by name. Rows without a name are ignored, and when a name
// appears more than once only its first row is kept.
const companiesMap = new Map();
for (const row of companyData) {
  const name = row['企业名称'] || row['✅企业名称'];
  if (!name || companiesMap.has(name)) continue;

  const region = row['地区'] || row['✅地区'] || '';
  companiesMap.set(name, {
    name,
    city: extractCity(region),
    hasImage: false,
    hasSegment: false,
  });
}
// Flag every known company that has at least one row in the image CSV.
// The name column is read under both header spellings ('企业名称' and
// '✅企业名称'), the same way the company and business CSVs are read, so image
// rows exported with the prefixed header are not silently ignored.
for (const row of imageData) {
  const name = row['企业名称'] || row['✅企业名称'];
  const company = name ? companiesMap.get(name) : undefined;
  if (company) {
    company.hasImage = true;
  }
}
// Flag every known company that has at least one row in the business-segment CSV.
for (const row of businessData) {
  const name = row['企业名称'] || row['✅企业名称'];
  const company = name ? companiesMap.get(name) : undefined;
  if (company) {
    company.hasSegment = true;
  }
}
// Collect, per category, the names of companies with missing data, keeping
// the Map's insertion order.
const namesWhere = (isMissing) =>
  [...companiesMap.entries()]
    .filter(([, company]) => isMissing(company))
    .map(([name]) => name);
const noImage = namesWhere((company) => !company.hasImage);
const noSegment = namesWhere((company) => !company.hasSegment);
const noCity = namesWhere((company) => !company.city);
console.log('========== 数据质量检查结果 ==========\n');
// One report section per category: the offending names and the heading to
// print above them. Sections with no names are skipped entirely.
const reportSections = [
  [noImage, `❌ 缺少图片的企业 (${noImage.length}家):`],
  [noSegment, `❌ 缺少业务板块的企业 (${noSegment.length}家):`],
  [noCity, `❌ 缺少城市信息的企业 (${noCity.length}家):`],
];
for (const [names, heading] of reportSections) {
  if (names.length === 0) continue;
  console.log(heading);
  for (const [position, name] of names.entries()) {
    console.log(` ${position + 1}. ${name}`);
  }
  console.log('');
}
// Nothing was missing in any category.
if (reportSections.every(([names]) => names.length === 0)) {
  console.log('✅ 所有企业数据完整!');
}