// server/utils/textRecognizer.js
import { Tensor } from 'onnxruntime-node';
import sharp from 'sharp';
import fse from 'fs-extra';
import * as path from 'path';

class TextRecognizer {
  constructor() {
    this.recSession = null;
    this.config = null;
    this.characterSet = [];
    this.debugDir = path.join(process.cwd(), 'temp', 'debug');
    this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed');
    this.logger = {
      info: (msg, ...args) => console.log(`🔤 [Recognition] ${msg}`, ...args),
      error: (msg, ...args) => console.error(`❌ [Recognition] ${msg}`, ...args),
      debug: (msg, ...args) => console.log(`🐛 [Recognition] ${msg}`, ...args),
      warn: (msg, ...args) => console.warn(`⚠️ [Recognition] ${msg}`, ...args)
    };

    // Make sure the output directories exist
    fse.ensureDirSync(this.debugDir);
    fse.ensureDirSync(this.preprocessedDir);
  }

  initialize(recSession, config) {
    this.recSession = recSession;
    this.config = config;
    this.logger.info('Text recognizer initialized');
  }

  async loadCharacterSet(keysPath) {
    try {
      const keysContent = await fse.readFile(keysPath, 'utf8');
      this.characterSet = [];
      const lines = keysContent.split('\n');

      // Use the provided character-set (keys) file
      const uniqueChars = new Set();
      for (const line of lines) {
        const trimmed = line.trim();
        // Skip empty lines and comment lines
        if (trimmed && !trimmed.startsWith('#')) {
          // Treat each line as one complete character entry
          uniqueChars.add(trimmed);
        }
      }

      this.characterSet = Array.from(uniqueChars);
      if (this.characterSet.length === 0) {
        throw new Error('Character set file is empty or malformed');
      }
      this.logger.info(`Character set loaded: ${this.characterSet.length} characters`);

      // Log character-set statistics
      const charTypes = { chinese: 0, english: 0, digit: 0, punctuation: 0, other: 0 };
      this.characterSet.forEach(char => {
        if (/[\u4e00-\u9fff]/.test(char)) {
          charTypes.chinese++;
        } else if (/[a-zA-Z]/.test(char)) {
          charTypes.english++;
        } else if (/[0-9]/.test(char)) {
          charTypes.digit++;
        } else if (/[,。!?;:""()【】《》…—·]/.test(char)) {
          charTypes.punctuation++;
        } else {
          charTypes.other++;
        }
      });
      this.logger.debug(`Character set stats: Chinese ${charTypes.chinese}, English ${charTypes.english}, digits ${charTypes.digit}, punctuation ${charTypes.punctuation}, other ${charTypes.other}`);
      this.logger.debug(`First 20 characters: ${this.characterSet.slice(0, 20).join('')}`);
    } catch (error) {
      this.logger.error('Failed to load character set', error.message);
      // Only the provided character set is supported; rethrow instead of falling back
      throw new Error(`Character set loading failed: ${error.message}`);
    }
  }

  getCharacterSetSize() {
    return this.characterSet.length;
  }

  async recognizeText(textRegionBuffer, regionIndex = 0) {
    const startTime = Date.now();
    this.logger.info(`Starting text recognition - region ${regionIndex}`);
    try {
      const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex);
      const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
      const result = this.postprocessRecognition(outputs);
      const processingTime = Date.now() - startTime;
      this.logger.info(`Recognition done - region ${regionIndex}: "${result.text}", confidence: ${result.confidence.toFixed(4)}, time: ${processingTime}ms`);
      return result;
    } catch (error) {
      this.logger.error(`Text recognition failed - region ${regionIndex}`, error);
      return { text: '', confidence: 0 };
    }
  }

  async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) {
    this.logger.debug(`Preparing recognition input - region ${regionIndex}`);
    const targetHeight = 48;
    const targetWidth = 320;             // base target width
    const finalWidth = targetWidth + 20; // final width (10 px of white padding on each side)
    const timestamp = Date.now();
    try {
      const metadata = await sharp(textRegionBuffer).metadata();
      this.logger.debug(`Original region ${regionIndex}: ${metadata.width}x${metadata.height}`);

      // Save the original cropped region image
      const originalPath = path.join(this.preprocessedDir, `region-${regionIndex}-original-${timestamp}.png`);
      await fse.writeFile(originalPath, textRegionBuffer);
      this.logger.debug(`Saved original region image: ${originalPath}`);
      // Analyze image brightness/contrast statistics
      const stats = await sharp(textRegionBuffer).grayscale().stats();
      const meanBrightness = stats.channels[0].mean;
      const stdDev = stats.channels[0].stdev;
      this.logger.debug(`Image stats - region ${regionIndex}: brightness=${meanBrightness.toFixed(1)}, contrast=${stdDev.toFixed(1)}`);

      // Adaptive preprocessing
      let processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex);

      // Save the preprocessed image (after grayscale + contrast adjustment)
      const processedPath = path.join(this.preprocessedDir, `region-${regionIndex}-processed-${timestamp}.png`);
      await fse.writeFile(processedPath, processedBuffer);
      this.logger.debug(`Saved preprocessed image: ${processedPath}`);

      // Aspect-ratio-preserving resize, with 10 px of white padding added on each side
      const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex);

      // Save the resized image
      const resizedPath = path.join(this.preprocessedDir, `region-${regionIndex}-resized-${timestamp}.png`);
      await fse.writeFile(resizedPath, resizedBuffer);
      this.logger.debug(`Saved resized image: ${resizedPath}`);

      // Build the input data at the final (padded) size
      const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight);
      this.logger.debug(`Recognition input tensor ready - region ${regionIndex}`);

      // The tensor uses the final padded width
      return new Tensor('float32', inputData, [1, 3, targetHeight, finalWidth]);
    } catch (error) {
      this.logger.error(`Failed to prepare recognition input - region ${regionIndex}`, error);
      return new Tensor('float32', new Float32Array(3 * targetHeight * finalWidth).fill(0.5), [1, 3, targetHeight, finalWidth]);
    }
  }

  // Enhanced image preprocessing driven by brightness/contrast statistics
  async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) {
    let processedBuffer = buffer;
    try {
      // Finer-grained image analysis
      const stats = await sharp(buffer)
        .grayscale()
        .stats();
      const median = stats.channels[0].median;
      const max = stats.channels[0].max;
      const min = stats.channels[0].min;
      this.logger.debug(`Region ${regionIndex}: detailed stats - median=${median}, range=${min}-${max}, mean=${meanBrightness.toFixed(1)}, stddev=${stdDev.toFixed(1)}`);

      // Choose a preprocessing strategy from the statistics
      if (meanBrightness > 220 && stdDev < 25) {
        // Bright, low-contrast image
        this.logger.debug(`Region ${regionIndex}: applying bright/low-contrast enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.8, -80)                   // stronger contrast boost
          .normalize({ lower: 5, upper: 95 }) // more aggressive normalization
          .grayscale()
          .toBuffer();
      } else if (meanBrightness < 70) {
        // Dark image
        this.logger.debug(`Region ${regionIndex}: applying low-brightness enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.5, 50)                    // stronger brightness lift
          .normalize()
          .grayscale()
          .toBuffer();
      } else if (stdDev < 15) {
        // Very low contrast
        this.logger.debug(`Region ${regionIndex}: applying very-low-contrast enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(2.0, -30)                   // very strong contrast boost
          .normalize({ lower: 1, upper: 99 })
          .grayscale()
          .toBuffer();
      } else if (stdDev > 80) {
        // High-contrast image, possibly already over-enhanced
        this.logger.debug(`Region ${regionIndex}: applying high-contrast suppression`);
        processedBuffer = await sharp(buffer)
          .linear(0.8, 20)                    // reduce contrast
          .normalize()
          .grayscale()
          .toBuffer();
      } else {
        // Default processing
        this.logger.debug(`Region ${regionIndex}: applying standard enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.3, -15)                   // moderate contrast boost
          .normalize({ lower: 10, upper: 90 })
          .grayscale()
          .toBuffer();
      }

      // Sharpen to reinforce text edges
      processedBuffer = await sharp(processedBuffer)
        .sharpen({ sigma: 1.2, m1: 1.5, m2: 0.7 })
        .toBuffer();
    } catch (error) {
      this.logger.error(`Region ${regionIndex}: preprocessing failed`, error);
      // Fall back to basic processing
      processedBuffer = await sharp(buffer)
        .normalize()
        .grayscale()
        .toBuffer();
    }
    return processedBuffer;
  }

  async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0) {
    const metadata = await sharp(buffer).metadata();
    const originalAspectRatio = metadata.width / metadata.height;
    const targetAspectRatio = targetWidth / targetHeight;
    let resizeWidth, resizeHeight;
    if (originalAspectRatio > targetAspectRatio) {
      // Width-bound: scale by width
      resizeWidth = targetWidth;
      resizeHeight = Math.round(targetWidth / originalAspectRatio);
    } else {
      // Height-bound: scale by height
      resizeHeight = targetHeight;
      resizeWidth = Math.round(targetHeight * originalAspectRatio);
    }
    resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
    resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));
    this.logger.debug(`Region ${regionIndex}: resizing ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`);

    // Compute offsets to center the resized image
    const offsetX = Math.floor((targetWidth - resizeWidth) / 2);
    const offsetY = Math.floor((targetHeight - resizeHeight) / 2);
    this.logger.debug(`Region ${regionIndex}: centering offset X=${offsetX}, Y=${offsetY}`);

    // Resize first, then center on a white canvas
    let resizedBuffer = await sharp(buffer)
      .resize(resizeWidth, resizeHeight, { fit: 'contain', background: { r: 255, g: 255, b: 255 } })
      .extend({
        top: offsetY,
        bottom: targetHeight - resizeHeight - offsetY,
        left: offsetX,
        right: targetWidth - resizeWidth - offsetX,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();

    // Add 10 px of white padding on each side
    const finalWidth = targetWidth + 20; // 10 px per side
    const finalHeight = targetHeight;
    resizedBuffer = await sharp(resizedBuffer)
      .extend({
        top: 0,
        bottom: 0,
        left: 10,
        right: 10,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();
    this.logger.debug(`Region ${regionIndex}: final size ${finalWidth}x${finalHeight} (10 px of white padding on each side)`);
    return resizedBuffer;
  }

  async bufferToTensor(buffer, width, height) {
    // Read the actual image size (the width now includes the extra 20 px of padding)
    const metadata = await sharp(buffer).metadata();
    const actualWidth = metadata.width;
    const actualHeight = metadata.height;
    const imageData = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });

    // Allocate the tensor data from the actual dimensions
    const inputData = new Float32Array(3 * actualHeight * actualWidth);
    const data = imageData.data;
    for (let i = 0; i < data.length; i += 4) {
      const pixelIndex = Math.floor(i / 4);
      const y = Math.floor(pixelIndex / actualWidth);
      const x = pixelIndex % actualWidth;
      // Replicate the gray value into all three channels
      const grayValue = data[i] / 255.0;
      for (let c = 0; c < 3; c++) {
        const inputIndex = c * actualHeight * actualWidth + y * actualWidth + x;
        if (inputIndex < inputData.length) {
          inputData[inputIndex] = grayValue;
        }
      }
    }
    return inputData;
  }

  postprocessRecognition(outputs) {
    this.logger.debug('Starting recognition post-processing');
    try {
      const outputNames = this.recSession.outputNames;
      const recognitionOutput = outputs[outputNames[0]];
      if (!recognitionOutput) {
        this.logger.debug('Recognition output is empty');
        return { text: '', confidence: 0 };
      }
      const data = recognitionOutput.data;
      const [batch, seqLen, vocabSize] = recognitionOutput.dims;
      this.logger.debug(`Sequence length: ${seqLen}, vocab size: ${vocabSize}, character set size: ${this.characterSet.length}`);
      if (this.characterSet.length === 0) {
        this.logger.error('Character set is empty');
        return { text: '', confidence: 0 };
      }

      // The vocab should be the character set plus one CTC blank
      if (vocabSize !== this.characterSet.length + 1) {
        this.logger.warn(`Vocab size (${vocabSize}) does not match character set size (${this.characterSet.length}); recognition quality may suffer`);
      }

      const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize);
      this.logger.debug(`Decoded: "${text}", confidence: ${confidence.toFixed(4)}`);
      return { text, confidence };
    } catch (error) {
      this.logger.error('Recognition post-processing failed', error);
      return { text: '', confidence: 0 };
    }
  }

  ctcDecode(data, seqLen, vocabSize) {
    let text = '';

    // Dynamic decoding threshold
    const baseThreshold = 0.03;
    let confidenceThreshold = baseThreshold;

    // Analyze the probability distribution over the sequence
    let maxSequenceProb = 0;
    let minSequenceProb = 1;
    let sumProb = 0;
    let probCount = 0;
    for (let t = 0; t < seqLen; t++) {
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > 0.01) { // only count meaningful probabilities
          maxSequenceProb = Math.max(maxSequenceProb, prob);
          minSequenceProb = Math.min(minSequenceProb, prob);
          sumProb += prob;
          probCount++;
        }
      }
    }
    const avgProb = probCount > 0 ? sumProb / probCount : 0;

    // Adjust the decode threshold based on the sequence characteristics
    if (avgProb < 0.3) {
      confidenceThreshold = baseThreshold * 0.5;
    } else if (avgProb > 0.7) {
      confidenceThreshold = baseThreshold * 1.5;
    }
    this.logger.debug(`Sequence stats: mean probability=${avgProb.toFixed(4)}, decode threshold: ${confidenceThreshold.toFixed(4)}`);

    // Beam-search CTC decoding
    const beamWidth = 5;
    let beams = [{ text: '', confidence: 1.0, lastChar: -1 }];
    for (let t = 0; t < seqLen; t++) {
      const newBeams = [];

      // Collect the top-k candidate characters for this time step
      const topK = [];
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > confidenceThreshold) {
          topK.push({ index: i, prob });
        }
      }
      // Sort candidates by probability and keep the best ones
      topK.sort((a, b) => b.prob - a.prob);
      const candidates = topK.slice(0, beamWidth);

      // Extend each beam with the candidate characters
      for (const beam of beams) {
        for (const candidate of candidates) {
          const charIndex = candidate.index;
          if (charIndex === 0) {
            // CTC blank
            newBeams.push({ text: beam.text, confidence: beam.confidence, lastChar: -1 });
          } else {
            const actualCharIndex = charIndex - 1;
            if (actualCharIndex < this.characterSet.length) {
              const char = this.characterSet[actualCharIndex];
              let newText = beam.text;
              // Collapse repeated characters
              if (charIndex !== beam.lastChar) {
                newText += char;
              }
              newBeams.push({
                text: newText,
                confidence: beam.confidence * candidate.prob,
                lastChar: charIndex
              });
            }
          }
        }
      }

      // If nothing passed the threshold this step, keep the current beams
      // instead of wiping them out (treat the step as a blank)
      if (newBeams.length === 0) {
        continue;
      }

      // Keep only the best beamWidth beams
      newBeams.sort((a, b) => b.confidence - a.confidence);
      beams = newBeams.slice(0, beamWidth);
    }

    // Pick the best beam; report the geometric mean of its character probabilities
    let avgConfidence = 0;
    if (beams.length > 0) {
      const bestBeam = beams[0];
      text = bestBeam.text;
      if (text.length > 0) {
        avgConfidence = Math.pow(bestBeam.confidence, 1 / text.length);
      }
    }
    return { text, confidence: avgConfidence };
  }
}

export default TextRecognizer;
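
/*
 * Example usage (illustrative sketch only, not part of the module). It assumes
 * a recognition model exported to ONNX and a keys file with one character per
 * line; the paths, the empty config object, and the `regionBuffer` variable
 * below are placeholders, not values defined by this module.
 *
 *   import { InferenceSession } from 'onnxruntime-node';
 *   import TextRecognizer from './textRecognizer.js';
 *
 *   const recognizer = new TextRecognizer();
 *   const recSession = await InferenceSession.create('./models/rec.onnx');
 *   recognizer.initialize(recSession, {});
 *   await recognizer.loadCharacterSet('./models/keys.txt');
 *
 *   // regionBuffer: a PNG/JPEG Buffer containing one cropped text line
 *   const { text, confidence } = await recognizer.recognizeText(regionBuffer, 0);
 */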