diff --git a/server/utils/textRecognizer.js b/server/utils/textRecognizer.js index 37fcf55..b6b9cb0 100644 --- a/server/utils/textRecognizer.js +++ b/server/utils/textRecognizer.js @@ -87,7 +87,6 @@ class TextRecognizer { throw new Error(`字符集加载失败: ${error.message}`); } } - getCharacterSetSize() { return this.characterSet.length; } @@ -165,32 +164,79 @@ class TextRecognizer { } } +// server/utils/textRecognizer.js +// 增强的图像预处理 + async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) { let processedBuffer = buffer; - if (meanBrightness > 200 && stdDev < 30) { - this.logger.debug(`区域 ${regionIndex}: 应用高亮度图像增强`); - processedBuffer = await sharp(buffer) - .linear(1.5, -50) - .normalize() + try { + // 更精细的图像分析 + const stats = await sharp(buffer) .grayscale() + .stats(); + + const median = stats.channels[0].median; + const max = stats.channels[0].max; + const min = stats.channels[0].min; + + this.logger.debug(`区域 ${regionIndex}: 详细统计 - 中值=${median}, 范围=${min}-${max}, 均值=${meanBrightness.toFixed(1)}, 标准差=${stdDev.toFixed(1)}`); + + // 更智能的预处理策略 + if (meanBrightness > 220 && stdDev < 25) { + // 高亮度低对比度图像 + this.logger.debug(`区域 ${regionIndex}: 应用高亮度低对比度增强`); + processedBuffer = await sharp(buffer) + .linear(1.8, -80) // 更强的对比度增强 + .normalize({ lower: 5, upper: 95 }) // 更激进的归一化 + .grayscale() + .toBuffer(); + } else if (meanBrightness < 70) { + // 低亮度图像 + this.logger.debug(`区域 ${regionIndex}: 应用低亮度增强`); + processedBuffer = await sharp(buffer) + .linear(1.5, 50) // 更强的亮度提升 + .normalize() + .grayscale() + .toBuffer(); + } else if (stdDev < 15) { + // 极低对比度 + this.logger.debug(`区域 ${regionIndex}: 应用极低对比度增强`); + processedBuffer = await sharp(buffer) + .linear(2.0, -30) // 非常强的对比度增强 + .normalize({ lower: 1, upper: 99 }) + .grayscale() + .toBuffer(); + } else if (stdDev > 80) { + // 高对比度图像,可能过度增强 + this.logger.debug(`区域 ${regionIndex}: 应用高对比度抑制`); + processedBuffer = await sharp(buffer) + .linear(0.8, 20) // 降低对比度 + .normalize() + .grayscale() + .toBuffer(); + } else { + // 标准处理 + this.logger.debug(`区域 ${regionIndex}: 应用标准增强`); + processedBuffer = await sharp(buffer) + .linear(1.3, -15) // 适度的对比度增强 + .normalize({ lower: 10, upper: 90 }) + .grayscale() + .toBuffer(); + } + + // 应用锐化滤波增强文字边缘 + processedBuffer = await sharp(processedBuffer) + .sharpen({ + sigma: 1.2, + m1: 1.5, + m2: 0.7 + }) .toBuffer(); - } else if (meanBrightness < 80) { - this.logger.debug(`区域 ${regionIndex}: 应用低亮度图像增强`); - processedBuffer = await sharp(buffer) - .linear(1.2, 30) - .normalize() - .grayscale() - .toBuffer(); - } else if (stdDev < 20) { - this.logger.debug(`区域 ${regionIndex}: 应用低对比度增强`); - processedBuffer = await sharp(buffer) - .linear(1.3, -20) - .normalize() - .grayscale() - .toBuffer(); - } else { - this.logger.debug(`区域 ${regionIndex}: 应用标准化灰度处理`); + + } catch (error) { + this.logger.error(`区域 ${regionIndex}: 预处理失败`, error); + // 回退到基本处理 processedBuffer = await sharp(buffer) .normalize() .grayscale() @@ -346,88 +392,114 @@ class TextRecognizer { const baseThreshold = 0.03; let confidenceThreshold = baseThreshold; - // 先分析整个序列的置信度分布 + // 分析序列置信度分布 let maxSequenceProb = 0; - for (let t = 0; t < seqLen; t++) { - for (let i = 0; i < vocabSize; i++) { - maxSequenceProb = Math.max(maxSequenceProb, data[t * vocabSize + i]); - } - } - - // 如果整体置信度较低,降低阈值 - if (maxSequenceProb < 0.5) { - confidenceThreshold = baseThreshold * 0.5; - } - - this.logger.debug(`使用解码阈值: ${confidenceThreshold.toFixed(4)}`); + let minSequenceProb = 1; + let sumProb = 0; + let probCount = 0; for (let t = 0; t < seqLen; t++) { - let maxProb = -1; - let maxIndex = -1; - - // 找到当前时间步的最大概率字符 for (let i = 0; i < vocabSize; i++) { const prob = data[t * vocabSize + i]; - if (prob > maxProb) { - maxProb = prob; - maxIndex = i; + if (prob > 0.01) { // 只统计有意义的概率 + maxSequenceProb = Math.max(maxSequenceProb, prob); + minSequenceProb = Math.min(minSequenceProb, prob); + sumProb += prob; + probCount++; } } - - // 改进的CTC解码逻辑 - if (maxIndex > 0 && maxProb > confidenceThreshold) { - const charIndex = maxIndex - 1; - if (charIndex < this.characterSet.length) { - const char = this.characterSet[charIndex]; - - // 更智能的重复字符处理 - const shouldAddChar = maxIndex !== lastCharIndex || - maxProb > 0.8 || - (maxIndex === lastCharIndex && charCount > 0 && text[text.length - 1] !== char); - - if (shouldAddChar && char && char.trim() !== '') { - text += char; - confidenceSum += maxProb; - charCount++; - } - lastCharIndex = maxIndex; - } else { - this.logger.warn(`字符索引${charIndex}超出字符集范围(0-${this.characterSet.length-1})`); - } - } else if (maxIndex === 0) { - lastCharIndex = -1; - } } - const avgConfidence = charCount > 0 ? confidenceSum / charCount : 0; + const avgProb = probCount > 0 ? sumProb / probCount : 0; - // 基本的文本清理(不包含错误模式修复) - const cleanedText = this.basicTextCleaning(text); + // 根据序列特性动态调整阈值 + if (avgProb < 0.3) { + confidenceThreshold = baseThreshold * 0.5; + } else if (avgProb > 0.7) { + confidenceThreshold = baseThreshold * 1.5; + } + + this.logger.debug(`序列统计: 平均概率=${avgProb.toFixed(4)}, 使用解码阈值: ${confidenceThreshold.toFixed(4)}`); + + // 改进的beam search算法 + const beamWidth = 5; + let beams = [{ text: '', confidence: 1.0, lastChar: -1 }]; + + for (let t = 0; t < seqLen; t++) { + const newBeams = []; + + // 获取当前时间步的top-k字符 + const topK = []; + for (let i = 0; i < vocabSize; i++) { + const prob = data[t * vocabSize + i]; + if (prob > confidenceThreshold) { + topK.push({ index: i, prob }); + } + } + + // 按概率排序 + topK.sort((a, b) => b.prob - a.prob); + const candidates = topK.slice(0, beamWidth); + + // 为每个beam扩展候选字符 + for (const beam of beams) { + for (const candidate of candidates) { + const charIndex = candidate.index; + + if (charIndex === 0) { + // 空白字符 + newBeams.push({ + text: beam.text, + confidence: beam.confidence, + lastChar: -1 + }); + } else { + const actualCharIndex = charIndex - 1; + if (actualCharIndex < this.characterSet.length) { + const char = this.characterSet[actualCharIndex]; + let newText = beam.text; + + // 处理重复字符 + if (charIndex !== beam.lastChar) { + newText += char; + } + + newBeams.push({ + text: newText, + confidence: beam.confidence * candidate.prob, + lastChar: charIndex + }); + } + } + } + } + + // 选择top beamWidth个beam + newBeams.sort((a, b) => b.confidence - a.confidence); + beams = newBeams.slice(0, beamWidth); + } + + // 选择最佳beam + if (beams.length > 0) { + const bestBeam = beams[0]; + text = bestBeam.text; + + // 计算平均置信度(几何平均) + const textLength = text.length; + if (textLength > 0) { + confidenceSum = Math.pow(bestBeam.confidence, 1 / textLength); + charCount = textLength; + } + } + + const avgConfidence = charCount > 0 ? confidenceSum : 0; return { - text: cleanedText, + text: text, confidence: avgConfidence }; } - basicTextCleaning(text) { - if (!text) return ''; - - let cleaned = text; - - // 1. 移除过多的重复字符(保留合理的重复) - cleaned = cleaned.replace(/([^0-9])\1{2,}/g, '$1$1'); - - // 2. 修复标点符号 - cleaned = cleaned.replace(/∶/g, ':') - .replace(/《/g, '(') - .replace(/》/g, ')'); - - // 3. 修复数字和百分号 - cleaned = cleaned.replace(/(\d+)%%/g, '$1%'); - - return cleaned.trim(); - } } export default TextRecognizer; \ No newline at end of file