// server/utils/textRecognizer.js
import { Tensor } from 'onnxruntime-node';
|
||
import sharp from 'sharp';
|
||
import fse from 'fs-extra';
|
||
import * as path from 'path';
|
||
|
||
/**
 * Text-recognition stage of an OCR pipeline.
 *
 * Takes the encoded image of a single text region, normalises it with sharp,
 * converts it into a `[1, 3, 48, 340]` float32 tensor, runs it through the
 * session supplied to {@link TextRecognizer#initialize} and CTC-decodes the
 * output into a string plus a confidence score.
 *
 * Constructing an instance creates `temp/debug` and `temp/preprocessed` under
 * the current working directory; intermediate images are written to the latter.
 */
class TextRecognizer {
  /** White margin, in pixels, added on the left and on the right of every model input. */
  static HORIZONTAL_PADDING = 10;

  constructor() {
    this.recSession = null;
    this.config = null;
    this.characterSet = [];
    this.debugDir = path.join(process.cwd(), 'temp', 'debug');
    this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed');
    this.logger = {
      info: (msg, ...args) => console.log(`🔤 [识别] ${msg}`, ...args),
      error: (msg, ...args) => console.error(`❌ [识别] ${msg}`, ...args),
      debug: (msg, ...args) => console.log(`🐛 [识别] ${msg}`, ...args),
      warn: (msg, ...args) => console.warn(`🐛 [识别] ${msg}`, ...args),
    };

    // Make sure the output directories exist before anything tries to write to them.
    fse.ensureDirSync(this.debugDir);
    fse.ensureDirSync(this.preprocessedDir);
  }

  /**
   * Stores the inference session and configuration used by later calls.
   *
   * @param {object} recSession - Session exposing `inputNames`, `outputNames` and `run()`.
   * @param {object} config - Stored as-is; not read anywhere in this class.
   */
  initialize(recSession, config) {
    this.recSession = recSession;
    this.config = config;
    this.logger.info('文本识别器初始化完成');
  }

  /**
   * Loads the recognition dictionary: one entry per line, in model index order.
   *
   * Blank lines and `#` comment lines are skipped and duplicates are dropped.
   * A line that consists of a single `#` is kept, because it is the hash
   * character itself; skipping it would shift every following entry by one
   * index and corrupt the decoded text.
   *
   * @param {string} keysPath - Path of the UTF-8 dictionary file.
   * @returns {Promise<void>}
   * @throws {Error} When the file cannot be read or yields no entries. The
   *   underlying error is attached as `cause`.
   */
  async loadCharacterSet(keysPath) {
    try {
      const keysContent = await fse.readFile(keysPath, 'utf8');
      const uniqueChars = new Set();

      for (const line of keysContent.split('\n')) {
        const entry = line.trim();
        if (entry === '') {
          continue;
        }
        // A lone '#' is a dictionary character, anything longer starting with '#' is a comment.
        if (entry.startsWith('#') && entry !== '#') {
          continue;
        }
        uniqueChars.add(entry);
      }

      this.characterSet = Array.from(uniqueChars);

      if (this.characterSet.length === 0) {
        throw new Error('字符集文件为空或格式不正确');
      }

      this.logger.info(`字符集加载完成: ${this.characterSet.length}个字符`);

      // Purely informational breakdown of what the dictionary contains.
      const charTypes = { chinese: 0, english: 0, digit: 0, punctuation: 0, other: 0 };
      for (const char of this.characterSet) {
        if (/[\u4e00-\u9fff]/.test(char)) {
          charTypes.chinese++;
        } else if (/[a-zA-Z]/.test(char)) {
          charTypes.english++;
        } else if (/[0-9]/.test(char)) {
          charTypes.digit++;
        } else if (/[,。!?;:""()【】《》…—·]/.test(char)) {
          charTypes.punctuation++;
        } else {
          charTypes.other++;
        }
      }

      this.logger.debug(`字符集统计: 中文${charTypes.chinese}, 英文${charTypes.english}, 数字${charTypes.digit}, 标点${charTypes.punctuation}, 其他${charTypes.other}`);
      this.logger.debug(`前20个字符: ${this.characterSet.slice(0, 20).join('')}`);
    } catch (error) {
      this.logger.error('加载字符集失败', error.message);
      // There is no built-in fallback dictionary, so the failure is propagated.
      throw new Error(`字符集加载失败: ${error.message}`, { cause: error });
    }
  }

  /**
   * @returns {number} Number of entries currently loaded in the dictionary.
   */
  getCharacterSetSize() {
    return this.characterSet.length;
  }

  /**
   * Recognises the text contained in one cropped region.
   *
   * Never rejects: any failure is logged and reported as an empty result.
   *
   * @param {Buffer} textRegionBuffer - Encoded image of the text region.
   * @param {number} [regionIndex=0] - Index used in log messages and debug file names.
   * @returns {Promise<{text: string, confidence: number}>}
   */
  async recognizeText(textRegionBuffer, regionIndex = 0) {
    const startTime = Date.now();
    this.logger.info(`开始文本识别 - 区域 ${regionIndex}`);

    try {
      if (!this.recSession) {
        throw new Error('识别会话未初始化');
      }

      const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex);
      const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
      const result = this.postprocessRecognition(outputs);

      const processingTime = Date.now() - startTime;
      this.logger.info(`识别完成 - 区域 ${regionIndex}: "${result.text}", 置信度: ${result.confidence.toFixed(4)}, 耗时: ${processingTime}ms`);

      return result;
    } catch (error) {
      this.logger.error(`文本识别失败 - 区域 ${regionIndex}`, error);
      return { text: '', confidence: 0 };
    }
  }

  /**
   * Writes an intermediate image into `preprocessedDir` on a best-effort basis.
   *
   * These files are diagnostics only, so a failed write is logged as a warning
   * instead of aborting the recognition of the region.
   *
   * @param {number} regionIndex - Region index used in the file name.
   * @param {string} stage - Stage tag used in the file name (`original`, `processed`, `resized`).
   * @param {number} timestamp - Shared timestamp that groups the files of one call.
   * @param {Buffer} buffer - Image bytes to write.
   * @param {string} label - Log prefix describing the image.
   * @returns {Promise<void>}
   */
  async saveDebugImage(regionIndex, stage, timestamp, buffer, label) {
    const filePath = path.join(this.preprocessedDir, `region-${regionIndex}-${stage}-${timestamp}.png`);
    try {
      await fse.writeFile(filePath, buffer);
      this.logger.debug(`${label}: ${filePath}`);
    } catch (error) {
      this.logger.warn(`${label}失败: ${filePath}`, error.message);
    }
  }

  /**
   * Turns a cropped text region into the model input tensor.
   *
   * Steps: measure brightness/contrast, enhance, resize to 320x48 preserving
   * the aspect ratio, pad 10 px on each side, convert to planar float32.
   * If any step fails, the error is logged and a constant 0.5 tensor of the
   * same shape is returned instead of throwing.
   *
   * @param {Buffer} textRegionBuffer - Encoded image of the text region.
   * @param {number} [regionIndex=0] - Index used in log messages and debug file names.
   * @returns {Promise<Tensor>} float32 tensor of shape `[1, 3, 48, 340]`.
   */
  async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) {
    this.logger.debug(`准备识别输入 - 区域 ${regionIndex}`);

    const targetHeight = 48;
    const targetWidth = 320;
    // The resize step adds the horizontal margin, so the tensor is wider than targetWidth.
    const finalWidth = targetWidth + 2 * TextRecognizer.HORIZONTAL_PADDING;
    const tensorShape = [1, 3, targetHeight, finalWidth];
    const timestamp = Date.now();

    try {
      const metadata = await sharp(textRegionBuffer).metadata();
      this.logger.debug(`原始区域 ${regionIndex}: ${metadata.width}x${metadata.height}`);

      await this.saveDebugImage(regionIndex, 'original', timestamp, textRegionBuffer, '保存原始区域图像');

      // Brightness and contrast of the greyscale image drive the enhancement choice.
      const stats = await sharp(textRegionBuffer).grayscale().stats();
      const meanBrightness = stats.channels[0].mean;
      const stdDev = stats.channels[0].stdev;
      this.logger.debug(`图像统计 - 区域 ${regionIndex}: 亮度=${meanBrightness.toFixed(1)}, 对比度=${stdDev.toFixed(1)}`);

      const processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex);
      await this.saveDebugImage(regionIndex, 'processed', timestamp, processedBuffer, '保存预处理图像');

      const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex);
      await this.saveDebugImage(regionIndex, 'resized', timestamp, resizedBuffer, '保存调整大小图像');

      const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight);
      this.logger.debug(`识别输入张量准备完成 - 区域 ${regionIndex}`);

      return new Tensor('float32', inputData, tensorShape);
    } catch (error) {
      this.logger.error(`准备识别输入失败 - 区域 ${regionIndex}`, error);
      // Neutral mid-grey placeholder so the caller still receives a well-formed tensor.
      const placeholder = new Float32Array(3 * targetHeight * finalWidth).fill(0.5);
      return new Tensor('float32', placeholder, tensorShape);
    }
  }

  /**
   * Picks the contrast/brightness correction for an image from its statistics.
   *
   * @param {number} meanBrightness - Mean grey level of the image.
   * @param {number} stdDev - Standard deviation of the grey levels.
   * @returns {{label: string, gain: number, offset: number, normalizeArgs: object[]}}
   *   `gain`/`offset` are the arguments for sharp's `linear()`, `normalizeArgs`
   *   the argument list for `normalize()`, `label` the text used in the log line.
   */
  selectEnhancementProfile(meanBrightness, stdDev) {
    if (meanBrightness > 220 && stdDev < 25) {
      // Bright and flat: strong contrast boost with aggressive normalisation.
      return { label: '高亮度低对比度增强', gain: 1.8, offset: -80, normalizeArgs: [{ lower: 5, upper: 95 }] };
    }
    if (meanBrightness < 70) {
      // Dark image: lift the brightness.
      return { label: '低亮度增强', gain: 1.5, offset: 50, normalizeArgs: [] };
    }
    if (stdDev < 15) {
      // Extremely flat image: very strong contrast boost.
      return { label: '极低对比度增强', gain: 2.0, offset: -30, normalizeArgs: [{ lower: 1, upper: 99 }] };
    }
    if (stdDev > 80) {
      // Already very contrasty: tone it down slightly.
      return { label: '高对比度抑制', gain: 0.8, offset: 20, normalizeArgs: [] };
    }
    return { label: '标准增强', gain: 1.3, offset: -15, normalizeArgs: [{ lower: 10, upper: 90 }] };
  }

  /**
   * Enhances a text region ahead of recognition: linear contrast/brightness
   * correction chosen from the image statistics, normalisation, greyscale
   * conversion and a final sharpening pass for the glyph edges.
   *
   * If the enhancement fails, the error is logged and a plain
   * normalise + greyscale conversion of the input is returned instead.
   *
   * @param {Buffer} buffer - Encoded source image.
   * @param {number} meanBrightness - Mean grey level of the image.
   * @param {number} stdDev - Standard deviation of the grey levels.
   * @param {number} [regionIndex=0] - Index used in log messages.
   * @returns {Promise<Buffer>} Encoded enhanced image.
   */
  async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) {
    try {
      // Extra statistics are collected for the log line only.
      // NOTE(review): sharp's documented per-channel stats do not list `median`,
      // so that field may log as `undefined` - confirm against the installed version.
      const stats = await sharp(buffer).grayscale().stats();
      const { median, max, min } = stats.channels[0];
      this.logger.debug(`区域 ${regionIndex}: 详细统计 - 中值=${median}, 范围=${min}-${max}, 均值=${meanBrightness.toFixed(1)}, 标准差=${stdDev.toFixed(1)}`);

      const profile = this.selectEnhancementProfile(meanBrightness, stdDev);
      this.logger.debug(`区域 ${regionIndex}: 应用${profile.label}`);

      const enhanced = await sharp(buffer)
        .linear(profile.gain, profile.offset)
        .normalize(...profile.normalizeArgs)
        .grayscale()
        .toBuffer();

      // Sharpen to make the character edges crisper.
      return await sharp(enhanced)
        .sharpen({ sigma: 1.2, m1: 1.5, m2: 0.7 })
        .toBuffer();
    } catch (error) {
      this.logger.error(`区域 ${regionIndex}: 预处理失败`, error);
      // Fall back to the most basic processing.
      return sharp(buffer).normalize().grayscale().toBuffer();
    }
  }

  /**
   * Scales an image to fit inside `targetWidth` x `targetHeight` without
   * distorting it, centres it on a white canvas of that size and adds a 10 px
   * white margin on the left and on the right.
   *
   * @param {Buffer} buffer - Encoded source image.
   * @param {number} targetWidth - Canvas width before the horizontal margin is added.
   * @param {number} targetHeight - Canvas height.
   * @param {number} [regionIndex=0] - Index used in log messages.
   * @returns {Promise<Buffer>} PNG of size `(targetWidth + 20)` x `targetHeight`.
   */
  async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0) {
    const metadata = await sharp(buffer).metadata();
    const originalAspectRatio = metadata.width / metadata.height;
    const targetAspectRatio = targetWidth / targetHeight;

    let resizeWidth;
    let resizeHeight;
    if (originalAspectRatio > targetAspectRatio) {
      // Relatively wider than the canvas: the width is the limiting side.
      resizeWidth = targetWidth;
      resizeHeight = Math.round(targetWidth / originalAspectRatio);
    } else {
      // Relatively taller than the canvas: the height is the limiting side.
      resizeHeight = targetHeight;
      resizeWidth = Math.round(targetHeight * originalAspectRatio);
    }

    // Rounding must never produce a zero-sized or oversized image.
    resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
    resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));

    this.logger.debug(`区域 ${regionIndex}: 调整尺寸 ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`);

    const offsetX = Math.floor((targetWidth - resizeWidth) / 2);
    const offsetY = Math.floor((targetHeight - resizeHeight) / 2);

    this.logger.debug(`区域 ${regionIndex}: 居中偏移 X=${offsetX}, Y=${offsetY}`);

    const padding = TextRecognizer.HORIZONTAL_PADDING;
    const white = { r: 255, g: 255, b: 255 };

    // Centring and the extra horizontal margin are both white borders, so they
    // are applied in a single extend() instead of two encode/decode round trips.
    const resizedBuffer = await sharp(buffer)
      .resize(resizeWidth, resizeHeight, { fit: 'contain', background: white })
      .extend({
        top: offsetY,
        bottom: targetHeight - resizeHeight - offsetY,
        left: offsetX + padding,
        right: targetWidth - resizeWidth - offsetX + padding,
        background: white,
      })
      .png()
      .toBuffer();

    const finalWidth = targetWidth + 2 * padding;
    const finalHeight = targetHeight;
    this.logger.debug(`区域 ${regionIndex}: 最终尺寸 ${finalWidth}x${finalHeight} (左右各加10像素空白)`);

    return resizedBuffer;
  }

  /**
   * Converts an encoded image into planar float32 data: three identical
   * planes, each holding the first channel of every pixel scaled to `[0, 1]`.
   *
   * The dimensions and the pixel stride come from the decoded image itself, so
   * the result is correct for any channel count sharp reports. The `width` and
   * `height` arguments are accepted for compatibility but not used.
   *
   * @param {Buffer} buffer - Encoded image.
   * @param {number} width - Unused; the decoded width is authoritative.
   * @param {number} height - Unused; the decoded height is authoritative.
   * @returns {Promise<Float32Array>} `3 * decodedHeight * decodedWidth` values.
   */
  async bufferToTensor(buffer, width, height) {
    const { data, info } = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });

    const planeSize = info.width * info.height;
    const inputData = new Float32Array(3 * planeSize);

    for (let pixel = 0; pixel < planeSize; pixel++) {
      // Only the first channel is read; the same value fills all three planes.
      const grayValue = data[pixel * info.channels] / 255.0;
      inputData[pixel] = grayValue;
      inputData[planeSize + pixel] = grayValue;
      inputData[2 * planeSize + pixel] = grayValue;
    }

    return inputData;
  }

  /**
   * Decodes the raw session output into text.
   *
   * Reads the first output of the session and treats its `dims` as
   * `[batch, seqLen, vocabSize]`. Never throws: failures are logged and
   * reported as an empty result.
   *
   * @param {object} outputs - Map of output name to tensor, as returned by `recSession.run()`.
   * @returns {{text: string, confidence: number}}
   */
  postprocessRecognition(outputs) {
    this.logger.debug('开始识别后处理');

    try {
      const recognitionOutput = outputs[this.recSession.outputNames[0]];
      if (!recognitionOutput) {
        this.logger.debug('识别输出为空');
        return { text: '', confidence: 0 };
      }

      const { data } = recognitionOutput;
      const [, seqLen, vocabSize] = recognitionOutput.dims;

      this.logger.debug(`序列长度: ${seqLen}, 词汇表大小: ${vocabSize}, 字符集大小: ${this.characterSet.length}`);

      if (this.characterSet.length === 0) {
        this.logger.error('字符集为空');
        return { text: '', confidence: 0 };
      }

      // Index 0 is decoded as the CTC blank, so the vocabulary is expected to
      // be exactly one larger than the dictionary.
      if (vocabSize !== this.characterSet.length + 1) {
        this.logger.warn(`词汇表大小(${vocabSize})与字符集大小(${this.characterSet.length})不匹配,可能影响识别效果`);
      }

      const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize);
      this.logger.debug(`解码结果: "${text}", 置信度: ${confidence.toFixed(4)}`);

      return { text, confidence };
    } catch (error) {
      this.logger.error('识别后处理失败', error);
      return { text: '', confidence: 0 };
    }
  }

  /**
   * CTC decoding with a small path-level beam.
   *
   * `data` is read as a row-major `seqLen` x `vocabSize` matrix in which index
   * 0 is the blank and index `k` maps to `characterSet[k - 1]`. Values are
   * assumed to be probabilities in `[0, 1]` - TODO confirm against the model.
   *
   * Every extension, blank included, is ranked by its accumulated
   * log-probability, so a weak blank can no longer outrank a confident
   * character and long sequences do not underflow. A frame without any usable
   * candidate is skipped instead of discarding the text decoded so far.
   *
   * @param {ArrayLike<number>} data - Flattened per-frame scores.
   * @param {number} seqLen - Number of frames.
   * @param {number} vocabSize - Number of scores per frame.
   * @returns {{text: string, confidence: number}} `confidence` is the geometric
   *   mean of the probabilities of the non-blank frames on the winning path,
   *   or 0 when no text was decoded.
   */
  ctcDecode(data, seqLen, vocabSize) {
    const baseThreshold = 0.03;
    const beamWidth = 5;

    // Average of the non-negligible probabilities; it drives the threshold below.
    let sumProb = 0;
    let probCount = 0;
    for (let t = 0; t < seqLen; t++) {
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > 0.01) {
          sumProb += prob;
          probCount++;
        }
      }
    }
    const avgProb = probCount > 0 ? sumProb / probCount : 0;

    // Uncertain sequences get a more permissive threshold, confident ones a stricter one.
    let confidenceThreshold = baseThreshold;
    if (avgProb < 0.3) {
      confidenceThreshold = baseThreshold * 0.5;
    } else if (avgProb > 0.7) {
      confidenceThreshold = baseThreshold * 1.5;
    }

    this.logger.debug(`序列统计: 平均概率=${avgProb.toFixed(4)}, 使用解码阈值: ${confidenceThreshold.toFixed(4)}`);

    // score:       log-probability of the whole path (blanks included), used for ranking
    // charLogProb: log-probability of the non-blank frames only, used for the confidence
    // charFrames:  number of non-blank frames on the path
    // lastChar:    vocabulary index emitted by the previous frame, -1 after a blank
    let beams = [{ text: '', score: 0, charLogProb: 0, charFrames: 0, lastChar: -1 }];

    for (let t = 0; t < seqLen; t++) {
      const aboveThreshold = [];
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > confidenceThreshold) {
          aboveThreshold.push({ index: i, prob });
        }
      }
      aboveThreshold.sort((a, b) => b.prob - a.prob);
      const candidates = aboveThreshold.slice(0, beamWidth);

      const expanded = [];
      for (const beam of beams) {
        for (const { index, prob } of candidates) {
          const logProb = Math.log(prob);

          if (index === 0) {
            // Blank: the text is unchanged, but the next frame may repeat a character.
            expanded.push({ ...beam, score: beam.score + logProb, lastChar: -1 });
            continue;
          }

          const charPosition = index - 1;
          if (charPosition >= this.characterSet.length) {
            // The model emitted an index the dictionary does not cover.
            continue;
          }

          // Consecutive frames with the same index are one character, not two.
          const isRepeat = index === beam.lastChar;
          expanded.push({
            text: isRepeat ? beam.text : beam.text + this.characterSet[charPosition],
            score: beam.score + logProb,
            charLogProb: beam.charLogProb + logProb,
            charFrames: beam.charFrames + 1,
            lastChar: index,
          });
        }
      }

      if (expanded.length === 0) {
        // Nothing usable in this frame: keep the current hypotheses untouched.
        continue;
      }

      expanded.sort((a, b) => b.score - a.score);
      beams = expanded.slice(0, beamWidth);
    }

    const bestBeam = beams[0];
    if (bestBeam.text.length === 0 || bestBeam.charFrames === 0) {
      return { text: bestBeam.text, confidence: 0 };
    }

    return {
      text: bestBeam.text,
      confidence: Math.exp(bestBeam.charLogProb / bestBeam.charFrames),
    };
  }
}
|
||
|
||
// The recognizer class is the module's only export.
export default TextRecognizer;