// NOTE: removed pasted web-UI metadata header (path / line-count / size / timestamps) — not valid JavaScript.
// server/utils/textRecognizer.js
import { Tensor } from 'onnxruntime-node';
import sharp from 'sharp';
import fse from 'fs-extra';
import * as path from 'path';
/**
 * OCR text recognizer driving a CRNN/CTC-style ONNX recognition model
 * (PaddleOCR-like: 48px input height, blank token at model index 0).
 *
 * Lifecycle: construct → initialize(recSession, config) →
 * loadCharacterSet(keysPath) → recognizeText(buffer) per text region.
 *
 * Debug images for every preprocessing stage are written under
 * temp/preprocessed so recognition failures can be inspected visually.
 */
class TextRecognizer {
  constructor() {
    this.recSession = null;   // onnxruntime InferenceSession, injected via initialize()
    this.config = null;       // opaque config object, injected via initialize()
    this.characterSet = [];   // index -> character; CTC blank occupies model index 0
    this.debugDir = path.join(process.cwd(), 'temp', 'debug');
    this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed');
    this.logger = {
      info: (msg, ...args) => console.log(`🔤 [识别] ${msg}`, ...args),
      error: (msg, ...args) => console.error(`❌ [识别] ${msg}`, ...args),
      debug: (msg, ...args) => console.log(`🐛 [识别] ${msg}`, ...args),
      // fixed: warn previously reused the debug emoji (🐛), making log levels indistinguishable
      warn: (msg, ...args) => console.warn(`⚠️ [识别] ${msg}`, ...args)
    };
    // Ensure debug output directories exist before any image is written.
    fse.ensureDirSync(this.debugDir);
    fse.ensureDirSync(this.preprocessedDir);
  }

  /**
   * Inject the ONNX recognition session and configuration.
   * @param {object} recSession - onnxruntime InferenceSession for the rec model.
   * @param {object} config - recognizer configuration (stored, not inspected here).
   */
  initialize(recSession, config) {
    this.recSession = recSession;
    this.config = config;
    this.logger.info('文本识别器初始化完成');
  }

  /**
   * Load the character dictionary used to map CTC output indices to text.
   * Each non-empty, non-comment line of the file is one dictionary entry.
   * @param {string} keysPath - path to the keys/dictionary file.
   * @throws {Error} if the file cannot be read or yields an empty set.
   */
  async loadCharacterSet(keysPath) {
    try {
      const keysContent = await fse.readFile(keysPath, 'utf8');
      this.characterSet = [];
      const lines = keysContent.split('\n');
      // Deduplicate while preserving first-seen order (Set iteration order).
      const uniqueChars = new Set();
      for (const line of lines) {
        const trimmed = line.trim(); // also strips a trailing \r from CRLF files
        // Skip blank lines and '#' comment lines.
        if (trimmed && !trimmed.startsWith('#')) {
          // Each line is treated as one complete dictionary entry.
          uniqueChars.add(trimmed);
        }
      }
      this.characterSet = Array.from(uniqueChars);
      if (this.characterSet.length === 0) {
        throw new Error('字符集文件为空或格式不正确');
      }
      this.logger.info(`字符集加载完成: ${this.characterSet.length}个字符`);

      // Log a rough breakdown of the dictionary composition for diagnostics.
      const charTypes = {
        chinese: 0,
        english: 0,
        digit: 0,
        punctuation: 0,
        other: 0
      };
      this.characterSet.forEach(char => {
        if (/[\u4e00-\u9fff]/.test(char)) {
          charTypes.chinese++;
        } else if (/[a-zA-Z]/.test(char)) {
          charTypes.english++;
        } else if (/[0-9]/.test(char)) {
          charTypes.digit++;
        } else if (/[,。!?;:""()【】《》…—·]/.test(char)) {
          charTypes.punctuation++;
        } else {
          charTypes.other++;
        }
      });
      this.logger.debug(`字符集统计: 中文${charTypes.chinese}, 英文${charTypes.english}, 数字${charTypes.digit}, 标点${charTypes.punctuation}, 其他${charTypes.other}`);
      this.logger.debug(`前20个字符: ${this.characterSet.slice(0, 20).join('')}`);
    } catch (error) {
      this.logger.error('加载字符集失败', error.message);
      // No built-in fallback dictionary: a bad keys file must fail loudly.
      throw new Error(`字符集加载失败: ${error.message}`);
    }
  }

  /** @returns {number} number of entries in the loaded character set. */
  getCharacterSetSize() {
    return this.characterSet.length;
  }

  /**
   * Recognize the text inside one cropped text-region image.
   * Never throws: failures are logged and reported as an empty result so one
   * bad region does not abort a whole-page OCR pass.
   * @param {Buffer} textRegionBuffer - encoded image of the cropped region.
   * @param {number} [regionIndex=0] - index used only for logging/debug files.
   * @returns {Promise<{text: string, confidence: number}>}
   */
  async recognizeText(textRegionBuffer, regionIndex = 0) {
    const startTime = Date.now();
    this.logger.info(`开始文本识别 - 区域 ${regionIndex}`);
    try {
      const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex);
      const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
      const result = this.postprocessRecognition(outputs);
      const processingTime = Date.now() - startTime;
      this.logger.info(`识别完成 - 区域 ${regionIndex}: "${result.text}", 置信度: ${result.confidence.toFixed(4)}, 耗时: ${processingTime}ms`);
      return result;
    } catch (error) {
      this.logger.error(`文本识别失败 - 区域 ${regionIndex}`, error);
      return { text: '', confidence: 0 };
    }
  }

  /**
   * Build the model input tensor for one region:
   * analyze → smart-preprocess → aspect-preserving resize to 320x48 →
   * pad 10px white on each side (final 340x48) → NCHW float32 tensor.
   * Each intermediate image is saved to preprocessedDir for debugging.
   * On failure returns a neutral 0.5-filled tensor instead of throwing.
   * @param {Buffer} textRegionBuffer - encoded region image.
   * @param {number} [regionIndex=0] - for log/debug file naming.
   * @returns {Promise<Tensor>} float32 tensor shaped [1, 3, 48, 340].
   */
  async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) {
    this.logger.debug(`准备识别输入 - 区域 ${regionIndex}`);
    const targetHeight = 48;
    const targetWidth = 320;            // width before horizontal padding
    const finalWidth = targetWidth + 20; // +10px white margin on each side
    const timestamp = Date.now();
    try {
      const metadata = await sharp(textRegionBuffer).metadata();
      this.logger.debug(`原始区域 ${regionIndex}: ${metadata.width}x${metadata.height}`);

      // Save the untouched crop for debugging.
      const originalPath = path.join(this.preprocessedDir, `region-${regionIndex}-original-${timestamp}.png`);
      await fse.writeFile(originalPath, textRegionBuffer);
      this.logger.debug(`保存原始区域图像: ${originalPath}`);

      // Brightness/contrast statistics drive the enhancement choice below.
      const stats = await sharp(textRegionBuffer).grayscale().stats();
      const meanBrightness = stats.channels[0].mean;
      const stdDev = stats.channels[0].stdev;
      this.logger.debug(`图像统计 - 区域 ${regionIndex}: 亮度=${meanBrightness.toFixed(1)}, 对比度=${stdDev.toFixed(1)}`);

      let processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex);

      // Save the grayscale/contrast-adjusted image.
      const processedPath = path.join(this.preprocessedDir, `region-${regionIndex}-processed-${timestamp}.png`);
      await fse.writeFile(processedPath, processedBuffer);
      this.logger.debug(`保存预处理图像: ${processedPath}`);

      // Aspect-preserving resize + centering + 10px side padding.
      const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex);

      const resizedPath = path.join(this.preprocessedDir, `region-${regionIndex}-resized-${timestamp}.png`);
      await fse.writeFile(resizedPath, resizedBuffer);
      this.logger.debug(`保存调整大小图像: ${resizedPath}`);

      const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight);
      this.logger.debug(`识别输入张量准备完成 - 区域 ${regionIndex}`);
      return new Tensor('float32', inputData, [1, 3, targetHeight, finalWidth]);
    } catch (error) {
      this.logger.error(`准备识别输入失败 - 区域 ${regionIndex}`, error);
      // Neutral gray tensor keeps the pipeline alive; the model will just
      // produce an empty/low-confidence result for this region.
      return new Tensor('float32', new Float32Array(3 * targetHeight * finalWidth).fill(0.5), [1, 3, targetHeight, finalWidth]);
    }
  }

  /**
   * Pick a contrast/brightness enhancement based on image statistics,
   * then normalize and convert to grayscale.
   * Thresholds (200/80/30/20) are empirical tuning values.
   * @param {Buffer} buffer - encoded region image.
   * @param {number} meanBrightness - mean gray level (0-255).
   * @param {number} stdDev - gray-level standard deviation (contrast proxy).
   * @param {number} [regionIndex=0] - for logging only.
   * @returns {Promise<Buffer>} processed PNG/raw buffer from sharp.
   */
  async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) {
    let processedBuffer = buffer;
    if (meanBrightness > 200 && stdDev < 30) {
      // Bright, washed-out image: darken and stretch contrast.
      this.logger.debug(`区域 ${regionIndex}: 应用高亮度图像增强`);
      processedBuffer = await sharp(buffer)
        .linear(1.5, -50)
        .normalize()
        .grayscale()
        .toBuffer();
    } else if (meanBrightness < 80) {
      // Dark image: brighten.
      this.logger.debug(`区域 ${regionIndex}: 应用低亮度图像增强`);
      processedBuffer = await sharp(buffer)
        .linear(1.2, 30)
        .normalize()
        .grayscale()
        .toBuffer();
    } else if (stdDev < 20) {
      // Flat image: boost contrast.
      this.logger.debug(`区域 ${regionIndex}: 应用低对比度增强`);
      processedBuffer = await sharp(buffer)
        .linear(1.3, -20)
        .normalize()
        .grayscale()
        .toBuffer();
    } else {
      // Reasonable exposure: just normalize + grayscale.
      this.logger.debug(`区域 ${regionIndex}: 应用标准化灰度处理`);
      processedBuffer = await sharp(buffer)
        .normalize()
        .grayscale()
        .toBuffer();
    }
    return processedBuffer;
  }

  /**
   * Resize to fit within targetWidth x targetHeight preserving aspect ratio,
   * center on a white canvas of exactly that size, then add a 10px white
   * margin on the left and right (final width = targetWidth + 20).
   * @param {Buffer} buffer - preprocessed image.
   * @param {number} targetWidth - canvas width before side padding.
   * @param {number} targetHeight - canvas (and final) height.
   * @param {number} [regionIndex=0] - for logging only.
   * @returns {Promise<Buffer>} PNG buffer sized (targetWidth+20) x targetHeight.
   */
  async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0) {
    const metadata = await sharp(buffer).metadata();
    const originalAspectRatio = metadata.width / metadata.height;
    const targetAspectRatio = targetWidth / targetHeight;
    let resizeWidth, resizeHeight;
    if (originalAspectRatio > targetAspectRatio) {
      // Width-bound: scale by width.
      resizeWidth = targetWidth;
      resizeHeight = Math.round(targetWidth / originalAspectRatio);
    } else {
      // Height-bound: scale by height.
      resizeHeight = targetHeight;
      resizeWidth = Math.round(targetHeight * originalAspectRatio);
    }
    // Clamp rounding artifacts into [1, target].
    resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
    resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));
    this.logger.debug(`区域 ${regionIndex}: 调整尺寸 ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`);
    // Offsets that center the scaled image on the target canvas.
    const offsetX = Math.floor((targetWidth - resizeWidth) / 2);
    const offsetY = Math.floor((targetHeight - resizeHeight) / 2);
    this.logger.debug(`区域 ${regionIndex}: 居中偏移 X=${offsetX}, Y=${offsetY}`);
    // Scale, then extend to the full canvas with white background.
    let resizedBuffer = await sharp(buffer)
      .resize(resizeWidth, resizeHeight, {
        fit: 'contain',
        background: { r: 255, g: 255, b: 255 }
      })
      .extend({
        top: offsetY,
        bottom: targetHeight - resizeHeight - offsetY,
        left: offsetX,
        right: targetWidth - resizeWidth - offsetX,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();
    // Add the 10px white side margins the model input expects.
    const finalWidth = targetWidth + 20;
    const finalHeight = targetHeight;
    resizedBuffer = await sharp(resizedBuffer)
      .extend({
        top: 0,
        bottom: 0,
        left: 10,
        right: 10,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();
    this.logger.debug(`区域 ${regionIndex}: 最终尺寸 ${finalWidth}x${finalHeight} (左右各加10像素空白)`);
    return resizedBuffer;
  }

  /**
   * Convert an encoded image buffer into planar NCHW float32 data.
   * Reads the image's actual dimensions from metadata (the width/height
   * parameters are kept for interface compatibility but not used, since the
   * padded buffer already carries its final size).
   * The red channel is used as the gray value (the buffer is grayscale by
   * this point) and scaled to [0, 1], replicated across all 3 channels.
   * NOTE(review): no mean/std normalization is applied here — confirm that
   * matches what the recognition model was trained with.
   * @param {Buffer} buffer - encoded image (already resized/padded).
   * @param {number} width - expected width (unused; actual size is read).
   * @param {number} height - expected height (unused; actual size is read).
   * @returns {Promise<Float32Array>} planar data, length 3*H*W.
   */
  async bufferToTensor(buffer, width, height) {
    const metadata = await sharp(buffer).metadata();
    const actualWidth = metadata.width;
    const actualHeight = metadata.height;
    const imageData = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });
    const inputData = new Float32Array(3 * actualHeight * actualWidth);
    const data = imageData.data; // RGBA, 4 bytes per pixel
    for (let i = 0; i < data.length; i += 4) {
      const pixelIndex = Math.floor(i / 4);
      const y = Math.floor(pixelIndex / actualWidth);
      const x = pixelIndex % actualWidth;
      // Use the gray value for all three channels.
      const grayValue = data[i] / 255.0;
      for (let c = 0; c < 3; c++) {
        const inputIndex = c * actualHeight * actualWidth + y * actualWidth + x;
        if (inputIndex < inputData.length) {
          inputData[inputIndex] = grayValue;
        }
      }
    }
    return inputData;
  }

  /**
   * Extract the first model output and CTC-decode it into text.
   * @param {object} outputs - map of output name -> Tensor from session.run().
   * @returns {{text: string, confidence: number}} empty result on any failure.
   */
  postprocessRecognition(outputs) {
    this.logger.debug('开始识别后处理');
    try {
      const outputNames = this.recSession.outputNames;
      const recognitionOutput = outputs[outputNames[0]];
      if (!recognitionOutput) {
        this.logger.debug('识别输出为空');
        return { text: '', confidence: 0 };
      }
      const data = recognitionOutput.data;
      const [batch, seqLen, vocabSize] = recognitionOutput.dims;
      this.logger.debug(`序列长度: ${seqLen}, 词汇表大小: ${vocabSize}, 字符集大小: ${this.characterSet.length}`);
      if (this.characterSet.length === 0) {
        this.logger.error('字符集为空');
        return { text: '', confidence: 0 };
      }
      // The model vocabulary should be charset + 1 (CTC blank at index 0).
      if (vocabSize !== this.characterSet.length + 1) {
        this.logger.warn(`词汇表大小(${vocabSize})与字符集大小(${this.characterSet.length})不匹配,可能影响识别效果`);
      }
      const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize);
      this.logger.debug(`解码结果: "${text}", 置信度: ${confidence.toFixed(4)}`);
      return { text, confidence };
    } catch (error) {
      this.logger.error('识别后处理失败', error);
      return { text: '', confidence: 0 };
    }
  }

  /**
   * Greedy CTC decode: per time step take the argmax, drop blanks (index 0),
   * collapse repeats, and map index-1 into the character set.
   * The acceptance threshold is halved when the whole sequence's peak
   * probability is low, to salvage faint detections.
   * @param {Float32Array|number[]} data - flattened [seqLen, vocabSize] probabilities.
   * @param {number} seqLen - number of time steps.
   * @param {number} vocabSize - vocabulary size including the blank.
   * @returns {{text: string, confidence: number}} cleaned text and mean
   *   per-accepted-character probability.
   */
  ctcDecode(data, seqLen, vocabSize) {
    let text = '';
    let lastCharIndex = -1;
    let confidenceSum = 0;
    let charCount = 0;
    // Dynamic threshold: relaxed when the sequence is globally low-confidence.
    const baseThreshold = 0.03;
    let confidenceThreshold = baseThreshold;
    let maxSequenceProb = 0;
    for (let t = 0; t < seqLen; t++) {
      for (let i = 0; i < vocabSize; i++) {
        maxSequenceProb = Math.max(maxSequenceProb, data[t * vocabSize + i]);
      }
    }
    if (maxSequenceProb < 0.5) {
      confidenceThreshold = baseThreshold * 0.5;
    }
    this.logger.debug(`使用解码阈值: ${confidenceThreshold.toFixed(4)}`);
    for (let t = 0; t < seqLen; t++) {
      let maxProb = -1;
      let maxIndex = -1;
      // Argmax over the vocabulary at this time step.
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > maxProb) {
          maxProb = prob;
          maxIndex = i;
        }
      }
      // Index 0 is the CTC blank; character i maps to characterSet[i - 1].
      if (maxIndex > 0 && maxProb > confidenceThreshold) {
        const charIndex = maxIndex - 1;
        if (charIndex < this.characterSet.length) {
          const char = this.characterSet[charIndex];
          // Repeat handling: accept a repeated index when it is very
          // confident (>0.8) or when the last emitted character differs
          // (i.e. a genuine double letter separated by a blank elsewhere).
          const shouldAddChar = maxIndex !== lastCharIndex ||
            maxProb > 0.8 ||
            (maxIndex === lastCharIndex && charCount > 0 && text[text.length - 1] !== char);
          if (shouldAddChar && char && char.trim() !== '') {
            text += char;
            confidenceSum += maxProb;
            charCount++;
          }
          lastCharIndex = maxIndex;
        } else {
          this.logger.warn(`字符索引${charIndex}超出字符集范围(0-${this.characterSet.length - 1})`);
        }
      } else if (maxIndex === 0) {
        // Blank resets the repeat tracker so true doubles can be emitted.
        lastCharIndex = -1;
      }
    }
    const avgConfidence = charCount > 0 ? confidenceSum / charCount : 0;
    // Light cleanup only (no aggressive error-pattern correction).
    const cleanedText = this.basicTextCleaning(text);
    return {
      text: cleanedText,
      confidence: avgConfidence
    };
  }

  /**
   * Light text cleanup applied to CTC output.
   * @param {string} text - raw decoded text.
   * @returns {string} trimmed, cleaned text ('' for falsy input).
   */
  basicTextCleaning(text) {
    if (!text) return '';
    let cleaned = text;
    // 1. Collapse runs of 3+ identical non-digit characters down to two.
    cleaned = cleaned.replace(/([^0-9])\1{2,}/g, '$1$1');
    // 2. Strip noise characters.
    //    NOTE(review): the original first replace here was garbled in the
    //    source (`replace(//g, '')` — a syntax error); it presumably removed
    //    a mis-decoded character. Stripping U+FFFD replacement characters is
    //    the closest safe reconstruction — confirm against original intent.
    cleaned = cleaned.replace(/\uFFFD/g, '')
      .replace(/《/g, '')
      .replace(/》/g, '');
    // 3. Collapse doubled percent signs after digits.
    cleaned = cleaned.replace(/(\d+)%%/g, '$1%');
    return cleaned.trim();
  }
}
export default TextRecognizer;