// server/utils/textRecognizer.js
import { Tensor } from 'onnxruntime-node';
import sharp from 'sharp';
import fse from 'fs-extra';
import * as path from 'path';

class TextRecognizer {
  constructor() {
    this.recSession = null;
    this.config = null;
    this.characterSet = [];
    this.debugDir = path.join(process.cwd(), 'temp', 'debug');
    this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed');
    this.logger = {
      info: (msg, ...args) => console.log(`🔤 [Recognition] ${msg}`, ...args),
      error: (msg, ...args) => console.error(`❌ [Recognition] ${msg}`, ...args),
      debug: (msg, ...args) => console.log(`🐛 [Recognition] ${msg}`, ...args),
      warn: (msg, ...args) => console.warn(`⚠️ [Recognition] ${msg}`, ...args)
    };

    // Make sure the output directories exist
    fse.ensureDirSync(this.debugDir);
    fse.ensureDirSync(this.preprocessedDir);
  }
  initialize(recSession, config) {
    this.recSession = recSession;
    this.config = config;
    this.logger.info('Text recognizer initialized');
  }
  async loadCharacterSet(keysPath) {
    try {
      const keysContent = await fse.readFile(keysPath, 'utf8');
      this.characterSet = [];
      const lines = keysContent.split('\n');

      // Use the provided character-set file: one entry per line
      const uniqueChars = new Set();
      for (const line of lines) {
        const trimmed = line.trim();
        // Skip empty lines and comment lines
        if (trimmed && !trimmed.startsWith('#')) {
          // Treat each line as one complete character entry
          uniqueChars.add(trimmed);
        }
      }
      this.characterSet = Array.from(uniqueChars);

      if (this.characterSet.length === 0) {
        throw new Error('Character set file is empty or malformed');
      }
      this.logger.info(`Character set loaded: ${this.characterSet.length} characters`);

      // Log character-set statistics
      const charTypes = {
        chinese: 0,
        english: 0,
        digit: 0,
        punctuation: 0,
        other: 0
      };
      this.characterSet.forEach(char => {
        if (/[\u4e00-\u9fff]/.test(char)) {
          charTypes.chinese++;
        } else if (/[a-zA-Z]/.test(char)) {
          charTypes.english++;
        } else if (/[0-9]/.test(char)) {
          charTypes.digit++;
        } else if (/[,。!?;:""()【】《》…—·]/.test(char)) {
          charTypes.punctuation++;
        } else {
          charTypes.other++;
        }
      });
      this.logger.debug(`Character set breakdown: Chinese ${charTypes.chinese}, English ${charTypes.english}, digits ${charTypes.digit}, punctuation ${charTypes.punctuation}, other ${charTypes.other}`);
      this.logger.debug(`First 20 characters: ${this.characterSet.slice(0, 20).join('')}`);
    } catch (error) {
      this.logger.error('Failed to load character set', error.message);
      // The provided character set is required; rethrow on failure
      throw new Error(`Character set loading failed: ${error.message}`);
    }
  }
  getCharacterSetSize() {
    return this.characterSet.length;
  }
  async recognizeText(textRegionBuffer, regionIndex = 0) {
    const startTime = Date.now();
    this.logger.info(`Starting text recognition - region ${regionIndex}`);
    try {
      const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex);
      const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
      const result = this.postprocessRecognition(outputs);
      const processingTime = Date.now() - startTime;
      this.logger.info(`Recognition done - region ${regionIndex}: "${result.text}", confidence: ${result.confidence.toFixed(4)}, elapsed: ${processingTime}ms`);
      return result;
    } catch (error) {
      this.logger.error(`Text recognition failed - region ${regionIndex}`, error);
      return { text: '', confidence: 0 };
    }
  }
  async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) {
    this.logger.debug(`Preparing recognition input - region ${regionIndex}`);
    const targetHeight = 48;
    const targetWidth = 320;              // original target width
    const finalWidth = targetWidth + 20;  // final width (10 px of padding on each side)
    const timestamp = Date.now();
    try {
      const metadata = await sharp(textRegionBuffer).metadata();
      this.logger.debug(`Original region ${regionIndex}: ${metadata.width}x${metadata.height}`);

      // Save the original cropped region image
      const originalPath = path.join(this.preprocessedDir, `region-${regionIndex}-original-${timestamp}.png`);
      await fse.writeFile(originalPath, textRegionBuffer);
      this.logger.debug(`Saved original region image: ${originalPath}`);

      // Image analysis
      const stats = await sharp(textRegionBuffer).grayscale().stats();
      const meanBrightness = stats.channels[0].mean;
      const stdDev = stats.channels[0].stdev;
      this.logger.debug(`Image stats - region ${regionIndex}: brightness=${meanBrightness.toFixed(1)}, contrast=${stdDev.toFixed(1)}`);

      // Adaptive preprocessing
      const processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex);

      // Save the preprocessed image (after grayscale + contrast adjustment)
      const processedPath = path.join(this.preprocessedDir, `region-${regionIndex}-processed-${timestamp}.png`);
      await fse.writeFile(processedPath, processedBuffer);
      this.logger.debug(`Saved preprocessed image: ${processedPath}`);

      // Aspect-ratio-preserving resize, plus 10 px of blank padding on each side
      const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex);

      // Save the resized image
      const resizedPath = path.join(this.preprocessedDir, `region-${regionIndex}-resized-${timestamp}.png`);
      await fse.writeFile(resizedPath, resizedBuffer);
      this.logger.debug(`Saved resized image: ${resizedPath}`);

      // Build the tensor using the final (padded) dimensions
      const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight);
      this.logger.debug(`Recognition input tensor ready - region ${regionIndex}`);
      return new Tensor('float32', inputData, [1, 3, targetHeight, finalWidth]);
    } catch (error) {
      this.logger.error(`Failed to prepare recognition input - region ${regionIndex}`, error);
      // Fallback: a neutral gray tensor with the expected shape
      return new Tensor('float32', new Float32Array(3 * targetHeight * finalWidth).fill(0.5), [1, 3, targetHeight, finalWidth]);
    }
  }
  // Enhanced, adaptive image preprocessing
  async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) {
    let processedBuffer = buffer;
    try {
      // Finer-grained image analysis
      const stats = await sharp(buffer)
        .grayscale()
        .stats();
      const max = stats.channels[0].max;
      const min = stats.channels[0].min;
      this.logger.debug(`Region ${regionIndex}: detailed stats - range=${min}-${max}, mean=${meanBrightness.toFixed(1)}, stdev=${stdDev.toFixed(1)}`);

      // Choose a preprocessing strategy based on the measurements
      if (meanBrightness > 220 && stdDev < 25) {
        // Bright, low-contrast image
        this.logger.debug(`Region ${regionIndex}: applying bright/low-contrast enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.8, -80)                   // stronger contrast boost
          .normalize({ lower: 5, upper: 95 }) // more aggressive normalization
          .grayscale()
          .toBuffer();
      } else if (meanBrightness < 70) {
        // Dark image
        this.logger.debug(`Region ${regionIndex}: applying low-brightness enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.5, 50)                    // stronger brightness lift
          .normalize()
          .grayscale()
          .toBuffer();
      } else if (stdDev < 15) {
        // Extremely low contrast
        this.logger.debug(`Region ${regionIndex}: applying very-low-contrast enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(2.0, -30)                   // very strong contrast boost
          .normalize({ lower: 1, upper: 99 })
          .grayscale()
          .toBuffer();
      } else if (stdDev > 80) {
        // High-contrast image, possibly over-enhanced
        this.logger.debug(`Region ${regionIndex}: applying high-contrast suppression`);
        processedBuffer = await sharp(buffer)
          .linear(0.8, 20)                    // reduce contrast
          .normalize()
          .grayscale()
          .toBuffer();
      } else {
        // Standard processing
        this.logger.debug(`Region ${regionIndex}: applying standard enhancement`);
        processedBuffer = await sharp(buffer)
          .linear(1.3, -15)                   // moderate contrast boost
          .normalize({ lower: 10, upper: 90 })
          .grayscale()
          .toBuffer();
      }

      // Sharpen to emphasize character edges
      processedBuffer = await sharp(processedBuffer)
        .sharpen({
          sigma: 1.2,
          m1: 1.5,
          m2: 0.7
        })
        .toBuffer();
    } catch (error) {
      this.logger.error(`Region ${regionIndex}: preprocessing failed`, error);
      // Fall back to basic processing
      processedBuffer = await sharp(buffer)
        .normalize()
        .grayscale()
        .toBuffer();
    }
    return processedBuffer;
  }
  async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0) {
    const metadata = await sharp(buffer).metadata();
    const originalAspectRatio = metadata.width / metadata.height;
    const targetAspectRatio = targetWidth / targetHeight;

    let resizeWidth, resizeHeight;
    if (originalAspectRatio > targetAspectRatio) {
      // Width-bound: scale by width
      resizeWidth = targetWidth;
      resizeHeight = Math.round(targetWidth / originalAspectRatio);
    } else {
      // Height-bound: scale by height
      resizeHeight = targetHeight;
      resizeWidth = Math.round(targetHeight * originalAspectRatio);
    }
    resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
    resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));
    this.logger.debug(`Region ${regionIndex}: resizing ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`);

    // Offsets for centering inside the target canvas
    const offsetX = Math.floor((targetWidth - resizeWidth) / 2);
    const offsetY = Math.floor((targetHeight - resizeHeight) / 2);
    this.logger.debug(`Region ${regionIndex}: centering offsets X=${offsetX}, Y=${offsetY}`);

    // Resize first, then pad so the text is centered on the target canvas
    let resizedBuffer = await sharp(buffer)
      .resize(resizeWidth, resizeHeight, {
        fit: 'contain',
        background: { r: 255, g: 255, b: 255 }
      })
      .extend({
        top: offsetY,
        bottom: targetHeight - resizeHeight - offsetY,
        left: offsetX,
        right: targetWidth - resizeWidth - offsetX,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();

    // Add 10 px of blank padding on the left and right
    const finalWidth = targetWidth + 20;
    const finalHeight = targetHeight;
    resizedBuffer = await sharp(resizedBuffer)
      .extend({
        top: 0,
        bottom: 0,
        left: 10,
        right: 10,
        background: { r: 255, g: 255, b: 255 }
      })
      .png()
      .toBuffer();
    this.logger.debug(`Region ${regionIndex}: final size ${finalWidth}x${finalHeight} (10 px padding on each side)`);
    return resizedBuffer;
  }
  async bufferToTensor(buffer, width, height) {
    // Read the actual image dimensions (the padded width is 20 px larger than the target)
    const metadata = await sharp(buffer).metadata();
    const actualWidth = metadata.width;
    const actualHeight = metadata.height;

    const imageData = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });

    // Build the tensor from the actual dimensions
    const inputData = new Float32Array(3 * actualHeight * actualWidth);
    const data = imageData.data;
    // Step by the real channel count (grayscale+alpha or RGBA) instead of assuming 4
    const channels = imageData.info.channels;
    for (let i = 0; i < data.length; i += channels) {
      const pixelIndex = Math.floor(i / channels);
      const y = Math.floor(pixelIndex / actualWidth);
      const x = pixelIndex % actualWidth;
      // Fill all three tensor channels with the first channel's grayscale value
      const grayValue = data[i] / 255.0;
      for (let c = 0; c < 3; c++) {
        const inputIndex = c * actualHeight * actualWidth + y * actualWidth + x;
        if (inputIndex < inputData.length) {
          inputData[inputIndex] = grayValue;
        }
      }
    }
    return inputData;
  }
  postprocessRecognition(outputs) {
    this.logger.debug('Starting recognition post-processing');
    try {
      const outputNames = this.recSession.outputNames;
      const recognitionOutput = outputs[outputNames[0]];
      if (!recognitionOutput) {
        this.logger.debug('Recognition output is empty');
        return { text: '', confidence: 0 };
      }
      const data = recognitionOutput.data;
      const [batch, seqLen, vocabSize] = recognitionOutput.dims;
      this.logger.debug(`Sequence length: ${seqLen}, vocabulary size: ${vocabSize}, character set size: ${this.characterSet.length}`);

      if (this.characterSet.length === 0) {
        this.logger.error('Character set is empty');
        return { text: '', confidence: 0 };
      }

      // Sanity-check that the vocabulary size matches the character set plus the CTC blank
      if (vocabSize !== this.characterSet.length + 1) {
        this.logger.warn(`Vocabulary size (${vocabSize}) does not match character set size (${this.characterSet.length}); recognition quality may suffer`);
      }

      const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize);
      this.logger.debug(`Decoded result: "${text}", confidence: ${confidence.toFixed(4)}`);
      return { text, confidence };
    } catch (error) {
      this.logger.error('Recognition post-processing failed', error);
      return { text: '', confidence: 0 };
    }
  }
  ctcDecode(data, seqLen, vocabSize) {
    let text = '';
    let confidenceSum = 0;
    let charCount = 0;

    // Dynamic threshold adjustment
    const baseThreshold = 0.03;
    let confidenceThreshold = baseThreshold;

    // Analyze the confidence distribution across the sequence
    let sumProb = 0;
    let probCount = 0;
    for (let t = 0; t < seqLen; t++) {
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > 0.01) { // only count meaningful probabilities
          sumProb += prob;
          probCount++;
        }
      }
    }
    const avgProb = probCount > 0 ? sumProb / probCount : 0;

    // Adjust the threshold based on the sequence characteristics
    if (avgProb < 0.3) {
      confidenceThreshold = baseThreshold * 0.5;
    } else if (avgProb > 0.7) {
      confidenceThreshold = baseThreshold * 1.5;
    }
    this.logger.debug(`Sequence stats: average probability=${avgProb.toFixed(4)}, decoding threshold: ${confidenceThreshold.toFixed(4)}`);

    // Beam-search decoding
    const beamWidth = 5;
    let beams = [{ text: '', confidence: 1.0, lastChar: -1 }];
    for (let t = 0; t < seqLen; t++) {
      const newBeams = [];

      // Collect the top-k characters for this time step
      const topK = [];
      for (let i = 0; i < vocabSize; i++) {
        const prob = data[t * vocabSize + i];
        if (prob > confidenceThreshold) {
          topK.push({ index: i, prob });
        }
      }
      // Sort by probability
      topK.sort((a, b) => b.prob - a.prob);
      const candidates = topK.slice(0, beamWidth);

      // If no candidate clears the threshold, keep the existing beams for this step
      if (candidates.length === 0) {
        continue;
      }

      // Expand each beam with each candidate character
      for (const beam of beams) {
        for (const candidate of candidates) {
          const charIndex = candidate.index;
          if (charIndex === 0) {
            // CTC blank
            newBeams.push({
              text: beam.text,
              confidence: beam.confidence,
              lastChar: -1
            });
          } else {
            const actualCharIndex = charIndex - 1;
            if (actualCharIndex < this.characterSet.length) {
              const char = this.characterSet[actualCharIndex];
              let newText = beam.text;
              // Collapse repeated characters
              if (charIndex !== beam.lastChar) {
                newText += char;
              }
              newBeams.push({
                text: newText,
                confidence: beam.confidence * candidate.prob,
                lastChar: charIndex
              });
            }
          }
        }
      }

      // Keep the top beamWidth beams
      newBeams.sort((a, b) => b.confidence - a.confidence);
      beams = newBeams.slice(0, beamWidth);
    }

    // Pick the best beam
    if (beams.length > 0) {
      const bestBeam = beams[0];
      text = bestBeam.text;
      // Average confidence (geometric mean over the decoded characters)
      const textLength = text.length;
      if (textLength > 0) {
        confidenceSum = Math.pow(bestBeam.confidence, 1 / textLength);
        charCount = textLength;
      }
    }
    const avgConfidence = charCount > 0 ? confidenceSum : 0;
    return {
      text: text,
      confidence: avgConfidence
    };
  }
}

export default TextRecognizer;
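
// Example usage (illustrative sketch, not part of the original module). It assumes a
// CTC-style recognition model at './models/rec.onnx' and a dictionary file at
// './models/keys.txt'; both paths and the InferenceSession setup shown here are
// assumptions, not values taken from this codebase.
//
// import { InferenceSession } from 'onnxruntime-node';
// import fse from 'fs-extra';
// import TextRecognizer from './server/utils/textRecognizer.js';
//
// const recognizer = new TextRecognizer();
// const recSession = await InferenceSession.create('./models/rec.onnx');
// recognizer.initialize(recSession, {});
// await recognizer.loadCharacterSet('./models/keys.txt');
//
// const regionBuffer = await fse.readFile('./temp/debug/region-0.png');
// const { text, confidence } = await recognizer.recognizeText(regionBuffer, 0);
// console.log(text, confidence);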