// Electron-vue3-ts-offline/server/utils/textRecognizer.js

// server/utils/textRecognizer.js
import { Tensor } from 'onnxruntime-node';
import sharp from 'sharp';
import fse from 'fs-extra';
import * as path from 'path';

/**
 * Runs a text-recognition ONNX session on cropped text regions.
 *
 * Pipeline per region: adaptive contrast enhancement -> aspect-preserving
 * resize onto a white canvas -> CHW float tensor -> session.run -> CTC decode
 * against the loaded character set.
 */
class TextRecognizer {
    /**
     * Sets up empty state, the console-backed logger and makes sure the
     * debug / preprocessed output directories exist (synchronously).
     */
    constructor() {
        this.recSession = null;
        this.config = null;
        this.characterSet = [];
        this.debugDir = path.join(process.cwd(), 'temp', 'debug');
        this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed');
        this.logger = {
            info: (msg, ...args) => console.log(`🔤 [识别] ${msg}`, ...args),
            error: (msg, ...args) => console.error(`❌ [识别] ${msg}`, ...args),
            debug: (msg, ...args) => console.log(`🐛 [识别] ${msg}`, ...args),
            warn: (msg, ...args) => console.warn(`🐛 [识别] ${msg}`, ...args)
        };

        // Make sure the output directories exist before anything is written.
        fse.ensureDirSync(this.debugDir);
        fse.ensureDirSync(this.preprocessedDir);
    }

    /**
     * Stores the inference session and configuration used by later calls.
     *
     * @param {object} recSession - Session object exposing `run`, `inputNames` and `outputNames`.
     * @param {object} config - Recognizer configuration (stored, not read by this class).
     */
    initialize(recSession, config) {
        this.recSession = recSession;
        this.config = config;
        this.logger.info('文本识别器初始化完成');
    }

    /**
     * Parses the text of a dictionary file into an ordered list of entries.
     *
     * Each non-empty line is one entry (whitespace-trimmed, de-duplicated,
     * order preserved). Lines that start with `#` and contain more text are
     * treated as comments, but a line that is exactly `#` is kept: it is the
     * `#` character itself, and dropping it would shift the index of every
     * entry after it.
     *
     * @param {string} keysContent - Raw dictionary file content.
     * @returns {string[]} Dictionary entries in file order.
     */
    parseCharacterSet(keysContent) {
        const uniqueChars = new Set();

        for (const line of keysContent.split('\n')) {
            const trimmed = line.trim();
            if (!trimmed) {
                continue;
            }
            const isComment = trimmed.startsWith('#') && trimmed.length > 1;
            if (!isComment) {
                uniqueChars.add(trimmed);
            }
        }

        return Array.from(uniqueChars);
    }

    /**
     * Loads the recognition dictionary from disk into `this.characterSet`
     * and logs a breakdown of the loaded entries.
     *
     * @param {string} keysPath - Path to the UTF-8 dictionary file.
     * @returns {Promise<void>}
     * @throws {Error} When the file cannot be read or yields no entries.
     */
    async loadCharacterSet(keysPath) {
        try {
            const keysContent = await fse.readFile(keysPath, 'utf8');
            this.characterSet = this.parseCharacterSet(keysContent);

            if (this.characterSet.length === 0) {
                throw new Error('字符集文件为空或格式不正确');
            }

            this.logger.info(`字符集加载完成: ${this.characterSet.length}个字符`);

            // Character-class statistics, for diagnostics only.
            const charTypes = {
                chinese: 0,
                english: 0,
                digit: 0,
                punctuation: 0,
                other: 0
            };

            for (const char of this.characterSet) {
                if (/[\u4e00-\u9fff]/.test(char)) {
                    charTypes.chinese++;
                } else if (/[a-zA-Z]/.test(char)) {
                    charTypes.english++;
                } else if (/[0-9]/.test(char)) {
                    charTypes.digit++;
                } else if (/[，。！？；：""（）【】《》…—·]/.test(char)) {
                    charTypes.punctuation++;
                } else {
                    charTypes.other++;
                }
            }

            this.logger.debug(`字符集统计: 中文${charTypes.chinese}, 英文${charTypes.english}, 数字${charTypes.digit}, 标点${charTypes.punctuation}, 其他${charTypes.other}`);
            this.logger.debug(`前20个字符: ${this.characterSet.slice(0, 20).join('')}`);

        } catch (error) {
            this.logger.error('加载字符集失败', error.message);
            // There is no built-in fallback dictionary: surface the failure,
            // keeping the original error attached as the cause.
            throw new Error(`字符集加载失败: ${error.message}`, { cause: error });
        }
    }

    /**
     * @returns {number} Number of entries currently loaded in the character set.
     */
    getCharacterSetSize() {
        return this.characterSet.length;
    }

    /**
     * Recognizes the text inside one cropped region image.
     *
     * Never rejects: any failure is logged and reported as an empty result.
     *
     * @param {Buffer} textRegionBuffer - Encoded image of the text region.
     * @param {number} [regionIndex=0] - Index used in log lines and debug file names.
     * @returns {Promise<{text: string, confidence: number}>}
     */
    async recognizeText(textRegionBuffer, regionIndex = 0) {
        const startTime = Date.now();
        this.logger.info(`开始文本识别 - 区域 ${regionIndex}`);

        try {
            const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex);
            const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
            const result = this.postprocessRecognition(outputs);

            const processingTime = Date.now() - startTime;
            this.logger.info(`识别完成 - 区域 ${regionIndex}: "${result.text}", 置信度: ${result.confidence.toFixed(4)}, 耗时: ${processingTime}ms`);

            return result;

        } catch (error) {
            this.logger.error(`文本识别失败 - 区域 ${regionIndex}`, error);
            return { text: '', confidence: 0 };
        }
    }

    /**
     * Writes an intermediate image into `preprocessedDir` for inspection.
     *
     * Best-effort: a failed write is logged as a warning and otherwise
     * ignored, so a disk problem cannot abort recognition.
     *
     * @param {string} fileName - File name inside `preprocessedDir`.
     * @param {Buffer} buffer - Image bytes to write.
     * @param {string} label - Log prefix describing the image.
     * @returns {Promise<void>}
     */
    async saveDebugImage(fileName, buffer, label) {
        const filePath = path.join(this.preprocessedDir, fileName);
        try {
            await fse.writeFile(filePath, buffer);
            this.logger.debug(`${label}: ${filePath}`);
        } catch (error) {
            this.logger.warn(`${label}失败: ${filePath}`, error.message);
        }
    }

    /**
     * Turns a cropped region image into the `[1, 3, 48, 340]` float32 input
     * tensor (48 px high, 320 px wide plus 10 px white padding on each side).
     *
     * Intermediate images (original, enhanced, resized) are dumped to
     * `preprocessedDir`. If preprocessing itself fails, a constant 0.5-filled
     * tensor of the same shape is returned instead of throwing.
     *
     * @param {Buffer} textRegionBuffer - Encoded image of the text region.
     * @param {number} [regionIndex=0] - Index used in log lines and debug file names.
     * @returns {Promise<Tensor>}
     */
    async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) {
        this.logger.debug(`准备识别输入 - 区域 ${regionIndex}`);

        const targetHeight = 48;
        const targetWidth = 320; // content width before padding
        const sidePadding = 10; // white margin added on the left and on the right
        const finalWidth = targetWidth + 2 * sidePadding;
        const timestamp = Date.now();

        try {
            const metadata = await sharp(textRegionBuffer).metadata();
            this.logger.debug(`原始区域 ${regionIndex}: ${metadata.width}x${metadata.height}`);

            await this.saveDebugImage(`region-${regionIndex}-original-${timestamp}.png`, textRegionBuffer, '保存原始区域图像');

            // Brightness / contrast of the grayscale image drive the enhancement choice.
            const stats = await sharp(textRegionBuffer).grayscale().stats();
            const meanBrightness = stats.channels[0].mean;
            const stdDev = stats.channels[0].stdev;

            this.logger.debug(`图像统计 - 区域 ${regionIndex}: 亮度=${meanBrightness.toFixed(1)}, 对比度=${stdDev.toFixed(1)}`);

            const processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex);
            await this.saveDebugImage(`region-${regionIndex}-processed-${timestamp}.png`, processedBuffer, '保存预处理图像');

            const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex, sidePadding);
            await this.saveDebugImage(`region-${regionIndex}-resized-${timestamp}.png`, resizedBuffer, '保存调整大小图像');

            const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight);
            this.logger.debug(`识别输入张量准备完成 - 区域 ${regionIndex}`);

            return new Tensor('float32', inputData, [1, 3, targetHeight, finalWidth]);

        } catch (error) {
            this.logger.error(`准备识别输入失败 - 区域 ${regionIndex}`, error);
            // Deliberate best-effort: hand the model a neutral gray image.
            return new Tensor('float32', new Float32Array(3 * targetHeight * finalWidth).fill(0.5), [1, 3, targetHeight, finalWidth]);
        }
    }

    /**
     * Chooses the contrast-enhancement parameters for an image from its
     * grayscale mean and standard deviation. First matching rule wins.
     *
     * @param {number} meanBrightness - Mean gray level.
     * @param {number} stdDev - Gray-level standard deviation.
     * @returns {{label: string, gain: number, bias: number, normalize: (object|undefined)}}
     *   `gain`/`bias` are the arguments for `linear`; `normalize` holds the
     *   options for `normalize`, or `undefined` to call it without options.
     */
    selectEnhancement(meanBrightness, stdDev) {
        if (meanBrightness > 220 && stdDev < 25) {
            // Bright, washed-out image.
            return { label: '高亮度低对比度增强', gain: 1.8, bias: -80, normalize: { lower: 5, upper: 95 } };
        }
        if (meanBrightness < 70) {
            // Dark image.
            return { label: '低亮度增强', gain: 1.5, bias: 50, normalize: undefined };
        }
        if (stdDev < 15) {
            // Extremely low contrast.
            return { label: '极低对比度增强', gain: 2.0, bias: -30, normalize: { lower: 1, upper: 99 } };
        }
        if (stdDev > 80) {
            // Already very contrasty: tone it down.
            return { label: '高对比度抑制', gain: 0.8, bias: 20, normalize: undefined };
        }
        return { label: '标准增强', gain: 1.3, bias: -15, normalize: { lower: 10, upper: 90 } };
    }

    /**
     * Applies brightness/contrast-dependent enhancement followed by
     * sharpening. If any step throws, falls back to plain
     * normalize + grayscale.
     *
     * @param {Buffer} buffer - Encoded source image.
     * @param {number} meanBrightness - Mean gray level of the image.
     * @param {number} stdDev - Gray-level standard deviation of the image.
     * @param {number} [regionIndex=0] - Index used in log lines.
     * @returns {Promise<Buffer>} Enhanced image.
     */
    async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) {
        try {
            // Extra pass only to log the gray-level range.
            const stats = await sharp(buffer)
                .grayscale()
                .stats();
            const { min, max } = stats.channels[0];

            this.logger.debug(`区域 ${regionIndex}: 详细统计 - 范围=${min}-${max}, 均值=${meanBrightness.toFixed(1)}, 标准差=${stdDev.toFixed(1)}`);

            const enhancement = this.selectEnhancement(meanBrightness, stdDev);
            this.logger.debug(`区域 ${regionIndex}: 应用${enhancement.label}`);

            const linearized = sharp(buffer).linear(enhancement.gain, enhancement.bias);
            const normalized = enhancement.normalize
                ? linearized.normalize(enhancement.normalize)
                : linearized.normalize();
            const enhancedBuffer = await normalized.grayscale().toBuffer();

            // Sharpen to strengthen glyph edges.
            return await sharp(enhancedBuffer)
                .sharpen({
                    sigma: 1.2,
                    m1: 1.5,
                    m2: 0.7
                })
                .toBuffer();

        } catch (error) {
            this.logger.error(`区域 ${regionIndex}: 预处理失败`, error);
            // Fall back to the most basic processing.
            return sharp(buffer)
                .normalize()
                .grayscale()
                .toBuffer();
        }
    }

    /**
     * Scales the image to fit inside `targetWidth` x `targetHeight` keeping
     * its aspect ratio, centers it on a white canvas of that size and adds
     * `sidePadding` white pixels on the left and on the right.
     *
     * @param {Buffer} buffer - Encoded source image.
     * @param {number} targetWidth - Canvas width before side padding.
     * @param {number} targetHeight - Canvas height.
     * @param {number} [regionIndex=0] - Index used in log lines.
     * @param {number} [sidePadding=10] - Extra white pixels on each horizontal side.
     * @returns {Promise<Buffer>} PNG of size `(targetWidth + 2 * sidePadding)` x `targetHeight`.
     * @throws {Error} When the image dimensions cannot be read.
     */
    async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0, sidePadding = 10) {
        const metadata = await sharp(buffer).metadata();
        if (!(metadata.width > 0) || !(metadata.height > 0)) {
            throw new Error(`Invalid image dimensions: ${metadata.width}x${metadata.height}`);
        }

        const originalAspectRatio = metadata.width / metadata.height;
        const targetAspectRatio = targetWidth / targetHeight;

        let resizeWidth;
        let resizeHeight;
        if (originalAspectRatio > targetAspectRatio) {
            // Width-bound: fill the width, derive the height.
            resizeWidth = targetWidth;
            resizeHeight = Math.round(targetWidth / originalAspectRatio);
        } else {
            // Height-bound: fill the height, derive the width.
            resizeHeight = targetHeight;
            resizeWidth = Math.round(targetHeight * originalAspectRatio);
        }

        resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
        resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));

        this.logger.debug(`区域 ${regionIndex}: 调整尺寸 ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`);

        // Offsets that center the scaled image on the canvas.
        const offsetX = Math.floor((targetWidth - resizeWidth) / 2);
        const offsetY = Math.floor((targetHeight - resizeHeight) / 2);

        this.logger.debug(`区域 ${regionIndex}: 居中偏移 X=${offsetX}, Y=${offsetY}`);

        const white = { r: 255, g: 255, b: 255 };

        // Centering margins and side padding are applied in a single extend,
        // avoiding an intermediate PNG encode/decode.
        const resizedBuffer = await sharp(buffer)
            .resize(resizeWidth, resizeHeight, {
                fit: 'contain',
                background: white
            })
            .extend({
                top: offsetY,
                bottom: targetHeight - resizeHeight - offsetY,
                left: offsetX + sidePadding,
                right: targetWidth - resizeWidth - offsetX + sidePadding,
                background: white
            })
            .png()
            .toBuffer();

        const finalWidth = targetWidth + 2 * sidePadding;
        const finalHeight = targetHeight;

        this.logger.debug(`区域 ${regionIndex}: 最终尺寸 ${finalWidth}x${finalHeight} (左右各加${sidePadding}像素空白)`);

        return resizedBuffer;
    }

    /**
     * Converts an encoded image into planar (channel-major) float data: three
     * identical planes holding the first channel of each pixel scaled to [0, 1].
     *
     * The planes are sized from the decoded image itself; `width` / `height`
     * are only used to warn when the caller's expectation does not match.
     *
     * @param {Buffer} buffer - Encoded image.
     * @param {number} width - Width the caller expects.
     * @param {number} height - Height the caller expects.
     * @returns {Promise<Float32Array>} Data of length `3 * actualHeight * actualWidth`.
     */
    async bufferToTensor(buffer, width, height) {
        const { data, info } = await sharp(buffer)
            .ensureAlpha()
            .raw()
            .toBuffer({ resolveWithObject: true });

        const actualWidth = info.width;
        const actualHeight = info.height;
        const channels = info.channels;

        const widthMismatch = Number.isFinite(width) && width !== actualWidth;
        const heightMismatch = Number.isFinite(height) && height !== actualHeight;
        if (widthMismatch || heightMismatch) {
            this.logger.warn(`bufferToTensor: expected ${width}x${height}, got ${actualWidth}x${actualHeight}`);
        }

        const planeSize = actualHeight * actualWidth;
        const inputData = new Float32Array(3 * planeSize);

        for (let pixel = 0; pixel < planeSize; pixel++) {
            // The first channel is used as the gray value for all three planes.
            const grayValue = data[pixel * channels] / 255.0;
            inputData[pixel] = grayValue;
            inputData[planeSize + pixel] = grayValue;
            inputData[2 * planeSize + pixel] = grayValue;
        }

        return inputData;
    }

    /**
     * Decodes the first session output into text.
     *
     * Never throws: failures are logged and reported as an empty result.
     *
     * @param {object} outputs - Map from output name to tensor (`data`, `dims`).
     * @returns {{text: string, confidence: number}}
     */
    postprocessRecognition(outputs) {
        this.logger.debug('开始识别后处理');

        try {
            const outputNames = this.recSession.outputNames;
            const recognitionOutput = outputs[outputNames[0]];

            if (!recognitionOutput) {
                this.logger.debug('识别输出为空');
                return { text: '', confidence: 0 };
            }

            const data = recognitionOutput.data;
            // dims are read as [batch, seqLen, vocabSize]; batch is not used.
            const [, seqLen, vocabSize] = recognitionOutput.dims;

            this.logger.debug(`序列长度: ${seqLen}, 词汇表大小: ${vocabSize}, 字符集大小: ${this.characterSet.length}`);

            if (this.characterSet.length === 0) {
                this.logger.error('字符集为空');
                return { text: '', confidence: 0 };
            }

            // Class 0 is the blank, so the model is expected to have one more
            // class than the dictionary has entries.
            if (vocabSize !== this.characterSet.length + 1) {
                this.logger.warn(`词汇表大小(${vocabSize})与字符集大小(${this.characterSet.length})不匹配，可能影响识别效果`);
            }

            const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize);
            this.logger.debug(`解码结果: "${text}", 置信度: ${confidence.toFixed(4)}`);

            return { text, confidence };

        } catch (error) {
            this.logger.error('识别后处理失败', error);
            return { text: '', confidence: 0 };
        }
    }

    /**
     * Best-path beam decoding of a CTC output.
     *
     * `data` is read row-major as `seqLen` rows of `vocabSize` values, which
     * are treated as probabilities. Class 0 is the blank; class `k > 0` maps
     * to `characterSet[k - 1]`. Classes beyond the character set are carried
     * along without emitting text.
     *
     * Scoring is done in log space over every frame of a path, blanks
     * included, so paths are compared on equal terms. Beams that end in the
     * same state (same text, same last class) are merged, keeping the best.
     * When no class exceeds the pruning threshold at a frame, the most
     * probable class is used so the beam never runs empty.
     *
     * @param {ArrayLike<number>} data - Flattened per-frame class probabilities.
     * @param {number} seqLen - Number of frames.
     * @param {number} vocabSize - Number of classes per frame.
     * @returns {{text: string, confidence: number}} Decoded text and the
     *   geometric mean of the non-blank frame probabilities on the chosen
     *   path (0 when no text was decoded).
     */
    ctcDecode(data, seqLen, vocabSize) {
        const BLANK_INDEX = 0;
        const beamWidth = 5;
        const baseThreshold = 0.03;
        let confidenceThreshold = baseThreshold;

        // Average of the non-negligible probabilities drives the threshold.
        let sumProb = 0;
        let probCount = 0;
        const totalValues = seqLen * vocabSize;
        for (let i = 0; i < totalValues; i++) {
            const prob = data[i];
            if (prob > 0.01) {
                sumProb += prob;
                probCount++;
            }
        }

        const avgProb = probCount > 0 ? sumProb / probCount : 0;

        if (avgProb < 0.3) {
            confidenceThreshold = baseThreshold * 0.5;
        } else if (avgProb > 0.7) {
            confidenceThreshold = baseThreshold * 1.5;
        }

        this.logger.debug(`序列统计: 平均概率=${avgProb.toFixed(4)}, 使用解码阈值: ${confidenceThreshold.toFixed(4)}`);

        // score: log-probability of the whole path (used for ranking).
        // charLogProb / charFrames: log-probability sum and count of the
        // non-blank, in-dictionary frames (used for the reported confidence).
        let beams = [{ text: '', score: 0, charLogProb: 0, charFrames: 0, lastChar: -1 }];

        for (let t = 0; t < seqLen; t++) {
            const rowOffset = t * vocabSize;

            const aboveThreshold = [];
            let bestIndex = BLANK_INDEX;
            let bestProb = -Infinity;
            for (let i = 0; i < vocabSize; i++) {
                const prob = data[rowOffset + i];
                if (prob > bestProb) {
                    bestProb = prob;
                    bestIndex = i;
                }
                if (prob > confidenceThreshold) {
                    aboveThreshold.push({ index: i, prob });
                }
            }

            aboveThreshold.sort((a, b) => b.prob - a.prob);
            const candidates = aboveThreshold.length > 0
                ? aboveThreshold.slice(0, beamWidth)
                : [{ index: bestIndex, prob: bestProb }];

            const expanded = new Map();
            for (const beam of beams) {
                for (const { index, prob } of candidates) {
                    // Clamp so that a zero probability stays finite in log space.
                    const logProb = Math.log(Math.max(prob, Number.MIN_VALUE));
                    const isMappedChar = index !== BLANK_INDEX && index - 1 < this.characterSet.length;
                    // Consecutive repeats of one class collapse into one character.
                    const emitsChar = isMappedChar && index !== beam.lastChar;

                    const next = {
                        text: emitsChar ? beam.text + this.characterSet[index - 1] : beam.text,
                        score: beam.score + logProb,
                        charLogProb: isMappedChar ? beam.charLogProb + logProb : beam.charLogProb,
                        charFrames: isMappedChar ? beam.charFrames + 1 : beam.charFrames,
                        lastChar: index === BLANK_INDEX ? -1 : index
                    };

                    const stateKey = `${next.lastChar}\u0000${next.text}`;
                    const existing = expanded.get(stateKey);
                    if (!existing || next.score > existing.score) {
                        expanded.set(stateKey, next);
                    }
                }
            }

            beams = Array.from(expanded.values())
                .sort((a, b) => b.score - a.score)
                .slice(0, beamWidth);
        }

        const bestBeam = beams[0];
        if (!bestBeam || bestBeam.text.length === 0 || bestBeam.charFrames === 0) {
            return { text: bestBeam ? bestBeam.text : '', confidence: 0 };
        }

        return {
            text: bestBeam.text,
            confidence: Math.exp(bestBeam.charLogProb / bestBeam.charFrames)
        };
    }

}

export default TextRecognizer;