diff --git a/server/server.ts b/server/server.ts index 257bf37..0c68c24 100644 --- a/server/server.ts +++ b/server/server.ts @@ -325,35 +325,6 @@ app.get('/api/files/:id/preview', async (req, res) => { res.status(500).json({ error: 'Failed to get file preview' }); } }); -// 更新批量OCR接口 -app.post('/api/ocr/batch-recognize', async (req, res) => { - try { - const { fileIds, config } = req.body; - - if (!fileIds || !Array.isArray(fileIds)) { - return res.status(400).json({ error: 'File IDs array is required' }); - } - - const filePaths = []; - for (const fileId of fileIds) { - const file = await fileService.getFileById(parseInt(fileId)); - if (file) { - filePaths.push(file.filePath); - } - } - - const results = await onnxOcrManager.batchRecognize(filePaths, config); - - res.json({ - success: true, - data: results - }); - - } catch (error) { - console.error('批量ONNX OCR识别失败:', error); - res.status(500).json({ error: '批量识别失败: ' + error.message }); - } -}); // 获取预处理后的图片 app.get('/api/ocr/processed-image', async (req, res) => { try { diff --git a/server/utils/detectionProcessor.js b/server/utils/detectionProcessor.js index bfe2c28..bfe4dcb 100644 --- a/server/utils/detectionProcessor.js +++ b/server/utils/detectionProcessor.js @@ -6,27 +6,41 @@ class DetectionProcessor { constructor() { this.session = null; this.config = null; + this.logger = { + info: (msg, ...args) => console.log(`🔍 [检测] ${msg}`, ...args), + error: (msg, ...args) => console.error(`❌ [检测] ${msg}`, ...args), + debug: (msg, ...args) => console.log(`🐛 [检测] ${msg}`, ...args) + }; } initialize(session, config) { this.session = session; this.config = config; + this.logger.info('检测处理器初始化完成'); } async detectText(processedImage) { + const startTime = Date.now(); + this.logger.info('开始文本检测'); + try { const inputTensor = await this.prepareDetectionInput(processedImage); const outputs = await this.session.run({ [this.session.inputNames[0]]: inputTensor }); const textBoxes = this.postprocessDetection(outputs, processedImage); + + const processingTime = Date.now() - startTime; + this.logger.info(`检测完成: ${textBoxes.length}个区域, 耗时${processingTime}ms`); + return textBoxes; } catch (error) { - console.error('文本检测失败:', error); + this.logger.error('检测失败', error); return []; } } async prepareDetectionInput(processedImage) { const { buffer, width, height } = processedImage; + this.logger.debug(`准备检测输入: ${width}x${height}`); const imageData = await sharp(buffer) .ensureAlpha() @@ -37,111 +51,155 @@ class DetectionProcessor { const data = imageData.data; const channels = imageData.info.channels; + // 优化数据填充逻辑 for (let i = 0; i < data.length; i += channels) { const pixelIndex = Math.floor(i / channels); - const channel = Math.floor(pixelIndex / (height * width)); - const posInChannel = pixelIndex % (height * width); - - if (channel < 3) { - const y = Math.floor(posInChannel / width); - const x = posInChannel % width; - const inputIndex = channel * height * width + y * width + x; + const y = Math.floor(pixelIndex / width); + const x = pixelIndex % width; + for (let c = 0; c < 3; c++) { + const inputIndex = c * height * width + y * width + x; if (inputIndex < inputData.length) { inputData[inputIndex] = data[i] / 255.0; } } } + this.logger.debug('检测输入张量准备完成'); return new Tensor('float32', inputData, [1, 3, height, width]); } postprocessDetection(outputs, processedImage) { + this.logger.debug('开始检测后处理'); + try { const boxes = []; const outputNames = this.session.outputNames; const detectionOutput = outputs[outputNames[0]]; if (!detectionOutput) { + this.logger.debug('检测输出为空'); return boxes; } const [batch, channels, height, width] = detectionOutput.dims; const data = detectionOutput.data; - // 降低检测阈值,提高召回率 - const threshold = this.config.detThresh || 0.05; - const points = []; + // 动态阈值调整 + const baseThreshold = this.config.detThresh || 0.05; + const adaptiveThreshold = this.calculateAdaptiveThreshold(data, baseThreshold); - // 改进的点收集逻辑 - for (let y = 0; y < height; y++) { - for (let x = 0; x < width; x++) { - const idx = y * width + x; - const prob = data[idx]; - if (prob > threshold) { - points.push({ - x, - y, - prob, - localMax: this.isLocalMaximum(data, x, y, width, height, 2) - }); - } - } - } + this.logger.debug(`使用检测阈值: ${adaptiveThreshold.toFixed(4)}`); + + const points = this.collectDetectionPoints(data, width, height, adaptiveThreshold); if (points.length === 0) { + this.logger.debug('未检测到有效文本点'); return boxes; } - // 改进的聚类算法 - const clusters = this.enhancedCluster(points, 8); + this.logger.debug(`收集到 ${points.length} 个检测点`); + const clusters = this.enhancedCluster(points, this.config.clusterDistance || 8); + this.logger.debug(`聚类得到 ${clusters.length} 个区域`); - for (const cluster of clusters) { - // 降低最小点数要求 - if (cluster.length < 2) continue; + const validBoxes = this.filterAndScaleBoxes(clusters, processedImage); + this.logger.info(`生成 ${validBoxes.length} 个有效文本框`); - const minX = Math.min(...cluster.map(p => p.x)); - const maxX = Math.max(...cluster.map(p => p.x)); - const minY = Math.min(...cluster.map(p => p.y)); - const maxY = Math.max(...cluster.map(p => p.y)); - - const boxWidth = maxX - minX; - const boxHeight = maxY - minY; - - // 放宽尺寸限制 - if (boxWidth < 2 || boxHeight < 2) continue; - - const aspectRatio = boxWidth / boxHeight; - // 放宽宽高比限制 - if (aspectRatio > 100 || aspectRatio < 0.01) continue; - - const avgConfidence = cluster.reduce((sum, p) => sum + p.prob, 0) / cluster.length; - - // 降低框置信度阈值 - const boxThreshold = this.config.detBoxThresh || 0.1; - if (avgConfidence > boxThreshold) { - const box = this.scaleBoxToProcessedImage({ - x1: minX, y1: minY, - x2: maxX, y2: minY, - x3: maxX, y3: maxY, - x4: minX, y4: maxY - }, processedImage); - box.confidence = avgConfidence; - boxes.push(box); - } - } - - boxes.sort((a, b) => b.confidence - a.confidence); - console.log(`✅ 检测到 ${boxes.length} 个文本区域`); - return boxes; + return validBoxes.sort((a, b) => b.confidence - a.confidence); } catch (error) { - console.error('检测后处理错误:', error); + this.logger.error('检测后处理错误', error); return []; } } - // 添加局部最大值检测 + collectDetectionPoints(data, width, height, threshold) { + const points = []; + let totalProb = 0; + let maxProb = 0; + + for (let y = 0; y < height; y++) { + for (let x = 0; x < width; x++) { + const idx = y * width + x; + const prob = data[idx]; + + if (prob > threshold) { + totalProb += prob; + maxProb = Math.max(maxProb, prob); + points.push({ + x, y, prob, + localMax: this.isLocalMaximum(data, x, y, width, height, 2) + }); + } + } + } + + if (points.length > 0) { + this.logger.debug(`检测点统计: 平均置信度 ${(totalProb/points.length).toFixed(4)}, 最大置信度 ${maxProb.toFixed(4)}`); + } + + return points; + } + + calculateAdaptiveThreshold(data, baseThreshold) { + // 基于图像特性动态调整阈值 + let sum = 0; + let count = 0; + const sampleSize = Math.min(1000, data.length); + + for (let i = 0; i < sampleSize; i++) { + const idx = Math.floor(Math.random() * data.length); + if (data[idx] > baseThreshold) { + sum += data[idx]; + count++; + } + } + + if (count === 0) return baseThreshold; + + const mean = sum / count; + return Math.min(baseThreshold * 1.5, mean * 0.8); + } + + filterAndScaleBoxes(clusters, processedImage) { + const boxes = []; + const minPoints = this.config.minClusterPoints || 2; + const boxThreshold = this.config.detBoxThresh || 0.1; + + for (const cluster of clusters) { + if (cluster.length < minPoints) continue; + + const minX = Math.min(...cluster.map(p => p.x)); + const maxX = Math.max(...cluster.map(p => p.x)); + const minY = Math.min(...cluster.map(p => p.y)); + const maxY = Math.max(...cluster.map(p => p.y)); + + const boxWidth = maxX - minX; + const boxHeight = maxY - minY; + + // 放宽尺寸限制,提高小文本检测 + if (boxWidth < 1 || boxHeight < 1) continue; + + const aspectRatio = boxWidth / boxHeight; + if (aspectRatio > 150 || aspectRatio < 0.005) continue; + + const avgConfidence = cluster.reduce((sum, p) => sum + p.prob, 0) / cluster.length; + + if (avgConfidence > boxThreshold) { + const box = this.scaleBoxToProcessedImage({ + x1: minX, y1: minY, + x2: maxX, y2: minY, + x3: maxX, y3: maxY, + x4: minX, y4: maxY + }, processedImage); + box.confidence = avgConfidence; + boxes.push(box); + } + } + + return boxes; + } + isLocalMaximum(data, x, y, width, height, radius) { const centerProb = data[y * width + x]; for (let dy = -radius; dy <= radius; dy++) { @@ -159,12 +217,9 @@ class DetectionProcessor { return true; } - // 改进的聚类算法 enhancedCluster(points, distanceThreshold) { const clusters = []; const visited = new Set(); - - // 按概率降序排序,优先处理高置信度点 const sortedPoints = [...points].sort((a, b) => b.prob - a.prob); for (let i = 0; i < sortedPoints.length; i++) { @@ -180,8 +235,7 @@ class DetectionProcessor { cluster.push(currentPoint); // 动态调整搜索半径 - const adaptiveThreshold = distanceThreshold * - (1 + (1 - currentPoint.prob) * 0.5); + const adaptiveThreshold = distanceThreshold * (1 + (1 - currentPoint.prob) * 0.3); for (let j = 0; j < sortedPoints.length; j++) { if (visited.has(j)) continue; @@ -209,61 +263,17 @@ class DetectionProcessor { scaleBoxToProcessedImage(box, processedImage) { const { width: processedWidth, height: processedHeight } = processedImage; - - const scaledBox = { - x1: box.x1, - y1: box.y1, - x2: box.x2, - y2: box.y2, - x3: box.x3, - y3: box.y3, - x4: box.x4, - y4: box.y4 - }; - const clamp = (value, max) => Math.max(0, Math.min(max, value)); return { - x1: clamp(scaledBox.x1, processedWidth - 1), - y1: clamp(scaledBox.y1, processedHeight - 1), - x2: clamp(scaledBox.x2, processedWidth - 1), - y2: clamp(scaledBox.y2, processedHeight - 1), - x3: clamp(scaledBox.x3, processedWidth - 1), - y3: clamp(scaledBox.y3, processedHeight - 1), - x4: clamp(scaledBox.x4, processedWidth - 1), - y4: clamp(scaledBox.y4, processedHeight - 1) - }; - } - - scaleBoxToOriginalImage(box, processedImage) { - const { - scaleX, scaleY, - paddingX, paddingY, - originalWidth, originalHeight - } = processedImage; - - const paddedX1 = box.x1 * scaleX; - const paddedY1 = box.y1 * scaleY; - const paddedX3 = box.x3 * scaleX; - const paddedY3 = box.y3 * scaleY; - - const originalX1 = paddedX1 - paddingX; - const originalY1 = paddedY1 - paddingY; - const originalX3 = paddedX3 - paddingX; - const originalY3 = paddedY3 - paddingY; - - const clamp = (value, max) => Math.max(0, Math.min(max, value)); - - return { - x1: clamp(originalX1, originalWidth - 1), - y1: clamp(originalY1, originalHeight - 1), - x2: clamp(originalX3, originalWidth - 1), - y2: clamp(originalY1, originalHeight - 1), - x3: clamp(originalX3, originalWidth - 1), - y3: clamp(originalY3, originalHeight - 1), - x4: clamp(originalX1, originalWidth - 1), - y4: clamp(originalY3, originalHeight - 1), - confidence: box.confidence + x1: clamp(box.x1, processedWidth - 1), + y1: clamp(box.y1, processedHeight - 1), + x2: clamp(box.x2, processedWidth - 1), + y2: clamp(box.y2, processedHeight - 1), + x3: clamp(box.x3, processedWidth - 1), + y3: clamp(box.y3, processedHeight - 1), + x4: clamp(box.x4, processedWidth - 1), + y4: clamp(box.y4, processedHeight - 1) }; } } diff --git a/server/utils/imagePreprocessor.js b/server/utils/imagePreprocessor.js index b30e3ac..1d2921a 100644 --- a/server/utils/imagePreprocessor.js +++ b/server/utils/imagePreprocessor.js @@ -4,20 +4,28 @@ import sharp from 'sharp'; class ImagePreprocessor { constructor() { this.tempDir = './temp/processed'; + this.logger = { + info: (msg, ...args) => console.log(`🖼️ [预处理] ${msg}`, ...args), + error: (msg, ...args) => console.error(`❌ [预处理] ${msg}`, ...args), + debug: (msg, ...args) => console.debug(`❌ [预处理] ${msg}`, ...args) + }; } async preprocessWithPadding(imagePath, config) { + const startTime = Date.now(); + this.logger.info(`开始预处理: ${imagePath}`); + try { const metadata = await sharp(imagePath).metadata(); + this.logger.info(`原始尺寸: ${metadata.width}x${metadata.height}`); - // 减少填充,避免过度改变图像 - const minPadding = 30; - const paddingX = Math.max(minPadding, Math.floor(metadata.width * 0.05)); - const paddingY = Math.max(minPadding, Math.floor(metadata.height * 0.05)); - + // 智能填充策略 + const { paddingX, paddingY } = this.calculateSmartPadding(metadata); const paddedWidth = metadata.width + paddingX * 2; const paddedHeight = metadata.height + paddingY * 2; + this.logger.debug(`添加填充: ${paddingX}x${paddingY}, 新尺寸: ${paddedWidth}x${paddedHeight}`); + const paddedBuffer = await sharp(imagePath) .extend({ top: paddingY, @@ -39,30 +47,42 @@ class ImagePreprocessor { .png() .toBuffer(); - console.log(`🖼️ 图像预处理完成: ${metadata.width}x${metadata.height} -> ${width}x${height}`); + const processingTime = Date.now() - startTime; + this.logger.info(`预处理完成: ${width}x${height}, 耗时${processingTime}ms`); return { processedImage: { buffer: resizedBuffer, - width, - height, + width, height, originalWidth: metadata.width, originalHeight: metadata.height, - paddedWidth: paddedWidth, - paddedHeight: paddedHeight, - paddingX, - paddingY, + paddedWidth, paddedHeight, + paddingX, paddingY, scaleX: paddedWidth / width, scaleY: paddedHeight / height } }; } catch (error) { - console.error('预处理错误:', error); + this.logger.error('预处理错误', error); throw error; } } + calculateSmartPadding(metadata) { + const basePadding = 20; + const minPadding = 15; + + // 根据图像尺寸动态调整填充 + const widthRatio = Math.max(0.02, Math.min(0.08, 100 / metadata.width)); + const heightRatio = Math.max(0.02, Math.min(0.08, 100 / metadata.height)); + + return { + paddingX: Math.max(minPadding, Math.floor(metadata.width * widthRatio)), + paddingY: Math.max(minPadding, Math.floor(metadata.height * heightRatio)) + }; + } + resizeForDetection(metadata, config) { const { width, height } = metadata; const limitSideLen = config.detLimitSideLen || 960; @@ -70,15 +90,18 @@ class ImagePreprocessor { let ratio = 1; if (Math.max(width, height) > limitSideLen) { ratio = limitSideLen / Math.max(width, height); + this.logger.debug(`缩放比例: ${ratio.toFixed(4)}`); } const newWidth = Math.floor(width * ratio); const newHeight = Math.floor(height * ratio); - return { - width: Math.max(32, Math.floor(newWidth / 32) * 32), - height: Math.max(32, Math.floor(newHeight / 32) * 32) - }; + // 确保尺寸是32的倍数 + const finalWidth = Math.max(32, Math.floor(newWidth / 32) * 32); + const finalHeight = Math.max(32, Math.floor(newHeight / 32) * 32); + + this.logger.debug(`调整后尺寸: ${finalWidth}x${finalHeight}`); + return { width: finalWidth, height: finalHeight }; } async getImageInfo(imagePath) { @@ -91,11 +114,9 @@ class ImagePreprocessor { processed: false }; } catch (error) { + this.logger.error('获取图像信息失败', error); return { - width: 0, - height: 0, - format: 'unknown', - processed: false + width: 0, height: 0, format: 'unknown', processed: false }; } } diff --git a/server/utils/onnxOcrManager.js b/server/utils/onnxOcrManager.js index a7572fd..68e940e 100644 --- a/server/utils/onnxOcrManager.js +++ b/server/utils/onnxOcrManager.js @@ -1,5 +1,6 @@ // server/utils/onnxOcrManager.js import { InferenceSession } from 'onnxruntime-node'; +import sharp from 'sharp'; import fse from 'fs-extra'; import * as path from 'path'; import { fileURLToPath } from 'url'; @@ -29,31 +30,41 @@ class OnnxOcrManager { this.imagePreprocessor = new ImagePreprocessor(); this.textPostProcessor = new TextPostProcessor(); - // 更新默认配置,优化识别效果 + this.logger = { + info: (msg, ...args) => console.log(`🚀 [OCR管理器] ${msg}`, ...args), + error: (msg, ...args) => console.error(`❌ [OCR管理器] ${msg}`, ...args), + debug: (msg, ...args) => console.log(`🐛 [OCR管理器] ${msg}`, ...args) + }; + + // 确保可视化目录存在 + this.visualizationDir = path.join(process.cwd(), 'temp', 'visualization'); + fse.ensureDirSync(this.visualizationDir); + + // 优化配置参数 this.defaultConfig = { language: 'ch', detLimitSideLen: 960, - detThresh: 0.05, // 降低检测阈值 - detBoxThresh: 0.1, // 降低框阈值 - detUnclipRatio: 1.8, // 调整解压缩比例 - maxTextLength: 50, // 增加最大文本长度 + detThresh: 0.05, + detBoxThresh: 0.08, + detUnclipRatio: 1.8, + maxTextLength: 100, recImageHeight: 48, - clsThresh: 0.8, // 降低分类阈值 - minTextHeight: 2, // 降低最小文本高度 - minTextWidth: 2, // 降低最小文本宽度 - clusterDistance: 8, // 调整聚类距离 - minClusterPoints: 2 // 降低最小聚类点数 + clsThresh: 0.7, + minTextHeight: 1, + minTextWidth: 1, + clusterDistance: 8, + minClusterPoints: 1 }; } async initialize(config = {}) { if (this.isInitialized) { - console.log('🔁 OCR管理器已初始化'); + this.logger.info('OCR管理器已初始化'); return; } try { - console.log('🚀 开始初始化OCR管理器...'); + this.logger.info('开始初始化OCR管理器...'); await this.validateModelFiles(); await this.recognitionProcessor.loadCharacterSet(this.keysPath); @@ -73,10 +84,10 @@ class OnnxOcrManager { this.recognitionProcessor.initialize(this.recSession, this.clsSession, mergedConfig); this.isInitialized = true; - console.log('✅ OCR管理器初始化完成'); + this.logger.info('OCR管理器初始化完成'); } catch (error) { - console.error('❌ OCR管理器初始化失败:', error); + this.logger.error('初始化失败', error); throw error; } } @@ -94,8 +105,9 @@ class OnnxOcrManager { if (!exists) { throw new Error(`模型文件不存在: ${filePath}`); } + this.logger.debug(`验证通过: ${name}`); } - console.log('✅ 所有模型文件验证通过'); + this.logger.info('所有模型文件验证通过'); } async recognizeImage(imagePath, config = {}) { @@ -112,13 +124,17 @@ class OnnxOcrManager { } try { - console.log(`\n🎯 开始OCR识别: ${path.basename(imagePath)}`); + this.logger.info(`开始OCR识别: ${path.basename(imagePath)}`); const startTime = Date.now(); const preprocessResult = await this.imagePreprocessor.preprocessWithPadding(imagePath, config); const { processedImage } = preprocessResult; const textBoxes = await this.detectionProcessor.detectText(processedImage); + + // 在原始图像上绘制文本框 + await this.drawTextBoxesOnOriginalImage(imagePath, textBoxes, processedImage); + const recognitionResults = await this.recognitionProcessor.recognizeTextWithCls(processedImage, textBoxes); const processingTime = Date.now() - startTime; @@ -138,30 +154,166 @@ class OnnxOcrManager { totalPages: 1, rawText, imageInfo, - recognitionCount: recognitionResults.length + recognitionCount: recognitionResults.length, + detectionCount: textBoxes.length, + visualizationPath: this.getVisualizationPath(imagePath) }; - console.log(`\n📊 OCR识别统计:`); - console.log(` - 处理时间: ${processingTime}ms`); - console.log(` - 检测区域: ${textBoxes.length} 个`); - console.log(` - 成功识别: ${recognitionResults.length} 个`); - console.log(` - 总体置信度: ${overallConfidence.toFixed(4)}`); - console.log(` - 最终文本长度: ${rawText.length} 字符`); + this.logger.info(`OCR识别完成: + - 处理时间: ${processingTime}ms + - 检测区域: ${textBoxes.length}个 + - 成功识别: ${recognitionResults.length}个 + - 总体置信度: ${overallConfidence.toFixed(4)} + - 最终文本: ${rawText.length}字符 + - 可视化图像: ${result.visualizationPath}`); return result; } catch (error) { - console.error(`❌ OCR识别失败: ${error.message}`); + this.logger.error(`OCR识别失败: ${error.message}`); throw new Error(`OCR识别失败: ${error.message}`); } } + async drawTextBoxesOnOriginalImage(originalImagePath, textBoxes, processedImage) { + try { + this.logger.info('开始在原始图像上绘制文本框'); + + // 读取原始图像 + const originalImage = sharp(originalImagePath); + const metadata = await originalImage.metadata(); + + // 创建SVG绘制指令 + const svgOverlay = this.createTextBoxesSVG(textBoxes, processedImage, metadata); + + // 将SVG叠加到原始图像上 + const visualizationPath = this.getVisualizationPath(originalImagePath); + await originalImage + .composite([{ + input: Buffer.from(svgOverlay), + top: 0, + left: 0 + }]) + .png() + .toFile(visualizationPath); + + this.logger.info(`文本框可视化图像已保存: ${visualizationPath}`); + + } catch (error) { + this.logger.error('绘制文本框失败', error); + } + } + + createTextBoxesSVG(textBoxes, processedImage, originalMetadata) { + const { width, height } = originalMetadata; + + let svg = ``; + + // 定义样式 + svg += ` + + `; + + textBoxes.forEach((box, index) => { + // 将处理后的图像坐标转换回原始图像坐标 + const originalBox = this.scaleBoxToOriginalImage(box, processedImage); + + // 根据置信度选择颜色 + const boxClass = box.confidence > 0.8 ? 'text-box-high-conf' : 'text-box'; + + // 绘制文本框(多边形) + const points = [ + `${originalBox.x1},${originalBox.y1}`, + `${originalBox.x2},${originalBox.y2}`, + `${originalBox.x3},${originalBox.y3}`, + `${originalBox.x4},${originalBox.y4}` + ].join(' '); + + svg += ``; + + // 在框上方添加索引和置信度标签 + const labelX = Math.min(originalBox.x1, originalBox.x2, originalBox.x3, originalBox.x4); + const labelY = Math.min(originalBox.y1, originalBox.y2, originalBox.y3, originalBox.y4) - 5; + + if (labelY > 15) { // 确保标签在图像范围内 + svg += `${index + 1} (${box.confidence.toFixed(2)})`; + } + }); + + svg += ''; + return svg; + } + + scaleBoxToOriginalImage(box, processedImage) { + const { + scaleX, scaleY, + paddingX, paddingY, + originalWidth, originalHeight + } = processedImage; + + // 将处理后的图像坐标转换回填充后的图像坐标 + const paddedX1 = box.x1 * scaleX; + const paddedY1 = box.y1 * scaleY; + const paddedX2 = box.x2 * scaleX; + const paddedY2 = box.y2 * scaleY; + const paddedX3 = box.x3 * scaleX; + const paddedY3 = box.y3 * scaleY; + const paddedX4 = box.x4 * scaleX; + const paddedY4 = box.y4 * scaleY; + + // 去除填充,得到原始图像坐标 + const originalX1 = paddedX1 - paddingX; + const originalY1 = paddedY1 - paddingY; + const originalX2 = paddedX2 - paddingX; + const originalY2 = paddedY2 - paddingY; + const originalX3 = paddedX3 - paddingX; + const originalY3 = paddedY3 - paddingY; + const originalX4 = paddedX4 - paddingX; + const originalY4 = paddedY4 - paddingY; + + const clamp = (value, max) => Math.max(0, Math.min(max, value)); + + return { + x1: clamp(originalX1, originalWidth - 1), + y1: clamp(originalY1, originalHeight - 1), + x2: clamp(originalX2, originalWidth - 1), + y2: clamp(originalY2, originalHeight - 1), + x3: clamp(originalX3, originalWidth - 1), + y3: clamp(originalY3, originalHeight - 1), + x4: clamp(originalX4, originalWidth - 1), + y4: clamp(originalY4, originalHeight - 1), + confidence: box.confidence + }; + } + + getVisualizationPath(originalImagePath) { + const originalName = path.basename(originalImagePath, path.extname(originalImagePath)); + const timestamp = Date.now(); + return path.join(this.visualizationDir, `${originalName}-detection-${timestamp}.png`); + } + getStatus() { return { isInitialized: this.isInitialized, isOffline: true, engine: 'PP-OCRv3 (ONNX Runtime)', - version: '1.0.0', + version: '2.0.0', models: { detection: path.relative(process.cwd(), this.detModelPath), recognition: path.relative(process.cwd(), this.recModelPath), @@ -172,28 +324,11 @@ class OnnxOcrManager { detThresh: this.defaultConfig.detThresh, detBoxThresh: this.defaultConfig.detBoxThresh, clsThresh: this.defaultConfig.clsThresh, - preprocessing: 'enabled with padding' + preprocessing: 'enhanced with smart padding' }, backend: 'CPU' }; } - - async terminate() { - if (this.detSession) { - this.detSession.release(); - this.detSession = null; - } - if (this.recSession) { - this.recSession.release(); - this.recSession = null; - } - if (this.clsSession) { - this.clsSession.release(); - this.clsSession = null; - } - this.isInitialized = false; - console.log('🛑 OCR管理器已终止'); - } } const onnxOcrManager = new OnnxOcrManager(); diff --git a/server/utils/recognitionProcessor.js b/server/utils/recognitionProcessor.js index 22e642c..df7c27f 100644 --- a/server/utils/recognitionProcessor.js +++ b/server/utils/recognitionProcessor.js @@ -61,7 +61,7 @@ class RecognitionProcessor { recognitionImage = await this.textRegionCropper.rotateImage(textRegion.buffer, 180); } - const textResult = await this.textRecognizer.recognizeText(recognitionImage); + const textResult = await this.textRecognizer.recognizeText(recognitionImage, i + 1); if (textResult.text && textResult.text.trim().length > 0 && textResult.confidence > 0.05) { const originalBox = this.scaleBoxToOriginalImage(box, processedImage); diff --git a/server/utils/textRecognizer.js b/server/utils/textRecognizer.js index 7e82eca..37fcf55 100644 --- a/server/utils/textRecognizer.js +++ b/server/utils/textRecognizer.js @@ -10,12 +10,23 @@ class TextRecognizer { this.config = null; this.characterSet = []; this.debugDir = path.join(process.cwd(), 'temp', 'debug'); + this.preprocessedDir = path.join(process.cwd(), 'temp', 'preprocessed'); + this.logger = { + info: (msg, ...args) => console.log(`🔤 [识别] ${msg}`, ...args), + error: (msg, ...args) => console.error(`❌ [识别] ${msg}`, ...args), + debug: (msg, ...args) => console.log(`🐛 [识别] ${msg}`, ...args), + warn: (msg, ...args) => console.warn(`🐛 [识别] ${msg}`, ...args) + }; + + // 确保目录存在 fse.ensureDirSync(this.debugDir); + fse.ensureDirSync(this.preprocessedDir); } initialize(recSession, config) { this.recSession = recSession; this.config = config; + this.logger.info('文本识别器初始化完成'); } async loadCharacterSet(keysPath) { @@ -24,346 +35,398 @@ class TextRecognizer { this.characterSet = []; const lines = keysContent.split('\n'); + // 使用提供的字符集文件 + const uniqueChars = new Set(); + for (const line of lines) { const trimmed = line.trim(); + // 跳过空行和注释行 if (trimmed && !trimmed.startsWith('#')) { - for (const char of trimmed) { - if (char.trim() && !this.characterSet.includes(char)) { - this.characterSet.push(char); - } - } + // 将每行作为一个完整的字符处理 + uniqueChars.add(trimmed); } } + this.characterSet = Array.from(uniqueChars); + if (this.characterSet.length === 0) { throw new Error('字符集文件为空或格式不正确'); } - console.log(`✅ 字符集加载完成,共 ${this.characterSet.length} 个字符`); + this.logger.info(`字符集加载完成: ${this.characterSet.length}个字符`); + + // 记录字符集统计信息 + const charTypes = { + chinese: 0, + english: 0, + digit: 0, + punctuation: 0, + other: 0 + }; + + this.characterSet.forEach(char => { + if (/[\u4e00-\u9fff]/.test(char)) { + charTypes.chinese++; + } else if (/[a-zA-Z]/.test(char)) { + charTypes.english++; + } else if (/[0-9]/.test(char)) { + charTypes.digit++; + } else if (/[,。!?;:""()【】《》…—·]/.test(char)) { + charTypes.punctuation++; + } else { + charTypes.other++; + } + }); + + this.logger.debug(`字符集统计: 中文${charTypes.chinese}, 英文${charTypes.english}, 数字${charTypes.digit}, 标点${charTypes.punctuation}, 其他${charTypes.other}`); + this.logger.debug(`前20个字符: ${this.characterSet.slice(0, 20).join('')}`); } catch (error) { - console.error('❌ 加载字符集失败,使用默认字符集:', error.message); - this.characterSet = this.getDefaultCharacterSet(); + this.logger.error('加载字符集失败', error.message); + // 完全使用提供的字符集,失败时抛出错误 + throw new Error(`字符集加载失败: ${error.message}`); } } - getDefaultCharacterSet() { - const defaultSet = []; - for (let i = 0; i <= 9; i++) defaultSet.push(i.toString()); - for (let i = 97; i <= 122; i++) defaultSet.push(String.fromCharCode(i)); - for (let i = 65; i <= 90; i++) defaultSet.push(String.fromCharCode(i)); - defaultSet.push(...' ,。!?;:""()【】《》…—·'.split('')); - - const commonChinese = '的一是不了在人有的我他这个们中来就时大地为子中你说道生国年着就那和要她出也得里后自以会家可下而过天去能对小多然于心学么之都好看起发当没成只如事把还用第样道想作种开美总从无情已面最女但现前些所同日手又行意动方期它头经长儿回位分爱老因很给名法间斯知世什两次使身者被高已亲其进此话常与活正感'; - for (const char of commonChinese) { - defaultSet.push(char); - } - - console.log(`📝 使用默认字符集,共 ${defaultSet.length} 个字符`); - return defaultSet; - } - getCharacterSetSize() { return this.characterSet.length; } - async recognizeText(textRegionBuffer) { - console.log('🔠 === 开始文本识别流程 ==='); + async recognizeText(textRegionBuffer, regionIndex = 0) { + const startTime = Date.now(); + this.logger.info(`开始文本识别 - 区域 ${regionIndex}`); try { - console.log('📥 1. 准备识别输入...'); - console.log(` - 输入图像大小: ${textRegionBuffer.length} 字节`); - - const inputTensor = await this.prepareRecognitionInput(textRegionBuffer); - console.log('✅ 输入张量准备完成'); - console.log(` - 张量形状: [${inputTensor.dims.join(', ')}]`); - console.log(` - 张量类型: ${inputTensor.type}`); - console.log(` - 数据长度: ${inputTensor.data.length}`); - - // 数据验证 - const tensorData = inputTensor.data; - let minVal = Infinity; - let maxVal = -Infinity; - let sumVal = 0; - let validCount = 0; - - for (let i = 0; i < Math.min(100, tensorData.length); i++) { - const val = tensorData[i]; - if (!isNaN(val) && isFinite(val)) { - minVal = Math.min(minVal, val); - maxVal = Math.max(maxVal, val); - sumVal += val; - validCount++; - } - } - - console.log(` - 数据范围: ${minVal.toFixed(4)} ~ ${maxVal.toFixed(4)}`); - console.log(` - 数据均值: ${(sumVal / validCount).toFixed(4)}`); - - console.log('🧠 2. 执行模型推理...'); - const startInference = Date.now(); + const inputTensor = await this.prepareRecognitionInput(textRegionBuffer, regionIndex); const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor }); - const inferenceTime = Date.now() - startInference; - console.log(`✅ 模型推理完成 (${inferenceTime}ms)`); - - const outputNames = this.recSession.outputNames; - console.log(` - 输出数量: ${outputNames.length}`); - - outputNames.forEach((name, index) => { - const output = outputs[name]; - if (output) { - console.log(` - 输出 ${index + 1} (${name}): 形状 [${output.dims.join(', ')}]`); - } - }); - - console.log('🔍 3. 后处理识别结果...'); const result = this.postprocessRecognition(outputs); - console.log('✅ 后处理完成'); - console.log(` - 识别文本: "${result.text}"`); - console.log(` - 置信度: ${result.confidence.toFixed(4)}`); - console.log(` - 文本长度: ${result.text.length} 字符`); - console.log('🎉 === 文本识别流程完成 ==='); + const processingTime = Date.now() - startTime; + this.logger.info(`识别完成 - 区域 ${regionIndex}: "${result.text}", 置信度: ${result.confidence.toFixed(4)}, 耗时: ${processingTime}ms`); + return result; } catch (error) { - console.error('❌ 文本识别失败:'); - console.error(` - 错误信息: ${error.message}`); + this.logger.error(`文本识别失败 - 区域 ${regionIndex}`, error); return { text: '', confidence: 0 }; } } - async prepareRecognitionInput(textRegionBuffer) { - console.log(' 📝 准备识别输入详情:'); + async prepareRecognitionInput(textRegionBuffer, regionIndex = 0) { + this.logger.debug(`准备识别输入 - 区域 ${regionIndex}`); + + const targetHeight = 48; + const targetWidth = 320; // 原始目标宽度 + const finalWidth = targetWidth + 20; // 最终宽度(左右各加10像素) + const timestamp = Date.now(); try { - const targetHeight = 48; - const targetWidth = 320; - const metadata = await sharp(textRegionBuffer).metadata(); - console.log(` - 原始图像尺寸: ${metadata.width}x${metadata.height}`); + this.logger.debug(`原始区域 ${regionIndex}: ${metadata.width}x${metadata.height}`); - // 保存原始图像用于调试 - const originalPath = path.join(this.debugDir, `original-${Date.now()}.png`); + // 保存原始裁剪区域图像 + const originalPath = path.join(this.preprocessedDir, `region-${regionIndex}-original-${timestamp}.png`); await fse.writeFile(originalPath, textRegionBuffer); + this.logger.debug(`保存原始区域图像: ${originalPath}`); - // 关键修复:正确的预处理流程 - let processedBuffer = textRegionBuffer; - - // 1. 分析图像特性 - const stats = await sharp(processedBuffer) - .grayscale() - .stats(); + // 图像分析 + const stats = await sharp(textRegionBuffer).grayscale().stats(); const meanBrightness = stats.channels[0].mean; const stdDev = stats.channels[0].stdev; - console.log(` - 图像统计: 均值=${meanBrightness.toFixed(1)}, 标准差=${stdDev.toFixed(1)}`); + this.logger.debug(`图像统计 - 区域 ${regionIndex}: 亮度=${meanBrightness.toFixed(1)}, 对比度=${stdDev.toFixed(1)}`); - // 2. 改进的预处理策略 - if (meanBrightness > 200 && stdDev < 30) { - console.log(' - 检测到高亮度图像,进行对比度增强'); - processedBuffer = await sharp(processedBuffer) - .linear(1.5, -50) - .normalize() - .grayscale() - .toBuffer(); - } else if (meanBrightness < 80) { - console.log(' - 检测到低亮度图像,进行亮度调整'); - processedBuffer = await sharp(processedBuffer) - .linear(1.2, 30) - .normalize() - .grayscale() - .toBuffer(); - } else { - console.log(' - 使用标准化灰度处理'); - processedBuffer = await sharp(processedBuffer) - .normalize() - .grayscale() - .toBuffer(); - } + // 智能预处理 + let processedBuffer = await this.applySmartPreprocessing(textRegionBuffer, meanBrightness, stdDev, regionIndex); - // 3. 保持宽高比的resize - const originalAspectRatio = metadata.width / metadata.height; - const targetAspectRatio = targetWidth / targetHeight; - - let resizeWidth, resizeHeight; - - if (originalAspectRatio > targetAspectRatio) { - // 宽度限制 - resizeWidth = targetWidth; - resizeHeight = Math.round(targetWidth / originalAspectRatio); - } else { - // 高度限制 - resizeHeight = targetHeight; - resizeWidth = Math.round(targetHeight * originalAspectRatio); - } - - // 确保尺寸有效 - resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth)); - resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight)); - - processedBuffer = await sharp(processedBuffer) - .resize(resizeWidth, resizeHeight, { - fit: 'contain', - background: { r: 255, g: 255, b: 255 } - }) - .extend({ - top: 0, - bottom: targetHeight - resizeHeight, - left: 0, - right: targetWidth - resizeWidth, - background: { r: 255, g: 255, b: 255 } - }) - .png() - .toBuffer(); - - const processedMetadata = await sharp(processedBuffer).metadata(); - console.log(` - 处理后尺寸: ${processedMetadata.width}x${processedMetadata.height}`); - - // 保存预处理后的图像用于调试 - const processedPath = path.join(this.debugDir, `processed-${Date.now()}.png`); + // 保存预处理后的图像(灰度+对比度调整后) + const processedPath = path.join(this.preprocessedDir, `region-${regionIndex}-processed-${timestamp}.png`); await fse.writeFile(processedPath, processedBuffer); + this.logger.debug(`保存预处理图像: ${processedPath}`); - // 4. 转换为张量 - 关键修复:正确的归一化 - console.log(' - 转换为张量数据...'); - const imageData = await sharp(processedBuffer) - .ensureAlpha() - .raw() - .toBuffer({ resolveWithObject: true }); + // 保持宽高比的resize,并在左右添加10像素空白 + const resizedBuffer = await this.resizeWithAspectRatio(processedBuffer, targetWidth, targetHeight, regionIndex); - const inputData = new Float32Array(3 * targetHeight * targetWidth); - const data = imageData.data; - const channels = imageData.info.channels; + // 保存调整大小后的图像 + const resizedPath = path.join(this.preprocessedDir, `region-${regionIndex}-resized-${timestamp}.png`); + await fse.writeFile(resizedPath, resizedBuffer); + this.logger.debug(`保存调整大小图像: ${resizedPath}`); - // 使用正确的归一化方法 - for (let i = 0; i < data.length; i += channels) { - const pixelIndex = Math.floor(i / channels); - const y = Math.floor(pixelIndex / targetWidth); - const x = pixelIndex % targetWidth; + // 使用最终尺寸创建张量 + const inputData = await this.bufferToTensor(resizedBuffer, finalWidth, targetHeight); + this.logger.debug(`识别输入张量准备完成 - 区域 ${regionIndex}`); - // 对每个位置,三个通道使用相同的灰度值 - const grayValue = data[i] / 255.0; - - for (let c = 0; c < 3; c++) { - const inputIndex = c * targetHeight * targetWidth + y * targetWidth + x; - if (inputIndex < inputData.length) { - inputData[inputIndex] = grayValue; - } - } - } - - console.log(` - 输入数据长度: ${inputData.length}`); - - // 数据验证 - let validCount = 0; - let sumValue = 0; - let minValue = Infinity; - let maxValue = -Infinity; - - for (let i = 0; i < Math.min(100, inputData.length); i++) { - const val = inputData[i]; - if (!isNaN(val) && isFinite(val)) { - validCount++; - sumValue += val; - minValue = Math.min(minValue, val); - maxValue = Math.max(maxValue, val); - } - } - - console.log(` - 数据验证: 有效=${validCount}`); - console.log(` - 数据范围: ${minValue.toFixed(4)} ~ ${maxValue.toFixed(4)}`); - console.log(` - 数据均值: ${(sumValue / validCount).toFixed(4)}`); - - return new Tensor('float32', inputData, [1, 3, targetHeight, targetWidth]); + // 创建张量时使用最终尺寸 + return new Tensor('float32', inputData, [1, 3, targetHeight, finalWidth]); } catch (error) { - console.error(` ❌ 准备输入失败: ${error.message}`); - // 返回有效的默认张量 - return new Tensor('float32', new Float32Array(3 * 48 * 320).fill(0.5), [1, 3, 48, 320]); + this.logger.error(`准备识别输入失败 - 区域 ${regionIndex}`, error); + return new Tensor('float32', new Float32Array(3 * targetHeight * finalWidth).fill(0.5), [1, 3, targetHeight, finalWidth]); } } + async applySmartPreprocessing(buffer, meanBrightness, stdDev, regionIndex = 0) { + let processedBuffer = buffer; + + if (meanBrightness > 200 && stdDev < 30) { + this.logger.debug(`区域 ${regionIndex}: 应用高亮度图像增强`); + processedBuffer = await sharp(buffer) + .linear(1.5, -50) + .normalize() + .grayscale() + .toBuffer(); + } else if (meanBrightness < 80) { + this.logger.debug(`区域 ${regionIndex}: 应用低亮度图像增强`); + processedBuffer = await sharp(buffer) + .linear(1.2, 30) + .normalize() + .grayscale() + .toBuffer(); + } else if (stdDev < 20) { + this.logger.debug(`区域 ${regionIndex}: 应用低对比度增强`); + processedBuffer = await sharp(buffer) + .linear(1.3, -20) + .normalize() + .grayscale() + .toBuffer(); + } else { + this.logger.debug(`区域 ${regionIndex}: 应用标准化灰度处理`); + processedBuffer = await sharp(buffer) + .normalize() + .grayscale() + .toBuffer(); + } + + return processedBuffer; + } + + async resizeWithAspectRatio(buffer, targetWidth, targetHeight, regionIndex = 0) { + const metadata = await sharp(buffer).metadata(); + const originalAspectRatio = metadata.width / metadata.height; + const targetAspectRatio = targetWidth / targetHeight; + + let resizeWidth, resizeHeight; + + if (originalAspectRatio > targetAspectRatio) { + // 宽度限制,按宽度缩放 + resizeWidth = targetWidth; + resizeHeight = Math.round(targetWidth / originalAspectRatio); + } else { + // 高度限制,按高度缩放 + resizeHeight = targetHeight; + resizeWidth = Math.round(targetHeight * originalAspectRatio); + } + + resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth)); + resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight)); + + this.logger.debug(`区域 ${regionIndex}: 调整尺寸 ${metadata.width}x${metadata.height} -> ${resizeWidth}x${resizeHeight}`); + + // 计算居中的偏移量 + const offsetX = Math.floor((targetWidth - resizeWidth) / 2); + const offsetY = Math.floor((targetHeight - resizeHeight) / 2); + + this.logger.debug(`区域 ${regionIndex}: 居中偏移 X=${offsetX}, Y=${offsetY}`); + + // 先调整大小并居中 + let resizedBuffer = await sharp(buffer) + .resize(resizeWidth, resizeHeight, { + fit: 'contain', + background: { r: 255, g: 255, b: 255 } + }) + .extend({ + top: offsetY, + bottom: targetHeight - resizeHeight - offsetY, + left: offsetX, + right: targetWidth - resizeWidth - offsetX, + background: { r: 255, g: 255, b: 255 } + }) + .png() + .toBuffer(); + + // 在左右各添加10像素空白 + const finalWidth = targetWidth + 20; // 左右各加10像素 + const finalHeight = targetHeight; + + resizedBuffer = await sharp(resizedBuffer) + .extend({ + top: 0, + bottom: 0, + left: 10, + right: 10, + background: { r: 255, g: 255, b: 255 } + }) + .png() + .toBuffer(); + + this.logger.debug(`区域 ${regionIndex}: 最终尺寸 ${finalWidth}x${finalHeight} (左右各加10像素空白)`); + + return resizedBuffer; + } + + async bufferToTensor(buffer, width, height) { + // 获取实际图像尺寸(因为现在宽度增加了20像素) + const metadata = await sharp(buffer).metadata(); + const actualWidth = metadata.width; + const actualHeight = metadata.height; + + const imageData = await sharp(buffer) + .ensureAlpha() + .raw() + .toBuffer({ resolveWithObject: true }); + + // 使用实际尺寸创建张量 + const inputData = new Float32Array(3 * actualHeight * actualWidth); + const data = imageData.data; + + for (let i = 0; i < data.length; i += 4) { + const pixelIndex = Math.floor(i / 4); + const y = Math.floor(pixelIndex / actualWidth); + const x = pixelIndex % actualWidth; + + // 使用灰度值填充三个通道 + const grayValue = data[i] / 255.0; + + for (let c = 0; c < 3; c++) { + const inputIndex = c * actualHeight * actualWidth + y * actualWidth + x; + if (inputIndex < inputData.length) { + inputData[inputIndex] = grayValue; + } + } + } + + return inputData; + } + postprocessRecognition(outputs) { - console.log(' 📝 后处理识别结果详情:'); + this.logger.debug('开始识别后处理'); try { const outputNames = this.recSession.outputNames; const recognitionOutput = outputs[outputNames[0]]; if (!recognitionOutput) { - console.log(' ❌ 识别输出为空'); + this.logger.debug('识别输出为空'); return { text: '', confidence: 0 }; } const data = recognitionOutput.data; const [batch, seqLen, vocabSize] = recognitionOutput.dims; - console.log(` - 序列长度: ${seqLen}, 词汇表大小: ${vocabSize}`); - console.log(` - 输出数据总数: ${data.length}`); - console.log(` - 字符集大小: ${this.characterSet.length}`); + this.logger.debug(`序列长度: ${seqLen}, 词汇表大小: ${vocabSize}, 字符集大小: ${this.characterSet.length}`); if (this.characterSet.length === 0) { - console.log(' ❌ 字符集为空'); + this.logger.error('字符集为空'); return { text: '', confidence: 0 }; } - // 改进的CTC解码算法 - let text = ''; - let lastCharIndex = -1; - let confidenceSum = 0; - let charCount = 0; + // 验证词汇表大小与字符集大小的匹配 + if (vocabSize !== this.characterSet.length + 1) { + this.logger.warn(`词汇表大小(${vocabSize})与字符集大小(${this.characterSet.length})不匹配,可能影响识别效果`); + } - // 降低置信度阈值,提高召回率 - const confidenceThreshold = 0.05; + const { text, confidence } = this.ctcDecode(data, seqLen, vocabSize); + this.logger.debug(`解码结果: "${text}", 置信度: ${confidence.toFixed(4)}`); - console.log(' - 处理每个时间步:'); - for (let t = 0; t < seqLen; t++) { - let maxProb = -1; - let maxIndex = -1; + return { text, confidence }; - // 找到当前时间步的最大概率字符 - for (let i = 0; i < vocabSize; i++) { - const prob = data[t * vocabSize + i]; - if (prob > maxProb) { - maxProb = prob; - maxIndex = i; - } - } + } catch (error) { + this.logger.error('识别后处理失败', error); + return { text: '', confidence: 0 }; + } + } - // 改进的解码逻辑 - if (maxIndex > 0 && maxProb > confidenceThreshold) { - const char = this.characterSet[maxIndex - 1] || ''; + ctcDecode(data, seqLen, vocabSize) { + let text = ''; + let lastCharIndex = -1; + let confidenceSum = 0; + let charCount = 0; - // 放宽重复字符限制 - if (maxIndex !== lastCharIndex || maxProb > 0.8) { - if (char && char.trim() !== '') { - text += char; - confidenceSum += maxProb; - charCount++; - console.log(` [位置 ${t}] 字符: "${char}", 置信度: ${maxProb.toFixed(4)}`); - } - lastCharIndex = maxIndex; - } - } else if (maxIndex === 0) { - // 空白符,重置lastCharIndex - lastCharIndex = -1; + // 动态阈值调整 + const baseThreshold = 0.03; + let confidenceThreshold = baseThreshold; + + // 先分析整个序列的置信度分布 + let maxSequenceProb = 0; + for (let t = 0; t < seqLen; t++) { + for (let i = 0; i < vocabSize; i++) { + maxSequenceProb = Math.max(maxSequenceProb, data[t * vocabSize + i]); + } + } + + // 如果整体置信度较低,降低阈值 + if (maxSequenceProb < 0.5) { + confidenceThreshold = baseThreshold * 0.5; + } + + this.logger.debug(`使用解码阈值: ${confidenceThreshold.toFixed(4)}`); + + for (let t = 0; t < seqLen; t++) { + let maxProb = -1; + let maxIndex = -1; + + // 找到当前时间步的最大概率字符 + for (let i = 0; i < vocabSize; i++) { + const prob = data[t * vocabSize + i]; + if (prob > maxProb) { + maxProb = prob; + maxIndex = i; } } - const avgConfidence = charCount > 0 ? confidenceSum / charCount : 0; + // 改进的CTC解码逻辑 + if (maxIndex > 0 && maxProb > confidenceThreshold) { + const charIndex = maxIndex - 1; + if (charIndex < this.characterSet.length) { + const char = this.characterSet[charIndex]; - console.log(` - 识别结果: "${text}"`); - console.log(` - 字符数: ${charCount}, 平均置信度: ${avgConfidence.toFixed(4)}`); + // 更智能的重复字符处理 + const shouldAddChar = maxIndex !== lastCharIndex || + maxProb > 0.8 || + (maxIndex === lastCharIndex && charCount > 0 && text[text.length - 1] !== char); - return { - text: text, - confidence: avgConfidence - }; - - } catch (error) { - console.error(` ❌ 后处理失败: ${error.message}`); - return { text: '', confidence: 0 }; + if (shouldAddChar && char && char.trim() !== '') { + text += char; + confidenceSum += maxProb; + charCount++; + } + lastCharIndex = maxIndex; + } else { + this.logger.warn(`字符索引${charIndex}超出字符集范围(0-${this.characterSet.length-1})`); + } + } else if (maxIndex === 0) { + lastCharIndex = -1; + } } + + const avgConfidence = charCount > 0 ? confidenceSum / charCount : 0; + + // 基本的文本清理(不包含错误模式修复) + const cleanedText = this.basicTextCleaning(text); + + return { + text: cleanedText, + confidence: avgConfidence + }; + } + + basicTextCleaning(text) { + if (!text) return ''; + + let cleaned = text; + + // 1. 移除过多的重复字符(保留合理的重复) + cleaned = cleaned.replace(/([^0-9])\1{2,}/g, '$1$1'); + + // 2. 修复标点符号 + cleaned = cleaned.replace(/∶/g, ':') + .replace(/《/g, '(') + .replace(/》/g, ')'); + + // 3. 修复数字和百分号 + cleaned = cleaned.replace(/(\d+)%%/g, '$1%'); + + return cleaned.trim(); } } diff --git a/server/utils/textRegionCropper.js b/server/utils/textRegionCropper.js index d4ffa72..55d3895 100644 --- a/server/utils/textRegionCropper.js +++ b/server/utils/textRegionCropper.js @@ -1,17 +1,28 @@ // server/utils/textRegionCropper.js import sharp from 'sharp'; +import fse from 'fs-extra'; +import * as path from 'path'; class TextRegionCropper { constructor() { - // 可以在这里添加配置参数 + this.logger = { + info: (msg, ...args) => console.log(`✂️ [裁剪] ${msg}`, ...args), + debug: (msg, ...args) => console.log(`🐛 [裁剪] ${msg}`, ...args), + error: (msg, ...args) => console.error(`❌ [裁剪] ${msg}`, ...args) + }; + // 确保裁剪调试目录存在 + this.cropDebugDir = path.join(process.cwd(), 'temp', 'crop_debug'); + fse.ensureDirSync(this.cropDebugDir); } async cropTextRegion(imageBuffer, box, regionIndex) { + const timestamp = Date.now(); try { const metadata = await sharp(imageBuffer).metadata(); const imgWidth = metadata.width; const imgHeight = metadata.height; + // 计算文本框的边界 const left = Math.min(box.x1, box.x2, box.x3, box.x4); const top = Math.min(box.y1, box.y2, box.y3, box.y4); const right = Math.max(box.x1, box.x2, box.x3, box.x4); @@ -20,78 +31,54 @@ class TextRegionCropper { const originalWidth = right - left; const originalHeight = bottom - top; - // 减少扩展,避免引入过多背景 - const widthExpand = 10; - const heightExpand = 10; + // 四边各扩大5像素 + const expandPixels = 5; - const newWidth = originalWidth + widthExpand; - const newHeight = originalHeight + heightExpand; + const expandedLeft = Math.max(0, left - expandPixels); + const expandedTop = Math.max(0, top - expandPixels); + const expandedRight = Math.min(imgWidth - 1, right + expandPixels); + const expandedBottom = Math.min(imgHeight - 1, bottom + expandPixels); - const centerX = (left + right) / 2; - const centerY = (top + bottom) / 2; + const expandedWidth = expandedRight - expandedLeft; + const expandedHeight = expandedBottom - expandedTop; - const expandedLeft = Math.max(0, centerX - newWidth / 2); - const expandedTop = Math.max(0, centerY - newHeight / 2); - const expandedRight = Math.min(imgWidth - 1, centerX + newWidth / 2); - const expandedBottom = Math.min(imgHeight - 1, centerY + newHeight / 2); - - const finalWidth = expandedRight - expandedLeft; - const finalHeight = expandedBottom - expandedTop; - - if (finalWidth <= 0 || finalHeight <= 0) { - console.log(`❌ 区域 ${regionIndex}: 无效的裁剪区域`); + if (expandedWidth <= 0 || expandedHeight <= 0) { + this.logger.debug(`区域 ${regionIndex}: 无效的裁剪区域`); return null; } - let adjustedLeft = expandedLeft; - let adjustedTop = expandedTop; - let adjustedWidth = finalWidth; - let adjustedHeight = finalHeight; - - if (expandedLeft < 0) { - adjustedLeft = 0; - adjustedWidth = expandedRight; - } - if (expandedTop < 0) { - adjustedTop = 0; - adjustedHeight = expandedBottom; - } - if (expandedRight > imgWidth) { - adjustedWidth = imgWidth - adjustedLeft; - } - if (expandedBottom > imgHeight) { - adjustedHeight = imgHeight - adjustedTop; - } - const croppedBuffer = await sharp(imageBuffer) .extract({ - left: Math.floor(adjustedLeft), - top: Math.floor(adjustedTop), - width: Math.floor(adjustedWidth), - height: Math.floor(adjustedHeight) + left: Math.floor(expandedLeft), + top: Math.floor(expandedTop), + width: Math.floor(expandedWidth), + height: Math.floor(expandedHeight) }) .png() .toBuffer(); - console.log(`✂️ 区域 ${regionIndex}: 裁剪 ${Math.floor(adjustedWidth)}x${Math.floor(adjustedHeight)}`); + // 保存裁剪后的图像用于调试 + const cropPath = path.join(this.cropDebugDir, `crop-${regionIndex}-${timestamp}.png`); + await fse.writeFile(cropPath, croppedBuffer); + this.logger.debug(`区域 ${regionIndex}: 裁剪 ${Math.floor(expandedWidth)}x${Math.floor(expandedHeight)} -> ${cropPath}`); return { buffer: croppedBuffer, boxInfo: { original: { left, top, right, bottom, width: originalWidth, height: originalHeight }, expanded: { - left: adjustedLeft, - top: adjustedTop, - right: adjustedLeft + adjustedWidth, - bottom: adjustedTop + adjustedHeight, - width: adjustedWidth, - height: adjustedHeight + left: expandedLeft, + top: expandedTop, + right: expandedRight, + bottom: expandedBottom, + width: expandedWidth, + height: expandedHeight } } }; } catch (error) { - console.error(`❌ 区域 ${regionIndex}: 裁剪失败`, error); + this.logger.error(`区域 ${regionIndex}: 裁剪失败`, error); return null; } }