// server/utils/detectionProcessor.js
import { Tensor } from 'onnxruntime-node';
import sharp from 'sharp';

class DetectionProcessor {
  constructor() {
    this.session = null;
    this.config = null;
    this.logger = {
      info: (msg, ...args) => console.log(`🔍 [Detection] ${msg}`, ...args),
      error: (msg, ...args) => console.error(`❌ [Detection] ${msg}`, ...args),
      debug: (msg, ...args) => console.log(`🐛 [Detection] ${msg}`, ...args)
    };
  }

  initialize(session, config) {
    this.session = session;
    this.config = config;
    this.logger.info('Detection processor initialized');
  }

  // Run the full detection pipeline: prepare input -> inference -> post-process.
  async detectText(processedImage) {
    const startTime = Date.now();
    this.logger.info('Starting text detection');

    try {
      const inputTensor = await this.prepareDetectionInput(processedImage);
      const outputs = await this.session.run({ [this.session.inputNames[0]]: inputTensor });
      const textBoxes = this.postprocessDetection(outputs, processedImage);

      const processingTime = Date.now() - startTime;
      this.logger.info(`Detection complete: ${textBoxes.length} regions in ${processingTime}ms`);
      return textBoxes;
    } catch (error) {
      this.logger.error('Detection failed', error);
      return [];
    }
  }

  // Convert the preprocessed image buffer into a 1x3xHxW float32 tensor.
  async prepareDetectionInput(processedImage) {
    const { buffer, width, height } = processedImage;
    this.logger.debug(`Preparing detection input: ${width}x${height}`);

    const imageData = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });

    const inputData = new Float32Array(3 * height * width);
    const data = imageData.data;
    const channels = imageData.info.channels;

    // Copy R, G and B into their CHW planes, normalized to [0, 1]; the alpha channel is skipped.
    for (let i = 0; i < data.length; i += channels) {
      const pixelIndex = Math.floor(i / channels);
      const y = Math.floor(pixelIndex / width);
      const x = pixelIndex % width;

      for (let c = 0; c < 3; c++) {
        const inputIndex = c * height * width + y * width + x;
        if (inputIndex < inputData.length) {
          inputData[inputIndex] = data[i + c] / 255.0;
        }
      }
    }

    this.logger.debug('Detection input tensor prepared');
    return new Tensor('float32', inputData, [1, 3, height, width]);
  }

  // Turn the raw probability map into scored, clamped text boxes.
  postprocessDetection(outputs, processedImage) {
    this.logger.debug('Starting detection post-processing');

    try {
      const boxes = [];
      const outputNames = this.session.outputNames;
      const detectionOutput = outputs[outputNames[0]];

      if (!detectionOutput) {
        this.logger.debug('Detection output is empty');
        return boxes;
      }

      const [batch, channels, height, width] = detectionOutput.dims;
      const data = detectionOutput.data;

      // Dynamic threshold adjustment
      const baseThreshold = this.config.detThresh || 0.05;
      const adaptiveThreshold = this.calculateAdaptiveThreshold(data, baseThreshold);
      this.logger.debug(`Using detection threshold: ${adaptiveThreshold.toFixed(4)}`);

      const points = this.collectDetectionPoints(data, width, height, adaptiveThreshold);
      if (points.length === 0) {
        this.logger.debug('No valid text points detected');
        return boxes;
      }
      this.logger.debug(`Collected ${points.length} detection points`);

      const clusters = this.enhancedCluster(points, this.config.clusterDistance || 8);
      this.logger.debug(`Clustering produced ${clusters.length} regions`);

      const validBoxes = this.filterAndScaleBoxes(clusters, processedImage);
      this.logger.info(`Generated ${validBoxes.length} valid text boxes`);

      return validBoxes.sort((a, b) => b.confidence - a.confidence);
    } catch (error) {
      this.logger.error('Detection post-processing error', error);
      return [];
    }
  }

  // Collect all probability-map pixels above the threshold.
  collectDetectionPoints(data, width, height, threshold) {
    const points = [];
    let totalProb = 0;
    let maxProb = 0;

    for (let y = 0; y < height; y++) {
      for (let x = 0; x < width; x++) {
        const idx = y * width + x;
        const prob = data[idx];
        if (prob > threshold) {
          totalProb += prob;
          maxProb = Math.max(maxProb, prob);
          points.push({ x, y, prob, localMax: this.isLocalMaximum(data, x, y, width, height, 2) });
        }
      }
    }

    if (points.length > 0) {
      this.logger.debug(`Detection point stats: mean confidence ${(totalProb / points.length).toFixed(4)}, max confidence ${maxProb.toFixed(4)}`);
    }
    return points;
  }

  // Adjust the threshold dynamically based on a random sample of the probability map.
  calculateAdaptiveThreshold(data, baseThreshold) {
    let sum = 0;
    let count = 0;
    const sampleSize = Math.min(1000, data.length);

    for (let i = 0; i < sampleSize; i++) {
      const idx = Math.floor(Math.random() * data.length);
      if (data[idx] > baseThreshold) {
        sum += data[idx];
        count++;
      }
    }

    if (count === 0) return baseThreshold;
    const mean = sum / count;
    return Math.min(baseThreshold * 1.5, mean * 0.8);
  }

  // Convert point clusters into axis-aligned quads and drop implausible ones.
  filterAndScaleBoxes(clusters, processedImage) {
    const boxes = [];
    const minPoints = this.config.minClusterPoints || 2;
    const boxThreshold = this.config.detBoxThresh || 0.1;

    for (const cluster of clusters) {
      if (cluster.length < minPoints) continue;

      const minX = Math.min(...cluster.map(p => p.x));
      const maxX = Math.max(...cluster.map(p => p.x));
      const minY = Math.min(...cluster.map(p => p.y));
      const maxY = Math.max(...cluster.map(p => p.y));

      const boxWidth = maxX - minX;
      const boxHeight = maxY - minY;

      // Relaxed size limits to improve small-text detection
      if (boxWidth < 1 || boxHeight < 1) continue;

      const aspectRatio = boxWidth / boxHeight;
      if (aspectRatio > 150 || aspectRatio < 0.005) continue;

      const avgConfidence = cluster.reduce((sum, p) => sum + p.prob, 0) / cluster.length;
      if (avgConfidence > boxThreshold) {
        const box = this.scaleBoxToProcessedImage({
          x1: minX, y1: minY,
          x2: maxX, y2: minY,
          x3: maxX, y3: maxY,
          x4: minX, y4: maxY
        }, processedImage);
        box.confidence = avgConfidence;
        boxes.push(box);
      }
    }
    return boxes;
  }

  // Check whether (x, y) is a local maximum within the given radius.
  isLocalMaximum(data, x, y, width, height, radius) {
    const centerProb = data[y * width + x];
    for (let dy = -radius; dy <= radius; dy++) {
      for (let dx = -radius; dx <= radius; dx++) {
        if (dx === 0 && dy === 0) continue;
        const nx = x + dx;
        const ny = y + dy;
        if (nx >= 0 && nx < width && ny >= 0 && ny < height) {
          if (data[ny * width + nx] > centerProb) {
            return false;
          }
        }
      }
    }
    return true;
  }

  // Greedy BFS clustering of detection points, seeded from the highest-confidence points.
  enhancedCluster(points, distanceThreshold) {
    const clusters = [];
    const visited = new Set();
    const sortedPoints = [...points].sort((a, b) => b.prob - a.prob);

    for (let i = 0; i < sortedPoints.length; i++) {
      if (visited.has(i)) continue;

      const cluster = [];
      const queue = [i];
      visited.add(i);

      while (queue.length > 0) {
        const currentIndex = queue.shift();
        const currentPoint = sortedPoints[currentIndex];
        cluster.push(currentPoint);

        // Widen the search radius for lower-confidence points
        const adaptiveThreshold = distanceThreshold * (1 + (1 - currentPoint.prob) * 0.3);

        for (let j = 0; j < sortedPoints.length; j++) {
          if (visited.has(j)) continue;
          const targetPoint = sortedPoints[j];
          const dist = Math.sqrt(
            Math.pow(targetPoint.x - currentPoint.x, 2) +
            Math.pow(targetPoint.y - currentPoint.y, 2)
          );
          if (dist < adaptiveThreshold) {
            queue.push(j);
            visited.add(j);
          }
        }
      }

      if (cluster.length > 0) {
        clusters.push(cluster);
      }
    }
    return clusters;
  }

  // Clamp box corners to the bounds of the processed image.
  scaleBoxToProcessedImage(box, processedImage) {
    const { width: processedWidth, height: processedHeight } = processedImage;
    const clamp = (value, max) => Math.max(0, Math.min(max, value));

    return {
      x1: clamp(box.x1, processedWidth - 1),
      y1: clamp(box.y1, processedHeight - 1),
      x2: clamp(box.x2, processedWidth - 1),
      y2: clamp(box.y2, processedHeight - 1),
      x3: clamp(box.x3, processedWidth - 1),
      y3: clamp(box.y3, processedHeight - 1),
      x4: clamp(box.x4, processedWidth - 1),
      y4: clamp(box.y4, processedHeight - 1)
    };
  }
}

export default DetectionProcessor;
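// Usage sketch (illustrative, not part of this module): assumes an ONNX text-detection
// model at a hypothetical path './models/det.onnx' and a preprocessed image object of
// the shape { buffer, width, height } that prepareDetectionInput() expects.
//
//   import { InferenceSession } from 'onnxruntime-node';
//   import DetectionProcessor from './detectionProcessor.js';
//
//   const session = await InferenceSession.create('./models/det.onnx');
//   const processor = new DetectionProcessor();
//   processor.initialize(session, {
//     detThresh: 0.05,      // base probability threshold
//     detBoxThresh: 0.1,    // minimum average box confidence
//     clusterDistance: 8,   // base clustering radius in probability-map pixels
//     minClusterPoints: 2   // minimum points per cluster
//   });
//   const boxes = await processor.detectText({ buffer, width, height });
//   console.log(`Detected ${boxes.length} text boxes`);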