- // server/utils/detectionProcessor.js
- import { Tensor } from 'onnxruntime-node';
- import sharp from 'sharp';
class DetectionProcessor {
  /**
   * Pre/post-processing wrapper around a DB-style ONNX text-detection model.
   * Converts a preprocessed image into an NCHW float tensor, runs the
   * injected onnxruntime session, and turns the resulting probability map
   * into quadrilateral text boxes.
   */
  constructor() {
    this.session = null; // onnxruntime InferenceSession, set via initialize()
    this.config = null;  // detection thresholds, set via initialize()
  }

  /**
   * Inject the inference session and detection configuration.
   * @param {object} session - onnxruntime-node InferenceSession
   * @param {{detThresh?: number, detBoxThresh?: number}} config
   */
  initialize(session, config) {
    this.session = session;
    this.config = config;
  }

  /**
   * Run text detection on a preprocessed image.
   * @param {{buffer: Buffer, width: number, height: number}} processedImage
   * @returns {Promise<object[]>} boxes sorted by confidence (desc);
   *   empty array on any failure (errors are logged, not rethrown).
   */
  async detectText(processedImage) {
    try {
      const inputTensor = await this.prepareDetectionInput(processedImage);
      const outputs = await this.session.run({ [this.session.inputNames[0]]: inputTensor });
      return this.postprocessDetection(outputs, processedImage);
    } catch (error) {
      // Detection failures are non-fatal for the pipeline: log and return no boxes.
      console.error('文本检测失败:', error);
      return [];
    }
  }

  /**
   * Convert the image buffer into a normalized [1, 3, H, W] float32 tensor.
   *
   * BUG FIX: the previous implementation indexed sharp's interleaved (HWC)
   * raw buffer as if it were planar (CHW) — `channel` evaluated to 0 for
   * every pixel, so only the red values were copied (into plane 0) and the
   * G/B planes stayed all-zero. This version performs the correct
   * interleaved -> planar transpose for all three color channels.
   *
   * NOTE(review): values are only scaled to [0, 1]; if the model was exported
   * expecting mean/std normalization, add it here — confirm against the
   * model's training/export pipeline.
   *
   * @param {{buffer: Buffer, width: number, height: number}} processedImage
   * @returns {Promise<Tensor>} NCHW float32 tensor
   */
  async prepareDetectionInput(processedImage) {
    const { buffer, width, height } = processedImage;
    const imageData = await sharp(buffer)
      .ensureAlpha()
      .raw()
      .toBuffer({ resolveWithObject: true });

    const data = imageData.data;              // interleaved HWC (RGBA after ensureAlpha)
    const channels = imageData.info.channels; // 4 once alpha is ensured
    const planeSize = height * width;
    const inputData = new Float32Array(3 * planeSize);

    for (let p = 0; p < planeSize; p++) {
      const src = p * channels;
      for (let c = 0; c < 3; c++) { // copy R, G, B; drop alpha
        inputData[c * planeSize + p] = data[src + c] / 255.0;
      }
    }
    return new Tensor('float32', inputData, [1, 3, height, width]);
  }

  /**
   * Turn the model's probability map into axis-aligned quad boxes:
   * threshold the map, cluster the surviving points, and box each cluster.
   * @param {object} outputs - session.run() result keyed by output name
   * @param {{width: number, height: number}} processedImage
   * @returns {object[]} boxes (x1..y4 + confidence) sorted by confidence desc
   */
  postprocessDetection(outputs, processedImage) {
    try {
      const boxes = [];
      const detectionOutput = outputs[this.session.outputNames[0]];
      if (!detectionOutput) {
        return boxes;
      }

      // dims are [batch, channels, height, width]; only the first map is read.
      const [, , height, width] = detectionOutput.dims;
      const data = detectionOutput.data;

      // Low pixel threshold to favor recall; boxes are re-filtered below.
      const threshold = this.config.detThresh || 0.05;

      // Collect all above-threshold probability-map points.
      // (The old per-point local-maximum flag was computed but never used,
      // so that O(points * radius^2) work has been dropped.)
      const points = [];
      for (let y = 0; y < height; y++) {
        for (let x = 0; x < width; x++) {
          const prob = data[y * width + x];
          if (prob > threshold) {
            points.push({ x, y, prob });
          }
        }
      }
      if (points.length === 0) {
        return boxes;
      }

      const clusters = this.enhancedCluster(points, 8);
      const boxThreshold = this.config.detBoxThresh || 0.1; // box-level confidence gate

      for (const cluster of clusters) {
        if (cluster.length < 2) continue; // ignore isolated pixels

        // Bounding box + mean confidence in one pass. (The previous
        // Math.min(...spread) form can overflow the stack on big clusters.)
        let minX = Infinity;
        let minY = Infinity;
        let maxX = -Infinity;
        let maxY = -Infinity;
        let probSum = 0;
        for (const p of cluster) {
          if (p.x < minX) minX = p.x;
          if (p.x > maxX) maxX = p.x;
          if (p.y < minY) minY = p.y;
          if (p.y > maxY) maxY = p.y;
          probSum += p.prob;
        }

        const boxWidth = maxX - minX;
        const boxHeight = maxY - minY;
        if (boxWidth < 2 || boxHeight < 2) continue; // too small to be text

        const aspectRatio = boxWidth / boxHeight;
        if (aspectRatio > 100 || aspectRatio < 0.01) continue; // degenerate shape

        const avgConfidence = probSum / cluster.length;
        if (avgConfidence > boxThreshold) {
          // Corners ordered clockwise from top-left.
          const box = this.scaleBoxToProcessedImage({
            x1: minX, y1: minY,
            x2: maxX, y2: minY,
            x3: maxX, y3: maxY,
            x4: minX, y4: maxY
          }, processedImage);
          box.confidence = avgConfidence;
          boxes.push(box);
        }
      }

      boxes.sort((a, b) => b.confidence - a.confidence);
      console.log(`✅ 检测到 ${boxes.length} 个文本区域`);
      return boxes;
    } catch (error) {
      console.error('检测后处理错误:', error);
      return [];
    }
  }

  /**
   * True when data[y * width + x] is >= every neighbor within `radius`
   * (ties count as maxima; out-of-bounds neighbors are skipped).
   * Kept as a public utility; the current post-processing path no longer
   * calls it.
   * @param {ArrayLike<number>} data - row-major probability map
   * @returns {boolean}
   */
  isLocalMaximum(data, x, y, width, height, radius) {
    const centerProb = data[y * width + x];
    for (let dy = -radius; dy <= radius; dy++) {
      for (let dx = -radius; dx <= radius; dx++) {
        if (dx === 0 && dy === 0) continue;
        const nx = x + dx;
        const ny = y + dy;
        if (nx < 0 || nx >= width || ny < 0 || ny >= height) continue;
        if (data[ny * width + nx] > centerProb) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Greedy BFS clustering of probability-map points.
   * Points are seeded in descending probability order; a point joins a
   * cluster when it lies within an adaptive radius of any cluster member
   * (lower-confidence members reach up to 50% farther).
   * @param {{x: number, y: number, prob: number}[]} points
   * @param {number} distanceThreshold - base radius in map pixels
   * @returns {object[][]} clusters (each a non-empty array of points)
   */
  enhancedCluster(points, distanceThreshold) {
    const clusters = [];
    const visited = new Set();
    // Seed clusters from high-confidence points first.
    const sortedPoints = [...points].sort((a, b) => b.prob - a.prob);

    for (let i = 0; i < sortedPoints.length; i++) {
      if (visited.has(i)) continue;

      const cluster = [];
      const queue = [i];
      visited.add(i);
      let head = 0; // index-based queue avoids O(n) Array#shift per dequeue

      while (head < queue.length) {
        const currentPoint = sortedPoints[queue[head++]];
        cluster.push(currentPoint);

        // Lower-confidence points search a slightly larger neighborhood.
        const adaptiveThreshold = distanceThreshold * (1 + (1 - currentPoint.prob) * 0.5);

        for (let j = 0; j < sortedPoints.length; j++) {
          if (visited.has(j)) continue;
          const dx = sortedPoints[j].x - currentPoint.x;
          const dy = sortedPoints[j].y - currentPoint.y;
          if (Math.sqrt(dx * dx + dy * dy) < adaptiveThreshold) {
            queue.push(j);
            visited.add(j);
          }
        }
      }
      clusters.push(cluster); // always contains at least the seed point
    }
    return clusters;
  }

  /**
   * Clamp all four box corners into the processed image bounds.
   * NOTE(review): despite the name, no scaling happens here — the box is
   * already in processed-image coordinates; only clamping is applied.
   * @param {object} box - quad with x1..y4 in processed-image coordinates
   * @param {{width: number, height: number}} processedImage
   * @returns {object} clamped quad
   */
  scaleBoxToProcessedImage(box, processedImage) {
    const { width: processedWidth, height: processedHeight } = processedImage;
    const clamp = (value, max) => Math.max(0, Math.min(max, value));
    return {
      x1: clamp(box.x1, processedWidth - 1),
      y1: clamp(box.y1, processedHeight - 1),
      x2: clamp(box.x2, processedWidth - 1),
      y2: clamp(box.y2, processedHeight - 1),
      x3: clamp(box.x3, processedWidth - 1),
      y3: clamp(box.y3, processedHeight - 1),
      x4: clamp(box.x4, processedWidth - 1),
      y4: clamp(box.y4, processedHeight - 1)
    };
  }

  /**
   * Map a box from processed-image coordinates back to original-image
   * coordinates: apply the resize scale, subtract the padding, then clamp.
   * The result is an axis-aligned quad rebuilt from the top-left (x1, y1)
   * and bottom-right (x3, y3) corners; the confidence is carried through.
   * @param {object} box - quad with x1..y4 and confidence
   * @param {{scaleX: number, scaleY: number, paddingX: number, paddingY: number,
   *          originalWidth: number, originalHeight: number}} processedImage
   * @returns {object} quad in original-image coordinates
   */
  scaleBoxToOriginalImage(box, processedImage) {
    const {
      scaleX, scaleY,
      paddingX, paddingY,
      originalWidth, originalHeight
    } = processedImage;
    const clamp = (value, max) => Math.max(0, Math.min(max, value));

    const left = clamp(box.x1 * scaleX - paddingX, originalWidth - 1);
    const top = clamp(box.y1 * scaleY - paddingY, originalHeight - 1);
    const right = clamp(box.x3 * scaleX - paddingX, originalWidth - 1);
    const bottom = clamp(box.y3 * scaleY - paddingY, originalHeight - 1);

    return {
      x1: left, y1: top,
      x2: right, y2: top,
      x3: right, y3: bottom,
      x4: left, y4: bottom,
      confidence: box.confidence
    };
  }
}
// Default export: construct, call initialize(session, config), then detectText().
export default DetectionProcessor;