| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
// server/utils/textPostProcessor.js

/**
 * Post-processes text-recognition results into ordered, typed text blocks.
 *
 * Each recognition result is read as
 * `{ text, confidence, box: { x1, y1, y2, y3, y4 } }`; no other fields are used.
 */
class TextPostProcessor {
  /**
   * Groups recognition results into lines, classifies every line, and merges
   * consecutive short plain-text lines into larger blocks.
   *
   * @param {Array<{text: string, confidence: number, box: object}>} recognitionResults
   *   Raw recognition results; `null`, `undefined` and `[]` are all accepted.
   * @returns {Array<{type: string, content: string, confidence: number, number?: (number|null)}>}
   *   The text blocks. For empty input, a single placeholder block with
   *   confidence 0 is returned. `number` is only present on 'citation' blocks.
   */
  buildTextBlocks(recognitionResults) {
    if (!recognitionResults || recognitionResults.length === 0) {
      return [{
        type: 'text',
        content: '未识别到文本',
        confidence: 0,
      }];
    }

    console.log(`📊 开始构建文本块,共 ${recognitionResults.length} 个识别结果`);

    const lines = this.groupTextIntoLines(recognitionResults);
    const blocks = [];

    for (const line of lines) {
      // Fragments of one line are concatenated without a separator.
      const content = line.map((item) => item.text).join('');
      const confidenceSum = line.reduce((sum, item) => sum + item.confidence, 0);
      const avgConfidence = confidenceSum / line.length;
      const type = this.classifyTextType(content);

      const block = { type, content, confidence: avgConfidence };
      if (type === 'citation') {
        block.number = this.extractCitationNumber(content);
      }
      blocks.push(block);

      console.log(`📝 文本行: "${content}" (${type}, 置信度: ${avgConfidence.toFixed(4)})`);
    }

    const mergedBlocks = this.mergeShortTextBlocks(blocks);
    console.log(`✅ 文本块构建完成: ${mergedBlocks.length} 个块`);
    return mergedBlocks;
  }

  /**
   * Clusters results into lines by the vertical position `box.y1`, then orders
   * the members of each line left-to-right by `box.x1`.
   *
   * A result joins the current line when its `y1` is closer than 0.8x the
   * average box height to the line's running y position.
   *
   * @param {Array<{box: object}>} results Recognition results (not mutated).
   * @returns {Array<Array<object>>} Lines from top to bottom; `[]` for empty or
   *   missing input.
   */
  groupTextIntoLines(results) {
    if (!results || results.length === 0) return [];

    const lineThreshold = 0.8 * this.calculateAverageHeight(results);
    // Sort a copy so the caller's array keeps its order.
    const sortedResults = [...results].sort((a, b) => a.box.y1 - b.box.y1);

    const lines = [];
    let currentLine = [];
    // `null` means "no line started yet". A numeric sentinel such as -1 would
    // collide with a legitimate (or averaged) y position of -1.
    let currentY = null;

    // Orders the pending line by x and appends it to the output.
    const flushLine = () => {
      if (currentLine.length === 0) return;
      currentLine.sort((a, b) => a.box.x1 - b.box.x1);
      lines.push(currentLine);
    };

    for (const result of sortedResults) {
      const top = result.box.y1;

      if (currentY === null) {
        currentLine.push(result);
        currentY = top;
      } else if (Math.abs(top - currentY) < lineThreshold) {
        currentLine.push(result);
        // Running midpoint: lets the line position follow gradual drift.
        currentY = (currentY + top) / 2;
      } else {
        flushLine();
        currentLine = [result];
        currentY = top;
      }
    }
    flushLine();

    return lines;
  }

  /**
   * Computes the mean vertical extent (max y minus min y over the four box
   * corners) across all results.
   *
   * @param {Array<{box: {y1: number, y2: number, y3: number, y4: number}}>} results
   * @returns {number} Average height, or 0 for empty or missing input.
   */
  calculateAverageHeight(results) {
    if (!results || results.length === 0) return 0;

    const totalHeight = results.reduce((sum, { box }) => {
      const ys = [box.y1, box.y2, box.y3, box.y4];
      return sum + (Math.max(...ys) - Math.min(...ys));
    }, 0);

    return totalHeight / results.length;
  }

  /**
   * Classifies a line of text. Checks run in a fixed order and the first match
   * wins: reference heading, citation, image marker, table marker, plain text.
   *
   * @param {string} text
   * @returns {'reference'|'citation'|'image'|'table'|'text'}
   */
  classifyTextType(text) {
    if (this.isReference(text)) return 'reference';
    if (this.isCitation(text)) return 'citation';
    if (this.isImageMarker(text)) return 'image';
    if (this.isTableMarker(text)) return 'table';
    return 'text';
  }

  /**
   * Tests whether the text starts with a reference-section heading
   * (Chinese or English variants, case-insensitive).
   *
   * @param {string} text
   * @returns {boolean}
   */
  isReference(text) {
    const refPatterns = [
      /^参考文献/i,
      /^references/i,
      /^bibliography/i,
      /^引用文献/i,
      /^参考书目/i,
    ];
    return refPatterns.some((pattern) => pattern.test(text));
  }

  /**
   * Tests whether the text starts with a bracketed number such as `[12]`.
   *
   * @param {string} text
   * @returns {boolean}
   */
  isCitation(text) {
    return /^\[\d+\]/.test(text);
  }

  /**
   * Extracts the number from a leading `[n]` marker.
   *
   * @param {string} text
   * @returns {number|null} The decimal number, or `null` when the text does not
   *   start with `[digits]`.
   */
  extractCitationNumber(text) {
    const match = text.match(/^\[(\d+)\]/);
    // Explicit radix so the digits are always read as decimal.
    return match ? Number.parseInt(match[1], 10) : null;
  }

  /**
   * Tests whether the text starts with a figure label followed by a number
   * (e.g. `图 1`, `Figure 2`, `Fig. 3`), case-insensitive.
   *
   * @param {string} text
   * @returns {boolean}
   */
  isImageMarker(text) {
    const imagePatterns = [
      /^图\s*\d+/i,
      /^figure\s*\d+/i,
      /^图片\s*\d+/i,
      /^图表\s*\d+/i,
      /^fig\.?\s*\d+/i,
    ];
    return imagePatterns.some((pattern) => pattern.test(text));
  }

  /**
   * Tests whether the text starts with a table label followed by a number
   * (e.g. `表 1`, `Table 2`), case-insensitive.
   *
   * @param {string} text
   * @returns {boolean}
   */
  isTableMarker(text) {
    const tablePatterns = [
      /^表\s*\d+/i,
      /^table\s*\d+/i,
      /^表格\s*\d+/i,
    ];
    return tablePatterns.some((pattern) => pattern.test(text));
  }

  /**
   * Merges runs of consecutive 'text' blocks, joining their content with a
   * single space, for as long as the accumulated content is shorter than 100
   * characters. Blocks of any other type are never merged.
   *
   * The confidence of a merged block is the arithmetic mean of the blocks that
   * went into it.
   *
   * @param {Array<{type: string, content: string, confidence: number}>} blocks
   * @returns {Array<object>} New array of (copied) blocks; the input array
   *   itself is returned when it holds at most one block.
   */
  mergeShortTextBlocks(blocks) {
    if (blocks.length <= 1) return blocks;

    // Merge threshold; the original author deliberately relaxed the merge
    // condition by raising this length limit.
    const maxMergeableLength = 100;

    const mergedBlocks = [];
    let currentBlock = { ...blocks[0] };
    // Sum and count are tracked separately so the merged confidence is a true
    // mean; repeatedly averaging pairs would over-weight the latest blocks.
    let confidenceSum = currentBlock.confidence;
    let partCount = 1;

    for (const block of blocks.slice(1)) {
      const canMerge =
        currentBlock.type === 'text' &&
        block.type === 'text' &&
        currentBlock.content.length < maxMergeableLength;

      if (canMerge) {
        currentBlock.content = `${currentBlock.content} ${block.content}`;
        confidenceSum += block.confidence;
        partCount += 1;
        currentBlock.confidence = confidenceSum / partCount;
      } else {
        mergedBlocks.push(currentBlock);
        currentBlock = { ...block };
        confidenceSum = block.confidence;
        partCount = 1;
      }
    }
    mergedBlocks.push(currentBlock);

    return mergedBlocks;
  }

  /**
   * Computes the mean `confidence` over all results.
   *
   * @param {Array<{confidence: number}>} results
   * @returns {number} Mean confidence, or 0 for empty or missing input.
   */
  calculateOverallConfidence(results) {
    if (!results || results.length === 0) return 0;

    const total = results.reduce((sum, result) => sum + result.confidence, 0);
    return total / results.length;
  }
}
// Default export is the class itself; importers create their own instances.
export default TextPostProcessor;
|