Electron-vue3-ts-offline/server/utils/textPostProcessor.js
2025-11-13 16:34:41 +08:00

163 行
5.1 KiB
JavaScript

// server/utils/textPostProcessor.js
/**
 * Turns raw OCR recognition results into classified, line-ordered text blocks.
 *
 * Each recognition result is read as `{ text, confidence, box }`, where `box`
 * exposes `x1`, `y1`, `y2`, `y3`, `y4` (the only coordinates this class reads).
 * NOTE(review): presumably `box` holds the four corners of a quadrilateral —
 * confirm against the OCR stage that produces these results.
 */
class TextPostProcessor {
  /**
   * Groups recognition results into lines, classifies every line and merges
   * consecutive short plain-text lines.
   *
   * @param {Array<{text: string, confidence: number, box: Object}>} recognitionResults
   * @returns {Array<{type: string, content: string, confidence: number, number?: (number|null)}>}
   *   One block per (possibly merged) line; a single placeholder block when
   *   the input is missing or empty. `number` is present only on citations.
   */
  buildTextBlocks(recognitionResults) {
    if (!recognitionResults || recognitionResults.length === 0) {
      return [{
        type: 'text',
        content: '未识别到文本',
        confidence: 0
      }];
    }

    console.log(`📊 开始构建文本块,共 ${recognitionResults.length} 个识别结果`);

    const lines = this.groupTextIntoLines(recognitionResults);
    const blocks = [];

    for (const line of lines) {
      const content = line.map(item => item.text).join('');
      // groupTextIntoLines never yields an empty line, so the division is safe.
      const avgConfidence = line.reduce((sum, item) => sum + item.confidence, 0) / line.length;
      const type = this.classifyTextType(content);

      const block = { type, content, confidence: avgConfidence };
      if (type === 'citation') {
        block.number = this.extractCitationNumber(content);
      }
      blocks.push(block);

      console.log(`📝 文本行: "${content}" (${type}, 置信度: ${avgConfidence.toFixed(4)})`);
    }

    const mergedBlocks = this.mergeShortTextBlocks(blocks);
    console.log(`✅ 文本块构建完成: ${mergedBlocks.length} 个块`);
    return mergedBlocks;
  }

  /**
   * Clusters results into lines by their `box.y1` coordinate.
   *
   * Results are visited in ascending `y1`; a result joins the current line when
   * its `y1` lies within 0.8 × the average box height of the line's running
   * reference Y, otherwise it starts a new line. Each line is ordered by
   * ascending `box.x1`.
   *
   * @param {Array<{box: Object}>} results
   * @returns {Array<Array<Object>>} Lines from top to bottom; the input array is not mutated.
   */
  groupTextIntoLines(results) {
    if (!results || results.length === 0) return [];

    const lines = [];
    const sortedResults = [...results].sort((a, b) => a.box.y1 - b.box.y1);
    const lineThreshold = 0.8 * this.calculateAverageHeight(results);

    let currentLine = [];
    let currentY = 0;

    const flushCurrentLine = () => {
      if (currentLine.length > 0) {
        currentLine.sort((a, b) => a.box.x1 - b.box.x1);
        lines.push(currentLine);
      }
    };

    for (const result of sortedResults) {
      const y = result.box.y1;

      if (currentLine.length === 0) {
        // First result overall. An explicit emptiness check is used instead of
        // a magic `-1` sentinel, which misfired for boxes whose y1 is exactly -1.
        currentLine.push(result);
        currentY = y;
      } else if (Math.abs(y - currentY) < lineThreshold) {
        currentLine.push(result);
        // Reference Y drifts toward the newest member (pairwise running average).
        currentY = (currentY + y) / 2;
      } else {
        flushCurrentLine();
        currentLine = [result];
        currentY = y;
      }
    }

    flushCurrentLine();
    return lines;
  }

  /**
   * Mean vertical extent of the result boxes, measured as
   * max(y1..y4) - min(y1..y4) per box.
   *
   * @param {Array<{box: Object}>} results
   * @returns {number} Average height, or 0 for missing/empty input.
   */
  calculateAverageHeight(results) {
    if (!results || results.length === 0) return 0;

    const totalHeight = results.reduce((sum, { box }) => {
      const ys = [box.y1, box.y2, box.y3, box.y4];
      return sum + (Math.max(...ys) - Math.min(...ys));
    }, 0);

    return totalHeight / results.length;
  }

  /**
   * Classifies a line of text by its leading marker. Checks run in priority
   * order: reference heading, citation, image caption, table caption.
   *
   * @param {string} text
   * @returns {'reference'|'citation'|'image'|'table'|'text'}
   */
  classifyTextType(text) {
    if (this.isReference(text)) return 'reference';
    if (this.isCitation(text)) return 'citation';
    if (this.isImageMarker(text)) return 'image';
    if (this.isTableMarker(text)) return 'table';
    return 'text';
  }

  /**
   * @param {string} text
   * @returns {boolean} True when the text starts with a bibliography heading
   *   (Chinese or English, case-insensitive).
   */
  isReference(text) {
    const refPatterns = [
      /^参考文献/i,
      /^references/i,
      /^bibliography/i,
      /^引用文献/i,
      /^参考书目/i
    ];
    return refPatterns.some(pattern => pattern.test(text));
  }

  /**
   * @param {string} text
   * @returns {boolean} True when the text starts with a bracketed number such as `[12]`.
   */
  isCitation(text) {
    return /^\[\d+\]/.test(text);
  }

  /**
   * Extracts the number from a leading `[n]` marker.
   *
   * @param {string} text
   * @returns {number|null} The decimal number, or null when there is no leading marker.
   */
  extractCitationNumber(text) {
    const match = text.match(/^\[(\d+)\]/);
    // Explicit radix: the captured digits are always meant as decimal.
    return match ? Number.parseInt(match[1], 10) : null;
  }

  /**
   * @param {string} text
   * @returns {boolean} True when the text starts with a numbered figure label
   *   (e.g. `图 1`, `Figure 2`, `Fig. 3`).
   */
  isImageMarker(text) {
    const imagePatterns = [
      /^图\s*\d+/i,
      /^figure\s*\d+/i,
      /^图片\s*\d+/i,
      /^图表\s*\d+/i,
      /^fig\.?\s*\d+/i
    ];
    return imagePatterns.some(pattern => pattern.test(text));
  }

  /**
   * @param {string} text
   * @returns {boolean} True when the text starts with a numbered table label
   *   (e.g. `表 1`, `Table 2`).
   */
  isTableMarker(text) {
    const tablePatterns = [
      /^表\s*\d+/i,
      /^table\s*\d+/i,
      /^表格\s*\d+/i
    ];
    return tablePatterns.some(pattern => pattern.test(text));
  }

  /**
   * Merges runs of consecutive `'text'` blocks into one block, joining their
   * contents with a single space. A block keeps absorbing following text
   * blocks while its accumulated content is shorter than 100 characters.
   *
   * The merged confidence is the arithmetic mean of all merged blocks
   * (a pairwise `(a + b) / 2` would over-weight the most recent block).
   *
   * @param {Array<{type: string, content: string, confidence: number}>} blocks
   * @returns {Array<Object>} New array of shallow-copied blocks; the same
   *   array is returned untouched when it holds at most one block.
   */
  mergeShortTextBlocks(blocks) {
    if (blocks.length <= 1) return blocks;

    const mergedBlocks = [];
    let currentBlock = { ...blocks[0] };
    let confidenceSum = currentBlock.confidence;
    let mergedCount = 1;

    for (let i = 1; i < blocks.length; i++) {
      const block = blocks[i];
      // Relaxed merge rule: both sides are plain text and the accumulated
      // content is still under the length limit.
      const canMerge = currentBlock.type === 'text' &&
        block.type === 'text' &&
        currentBlock.content.length < 100;

      if (canMerge) {
        currentBlock.content += ' ' + block.content;
        confidenceSum += block.confidence;
        mergedCount += 1;
        currentBlock.confidence = confidenceSum / mergedCount;
      } else {
        mergedBlocks.push(currentBlock);
        currentBlock = { ...block };
        confidenceSum = block.confidence;
        mergedCount = 1;
      }
    }

    mergedBlocks.push(currentBlock);
    return mergedBlocks;
  }

  /**
   * Arithmetic mean of the `confidence` values of all results.
   *
   * @param {Array<{confidence: number}>} results
   * @returns {number} Mean confidence, or 0 for missing/empty input.
   */
  calculateOverallConfidence(results) {
    if (!results || results.length === 0) return 0;
    const total = results.reduce((sum, result) => sum + result.confidence, 0);
    return total / results.length;
  }
}
export default TextPostProcessor;