  1. // server/utils/textRecognizer.js
  2. import { Tensor } from 'onnxruntime-node';
  3. import sharp from 'sharp';
  4. import fse from 'fs-extra';
  5. import * as path from 'path';
  6. class TextRecognizer {
  7. constructor() {
  8. this.recSession = null;
  9. this.config = null;
  10. this.characterSet = [];
  11. this.debugDir = path.join(process.cwd(), 'temp', 'debug');
  12. fse.ensureDirSync(this.debugDir);
  13. }
  14. initialize(recSession, config) {
  15. this.recSession = recSession;
  16. this.config = config;
  17. }
  18. async loadCharacterSet(keysPath) {
  19. try {
  20. const keysContent = await fse.readFile(keysPath, 'utf8');
  21. this.characterSet = [];
  22. const lines = keysContent.split('\n');
  23. for (const line of lines) {
  24. const trimmed = line.trim();
  25. if (trimmed && !trimmed.startsWith('#')) {
  26. for (const char of trimmed) {
  27. if (char.trim() && !this.characterSet.includes(char)) {
  28. this.characterSet.push(char);
  29. }
  30. }
  31. }
  32. }
  33. if (this.characterSet.length === 0) {
  34. throw new Error('字符集文件为空或格式不正确');
  35. }
  36. console.log(`✅ 字符集加载完成,共 ${this.characterSet.length} 个字符`);
  37. } catch (error) {
  38. console.error('❌ 加载字符集失败,使用默认字符集:', error.message);
  39. this.characterSet = this.getDefaultCharacterSet();
  40. }
  41. }
  42. getDefaultCharacterSet() {
  43. const defaultSet = [];
  44. for (let i = 0; i <= 9; i++) defaultSet.push(i.toString());
  45. for (let i = 97; i <= 122; i++) defaultSet.push(String.fromCharCode(i));
  46. for (let i = 65; i <= 90; i++) defaultSet.push(String.fromCharCode(i));
  47. defaultSet.push(...' ,。!?;:""()【】《》…—·'.split(''));
  48. const commonChinese = '的一是不了在人有的我他这个们中来就时大地为子中你说道生国年着就那和要她出也得里后自以会家可下而过天去能对小多然于心学么之都好看起发当没成只如事把还用第样道想作种开美总从无情已面最女但现前些所同日手又行意动方期它头经长儿回位分爱老因很给名法间斯知世什两次使身者被高已亲其进此话常与活正感';
  49. for (const char of commonChinese) {
  50. defaultSet.push(char);
  51. }
  52. console.log(`📝 使用默认字符集,共 ${defaultSet.length} 个字符`);
  53. return defaultSet;
  54. }
  55. getCharacterSetSize() {
  56. return this.characterSet.length;
  57. }
  58. async recognizeText(textRegionBuffer) {
  59. console.log('🔠 === 开始文本识别流程 ===');
  60. try {
  61. console.log('📥 1. 准备识别输入...');
  62. console.log(` - 输入图像大小: ${textRegionBuffer.length} 字节`);
  63. const inputTensor = await this.prepareRecognitionInput(textRegionBuffer);
  64. console.log('✅ 输入张量准备完成');
  65. console.log(` - 张量形状: [${inputTensor.dims.join(', ')}]`);
  66. console.log(` - 张量类型: ${inputTensor.type}`);
  67. console.log(` - 数据长度: ${inputTensor.data.length}`);
  68. // 数据验证
  69. const tensorData = inputTensor.data;
  70. let minVal = Infinity;
  71. let maxVal = -Infinity;
  72. let sumVal = 0;
  73. let validCount = 0;
  74. for (let i = 0; i < Math.min(100, tensorData.length); i++) {
  75. const val = tensorData[i];
  76. if (!isNaN(val) && isFinite(val)) {
  77. minVal = Math.min(minVal, val);
  78. maxVal = Math.max(maxVal, val);
  79. sumVal += val;
  80. validCount++;
  81. }
  82. }
  83. console.log(` - 数据范围: ${minVal.toFixed(4)} ~ ${maxVal.toFixed(4)}`);
  84. console.log(` - 数据均值: ${(sumVal / validCount).toFixed(4)}`);
  85. console.log('🧠 2. 执行模型推理...');
  86. const startInference = Date.now();
  87. const outputs = await this.recSession.run({ [this.recSession.inputNames[0]]: inputTensor });
  88. const inferenceTime = Date.now() - startInference;
  89. console.log(`✅ 模型推理完成 (${inferenceTime}ms)`);
  90. const outputNames = this.recSession.outputNames;
  91. console.log(` - 输出数量: ${outputNames.length}`);
  92. outputNames.forEach((name, index) => {
  93. const output = outputs[name];
  94. if (output) {
  95. console.log(` - 输出 ${index + 1} (${name}): 形状 [${output.dims.join(', ')}]`);
  96. }
  97. });
  98. console.log('🔍 3. 后处理识别结果...');
  99. const result = this.postprocessRecognition(outputs);
  100. console.log('✅ 后处理完成');
  101. console.log(` - 识别文本: "${result.text}"`);
  102. console.log(` - 置信度: ${result.confidence.toFixed(4)}`);
  103. console.log(` - 文本长度: ${result.text.length} 字符`);
  104. console.log('🎉 === 文本识别流程完成 ===');
  105. return result;
  106. } catch (error) {
  107. console.error('❌ 文本识别失败:');
  108. console.error(` - 错误信息: ${error.message}`);
  109. return { text: '', confidence: 0 };
  110. }
  111. }
  112. async prepareRecognitionInput(textRegionBuffer) {
  113. console.log(' 📝 准备识别输入详情:');
  114. try {
  115. const targetHeight = 48;
  116. const targetWidth = 320;
  117. const metadata = await sharp(textRegionBuffer).metadata();
  118. console.log(` - 原始图像尺寸: ${metadata.width}x${metadata.height}`);
  119. // 保存原始图像用于调试
  120. const originalPath = path.join(this.debugDir, `original-${Date.now()}.png`);
  121. await fse.writeFile(originalPath, textRegionBuffer);
  122. // 关键修复:正确的预处理流程
  123. let processedBuffer = textRegionBuffer;
  124. // 1. 分析图像特性
  125. const stats = await sharp(processedBuffer)
  126. .grayscale()
  127. .stats();
  128. const meanBrightness = stats.channels[0].mean;
  129. const stdDev = stats.channels[0].stdev;
  130. console.log(` - 图像统计: 均值=${meanBrightness.toFixed(1)}, 标准差=${stdDev.toFixed(1)}`);
  131. // 2. 改进的预处理策略
  132. if (meanBrightness > 200 && stdDev < 30) {
  133. console.log(' - 检测到高亮度图像,进行对比度增强');
  134. processedBuffer = await sharp(processedBuffer)
  135. .linear(1.5, -50)
  136. .normalize()
  137. .grayscale()
  138. .toBuffer();
  139. } else if (meanBrightness < 80) {
  140. console.log(' - 检测到低亮度图像,进行亮度调整');
  141. processedBuffer = await sharp(processedBuffer)
  142. .linear(1.2, 30)
  143. .normalize()
  144. .grayscale()
  145. .toBuffer();
  146. } else {
  147. console.log(' - 使用标准化灰度处理');
  148. processedBuffer = await sharp(processedBuffer)
  149. .normalize()
  150. .grayscale()
  151. .toBuffer();
  152. }
  153. // 3. 保持宽高比的resize
  154. const originalAspectRatio = metadata.width / metadata.height;
  155. const targetAspectRatio = targetWidth / targetHeight;
  156. let resizeWidth, resizeHeight;
  157. if (originalAspectRatio > targetAspectRatio) {
  158. // 宽度限制
  159. resizeWidth = targetWidth;
  160. resizeHeight = Math.round(targetWidth / originalAspectRatio);
  161. } else {
  162. // 高度限制
  163. resizeHeight = targetHeight;
  164. resizeWidth = Math.round(targetHeight * originalAspectRatio);
  165. }
  166. // 确保尺寸有效
  167. resizeWidth = Math.max(1, Math.min(resizeWidth, targetWidth));
  168. resizeHeight = Math.max(1, Math.min(resizeHeight, targetHeight));
  169. processedBuffer = await sharp(processedBuffer)
  170. .resize(resizeWidth, resizeHeight, {
  171. fit: 'contain',
  172. background: { r: 255, g: 255, b: 255 }
  173. })
  174. .extend({
  175. top: 0,
  176. bottom: targetHeight - resizeHeight,
  177. left: 0,
  178. right: targetWidth - resizeWidth,
  179. background: { r: 255, g: 255, b: 255 }
  180. })
  181. .png()
  182. .toBuffer();
  183. const processedMetadata = await sharp(processedBuffer).metadata();
  184. console.log(` - 处理后尺寸: ${processedMetadata.width}x${processedMetadata.height}`);
  185. // 保存预处理后的图像用于调试
  186. const processedPath = path.join(this.debugDir, `processed-${Date.now()}.png`);
  187. await fse.writeFile(processedPath, processedBuffer);
  188. // 4. 转换为张量 - 关键修复:正确的归一化
  189. console.log(' - 转换为张量数据...');
  190. const imageData = await sharp(processedBuffer)
  191. .ensureAlpha()
  192. .raw()
  193. .toBuffer({ resolveWithObject: true });
  194. const inputData = new Float32Array(3 * targetHeight * targetWidth);
  195. const data = imageData.data;
  196. const channels = imageData.info.channels;
  197. // 使用正确的归一化方法
  198. for (let i = 0; i < data.length; i += channels) {
  199. const pixelIndex = Math.floor(i / channels);
  200. const y = Math.floor(pixelIndex / targetWidth);
  201. const x = pixelIndex % targetWidth;
  202. // 对每个位置,三个通道使用相同的灰度值
  203. const grayValue = data[i] / 255.0;
  204. for (let c = 0; c < 3; c++) {
  205. const inputIndex = c * targetHeight * targetWidth + y * targetWidth + x;
  206. if (inputIndex < inputData.length) {
  207. inputData[inputIndex] = grayValue;
  208. }
  209. }
  210. }
  211. console.log(` - 输入数据长度: ${inputData.length}`);
  212. // 数据验证
  213. let validCount = 0;
  214. let sumValue = 0;
  215. let minValue = Infinity;
  216. let maxValue = -Infinity;
  217. for (let i = 0; i < Math.min(100, inputData.length); i++) {
  218. const val = inputData[i];
  219. if (!isNaN(val) && isFinite(val)) {
  220. validCount++;
  221. sumValue += val;
  222. minValue = Math.min(minValue, val);
  223. maxValue = Math.max(maxValue, val);
  224. }
  225. }
  226. console.log(` - 数据验证: 有效=${validCount}`);
  227. console.log(` - 数据范围: ${minValue.toFixed(4)} ~ ${maxValue.toFixed(4)}`);
  228. console.log(` - 数据均值: ${(sumValue / validCount).toFixed(4)}`);
  229. return new Tensor('float32', inputData, [1, 3, targetHeight, targetWidth]);
  230. } catch (error) {
  231. console.error(` ❌ 准备输入失败: ${error.message}`);
  232. // 返回有效的默认张量
  233. return new Tensor('float32', new Float32Array(3 * 48 * 320).fill(0.5), [1, 3, 48, 320]);
  234. }
  235. }
  236. postprocessRecognition(outputs) {
  237. console.log(' 📝 后处理识别结果详情:');
  238. try {
  239. const outputNames = this.recSession.outputNames;
  240. const recognitionOutput = outputs[outputNames[0]];
  241. if (!recognitionOutput) {
  242. console.log(' ❌ 识别输出为空');
  243. return { text: '', confidence: 0 };
  244. }
  245. const data = recognitionOutput.data;
  246. const [batch, seqLen, vocabSize] = recognitionOutput.dims;
  247. console.log(` - 序列长度: ${seqLen}, 词汇表大小: ${vocabSize}`);
  248. console.log(` - 输出数据总数: ${data.length}`);
  249. console.log(` - 字符集大小: ${this.characterSet.length}`);
  250. if (this.characterSet.length === 0) {
  251. console.log(' ❌ 字符集为空');
  252. return { text: '', confidence: 0 };
  253. }
  254. // 改进的CTC解码算法
  255. let text = '';
  256. let lastCharIndex = -1;
  257. let confidenceSum = 0;
  258. let charCount = 0;
  259. // 降低置信度阈值,提高召回率
  260. const confidenceThreshold = 0.05;
  261. console.log(' - 处理每个时间步:');
  262. for (let t = 0; t < seqLen; t++) {
  263. let maxProb = -1;
  264. let maxIndex = -1;
  265. // 找到当前时间步的最大概率字符
  266. for (let i = 0; i < vocabSize; i++) {
  267. const prob = data[t * vocabSize + i];
  268. if (prob > maxProb) {
  269. maxProb = prob;
  270. maxIndex = i;
  271. }
  272. }
  273. // 改进的解码逻辑
  274. if (maxIndex > 0 && maxProb > confidenceThreshold) {
  275. const char = this.characterSet[maxIndex - 1] || '';
  276. // 放宽重复字符限制
  277. if (maxIndex !== lastCharIndex || maxProb > 0.8) {
  278. if (char && char.trim() !== '') {
  279. text += char;
  280. confidenceSum += maxProb;
  281. charCount++;
  282. console.log(` [位置 ${t}] 字符: "${char}", 置信度: ${maxProb.toFixed(4)}`);
  283. }
  284. lastCharIndex = maxIndex;
  285. }
  286. } else if (maxIndex === 0) {
  287. // 空白符,重置lastCharIndex
  288. lastCharIndex = -1;
  289. }
  290. }
  291. const avgConfidence = charCount > 0 ? confidenceSum / charCount : 0;
  292. console.log(` - 识别结果: "${text}"`);
  293. console.log(` - 字符数: ${charCount}, 平均置信度: ${avgConfidence.toFixed(4)}`);
  294. return {
  295. text: text,
  296. confidence: avgConfidence
  297. };
  298. } catch (error) {
  299. console.error(` ❌ 后处理失败: ${error.message}`);
  300. return { text: '', confidence: 0 };
  301. }
  302. }
  303. }
  304. export default TextRecognizer;