// scripts/download-ppocrv5.js
import fs from 'fs-extra';
import path from 'path';
import { fileURLToPath } from 'url';
import { createRequire } from 'module';

// Neither binding is referenced by the code below; both are kept as
// module-level conveniences exactly as the script originally declared them.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);

/**
 * Sub-directory (relative to the model directory) that stores each kind of
 * file. Single source of truth for directory creation, target paths and the
 * manual-install instructions.
 */
const MODEL_SUBDIRS = Object.freeze({
  detection: 'det',
  recognition: 'rec',
  classification: 'cls',
  keys: 'keys',
});

/**
 * Downloads the PP-OCRv5 detection/recognition models, the classification
 * model and the character-set file into `<cwd>/models/ppocrv5`.
 */
class PPOCRv5Downloader {
  constructor() {
    this.modelDir = path.join(process.cwd(), 'models', 'ppocrv5');
    // Staging area: downloads are written here and only moved into
    // `modelDir` after they pass validation.
    this.tempDir = path.join(process.cwd(), 'temp', 'downloads');

    // PP-OCRv5 model download links, keyed by file type.
    this.modelUrls = {
      detection: {
        url: 'https://paddleocr.bj.bcebos.com/PP-OCRv5/chinese/ch_PP-OCRv5_det_infer.onnx',
        filename: 'ch_PP-OCRv5_det_infer.onnx',
      },
      recognition: {
        url: 'https://paddleocr.bj.bcebos.com/PP-OCRv5/chinese/ch_PP-OCRv5_rec_infer.onnx',
        filename: 'ch_PP-OCRv5_rec_infer.onnx',
      },
      classification: {
        url: 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.onnx',
        filename: 'ch_ppocr_mobile_v2.0_cls_infer.onnx',
      },
      keys: {
        url: 'https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.7/ppocr/utils/ppocr_keys_v1.txt',
        filename: 'ppocr_keys_v1.txt',
      },
    };
  }

  /**
   * Creates the directory layout and downloads every entry of `modelUrls`
   * in parallel. Failures are reported on the console together with manual
   * download instructions; this method itself never rejects because of a
   * failed download.
   *
   * @returns {Promise<boolean>} `true` when every file is in place,
   *   `false` when at least one step failed.
   */
  async downloadModels() {
    console.log('🚀 开始下载 PP-OCRv5 模型...');
    console.log('📝 PP-OCRv5 特性:');
    console.log(' - 更高的文本检测准确率');
    console.log(' - 更好的小文本识别能力');
    console.log(' - 优化的模型结构');
    console.log(' - 完全离线运行\n');

    try {
      // Create the directory structure.
      await this.createDirectories();

      const entries = Object.entries(this.modelUrls);
      const totalCount = entries.length;
      let successCount = 0;

      // Download everything in parallel. allSettled (rather than all) lets
      // every download run to completion before the outcome is reported, so
      // the fallback instructions are not interleaved with late progress
      // lines from downloads that are still in flight.
      const results = await Promise.allSettled(
        entries.map(async ([type, info]) => {
          try {
            await this.downloadFile(type, info);
            successCount++;
            console.log(` ✅ ${this.getTypeName(type)} 下载完成 (${successCount}/${totalCount})`);
          } catch (error) {
            console.log(` ❌ ${this.getTypeName(type)} 下载失败: ${error.message}`);
            throw error;
          }
        }),
      );

      const firstFailure = results.find((result) => result.status === 'rejected');
      if (firstFailure) {
        throw firstFailure.reason;
      }

      console.log('\n🎉 所有模型下载完成!');
      this.displayModelInfo();
      return true;
    } catch (error) {
      console.error('\n❌ 下载过程中出现错误:', error.message);
      await this.provideAlternativeSources();
      return false;
    }
  }

  /**
   * Ensures the model directory, one sub-directory per file type and the
   * temporary download directory all exist.
   *
   * @returns {Promise<void>}
   */
  async createDirectories() {
    const dirs = [
      this.modelDir,
      ...Object.values(MODEL_SUBDIRS).map((subDir) => path.join(this.modelDir, subDir)),
      this.tempDir,
    ];

    for (const dir of dirs) {
      await fs.ensureDir(dir);
    }

    console.log('📁 目录结构创建完成');
  }

  /**
   * Downloads one file unless a large-enough copy already exists.
   *
   * The payload is written to `tempDir` first and moved to its final
   * location only after validation, so a truncated or invalid download never
   * ends up where a later run would mistake it for a finished file.
   *
   * @param {string} type - Key of `modelUrls` (`detection`, `recognition`,
   *   `classification` or `keys`).
   * @param {{url: string, filename: string}} info - Download source and
   *   target file name.
   * @returns {Promise<void>}
   * @throws {Error} On a non-2xx HTTP response, an undersized payload or a
   *   failed validation.
   */
  async downloadFile(type, info) {
    const targetPath = this.getTargetPath(type, info.filename);
    const minSize = this.getMinFileSize(type);

    // Skip the download when the file is already present. `>=` mirrors the
    // `< minSize` rejection below so both checks agree on the boundary.
    if (await fs.pathExists(targetPath)) {
      const stats = await fs.stat(targetPath);
      if (stats.size >= minSize) {
        console.log(` ⏭️ ${this.getTypeName(type)} 已存在,跳过下载`);
        return;
      }
    }

    console.log(` 📥 下载 ${this.getTypeName(type)}...`);

    const { default: fetch } = await import('node-fetch');
    const response = await fetch(info.url);

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // arrayBuffer() replaces the deprecated node-fetch response.buffer().
    const buffer = Buffer.from(await response.arrayBuffer());

    // Verify the file size.
    if (buffer.length < minSize) {
      throw new Error(`文件大小异常: ${(buffer.length / 1024 / 1024).toFixed(2)} MB`);
    }

    const tempPath = path.join(this.tempDir, `${info.filename}.part`);
    try {
      await fs.writeFile(tempPath, buffer);

      // Verify file integrity before the file becomes visible at its target.
      await this.validateFile(type, tempPath);

      await fs.move(tempPath, targetPath, { overwrite: true });
    } catch (error) {
      // Best-effort cleanup of the staged file; a cleanup problem must not
      // hide the error that actually caused the failure.
      await fs.remove(tempPath).catch(() => {});
      throw error;
    }
  }

  /**
   * Builds the final on-disk path for a file of the given type.
   *
   * @param {string} type - Key of `modelUrls`.
   * @param {string} filename - File name to place in the type's directory.
   * @returns {string} Path inside `modelDir`.
   */
  getTargetPath(type, filename) {
    return path.join(this.modelDir, MODEL_SUBDIRS[type], filename);
  }

  /**
   * Returns the human-readable label used in progress messages.
   *
   * @param {string} type - Key of `modelUrls`.
   * @returns {string|undefined} The label, or `undefined` for an unknown type.
   */
  getTypeName(type) {
    const names = {
      detection: '检测模型 (PP-OCRv5 Det)',
      recognition: '识别模型 (PP-OCRv5 Rec)',
      classification: '分类模型 (Cls)',
      keys: '字符集文件',
    };
    return names[type];
  }

  /**
   * Returns the minimum acceptable size in bytes for a file of the given
   * type; anything smaller is treated as a failed or partial download.
   *
   * @param {string} type - Key of `modelUrls`.
   * @returns {number|undefined} Size in bytes, or `undefined` for an unknown
   *   type.
   */
  getMinFileSize(type) {
    const sizes = {
      detection: 2000000, // 2MB
      recognition: 8000000, // 8MB
      classification: 1000000, // 1MB
      // NOTE(review): validateFile only demands 5000 non-empty lines; if the
      // key file holds one short entry per line it may be well under 50KB,
      // in which case this floor rejects a valid file - confirm against the
      // real file size.
      keys: 50000, // 50KB
    };
    return sizes[type];
  }

  /**
   * Validates a downloaded file and logs its size. Only the `keys` type has
   * a content check: it must contain at least 5000 non-empty lines.
   *
   * @param {string} type - Key of `modelUrls`.
   * @param {string} filePath - Path of the file to inspect.
   * @returns {Promise<void>}
   * @throws {Error} When the character-set file has too few lines.
   */
  async validateFile(type, filePath) {
    const stats = await fs.stat(filePath);

    if (type === 'keys') {
      const content = await fs.readFile(filePath, 'utf8');
      const lines = content.split('\n').filter((line) => line.trim());
      if (lines.length < 5000) {
        throw new Error('字符集文件不完整');
      }
    }

    console.log(` 📊 文件大小: ${(stats.size / 1024 / 1024).toFixed(2)} MB`);
  }

  /**
   * Prints where each file was stored and how to start the application.
   * Paths are derived from `modelUrls` so they cannot drift from the actual
   * download targets.
   */
  displayModelInfo() {
    const labels = {
      detection: '🎯 检测模型',
      recognition: '🔤 识别模型',
      classification: '🧭 分类模型',
      keys: '📝 字符集',
    };

    console.log('\n📂 模型文件位置:');
    for (const [type, { filename }] of Object.entries(this.modelUrls)) {
      console.log(` ${labels[type]}: ${this.getTargetPath(type, filename)}`);
    }

    console.log('\n🚀 使用命令:');
    console.log(' yarn dev # 启动应用');
  }

  /**
   * Prints manual download instructions: the source URL of every file and
   * the directory tree the files must be placed in.
   *
   * @returns {Promise<void>}
   */
  async provideAlternativeSources() {
    const labels = {
      detection: '检测模型',
      recognition: '识别模型',
      classification: '分类模型',
      keys: '字符集',
    };
    const entries = Object.entries(this.modelUrls);

    console.log('\n💡 备用下载方案:');
    console.log(' 1. 手动下载 PP-OCRv5 模型:');
    for (const [type, { url }] of entries) {
      console.log(` - ${labels[type]}: ${url}`);
    }

    console.log('\n 2. 将文件放置到以下目录:');
    console.log(` ${this.modelDir}/`);
    entries.forEach(([type, { filename }], index) => {
      // The last entry closes the tree drawing.
      const branch = index === entries.length - 1 ? '└──' : '├──';
      console.log(` ${branch} ${MODEL_SUBDIRS[type]}/${filename}`);
    });
  }
}

// Run the download.
const downloader = new PPOCRv5Downloader();
downloader.downloadModels().catch(console.error);