// scripts/download-ppocrv5.js
import fs from 'fs-extra';
|
||
|
|
import path from 'path';
|
||
|
|
import { fileURLToPath } from 'url';
|
||
|
|
import { createRequire } from 'module';
|
||
|
|
|
||
|
|
// ES modules provide neither `__dirname` nor `require`; both are rebuilt here
// from this module's own URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);
|
||
|
|
|
||
|
|
/**
 * Downloads the PP-OCRv5 detection / recognition / classification ONNX models
 * and the character dictionary into `<cwd>/models/ppocrv5/{det,rec,cls,keys}`.
 *
 * Relies on `fs` (fs-extra) and `path` being imported by the enclosing module,
 * and on the `node-fetch` package being resolvable at download time.
 */
class PPOCRv5Downloader {
  /**
   * Resolves the target and staging directories against the current working
   * directory and builds the table of assets to download.
   */
  constructor() {
    this.modelDir = path.join(process.cwd(), 'models', 'ppocrv5');
    this.tempDir = path.join(process.cwd(), 'temp', 'downloads');

    // Download sources keyed by asset type. The original author describes these
    // as the official PP-OCRv5 links; their availability is not checked here.
    this.modelUrls = {
      detection: {
        url: 'https://paddleocr.bj.bcebos.com/PP-OCRv5/chinese/ch_PP-OCRv5_det_infer.onnx',
        filename: 'ch_PP-OCRv5_det_infer.onnx'
      },
      recognition: {
        url: 'https://paddleocr.bj.bcebos.com/PP-OCRv5/chinese/ch_PP-OCRv5_rec_infer.onnx',
        filename: 'ch_PP-OCRv5_rec_infer.onnx'
      },
      classification: {
        url: 'https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.onnx',
        filename: 'ch_ppocr_mobile_v2.0_cls_infer.onnx'
      },
      keys: {
        url: 'https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.7/ppocr/utils/ppocr_keys_v1.txt',
        filename: 'ppocr_keys_v1.txt'
      }
    };
  }

  /**
   * Downloads every asset in `this.modelUrls` concurrently and prints a summary.
   *
   * Failures are reported on the console together with manual download
   * instructions; the returned promise still resolves in that case.
   *
   * @returns {Promise<void>}
   */
  async downloadModels() {
    console.log('🚀 开始下载 PP-OCRv5 模型...');
    console.log('📝 PP-OCRv5 特性:');
    console.log(' - 更高的文本检测准确率');
    console.log(' - 更好的小文本识别能力');
    console.log(' - 优化的模型结构');
    console.log(' - 完全离线运行\n');

    try {
      // Create the directory layout first.
      await this.createDirectories();

      const entries = Object.entries(this.modelUrls);
      const totalCount = entries.length;
      let successCount = 0;

      // Download all assets in parallel. allSettled (rather than all) lets every
      // download finish before the outcome is reported, so the fallback
      // instructions are not interleaved with still-running downloads.
      const outcomes = await Promise.allSettled(
        entries.map(async ([type, info]) => {
          try {
            await this.downloadFile(type, info);
            successCount += 1;
            console.log(` ✅ ${this.getTypeName(type)} 下载完成 (${successCount}/${totalCount})`);
          } catch (error) {
            console.log(` ❌ ${this.getTypeName(type)} 下载失败: ${error.message}`);
            throw error;
          }
        })
      );

      const firstFailure = outcomes.find((outcome) => outcome.status === 'rejected');
      if (firstFailure) {
        throw firstFailure.reason;
      }

      console.log('\n🎉 所有模型下载完成!');
      this.displayModelInfo();
    } catch (error) {
      console.error('\n❌ 下载过程中出现错误:', error.message);
      await this.provideAlternativeSources();
    }
  }

  /**
   * Ensures the model directory, its per-type sub-directories and the staging
   * directory exist.
   *
   * @returns {Promise<void>}
   */
  async createDirectories() {
    const dirs = [
      this.modelDir,
      path.join(this.modelDir, 'det'),
      path.join(this.modelDir, 'rec'),
      path.join(this.modelDir, 'cls'),
      path.join(this.modelDir, 'keys'),
      this.tempDir
    ];

    for (const dir of dirs) {
      await fs.ensureDir(dir);
    }
    console.log('📁 目录结构创建完成');
  }

  /**
   * Downloads a single asset unless a plausibly complete copy already exists.
   *
   * @param {string} type - Asset type key (`detection`, `recognition`, `classification`, `keys`).
   * @param {{url: string, filename: string}} info - Download URL and target file name.
   * @returns {Promise<void>}
   * @throws {Error} On a non-2xx HTTP status, an undersized payload, or failed validation.
   */
  async downloadFile(type, info) {
    const targetPath = this.getTargetPath(type, info.filename);
    const minSize = this.getMinFileSize(type);

    // Skip when the file is already present and at least the minimum size
    // (the same bound a fresh download has to satisfy below).
    if (await fs.pathExists(targetPath)) {
      const stats = await fs.stat(targetPath);
      if (stats.size >= minSize) {
        console.log(` ⏭️ ${this.getTypeName(type)} 已存在，跳过下载`);
        return;
      }
    }

    console.log(` 📥 下载 ${this.getTypeName(type)}...`);

    const { default: fetch } = await import('node-fetch');
    const response = await fetch(info.url);

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // arrayBuffer() is available in node-fetch v2 and v3; buffer() is deprecated in v3.
    const buffer = Buffer.from(await response.arrayBuffer());

    // Reject truncated payloads and error pages before touching the disk.
    if (buffer.length < minSize) {
      throw new Error(`文件大小异常: ${(buffer.length / 1024 / 1024).toFixed(2)} MB`);
    }

    await this.saveValidatedFile(type, targetPath, buffer);
  }

  /**
   * Internal helper: writes `buffer` to a staging file, validates it, and only
   * then moves it to `targetPath`, so a file that fails validation never ends
   * up at the final location (where a later run would skip it based on size).
   *
   * @param {string} type - Asset type key, forwarded to `validateFile`.
   * @param {string} targetPath - Final destination of the file.
   * @param {Buffer} buffer - Downloaded content.
   * @returns {Promise<void>}
   * @throws {Error} Whatever writing, validating or moving the file throws.
   */
  async saveValidatedFile(type, targetPath, buffer) {
    // The pid keeps concurrently running instances of the script from sharing a staging file.
    const stagingPath = path.join(
      this.tempDir,
      `${path.basename(targetPath)}.${process.pid}.part`
    );

    try {
      await fs.ensureDir(this.tempDir);
      await fs.writeFile(stagingPath, buffer);
      await this.validateFile(type, stagingPath);
      await fs.move(stagingPath, targetPath, { overwrite: true });
    } catch (error) {
      // Best-effort cleanup; a cleanup failure must not mask the original error.
      await fs.remove(stagingPath).catch(() => {});
      throw error;
    }
  }

  /**
   * Maps an asset type and file name to its path below the model directory.
   *
   * @param {string} type - Asset type key.
   * @param {string} filename - File name inside the type's sub-directory.
   * @returns {string} Full path of the file.
   * @throws {TypeError} If `type` is not a known asset type.
   */
  getTargetPath(type, filename) {
    const subDirs = {
      detection: 'det',
      recognition: 'rec',
      classification: 'cls',
      keys: 'keys'
    };

    if (!Object.hasOwn(subDirs, type)) {
      throw new TypeError(`Unknown model type: ${type}`);
    }
    return path.join(this.modelDir, subDirs[type], filename);
  }

  /**
   * Returns the display label used in console output for an asset type.
   *
   * @param {string} type - Asset type key.
   * @returns {string|undefined} The label, or `undefined` for an unknown type.
   */
  getTypeName(type) {
    const names = {
      detection: '检测模型 (PP-OCRv5 Det)',
      recognition: '识别模型 (PP-OCRv5 Rec)',
      classification: '分类模型 (Cls)',
      keys: '字符集文件'
    };
    return names[type];
  }

  /**
   * Returns the minimum plausible size in bytes for an asset type.
   *
   * @param {string} type - Asset type key.
   * @returns {number|undefined} Size floor in bytes, or `undefined` for an unknown type.
   */
  getMinFileSize(type) {
    const sizes = {
      detection: 2000000, // 2 MB
      recognition: 8000000, // 8 MB
      classification: 1000000, // 1 MB
      // ppocr_keys_v1.txt is roughly 6.6k one-character lines (about 26 KB in
      // UTF-8), so the previous 50 KB floor rejected the genuine file.
      keys: 20000 // 20 KB
    };
    return sizes[type];
  }

  /**
   * Validates a file on disk and logs its size. For the `keys` type the file
   * must contain at least 5000 non-blank lines; other types are only measured.
   *
   * @param {string} type - Asset type key.
   * @param {string} filePath - Path of the file to check.
   * @returns {Promise<void>}
   * @throws {Error} If the character dictionary has too few entries.
   */
  async validateFile(type, filePath) {
    const stats = await fs.stat(filePath);

    if (type === 'keys') {
      const content = await fs.readFile(filePath, 'utf8');
      const lines = content.split('\n').filter((line) => line.trim());
      if (lines.length < 5000) {
        throw new Error('字符集文件不完整');
      }
    }

    console.log(` 📊 文件大小: ${(stats.size / 1024 / 1024).toFixed(2)} MB`);
  }

  /**
   * Prints where each asset was stored and how to start the application.
   * Paths come from `this.modelUrls` so they cannot drift from what was downloaded.
   */
  displayModelInfo() {
    const locate = (type) => this.getTargetPath(type, this.modelUrls[type].filename);

    console.log('\n📂 模型文件位置:');
    console.log(` 🎯 检测模型: ${locate('detection')}`);
    console.log(` 🔤 识别模型: ${locate('recognition')}`);
    console.log(` 🧭 分类模型: ${locate('classification')}`);
    console.log(` 📝 字符集: ${locate('keys')}`);

    console.log('\n🚀 使用命令:');
    console.log(' yarn dev # 启动应用');
  }

  /**
   * Prints manual download instructions: the source URLs and the directory
   * layout the files have to be placed in.
   *
   * @returns {Promise<void>}
   */
  async provideAlternativeSources() {
    const { detection, recognition, classification, keys } = this.modelUrls;

    console.log('\n💡 备用下载方案:');
    console.log(' 1. 手动下载 PP-OCRv5 模型:');
    console.log(` - 检测模型: ${detection.url}`);
    console.log(` - 识别模型: ${recognition.url}`);
    console.log(` - 分类模型: ${classification.url}`);
    console.log(` - 字符集: ${keys.url}`);
    console.log('\n 2. 将文件放置到以下目录:');
    console.log(` ${this.modelDir}/`);
    console.log(` ├── det/${detection.filename}`);
    console.log(` ├── rec/${recognition.filename}`);
    console.log(` ├── cls/${classification.filename}`);
    console.log(` └── keys/${keys.filename}`);
  }
}
|
||
|
|
|
||
|
|
// Run the download as soon as this module is evaluated. downloadModels()
// handles download failures itself, so this handler only sees unexpected
// errors; those are logged and reflected in the process exit code.
const downloader = new PPOCRv5Downloader();
downloader.downloadModels().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|