/**
 * File-management HTTP API.
 *
 * Exposes upload, listing, MD5 verification, preview/thumbnail and OCR
 * endpoints on top of Express. Uploaded files are stored under `uploads/`,
 * intermediate images under `temp/` (both relative to the process working
 * directory). The module exports `startServer`, which binds the app to PORT.
 */
const express = require('express')
const cors = require('cors')
const multer = require('multer')
const path = require('path')
const fs = require('fs-extra')
const { calculateFileMD5 } = require('./utils.js')
const { initDatabase, FileService } = require('../database/database.js')
// OCR-related dependencies
const Tesseract = require('tesseract.js')
const sharp = require('sharp')
// NOTE(review): createCanvas/loadImage are not used in this file; kept because
// other code or native-module loading may rely on the require.
const { createCanvas, loadImage } = require('canvas')

const app = express()
const PORT = 3000

// Initialise the database
initDatabase()
const fileService = new FileService()

// Make sure the upload and temp directories exist
const uploadDir = path.join(process.cwd(), 'uploads')
const tempDir = path.join(process.cwd(), 'temp')
fs.ensureDirSync(uploadDir)
fs.ensureDirSync(tempDir)

/**
 * Re-interpret a multipart filename that was decoded as latin1 as UTF-8.
 *
 * The conversion is guarded so that it is safe to apply to a name that has
 * already been decoded: a string containing code points above U+00FF cannot
 * be raw latin1 bytes, and a conversion that produces U+FFFD means the input
 * was not UTF-8 bytes in the first place. In both cases the input is
 * returned unchanged.
 *
 * @param {string} name - Filename as reported by the multipart parser.
 * @returns {string} The filename decoded as UTF-8 where applicable.
 */
function decodeOriginalName(name) {
  if (typeof name !== 'string' || /[^\u0000-\u00ff]/.test(name)) {
    return name
  }
  const decoded = Buffer.from(name, 'latin1').toString('utf8')
  return decoded.includes('\uFFFD') ? name : decoded
}

/**
 * Parse a route/body value as an integer ID.
 *
 * @param {*} value - Raw value (string or number).
 * @returns {number|null} The parsed integer, or null when it is not numeric.
 */
function parseId(value) {
  const id = Number.parseInt(value, 10)
  return Number.isNaN(id) ? null : id
}

/**
 * Parse a query value as a strictly positive integer.
 *
 * @param {*} value - Raw query value.
 * @param {number} fallback - Value used when parsing fails or the result is < 1.
 * @returns {number} The parsed integer or the fallback.
 */
function parsePositiveInt(value, fallback) {
  const parsed = Number.parseInt(value, 10)
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback
}

/**
 * Remove a file, logging (not throwing) on failure.
 *
 * @param {string} targetPath - Path to remove.
 * @returns {Promise<void>}
 */
async function removeQuietly(targetPath) {
  try {
    await fs.remove(targetPath)
  } catch (cleanupError) {
    console.error('Cleanup error:', cleanupError)
  }
}

// multer storage: keeps a sanitised version of the original (possibly Chinese)
// filename and appends a unique suffix.
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, uploadDir)
  },
  filename: (req, file, cb) => {
    // fileFilter below has already decoded the name; decodeOriginalName is
    // guarded, so calling it here cannot double-decode.
    const originalName = decodeOriginalName(file.originalname)
    const ext = path.extname(originalName)
    const name = path.basename(originalName, ext)
    // Replace everything except ASCII alphanumerics and CJK ideographs
    const safeName = name.replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '_')
    const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
    cb(null, safeName + '-' + uniqueSuffix + ext)
  }
})

const upload = multer({
  storage,
  fileFilter: (req, file, cb) => {
    // Decode the filename exactly once; later stages read file.originalname.
    file.originalname = decodeOriginalName(file.originalname)
    cb(null, true)
  }
})

// Default every response to JSON/UTF-8. Routes that send files must set their
// own Content-Type explicitly, because this header is already present by the
// time they run.
app.use((req, res, next) => {
  res.setHeader('Content-Type', 'application/json; charset=utf-8')
  next()
})
app.use(cors())
app.use(express.json({ limit: '50mb' }))
app.use(express.urlencoded({ extended: true, limit: '50mb' }))

// File upload endpoint
app.post('/api/upload', upload.single('file'), async (req, res) => {
  try {
    if (!req.file) {
      return res.status(400).json({ error: 'No file uploaded' })
    }

    const fileInfo = {
      // Already decoded by fileFilter
      originalName: req.file.originalname,
      fileName: req.file.filename,
      filePath: req.file.path,
      fileSize: req.file.size,
      mimeType: req.file.mimetype
    }

    let fileRecord
    try {
      // Compute the MD5 and persist the record
      const md5 = await calculateFileMD5(req.file.path)
      fileRecord = await fileService.createFile({ ...fileInfo, md5 })
    } catch (error) {
      // Do not leave an orphaned file behind when the record was not saved
      await removeQuietly(req.file.path)
      throw error
    }

    res.json({ success: true, data: fileRecord })
  } catch (error) {
    console.error('Upload error:', error)
    res.status(500).json({ error: 'Upload failed: ' + error.message })
  }
})

// File list endpoint - returns a uniform { success, data, pagination } shape
app.get('/api/files', async (req, res) => {
  try {
    const page = parsePositiveInt(req.query.page, 1)
    const pageSize = parsePositiveInt(req.query.pageSize, 100)
    const result = await fileService.getFilesPaginated(page, pageSize)

    res.json({
      success: true,
      data: result.files, // the file array itself
      pagination: result.pagination
    })
  } catch (error) {
    console.error('Get files error:', error)
    res.status(500).json({
      success: false,
      error: 'Failed to get files: ' + error.message
    })
  }
})

// MD5 check endpoint: compares the stored MD5 with the file's current MD5
app.post('/api/files/:id/check-md5', async (req, res) => {
  try {
    const fileId = parseId(req.params.id)
    if (fileId === null) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }

    const file = await fileService.getFileById(fileId)
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    const currentMD5 = await calculateFileMD5(file.filePath)
    const isChanged = currentMD5 !== file.md5

    res.json({ isChanged, currentMD5, originalMD5: file.md5, file })
  } catch (error) {
    console.error('MD5 check error:', error)
    res.status(500).json({ error: 'MD5 check failed' })
  }
})

// MD5 update endpoint
app.put('/api/files/:id/update-md5', async (req, res) => {
  try {
    const fileId = parseId(req.params.id)
    if (fileId === null) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }

    const { md5 } = req.body
    if (typeof md5 !== 'string' || md5.length === 0) {
      return res.status(400).json({ error: 'MD5 is required' })
    }

    await fileService.updateFileMD5(fileId, md5)
    res.json({ success: true })
  } catch (error) {
    console.error('Update MD5 error:', error)
    res.status(500).json({ error: 'Update failed' })
  }
})

// OCR recognition endpoint
app.post('/api/ocr/recognize', async (req, res) => {
  try {
    const { fileId } = req.body
    if (!fileId) {
      return res.status(400).json({ error: 'File ID is required' })
    }

    const parsedId = parseId(fileId)
    if (parsedId === null) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }

    const file = await fileService.getFileById(parsedId)
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    console.log(`开始OCR识别: ${file.originalName}`)

    // Pre-process the image. On failure preprocessImage hands back the
    // original path, which must never be deleted.
    const processedImagePath = await preprocessImage(file.filePath)
    const isTempImage = processedImagePath !== file.filePath

    let result
    try {
      // Run Tesseract
      result = await performOCR(processedImagePath)
    } finally {
      // Remove the temp file whether or not OCR succeeded
      if (isTempImage) {
        await removeQuietly(processedImagePath)
      }
    }

    res.json({
      success: true,
      data: {
        textBlocks: result.textBlocks,
        totalPages: result.totalPages || 1,
        processingTime: result.processingTime,
        confidence: result.confidence
      }
    })
  } catch (error) {
    console.error('OCR recognition error:', error)
    res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
  }
})

// OCR result endpoints

// Save an OCR result
app.post('/api/ocr/save-result', async (req, res) => {
  try {
    const { fileId, ocrData } = req.body
    if (!fileId || !ocrData) {
      return res.status(400).json({ error: '文件ID和OCR数据是必需的' })
    }

    const parsedId = parseId(fileId)
    if (parsedId === null) {
      return res.status(400).json({ error: '无效的文件ID' })
    }

    await fileService.saveOcrResult(parsedId, ocrData)
    res.json({ success: true })
  } catch (error) {
    console.error('保存OCR结果失败:', error)
    res.status(500).json({ error: '保存OCR结果失败: ' + error.message })
  }
})

// Fetch an OCR result
app.get('/api/ocr/result/:fileId', async (req, res) => {
  try {
    const fileId = parseId(req.params.fileId)
    if (fileId === null) {
      return res.status(400).json({ error: '无效的文件ID' })
    }

    const result = await fileService.getOcrResult(fileId)
    if (result) {
      res.json({ success: true, data: result.ocr_data })
    } else {
      res.json({ success: false, error: '未找到OCR结果' })
    }
  } catch (error) {
    console.error('获取OCR结果失败:', error)
    res.status(500).json({ error: '获取OCR结果失败: ' + error.message })
  }
})

// Update OCR text (manual correction)
app.put('/api/ocr/update-text', async (req, res) => {
  try {
    const { fileId, textBlocks } = req.body
    if (!fileId || !textBlocks) {
      return res.status(400).json({ error: '文件ID和文本数据是必需的' })
    }

    const parsedId = parseId(fileId)
    if (parsedId === null) {
      return res.status(400).json({ error: '无效的文件ID' })
    }

    await fileService.updateOcrText(parsedId, textBlocks)
    res.json({ success: true })
  } catch (error) {
    console.error('更新OCR文本失败:', error)
    res.status(500).json({ error: '更新OCR文本失败: ' + error.message })
  }
})

/**
 * Produce a grayscale, normalised, contrast-boosted and sharpened PNG copy of
 * an image in the temp directory.
 *
 * @param {string} imagePath - Path of the source image.
 * @returns {Promise<string>} Path of the pre-processed PNG, or `imagePath`
 *   itself when pre-processing failed. Callers must compare the result with
 *   their input before deleting it.
 */
async function preprocessImage(imagePath) {
  // Random suffix avoids collisions between requests in the same millisecond
  const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
  const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)

  try {
    await sharp(imagePath)
      .grayscale() // convert to grayscale
      .normalize() // stretch the histogram
      .linear(1.5, 0) // increase contrast
      .sharpen()
      .png()
      .toFile(tempOutputPath)
    return tempOutputPath
  } catch (error) {
    console.error('Image preprocessing failed:', error)
    // Fall back to the original image
    return imagePath
  }
}

/**
 * Run Tesseract (Simplified Chinese + English) on an image.
 *
 * @param {string} imagePath - Path of the image to recognise.
 * @returns {Promise<{textBlocks: Array<object>, confidence: number, processingTime: number}>}
 *   Parsed text blocks, the confidence reported by Tesseract and the elapsed
 *   time in milliseconds.
 * @throws Propagates any rejection from `Tesseract.recognize`.
 */
async function performOCR(imagePath) {
  const startTime = Date.now()

  // No character whitelist: the previous literal was syntactically broken and
  // wrote "\u4e00-\u9fa5" as if it were a range; a whitelist is presumably a
  // literal character list, so it would have excluded almost all Chinese text.
  const { data: { text, confidence } } = await Tesseract.recognize(
    imagePath,
    'chi_sim+eng', // Simplified Chinese + English
    {
      logger: (message) => console.log(message),
      tessedit_pageseg_mode: Tesseract.PSM.AUTO
    }
  )

  const processingTime = Date.now() - startTime
  const textBlocks = parseOCRText(text)

  return { textBlocks, confidence, processingTime }
}

/**
 * Split raw OCR text into typed blocks, one per non-empty line.
 *
 * Each line is classified, in order of precedence, as `reference`,
 * `citation` (leading "[n]" stripped, number extracted), `image`, or `text`.
 *
 * @param {string} text - Raw OCR output.
 * @returns {Array<{type: string, content: string, number?: (number|null)}>}
 */
function parseOCRText(text) {
  const blocks = []

  for (const line of text.split('\n')) {
    const trimmedLine = line.trim()
    if (!trimmedLine) continue

    if (isReference(trimmedLine)) {
      blocks.push({ type: 'reference', content: trimmedLine })
    } else if (isCitation(trimmedLine)) {
      blocks.push({
        type: 'citation',
        content: trimmedLine.replace(/^\[\d+\]\s*/, ''),
        number: extractCitationNumber(trimmedLine)
      })
    } else if (isImageMarker(trimmedLine)) {
      blocks.push({ type: 'image', content: trimmedLine })
    } else {
      blocks.push({ type: 'text', content: trimmedLine })
    }
  }

  return blocks
}

/**
 * Whether a line looks like a bibliography heading or a reference entry.
 *
 * @param {string} text - A trimmed line.
 * @returns {boolean}
 */
function isReference(text) {
  const refPatterns = [
    /^参考文献/i,
    /^references/i,
    /^bibliography/i,
    /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/
  ]
  return refPatterns.some((pattern) => pattern.test(text))
}

/**
 * Whether a line starts with a bracketed number such as "[12]".
 *
 * @param {string} text - A trimmed line.
 * @returns {boolean}
 */
function isCitation(text) {
  return /^\[\d+\]/.test(text)
}

/**
 * Extract the number from a leading "[n]" marker.
 *
 * @param {string} text - A trimmed line.
 * @returns {number|null} The number, or null when there is no marker.
 */
function extractCitationNumber(text) {
  const match = text.match(/^\[(\d+)\]/)
  return match ? Number.parseInt(match[1], 10) : null
}

/**
 * Whether a line starts with a figure marker ("图 1", "Figure 2", "图片").
 *
 * @param {string} text - A trimmed line.
 * @returns {boolean}
 */
function isImageMarker(text) {
  const imagePatterns = [
    /^图\s*\d+/i,
    /^figure\s*\d+/i,
    /^图片\d*/i
  ]
  return imagePatterns.some((pattern) => pattern.test(text))
}

// File preview endpoint: streams the stored file with its recorded MIME type
app.get('/api/files/:id/preview', async (req, res) => {
  try {
    const fileId = parseId(req.params.id)
    if (fileId === null) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }

    const file = await fileService.getFileById(fileId)
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // Make sure the file is still on disk
    if (!(await fs.pathExists(file.filePath))) {
      return res.status(404).json({ error: 'File not found on disk' })
    }

    // Override the JSON default set by the global middleware
    res.setHeader('Content-Type', file.mimeType)
    res.sendFile(path.resolve(file.filePath))
  } catch (error) {
    console.error('File preview error:', error)
    res.status(500).json({ error: 'Failed to get file preview' })
  }
})

// File thumbnail endpoint: 100x100 (max) JPEG for image files
app.get('/api/files/:id/thumbnail', async (req, res) => {
  // Declared outside the try block so the fallback in catch can reach it
  let file = null

  try {
    const fileId = parseId(req.params.id)
    if (fileId === null) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }

    file = await fileService.getFileById(fileId)
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // Thumbnails are generated for images only
    if (typeof file.mimeType !== 'string' || !file.mimeType.startsWith('image/')) {
      return res.status(400).json({ error: 'Not an image file' })
    }

    const thumbnailPath = path.join(tempDir, `thumbnail-${fileId}.jpg`)

    await sharp(file.filePath)
      .resize(100, 100, { fit: 'inside', withoutEnlargement: true })
      .jpeg({ quality: 80 })
      .toFile(thumbnailPath)

    // Override the JSON default set by the global middleware
    res.setHeader('Content-Type', 'image/jpeg')
    res.sendFile(path.resolve(thumbnailPath))
  } catch (error) {
    console.error('Thumbnail generation error:', error)

    if (res.headersSent) {
      return
    }

    // Fall back to the original image when it is known
    if (file && file.filePath) {
      if (file.mimeType) {
        res.setHeader('Content-Type', file.mimeType)
      }
      return res.sendFile(path.resolve(file.filePath))
    }

    res.status(500).json({ error: 'Failed to generate thumbnail' })
  }
})

// Health check endpoint
app.get('/api/health', (req, res) => {
  res.json({
    status: 'OK',
    timestamp: new Date().toISOString(),
    service: 'file-management-api'
  })
})

/**
 * Start the HTTP server on PORT.
 */
function startServer() {
  app.listen(PORT, () => {
    console.log(`Server running on http://localhost:${PORT}`)
  })
}

module.exports = { startServer }