| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468 |
- const express = require('express')
- const cors = require('cors')
- const multer = require('multer')
- const path = require('path')
- const fs = require('fs-extra')
- const { calculateFileMD5 } = require('./utils.js')
- const { initDatabase, FileService } = require('../database/database.js')
- // 新增 OCR 相关依赖
- const Tesseract = require('tesseract.js')
- const sharp = require('sharp')
- const { createCanvas, loadImage } = require('canvas')
// Express application and the fixed port it listens on
const app = express()
const PORT = 3000

// Initialise the database, then create the service used by the route handlers
initDatabase()
const fileService = new FileService()

// Make sure the upload and temp directories exist under the current working directory
const uploadDir = path.join(process.cwd(), 'uploads')
const tempDir = path.join(process.cwd(), 'temp')
for (const requiredDir of [uploadDir, tempDir]) {
  fs.ensureDirSync(requiredDir)
}
// multer disk storage: files land in `uploadDir` under a sanitised, unique name
// that keeps the original extension.
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, uploadDir)
  },
  filename: (req, file, cb) => {
    // multer invokes `fileFilter` before the storage engine, and the upload
    // fileFilter in this file already re-decodes `file.originalname` from
    // latin1 to UTF-8. Decoding a second time here truncates every code point
    // above U+00FF (e.g. Chinese characters) and garbles the name, so the
    // value is used as-is.
    const originalName = file.originalname
    const ext = path.extname(originalName)
    const name = path.basename(originalName, ext)
    // Keep only ASCII letters, digits and CJK ideographs; everything else becomes "_"
    const safeName = name.replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '_')
    // Timestamp + random number keeps names unique across uploads of the same file
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1e9)}`
    cb(null, `${safeName}-${uniqueSuffix}${ext}`)
  }
})
// multer instance used by the upload route
const upload = multer({
  storage,
  fileFilter(req, file, cb) {
    // Re-interpret the original filename: read its characters as latin1 bytes
    // and decode those bytes as UTF-8
    const nameBytes = Buffer.from(file.originalname, 'latin1')
    file.originalname = nameBytes.toString('utf8')
    // Every file is accepted
    cb(null, true)
  }
})
// Pre-set a JSON / UTF-8 Content-Type on every response before any route runs
app.use(function setJsonUtf8Header(req, res, next) {
  res.setHeader('Content-Type', 'application/json; charset=utf-8')
  next()
})

// CORS, then JSON and URL-encoded body parsers (both limited to 50mb)
app.use(cors())
app.use(express.json({ limit: '50mb' }))
app.use(express.urlencoded({ extended: true, limit: '50mb' }))
// File upload endpoint.
// Expects a multipart field named "file"; computes the file's MD5 and stores a
// record through fileService.createFile.
// Responds 400 when no file was sent, 500 when hashing or persisting fails,
// otherwise { success: true, data: <created record> }.
app.post('/api/upload', upload.single('file'), async (req, res) => {
  try {
    if (!req.file) {
      return res.status(400).json({ error: 'No file uploaded' })
    }

    // `originalname` has already been re-decoded from latin1 to UTF-8 by the
    // multer fileFilter. Decoding it again here would truncate every code
    // point above U+00FF and store a garbled name for non-Latin-1 filenames.
    const { originalname, filename, path: filePath, size, mimetype } = req.file

    // Hash the stored file
    const md5 = await calculateFileMD5(filePath)

    // Persist the record
    const fileRecord = await fileService.createFile({
      originalName: originalname,
      fileName: filename,
      filePath,
      fileSize: size,
      mimeType: mimetype,
      md5
    })

    res.json({
      success: true,
      data: fileRecord
    })
  } catch (error) {
    console.error('Upload error:', error)
    res.status(500).json({ error: 'Upload failed: ' + error.message })
  }
})
// File list endpoint with pagination.
// Query: `page` (default 1) and `pageSize` (default 100). Values that are
// missing, non-numeric, zero or negative fall back to the default so the
// paginator never receives a negative page or size.
// Responds { success: true, data: <file array>, pagination } or 500 on failure.
app.get('/api/files', async (req, res) => {
  try {
    const toPositiveInt = (raw, fallback) => {
      const parsed = Number.parseInt(raw, 10)
      return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback
    }
    const page = toPositiveInt(req.query.page, 1)
    const pageSize = toPositiveInt(req.query.pageSize, 100)

    const result = await fileService.getFilesPaginated(page, pageSize)

    // Uniform response shape: the file array is returned directly under `data`
    res.json({
      success: true,
      data: result.files,
      pagination: result.pagination
    })
  } catch (error) {
    console.error('Get files error:', error)
    res.status(500).json({
      success: false,
      error: 'Failed to get files: ' + error.message
    })
  }
})
// MD5 check endpoint: re-hashes the file on disk and reports whether the hash
// differs from the one stored on the record
app.post('/api/files/:id/check-md5', async (req, res) => {
  try {
    const file = await fileService.getFileById(parseInt(req.params.id))
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    const currentMD5 = await calculateFileMD5(file.filePath)
    const payload = {
      isChanged: currentMD5 !== file.md5,
      currentMD5,
      originalMD5: file.md5,
      file
    }
    res.json(payload)
  } catch (error) {
    console.error('MD5 check error:', error)
    res.status(500).json({ error: 'MD5 check failed' })
  }
})
// MD5 update endpoint: overwrites the stored MD5 of file `:id` with `md5` from
// the JSON body.
// Responds 400 when `md5` is missing or not a non-empty string (so an
// undefined value is never written to the record), 500 when the update fails,
// otherwise { success: true }.
app.put('/api/files/:id/update-md5', async (req, res) => {
  try {
    const fileId = Number.parseInt(req.params.id, 10)
    const { md5 } = req.body || {}

    if (typeof md5 !== 'string' || md5.trim() === '') {
      return res.status(400).json({ error: 'MD5 is required' })
    }

    await fileService.updateFileMD5(fileId, md5)
    res.json({ success: true })
  } catch (error) {
    console.error('Update MD5 error:', error)
    res.status(500).json({ error: 'Update failed' })
  }
})
// OCR recognition endpoint.
// Body: { fileId }. Looks up the file, preprocesses the image, runs OCR and
// returns the parsed text blocks with timing and confidence.
// Responds 400 without a fileId, 404 for an unknown file, 500 on OCR failure.
app.post('/api/ocr/recognize', async (req, res) => {
  try {
    const { fileId } = req.body
    if (!fileId) {
      return res.status(400).json({ error: 'File ID is required' })
    }

    const file = await fileService.getFileById(Number.parseInt(fileId, 10))
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    console.log(`开始OCR识别: ${file.originalName}`)

    // Preprocess the image; on failure this returns the original path unchanged
    const processedImagePath = await preprocessImage(file.filePath)

    let result
    try {
      result = await performOCR(processedImagePath)
    } finally {
      // Remove the temporary image even when OCR throws, but never the
      // uploaded original: preprocessImage hands back `file.filePath` itself
      // when preprocessing fails, and deleting that would destroy user data.
      if (processedImagePath !== file.filePath) {
        await fs.remove(processedImagePath).catch((cleanupError) => {
          console.error('Failed to remove temporary OCR image:', cleanupError)
        })
      }
    }

    res.json({
      success: true,
      data: {
        textBlocks: result.textBlocks,
        totalPages: result.totalPages || 1,
        processingTime: result.processingTime,
        confidence: result.confidence
      }
    })
  } catch (error) {
    console.error('OCR recognition error:', error)
    res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
  }
})
// OCR result endpoints
// Save an OCR result: body must carry both `fileId` and `ocrData`
app.post('/api/ocr/save-result', async (req, res) => {
  try {
    const { fileId, ocrData } = req.body
    const hasRequiredFields = Boolean(fileId) && Boolean(ocrData)
    if (!hasRequiredFields) {
      return res.status(400).json({ error: '文件ID和OCR数据是必需的' })
    }

    const numericFileId = parseInt(fileId)
    await fileService.saveOcrResult(numericFileId, ocrData)
    res.json({ success: true })
  } catch (error) {
    console.error('保存OCR结果失败:', error)
    res.status(500).json({ error: '保存OCR结果失败: ' + error.message })
  }
})
// Fetch the stored OCR result for a file; a missing result is reported with
// success: false rather than an error status
app.get('/api/ocr/result/:fileId', async (req, res) => {
  try {
    const stored = await fileService.getOcrResult(parseInt(req.params.fileId))
    if (!stored) {
      res.json({
        success: false,
        error: '未找到OCR结果'
      })
      return
    }

    res.json({
      success: true,
      data: stored.ocr_data
    })
  } catch (error) {
    console.error('获取OCR结果失败:', error)
    res.status(500).json({ error: '获取OCR结果失败: ' + error.message })
  }
})
// Update OCR text (manual correction): body must carry `fileId` and `textBlocks`
app.put('/api/ocr/update-text', async (req, res) => {
  try {
    const { fileId, textBlocks } = req.body
    const hasRequiredFields = Boolean(fileId) && Boolean(textBlocks)
    if (!hasRequiredFields) {
      return res.status(400).json({ error: '文件ID和文本数据是必需的' })
    }

    const numericFileId = parseInt(fileId)
    await fileService.updateOcrText(numericFileId, textBlocks)
    res.json({ success: true })
  } catch (error) {
    console.error('更新OCR文本失败:', error)
    res.status(500).json({ error: '更新OCR文本失败: ' + error.message })
  }
})
// Image preprocessing for OCR.
// Writes a grayscale, normalised, contrast-boosted, sharpened PNG copy of
// `imagePath` into the temp directory.
// Returns the path of that temporary PNG, or `imagePath` itself when
// preprocessing fails. Callers that clean up the returned path must therefore
// check it is not the original before deleting it.
async function preprocessImage(imagePath) {
  // Timestamp plus a random component: a bare Date.now() collides when two
  // requests are preprocessed within the same millisecond.
  const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1e9)}`
  const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)

  try {
    await sharp(imagePath)
      .grayscale() // convert to grayscale
      .normalize() // normalise the image
      .linear(1.5, 0) // increase contrast
      .sharpen() // sharpen
      .png()
      .toFile(tempOutputPath)

    return tempOutputPath
  } catch (error) {
    console.error('Image preprocessing failed:', error)
    // Best effort: drop any partially written output so it does not pile up in temp
    await fs.remove(tempOutputPath).catch((cleanupError) => {
      console.error('Failed to remove partial preprocessing output:', cleanupError)
    })
    // Fall back to the original image
    return imagePath
  }
}
// Run OCR on an image.
// Recognises Simplified Chinese + English text in `imagePath` with Tesseract
// and parses the raw text into blocks via parseOCRText.
// Resolves to { textBlocks, confidence, processingTime } (processingTime in
// milliseconds); rejects with whatever Tesseract.recognize or parseOCRText throws.
//
// The previous options also passed a `tessedit_char_whitelist` that was
// written as `'…"' / '()…'` - a division of two strings, which evaluates to
// NaN - and that used `\u4e00-\u9fa5` as if it were a character range, while
// the string only contains those two characters and a hyphen. It has been
// removed so recognition is not restricted to a bogus character set.
// NOTE(review): whether options given to Tesseract.recognize are forwarded as
// engine parameters depends on the tesseract.js version - confirm; newer
// versions may require worker.setParameters for the page segmentation mode.
async function performOCR(imagePath) {
  const startTime = Date.now()

  const { data: { text, confidence } } = await Tesseract.recognize(
    imagePath,
    'chi_sim+eng', // Simplified Chinese + English
    {
      logger: m => console.log(m),
      tessedit_pageseg_mode: Tesseract.PSM.AUTO
    }
  )

  const processingTime = Date.now() - startTime
  // Split the raw text into typed blocks
  const textBlocks = parseOCRText(text)

  return {
    textBlocks,
    confidence,
    processingTime
  }
}
// Parse raw OCR text into typed blocks.
// Each non-blank line becomes one block, classified in this order:
// reference, citation (leading "[n]" stripped, number extracted), image
// marker, plain text.
function parseOCRText(text) {
  const toBlock = (content) => {
    if (isReference(content)) {
      return { type: 'reference', content }
    }
    if (isCitation(content)) {
      return {
        type: 'citation',
        content: content.replace(/^\[\d+\]\s*/, ''),
        number: extractCitationNumber(content)
      }
    }
    if (isImageMarker(content)) {
      return { type: 'image', content }
    }
    return { type: 'text', content }
  }

  return text
    .split('\n')
    .map(rawLine => rawLine.trim())
    .filter(Boolean)
    .map(content => toBlock(content))
}
// Helper predicates
// True when the line starts a reference section ("参考文献", "references",
// "bibliography") or looks like a numbered entry: optional "[n]" / "n.",
// a Latin letter, then later a period followed by four digits.
function isReference(text) {
  const headingPattern = /^参考文献/i
  const englishHeadingPattern = /^references/i
  const bibliographyPattern = /^bibliography/i
  const numberedEntryPattern = /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/

  for (const candidate of [headingPattern, englishHeadingPattern, bibliographyPattern, numberedEntryPattern]) {
    if (candidate.test(text)) {
      return true
    }
  }
  return false
}
// True when the line begins with a bracketed number such as "[12]"
function isCitation(text) {
  const citationPrefix = /^\[\d+\]/
  return citationPrefix.test(text)
}
// Returns the number inside a leading "[n]" marker, or null when the line
// does not start with one
function extractCitationNumber(text) {
  const found = text.match(/^\[(\d+)\]/)
  if (!found) {
    return null
  }
  const [, digits] = found
  return parseInt(digits)
}
// True when the line starts with an image marker: "图 <n>", "figure <n>"
// (case-insensitive) or "图片" optionally followed by digits
function isImageMarker(text) {
  return (
    /^图\s*\d+/i.test(text) ||
    /^figure\s*\d+/i.test(text) ||
    /^图片\d*/i.test(text)
  )
}
// File preview endpoint: streams the stored file back with its recorded MIME type
app.get('/api/files/:id/preview', async (req, res) => {
  try {
    const file = await fileService.getFileById(parseInt(req.params.id))
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // The record may outlive the file on disk
    const isOnDisk = fs.existsSync(file.filePath)
    if (!isOnDisk) {
      return res.status(404).json({ error: 'File not found on disk' })
    }

    // Replace the Content-Type with the file's own MIME type, then send the file
    res.setHeader('Content-Type', file.mimeType)
    const absolutePath = path.resolve(file.filePath)
    res.sendFile(absolutePath)
  } catch (error) {
    console.error('File preview error:', error)
    res.status(500).json({ error: 'Failed to get file preview' })
  }
})
// Thumbnail endpoint: returns a JPEG thumbnail (at most 100x100, never
// enlarged) of image file `:id`.
// Responds 404 for an unknown file and 400 for a non-image file. When
// thumbnail generation fails the original image is sent instead; when even
// the file record is unavailable the response is a 500 JSON error.
app.get('/api/files/:id/thumbnail', async (req, res) => {
  // Declared outside `try` so the catch block can fall back to the original
  // image. A `const` inside `try` is out of scope in `catch`, which would
  // throw a ReferenceError there and leave the request hanging.
  let file = null

  try {
    const fileId = Number.parseInt(req.params.id, 10)
    file = await fileService.getFileById(fileId)

    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // Thumbnails are generated for images only
    if (typeof file.mimeType !== 'string' || !file.mimeType.startsWith('image/')) {
      return res.status(400).json({ error: 'Not an image file' })
    }

    const thumbnailPath = path.join(tempDir, `thumbnail-${fileId}.jpg`)

    await sharp(file.filePath)
      .resize(100, 100, {
        fit: 'inside',
        withoutEnlargement: true
      })
      .jpeg({ quality: 80 })
      .toFile(thumbnailPath)

    // The global middleware pre-sets a JSON Content-Type and sendFile keeps an
    // already-present Content-Type header, so the image type is set explicitly.
    res.setHeader('Content-Type', 'image/jpeg')
    res.sendFile(path.resolve(thumbnailPath))
  } catch (error) {
    console.error('Thumbnail generation error:', error)

    if (res.headersSent) {
      return
    }

    // Fall back to the original image when the record is a known image file
    const canFallBack =
      file !== null &&
      typeof file.filePath === 'string' &&
      typeof file.mimeType === 'string' &&
      file.mimeType.startsWith('image/')

    if (canFallBack) {
      res.setHeader('Content-Type', file.mimeType)
      return res.sendFile(path.resolve(file.filePath))
    }

    res.status(500).json({ error: 'Failed to generate thumbnail' })
  }
})
// Health check endpoint: reports a fixed status, the current time and the service name
app.get('/api/health', (req, res) => {
  const healthReport = {
    status: 'OK',
    timestamp: new Date().toISOString(),
    service: 'file-management-api'
  }
  res.json(healthReport)
})
// Start listening on PORT and log the local URL once the server is up
function startServer() {
  const announceReady = () => {
    console.log(`Server running on http://localhost:${PORT}`)
  }
  app.listen(PORT, announceReady)
}

module.exports = { startServer }
|