// Listing metadata: 468 lines, 13 KiB, JavaScript
const express = require('express')
const cors = require('cors')
const multer = require('multer')
const path = require('path')
const fs = require('fs-extra')
const { calculateFileMD5 } = require('./utils.js')
const { initDatabase, FileService } = require('../database/database.js')

// OCR-related dependencies
const Tesseract = require('tesseract.js')
const sharp = require('sharp')
const { createCanvas, loadImage } = require('canvas')
// Express application and the port it listens on.
const app = express()
const PORT = 3000

// Initialise the database at module load and create the service used by the
// route handlers below.
initDatabase()
const fileService = new FileService()

// Make sure the upload and temp directories exist; both are resolved against
// the process working directory.
const uploadDir = path.join(process.cwd(), 'uploads')
const tempDir = path.join(process.cwd(), 'temp')
fs.ensureDirSync(uploadDir)
fs.ensureDirSync(tempDir)
// multer disk storage. Files go to uploadDir under a sanitised version of the
// client's file name (ASCII alphanumerics and CJK ideographs are kept) plus a
// timestamp/random suffix to avoid collisions.
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, uploadDir)
  },
  filename: (req, file, cb) => {
    // The reported name may still be a latin1-decoded string, or it may
    // already have been re-decoded to UTF-8 by the fileFilter configured on
    // the upload middleware in this file. Decoding an already-decoded name
    // destroys non-ASCII characters, so decode only when every character
    // still fits in one byte and the result is valid UTF-8.
    let originalName = file.originalname
    if (!/[\u0100-\uffff]/.test(originalName)) {
      const decoded = Buffer.from(originalName, 'latin1').toString('utf8')
      if (!decoded.includes('\uFFFD')) {
        originalName = decoded
      }
    }

    const ext = path.extname(originalName)
    const name = path.basename(originalName, ext)

    // Replace every character that is not an ASCII letter/digit or a CJK
    // ideograph with an underscore.
    const safeName = name.replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '_')
    const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
    const filename = safeName + '-' + uniqueSuffix + ext

    cb(null, filename)
  }
})
// Upload middleware built on the disk storage above. The fileFilter accepts
// every file; it is only used as a hook to re-decode the reported file name
// from latin1 to UTF-8 (the object is mutated in place).
const upload = multer({
  storage,
  fileFilter: (req, file, cb) => {
    file.originalname = Buffer.from(file.originalname, 'latin1').toString('utf8')
    cb(null, true)
  }
})
// Default every response to JSON with an explicit UTF-8 charset. Handlers
// that send non-JSON content must override the header themselves.
app.use((req, res, next) => {
  res.setHeader('Content-Type', 'application/json; charset=utf-8')
  next()
})

// Cross-origin requests and body parsing (JSON and URL-encoded, 50mb limit each).
app.use(cors())
app.use(express.json({ limit: '50mb' }))
app.use(express.urlencoded({ extended: true, limit: '50mb' }))
/**
 * POST /api/upload — stores one multipart file (field name "file"), computes
 * its MD5 and records it through fileService.createFile.
 *
 * Responds 400 when no file was sent, 500 with the error message on failure,
 * otherwise `{ success: true, data: <value returned by createFile> }`.
 */
app.post('/api/upload', upload.single('file'), async (req, res) => {
  try {
    if (!req.file) {
      return res.status(400).json({ error: 'No file uploaded' })
    }

    // The upload middleware's fileFilter already re-decodes the name from
    // latin1 to UTF-8. Decoding it a second time corrupts non-ASCII names, so
    // only decode when the name still consists of single-byte characters and
    // the result is valid UTF-8.
    let originalName = req.file.originalname
    if (!/[\u0100-\uffff]/.test(originalName)) {
      const decoded = Buffer.from(originalName, 'latin1').toString('utf8')
      if (!decoded.includes('\uFFFD')) {
        originalName = decoded
      }
    }

    // Hash the stored file.
    const md5 = await calculateFileMD5(req.file.path)

    // Persist the file metadata together with its hash.
    const fileRecord = await fileService.createFile({
      originalName,
      fileName: req.file.filename,
      filePath: req.file.path,
      fileSize: req.file.size,
      mimeType: req.file.mimetype,
      md5
    })

    res.json({
      success: true,
      data: fileRecord
    })
  } catch (error) {
    console.error('Upload error:', error)
    res.status(500).json({ error: 'Upload failed: ' + error.message })
  }
})
/**
 * GET /api/files — paginated file list.
 *
 * Query parameters `page` (default 1) and `pageSize` (default 100) are parsed
 * as base-10 integers; missing, non-numeric, zero or negative values fall
 * back to the defaults.
 *
 * Responds `{ success: true, data: <files array>, pagination }` or 500 with
 * `{ success: false, error }`.
 */
app.get('/api/files', async (req, res) => {
  try {
    const requestedPage = Number.parseInt(req.query.page, 10)
    const requestedPageSize = Number.parseInt(req.query.pageSize, 10)
    // NaN fails the comparison too, so one check covers every invalid value.
    const page = requestedPage > 0 ? requestedPage : 1
    const pageSize = requestedPageSize > 0 ? requestedPageSize : 100

    const result = await fileService.getFilesPaginated(page, pageSize)

    // Uniform response shape: the file array is returned directly as `data`.
    res.json({
      success: true,
      data: result.files,
      pagination: result.pagination
    })
  } catch (error) {
    console.error('Get files error:', error)
    res.status(500).json({
      success: false,
      error: 'Failed to get files: ' + error.message
    })
  }
})
/**
 * POST /api/files/:id/check-md5 — recomputes the MD5 of the stored file and
 * compares it with the hash saved in the file record.
 *
 * Responds `{ isChanged, currentMD5, originalMD5, file }`, 404 when the record
 * does not exist, 500 on any other failure.
 */
app.post('/api/files/:id/check-md5', async (req, res) => {
  try {
    const fileId = parseInt(req.params.id, 10)
    const file = await fileService.getFileById(fileId)

    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    const currentMD5 = await calculateFileMD5(file.filePath)
    const isChanged = currentMD5 !== file.md5

    res.json({
      isChanged,
      currentMD5,
      originalMD5: file.md5,
      file
    })
  } catch (error) {
    console.error('MD5 check error:', error)
    res.status(500).json({ error: 'MD5 check failed' })
  }
})
/**
 * PUT /api/files/:id/update-md5 — overwrites the stored MD5 of a file record
 * with the `md5` value from the request body.
 *
 * Responds 400 when the id is not a number or `md5` is not a non-empty
 * string, `{ success: true }` on success, 500 on failure.
 */
app.put('/api/files/:id/update-md5', async (req, res) => {
  try {
    const fileId = parseInt(req.params.id, 10)
    const { md5 } = req.body || {}

    // Reject bad input before it reaches the database layer.
    if (Number.isNaN(fileId)) {
      return res.status(400).json({ error: 'Invalid file ID' })
    }
    if (typeof md5 !== 'string' || md5.trim() === '') {
      return res.status(400).json({ error: 'MD5 is required' })
    }

    await fileService.updateFileMD5(fileId, md5)
    res.json({ success: true })
  } catch (error) {
    console.error('Update MD5 error:', error)
    res.status(500).json({ error: 'Update failed' })
  }
})
/**
 * POST /api/ocr/recognize — runs OCR on a previously uploaded file.
 *
 * Body: `{ fileId }`. Responds 400 without a file id, 404 for an unknown
 * file, 500 with the error message on failure, otherwise
 * `{ success: true, data: { textBlocks, totalPages, processingTime, confidence } }`.
 */
app.post('/api/ocr/recognize', async (req, res) => {
  // Path of the temporary preprocessed image, once one has been created.
  let tempImagePath = null

  try {
    const { fileId } = req.body

    if (!fileId) {
      return res.status(400).json({ error: 'File ID is required' })
    }

    const file = await fileService.getFileById(parseInt(fileId, 10))
    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    console.log(`开始OCR识别: ${file.originalName}`)

    // Preprocess the image. preprocessImage returns the ORIGINAL path when
    // preprocessing fails, and that file must never be deleted, so only a
    // path that differs from the upload is remembered for cleanup.
    const processedImagePath = await preprocessImage(file.filePath)
    if (processedImagePath !== file.filePath) {
      tempImagePath = processedImagePath
    }

    // Run Tesseract on the (possibly preprocessed) image.
    const result = await performOCR(processedImagePath)

    res.json({
      success: true,
      data: {
        textBlocks: result.textBlocks,
        totalPages: result.totalPages || 1,
        processingTime: result.processingTime,
        confidence: result.confidence
      }
    })
  } catch (error) {
    console.error('OCR recognition error:', error)
    res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
  } finally {
    // Remove the temporary file whether OCR succeeded or not. A cleanup
    // failure is logged only, because the response has already been sent.
    if (tempImagePath) {
      await fs.remove(tempImagePath).catch(cleanupError => {
        console.error('Failed to remove temporary OCR image:', cleanupError)
      })
    }
  }
})
// API endpoints for stored OCR results.

/**
 * POST /api/ocr/save-result — saves an OCR result for a file.
 *
 * Body: `{ fileId, ocrData }`; both are required (400 otherwise).
 * Responds `{ success: true }`, or 500 with the error message on failure.
 */
app.post('/api/ocr/save-result', async (req, res) => {
  try {
    const { fileId, ocrData } = req.body

    if (!fileId || !ocrData) {
      return res.status(400).json({ error: '文件ID和OCR数据是必需的' })
    }

    await fileService.saveOcrResult(parseInt(fileId, 10), ocrData)

    res.json({ success: true })
  } catch (error) {
    console.error('保存OCR结果失败:', error)
    res.status(500).json({ error: '保存OCR结果失败: ' + error.message })
  }
})
/**
 * GET /api/ocr/result/:fileId — returns the stored OCR result of a file.
 *
 * Responds `{ success: true, data: <ocr_data of the stored result> }` when a
 * result exists, `{ success: false, error }` (HTTP 200) when none is stored,
 * and 500 with the error message on failure.
 */
app.get('/api/ocr/result/:fileId', async (req, res) => {
  try {
    const fileId = parseInt(req.params.fileId, 10)
    const result = await fileService.getOcrResult(fileId)

    if (result) {
      res.json({
        success: true,
        data: result.ocr_data
      })
    } else {
      res.json({
        success: false,
        error: '未找到OCR结果'
      })
    }
  } catch (error) {
    console.error('获取OCR结果失败:', error)
    res.status(500).json({ error: '获取OCR结果失败: ' + error.message })
  }
})
/**
 * PUT /api/ocr/update-text — replaces the OCR text blocks of a file (manual
 * correction).
 *
 * Body: `{ fileId, textBlocks }`; both are required (400 otherwise).
 * Responds `{ success: true }`, or 500 with the error message on failure.
 */
app.put('/api/ocr/update-text', async (req, res) => {
  try {
    const { fileId, textBlocks } = req.body

    if (!fileId || !textBlocks) {
      return res.status(400).json({ error: '文件ID和文本数据是必需的' })
    }

    await fileService.updateOcrText(parseInt(fileId, 10), textBlocks)

    res.json({ success: true })
  } catch (error) {
    console.error('更新OCR文本失败:', error)
    res.status(500).json({ error: '更新OCR文本失败: ' + error.message })
  }
})
/**
 * Prepares an image for OCR: grayscale, normalise, contrast boost, sharpen,
 * then writes the result as a PNG into tempDir.
 *
 * @param {string} imagePath - path of the source image
 * @returns {Promise<string>} path of the preprocessed PNG, or `imagePath`
 *   itself when preprocessing fails (callers must not delete it in that case)
 */
async function preprocessImage(imagePath) {
  // Timestamp plus random suffix, same scheme as the upload file names, so
  // concurrent requests in the same millisecond do not share a temp file.
  const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
  const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)

  try {
    await sharp(imagePath)
      .grayscale() // convert to grayscale
      .normalize() // normalise the image
      .linear(1.5, 0) // increase contrast
      .sharpen() // sharpen
      .png()
      .toFile(tempOutputPath)

    return tempOutputPath
  } catch (error) {
    console.error('Image preprocessing failed:', error)
    // Best effort: drop a partially written output so it does not pile up in
    // tempDir. A cleanup failure must not mask the fallback below.
    await fs.remove(tempOutputPath).catch(() => {})
    // Fall back to the original image when preprocessing fails.
    return imagePath
  }
}
/**
 * Runs Tesseract OCR (Simplified Chinese + English) on an image and splits
 * the recognised text into typed blocks.
 *
 * @param {string} imagePath - path of the image to recognise
 * @returns {Promise<{textBlocks: Array, confidence: *, processingTime: number}>}
 *   parsed blocks, the confidence reported by Tesseract and the elapsed time
 *   in milliseconds; rejects with whatever Tesseract or the parser throws
 */
async function performOCR(imagePath) {
  const startTime = Date.now()

  const { data: { text, confidence } } = await Tesseract.recognize(
    imagePath,
    'chi_sim+eng', // Simplified Chinese + English
    {
      logger: m => console.log(m),
      // NOTE(review): tesseract.js documents engine parameters as being set
      // through worker.setParameters(); confirm this entry has any effect
      // when passed to recognize().
      tessedit_pageseg_mode: Tesseract.PSM.AUTO
      // The former tessedit_char_whitelist entry was dropped: its value was
      // two string literals separated by `/`, which evaluates to NaN.
    }
  )

  const processingTime = Date.now() - startTime

  // Split the raw text into typed blocks.
  const textBlocks = parseOCRText(text)

  return {
    textBlocks,
    confidence,
    processingTime
  }
}
/**
 * Splits raw OCR text into one block per non-blank line and tags each block.
 *
 * Checks run in this order, first match wins: reference, citation, image
 * marker, plain text. Citation blocks have the leading "[n]" stripped from
 * `content` and carry the number separately.
 *
 * @param {string} text - raw OCR output
 * @returns {Array<{type: string, content: string, number?: (number|null)}>}
 *   blocks in line order; an empty array when `text` is not a string
 */
function parseOCRText(text) {
  // OCR may yield no text at all; treat anything that is not a string as empty.
  if (typeof text !== 'string') {
    return []
  }

  const blocks = []

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim()
    if (!line) continue

    if (isReference(line)) {
      // Bibliography heading or entry
      blocks.push({ type: 'reference', content: line })
    } else if (isCitation(line)) {
      // Line starting with "[n]"
      blocks.push({
        type: 'citation',
        content: line.replace(/^\[\d+\]\s*/, ''),
        number: extractCitationNumber(line)
      })
    } else if (isImageMarker(line)) {
      // Figure caption / image marker
      blocks.push({ type: 'image', content: line })
    } else {
      // Plain text
      blocks.push({ type: 'text', content: line })
    }
  }

  return blocks
}
// Helper predicates used to classify OCR lines.

/**
 * Tells whether a line looks like a bibliography heading or entry.
 *
 * Matches lines starting with "参考文献", "references" or "bibliography"
 * (case-insensitive), and numbered entries of the form
 * "[n] Letters... . YYYY" (brackets and the dot after the number optional).
 *
 * @param {string} text - a single trimmed line
 * @returns {boolean}
 */
function isReference(text) {
  const refPatterns = [
    /^参考文献/i,
    /^references/i,
    /^bibliography/i,
    /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/
  ]
  return refPatterns.some(pattern => pattern.test(text))
}
/**
 * Tells whether a line starts with a bracketed number such as "[12]".
 *
 * @param {string} text - a single trimmed line
 * @returns {boolean}
 */
function isCitation(text) {
  return /^\[\d+\]/.test(text)
}
/**
 * Extracts the number from a leading "[n]" marker.
 *
 * @param {string} text - a single trimmed line
 * @returns {number|null} the number parsed in base 10, or null when the line
 *   has no leading "[n]" marker or is not a string
 */
function extractCitationNumber(text) {
  if (typeof text !== 'string') {
    return null
  }
  const match = text.match(/^\[(\d+)\]/)
  // Explicit radix so markers with leading zeros are always read as decimal.
  return match ? Number.parseInt(match[1], 10) : null
}
/**
 * Tells whether a line looks like a figure caption or image marker.
 *
 * Matches lines starting with "图 <digits>", "figure <digits>"
 * (case-insensitive, whitespace optional) or "图片" optionally followed by
 * digits.
 *
 * @param {string} text - a single trimmed line
 * @returns {boolean}
 */
function isImageMarker(text) {
  const imagePatterns = [
    /^图\s*\d+/i,
    /^figure\s*\d+/i,
    /^图片\d*/i
  ]
  return imagePatterns.some(pattern => pattern.test(text))
}
/**
 * GET /api/files/:id/preview — streams the stored file with its recorded
 * MIME type.
 *
 * Responds 404 when the record or the file on disk is missing, 500 on any
 * other failure.
 */
app.get('/api/files/:id/preview', async (req, res) => {
  try {
    const fileId = parseInt(req.params.id, 10)
    const file = await fileService.getFileById(fileId)

    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // Make sure the file is still present on disk.
    if (!fs.existsSync(file.filePath)) {
      return res.status(404).json({ error: 'File not found on disk' })
    }

    // Override the JSON default set by the global middleware. A record
    // without a MIME type would make setHeader throw, so fall back to a
    // generic binary type.
    res.setHeader('Content-Type', file.mimeType || 'application/octet-stream')

    // Send the file itself.
    res.sendFile(path.resolve(file.filePath))
  } catch (error) {
    console.error('File preview error:', error)
    res.status(500).json({ error: 'Failed to get file preview' })
  }
})
/**
 * GET /api/files/:id/thumbnail — returns a JPEG thumbnail (at most 100x100,
 * aspect ratio kept, never enlarged) of an uploaded image.
 *
 * Responds 404 for an unknown file and 400 when the file is not an image.
 * When thumbnail generation fails the original image is sent instead; if no
 * file record is available at that point the response is 500.
 */
app.get('/api/files/:id/thumbnail', async (req, res) => {
  // Declared outside the try block so the fallback in catch can reach it.
  let file = null

  try {
    const fileId = parseInt(req.params.id, 10)
    file = await fileService.getFileById(fileId)

    if (!file) {
      return res.status(404).json({ error: 'File not found' })
    }

    // Thumbnails are generated for images only.
    if (!file.mimeType || !file.mimeType.startsWith('image/')) {
      return res.status(400).json({ error: 'Not an image file' })
    }

    // Render into memory so concurrent requests for the same file do not
    // write to a shared temp file.
    const thumbnail = await sharp(file.filePath)
      .resize(100, 100, {
        fit: 'inside',
        withoutEnlargement: true
      })
      .jpeg({ quality: 80 })
      .toBuffer()

    // Override the JSON default set by the global middleware.
    res.setHeader('Content-Type', 'image/jpeg')
    res.send(thumbnail)
  } catch (error) {
    console.error('Thumbnail generation error:', error)

    if (res.headersSent) {
      return
    }

    if (file && file.filePath) {
      // Thumbnail generation failed: fall back to the original image.
      res.setHeader('Content-Type', file.mimeType || 'application/octet-stream')
      res.sendFile(path.resolve(file.filePath))
    } else {
      res.status(500).json({ error: 'Thumbnail generation failed' })
    }
  }
})
/**
 * GET /api/health — health check; reports a fixed status, the current time
 * in ISO 8601 format and the service name.
 */
app.get('/api/health', (req, res) => {
  res.json({
    status: 'OK',
    timestamp: new Date().toISOString(),
    service: 'file-management-api'
  })
})
/**
 * Starts the HTTP server on PORT and logs the local URL once it is listening.
 *
 * @returns {object} the server returned by app.listen, so callers can close
 *   it or attach error handlers (previously nothing was returned)
 */
function startServer() {
  return app.listen(PORT, () => {
    console.log(`Server running on http://localhost:${PORT}`)
  })
}

module.exports = { startServer }