// Electron-vue3-ts-offline/server/server.js

const express = require('express')
const cors = require('cors')
const multer = require('multer')
const path = require('path')
const fs = require('fs-extra')
const { calculateFileMD5 } = require('./utils.js')
const { initDatabase, FileService } = require('../database/database.js')

// 新增 OCR 相关依赖
const Tesseract = require('tesseract.js')
const sharp = require('sharp')
const { createCanvas, loadImage } = require('canvas')

// Express application instance and the port it will listen on
const PORT = 3000
const app = express()

// Initialise the database, then create the file service
initDatabase()
const fileService = new FileService()

// Ensure the upload and temp directories exist (both resolved against the current working directory)
const [uploadDir, tempDir] = ['uploads', 'temp'].map(dirName => path.join(process.cwd(), dirName))
for (const requiredDir of [uploadDir, tempDir]) {
    fs.ensureDirSync(requiredDir)
}

// Configure multer disk storage: files land in `uploadDir` under a sanitised, unique name
// that keeps ASCII letters, digits and Chinese characters from the original name.
const storage = multer.diskStorage({
    destination: (req, file, cb) => {
        cb(null, uploadDir)
    },
    filename: (req, file, cb) => {
        // The `fileFilter` given to multer() in this file has already re-decoded
        // `file.originalname` from latin1 to UTF-8 on this same object, and multer invokes
        // the filter before the storage engine. Decoding a second time would keep only the
        // low byte of every non-Latin character and garble the name, so use it as-is.
        const originalName = file.originalname
        const ext = path.extname(originalName)
        const name = path.basename(originalName, ext)

        // Replace everything except ASCII letters, digits and CJK ideographs (U+4E00..U+9FA5)
        const safeName = name.replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '_')
        // Timestamp plus random number keeps names unique across uploads of the same file
        const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1E9)}`

        cb(null, `${safeName}-${uniqueSuffix}${ext}`)
    }
})

// Multer instance used by the upload route. Every file is accepted; the filter only serves
// as a hook to re-interpret the latin1-decoded original name as UTF-8.
const upload = multer({
    storage,
    fileFilter(req, file, cb) {
        const rawNameBytes = Buffer.from(file.originalname, 'latin1')
        file.originalname = rawNameBytes.toString('utf8')
        cb(null, true)
    }
})

// Pre-set a JSON Content-Type with an explicit UTF-8 charset on every response.
// Handlers that send non-JSON content need to replace this header themselves.
app.use(function setJsonUtf8ContentType(req, res, next) {
    res.setHeader('Content-Type', 'application/json; charset=utf-8')
    next()
})

// CORS, then JSON and URL-encoded body parsing (bodies up to 50mb), registered in this order
app.use(
    cors(),
    express.json({ limit: '50mb' }),
    express.urlencoded({ extended: true, limit: '50mb' })
)

// File upload endpoint: stores the multipart field `file`, computes its MD5 and persists a
// record through fileService.createFile. Responds with { success, data } or { error }.
app.post('/api/upload', upload.single('file'), async (req, res) => {
    // Tracks whether the database record exists, so a later failure never deletes a file
    // that a stored record points to
    let recordSaved = false

    try {
        if (!req.file) {
            return res.status(400).json({ error: 'No file uploaded' })
        }

        const fileInfo = {
            // Already re-decoded to UTF-8 by the multer fileFilter in this file; decoding it
            // again from latin1 would garble non-Latin (e.g. Chinese) names
            originalName: req.file.originalname,
            fileName: req.file.filename,
            filePath: req.file.path,
            fileSize: req.file.size,
            mimeType: req.file.mimetype
        }

        // Compute the MD5 of the stored file
        const md5 = await calculateFileMD5(req.file.path)

        // Persist the record
        const fileRecord = await fileService.createFile({
            ...fileInfo,
            md5
        })
        recordSaved = true

        res.json({
            success: true,
            data: fileRecord
        })
    } catch (error) {
        console.error('Upload error:', error)

        // Best-effort cleanup: without a database record the uploaded file would be orphaned
        if (req.file && !recordSaved) {
            await fs.remove(req.file.path).catch(cleanupError => {
                console.error('Failed to remove orphaned upload:', cleanupError)
            })
        }

        res.status(500).json({ error: 'Upload failed: ' + error.message })
    }
})

// File list endpoint (paginated). Query parameters: `page` (default 1) and `pageSize`
// (default 100). Responds with { success, data: <file array>, pagination }.
app.get('/api/files', async (req, res) => {
    try {
        // Missing, non-numeric, zero or negative values fall back to the defaults so the
        // service never receives a negative page or page size
        const requestedPage = Number.parseInt(req.query.page, 10)
        const requestedPageSize = Number.parseInt(req.query.pageSize, 10)
        const page = requestedPage > 0 ? requestedPage : 1
        const pageSize = requestedPageSize > 0 ? requestedPageSize : 100

        const result = await fileService.getFilesPaginated(page, pageSize)

        // Uniform response envelope
        res.json({
            success: true,
            data: result.files, // the file array itself
            pagination: result.pagination
        })
    } catch (error) {
        console.error('Get files error:', error)
        res.status(500).json({
            success: false,
            error: 'Failed to get files: ' + error.message
        })
    }
})

// MD5 check endpoint: recomputes the MD5 of the stored file and reports whether it differs
// from the value saved on the record.
app.post('/api/files/:id/check-md5', async (req, res) => {
    try {
        const file = await fileService.getFileById(parseInt(req.params.id))

        if (!file) {
            res.status(404).json({ error: 'File not found' })
            return
        }

        const currentMD5 = await calculateFileMD5(file.filePath)
        const originalMD5 = file.md5

        res.json({
            isChanged: currentMD5 !== originalMD5,
            currentMD5,
            originalMD5,
            file
        })
    } catch (err) {
        console.error('MD5 check error:', err)
        res.status(500).json({ error: 'MD5 check failed' })
    }
})

// MD5 update endpoint: stores the `md5` string from the request body on the given file.
// Responds 400 when the id is not numeric or `md5` is missing/blank, instead of passing
// invalid values to the service and reporting success.
app.put('/api/files/:id/update-md5', async (req, res) => {
    try {
        const fileId = Number.parseInt(req.params.id, 10)
        const { md5 } = req.body || {}

        if (Number.isNaN(fileId)) {
            return res.status(400).json({ error: 'Invalid file ID' })
        }
        if (typeof md5 !== 'string' || md5.trim() === '') {
            return res.status(400).json({ error: 'MD5 is required' })
        }

        await fileService.updateFileMD5(fileId, md5)
        res.json({ success: true })
    } catch (error) {
        console.error('Update MD5 error:', error)
        res.status(500).json({ error: 'Update failed' })
    }
})

// OCR recognition endpoint: preprocesses the stored file, runs OCR on it and returns the
// parsed text blocks. The request body must contain `fileId`; a `page` field may be sent
// but is not used by this handler.
app.post('/api/ocr/recognize', async (req, res) => {
    // Declared outside `try` so `finally` can remove the temp image on every exit path
    let tempImagePath = null

    try {
        const { fileId } = req.body

        if (!fileId) {
            return res.status(400).json({ error: 'File ID is required' })
        }

        const file = await fileService.getFileById(parseInt(fileId))
        if (!file) {
            return res.status(404).json({ error: 'File not found' })
        }

        console.log(`开始OCR识别: ${file.originalName}`)

        // Preprocess the image
        const processedImagePath = await preprocessImage(file.filePath)

        // preprocessImage() falls back to returning the ORIGINAL path when preprocessing
        // fails. Only a path that differs from the stored file is a temp file we own;
        // deleting unconditionally would destroy the user's upload.
        if (processedImagePath !== file.filePath) {
            tempImagePath = processedImagePath
        }

        // Run OCR with Tesseract
        const result = await performOCR(processedImagePath)

        res.json({
            success: true,
            data: {
                textBlocks: result.textBlocks,
                totalPages: result.totalPages || 1,
                processingTime: result.processingTime,
                confidence: result.confidence
            }
        })

    } catch (error) {
        console.error('OCR recognition error:', error)
        res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
    } finally {
        // Best-effort temp file cleanup, also when OCR itself failed
        if (tempImagePath) {
            await fs.remove(tempImagePath).catch(cleanupError => {
                console.error('Failed to remove temporary OCR image:', cleanupError)
            })
        }
    }
})

// API endpoints for stored OCR results

// Save an OCR result: requires `fileId` and `ocrData` in the request body
app.post('/api/ocr/save-result', async (req, res) => {
    try {
        const { fileId, ocrData } = req.body
        const hasRequiredFields = Boolean(fileId) && Boolean(ocrData)

        if (!hasRequiredFields) {
            res.status(400).json({ error: '文件ID和OCR数据是必需的' })
            return
        }

        const numericFileId = parseInt(fileId)
        await fileService.saveOcrResult(numericFileId, ocrData)

        res.json({ success: true })
    } catch (err) {
        console.error('保存OCR结果失败:', err)
        res.status(500).json({ error: '保存OCR结果失败: ' + err.message })
    }
})

// Fetch the stored OCR result of a file. A missing result is reported in the body
// ({ success: false, error }) with the default status code rather than as an HTTP error.
app.get('/api/ocr/result/:fileId', async (req, res) => {
    try {
        const stored = await fileService.getOcrResult(parseInt(req.params.fileId))

        if (!stored) {
            res.json({
                success: false,
                error: '未找到OCR结果'
            })
            return
        }

        res.json({
            success: true,
            data: stored.ocr_data
        })
    } catch (err) {
        console.error('获取OCR结果失败:', err)
        res.status(500).json({ error: '获取OCR结果失败: ' + err.message })
    }
})

// Update OCR text (manual correction): requires `fileId` and `textBlocks` in the request body
app.put('/api/ocr/update-text', async (req, res) => {
    try {
        const { fileId, textBlocks } = req.body
        const hasRequiredFields = Boolean(fileId) && Boolean(textBlocks)

        if (!hasRequiredFields) {
            res.status(400).json({ error: '文件ID和文本数据是必需的' })
            return
        }

        const numericFileId = parseInt(fileId)
        await fileService.updateOcrText(numericFileId, textBlocks)

        res.json({ success: true })
    } catch (err) {
        console.error('更新OCR文本失败:', err)
        res.status(500).json({ error: '更新OCR文本失败: ' + err.message })
    }
})

/**
 * Preprocesses an image for OCR (grayscale, normalise, contrast boost, sharpen) and writes
 * the result as a PNG into the temp directory.
 *
 * @param {string} imagePath - Path of the source image.
 * @returns {Promise<string>} Path of the preprocessed PNG, or `imagePath` itself when
 *   preprocessing fails. Callers must therefore only delete the returned path when it
 *   differs from `imagePath`.
 */
async function preprocessImage(imagePath) {
    // Timestamp plus random number: a timestamp alone collides when two requests are
    // handled within the same millisecond
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1E9)}`
    const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)

    try {
        await sharp(imagePath)
            .grayscale() // convert to grayscale
            .normalize() // normalise the image
            .linear(1.5, 0) // increase contrast
            .sharpen() // sharpen
            .png()
            .toFile(tempOutputPath)

        return tempOutputPath
    } catch (error) {
        console.error('Image preprocessing failed:', error)

        // Best-effort removal in case a partial output file was written before the failure
        await fs.remove(tempOutputPath).catch(cleanupError => {
            console.error('Failed to remove partial preprocessing output:', cleanupError)
        })

        // Fall back to the original image when preprocessing fails
        return imagePath
    }
}

/**
 * Runs Tesseract OCR (Simplified Chinese + English) on an image and parses the recognised
 * text into blocks.
 *
 * @param {string} imagePath - Path of the image to recognise.
 * @returns {Promise<{textBlocks: Array, confidence: number, processingTime: number}>}
 *   Parsed blocks, the confidence reported by Tesseract, and the recognition time in ms.
 *   Rejects with the error raised by Tesseract.
 */
async function performOCR(imagePath) {
    const startTime = Date.now()

    // Tesseract.recognize already returns a promise, so it is awaited directly instead of
    // being wrapped in `new Promise`.
    const { data: { text, confidence } } = await Tesseract.recognize(
        imagePath,
        'chi_sim+eng', // Simplified Chinese + English
        {
            logger: m => console.log(m),
            tessedit_pageseg_mode: Tesseract.PSM.AUTO
            // No `tessedit_char_whitelist`: the previous value was the expression
            // `'…' / '…'` (the string literal was closed early by an embedded quote), which
            // evaluates to NaN. NOTE(review): the intended string also spelled the CJK
            // range as "\u4e00-\u9fa5", which looks like a regex range rather than a list
            // of characters — confirm against the Tesseract docs before reintroducing one.
        }
    )

    // Measured before parsing so it reflects recognition time only
    const processingTime = Date.now() - startTime

    // Parse the raw text into typed blocks
    const textBlocks = parseOCRText(text)

    return {
        textBlocks,
        confidence,
        processingTime
    }
}

/**
 * Splits raw OCR text into non-empty, trimmed lines and classifies each one.
 *
 * Classification order (first match wins): reference, citation, image marker, plain text.
 * Citation blocks carry the text with the leading "[n]" removed plus the parsed number.
 *
 * @param {string} text - Raw OCR output.
 * @returns {Array<{type: string, content: string, number?: (number|null)}>} One block per line.
 */
function parseOCRText(text) {
    return text
        .split('\n')
        .map(rawLine => rawLine.trim())
        .filter(content => content !== '')
        .map(content => {
            // Reference entry or reference-section heading
            if (isReference(content)) {
                return { type: 'reference', content }
            }
            // Citation: strip the leading "[n]" marker and keep the number separately
            if (isCitation(content)) {
                return {
                    type: 'citation',
                    content: content.replace(/^\[\d+\]\s*/, ''),
                    number: extractCitationNumber(content)
                }
            }
            // Figure / image caption marker
            if (isImageMarker(content)) {
                return { type: 'image', content }
            }
            // Anything else is plain text
            return { type: 'text', content }
        })
}

// Helper predicates

/**
 * Tells whether a line looks like a reference-section heading or a reference entry.
 *
 * @param {string} text - Trimmed line of text.
 * @returns {boolean} True when any of the reference patterns matches.
 */
function isReference(text) {
    const referencePatterns = [
        /^参考文献/i,
        /^references/i,
        /^bibliography/i,
        // Optional "[n]" / "n." prefix, a Latin letter, then ". <4 digits>" later in the line
        /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/
    ]

    for (const pattern of referencePatterns) {
        if (pattern.test(text)) {
            return true
        }
    }
    return false
}

/**
 * Tells whether a line starts with a bracketed number such as "[12]".
 *
 * @param {string} text - Trimmed line of text.
 * @returns {boolean} True when the line begins with "[<digits>]".
 */
function isCitation(text) {
    const leadingMarker = /^\[\d+\]/.exec(text)
    return leadingMarker !== null
}

/**
 * Extracts the number from a leading "[n]" citation marker.
 *
 * @param {string} text - Trimmed line of text.
 * @returns {number|null} The parsed number, or null when the line has no leading marker.
 */
function extractCitationNumber(text) {
    const [, digits] = text.match(/^\[(\d+)\]/) || []
    if (digits === undefined) {
        return null
    }
    return parseInt(digits)
}

/**
 * Tells whether a line starts like a figure caption or image marker
 * ("图 1", "Figure 2", "图片" …).
 *
 * @param {string} text - Trimmed line of text.
 * @returns {boolean} True when any of the image-marker patterns matches.
 */
function isImageMarker(text) {
    const markerPatterns = [
        /^图\s*\d+/i,
        /^figure\s*\d+/i,
        /^图片\d*/i
    ]

    for (const pattern of markerPatterns) {
        if (pattern.test(text)) {
            return true
        }
    }
    return false
}

// File preview endpoint: streams the stored file back with the MIME type saved on its record
app.get('/api/files/:id/preview', async (req, res) => {
    try {
        const file = await fileService.getFileById(parseInt(req.params.id))

        if (!file) {
            res.status(404).json({ error: 'File not found' })
            return
        }

        // The record may outlive the file itself, so verify it is still on disk
        const isOnDisk = fs.existsSync(file.filePath)
        if (!isOnDisk) {
            res.status(404).json({ error: 'File not found on disk' })
            return
        }

        // Replace the Content-Type with the file's own MIME type, then send the file
        res.setHeader('Content-Type', file.mimeType)
        const absolutePath = path.resolve(file.filePath)
        res.sendFile(absolutePath)

    } catch (err) {
        console.error('File preview error:', err)
        res.status(500).json({ error: 'Failed to get file preview' })
    }
})

// Thumbnail endpoint: generates a JPEG thumbnail (at most 100x100, never enlarged) for image
// files and sends it. If generation fails, the original file is sent instead.
app.get('/api/files/:id/thumbnail', async (req, res) => {
    // Declared outside `try` so the fallback in `catch` can reach the record; a `const`
    // inside `try` is out of scope there and would raise a ReferenceError.
    let file = null

    try {
        const fileId = Number.parseInt(req.params.id, 10)
        file = await fileService.getFileById(fileId)

        if (!file) {
            return res.status(404).json({ error: 'File not found' })
        }

        // Thumbnails are only generated for images
        if (!file.mimeType.startsWith('image/')) {
            return res.status(400).json({ error: 'Not an image file' })
        }

        const thumbnailPath = path.join(tempDir, `thumbnail-${fileId}.jpg`)

        // Generate the thumbnail
        await sharp(file.filePath)
            .resize(100, 100, {
                fit: 'inside',
                withoutEnlargement: true
            })
            .jpeg({ quality: 80 })
            .toFile(thumbnailPath)

        // The global middleware pre-sets a JSON Content-Type and sendFile keeps an already
        // set header, so the image type has to be set explicitly.
        res.setHeader('Content-Type', 'image/jpeg')
        res.sendFile(path.resolve(thumbnailPath))

    } catch (error) {
        console.error('Thumbnail generation error:', error)

        // Nothing more can be sent once a response has started
        if (res.headersSent) {
            return
        }

        // The lookup itself failed (or the record has no path): there is no original to send
        if (!file || !file.filePath) {
            return res.status(500).json({ error: 'Failed to generate thumbnail' })
        }

        // Fall back to the original file, served with its own MIME type when known;
        // otherwise drop the preset JSON header so sendFile can derive one from the path
        if (file.mimeType) {
            res.setHeader('Content-Type', file.mimeType)
        } else {
            res.removeHeader('Content-Type')
        }
        res.sendFile(path.resolve(file.filePath))
    }
})

// Health check endpoint: reports a static status together with the current time
app.get('/api/health', (req, res) => {
    const timestamp = new Date().toISOString()

    res.json({ status: 'OK', timestamp, service: 'file-management-api' })
})

/**
 * Starts the HTTP server on PORT and logs the local URL once it is listening.
 */
function startServer() {
    const announceListening = () => {
        console.log(`Server running on http://localhost:${PORT}`)
    }

    app.listen(PORT, announceListening)
}

module.exports = { startServer }