diff --git a/database/database.js b/database/database.js
new file mode 100644
index 0000000..fb8e326
--- /dev/null
+++ b/database/database.js
@@ -0,0 +1,235 @@
+const sqlite3 = require('sqlite3')
+const path = require('path')
+const { open } = require('sqlite')
+const fs = require('fs-extra')
+
+// Location of the SQLite file, resolved against the process working directory
+const dbPath = path.join(process.cwd(), 'database/files.db')
+
+// Create the directory that holds the database file when it is missing
+fs.ensureDirSync(path.dirname(dbPath))
+
+/**
+ * Creates the SQLite schema (`files` and `ocr_results` tables) when it does not exist yet.
+ *
+ * Opens a dedicated connection to `dbPath`, runs the PRAGMAs and DDL, and always
+ * closes the connection again, including when one of the statements fails.
+ *
+ * @returns {Promise<void>} Resolves once every statement has run.
+ * @throws Rethrows any error raised while opening the database or executing a statement.
+ */
+async function initDatabase() {
+  const db = await open({
+    filename: dbPath,
+    driver: sqlite3.Database
+  })
+
+  try {
+    // Store text as UTF-8
+    await db.exec('PRAGMA encoding = "UTF-8"')
+    await db.exec('PRAGMA foreign_keys = ON')
+
+    await db.exec(`
+      CREATE TABLE IF NOT EXISTS files (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        original_name TEXT NOT NULL,
+        file_name TEXT NOT NULL,
+        file_path TEXT NOT NULL,
+        file_size INTEGER NOT NULL,
+        mime_type TEXT NOT NULL,
+        md5 TEXT NOT NULL,
+        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+      )
+    `)
+
+    // OCR results: at most one row per file, removed together with the file row
+    await db.exec(`
+      CREATE TABLE IF NOT EXISTS ocr_results (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        file_id INTEGER NOT NULL,
+        ocr_data TEXT NOT NULL,
+        confidence REAL,
+        processing_time INTEGER,
+        recognized_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+        FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
+        UNIQUE(file_id)
+      )
+    `)
+  } finally {
+    // Release the connection even when a statement above throws
+    await db.close()
+  }
+}
+
+/**
+ * Data-access layer for the `files` and `ocr_results` tables.
+ *
+ * Every public method opens a short-lived connection, runs its statements and
+ * closes the connection again, also when a statement throws.
+ */
+class FileService {
+  /**
+   * Opens a new connection to the database at `dbPath`.
+   *
+   * @returns {Promise<object>} An open `sqlite` database handle; the caller must close it.
+   * @throws Rethrows errors from opening the database or running the PRAGMAs.
+   */
+  async getDb() {
+    const db = await open({
+      filename: dbPath,
+      driver: sqlite3.Database
+    })
+
+    try {
+      // Use UTF-8 on every connection
+      await db.exec('PRAGMA encoding = "UTF-8"')
+      // Foreign-key enforcement is a per-connection setting in SQLite; without it
+      // the ocr_results -> files reference and its ON DELETE CASCADE are not applied.
+      await db.exec('PRAGMA foreign_keys = ON')
+    } catch (error) {
+      await db.close()
+      throw error
+    }
+
+    return db
+  }
+
+  /**
+   * Runs `task` with a fresh connection and closes that connection afterwards,
+   * whether `task` resolves or rejects.
+   *
+   * @param {function(object): Promise<*>} task - Receives the open database handle.
+   * @returns {Promise<*>} Whatever `task` resolves to.
+   */
+  async withDb(task) {
+    const db = await this.getDb()
+    try {
+      return await task(db)
+    } finally {
+      await db.close()
+    }
+  }
+
+  /**
+   * Inserts a file row and returns the stored record.
+   *
+   * @param {{originalName: string, fileName: string, filePath: string, fileSize: number, mimeType: string, md5: string}} fileData
+   * @returns {Promise<object>} The inserted row mapped by `mapDatabaseToFileRecord`.
+   */
+  async createFile(fileData) {
+    return this.withDb(async (db) => {
+      const result = await db.run(
+        `INSERT INTO files (original_name, file_name, file_path, file_size, mime_type, md5)
+         VALUES (?, ?, ?, ?, ?, ?)`,
+        [
+          fileData.originalName,
+          fileData.fileName,
+          fileData.filePath,
+          fileData.fileSize,
+          fileData.mimeType,
+          fileData.md5
+        ]
+      )
+
+      const file = await db.get(
+        'SELECT * FROM files WHERE id = ?',
+        [result.lastID]
+      )
+
+      return this.mapDatabaseToFileRecord(file)
+    })
+  }
+
+  /**
+   * Returns one page of files, newest first, together with pagination metadata.
+   *
+   * Values that are not positive integers fall back to page 1 / page size 10 so the
+   * query never runs with a negative offset or a zero page size.
+   *
+   * @param {number} page - 1-based page number.
+   * @param {number} pageSize - Rows per page.
+   * @returns {Promise<{files: object[], pagination: {page: number, pageSize: number, total: number, totalPages: number}}>}
+   */
+  async getFilesPaginated(page, pageSize) {
+    const safePage = Number.isInteger(page) && page > 0 ? page : 1
+    const safePageSize = Number.isInteger(pageSize) && pageSize > 0 ? pageSize : 10
+    const offset = (safePage - 1) * safePageSize
+
+    return this.withDb(async (db) => {
+      // created_at defaults to CURRENT_TIMESTAMP, so rows inserted close together
+      // can share a value; id breaks the tie to keep page boundaries stable.
+      const files = await db.all(
+        'SELECT * FROM files ORDER BY created_at DESC, id DESC LIMIT ? OFFSET ?',
+        [safePageSize, offset]
+      )
+
+      const totalResult = await db.get('SELECT COUNT(*) as count FROM files')
+      const total = totalResult.count
+
+      return {
+        files: files.map(file => this.mapDatabaseToFileRecord(file)),
+        pagination: {
+          page: safePage,
+          pageSize: safePageSize,
+          total,
+          totalPages: Math.ceil(total / safePageSize)
+        }
+      }
+    })
+  }
+
+  /**
+   * Looks up a single file by primary key.
+   *
+   * @param {number} id - Value matched against `files.id`.
+   * @returns {Promise<object|null>} The mapped record, or null when no row matches.
+   */
+  async getFileById(id) {
+    return this.withDb(async (db) => {
+      const file = await db.get('SELECT * FROM files WHERE id = ?', [id])
+      return file ? this.mapDatabaseToFileRecord(file) : null
+    })
+  }
+
+  /**
+   * Overwrites the stored MD5 of a file and bumps `updated_at`.
+   *
+   * @param {number} id - Value matched against `files.id`.
+   * @param {string} md5 - New checksum value.
+   * @returns {Promise<void>}
+   */
+  async updateFileMD5(id, md5) {
+    await this.withDb(db => db.run(
+      'UPDATE files SET md5 = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?',
+      [md5, id]
+    ))
+  }
+
+  /**
+   * Converts a snake_case `files` row into the camelCase record used by callers.
+   *
+   * @param {object} dbFile - Row as returned by the `files` queries.
+   * @returns {object} Record with id, originalName, fileName, filePath, fileSize,
+   *   mimeType, md5, createdAt and updatedAt.
+   */
+  mapDatabaseToFileRecord(dbFile) {
+    let originalName = dbFile.original_name
+    try {
+      // NOTE(review): presumably names are stored percent-encoded by the upload
+      // code; a stored name that contains a literal valid escape such as "%41"
+      // would be altered here. Confirm against the writer.
+      originalName = decodeURIComponent(originalName)
+    } catch (error) {
+      // Malformed escape sequences: keep the stored value untouched
+      console.warn('文件名解码失败,使用原值:', error)
+    }
+
+    return {
+      id: dbFile.id,
+      originalName: originalName,
+      fileName: dbFile.file_name,
+      filePath: dbFile.file_path,
+      fileSize: dbFile.file_size,
+      mimeType: dbFile.mime_type,
+      md5: dbFile.md5,
+      createdAt: dbFile.created_at,
+      updatedAt: dbFile.updated_at
+    }
+  }
+
+  /**
+   * Stores the OCR result for a file, replacing an earlier result for the same file.
+   *
+   * @param {number} fileId - Value stored in `ocr_results.file_id`.
+   * @param {object} ocrData - Serialised as JSON into `ocr_data`; its `confidence`
+   *   and `processingTime` properties are also written to their own columns.
+   * @returns {Promise<{success: boolean}>}
+   * @throws Rethrows database errors, e.g. a constraint violation.
+   */
+  async saveOcrResult(fileId, ocrData) {
+    const ocrDataJson = JSON.stringify(ocrData)
+
+    return this.withDb(async (db) => {
+      // One statement instead of UPDATE-then-INSERT, so two concurrent saves for
+      // the same file cannot both decide to insert. Relies on UNIQUE(file_id).
+      await db.run(
+        `INSERT INTO ocr_results (file_id, ocr_data, confidence, processing_time)
+         VALUES (?, ?, ?, ?)
+         ON CONFLICT(file_id) DO UPDATE SET
+           ocr_data = excluded.ocr_data,
+           confidence = excluded.confidence,
+           processing_time = excluded.processing_time,
+           updated_at = CURRENT_TIMESTAMP`,
+        [fileId, ocrDataJson, ocrData.confidence, ocrData.processingTime]
+      )
+
+      return { success: true }
+    })
+  }
+
+  /**
+   * Loads the OCR result row of a file.
+   *
+   * @param {number} fileId - Value matched against `ocr_results.file_id`.
+   * @returns {Promise<object|null>} The row with `ocr_data` parsed from JSON, or null.
+   * @throws {SyntaxError} When the stored `ocr_data` is not valid JSON.
+   */
+  async getOcrResult(fileId) {
+    const result = await this.withDb(db => db.get(
+      'SELECT * FROM ocr_results WHERE file_id = ?',
+      [fileId]
+    ))
+
+    if (result) {
+      return {
+        ...result,
+        ocr_data: JSON.parse(result.ocr_data)
+      }
+    }
+
+    return null
+  }
+
+  /**
+   * Replaces the text blocks of an existing OCR result (manual correction).
+   *
+   * @param {number} fileId - Value matched against `ocr_results.file_id`.
+   * @param {Array} newTextBlocks - Stored as `textBlocks` in the JSON payload.
+   * @returns {Promise<{success: boolean}>}
+   * @throws {Error} When the file has no OCR result yet.
+   */
+  async updateOcrText(fileId, newTextBlocks) {
+    return this.withDb(async (db) => {
+      // Read and write on the same connection; it is closed by withDb even when
+      // the "not found" error below is thrown.
+      const existingRow = await db.get(
+        'SELECT ocr_data FROM ocr_results WHERE file_id = ?',
+        [fileId]
+      )
+      if (!existingRow) {
+        throw new Error('没有找到OCR结果')
+      }
+
+      const updatedOcrData = {
+        ...JSON.parse(existingRow.ocr_data),
+        textBlocks: newTextBlocks,
+        updatedAt: new Date().toISOString(),
+        manuallyCorrected: true
+      }
+
+      await db.run(
+        'UPDATE ocr_results SET ocr_data = ?, updated_at = CURRENT_TIMESTAMP WHERE file_id = ?',
+        [JSON.stringify(updatedOcrData), fileId]
+      )
+
+      return { success: true }
+    })
+  }
+}
+
+module.exports = { initDatabase, FileService } // public API of this module: schema bootstrap function and the data-access class
\ No newline at end of file
diff --git a/package.json b/package.json
index 850302d..716ef50 100644
--- a/package.json
+++ b/package.json
@@ -11,13 +11,17 @@
"preview": "vite preview"
},
"dependencies": {
+ "canvas": "^3.2.0",
"cors": "^2.8.5",
"crypto-ts": "^1.0.2",
"express": "^5.1.0",
"fs-extra": "^11.3.2",
"multer": "^2.0.2",
+ "node-tesseract-ocr": "^2.2.1",
+ "sharp": "^0.34.5",
"sqlite": "^5.1.1",
"sqlite3": "^5.1.7",
+ "tesseract.js": "^6.0.1",
"vue": "^3.5.22",
"vue-router": "^4.6.3"
},
diff --git a/server/server.js b/server/server.js
index b0d2a35..08b500c 100644
--- a/server/server.js
+++ b/server/server.js
@@ -6,6 +6,11 @@ const fs = require('fs-extra')
const { calculateFileMD5 } = require('./utils.js')
const { initDatabase, FileService } = require('../database/database.js')
+// 新增 OCR 相关依赖
+const Tesseract = require('tesseract.js')
+const sharp = require('sharp')
+const { createCanvas, loadImage } = require('canvas')
+
const app = express()
const PORT = 3000
@@ -13,9 +18,11 @@ const PORT = 3000
initDatabase()
const fileService = new FileService()
-// 确保上传目录存在
+// Make sure the upload directory and the temp directory exist (both resolved against process.cwd())
const uploadDir = path.join(process.cwd(), 'uploads')
+const tempDir = path.join(process.cwd(), 'temp')
fs.ensureDirSync(uploadDir)
+fs.ensureDirSync(tempDir)
// 配置 multer - 修复中文文件名问题
const storage = multer.diskStorage({
@@ -93,23 +100,30 @@ app.post('/api/upload', upload.single('file'), async (req, res) => {
}
})
-// 获取文件列表(分页)
+// File-list endpoint, changed to answer with a consistent structure; page falls back to 1 and pageSize to 100 when the query value is missing, unparsable or 0
app.get('/api/files', async (req, res) => {
try {
const page = parseInt(req.query.page) || 1
- const pageSize = parseInt(req.query.pageSize) || 10
+ const pageSize = parseInt(req.query.pageSize) || 100
const result = await fileService.getFilesPaginated(page, pageSize)
- // 确保返回的数据使用 UTF-8 编码
- res.json(result)
+ // Respond with the unified envelope: { success, data, pagination }
+ res.json({
+ success: true,
+ data: result.files, // the file array itself, not the wrapper object
+ pagination: result.pagination
+ })
} catch (error) {
console.error('Get files error:', error)
- res.status(500).json({ error: 'Failed to get files' })
+ res.status(500).json({
+ success: false,
+ error: 'Failed to get files: ' + error.message
+ })
}
})
-// 其他接口保持不变...
+// MD5 检查接口
app.post('/api/files/:id/check-md5', async (req, res) => {
try {
const fileId = parseInt(req.params.id)
@@ -134,6 +148,7 @@ app.post('/api/files/:id/check-md5', async (req, res) => {
}
})
+// 更新 MD5 接口
app.put('/api/files/:id/update-md5', async (req, res) => {
try {
const fileId = parseInt(req.params.id)
@@ -147,6 +162,294 @@ app.put('/api/files/:id/update-md5', async (req, res) => {
}
})
+// OCR recognition endpoint.
+// Body: { fileId }. Loads the file record, preprocesses the image, runs OCR and
+// answers with { success, data: { textBlocks, totalPages, processingTime, confidence } }.
+// Errors: 400 when fileId is missing or not a number, 404 when the file record
+// does not exist, 500 when preprocessing or OCR fails.
+app.post('/api/ocr/recognize', async (req, res) => {
+  // Tracked outside the try block so the finally clause can clean up
+  let originalImagePath = null
+  let processedImagePath = null
+
+  try {
+    // req.body is undefined when no body parser handled the request
+    const { fileId } = req.body || {}
+
+    if (!fileId) {
+      return res.status(400).json({ error: 'File ID is required' })
+    }
+
+    const numericFileId = parseInt(fileId, 10)
+    if (Number.isNaN(numericFileId)) {
+      return res.status(400).json({ error: 'File ID is required' })
+    }
+
+    const file = await fileService.getFileById(numericFileId)
+    if (!file) {
+      return res.status(404).json({ error: 'File not found' })
+    }
+
+    console.log(`开始OCR识别: ${file.originalName}`)
+
+    originalImagePath = file.filePath
+
+    // Preprocess the image
+    processedImagePath = await preprocessImage(originalImagePath)
+
+    // Run Tesseract OCR
+    const result = await performOCR(processedImagePath)
+
+    res.json({
+      success: true,
+      data: {
+        textBlocks: result.textBlocks,
+        totalPages: result.totalPages || 1,
+        processingTime: result.processingTime,
+        confidence: result.confidence
+      }
+    })
+  } catch (error) {
+    console.error('OCR recognition error:', error)
+    res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
+  } finally {
+    // preprocessImage returns the ORIGINAL path when preprocessing fails, so only
+    // remove the path when it really is a separate temporary file; otherwise the
+    // user's upload would be deleted. Runs on the error path too, so a failed OCR
+    // does not leave the temp file behind.
+    const isTempFile = processedImagePath !== null &&
+      path.resolve(processedImagePath) !== path.resolve(originalImagePath)
+    if (isTempFile) {
+      try {
+        await fs.remove(processedImagePath)
+      } catch (cleanupError) {
+        console.error('OCR recognition error:', cleanupError)
+      }
+    }
+  }
+})
+
+// OCR result endpoints
+
+// Save an OCR result.
+// Body: { fileId, ocrData }. Answers { success: true } once stored; 400 when a
+// field is missing or fileId is not a number; 500 when the database write fails.
+app.post('/api/ocr/save-result', async (req, res) => {
+  try {
+    // req.body is undefined when no body parser handled the request
+    const { fileId, ocrData } = req.body || {}
+
+    if (!fileId || !ocrData) {
+      return res.status(400).json({ error: '文件ID和OCR数据是必需的' })
+    }
+
+    // Reject ids that do not parse instead of handing NaN to the database layer,
+    // where it would surface as a 500
+    const numericFileId = parseInt(fileId, 10)
+    if (Number.isNaN(numericFileId)) {
+      return res.status(400).json({ error: '文件ID无效' })
+    }
+
+    await fileService.saveOcrResult(numericFileId, ocrData)
+
+    res.json({ success: true })
+  } catch (error) {
+    console.error('保存OCR结果失败:', error)
+    res.status(500).json({ error: '保存OCR结果失败: ' + error.message })
+  }
+})
+
+// Fetch the stored OCR result of a file.
+// Answers { success: true, data } when a result exists, { success: false, error }
+// (still HTTP 200) when it does not, and 500 when the lookup throws.
+app.get('/api/ocr/result/:fileId', async (req, res) => {
+  try {
+    const stored = await fileService.getOcrResult(parseInt(req.params.fileId))
+
+    if (!stored) {
+      res.json({
+        success: false,
+        error: '未找到OCR结果'
+      })
+      return
+    }
+
+    res.json({
+      success: true,
+      data: stored.ocr_data
+    })
+  } catch (error) {
+    console.error('获取OCR结果失败:', error)
+    res.status(500).json({ error: '获取OCR结果失败: ' + error.message })
+  }
+})
+
+// Update OCR text (manual correction).
+// Body: { fileId, textBlocks }. Answers { success: true } once stored; 400 when a
+// field is missing, fileId is not a number or textBlocks is not an array; 500 when
+// the update fails (including when the file has no OCR result yet).
+app.put('/api/ocr/update-text', async (req, res) => {
+  try {
+    // req.body is undefined when no body parser handled the request
+    const { fileId, textBlocks } = req.body || {}
+
+    if (!fileId || !textBlocks) {
+      return res.status(400).json({ error: '文件ID和文本数据是必需的' })
+    }
+
+    // The blocks replace the stored textBlocks wholesale, so refuse anything that
+    // is not a list, and refuse ids that do not parse
+    const numericFileId = parseInt(fileId, 10)
+    if (Number.isNaN(numericFileId) || !Array.isArray(textBlocks)) {
+      return res.status(400).json({ error: '文件ID和文本数据是必需的' })
+    }
+
+    await fileService.updateOcrText(numericFileId, textBlocks)
+
+    res.json({ success: true })
+  } catch (error) {
+    console.error('更新OCR文本失败:', error)
+    res.status(500).json({ error: '更新OCR文本失败: ' + error.message })
+  }
+})
+
+// Image preprocessing helper
+/**
+ * Writes a grayscale, normalised, contrast-boosted and sharpened PNG copy of an
+ * image into the temp directory.
+ *
+ * @param {string} imagePath - Path of the source image.
+ * @returns {Promise<string>} Path of the preprocessed PNG. When preprocessing fails
+ *   the ORIGINAL `imagePath` is returned instead, so callers must not delete the
+ *   returned path without checking that it differs from their input.
+ */
+async function preprocessImage(imagePath) {
+  // Date.now() alone repeats for requests that arrive within the same millisecond;
+  // the pid and a random suffix keep concurrent requests from sharing one file.
+  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`
+  const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)
+
+  try {
+    await sharp(imagePath)
+      .grayscale() // convert to grayscale
+      .normalize() // stretch the luminance range
+      .linear(1.5, 0) // raise contrast
+      .sharpen() // sharpen edges
+      .png()
+      .toFile(tempOutputPath)
+
+    return tempOutputPath
+  } catch (error) {
+    console.error('Image preprocessing failed:', error)
+
+    // Drop whatever sharp may have written before it failed
+    try {
+      await fs.remove(tempOutputPath)
+    } catch (cleanupError) {
+      console.error('Image preprocessing failed:', cleanupError)
+    }
+
+    // Fall back to the untouched original image
+    return imagePath
+  }
+}
+
+// OCR recognition helper
+/**
+ * Runs Tesseract OCR (Simplified Chinese + English) on an image and parses the
+ * recognised text into blocks.
+ *
+ * @param {string} imagePath - Path of the image passed to Tesseract.recognize.
+ * @returns {Promise<{textBlocks: object[], confidence: number, processingTime: number}>}
+ *   The blocks produced by parseOCRText, the confidence reported in the result
+ *   data, and the elapsed time in milliseconds.
+ * @throws Rethrows any rejection of Tesseract.recognize.
+ */
+async function performOCR(imagePath) {
+  const startTime = Date.now()
+
+  // The former tessedit_char_whitelist option was removed. Its string literal
+  // contained unescaped quote characters, which is a syntax error, and
+  // "\u4e00-\u9fa5" is regex range notation, whereas a Tesseract whitelist is a
+  // plain list of allowed characters.
+  // NOTE(review): tesseract.js documents engine parameters as being applied with
+  // worker.setParameters(); confirm whether tessedit_pageseg_mode has any effect
+  // when passed through recognize() options.
+  const { data: { text, confidence } } = await Tesseract.recognize(
+    imagePath,
+    'chi_sim+eng', // Simplified Chinese + English
+    {
+      logger: m => console.log(m),
+      tessedit_pageseg_mode: Tesseract.PSM.AUTO
+    }
+  )
+
+  const processingTime = Date.now() - startTime
+
+  return {
+    textBlocks: parseOCRText(text),
+    confidence,
+    processingTime
+  }
+}
+
+// Parse raw OCR text into typed blocks
+/**
+ * Splits OCR output into lines and classifies every non-blank line.
+ *
+ * Checks run in this order: reference, citation, image marker, plain text.
+ *
+ * @param {string} text - Raw OCR text, lines separated by "\n".
+ * @returns {object[]} One `{ type, content }` object per non-blank line; citation
+ *   blocks also carry `number`, and their content has the leading "[n]" removed.
+ */
+function parseOCRText(text) {
+  const classify = (entry) => {
+    if (isReference(entry)) {
+      return { type: 'reference', content: entry }
+    }
+    if (isCitation(entry)) {
+      return {
+        type: 'citation',
+        content: entry.replace(/^\[\d+\]\s*/, ''),
+        number: extractCitationNumber(entry)
+      }
+    }
+    if (isImageMarker(entry)) {
+      return { type: 'image', content: entry }
+    }
+    return { type: 'text', content: entry }
+  }
+
+  return text
+    .split('\n')
+    .map(rawLine => rawLine.trim())
+    .filter(entry => entry !== '')
+    .map(classify)
+}
+
+// Helper functions
+/**
+ * Tells whether a line looks like a bibliography heading or a reference entry.
+ *
+ * @param {string} text - A single trimmed line.
+ * @returns {boolean} True when any of the reference patterns matches.
+ */
+function isReference(text) {
+  const headingOrEntry = [
+    /^参考文献/i,
+    /^references/i,
+    /^bibliography/i,
+    /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/
+  ]
+  for (const matcher of headingOrEntry) {
+    if (matcher.test(text)) {
+      return true
+    }
+  }
+  return false
+}
+
+/**
+ * Tells whether a line starts with a bracketed number such as "[12]".
+ *
+ * @param {string} text - A single trimmed line.
+ * @returns {boolean}
+ */
+function isCitation(text) {
+  const leadingMarker = /^\[\d+\]/
+  return leadingMarker.test(text)
+}
+
+/**
+ * Reads the number out of a leading "[n]" marker.
+ *
+ * @param {string} text - A single trimmed line.
+ * @returns {number|null} The number, or null when the line has no leading marker.
+ */
+function extractCitationNumber(text) {
+  const found = /^\[(\d+)\]/.exec(text)
+  if (found === null) {
+    return null
+  }
+  return Number.parseInt(found[1], 10)
+}
+
+/**
+ * Tells whether a line starts like a figure caption ("图 1", "Figure 2", "图片").
+ *
+ * @param {string} text - A single trimmed line.
+ * @returns {boolean} True when any of the caption patterns matches.
+ */
+function isImageMarker(text) {
+  const captionStarts = [
+    /^图\s*\d+/i,
+    /^figure\s*\d+/i,
+    /^图片\d*/i
+  ]
+  return captionStarts.find(matcher => matcher.test(text)) !== undefined
+}
+
+// File preview endpoint.
+// Streams the stored file with its recorded MIME type; 404 when the record or the
+// file on disk is missing, 500 when the lookup throws.
+app.get('/api/files/:id/preview', async (req, res) => {
+  try {
+    const record = await fileService.getFileById(parseInt(req.params.id))
+
+    if (!record) {
+      return res.status(404).json({ error: 'File not found' })
+    }
+
+    // The database row can outlive the file itself
+    const existsOnDisk = fs.existsSync(record.filePath)
+    if (!existsOnDisk) {
+      return res.status(404).json({ error: 'File not found on disk' })
+    }
+
+    // Use the MIME type recorded for the file
+    res.setHeader('Content-Type', record.mimeType)
+
+    // Send the file as-is
+    const absolutePath = path.resolve(record.filePath)
+    res.sendFile(absolutePath)
+  } catch (error) {
+    console.error('File preview error:', error)
+    res.status(500).json({ error: 'Failed to get file preview' })
+  }
+})
+
+// File thumbnail endpoint.
+// Answers with a JPEG that fits inside 100x100 (never enlarged). 404 when the
+// record is missing, 400 when the file is not an image. When thumbnail generation
+// fails the original file is sent instead; 500 when even the record lookup failed.
+app.get('/api/files/:id/thumbnail', async (req, res) => {
+  // Declared outside the try block: the catch handler needs it for the fallback,
+  // and a const inside try would be out of scope there (ReferenceError).
+  let file = null
+
+  try {
+    const fileId = parseInt(req.params.id, 10)
+    file = await fileService.getFileById(fileId)
+
+    if (!file) {
+      return res.status(404).json({ error: 'File not found' })
+    }
+
+    // Thumbnails are generated for images only
+    if (!file.mimeType.startsWith('image/')) {
+      return res.status(400).json({ error: 'Not an image file' })
+    }
+
+    // Render into memory rather than a shared temp file, so concurrent requests
+    // for the same id cannot overwrite a file that is still being sent.
+    const thumbnail = await sharp(file.filePath)
+      .resize(100, 100, {
+        fit: 'inside',
+        withoutEnlargement: true
+      })
+      .jpeg({ quality: 80 })
+      .toBuffer()
+
+    res.type('image/jpeg').send(thumbnail)
+  } catch (error) {
+    console.error('Thumbnail generation error:', error)
+
+    if (res.headersSent) {
+      return
+    }
+
+    if (file) {
+      // Thumbnail generation failed: fall back to the original image
+      return res.sendFile(path.resolve(file.filePath))
+    }
+
+    res.status(500).json({ error: 'Failed to generate thumbnail' })
+  }
+})
+
// 健康检查接口
app.get('/api/health', (req, res) => {
res.json({
diff --git a/src/renderer/components/FileList.vue b/src/renderer/components/FileList.vue
index 6a944f3..70d0ea7 100644
--- a/src/renderer/components/FileList.vue
+++ b/src/renderer/components/FileList.vue
@@ -18,6 +18,8 @@
| 文件名 |
大小 |
+ 类型 |
+ OCR状态 |
MD5 |
上传时间 |
操作 |
@@ -25,11 +27,18 @@
- | {{ decodeFileName(file.originalName) }} |
+
+ {{ decodeFileName(file.originalName) }}
+ |
{{ formatFileSize(file.fileSize) }} |
+ {{ getFileType(file.mimeType) }} |
+
+ ✓ 已识别
+ 待识别
+ |
{{ file.md5 }} |
{{ formatDate(file.createdAt) }} |
-
+ |
|