server.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. const express = require('express')
  2. const cors = require('cors')
  3. const multer = require('multer')
  4. const path = require('path')
  5. const fs = require('fs-extra')
  6. const { calculateFileMD5 } = require('./utils.js')
  7. const { initDatabase, FileService } = require('../database/database.js')
  8. // 新增 OCR 相关依赖
  9. const Tesseract = require('tesseract.js')
  10. const sharp = require('sharp')
  11. const { createCanvas, loadImage } = require('canvas')
  // Express application instance and the fixed port it listens on.
  const app = express()
  const PORT = 3000

  // Initialise the database, then create the file service used by the routes.
  initDatabase()
  const fileService = new FileService()

  // Working directories, resolved against the current working directory.
  // Both are created up front when they do not exist yet.
  const uploadDir = path.join(process.cwd(), 'uploads')
  const tempDir = path.join(process.cwd(), 'temp')
  for (const requiredDir of [uploadDir, tempDir]) {
    fs.ensureDirSync(requiredDir)
  }
  // Multer disk storage: uploads are written to uploadDir under a sanitised,
  // collision-resistant name derived from the client's file name.
  const storage = multer.diskStorage({
    destination: (req, file, cb) => {
      cb(null, uploadDir)
    },
    filename: (req, file, cb) => {
      // file.originalname has already been converted from latin1 to UTF-8 by
      // the upload fileFilter, which multer invokes before the storage engine.
      // Decoding it a second time here would corrupt non-ASCII (e.g. Chinese)
      // names, so the value is used as-is.
      const originalName = file.originalname
      const ext = path.extname(originalName)
      const name = path.basename(originalName, ext)
      // Keep ASCII letters/digits and CJK ideographs; replace everything else.
      const safeName = name.replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '_')
      // Timestamp plus a random number keeps names of same-titled uploads apart.
      const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9)
      cb(null, `${safeName}-${uniqueSuffix}${ext}`)
    }
  })
  // Multer instance used by the upload route. The filter accepts every file;
  // its only job is to re-decode the client file name from latin1 to UTF-8.
  const upload = multer({
    storage,
    fileFilter(req, file, cb) {
      const rawNameBytes = Buffer.from(file.originalname, 'latin1')
      file.originalname = rawNameBytes.toString('utf8')
      cb(null, true)
    }
  })
  // Default every response to JSON with an explicit UTF-8 charset. Handlers
  // that send something else have to set their own Content-Type.
  app.use(function setDefaultContentType(req, res, next) {
    res.setHeader('Content-Type', 'application/json; charset=utf-8')
    next()
  })

  // Cross-origin access, then JSON and URL-encoded body parsing (50mb limit each).
  app.use(cors())
  app.use(
    express.json({ limit: '50mb' })
  )
  app.use(
    express.urlencoded({ extended: true, limit: '50mb' })
  )
  // File upload endpoint.
  // Expects a multipart form with a single `file` field. Responds with
  // { success, data } on success, 400 when no file was sent and 500 when
  // hashing or persisting the record fails.
  app.post('/api/upload', upload.single('file'), async (req, res) => {
    try {
      if (!req.file) {
        return res.status(400).json({ error: 'No file uploaded' })
      }

      // req.file.originalname was already converted from latin1 to UTF-8 by
      // the multer fileFilter; decoding it again would corrupt non-ASCII
      // names, so it is stored unchanged.
      const {
        originalname: originalName,
        filename: fileName,
        path: filePath,
        size: fileSize,
        mimetype: mimeType
      } = req.file

      // Hash the stored file so later integrity checks have a baseline.
      const md5 = await calculateFileMD5(filePath)

      // Persist the file metadata together with its hash.
      const fileRecord = await fileService.createFile({
        originalName,
        fileName,
        filePath,
        fileSize,
        mimeType,
        md5
      })

      res.json({
        success: true,
        data: fileRecord
      })
    } catch (error) {
      console.error('Upload error:', error)

      // The file is already on disk at this point; remove it (best effort) so
      // a failed upload does not leave an untracked file behind.
      if (req.file && req.file.path) {
        try {
          await fs.remove(req.file.path)
        } catch (cleanupError) {
          console.error('Failed to remove orphaned upload:', cleanupError)
        }
      }

      res.status(500).json({ error: 'Upload failed: ' + error.message })
    }
  })
  // File listing endpoint.
  // Query parameters: `page` (default 1) and `pageSize` (default 100).
  // Responds with { success, data: <file array>, pagination }.
  app.get('/api/files', async (req, res) => {
    try {
      // Parse as base-10 and fall back to the default when the value is
      // missing, non-numeric, zero or negative, so the service never receives
      // an unusable page number or page size.
      const toPositiveInt = (value, fallback) => {
        const parsed = Number.parseInt(value, 10)
        return parsed > 0 ? parsed : fallback
      }
      const page = toPositiveInt(req.query.page, 1)
      const pageSize = toPositiveInt(req.query.pageSize, 100)

      const result = await fileService.getFilesPaginated(page, pageSize)

      // Uniform response shape: the file array is returned directly as `data`.
      res.json({
        success: true,
        data: result.files,
        pagination: result.pagination
      })
    } catch (error) {
      console.error('Get files error:', error)
      res.status(500).json({
        success: false,
        error: 'Failed to get files: ' + error.message
      })
    }
  })
  // MD5 check endpoint: re-hashes the stored file and reports whether the
  // hash differs from the one saved in the file record.
  app.post('/api/files/:id/check-md5', async (req, res) => {
    try {
      const requestedId = parseInt(req.params.id)
      const file = await fileService.getFileById(requestedId)

      if (!file) {
        return res.status(404).json({ error: 'File not found' })
      }

      const currentMD5 = await calculateFileMD5(file.filePath)
      const originalMD5 = file.md5
      const isChanged = currentMD5 !== originalMD5

      res.json({ isChanged, currentMD5, originalMD5, file })
    } catch (error) {
      console.error('MD5 check error:', error)
      res.status(500).json({ error: 'MD5 check failed' })
    }
  })
  // MD5 update endpoint: stores the md5 value supplied in the request body for
  // the file identified by :id.
  // Responds 400 when the id is not numeric or md5 is missing/empty, 500 when
  // the update itself fails.
  app.put('/api/files/:id/update-md5', async (req, res) => {
    try {
      const fileId = Number.parseInt(req.params.id, 10)
      const { md5 } = req.body || {}

      // Reject unusable input up front instead of passing NaN/undefined on to
      // the service.
      if (Number.isNaN(fileId) || typeof md5 !== 'string' || md5.trim() === '') {
        return res.status(400).json({ error: 'A valid file ID and md5 value are required' })
      }

      await fileService.updateFileMD5(fileId, md5)
      res.json({ success: true })
    } catch (error) {
      console.error('Update MD5 error:', error)
      res.status(500).json({ error: 'Update failed' })
    }
  })
  // OCR recognition endpoint.
  // Body: { fileId }. Pre-processes the stored image, runs OCR on it and
  // responds with { success, data: { textBlocks, totalPages, processingTime,
  // confidence } }. Responds 400 without a fileId, 404 for an unknown file and
  // 500 when recognition fails.
  app.post('/api/ocr/recognize', async (req, res) => {
    // Path of the temporary pre-processed image, when one was created.
    let tempImagePath = null

    try {
      const { fileId } = req.body
      if (!fileId) {
        return res.status(400).json({ error: 'File ID is required' })
      }

      const file = await fileService.getFileById(parseInt(fileId, 10))
      if (!file) {
        return res.status(404).json({ error: 'File not found' })
      }

      console.log(`开始OCR识别: ${file.originalName}`)

      // Pre-process the image. preprocessImage returns the ORIGINAL path when
      // pre-processing fails, so only a different path is a temp file that
      // may be deleted afterwards; removing the returned path unconditionally
      // would delete the user's upload.
      const processedImagePath = await preprocessImage(file.filePath)
      if (processedImagePath !== file.filePath) {
        tempImagePath = processedImagePath
      }

      // Run OCR on the (possibly pre-processed) image.
      const result = await performOCR(processedImagePath)

      res.json({
        success: true,
        data: {
          textBlocks: result.textBlocks,
          totalPages: result.totalPages || 1,
          processingTime: result.processingTime,
          confidence: result.confidence
        }
      })
    } catch (error) {
      console.error('OCR recognition error:', error)
      res.status(500).json({ error: 'OCR recognition failed: ' + error.message })
    } finally {
      // Always clean up the temp image, including when OCR threw.
      if (tempImagePath) {
        try {
          await fs.remove(tempImagePath)
        } catch (cleanupError) {
          console.error('Failed to remove temporary OCR image:', cleanupError)
        }
      }
    }
  })
  // OCR result endpoints.
  // Save an OCR result: body must carry both `fileId` and `ocrData`.
  app.post('/api/ocr/save-result', async (req, res) => {
    try {
      const { fileId, ocrData } = req.body

      const hasRequiredFields = Boolean(fileId && ocrData)
      if (!hasRequiredFields) {
        return res.status(400).json({ error: '文件ID和OCR数据是必需的' })
      }

      const numericFileId = parseInt(fileId)
      await fileService.saveOcrResult(numericFileId, ocrData)

      res.json({ success: true })
    } catch (error) {
      console.error('保存OCR结果失败:', error)
      res.status(500).json({ error: '保存OCR结果失败: ' + error.message })
    }
  })
  // Fetch the stored OCR result for a file. A missing result is reported as
  // { success: false } with a normal (non-error) status code.
  app.get('/api/ocr/result/:fileId', async (req, res) => {
    try {
      const requestedId = parseInt(req.params.fileId)
      const stored = await fileService.getOcrResult(requestedId)

      if (!stored) {
        res.json({
          success: false,
          error: '未找到OCR结果'
        })
        return
      }

      res.json({
        success: true,
        data: stored.ocr_data
      })
    } catch (error) {
      console.error('获取OCR结果失败:', error)
      res.status(500).json({ error: '获取OCR结果失败: ' + error.message })
    }
  })
  // Update OCR text after manual correction: body must carry both `fileId`
  // and `textBlocks`.
  app.put('/api/ocr/update-text', async (req, res) => {
    try {
      const { fileId, textBlocks } = req.body

      const hasRequiredFields = Boolean(fileId && textBlocks)
      if (!hasRequiredFields) {
        return res.status(400).json({ error: '文件ID和文本数据是必需的' })
      }

      const numericFileId = parseInt(fileId)
      await fileService.updateOcrText(numericFileId, textBlocks)

      res.json({ success: true })
    } catch (error) {
      console.error('更新OCR文本失败:', error)
      res.status(500).json({ error: '更新OCR文本失败: ' + error.message })
    }
  })
  /**
   * Pre-processes an image for OCR and writes the result to a temp PNG.
   *
   * @param {string} imagePath - Path of the source image.
   * @returns {Promise<string>} Path of the pre-processed PNG inside tempDir,
   *   or `imagePath` itself when pre-processing fails (the failure is logged,
   *   not thrown). Callers must therefore not delete the returned path
   *   without checking that it differs from `imagePath`.
   */
  async function preprocessImage(imagePath) {
    // Timestamp plus a random number: a timestamp alone collides when two
    // requests are pre-processed within the same millisecond.
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1E9)}`
    const tempOutputPath = path.join(tempDir, `preprocessed-${uniqueSuffix}.png`)

    try {
      await sharp(imagePath)
        .grayscale() // convert to greyscale
        .normalize() // normalise the image
        .linear(1.5, 0) // boost contrast
        .sharpen() // sharpen
        .png()
        .toFile(tempOutputPath)
      return tempOutputPath
    } catch (error) {
      console.error('Image preprocessing failed:', error)

      // Best effort: remove the output file in case a partial one was written.
      try {
        await fs.remove(tempOutputPath)
      } catch (cleanupError) {
        console.error('Failed to remove partial preprocessed image:', cleanupError)
      }

      // Fall back to the original image.
      return imagePath
    }
  }
  /**
   * Runs Tesseract OCR (Simplified Chinese + English) on an image.
   *
   * @param {string} imagePath - Path of the image to recognise.
   * @returns {Promise<{textBlocks: Array<object>, confidence: number, processingTime: number}>}
   *   Parsed text blocks, the confidence reported by Tesseract and the elapsed
   *   recognition time in milliseconds.
   * @throws Rejects with whatever error Tesseract.recognize rejects with.
   */
  async function performOCR(imagePath) {
    const startTime = Date.now()

    // Tesseract.recognize already returns a promise, so it is awaited directly
    // instead of being wrapped in a new Promise.
    //
    // The former `tessedit_char_whitelist` option was dropped: an unescaped
    // quote split its literal into `'…' / '…'`, which evaluates to NaN, so it
    // never held a usable whitelist.
    const { data: { text, confidence } } = await Tesseract.recognize(
      imagePath,
      'chi_sim+eng', // Simplified Chinese + English
      {
        logger: m => console.log(m),
        tessedit_pageseg_mode: Tesseract.PSM.AUTO
      }
    )

    // Measured before parsing so it reflects recognition time only.
    const processingTime = Date.now() - startTime

    // Split the recognised text into typed blocks.
    const textBlocks = parseOCRText(text)

    return {
      textBlocks,
      confidence,
      processingTime
    }
  }
  /**
   * Splits raw OCR text into typed blocks, one per non-blank line.
   *
   * Each trimmed line is classified by the first matching rule, in this order:
   * reference, citation, image marker, plain text. Citation blocks have their
   * leading "[n]" marker stripped from `content` and carry it as `number`.
   *
   * @param {string} text - Raw text returned by the OCR engine.
   * @returns {Array<object>} Blocks of the form { type, content } (citations
   *   additionally have `number`), in line order.
   */
  function parseOCRText(text) {
    const classify = (content) => {
      if (isReference(content)) {
        return { type: 'reference', content }
      }
      if (isCitation(content)) {
        return {
          type: 'citation',
          content: content.replace(/^\[\d+\]\s*/, ''),
          number: extractCitationNumber(content)
        }
      }
      if (isImageMarker(content)) {
        return { type: 'image', content }
      }
      return { type: 'text', content }
    }

    return text
      .split('\n')
      .map(rawLine => rawLine.trim())
      .filter(content => content !== '')
      .map(classify)
  }
  // Helper predicates used when classifying OCR lines.

  /**
   * Tells whether a line looks like a bibliography heading or entry.
   *
   * @param {string} text - A single trimmed line.
   * @returns {boolean} true when the line starts with a reference-section
   *   heading, or matches the "number, Latin-letter text, period, 4 digits"
   *   entry pattern.
   */
  function isReference(text) {
    const startsWithHeading =
      /^参考文献/i.test(text) ||
      /^references/i.test(text) ||
      /^bibliography/i.test(text)
    if (startsWithHeading) {
      return true
    }
    return /^\[?\d+\]?\s*\.?\s*[A-Za-z].*\.\s*\d{4}/.test(text)
  }
  /**
   * Tells whether a line starts with a numeric citation marker such as "[12]".
   *
   * @param {string} text - A single trimmed line.
   * @returns {boolean} true when the line begins with "[", digits, "]".
   */
  function isCitation(text) {
    const citationMarker = /^\[\d+\]/
    return citationMarker.test(text)
  }
  /**
   * Extracts the number from a leading citation marker such as "[12]".
   *
   * @param {string} text - A single trimmed line.
   * @returns {number|null} The citation number, or null when the line does not
   *   start with a "[digits]" marker.
   */
  function extractCitationNumber(text) {
    const found = /^\[(\d+)\]/.exec(text)
    if (found === null) {
      return null
    }
    return parseInt(found[1])
  }
  /**
   * Tells whether a line starts with a figure/image caption marker.
   *
   * @param {string} text - A single trimmed line.
   * @returns {boolean} true for lines starting with "图" + number,
   *   "figure" + number (case-insensitive), or "图片".
   */
  function isImageMarker(text) {
    if (/^图\s*\d+/i.test(text)) {
      return true
    }
    if (/^figure\s*\d+/i.test(text)) {
      return true
    }
    return /^图片\d*/i.test(text)
  }
  // File preview endpoint: streams the stored file back with the MIME type
  // saved in its record.
  app.get('/api/files/:id/preview', async (req, res) => {
    try {
      const requestedId = parseInt(req.params.id)
      const file = await fileService.getFileById(requestedId)

      if (!file) {
        return res.status(404).json({ error: 'File not found' })
      }

      // The record may outlive the file itself, so verify it is still on disk.
      const isOnDisk = fs.existsSync(file.filePath)
      if (!isOnDisk) {
        return res.status(404).json({ error: 'File not found on disk' })
      }

      // Replace the default JSON Content-Type with the file's own type.
      res.setHeader('Content-Type', file.mimeType)

      const absolutePath = path.resolve(file.filePath)
      res.sendFile(absolutePath)
    } catch (error) {
      console.error('File preview error:', error)
      res.status(500).json({ error: 'Failed to get file preview' })
    }
  })
  // Thumbnail endpoint: returns a JPEG thumbnail (at most 100x100, never
  // enlarged) for image files. When thumbnail generation fails, the original
  // file is sent instead. Responds 404 for an unknown file and 400 for
  // non-image files.
  app.get('/api/files/:id/thumbnail', async (req, res) => {
    // Declared outside the try block so the fallback in the catch handler can
    // still reach it; a `const` inside `try` is not visible in `catch`.
    let file = null

    try {
      const fileId = parseInt(req.params.id, 10)
      file = await fileService.getFileById(fileId)
      if (!file) {
        return res.status(404).json({ error: 'File not found' })
      }

      // Thumbnails are generated for images only.
      if (!file.mimeType.startsWith('image/')) {
        return res.status(400).json({ error: 'Not an image file' })
      }

      const thumbnailPath = path.join(tempDir, `thumbnail-${fileId}.jpg`)

      // Generate the thumbnail.
      await sharp(file.filePath)
        .resize(100, 100, {
          fit: 'inside',
          withoutEnlargement: true
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath)

      // The global middleware presets a JSON Content-Type, so set the image
      // type explicitly (as the preview route does) before sending the file.
      res.setHeader('Content-Type', 'image/jpeg')
      res.sendFile(path.resolve(thumbnailPath))
    } catch (error) {
      console.error('Thumbnail generation error:', error)

      if (res.headersSent) {
        return
      }

      // Without a file record there is nothing to fall back to.
      if (!file || !file.filePath) {
        return res.status(500).json({ error: 'Failed to generate thumbnail' })
      }

      // Fall back to the original file when the thumbnail could not be built.
      if (file.mimeType) {
        res.setHeader('Content-Type', file.mimeType)
      }
      res.sendFile(path.resolve(file.filePath))
    }
  })
  // Health-check endpoint: reports a fixed status and service name together
  // with the current time in ISO-8601 form.
  app.get('/api/health', (req, res) => {
    const now = new Date()
    const payload = {
      status: 'OK',
      timestamp: now.toISOString(),
      service: 'file-management-api'
    }
    res.json(payload)
  })
  /**
   * Starts the HTTP server on PORT and logs the local URL once it is listening.
   */
  function startServer() {
    const announce = () => {
      console.log(`Server running on http://localhost:${PORT}`)
    }
    app.listen(PORT, announce)
  }

  module.exports = { startServer }