1. Design Diagram

[Figure: design diagram]

PDF parsing supports parsing a specified page first: the requested page is rendered ahead of schedule by reordering the parse queue.

2. Processing Flow Diagram

[Figures: processing flow diagrams]

3. Database and Redis Design

[Figure: database and Redis schema]

4. Code Implementation

The PDF parsing tool is pdfium, used through the pdfium-cli wrapper. Repository: https://github.com/klippa-app/pdfium-cli


4.1 client_pdfium.go (low-level code)
package pdf_util

import (
	"fmt"
	"os/exec"
	"path"
	"strconv"
	"support/logger"
)

type pdfium struct {
	parseTool string
	dpi       string
	maxHeight int
	maxWidth  int
}

func newPdfium(dpi int) *pdfium {
	res := &pdfium{
		parseTool: getPdfiumTool(),
		dpi:       strconv.Itoa(dpi),
	}
	logger.Debug("new pdfium, dpi: %d, programName: %s", dpi, res.parseTool)
	return res
}

func (p *pdfium) parse(log logger.ILog, filePath string, imgDir string, firstPage, lastPage int, dpi int) ([]byte, error) {

	cmd := exec.Command(getBashTool(), "-c")
	arg := fmt.Sprintf("%s render %s --dpi %d --pages %d-%d %s", getPdfiumTool(),
		filePath, dpi, firstPage, lastPage, path.Join(imgDir, "%d.jpg"))
	cmd.Args = append(cmd.Args, arg)
	log.Debug("pdfium cmd is: %s", cmd)
	res, err := cmd.CombinedOutput()
	log.Debug("pdfium exec result:%s", res)

	return res, err
}

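To make the command construction in parse concrete, here is a minimal runnable sketch of the argument string that is handed to bash -c. The tool name, paths, dpi, and page range below are made-up illustrative values; in the service they come from getPdfiumTool(), the request, and the config.

package main

import (
	"fmt"
	"path"
)

func main() {
	// illustrative inputs only; the real values come from getPdfiumTool(), the request and the config
	tool, pdfPath, imgDir := "pdfium-cli", "/tmp/sample.pdf", "/tmp/sample_img"
	dpi, firstPage, lastPage := 144, 1, 5
	arg := fmt.Sprintf("%s render %s --dpi %d --pages %d-%d %s",
		tool, pdfPath, dpi, firstPage, lastPage, path.Join(imgDir, "%d.jpg"))
	fmt.Println(arg)
	// prints: pdfium-cli render /tmp/sample.pdf --dpi 144 --pages 1-5 /tmp/sample_img/%d.jpg
}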

4.2 Business logic


dto.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"path"
	"pps/config"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"strconv"
	"support/collection/_set"
	"support/util"
	"support/web/mw"
	"time"
)

const (
	metaFileName = "meta.json"
)

const maxTaskExpireParam = 5

type requestParam struct {
	FileId             string `json:"fileId" form:"fileId" binding:"required"`
	FileOssPath        string `json:"fileOssPath" form:"fileOssPath" binding:"required"`
	PrePage            int    `json:"prePage" form:"prePage"`
	imgOssPath         string
	metaFileKey        string
	DisablePicCompress bool `json:"disablePicCompress" form:"disablePicCompress"`
}

type requestJumpParam struct {
	FileId string `json:"fileId" form:"fileId" binding:"required"`
	PageNo int    `json:"pageNo" form:"pageNo" binding:"required"`
}

func (p *requestParam) init() {
	p.imgOssPath = getImageOssPath(p.FileOssPath)
	p.metaFileKey = path.Join(p.imgOssPath, metaFileName)
}

func getImageOssPath(fileOssPath string) string {
	return fileOssPath + "_i"
}
func getMetaFileKey(fileOssPath string) string {
	return path.Join(getImageOssPath(fileOssPath), metaFileName)
}
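// Example layout (illustrative OSS key): for fileOssPath "docs/abc.pdf" the image dir is
// "docs/abc.pdf_i" and the meta file key is "docs/abc.pdf_i/meta.json"; compressed images
// go under "<image path>_z" (see getImageOssCompressDir below).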

type ParseResult struct {
	TotalPage   int     `json:"totalPage"`
	CoverWidth  float64 `json:"coverWidth"`
	CoverHeight float64 `json:"coverHeight"`
	ParseStatus int     `json:"parseStatus"`
}

func (r *ParseResult) String() string {
	return util.ConvertToJsonStr(r)
}

type parseMsg struct {
	localPdfPath       string
	remotePdfPath      string
	localImgDirPrefix  string
	remoteImgDir       string
	totalPage          int
	width              float64 // PDF document width and height
	height             float64
	metaFileKey        string
	reqTime            time.Time
	ctx                *gin.Context
	prePage            int
	imageDpi           int
	imageWidth         float64 // PDF cover image width and height
	imageHeight        float64
	disablePicCompress bool // whether image compression is disabled
	fileId             string
	fileOssPath        string
}

func toParseMsg(pdfInfo *pfc.PdfInfo, param *requestParam, ctx *gin.Context) *parseMsg {
	localImgDirPrefix := getLocalImgDirPrefix(param.FileId)
	return &parseMsg{
		localPdfPath:       pdfInfo.LocalPath,
		remotePdfPath:      pdfInfo.RemotePath,
		localImgDirPrefix:  localImgDirPrefix,
		remoteImgDir:       param.imgOssPath,
		totalPage:          pdfInfo.TotalPage,
		width:              pdfInfo.Width,
		height:             pdfInfo.Height,
		metaFileKey:        param.metaFileKey,
		ctx:                ctx,
		prePage:            param.PrePage,
		imageDpi:           config.Config.ImageDpi,
		disablePicCompress: param.DisablePicCompress,
		fileId:             param.FileId,
		fileOssPath:        param.FileOssPath,
	}
}

func (m *parseMsg) getLocalImgDir(firstPage int) string {
	return getLocalImageDir(m.localImgDirPrefix, firstPage)
}

func (m *parseMsg) getCost() time.Duration {
	return mw.GetCost(m.ctx)
}

func buildPdfParseRecord(msg *parseMsg, result *ParseResult,
	taskPage int, taskEstimateMs int64) *dao.TblPdfParseRecord {
	c := config.Config
	pending := calPendingList(msg.prePage, taskPage, result.TotalPage)
	base := dao.BasePdfParseRecord{
		Status:             dao.StParsing,
		FileId:             msg.fileId,
		FileOssPath:        msg.fileOssPath,
		Dpi:                msg.imageDpi,
		Tool:               c.ParseTool,
		Width:              result.CoverWidth,
		Height:             result.CoverHeight,
		TaskPage:           taskPage,
		TaskEstimateMs:     taskEstimateMs,
		ExpireTime:         getExpireTime(taskEstimateMs),
		RetryCount:         0,
		DisablePicCompress: msg.disablePicCompress,
	}
	return &dao.TblPdfParseRecord{
		PrePage:            msg.prePage,
		TotalPage:          result.TotalPage,
		Pending:            util.ConvertToJsonStr(pending),
		Parsing:            firstNum,
		BasePdfParseRecord: base,
	}
}

func calPendingList(prePage, taskPage, totalPage int) []int {
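	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 yields start pages
	// [5, 25, 45, 65, 85]; each task later renders pages start..min(start+taskPage-1, totalPage).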
	pending := make([]int, 0)
	for start := prePage + 1; start <= totalPage; start = start + taskPage {
		pending = append(pending, start)
	}
	return pending
}

func getExpireTime(taskEstimateMs int64) int64 {
	return util.NowMs() + maxTaskExpireParam*taskEstimateMs
}

func getLocalImgDirPrefix(fileId string) string {
	return path.Join(config.Config.BaseDir, "img", fileId)
}

func getLocalImageDir(localImgDirPrefix string, firstPage int) string {
	return path.Join(localImgDirPrefix, strconv.Itoa(firstPage))
}

func getImageOssCompressDir(imageOssPath string) string {
	return imageOssPath + "_z"
}

type JumpHis struct {
	PrePage   int   `json:"prePage"`
	TotalPage int   `json:"totalPage"`
	TaskPage  int   `json:"taskPage"`
	History   []int `json:"history"`
}

func (j *JumpHis) String() string {
	return util.ConvertToJsonStr(j)
}

func buildJumpHis(record *dao.TblPdfParseRecord, his []int) *JumpHis {
	prePage, taskPage, totalPage := record.PrePage, record.TaskPage, record.TotalPage
	totalPendingSet := _set.NewBySlice(calPendingList(prePage, taskPage, totalPage))
	var pending []int
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	pendingSet := _set.NewBySlice(pending)
	// segments that have already been parsed
	history := _set.Difference(totalPendingSet, pendingSet).Slice()
	hisSet := _set.NewBySlice(his)
	history = _set.Union(hisSet, _set.NewBySlice(history)).Slice()
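	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 gives a full pending
	// list of [5, 25, 45, 65, 85]; if the DB still holds pending=[65, 85] and the cached history
	// is [45], the merged history becomes {5, 25, 45} (set order is not guaranteed).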
	return &JumpHis{
		PrePage:   prePage,
		TaskPage:  taskPage,
		TotalPage: totalPage,
		History:   history,
	}
}

func baseRecord2ParseMsg(pdfInfo *pfc.PdfInfo, record dao.BasePdfParseRecord) *parseMsg {
	localImgDirPrefix := getLocalImgDirPrefix(record.FileId)
	return &parseMsg{
		localPdfPath:       pdfInfo.LocalPath,
		remotePdfPath:      pdfInfo.RemotePath,
		localImgDirPrefix:  localImgDirPrefix,
		remoteImgDir:       getImageOssPath(record.FileOssPath),
		width:              record.Width,
		height:             record.Height,
		metaFileKey:        getMetaFileKey(record.FileOssPath),
		imageDpi:           record.Dpi,
		disablePicCompress: record.DisablePicCompress,
		fileId:             record.FileId,
		fileOssPath:        record.FileOssPath,
	}
}
deal_parse.go
package pdf_parse_v2

import (
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	math2 "math"
	"os"
	"path"
	"pps/application/common"
	"pps/application/compress"
	util2 "pps/application/util"
	"pps/config"
	"pps/dao"
	"pps/helper/img_util"
	pfc "pps/helper/pdf_cache"
	"pps/helper/pdf_util"
	"strings"
	"support/concurrent"
	"support/http_util"
	"support/logger"
	"support/math"
	"support/oss"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

const firstNum = 1

const BigImageDpi = 96

// BestParseTime is the target time for one parse task: 15s (in milliseconds)
const BestParseTime = 15000

func DealParse(c *gin.Context) {
	common.CallBegin()
	defer common.CallEnd()

	log := mw.GetLogger(c)
	// validate request parameters
	var param requestParam
	if err := c.ShouldBindQuery(&param); err != nil {
		log.Error("DealParse request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	param.init()

	// check whether the file has already been parsed
	if pr := getParseResult(log, &param); pr != nil {
		log.Warn("skip parse as already parsed, meta: %s", pr)
		mw.RetJSON(c, pr)
		return
	}

	// fetch the PDF info (local file path, page count, width/height)
	pdfInfo, err := pfc.GetPdf(log, param.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		mw.RetFail(c, mw.ErrInner)
		return
	}
	log.Info("get pdf info finished, cost: %s, pdfInfo: %s", mw.GetCost(c), pdfInfo)
	// parse at most 1000 pages (config.Config.TotalParseCount)
	maxParseCount := config.Config.TotalParseCount
	if pdfInfo.TotalPage > maxParseCount {
		log.Error("pdf totalPage more than maxParseCount, totalPage %d, maxParseCount: %d",
			pdfInfo.TotalPage, maxParseCount)
		pdfInfo.TotalPage = maxParseCount
	}
	msg := toParseMsg(pdfInfo, &param, c)
	// pre-parse the first pages synchronously
	pr, httpErr := preParse(c, msg)
	if httpErr != nil {
		mw.RetFail(c, httpErr)
		return
	}
	// start a goroutine to parse the remaining pages
	go safe.Safego(func() {
		leftParse(c.Copy(), msg)
	}, "leftParse")
	// return the pre-parse result
	mw.RetJSON(c, pr)

}

// getParseResult: if the file has been parsed before, the image directory contains a meta file
func getParseResult(log logger.ILog, param *requestParam) *ParseResult {
	if r, _ := oss.Helper.IsObjectExist(param.metaFileKey); !r {
		return nil
	}
	res := ParseResult{}
	err := util2.DownloadData(param.metaFileKey, &res)
	if err != nil {
		log.Error("error while download meta file: %s", err)
		return nil
	}
	return &res
}

func preParse(c *gin.Context, msg *parseMsg) (r *ParseResult, he *http_util.HttpError) {

	log := mw.GetLogger(c)
	db := db2.Db(c)
	msg.prePage = util.If(msg.prePage == 0, config.Config.PreParseCount, msg.prePage)

	lastPage := math.Min(msg.prePage, msg.totalPage)
	if err := parseAndUpload(c, msg, firstNum, lastPage); err != nil {
		log.Error("pre parse failed while parse as: %s", err)
		return nil, mw.ErrInner
	}
	log.Info("pre parse upload image finished")
	parseStatus := util.If(msg.totalPage <= msg.prePage, dao.StParseSuccess, dao.StParsing)
	pr := &ParseResult{
		TotalPage:   msg.totalPage,
		CoverWidth:  msg.imageWidth,
		CoverHeight: msg.imageHeight,
		ParseStatus: parseStatus,
	}
	log.Info("pre parse set size finished, result: %s", pr)
	// upload the meta file
	if err := util2.UploadData(msg.metaFileKey, pr); err != nil {
		log.Error("pre parse failed while upload meta file as: %s", err)
		return nil, mw.ErrInner
	}
	// compute the average cost per page
	preParseCost := msg.getCost()
	log.Info("pre parse upload meta file finished cost: %s", preParseCost)
	avgPageCost := preParseCost.Milliseconds() / int64(msg.prePage)
	taskPage := math.Max(config.Config.PreParseCount, int(BestParseTime/avgPageCost))
	taskEstimateMs := int64(taskPage) * avgPageCost
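	// Example (illustrative timing): if pre-parsing prePage=4 pages took 6s, avgPageCost=1500ms,
	// so taskPage=max(PreParseCount, 15000/1500)=max(PreParseCount, 10) and taskEstimateMs=taskPage*1500.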
	e := buildPdfParseRecord(msg, pr, taskPage, taskEstimateMs)
	if err := dao.SavePdfParseRecord(db, e); err != nil {
		log.Error("SavePdfParseRecord failed as: %s", err)
		return pr, mw.ErrDb
	}
	return pr, nil
}

func parseAndUpload(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {

	log := mw.GetLogger(c)
	localImgDir := msg.getLocalImgDir(firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	defer func() {
		if err := os.RemoveAll(localImgDir); err != nil {
			log.Error("remove dir failed as: %s", err)
		}
	}()

	// render the pages to images
	res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
		firstPage, lastPage, msg.imageDpi)
	// if the failure was caused by an oversized image, retry once with a lower dpi
	res, err = checkImageParseErr(log, res, err, msg, firstPage, lastPage, localImgDir)
	if err != nil {
		return errors.Wrap(err, "parse pdf")
	}
	wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
	log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
		firstPage, lastPage, wrappedRes)

	// dpi check: if the cover image is too large, lower the dpi and re-render
	if firstPage == firstNum {
		if err := checkDpi(c, msg, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "check dpi")
		}
	}

	// upload the images
	if err := uploadImage(localImgDir, msg.remoteImgDir, firstPage, lastPage); err != nil {
		return errors.Wrap(err, "upload image")
	}

	// compress the images (enabled unless disabled by the request parameter)
	if !msg.disablePicCompress {
		if err := compressImage(c, msg.remoteImgDir, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}

	return nil
}

func uploadImage(localImgDir, remoteImgDir string, firstPage, lastPage int) error {
	// upload the images concurrently
	limit := concurrent.NewGoLimit(5)
	var err error
	for i := firstPage; i <= lastPage; i++ {
		pageNo := i // copy the loop variable so the closure does not see a changing i
		imgName := pdf_util.GetImgName(pageNo)
		localImgPath := path.Join(localImgDir, imgName)
		remoteImgPath := path.Join(remoteImgDir, imgName)
		limit.Run(func() {
			if e := oss.Helper.PutFile(remoteImgPath, localImgPath, oss.AclPublicRead); e != nil {
				err = e
			}
		})
	}
	limit.Wait()

	return err
}

func compressImage(c *gin.Context, remoteImgDir string, firstPage, lastPage int) error {

	// compress the images
	limit := concurrent.NewGoLimit(10)
	for i := firstPage; i <= lastPage; i++ {
		imgName := pdf_util.GetImgName(i)
		imgOssPath := path.Join(remoteImgDir, imgName)
		limit.RunError(func() error {
			return compress.CompressImage(c, imgOssPath)
		})
	}
	limit.Wait()

	return limit.FirstError()
}

func checkDpi(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {
	localImgDir := msg.getLocalImgDir(firstPage)
	log := mw.GetLogger(c)
	localImgPath := path.Join(localImgDir, pdf_util.GetImgName(firstNum))
	imgDim, err := img_util.GetLocalImageDim(localImgPath)
	if err != nil {
		return errors.Wrap(err, "get local image")
	}
	// record the cover width/height
	msg.imageWidth = float64(imgDim.Width)
	msg.imageHeight = float64(imgDim.Height)

	maxWidth, maxHeight, maxSize := config.Config.ImgAttrMaxValue[0], config.Config.ImgAttrMaxValue[1], config.Config.ImgAttrMaxValue[2]
	if imgDim.Width == 0 || imgDim.Height == 0 {
		return errors.New("get imageDim failed Width or Height is 0")
	}
	rw, rh, rs := float64(imgDim.Width)/maxWidth, float64(imgDim.Height)/maxHeight,
		math2.Sqrt(float64(imgDim.Width*imgDim.Height)/maxSize)
	rr := math.Max(rw, rh, rs)
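	// Example (illustrative limits): with maxWidth=2000, maxHeight=2000, maxSize=4000000 and a
	// rendered cover of 4000x3000, rw=2.0, rh=1.5, rs=sqrt(12000000/4000000)≈1.73, so rr=2.0
	// and the dpi is halved before re-rendering.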
	// the dpi needs to be lowered
	if rr > 1.0 {
		dpi := float64(msg.imageDpi) / rr
		msg.imageDpi = int(dpi)
		log.Error("checkDpi image is too large,dpi change to %d", msg.imageDpi)
		// re-render the pages to images
		res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		if err != nil {
			return errors.Wrap(err, "parse pdf")
		}

		// get the cover width/height after the dpi adjustment
		wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
		log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
			firstPage, lastPage, wrappedRes)
		imgDim, err := img_util.GetLocalImageDim(localImgPath)
		if err != nil {
			return errors.Wrap(err, "get local image")
		}
		// update the cover width/height
		msg.imageWidth = float64(imgDim.Width)
		msg.imageHeight = float64(imgDim.Height)
	}
	return nil
}
func wrapOssPath(res string, ossPath string) string {
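	// Example: wrapOssPath("page 1 ok\npage 2 ok", "docs/abc.pdf") returns
	// "page 1 ok (docs/abc.pdf)\npage 2 ok (docs/abc.pdf)".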
	res = strings.ReplaceAll(res, "\r\n", "\n")
	res = strings.Trim(res, " \n")
	lines := strings.Split(res, "\n")
	for i, line := range lines {
		lines[i] = line + " (" + ossPath + ")"
	}
	return strings.Join(lines, "\n")
}

func checkImageParseErr(log logger.ILog, res []byte, err error, msg *parseMsg, firstPage, lastPage int, localImgDir string) ([]byte, error) {
	// lower the dpi and run the parse once more
	if strings.Contains(string(res), pdf_util.PdfiumBigImageError.Error()) {
		msg.imageDpi = BigImageDpi
		log.Error("image is too large,dpi change to %d", msg.imageDpi)
		res, err = pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		return res, err
	}
	if err != nil && strings.Contains(err.Error(), "killed") {
		msg.imageDpi = BigImageDpi
		log.Error("pdf parse is failed as killed by signal,dpi change to %d", msg.imageDpi)
		res, err = pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		return res, err
	}
	return res, err
}
deal_parse_left.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	"os"
	"path"
	"pps/dao"
	"pps/helper/img_util"
	"pps/helper/pdf_util"
	"support/math"
	"support/oss"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

func leftParse(c *gin.Context, msg *parseMsg) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	for {
		record, err := dao.GetPdfParseRecordByFileId(db, msg.fileId)
		if err != nil {
			log.Error("GetPdfParseRecordByFileId failed as err:%v", err)
			return
		}
		var pending []int
		if err = json.Unmarshal([]byte(record.Pending), &pending); err != nil {
			log.Error("Unmarshal pending failed as err:%v", err)
			return
		}
		if len(pending) == 0 {
			if err := dao.UpdatePdfParseRecordStatus(db, msg.fileId, dao.StParseSuccess); err != nil {
				log.Error("UpdatePdfParseRecordStatus failed as err:%v", err)
			}
			return
		}
		// take the first pending entry as the next parse task
		firstPage := pending[0]
		lastPage := math.Min(record.TotalPage, firstPage+record.TaskPage-1)
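		// Example (illustrative values): totalPage=100, taskPage=20, firstPage=85 gives lastPage=min(100, 104)=100.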

		newPending := util.ConvertToJsonStr(pending[1:])
		expireTime := getExpireTime(record.TaskEstimateMs)
		if err := dao.UpdatePdfParseRecord(db, msg.fileId, newPending, firstPage, msg.imageDpi, expireTime); err != nil {
			log.Error("UpdatePdfParseRecordPending failed as err:%v", err)
			return
		}

		// run the parse task
		if err := parseAndUpload(c, msg, firstPage, lastPage); err != nil {
			// fall back to uploading a placeholder "failed" image to OSS
			if err := GenFailedImageUpload(c, record.BasePdfParseRecord, firstPage, lastPage); err != nil {
				log.Error("GenFailedImageUpload failed as err:%v", err)
			}
		}
	}
}

func GenFailedImageUpload(c *gin.Context, record dao.BasePdfParseRecord, firstPage, lastPage int) error {
	log := mw.GetLogger(c)

	localImgDirPrefix := getLocalImgDirPrefix(record.FileId)
	localImgDir := getLocalImageDir(localImgDirPrefix, firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	imgName := pdf_util.GetImgName(firstPage)
	localImgPath := path.Join(localImgDir, imgName)
	err := GenFailedImage(localImgPath, record.Width, record.Height)
	if err != nil {
		return err
	}
	remoteImgOssPath := getImageOssPath(record.FileOssPath)
	// upload the placeholder image
	if err := uploadImage(localImgDir, remoteImgOssPath, firstPage, firstPage); err != nil {
		return errors.Wrap(err, "upload image")
	}
	// compress the image (enabled unless disabled on the record)
	if !record.DisablePicCompress {
		if err := compressImage(c, remoteImgOssPath, firstPage, firstPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}
	// the remaining pages are not regenerated; copy the same placeholder image within OSS
	for start := firstPage + 1; start <= lastPage; start++ {
		// copy the object within OSS
		srcImg := path.Join(remoteImgOssPath, imgName)
		dstImg := path.Join(remoteImgOssPath, pdf_util.GetImgName(start))
		log.Debug("oss copy src:%s, dst:%s", srcImg, dstImg)
		err := oss.Helper.CopyObject(srcImg, dstImg)
		if err != nil {
			log.Error(" srcImg:%s, dstImg:%s oss copy failed as err:%v", srcImg, dstImg, err)
		}
		if !record.DisablePicCompress {
			srcImgCompress := getImageOssCompressDir(srcImg)
			dstImgCompress := getImageOssCompressDir(dstImg)
			err := oss.Helper.CopyFolder(srcImgCompress, dstImgCompress)
			if err != nil {
				log.Error(" srcImgCompress:%s, dstImgCompress:%s oss copy failed as err:%v",
					srcImgCompress, dstImgCompress, err)
			}
		}
	}
	return nil
}

func GenFailedImage(filePath string, width, height float64) error {
	data, err := img_util.GenFailImage(width, height)
	if err != nil {
		return err
	}
	outFile, err := os.Create(filePath)
	if err != nil {
		return err
	}
	_, _ = outFile.Write(data)
	_ = outFile.Close()
	return nil
}
deal_parse_jump.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"support/collection/_set"
	"support/database/redis"
	"support/logger"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
	"time"
)

const maxJumpCount = 5

const PpsJumpHisCachePrefix = "PPS:PDF:JUMP_FILEID_"
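// The jump history is cached under the key PpsJumpHisCachePrefix + fileId;
// CacheSetJumpHis stores it with a 1-hour TTL.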

func DealJump(c *gin.Context) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	// validate request parameters
	var param requestJumpParam
	if err := c.ShouldBindQuery(&param); err != nil {
		log.Error("DealJump request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	fileId := param.FileId
	jumpHis, _ := CacheGetJumpHis(log, fileId)
	// the jump history is cached in redis
	if len(jumpHis.History) > 0 {
		if hasParse(jumpHis, param.PageNo, false) {
			log.Debug("pageNo:%d hasParse task", param.PageNo)
			mw.RetJSON(c, "")
			return
		}
	}
	record, err := dao.GetPdfParseRecordByFileId(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseRecordByFileId failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	jumpHis = buildJumpHis(record, jumpHis.History)
	log.Debug("jumpHis:%s", jumpHis)

	if hasParse(jumpHis, param.PageNo, true) {
		log.Debug("pageNo:%d hasParse task", param.PageNo)
		mw.RetJSON(c, "")
		return
	}
	// update the history cache
	_ = CacheSetJumpHis(log, fileId, jumpHis)

	// reorder the pending task list
	pending := adjustPendingList(record, param.PageNo)
	defer func() {
		// persist the reordered pending list
		if err = dao.UpdatePdfParseRecordPending(db, fileId, util.ConvertToJsonStr(pending)); err != nil {
			log.Error("DealJump UpdatePdfParseRecordPending failed as: %s", err)
		}
	}()

	// create a jump record (bounded by maxJumpCount)
	jumpCount, err := dao.GetPdfParseJumpCount(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseJumpCount failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	if jumpCount >= maxJumpCount {
		log.Warn("currJumpCount:%d gt maxJumpCount:%d", jumpCount, maxJumpCount)
		mw.RetJSON(c, "")
		return
	}
	fromPage := pending[0]
	expireTime := getExpireTime(record.TaskEstimateMs)
	jumpRecord := dao.BuildPdfParseJumpFromParseRecord(record, fromPage, expireTime)
	// save the jump record
	if err = dao.SavePdfParseJump(db, jumpRecord); err != nil {
		log.Error("DealJump SavePdfParseJump failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}

	pending = pending[1:]
	mw.RetJSON(c, "")
	// handle the jump asynchronously
	go safe.Safego(func() {
		dealJump(c.Copy(), jumpRecord)
	}, "dealJump")
}

func CacheGetJumpHis(log logger.ILog, fileId string) (*JumpHis, error) {
	var his JumpHis
	key := PpsJumpHisCachePrefix + fileId
	value, err := redis.Get(key)
	if err != nil {
		if err != redis.ErrGetNil {
			log.Error("CacheGetJumpHis, fileId: %s, err: %s", fileId, err)
		}
		return &his, err
	}
	err = json.Unmarshal([]byte(value), &his)
	return &his, err
}

func CacheSetJumpHis(log logger.ILog, fileId string, his *JumpHis) error {
	key := PpsJumpHisCachePrefix + fileId
	value, err := json.Marshal(his)
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
		return err
	}
	err = redis.Set(key, string(value), time.Hour)
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return err
}

func hasParse(his *JumpHis, pageNo int, insert bool) bool {
	prePage, taskPage, totalPage := his.PrePage, his.TaskPage, his.TotalPage
	plist := calPendingList(prePage, taskPage, totalPage)
	// prepend 0 to represent the pre-parsed segment
	plist = append([]int{0}, plist...)
	start := calPageStart(plist, pageNo)
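	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 gives plist=[0,5,25,45,65,85];
	// pageNo=50 maps to start=45, pageNo=70 maps to start=65, and pages 1..4 map to start=0 (pre-parse).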
	hisSet := _set.NewBySlice(his.History)
	hisSet.Add(0)
	if hisSet.Has(start) {
		return true
	}
	if insert {
		// record this segment in the history
		hisSet.Add(start)
	}

	his.History = hisSet.Slice()
	return false
}

// calPageStart uses binary search to find the largest start page that is not greater than target
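// Example: calPageStart([]int{0, 5, 25, 45, 65, 85}, 30) returns 25; an exact match such as 25
// returns 25; targets below the first element return arr[0], targets above the last return arr[len-1].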
func calPageStart(arr []int, target int) int {
	// handle edge cases
	if len(arr) == 0 {
		return 0 // empty slice: return 0 (not expected in practice)
	}
	if target < arr[0] {
		return arr[0]
	}
	if target > arr[len(arr)-1] {
		return arr[len(arr)-1]
	}

	left := 0
	right := len(arr) - 1

	for left <= right {
		mid := left + (right-left)/2

		// exact match found
		if arr[mid] == target {
			return arr[mid]
		} else if arr[mid] < target {
			// check whether this is the last element less than target
			if mid == len(arr)-1 || arr[mid+1] > target {
				return arr[mid]
			}
			left = mid + 1
		} else {
			right = mid - 1
		}
	}

	return 0 // fallback; not expected to be reached
}

func adjustPendingList(record *dao.TblPdfParseRecord, pageNo int) (pending []int) {
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	if len(pending) == 0 {
		return
	}
	prePage, taskPage, totalPage := record.PrePage, record.TaskPage, record.TotalPage
	plist := calPendingList(prePage, taskPage, totalPage)
	start := calPageStart(plist, pageNo)
	// already at the front of the queue
	if start == pending[0] {
		return
	}
	pending = rotateSlice(pending, start)
	return
}

func rotateSlice(arr []int, value int) []int {
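	// rotateSlice moves value and everything after it to the front of the slice.
	// Example: rotateSlice([]int{5, 25, 45, 65, 85}, 45) returns [45 65 85 5 25].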
	// find the position of value
	index := -1
	for i, v := range arr {
		if v == value {
			index = i
			break
		}
	}

	// value not found: return the original slice
	if index == -1 {
		return arr
	}

	// rotate the slice by concatenating the two halves
	result := append(arr[index:], arr[:index]...)
	return result
}

func dealJump(c *gin.Context, record *dao.TblParseJump) {
	log := mw.GetLogger(c)
	db := db2.Db(c)

	// fetch the PDF info (local file path, page count, width/height)
	pdfInfo, err := pfc.GetPdf(log, record.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		return
	}
	msg := baseRecord2ParseMsg(pdfInfo, record.BasePdfParseRecord)
	// run the parse task
	if err := parseAndUpload(c, msg, record.FromPage, record.ToPage); err != nil {
		// fall back to uploading a placeholder "failed" image to OSS
		if err := GenFailedImageUpload(c, record.BasePdfParseRecord, record.FromPage, record.ToPage); err != nil {
			log.Error("GenFailedImageUpload failed as err:%v", err)
		}
	}
	if err := dao.UpdateJumpSuccess(db, record.FileId); err != nil {
		log.Error("UpdateJumpSuccess failed as err:%v", err)
	}
}

5. Code Walkthrough

[Figure: jump-parse flow]

Jumping mainly reorders the pending parse queue; if a goroutine is available, one is started immediately to parse the jumped-to segment.

[Figure: leftover-parse flow]

The leftover parse loop simply takes a start page from the pending field in the database and parses that segment.

6. Bonus: An Introduction to PDFium

PDFium is an open-source PDF rendering engine for parsing and displaying PDF documents. It was originally developed by Foxit Software and is now maintained and released by Google as part of the Chromium project. PDFium is widely used in the built-in PDF viewers of browsers such as Google Chrome, as well as in other applications that need to process PDF documents.

Main features of PDFium
  1. PDF rendering: PDFium can render PDF pages into images (e.g. bitmaps) and supports efficient scaling and rotation.
  2. Text extraction: it can extract the text content of a PDF document for retrieval and search-engine indexing.
  3. Form handling: it supports filling, submitting, and extracting PDF form data.
  4. Annotation handling: it supports reading and managing PDF annotations such as highlights, comments, and signatures.
  5. Image and vector graphics: it can parse and render the images and vector graphics embedded in PDF files.
  6. Security: it supports parsing and processing encrypted PDF documents.
Key characteristics
  • Cross-platform: PDFium can be compiled and run on Windows, macOS, Linux, and other platforms.
  • High performance: it is optimized for performance and handles large, complex PDF documents efficiently.
  • Modular: its modular architecture lets you compile and use only the modules you need.
  • Rich API: it exposes a rich API for complex PDF operations and custom extensions.
How to use PDFium
1. Download and build

PDFium is an open-source project; you can fetch the source code from its Git repository:

git clone https://pdfium.googlesource.com/pdfium

You can configure and build it for different platforms and needs by following the documentation; the official build is driven by GN and Ninja through Chromium's depot_tools.

2. Basic usage

Once the build is done, you can use PDFium in your C/C++ project. The following minimal example loads a PDF document and renders its first page:

#include "public/fpdfview.h"

int main(int argc, char** argv) {
    // initialize PDFium
    FPDF_InitLibrary();

    // load the PDF document
    FPDF_DOCUMENT doc = FPDF_LoadDocument("example.pdf", nullptr);
    if (!doc) {
        FPDF_DestroyLibrary();
        return -1;
    }

    // load the first page
    FPDF_PAGE page = FPDF_LoadPage(doc, 0);
    if (!page) {
        FPDF_CloseDocument(doc);
        FPDF_DestroyLibrary();
        return -1;
    }

    // render the first page into a bitmap
    int width = FPDF_GetPageWidth(page);
    int height = FPDF_GetPageHeight(page);
    FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, 0);
    FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF); // white background
    FPDF_RenderPageBitmap(bitmap, page, 0, 0, width, height, 0, 0);

    // save the bitmap to a file or process it further
    // ...

    // release resources
    FPDFBitmap_Destroy(bitmap);
    FPDF_ClosePage(page);
    FPDF_CloseDocument(doc);
    FPDF_DestroyLibrary();

    return 0;
}
Common use cases
  1. Embedded PDF viewer: embed a custom PDF viewing component in a desktop or mobile application to give users viewing and interaction features.
  2. Server-side PDF processing: use PDFium on the server to batch-process PDF documents and automate document management workflows.
  3. PDF data extraction: extract text, images, and other data from PDF documents for analysis or data mining.
Resources and community

As a powerful and flexible PDF engine, PDFium is widely used wherever PDF documents need to be processed. If you have specific requirements, digging into PDFium's documentation and examples and adapting them to your situation will give the best results.