1. Design Diagram

[Figure: design diagram]

PDF parsing supports parsing a specified page first: the requested page is rendered ahead of schedule by reordering the parse queue.

2. Processing Flow Diagram

[Figures: processing flow diagrams]

3. Database and Redis Design

[Figure: database and Redis schema]

4. Code Implementation

The PDF parsing tool is pdfium, used through the pdfium-cli wrapper. Repository: https://github.com/klippa-app/pdfium-cli


4.1 client_pdfium.go (low-level code)
package pdf_util

import (
	"fmt"
	"os/exec"
	"path"
	"strconv"
	"support/logger"
)

type pdfium struct {
	parseTool string
	dpi       string
	maxHeight int
	maxWidth  int
}

func newPdfium(dpi int) *pdfium {
	res := &pdfium{
		parseTool: getPdfiumTool(),
		dpi:       strconv.Itoa(dpi),
	}
	logger.Debug("new pdfium, dpi: %d, programName: %s", dpi, res.parseTool)
	return res
}

func (p *pdfium) parse(log logger.ILog, filePath string, imgDir string, firstPage, lastPage int, dpi int) ([]byte, error) {

	cmd := exec.Command(getBashTool(), "-c")
	arg := fmt.Sprintf("%s render %s --dpi %d --pages %d-%d %s", getPdfiumTool(),
		filePath, dpi, firstPage, lastPage, path.Join(imgDir, "%d.jpg"))
	cmd.Args = append(cmd.Args, arg)
	log.Debug("pdfium cmd is: %s", cmd)
	res, err := cmd.CombinedOutput()
	log.Debug("pdfium exec result:%s", res)

	return res, err
}

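To make the command construction in parse concrete, here is a minimal runnable sketch of the argument string that is handed to bash -c. The tool name, paths, dpi, and page range below are made-up illustrative values; in the service they come from getPdfiumTool(), the request, and the config.

package main

import (
	"fmt"
	"path"
)

func main() {
	// illustrative inputs only; the real values come from getPdfiumTool(), the request and the config
	tool, pdfPath, imgDir := "pdfium-cli", "/tmp/sample.pdf", "/tmp/sample_img"
	dpi, firstPage, lastPage := 144, 1, 5
	arg := fmt.Sprintf("%s render %s --dpi %d --pages %d-%d %s",
		tool, pdfPath, dpi, firstPage, lastPage, path.Join(imgDir, "%d.jpg"))
	fmt.Println(arg)
	// prints: pdfium-cli render /tmp/sample.pdf --dpi 144 --pages 1-5 /tmp/sample_img/%d.jpg
}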

4.2 Business logic


dto.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"path"
	"pps/config"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"strconv"
	"support/collection/_set"
	"support/util"
	"support/web/mw"
	"time"
)

const (
	metaFileName = "meta.json"
)

const maxTaskExpireParam = 5

type requestParam struct {
	FileId             string `json:"fileId" form:"fileId" binding:"required"`
	FileOssPath        string `json:"fileOssPath" form:"fileOssPath" binding:"required"`
	PrePage            int    `json:"prePage" form:"prePage"`
	imgOssPath         string
	metaFileKey        string
	DisablePicCompress bool `json:"disablePicCompress" form:"disablePicCompress"`
}

type requestJumpParam struct {
	FileId string `json:"fileId" form:"fileId" binding:"required"`
	PageNo int    `json:"pageNo" form:"pageNo" binding:"required"`
}

func (p *requestParam) init() {
	p.imgOssPath = getImageOssPath(p.FileOssPath)
	p.metaFileKey = path.Join(p.imgOssPath, metaFileName)
}

func getImageOssPath(fileOssPath string) string {
	return fileOssPath + "_i"
}
func getMetaFileKey(fileOssPath string) string {
	return path.Join(getImageOssPath(fileOssPath), metaFileName)
}
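// Example layout (illustrative OSS key): for fileOssPath "docs/abc.pdf" the image dir is
// "docs/abc.pdf_i" and the meta file key is "docs/abc.pdf_i/meta.json"; compressed images
// go under "<image path>_z" (see getImageOssCompressDir below).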

type ParseResult struct {
	TotalPage   int     `json:"totalPage"`
	CoverWidth  float64 `json:"coverWidth"`
	CoverHeight float64 `json:"coverHeight"`
	ParseStatus int     `json:"parseStatus"`
}

func (r *ParseResult) String() string {
	return util.ConvertToJsonStr(r)
}

type parseMsg struct {
	localPdfPath       string
	remotePdfPath      string
	localImgDirPrefix  string
	remoteImgDir       string
	totalPage          int
	width              float64 // PDF document width and height
	height             float64
	metaFileKey        string
	reqTime            time.Time
	ctx                *gin.Context
	prePage            int
	imageDpi           int
	imageWidth         float64 // PDF cover image width and height
	imageHeight        float64
	disablePicCompress bool // whether image compression is disabled
	fileId             string
	fileOssPath        string
}

func toParseMsg(pdfInfo *pfc.PdfInfo, param *requestParam, ctx *gin.Context) *parseMsg {
	localImgDirPrefix := getLocalImgDirPrefix(param.FileId)
	return &parseMsg{
		localPdfPath:       pdfInfo.LocalPath,
		remotePdfPath:      pdfInfo.RemotePath,
		localImgDirPrefix:  localImgDirPrefix,
		remoteImgDir:       param.imgOssPath,
		totalPage:          pdfInfo.TotalPage,
		width:              pdfInfo.Width,
		height:             pdfInfo.Height,
		metaFileKey:        param.metaFileKey,
		ctx:                ctx,
		prePage:            param.PrePage,
		imageDpi:           config.Config.ImageDpi,
		disablePicCompress: param.DisablePicCompress,
		fileId:             param.FileId,
		fileOssPath:        param.FileOssPath,
	}
}

func (m *parseMsg) getLocalImgDir(firstPage int) string {
	return getLocalImageDir(m.localImgDirPrefix, firstPage)
}

func (m *parseMsg) getCost() time.Duration {
	return mw.GetCost(m.ctx)
}

func buildPdfParseRecord(msg *parseMsg, result *ParseResult,
	taskPage int, taskEstimateMs int64) *dao.TblPdfParseRecord {
	c := config.Config
	pending := calPendingList(msg.prePage, taskPage, result.TotalPage)
	base := dao.BasePdfParseRecord{
		Status:             dao.StParsing,
		FileId:             msg.fileId,
		FileOssPath:        msg.fileOssPath,
		Dpi:                msg.imageDpi,
		Tool:               c.ParseTool,
		Width:              result.CoverWidth,
		Height:             result.CoverHeight,
		TaskPage:           taskPage,
		TaskEstimateMs:     taskEstimateMs,
		ExpireTime:         getExpireTime(taskEstimateMs),
		RetryCount:         0,
		DisablePicCompress: msg.disablePicCompress,
	}
	return &dao.TblPdfParseRecord{
		PrePage:            msg.prePage,
		TotalPage:          result.TotalPage,
		Pending:            util.ConvertToJsonStr(pending),
		Parsing:            firstNum,
		BasePdfParseRecord: base,
	}
}

func calPendingList(prePage, taskPage, totalPage int) []int {
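	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 yields start pages
	// [5, 25, 45, 65, 85]; each task later renders pages start..min(start+taskPage-1, totalPage).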
	pending := make([]int, 0)
	for start := prePage + 1; start <= totalPage; start = start + taskPage {
		pending = append(pending, start)
	}
	return pending
}

func getExpireTime(taskEstimateMs int64) int64 {
	return util.NowMs() + maxTaskExpireParam*taskEstimateMs
}

func getLocalImgDirPrefix(fileId string) string {
	return path.Join(config.Config.BaseDir, "img", fileId)
}

func getLocalImageDir(localImgDirPrefix string, firstPage int) string {
	return path.Join(localImgDirPrefix, strconv.Itoa(firstPage))
}

func getImageOssCompressDir(imageOssPath string) string {
	return imageOssPath + "_z"
}

type JumpHis struct {
	PrePage   int   `json:"prePage"`
	TotalPage int   `json:"totalPage"`
	TaskPage  int   `json:"taskPage"`
	History   []int `json:"history"`
}

func (j *JumpHis) String() string {
	return util.ConvertToJsonStr(j)
}

func buildJumpHis(record *dao.TblPdfParseRecord, his []int) *JumpHis {
	prePage, taskPage, totalPage := record.PrePage, record.TaskPage, record.TotalPage
	totalPendingSet := _set.NewBySlice(calPendingList(prePage, taskPage, totalPage))
	var pending []int
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	pendingSet := _set.NewBySlice(pending)
	// segments that have already been parsed
	history := _set.Difference(totalPendingSet, pendingSet).Slice()
	hisSet := _set.NewBySlice(his)
	history = _set.Union(hisSet, _set.NewBySlice(history)).Slice()
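	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 gives a full pending
	// list of [5, 25, 45, 65, 85]; if the DB still holds pending=[65, 85] and the cached history
	// is [45], the merged history becomes {5, 25, 45} (set order is not guaranteed).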
	return &JumpHis{
		PrePage:   prePage,
		TaskPage:  taskPage,
		TotalPage: totalPage,
		History:   history,
	}
}

func baseRecord2ParseMsg(pdfInfo *pfc.PdfInfo, record dao.BasePdfParseRecord) *parseMsg {
	localImgDirPrefix := getLocalImgDirPrefix(record.FileId)
	return &parseMsg{
		localPdfPath:       pdfInfo.LocalPath,
		remotePdfPath:      pdfInfo.RemotePath,
		localImgDirPrefix:  localImgDirPrefix,
		remoteImgDir:       getImageOssPath(record.FileOssPath),
		width:              record.Width,
		height:             record.Height,
		metaFileKey:        getMetaFileKey(record.FileOssPath),
		imageDpi:           record.Dpi,
		disablePicCompress: record.DisablePicCompress,
		fileId:             record.FileId,
		fileOssPath:        record.FileOssPath,
	}
}
deal_parse.go
package pdf_parse_v2

import (
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	math2 "math"
	"os"
	"path"
	"pps/application/common"
	"pps/application/compress"
	util2 "pps/application/util"
	"pps/config"
	"pps/dao"
	"pps/helper/img_util"
	pfc "pps/helper/pdf_cache"
	"pps/helper/pdf_util"
	"strings"
	"support/concurrent"
	"support/http_util"
	"support/logger"
	"support/math"
	"support/oss"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

const firstNum = 1

const BigImageDpi = 96

// BestParseTime is the target time for one parse task: 15s (in milliseconds)
const BestParseTime = 15000

func DealParse(c *gin.Context) {
	common.CallBegin()
	defer common.CallEnd()

	log := mw.GetLogger(c)
	// validate request parameters
	var param requestParam
	if err := c.ShouldBindQuery(&param); err != nil {
		log.Error("DealParse request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	param.init()

	// check whether the file has already been parsed
	if pr := getParseResult(log, &param); pr != nil {
		log.Warn("skip parse as already parsed, meta: %s", pr)
		mw.RetJSON(c, pr)
		return
	}

	// fetch the PDF info (local file path, page count, width/height)
	pdfInfo, err := pfc.GetPdf(log, param.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		mw.RetFail(c, mw.ErrInner)
		return
	}
	log.Info("get pdf info finished, cost: %s, pdfInfo: %s", mw.GetCost(c), pdfInfo)
	// parse at most 1000 pages (config.Config.TotalParseCount)
	maxParseCount := config.Config.TotalParseCount
	if pdfInfo.TotalPage > maxParseCount {
		log.Error("pdf totalPage more than maxParseCount, totalPage %d, maxParseCount: %d",
			pdfInfo.TotalPage, maxParseCount)
		pdfInfo.TotalPage = maxParseCount
	}
	msg := toParseMsg(pdfInfo, &param, c)
	// pre-parse the first pages synchronously
	pr, httpErr := preParse(c, msg)
	if httpErr != nil {
		mw.RetFail(c, httpErr)
		return
	}
	// start a goroutine to parse the remaining pages
	go safe.Safego(func() {
		leftParse(c.Copy(), msg)
	}, "leftParse")
	// return the pre-parse result
	mw.RetJSON(c, pr)

}

// getParseResult: if the file has been parsed before, the image directory contains a meta file
func getParseResult(log logger.ILog, param *requestParam) *ParseResult {
	if r, _ := oss.Helper.IsObjectExist(param.metaFileKey); !r {
		return nil
	}
	res := ParseResult{}
	err := util2.DownloadData(param.metaFileKey, &res)
	if err != nil {
		log.Error("error while download meta file: %s", err)
		return nil
	}
	return &res
}

func preParse(c *gin.Context, msg *parseMsg) (r *ParseResult, he *http_util.HttpError) {

	log := mw.GetLogger(c)
	db := db2.Db(c)
	msg.prePage = util.If(msg.prePage == 0, config.Config.PreParseCount, msg.prePage)

	lastPage := math.Min(msg.prePage, msg.totalPage)
	if err := parseAndUpload(c, msg, firstNum, lastPage); err != nil {
		log.Error("pre parse failed while parse as: %s", err)
		return nil, mw.ErrInner
	}
	log.Info("pre parse upload image finished")
	parseStatus := util.If(msg.totalPage <= msg.prePage, dao.StParseSuccess, dao.StParsing)
	pr := &ParseResult{
		TotalPage:   msg.totalPage,
		CoverWidth:  msg.imageWidth,
		CoverHeight: msg.imageHeight,
		ParseStatus: parseStatus,
	}
	log.Info("pre parse set size finished, result: %s", pr)
	// upload the meta file
	if err := util2.UploadData(msg.metaFileKey, pr); err != nil {
		log.Error("pre parse failed while upload meta file as: %s", err)
		return nil, mw.ErrInner
	}
	// compute the average cost per page
	preParseCost := msg.getCost()
	log.Info("pre parse upload meta file finished cost: %s", preParseCost)
	avgPageCost := preParseCost.Milliseconds() / int64(msg.prePage)
	taskPage := math.Max(config.Config.PreParseCount, int(BestParseTime/avgPageCost))
	taskEstimateMs := int64(taskPage) * avgPageCost
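	// Example (illustrative timing): if pre-parsing prePage=4 pages took 6s, avgPageCost=1500ms,
	// so taskPage=max(PreParseCount, 15000/1500)=max(PreParseCount, 10) and taskEstimateMs=taskPage*1500.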
	e := buildPdfParseRecord(msg, pr, taskPage, taskEstimateMs)
	if err := dao.SavePdfParseRecord(db, e); err != nil {
		log.Error("SavePdfParseRecord failed as: %s", err)
		return pr, mw.ErrDb
	}
	return pr, nil
}

func parseAndUpload(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {

	log := mw.GetLogger(c)
	localImgDir := msg.getLocalImgDir(firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	defer func() {
		if err := os.RemoveAll(localImgDir); err != nil {
			log.Error("remove dir failed as: %s", err)
		}
	}()

	// render the pages to images
	res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
		firstPage, lastPage, msg.imageDpi)
	// if the failure was caused by an oversized image, retry once with a lower dpi
	res, err = checkImageParseErr(log, res, err, msg, firstPage, lastPage, localImgDir)
	if err != nil {
		return errors.Wrap(err, "parse pdf")
	}
	wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
	log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
		firstPage, lastPage, wrappedRes)

	// dpi check: if the cover image is too large, lower the dpi and re-render
	if firstPage == firstNum {
		if err := checkDpi(c, msg, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "check dpi")
		}
	}

	// upload the images
	if err := uploadImage(localImgDir, msg.remoteImgDir, firstPage, lastPage); err != nil {
		return errors.Wrap(err, "upload image")
	}

	// compress the images (enabled unless disabled by the request parameter)
	if !msg.disablePicCompress {
		if err := compressImage(c, msg.remoteImgDir, firstPage, lastPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}

	return nil
}

func uploadImage(localImgDir, remoteImgDir string, firstPage, lastPage int) error {
	// upload the images concurrently
	limit := concurrent.NewGoLimit(5)
	var err error
	for i := firstPage; i <= lastPage; i++ {
		pageNo := i // copy the loop variable so the closure does not see a changing i
		imgName := pdf_util.GetImgName(pageNo)
		localImgPath := path.Join(localImgDir, imgName)
		remoteImgPath := path.Join(remoteImgDir, imgName)
		limit.Run(func() {
			if e := oss.Helper.PutFile(remoteImgPath, localImgPath, oss.AclPublicRead); e != nil {
				err = e
			}
		})
	}
	limit.Wait()

	return err
}

func compressImage(c *gin.Context, remoteImgDir string, firstPage, lastPage int) error {

	// compress the images
	limit := concurrent.NewGoLimit(10)
	for i := firstPage; i <= lastPage; i++ {
		imgName := pdf_util.GetImgName(i)
		imgOssPath := path.Join(remoteImgDir, imgName)
		limit.RunError(func() error {
			return compress.CompressImage(c, imgOssPath)
		})
	}
	limit.Wait()

	return limit.FirstError()
}

func checkDpi(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {
	localImgDir := msg.getLocalImgDir(firstPage)
	log := mw.GetLogger(c)
	localImgPath := path.Join(localImgDir, pdf_util.GetImgName(firstNum))
	imgDim, err := img_util.GetLocalImageDim(localImgPath)
	if err != nil {
		return errors.Wrap(err, "get local image")
	}
	// record the cover width/height
	msg.imageWidth = float64(imgDim.Width)
	msg.imageHeight = float64(imgDim.Height)

	maxWidth, maxHeight, maxSize := config.Config.ImgAttrMaxValue[0], config.Config.ImgAttrMaxValue[1], config.Config.ImgAttrMaxValue[2]
	if imgDim.Width == 0 || imgDim.Height == 0 {
		return errors.New("get imageDim failed Width or Height is 0")
	}
	rw, rh, rs := float64(imgDim.Width)/maxWidth, float64(imgDim.Height)/maxHeight,
		math2.Sqrt(float64(imgDim.Width*imgDim.Height)/maxSize)
	rr := math.Max(rw, rh, rs)
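	// Example (illustrative limits): with maxWidth=2000, maxHeight=2000, maxSize=4000000 and a
	// rendered cover of 4000x3000, rw=2.0, rh=1.5, rs=sqrt(12000000/4000000)≈1.73, so rr=2.0
	// and the dpi is halved before re-rendering.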
	// the dpi needs to be lowered
	if rr > 1.0 {
		dpi := float64(msg.imageDpi) / rr
		msg.imageDpi = int(dpi)
		log.Error("checkDpi image is too large,dpi change to %d", msg.imageDpi)
		// re-render the pages to images
		res, err := pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		if err != nil {
			return errors.Wrap(err, "parse pdf")
		}

		// get the cover width/height after the dpi adjustment
		wrappedRes := wrapOssPath(string(res), msg.remotePdfPath)
		log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
			firstPage, lastPage, wrappedRes)
		imgDim, err := img_util.GetLocalImageDim(localImgPath)
		if err != nil {
			return errors.Wrap(err, "get local image")
		}
		// update the cover width/height
		msg.imageWidth = float64(imgDim.Width)
		msg.imageHeight = float64(imgDim.Height)
	}
	return nil
}
func wrapOssPath(res string, ossPath string) string {
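	// Example: wrapOssPath("page 1 ok\npage 2 ok", "docs/abc.pdf") returns
	// "page 1 ok (docs/abc.pdf)\npage 2 ok (docs/abc.pdf)".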
	res = strings.ReplaceAll(res, "\r\n", "\n")
	res = strings.Trim(res, " \n")
	lines := strings.Split(res, "\n")
	for i, line := range lines {
		lines[i] = line + " (" + ossPath + ")"
	}
	return strings.Join(lines, "\n")
}

func checkImageParseErr(log logger.ILog, res []byte, err error, msg *parseMsg, firstPage, lastPage int, localImgDir string) ([]byte, error) {
	// lower the dpi and run the parse once more
	if strings.Contains(string(res), pdf_util.PdfiumBigImageError.Error()) {
		msg.imageDpi = BigImageDpi
		log.Error("image is too large,dpi change to %d", msg.imageDpi)
		res, err = pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		return res, err
	}
	if err != nil && strings.Contains(err.Error(), "killed") {
		msg.imageDpi = BigImageDpi
		log.Error("pdf parse is failed as killed by signal,dpi change to %d", msg.imageDpi)
		res, err = pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
			firstPage, lastPage, msg.imageDpi)
		return res, err
	}
	return res, err
}
deal_parse_left.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"github.com/pkg/errors"
	"os"
	"path"
	"pps/dao"
	"pps/helper/img_util"
	"pps/helper/pdf_util"
	"support/math"
	"support/oss"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
)

func leftParse(c *gin.Context, msg *parseMsg) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	for {
		record, err := dao.GetPdfParseRecordByFileId(db, msg.fileId)
		if err != nil {
			log.Error("GetPdfParseRecordByFileId failed as err:%v", err)
			return
		}
		var pending []int
		if err = json.Unmarshal([]byte(record.Pending), &pending); err != nil {
			log.Error("Unmarshal pending failed as err:%v", err)
			return
		}
		if len(pending) == 0 {
			if err := dao.UpdatePdfParseRecordStatus(db, msg.fileId, dao.StParseSuccess); err != nil {
				log.Error("UpdatePdfParseRecordStatus failed as err:%v", err)
			}
			return
		}
		// take the first pending entry as the next parse task
		firstPage := pending[0]
		lastPage := math.Min(record.TotalPage, firstPage+record.TaskPage-1)
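		// Example (illustrative values): totalPage=100, taskPage=20, firstPage=85 gives lastPage=min(100, 104)=100.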

		newPending := util.ConvertToJsonStr(pending[1:])
		expireTime := getExpireTime(record.TaskEstimateMs)
		if err := dao.UpdatePdfParseRecord(db, msg.fileId, newPending, firstPage, msg.imageDpi, expireTime); err != nil {
			log.Error("UpdatePdfParseRecordPending failed as err:%v", err)
			return
		}

		// run the parse task
		if err := parseAndUpload(c, msg, firstPage, lastPage); err != nil {
			// fall back to uploading a placeholder "failed" image to OSS
			if err := GenFailedImageUpload(c, record.BasePdfParseRecord, firstPage, lastPage); err != nil {
				log.Error("GenFailedImageUpload failed as err:%v", err)
			}
		}
	}
}

func GenFailedImageUpload(c *gin.Context, record dao.BasePdfParseRecord, firstPage, lastPage int) error {
	log := mw.GetLogger(c)

	localImgDirPrefix := getLocalImgDirPrefix(record.FileId)
	localImgDir := getLocalImageDir(localImgDirPrefix, firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	imgName := pdf_util.GetImgName(firstPage)
	localImgPath := path.Join(localImgDir, imgName)
	err := GenFailedImage(localImgPath, record.Width, record.Height)
	if err != nil {
		return err
	}
	remoteImgOssPath := getImageOssPath(record.FileOssPath)
	// upload the placeholder image
	if err := uploadImage(localImgDir, remoteImgOssPath, firstPage, firstPage); err != nil {
		return errors.Wrap(err, "upload image")
	}
	// compress the image (enabled unless disabled on the record)
	if !record.DisablePicCompress {
		if err := compressImage(c, remoteImgOssPath, firstPage, firstPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}
	// the remaining pages are not regenerated; copy the same placeholder image within OSS
	for start := firstPage + 1; start <= lastPage; start++ {
		// copy the object within OSS
		srcImg := path.Join(remoteImgOssPath, imgName)
		dstImg := path.Join(remoteImgOssPath, pdf_util.GetImgName(start))
		log.Debug("oss copy src:%s, dst:%s", srcImg, dstImg)
		err := oss.Helper.CopyObject(srcImg, dstImg)
		if err != nil {
			log.Error(" srcImg:%s, dstImg:%s oss copy failed as err:%v", srcImg, dstImg, err)
		}
		if !record.DisablePicCompress {
			srcImgCompress := getImageOssCompressDir(srcImg)
			dstImgCompress := getImageOssCompressDir(dstImg)
			err := oss.Helper.CopyFolder(srcImgCompress, dstImgCompress)
			if err != nil {
				log.Error(" srcImgCompress:%s, dstImgCompress:%s oss copy failed as err:%v",
					srcImgCompress, dstImgCompress, err)
			}
		}
	}
	return nil
}

func GenFailedImage(filePath string, width, height float64) error {
	data, err := img_util.GenFailImage(width, height)
	if err != nil {
		return err
	}
	outFile, err := os.Create(filePath)
	if err != nil {
		return err
	}
	_, _ = outFile.Write(data)
	_ = outFile.Close()
	return nil
}
deal_parse_jump.go
package pdf_parse_v2

import (
	"encoding/json"
	"github.com/gin-gonic/gin"
	"pps/dao"
	pfc "pps/helper/pdf_cache"
	"support/collection/_set"
	"support/database/redis"
	"support/logger"
	"support/safe"
	"support/util"
	db2 "support/web/db"
	"support/web/mw"
	"time"
)

const maxJumpCount = 5

const PpsJumpHisCachePrefix = "PPS:PDF:JUMP_FILEID_"
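// The jump history is cached under the key PpsJumpHisCachePrefix + fileId;
// CacheSetJumpHis stores it with a 1-hour TTL.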

func DealJump(c *gin.Context) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	// validate request parameters
	var param requestJumpParam
	if err := c.ShouldBindQuery(&param); err != nil {
		log.Error("DealJump request param failed as: %s", err)
		mw.RetFail(c, mw.ErrBadParam)
		return
	}
	fileId := param.FileId
	jumpHis, _ := CacheGetJumpHis(log, fileId)
	// the jump history is cached in redis
	if len(jumpHis.History) > 0 {
		if hasParse(jumpHis, param.PageNo, false) {
			log.Debug("pageNo:%d hasParse task", param.PageNo)
			mw.RetJSON(c, "")
			return
		}
	}
	record, err := dao.GetPdfParseRecordByFileId(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseRecordByFileId failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	jumpHis = buildJumpHis(record, jumpHis.History)
	log.Debug("jumpHis:%s", jumpHis)

	if hasParse(jumpHis, param.PageNo, true) {
		log.Debug("pageNo:%d hasParse task", param.PageNo)
		mw.RetJSON(c, "")
		return
	}
	// update the history cache
	_ = CacheSetJumpHis(log, fileId, jumpHis)

	// reorder the pending task list
	pending := adjustPendingList(record, param.PageNo)
	defer func() {
		// persist the reordered pending list
		if err = dao.UpdatePdfParseRecordPending(db, fileId, util.ConvertToJsonStr(pending)); err != nil {
			log.Error("DealJump UpdatePdfParseRecordPending failed as: %s", err)
		}
	}()

	// create a jump record (bounded by maxJumpCount)
	jumpCount, err := dao.GetPdfParseJumpCount(db, fileId)
	if err != nil {
		log.Error("DealJump GetPdfParseJumpCount failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}
	if jumpCount >= maxJumpCount {
		log.Warn("currJumpCount:%d gt maxJumpCount:%d", jumpCount, maxJumpCount)
		mw.RetJSON(c, "")
		return
	}
	fromPage := pending[0]
	expireTime := getExpireTime(record.TaskEstimateMs)
	jumpRecord := dao.BuildPdfParseJumpFromParseRecord(record, fromPage, expireTime)
	// save the jump record
	if err = dao.SavePdfParseJump(db, jumpRecord); err != nil {
		log.Error("DealJump SavePdfParseJump failed as: %s", err)
		mw.RetFail(c, mw.ErrDb)
		return
	}

	pending = pending[1:]
	mw.RetJSON(c, "")
	// handle the jump asynchronously
	go safe.Safego(func() {
		dealJump(c.Copy(), jumpRecord)
	}, "dealJump")
}

func CacheGetJumpHis(log logger.ILog, fileId string) (*JumpHis, error) {
	var his JumpHis
	key := PpsJumpHisCachePrefix + fileId
	value, err := redis.Get(key)
	if err != nil {
		if err != redis.ErrGetNil {
			log.Error("CacheGetJumpHis, fileId: %s, err: %s", fileId, err)
		}
		return &his, err
	}
	err = json.Unmarshal([]byte(value), &his)
	return &his, err
}

func CacheSetJumpHis(log logger.ILog, fileId string, his *JumpHis) error {
	key := PpsJumpHisCachePrefix + fileId
	value, err := json.Marshal(his)
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
		return err
	}
	err = redis.Set(key, string(value), time.Hour)
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return err
}

func hasParse(his *JumpHis, pageNo int, insert bool) bool {
	prePage, taskPage, totalPage := his.PrePage, his.TaskPage, his.TotalPage
	plist := calPendingList(prePage, taskPage, totalPage)
	// prepend 0 to represent the pre-parsed segment
	plist = append([]int{0}, plist...)
	start := calPageStart(plist, pageNo)
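	// Example (illustrative values): prePage=4, taskPage=20, totalPage=100 gives plist=[0,5,25,45,65,85];
	// pageNo=50 maps to start=45, pageNo=70 maps to start=65, and pages 1..4 map to start=0 (pre-parse).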
	hisSet := _set.NewBySlice(his.History)
	hisSet.Add(0)
	if hisSet.Has(start) {
		return true
	}
	if insert {
		// record this segment in the history
		hisSet.Add(start)
	}

	his.History = hisSet.Slice()
	return false
}

// calPageStart uses binary search to find the largest start page that is not greater than target
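// Example: calPageStart([]int{0, 5, 25, 45, 65, 85}, 30) returns 25; an exact match such as 25
// returns 25; targets below the first element return arr[0], targets above the last return arr[len-1].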
func calPageStart(arr []int, target int) int {
	// handle edge cases
	if len(arr) == 0 {
		return 0 // empty slice: return 0 (not expected in practice)
	}
	if target < arr[0] {
		return arr[0]
	}
	if target > arr[len(arr)-1] {
		return arr[len(arr)-1]
	}

	left := 0
	right := len(arr) - 1

	for left <= right {
		mid := left + (right-left)/2

		// exact match found
		if arr[mid] == target {
			return arr[mid]
		} else if arr[mid] < target {
			// check whether this is the last element less than target
			if mid == len(arr)-1 || arr[mid+1] > target {
				return arr[mid]
			}
			left = mid + 1
		} else {
			right = mid - 1
		}
	}

	return 0 // fallback; not expected to be reached
}

func adjustPendingList(record *dao.TblPdfParseRecord, pageNo int) (pending []int) {
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	if len(pending) == 0 {
		return
	}
	prePage, taskPage, totalPage := record.PrePage, record.TaskPage, record.TotalPage
	plist := calPendingList(prePage, taskPage, totalPage)
	start := calPageStart(plist, pageNo)
	// already at the front of the queue
	if start == pending[0] {
		return
	}
	pending = rotateSlice(pending, start)
	return
}

func rotateSlice(arr []int, value int) []int {
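	// rotateSlice moves value and everything after it to the front of the slice.
	// Example: rotateSlice([]int{5, 25, 45, 65, 85}, 45) returns [45 65 85 5 25].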
	// find the position of value
	index := -1
	for i, v := range arr {
		if v == value {
			index = i
			break
		}
	}

	// value not found: return the original slice
	if index == -1 {
		return arr
	}

	// rotate the slice by concatenating the two halves
	result := append(arr[index:], arr[:index]...)
	return result
}

func dealJump(c *gin.Context, record *dao.TblParseJump) {
	log := mw.GetLogger(c)
	db := db2.Db(c)

	// fetch the PDF info (local file path, page count, width/height)
	pdfInfo, err := pfc.GetPdf(log, record.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		return
	}
	msg := baseRecord2ParseMsg(pdfInfo, record.BasePdfParseRecord)
	// run the parse task
	if err := parseAndUpload(c, msg, record.FromPage, record.ToPage); err != nil {
		// fall back to uploading a placeholder "failed" image to OSS
		if err := GenFailedImageUpload(c, record.BasePdfParseRecord, record.FromPage, record.ToPage); err != nil {
			log.Error("GenFailedImageUpload failed as err:%v", err)
		}
	}
	if err := dao.UpdateJumpSuccess(db, record.FileId); err != nil {
		log.Error("UpdateJumpSuccess failed as err:%v", err)
	}
}

5. Code Walkthrough

[Figure: jump-parse flow]

Jumping mainly reorders the pending parse queue; if a goroutine is available, one is started immediately to parse the jumped-to segment.

[Figure: leftover-parse flow]

The leftover parse loop simply takes a start page from the pending field in the database and parses that segment.

6. Bonus: An Introduction to PDFium

PDFium is an open-source PDF rendering engine for parsing and displaying PDF documents. It was originally developed by Foxit Software and is now maintained and released by Google as part of the Chromium project. PDFium is widely used in the built-in PDF viewers of browsers such as Google Chrome, as well as in other applications that need to process PDF documents.

Main features of PDFium
  1. PDF rendering: PDFium can render PDF pages into images (e.g. bitmaps) and supports efficient scaling and rotation.
  2. Text extraction: it can extract the text content of a PDF document for retrieval and search-engine indexing.
  3. Form handling: it supports filling, submitting, and extracting PDF form data.
  4. Annotation handling: it supports reading and managing PDF annotations such as highlights, comments, and signatures.
  5. Image and vector graphics: it can parse and render the images and vector graphics embedded in PDF files.
  6. Security: it supports parsing and processing encrypted PDF documents.
Key characteristics
  • Cross-platform: PDFium can be compiled and run on Windows, macOS, Linux, and other platforms.
  • High performance: it is optimized for performance and handles large, complex PDF documents efficiently.
  • Modular: its modular architecture lets you compile and use only the modules you need.
  • Rich API: it exposes a rich API for complex PDF operations and custom extensions.
How to use PDFium
1. Download and build

PDFium is an open-source project; you can fetch the source code from its Git repository:

git clone https://pdfium.googlesource.com/pdfium

You can configure and build it for different platforms and needs by following the documentation; the official build is driven by GN and Ninja through Chromium's depot_tools.

2. Basic usage

Once the build is done, you can use PDFium in your C/C++ project. The following minimal example loads a PDF document and renders its first page:

#include "public/fpdfview.h"

int main(int argc, char** argv) {
    // initialize PDFium
    FPDF_InitLibrary();

    // load the PDF document
    FPDF_DOCUMENT doc = FPDF_LoadDocument("example.pdf", nullptr);
    if (!doc) {
        FPDF_DestroyLibrary();
        return -1;
    }

    // load the first page
    FPDF_PAGE page = FPDF_LoadPage(doc, 0);
    if (!page) {
        FPDF_CloseDocument(doc);
        FPDF_DestroyLibrary();
        return -1;
    }

    // render the first page into a bitmap
    int width = FPDF_GetPageWidth(page);
    int height = FPDF_GetPageHeight(page);
    FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, 0);
    FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF); // white background
    FPDF_RenderPageBitmap(bitmap, page, 0, 0, width, height, 0, 0);

    // save the bitmap to a file or process it further
    // ...

    // release resources
    FPDFBitmap_Destroy(bitmap);
    FPDF_ClosePage(page);
    FPDF_CloseDocument(doc);
    FPDF_DestroyLibrary();

    return 0;
}
Common use cases
  1. Embedded PDF viewer: embed a custom PDF viewing component in a desktop or mobile application to give users viewing and interaction features.
  2. Server-side PDF processing: use PDFium on the server to batch-process PDF documents and automate document management workflows.
  3. PDF data extraction: extract text, images, and other data from PDF documents for analysis or data mining.
Resources and community

As a powerful and flexible PDF engine, PDFium is widely used wherever PDF documents need to be processed. If you have specific requirements, digging into PDFium's documentation and examples and adapting them to your situation will give the best results.