当当电子书生成pdf示例

准备工作

安装 Python 3 和 PyCharm 社区版

安装 pyautogui,命令如下(参考:pyautogui 安装教程):

pip install pyautogui
或者
pip install pyautogui -i https://mirrors.aliyun.com/pypi/simple

pyautogui 使用参考:

https://blog.csdn.net/ibiao/article/details/77859997

https://jingyan.baidu.com/article/39810a23440b20b636fda621.html

技术实现

python + java项目

第一步:抓取所有书的名录,保存到本地数据库

python 脚本

#!/usr/bin/python3

import sys
import time
import pyautogui

from PIL import ImageGrab

# Marker images: small screenshots searched for on screen to detect reader state
imgDir = "D:/dang/python/dang/img"
imgLoadingMark = "%s/%s.bmp" % (imgDir, "loadingMark") # "loading" marker image
imgLoadFailMark = "%s/%s.bmp" % (imgDir, "loadFailMark") # "load failed" marker image
imgLoginPassMark = "%s/%s.bmp" % (imgDir, "loginPassMark") # "login expired" marker image

# Configuration
bookDir = "D:/book/xxxxxxxxx" # output directory; replace the placeholder with the book name
bookDir2 = "D:/tmpbmp" # temporary BMP directory (used when capturing marker images)
pageCount = 300 # number of pages to capture
pageArea = (354, 72, 1012, 950) # page region on screen as (x1, y1, x2, y2)

# Look for a marker image on the screen
def marchMark(fileMark):
	"""Return the centre of the marker image on screen, or None if absent.

	Args:
		fileMark: Path of the marker image file to search for.

	Returns:
		The value returned by ``pyautogui.locateCenterOnScreen`` when the
		image is found, otherwise ``None``.
	"""
	try:
		return pyautogui.locateCenterOnScreen(fileMark, grayscale=True)
	except pyautogui.ImageNotFoundException:
		# Newer pyautogui releases raise instead of returning None when the
		# image is not visible; callers test the result for truthiness, so
		# normalise "not found" back to None.
		return None

# Capture a screen region to an image file
def catchImage(area, file):
	"""Grab the screen region *area* and save it to *file*.

	Args:
		area: Bounding box passed to ``ImageGrab.grab`` as ``bbox``.
		file: Destination path; it is handed unchanged to ``Image.save``.
	"""
	# Local import keeps this block self-contained.
	from pathlib import Path

	# The output directory is a per-book path that may not exist yet;
	# create it up front so the save below does not fail.
	Path(file).parent.mkdir(parents=True, exist_ok=True)

	im = ImageGrab.grab(bbox=area)
	im.save(file)

# Check the screen for state markers before capturing a page
def checkMark():
	"""Wait until the page is ready to be captured.

	The markers are checked in a fixed order on every pass:

	* "loading" marker visible: wait 3 seconds and check again.
	* "load failed" marker visible: click it, wait 3 seconds and check again.
	* "login expired" marker visible: move the mouse onto it and give up.

	Returns:
		True when none of the markers is visible, False when the
		"login expired" marker was found.
	"""
	# A loop instead of self-recursion: a long loading phase would otherwise
	# add one stack frame per retry and eventually raise RecursionError.
	while True:
		if markResult := marchMark(imgLoadingMark):
			print("imgLoadingMark", markResult)
			time.sleep(3)
			continue
		if markResult := marchMark(imgLoadFailMark):
			print("imgLoadFailMark", markResult)
			# Clicking the marker is the retry action used by this script.
			pyautogui.click(markResult)
			time.sleep(3)
			continue
		if markResult := marchMark(imgLoginPassMark):
			print("imgLoginPassMark", markResult)
			pyautogui.moveTo(markResult)
			return False
		return True

# Capture a single page
def catchPage(i):
	"""Save page *i* as both JPG and BMP once the screen is ready.

	Args:
		i: Page number, used as the output file name.

	Returns:
		True if the page was captured, False if ``checkMark`` reported
		that capturing cannot continue.
	"""
	if not checkMark():
		return False

	jpgPath = "%s/%d.jpg" % (bookDir, i)
	bmpPath = "%s/%d.bmp" % (bookDir2, i)
	catchImage(pageArea, jpgPath)
	catchImage(pageArea, bmpPath)
	return True

# Capture pages starting from page i
def startImpl(i):
	"""Capture pages *i* .. ``pageCount``, turning the page after each one.

	Args:
		i: First page number to capture.

	Returns:
		True when every page up to ``pageCount`` was captured, False as
		soon as one page could not be captured.
	"""
	page = i
	while page <= pageCount:
		if not catchPage(page):
			return False
		# Advance the reader to the next page.
		pyautogui.press('right')
		page += 1
	return True

# Capture from page i and report the outcome
def start(i):
	"""Run the capture from page *i* and print whether it succeeded.

	Args:
		i: First page number to capture.
	"""
	succeeded = startImpl(i)
	print("catch success" if succeeded else "catch fail")

# Capture a marker image and verify it can be found again
def catchMark(area, fileMark):
	"""Save the screen region *area* as a marker image and locate it.

	After saving, the marker is searched for on screen; when found, its
	position is printed and the mouse pointer is moved onto it.

	Args:
		area: Screen region to capture, as accepted by ``catchImage``.
		fileMark: Path the marker image is written to and searched with.
	"""
	catchImage(area, fileMark)
	location = marchMark(fileMark)
	if not location:
		return
	print("markResult", location)
	pyautogui.moveTo(location)


# Script entry: pause for three seconds before capturing starts
# (presumably so the user can bring the reader window to the front -- confirm).
print("ready...")
time.sleep(3)
print("go")

# Capture the book starting from page 1.
start(1)
# catchPage(1948)

# while True: pyautogui.hotkey('ctrl', 'end') # scroll-down operation (the "下翻操作" step mentioned in the article)

# Capture the marker images (one-pixel-high strips of the screen)
# catchMark((430, 558, 523, 558 + 1), imgLoadingMark)
# catchMark((358, 448, 440, 448 + 1), imgLoadFailMark)
# catchMark((666, 510, 856, 510 + 1), imgLoginPassMark)

# im = pyautogui.screenshot(bookDir + "/1.jpg", region=(0, 0, 300, 400))

进入当当电子书的某个分类,例如“自然科学”,运行脚本中的“下翻操作”,翻开所有书目后,手动保存DOM内容为“自然科学.txt”

java项目

pom.xml

		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.47</version>
		</dependency>

		<dependency>
			<groupId>com.squareup.okhttp3</groupId>
			<artifactId>okhttp</artifactId>
			<version>4.7.2</version>
		</dependency>

		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>8.0.21</version>
		</dependency>

		<!--pdf相关-->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.11</version>
		</dependency>
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>fontbox</artifactId>
			<version>2.0.11</version>
		</dependency>

		<!--用于解析html-->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.13.1</version>
		</dependency>

CatchBookTable.java 构建书目记录,方便查找和管理

package com.xnktyu.dangdang;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.xnktyu.utils.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileFilter;
import java.util.HashMap;
import java.util.Map;

public class CatchBookTable
{
	private static final DBHelper local_db = new DBHelper("localhost", "root", "xxxxxxx", "dang");

	/**
	 * Base class for the nested table-descriptor classes (e.g. {@code t_book}),
	 * which call {@link #pack(String)} to derive their field-name constants.
	 */
	private static class tbase
	{
		/**
		 * Builds a name of the form {@code "f_" + lower-cased field}.
		 *
		 * @param field logical field name, e.g. {@code "bookId"}
		 * @return the prefixed, lower-cased name, e.g. {@code "f_bookid"}
		 */
		protected String pack(String field)
		{
			// Locale.ROOT keeps the result independent of the JVM default locale
			// (a Turkish locale would otherwise turn "I" into a dotless "ı").
			return "f_" + field.toLowerCase(java.util.Locale.ROOT);
		}

		/**
		 * Returns the lower-cased simple class name of the concrete subclass
		 * (presumably used as the table name -- confirm against callers).
		 */
		@Override
		public String toString()
		{
			return getClass().getSimpleName().toLowerCase(java.util.Locale.ROOT);
		}
	}

	private static final class t_book extends tbase
	{
		public final String bookId = pack("bookId");
		public final String note = pack("note");
		public final String title = pack("title");
		public final String author = pack("author");
		public final String time = pack("time");
		public final String vip_li
  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值