准备工作
安装 pyautogui 命令如下:(参考:pyautogui安装教程)
pip install pyautogui
或者
pip install pyautogui -i https://mirrors.aliyun.com/pypi/simple
pyautogui 使用参考:
https://blog.csdn.net/ibiao/article/details/77859997
https://jingyan.baidu.com/article/39810a23440b20b636fda621.html
技术实现
python + java项目
第一步:抓取所有书的名录,保存到本地数据库
python 脚本
#!/usr/bin/python3
import sys
import time
import pyautogui
from PIL import ImageGrab
# Capture parameters
bookDir = "D:/book/xxxxxxxxx"  # output directory; replace with the book name
bookDir2 = "D:/tmpbmp"  # temporary BMP directory (used for grabbing marker images)
pageCount = 300  # number of pages
pageArea = (354, 72, 1012, 950)  # page region on screen (x1, y1, x2, y2)

# Marker images
imgDir = "D:/dang/python/dang/img"
imgLoadingMark = f"{imgDir}/loadingMark.bmp"  # "loading" marker image
imgLoadFailMark = f"{imgDir}/loadFailMark.bmp"  # "load failed" marker image
imgLoginPassMark = f"{imgDir}/loginPassMark.bmp"  # "login expired" marker image
def marchMark(fileMark):
    """Look for a marker image on the screen.

    Args:
        fileMark: Path of the marker image file to search for.

    Returns:
        The value returned by ``pyautogui.locateCenterOnScreen`` (searched in
        grayscale mode) when the image is found, otherwise ``None``.
    """
    try:
        return pyautogui.locateCenterOnScreen(fileMark, grayscale=True)
    except pyautogui.ImageNotFoundException:
        # Newer PyAutoGUI releases raise instead of returning None when the
        # image is absent; callers rely on a falsy result for "not found".
        return None
def catchImage(area, file):
    """Grab a region of the screen and save it to a file.

    Args:
        area: Bounding box passed to ``ImageGrab.grab`` as ``bbox``.
        file: Destination passed to the grabbed image's ``save`` method.
    """
    ImageGrab.grab(bbox=area).save(file)
def checkMark():
    """Check the screen for marker images before a page is captured.

    The markers are tested in order on every pass:

    * "loading" marker found: wait 3 seconds and check again.
    * "load failed" marker found: click it, wait 3 seconds and check again.
    * "login expired" marker found: move the pointer to it and give up.

    Returns:
        ``True`` when none of the markers is on screen, ``False`` when the
        "login expired" marker is found.
    """
    # A loop (rather than self-recursion) keeps long waits from exhausting
    # the interpreter's recursion limit.
    while True:
        if markResult := marchMark(imgLoadingMark):
            print("imgLoadingMark", markResult)
            time.sleep(3)
            continue
        if markResult := marchMark(imgLoadFailMark):
            print("imgLoadFailMark", markResult)
            pyautogui.click(markResult)
            time.sleep(3)
            continue
        if markResult := marchMark(imgLoginPassMark):
            print("imgLoginPassMark", markResult)
            pyautogui.moveTo(markResult)
            return False
        return True
def catchPage(i):
    """Capture a single page.

    The page region is saved twice: as ``<i>.jpg`` under ``bookDir`` and as
    ``<i>.bmp`` under ``bookDir2``.

    Args:
        i: Page number used to build the output file names.

    Returns:
        ``True`` if the page was captured, ``False`` if ``checkMark()``
        reported that capturing cannot proceed.
    """
    if not checkMark():
        return False
    for folder, ext in ((bookDir, "jpg"), (bookDir2, "bmp")):
        catchImage(pageArea, "%s/%d.%s" % (folder, i, ext))
    return True
def startImpl(i):
    """Capture pages from page ``i`` up to and including ``pageCount``.

    After each successful capture the 'right' key is pressed to move on to
    the next page.

    Args:
        i: Page number to start from.

    Returns:
        ``True`` once every page through ``pageCount`` has been captured,
        ``False`` as soon as one page fails to capture.
    """
    page = i
    while page <= pageCount:
        if not catchPage(page):
            return False
        pyautogui.press('right')
        page += 1
    return True
def start(i):
    """Run the capture from page ``i`` and print whether it succeeded.

    Args:
        i: Page number to start from.
    """
    outcome = "catch success" if startImpl(i) else "catch fail"
    print(outcome)
def catchMark(area, fileMark):
    """Grab a marker image from the screen, then try to locate it again.

    The region ``area`` is saved to ``fileMark``; if that image can then be
    found on screen, its position is printed and the pointer is moved there.

    Args:
        area: Screen region to grab as the marker image.
        fileMark: Path the marker image is written to and searched with.
    """
    catchImage(area, fileMark)
    found = marchMark(fileMark)
    if not found:
        return
    print("markResult", found)
    pyautogui.moveTo(found)
# Script entry: announce, pause, then capture starting from page 1.
print("ready...")
# 3-second delay before capturing starts; presumably gives the operator time
# to bring the reader window to the foreground — TODO confirm.
time.sleep(3)
print("go")
start(1)
# catchPage(1948)
# while True: pyautogui.hotkey('ctrl', 'end') # 下翻操作
# 抓取标记图像
# catchMark((430, 558, 523, 558 + 1), imgLoadingMark)
# catchMark((358, 448, 440, 448 + 1), imgLoadFailMark)
# catchMark((666, 510, 856, 510 + 1), imgLoginPassMark)
# im = pyautogui.screenshot(bookDir + "/1.jpg", region=(0, 0, 300, 400))
进入当当电子书的某个分类,例如“自然科学”,运行脚本中的“下翻操作”,翻开所有书目后,手动保存DOM内容为“自然科学.txt”
java项目
pom.xml
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.7.2</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.21</version>
</dependency>
<!--pdf相关-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.11</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.11</version>
</dependency>
<!--用于解析html-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
CatchBookTable.java 构建书目记录,方便查找和管理
package com.xnktyu.dangdang;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.xnktyu.utils.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileFilter;
import java.util.HashMap;
import java.util.Map;
public class CatchBookTable
{
private static final DBHelper local_db = new DBHelper("localhost", "root", "xxxxxxx", "dang");
/**
 * Base class for the table descriptor classes declared in this file.
 * Provides the shared naming helpers used by subclasses.
 */
private static class tbase
{
    /**
     * Builds an identifier by prefixing {@code "f_"} to the lower-cased field name
     * (presumably the database column name — confirm against the schema).
     *
     * @param field field name, e.g. {@code "bookId"}
     * @return {@code "f_"} followed by {@code field} in lower case
     */
    protected String pack(String field)
    {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. a Turkish locale would otherwise map 'I' to a dotless 'ı').
        return "f_" + field.toLowerCase(java.util.Locale.ROOT);
    }
    /**
     * Returns the simple class name in lower case
     * (presumably used as the table name — confirm against callers).
     */
    @Override
    public String toString()
    {
        return getClass().getSimpleName().toLowerCase(java.util.Locale.ROOT);
    }
}
private static final class t_book extends tbase
{
public final String bookId = pack("bookId");
public final String note = pack("note");
public final String title = pack("title");
public final String author = pack("author");
public final String time = pack("time");
public final String vip_li