爬虫全流程
所需要的依赖
import pytesseract
from PIL import Image
import requests
from lxml import etree
import urllib.request
from io import StringIO, BytesIO
import re
import datetime
import json
import mysql.connector
import time
import base64
import os
pytesseract重点依赖引入方法
废话不多说,直接开干!
首先安装库
然后下载并安装tesseract程序
tesseract下载地址:https://digi.bib.uni-mannheim.de/tesseract/ (请依据自己的操作系统下载exe文件安装)
用户变量,系统变量都添加:PATH C:\Program Files (x86)\Tesseract-OCR; (这是tesseract的安装目录,请按实际安装路径填写,并与下文tesseract_cmd中的路径保持一致)
系统变量添加:TESSDATA_PREFIX C:\Program Files (x86)\Tesseract-OCR
pip install pytesseract (pytesseract依赖安装命令)
再找到pytesseract.py文件,把其中的tesseract_cmd改为tesseract.exe的完整路径;
也可以像下面这样直接在脚本里设置:
# Tell pytesseract where the Tesseract executable lives (Windows path).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# A single HTTP session shared by every request in this script.
login_session = requests.Session()

# MySQL connection and cursor used at the end of the script to store the
# scraped rows. The "######" values appear to be redacted placeholders.
mydb = mysql.connector.connect(
    host="######",
    user="######",
    passwd="######",
    database="######",
)
mycursor = mydb.cursor()
# Request headers for the home page (desktop browser profile).
header = {
    "Host": "#############",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
}

# Fetch the home page through the shared session and decode it as UTF-8.
response = login_session.get(url="首页的url", headers=header)
response.encoding = 'utf-8'

# Parse the returned markup into an lxml element tree.
html = etree.HTML(response.text)

# Dump the response headers for debugging.
print(response.headers)

# The requests that follow re-send the first 43 characters of this header
# as their Cookie value (presumably the "JSESSIONID=..." pair - confirm
# against the target site). Fail here with a clear message when the header
# is missing, instead of a TypeError from slicing None further down.
token_value = response.headers.get("Set-Cookie")
if token_value is None:
    raise RuntimeError("home page response did not include a Set-Cookie header")
print(token_value[0:43])
验证码的请求头
# Request headers for the captcha image (mobile browser profile).
headers = {
    "Host": "##########",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Referer": "##############",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    # Cookie returned by the home page request. The original trailing
    # "//..." note was not a Python comment: it parsed as floor division by
    # an undefined name.
    "Cookie": token_value[0:43],
}

# Download the captcha image through the shared session.
imgbase64str = login_session.get(url="验证码url", headers=headers)
获取验证码图片的二进制内容并保存到本地
# Raw bytes of the captcha response body.
request = imgbase64str.content

# Write the bytes to disk unchanged; the image-processing step opens this file.
with open(r"D:\\文件\\7.png", 'wb') as image_file:
    image_file.write(request)

# Earlier experiments, kept for reference (not executed):
# login_session.get(url="http://222.134.6.165:8090/GenerateImage.jsp", headers=headers)
# items = html.xpath("//div[@class='ui-form-explain']//a//img//@src" )
# print(items[0])
# r = urllib.request.urlopen(r"D:\\文件\\7.png")
# f = open('VCode.jpg', 'wb')  # write the captcha image to a local file
# f.write(r.read())
# f.close()
# imgBuf = BytesIO(r.read())  # keep the captcha in memory instead of writing it to disk
验证码图片的背景处理
# Load the saved captcha and convert it to RGBA so every pixel is a 4-tuple.
img = Image.open(r"D:\\文件\\7.png").convert('RGBA')
pix = img.load()
width, height = img.size
white = (255, 255, 255, 255)
black = (0, 0, 0, 255)

# Paint the top and bottom rows white to erase the dark frame.
for x in range(width):
    pix[x, 0] = white
    pix[x, height - 1] = white

# Do the same for the left and right columns.
for y in range(height):
    pix[0, y] = white
    pix[width - 1, y] = white

# Binarise: a pixel whose R, G or B channel is below 95 becomes black,
# every other pixel becomes white.
for y in range(height):
    for x in range(width):
        red, green, blue = pix[x, y][:3]
        pix[x, y] = black if min(red, green, blue) < 95 else white

# Save the cleaned image; the OCR call reads it back from this path.
img.save(r"D:\\文件\\5.png")
使用tesseract去识别处理后的图片
# Run Tesseract OCR on the cleaned-up captcha image.
text = pytesseract.image_to_string(r"D:\\文件\\5.png")

# Print the recognised string without its final character.
print(text[:-1])
验证码保存到本地处理之前
处理之后
获取验证码文字
# Request headers for the login POST (mobile browser profile, AJAX form post).
headerse = {
    "Host": "#############",
    "Connection": "keep-alive",
    # NOTE(review): fixed Content-Length; the HTTP library normally derives
    # this from the body - confirm it is really needed.
    "Content-Length": "79",
    "Accept": "*/*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "###################",
    "Referer": "###################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    # Cookie returned by the home page request. The original trailing
    # "///..." note was a syntax error in Python.
    "Cookie": token_value[0:43],
}
# Log in: POST the credentials together with the recognised captcha.
token_url = "登录url"

# Keep the captcha as text. Converting it with int() silently dropped
# leading zeros (e.g. "0123" was sent as "123"). A non-numeric OCR result
# still raises ValueError, as int() did.
checkcode = text.strip()
if not re.fullmatch(r"[0-9]+", checkcode):
    raise ValueError("captcha OCR result is not numeric: %r" % text)

data = {
    "j_username": "账号",
    "j_password": "密码",
    # Captcha text recognised by OCR.
    "t_checkcode": checkcode,
}

# Debug output of the payload and of both header sets built so far.
print(data)
print(headers)
print(headerse)

login_response = login_session.post(url=token_url, data=data, headers=headerse)
print(login_response.text)
print(login_response.headers)

# Cookie returned by the login request; the data request re-sends its first
# 43 characters. Unused alternative that extracts the value with a regex:
# token_search = re.compile(r"JSESSIONID=(.*?);")
token_value = login_response.headers.get("Set-Cookie")
if token_value is None:
    # Without this check the slice below fails with an opaque TypeError.
    raise RuntimeError("login response did not include a Set-Cookie header")
print(token_value[0:43])
数据url请求头
# Request headers for the data POST (mobile browser profile, AJAX, JSON reply expected).
headerss = {
    "Host": "################",
    "Connection": "keep-alive",
    # NOTE(review): fixed Content-Length; the HTTP library normally derives
    # this from the body - confirm it is really needed.
    "Content-Length": "14",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "############",
    "Referer": "######################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    # Cookie returned by the login request. The original trailing "//..."
    # note was not a Python comment: it parsed as floor division by an
    # undefined name.
    "Cookie": token_value[0:43],
}
def getYesterday():
    """Return yesterday's date (local time) as a ``datetime.date``."""
    return datetime.date.today() - datetime.timedelta(days=1)
def getyes():
    """Return the first day of the current month (local time) as a ``datetime.date``."""
    return datetime.date.today().replace(day=1)
login_url = "数据url"

# Query window: from the first day of yesterday's month up to yesterday.
# Deriving the start from yesterday (instead of from today's month) keeps
# startTime <= endTime when the script runs on the 1st of a month.
end_date = getYesterday()
start_date = end_date.replace(day=1)

data = {
    "startTime": start_date,
    "endTime": end_date,
    "monthTime": "2022-01",
    "dateRange": "day",
    "outfallId": "",
    "rgnCode": "",
    "yearTime": 2022,
    "isShowAllData": 1,
    "quarterTime": 1,
    "orderType": "fromSmallToBig",
    "outfallIds": "177511,21,176152,177508,177503,176151,177506,177510,177504,177505,177501,177509,177500,177512",
    "halfYearTime": 1,
}
print(data)

# POST the query with the logged-in session; the reply body is parsed as JSON.
login_response = login_session.post(url=login_url, headers=headerss, data=data)
print(login_response.text)
data = json.loads(login_response.text)

sql = "INSERT INTO 表名称 (loadSo2,code,outfallId,dataCountO3,chromaSo2,chromaO3,dataCountPm25,invalidSo2,invalidNo2,apiO3,invalidCo,maxApi,loadNo2,dataCountCo,chromaPm25,invalidPm10,loadPm10,apiPm10,invalidO3,totalApi,dataCountNo2,outfallName,loadCo,apiCo,orderzh,apiPm25,dataCountPm10,chromaNo2,chromaCo,apiNo2,rgnName,chromaPm10,invalidPm25,loadPm25,mainPollution,apiSo2,dataCountSo2,loadO3) VALUES (%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s,%s)"
expected_width = sql.count("%s")

# Turn every JSON record into a positional tuple for executemany().
# NOTE(review): values are bound by position, so this relies on each
# record's key order matching the column list in `sql` - confirm against a
# real response. The width check below at least catches added/missing keys.
val = []
for index, row in enumerate(data, start=1):
    print("序号%s 值%s" % (index, row))
    record = tuple(row.values())
    if len(record) != expected_width:
        raise ValueError(
            "record %d has %d fields, expected %d"
            % (index, len(record), expected_width)
        )
    val.append(record)
    print(record)
print(val)

# Insert the rows, then release the cursor and connection, which the
# original script never closed.
try:
    if val:
        mycursor.executemany(sql, val)
        mydb.commit()  # the table was modified, so the change must be committed
finally:
    mycursor.close()
    mydb.close()
参考链接
https://www.cnblogs.com/chenlove/p/14038580.html
https://blog.csdn.net/qiushi_1990/article/details/78041375
https://www.jb51.net/article/187678.htm
https://blog.csdn.net/xiaxianba/article/details/89450855
https://blog.csdn.net/purvispanwu/article/details/107099452
https://www.polarxiong.com/archives/python-tesseract-verification-code.html
https://www.cnblogs.com/zhangb8042/articles/10410263.html