学python爬虫

爬虫全流程

  1. 前期准备工作

所需要的依赖
import pytesseract
from PIL import Image
import requests
from lxml import etree
import urllib.request
from io import StringIO, BytesIO
import re
import datetime
import json
import mysql.connector
import time
import base64
import os

pytesseract重点依赖引入方法
废话不多说,直接开干!
首先安装库

然后按照tesseract程序下载安装

tesseract下载地址:https://digi.bib.uni-mannheim.de/tesseract/ //请依据自己的操作系统下载exe文件安装
在这里插入图片描述

用户变量,系统变量都添加:PATH C:\Program Files (x86)\Tesseract-OCR; //这是tesseract的安装目录
系统变量添加:TESSDATA_PREFIX C:\Program Files (x86)\Tesseract-OCR

pytesseract依赖安装命令:pip install pytesseract
再找到pytesseract.py文件

修改其中的tesseract_cmd,使其指向tesseract.exe的完整路径(也可以像下面这样直接在代码中设置):

# Tell pytesseract where the Tesseract executable lives.
# NOTE(review): the setup notes earlier in this file name
# `C:\Program Files (x86)\Tesseract-OCR` as the install directory, while this
# line uses `C:\Program Files\Tesseract-OCR` - confirm which one applies.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
  1. 创建一个保持登录凭据和mysql初始化

# Session instance that keeps the login credentials across requests.
login_session = requests.Session()

# MySQL connection; the "######" values are placeholders to be filled in.
mydb = mysql.connector.connect(**{
    "host": "######",
    "user": "######",
    "passwd": "######",
    "database": "######",
})
mycursor = mydb.cursor()
  1. 首页url并获取Set-Cookie

# Request headers for the home page
header = {
    "Host": "#############",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
}

# Fetch the home page through the shared session
response = login_session.get(url="首页的url", headers=header)
response.encoding = 'utf-8'

# Parse the returned HTML into an lxml tree
html = etree.HTML(response.text)
# Response headers of the home page
print(response.headers)
# Set-Cookie header returned by the home page
token_value = response.headers.get("Set-Cookie")
if token_value is None:
    # Fail with a clear message instead of a TypeError when slicing None.
    raise RuntimeError("home page response did not include a Set-Cookie header")
# Only the first 43 characters are used as the cookie value
# (presumably the `JSESSIONID=<id>` pair - TODO confirm against a real response).
print(token_value[0:43])
  1. 获取首页验证码并识别验证码内容

验证码的请求头
# Request headers for the captcha image request
headers = {
    "Host": "##########",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Referer": "##############",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43]  # cookie returned by the home page request
}


# Download the captcha image through the shared session
imgbase64str = login_session.get(url="验证码url", headers=headers)
# Save the response body to a local file. The bytes are written as received;
# no base64 decoding is performed here despite the variable name.
request = imgbase64str.content
with open(r"D:\\文件\\7.png", 'wb') as f:
    f.write(request)



# login_session.get(url="http://222.134.6.165:8090/GenerateImage.jsp", headers=headers)
# items = html.xpath("//div[@class='ui-form-explain']//a//img//@src" )
# print(items[0])

# r = urllib.request.urlopen(r"D:\\文件\\7.png")
# f = open('VCode.jpg', 'wb')    #这里是将验证码图片写入到本地文件
# f.write(r.read())
# f.close()
# imgBuf = BytesIO(r.read())  # 采用StringIO直接将验证码文件写到内存,省去写入硬盘
验证码图片的背景处理
# Load the saved captcha with PIL and work on an RGBA copy of it
img = Image.open(r"D:\\文件\\7.png").convert('RGBA')
pix = img.load()  # pixel access object
width, height = img.size
white = (255, 255, 255, 255)
black = (0, 0, 0, 255)

# Paint the top and bottom border rows white
for x in range(width):
    pix[x, 0] = white
    pix[x, height - 1] = white
# Paint the left and right border columns white
for y in range(height):
    pix[0, y] = white
    pix[width - 1, y] = white
# Binarise: a pixel becomes black when any of R, G, B is below 95,
# otherwise it becomes white
for y in range(height):
    for x in range(width):
        pix[x, y] = black if min(pix[x, y][:3]) < 95 else white
# Save the cleaned image; the OCR step reads it back from this path
img.save(r"D:\\文件\\5.png")
使用tesseract去识别处理后的图片
# Run Tesseract OCR on the cleaned-up captcha image saved above
text=pytesseract.image_to_string(r"D:\\文件\\5.png")
# Print the recognised text without its final character
# (presumably a trailing newline added by the OCR output - TODO confirm)
print(text[:-1])

验证码保存到本地处理之前

在这里插入图片描述
处理之后
在这里插入图片描述
获取验证码文字
在这里插入图片描述

  1. 拿到验证码进登录url并获取Set-Cookie

# Request headers for the login POST
headerse = {
    "Host": "#############",
    "Connection": "keep-alive",
    # NOTE(review): literal copied from a captured request; the real body
    # length depends on the submitted values - confirm it is not relied upon.
    "Content-Length": "79",
    "Accept": "*/*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "###################",
    "Referer": "###################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43]  # cookie returned by the home page request
}





# 1. POST the login form (account, password and recognised captcha)
token_url = "登录url"

# Keep the captcha as a string: converting it with int() would drop leading
# zeros (e.g. "0123" -> 123) and submit a different code than the image shows.
checkcode = text.strip()
if not checkcode.isdigit():
    # Same exception type int() raised before, but with a readable message.
    raise ValueError("OCR result %r is not a numeric captcha" % (checkcode,))

data = {
    "j_username": "账号",
    "j_password": "密码",
    "t_checkcode": checkcode  # captcha text
}

print(data)
print(headers)
print(headerse)
login_response = login_session.post(url=token_url, data=data, headers=headerse)
print(login_response.text)
print(login_response.headers)
# Alternative (unused): extract the token value with a regular expression
# token_search = re.compile(r"JSESSIONID=(.*?);")
# Cookie returned by the login request
token_value = login_response.headers.get("Set-Cookie")
if token_value is None:
    # Fail with a clear message instead of a TypeError when slicing None.
    raise RuntimeError("login response did not include a Set-Cookie header")
print(token_value[0:43])
  1. 存入数据url的数据到mysql

数据url请求头
# Request headers for the data POST
headerss = {
    "Host": "################",
    "Connection": "keep-alive",
    # NOTE(review): literal copied from a captured request; the real body
    # length depends on the submitted values - confirm it is not relied upon.
    "Content-Length": "14",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "############",
    "Referer": "######################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43]  # cookie returned by the login request
}

def getYesterday():
    """Return yesterday's local date as a ``datetime.date``."""
    return datetime.date.today() - datetime.timedelta(days=1)
def getyes():
    """Return the first day of the current local month as a ``datetime.date``."""
    return datetime.datetime.now().date().replace(day=1)
login_url = "数据url"
# Form parameters for the data query; startTime/endTime are replaced below.
data = {
    "startTime": "2022-02-10",
    "endTime": "2022-02-10",
    "monthTime": "2022-01",
    "dateRange": "day",
    "outfallId": "",
    "rgnCode": "",
    "yearTime": 2022,
    "isShowAllData": 1,
    "quarterTime": 1,
    "orderType": "fromSmallToBig",
    "outfallIds": "177511,21,176152,177508,177503,176151,177506,177510,177504,177505,177501,177509,177500,177512",
    "halfYearTime": 1
}
# Query from the first day of the month up to yesterday. The start is derived
# from yesterday's month so that on the 1st of a month the range covers the
# previous month instead of starting after it ends.
yesterday = getYesterday()
data['startTime'] = yesterday.replace(day=1)
data['endTime'] = yesterday
print(data)
# Send the POST request with the query parameters as the form body
login_response = login_session.post(url=login_url, headers=headerss, data=data)
print(login_response.text)

# The response body is parsed as JSON and iterated as a sequence of records
data = json.loads(login_response.text)
val = []
for index, row in enumerate(data, start=1):
    print("序号%s   值%s" % (index, row))
    # NOTE(review): values are taken in the record's own key order, so this
    # presumably relies on that order matching the column list in `sql` - verify.
    values = tuple(row.values())
    val.append(values)
    print(values)
# Rows about to be inserted
print(val)

sql = "INSERT INTO 表名称 (loadSo2,code,outfallId,dataCountO3,chromaSo2,chromaO3,dataCountPm25,invalidSo2,invalidNo2,apiO3,invalidCo,maxApi,loadNo2,dataCountCo,chromaPm25,invalidPm10,loadPm10,apiPm10,invalidO3,totalApi,dataCountNo2,outfallName,loadCo,apiCo,orderzh,apiPm25,dataCountPm10,chromaNo2,chromaCo,apiNo2,rgnName,chromaPm10,invalidPm25,loadPm25,mainPollution,apiSo2,dataCountSo2,loadO3) VALUES (%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s,%s)"
mycursor.executemany(sql, val)
mydb.commit()    # the table was modified, so the transaction must be committed

在这里插入图片描述
在这里插入图片描述

参考链接
https://www.cnblogs.com/chenlove/p/14038580.html
https://blog.csdn.net/qiushi_1990/article/details/78041375
https://www.jb51.net/article/187678.htm
https://blog.csdn.net/xiaxianba/article/details/89450855
https://blog.csdn.net/purvispanwu/article/details/107099452
https://www.polarxiong.com/archives/python-tesseract-verification-code.html
https://www.cnblogs.com/zhangb8042/articles/10410263.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值