学python爬虫

最新推荐文章于 2024-07-12 16:42:46 发布

dsjsdhsjd

最新推荐文章于 2024-07-12 16:42:46 发布

阅读量1k

点赞数

分类专栏： python 文章标签：爬虫 python

本文链接：https://blog.csdn.net/dsjsdhsjd/article/details/123007690

版权

python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

爬虫全流程

前期准备工作

所需要的依赖
import pytesseract
from PIL import Image
import requests
from lxml import etree
import urllib.request
from io import StringIO, BytesIO
import re
import datetime
import json
import mysql.connector
import time
import base64
import os

pytesseract重点依赖引入方法
废话不多说，直接开干！
首先安装库

然后下载并安装tesseract程序

tesseract下载地址：https://digi.bib.uni-mannheim.de/tesseract/ （请依据自己的操作系统下载对应的exe文件并安装）
在这里插入图片描述

用户变量，系统变量都添加：PATH C:\Program Files (x86)\Tesseract-OCR; （这是tesseract的安装目录；如果安装在其他目录，例如 C:\Program Files\Tesseract-OCR，请相应修改，并与下文代码中的tesseract_cmd路径保持一致）
系统变量添加：TESSDATA_PREFIX C:\Program Files (x86)\Tesseract-OCR

pytesseract依赖安装命令：pip install pytesseract
再找到pytesseract.py文件

修改其中的tesseract_cmd，使其指向tesseract.exe；也可以不改文件，直接在代码中这样设置：

# Tell pytesseract where the Tesseract executable lives; adjust to the local install.
# NOTE(review): the setup notes mention a `C:\Program Files (x86)\Tesseract-OCR`
# install directory while this path uses `C:\Program Files` — confirm which applies.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

创建保持登录凭据的session，并初始化mysql连接

# Shared HTTP session so cookies returned by the server persist across requests.
# requests.Session() is used directly: the lowercase requests.session() helper
# is a deprecated backwards-compatibility alias for it.
login_session = requests.Session()

# MySQL connection that will receive the scraped rows. Replace the "######"
# placeholders with real connection settings. `password` is the documented
# argument name; `passwd` is only a compatibility synonym.
mydb = mysql.connector.connect(
    host="######",
    user="######",
    password="######",
    database="######",
)
mycursor = mydb.cursor()

首页url并获取Set-Cookie

# Request headers for the home page (desktop browser profile).
header = {
    "Host": "#############",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
}

# Fetch the home page through the shared session. The timeout keeps the script
# from hanging indefinitely if the server never answers, and raise_for_status()
# stops early on an HTTP error instead of continuing with an error page.
response = login_session.get(url="首页的url", headers=header, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'

# Parse the returned HTML into an element tree (not used further in this section).
html = etree.HTML(response.text)
# Response headers of the home page, printed for debugging.
print(response.headers)
# Raw Set-Cookie header of the home page. Later requests send its first 43
# characters back as their Cookie header, so fail with a clear message when the
# header is absent instead of crashing on None[0:43].
token_value = response.headers.get("Set-Cookie")
if token_value is None:
    raise RuntimeError("home page response did not include a Set-Cookie header")
print(token_value[0:43])

获取首页验证码并识别验证码内容

验证码的请求头
# Request headers for the captcha image (mobile browser profile).
headers = {
    "Host": "##########",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Referer": "##############",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43],  # cookie returned by the home-page request
}

# Download the captcha image through the shared session; the timeout prevents
# an unresponsive server from blocking the script forever.
imgbase64str = login_session.get(url="验证码url", headers=headers, timeout=30)
# Raw bytes of the response body (the variable names are historical: this is
# the image content, not base64 text and not a request object).
request = imgbase64str.content
# Save the captcha locally so the image-processing step can open it by path.
with open(r"D:\\文件\\7.png", 'wb') as f:
    f.write(request)



# login_session.get(url="http://222.134.6.165:8090/GenerateImage.jsp", headers=headers)
# items = html.xpath("//div[@class='ui-form-explain']//a//img//@src" )
# print(items[0])

# r = urllib.request.urlopen(r"D:\\文件\\7.png")
# f = open('VCode.jpg', 'wb')    #这里是将验证码图片写入到本地文件
# f.write(r.read())
# f.close()
# imgBuf = BytesIO(r.read())  # 采用StringIO直接将验证码文件写到内存，省去写入硬盘
验证码图片的背景处理
# Load the downloaded captcha and work on it as RGBA pixels.
img = Image.open(r"D:\\文件\\7.png").convert('RGBA')
pix = img.load()
width, height = img.size
white = (255, 255, 255, 255)
black = (0, 0, 0, 255)

# Paint the one-pixel frame white: top and bottom rows first, then the
# left and right columns.
for col in range(width):
    pix[col, 0] = white
    pix[col, height - 1] = white
for row in range(height):
    pix[0, row] = white
    pix[width - 1, row] = white

# Binarise: a pixel becomes black when any of its R, G, B channels is below 95,
# otherwise white.
for row in range(height):
    for col in range(width):
        red, green, blue = pix[col, row][:3]
        pix[col, row] = black if min(red, green, blue) < 95 else white

# Write the cleaned image to disk; the OCR step reads it back by path.
img.save(r"D:\\文件\\5.png")
使用tesseract去识别处理后的图片
# Run Tesseract OCR on the cleaned captcha image and keep the raw result in `text`.
text=pytesseract.image_to_string(r"D:\\文件\\5.png")
# Print the result without its last character.
# NOTE(review): this presumably drops a trailing newline; whether the OCR output
# always ends with exactly one extra character depends on the pytesseract /
# Tesseract version — confirm.
print(text[:-1])

验证码保存到本地处理之前

在这里插入图片描述
处理之后

获取验证码文字

拿到验证码进登录url并获取Set-Cookie

# Request headers for the login POST (AJAX form submission).
# NOTE(review): Content-Length is hard-coded here; requests normally derives it
# from the encoded form body — confirm this entry is still needed.
headerse = {
    "Host": "#############",
    "Connection": "keep-alive",
    "Content-Length": "79",
    "Accept": "*/*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "###################",
    "Referer": "###################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43],  # cookie returned by the home-page request
}





# Login endpoint (placeholder).
token_url = "登录url"

# Login form. The captcha is the OCR text with surrounding whitespace removed;
# strip() is used instead of dropping the last character so a result that has
# no trailing character does not lose a digit.
data = {
    "j_username": "账号",
    "j_password": "密码",
    "t_checkcode": text.strip(),  # captcha recognised by OCR
}

# Convert the captcha to an integer; a misread, non-numeric captcha fails here
# with ValueError before any login attempt is made.
# NOTE(review): int() also drops leading zeros — confirm the captcha can never
# start with 0.
data["t_checkcode"] = int(data["t_checkcode"])

# Debug output of the form and the header sets built so far.
print(data)
print(headers)
print(headerse)

# Submit the login form through the shared session; the timeout prevents an
# unresponsive server from blocking the script forever.
login_response = login_session.post(url=token_url, data=data, headers=headerse, timeout=30)
print(login_response.text)
print(login_response.headers)

# Raw Set-Cookie header returned by the login request; the data request sends
# its first 43 characters back as its Cookie header. Fail with a clear message
# when it is absent instead of crashing on None[0:43].
token_value = login_response.headers.get("Set-Cookie")
if token_value is None:
    raise RuntimeError("login response did not include a Set-Cookie header")
print(token_value[0:43])

将数据url返回的数据存入mysql

数据url请求头
# Request headers for the data POST (AJAX call that returns JSON).
# NOTE(review): Content-Length is hard-coded here; requests normally derives it
# from the encoded form body — confirm this entry is still needed.
headerss = {
    "Host": "################",
    "Connection": "keep-alive",
    "Content-Length": "14",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "############",
    "Referer": "######################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": token_value[0:43],  # cookie returned by the login request
}

def getYesterday():
    """Return yesterday's date as a ``datetime.date`` (today minus one day)."""
    return datetime.date.today() - datetime.timedelta(days=1)
def getyes():
    """Return the first day of the current month as a ``datetime.date``."""
    current = datetime.datetime.now()
    return current.date().replace(day=1)
# Data endpoint (placeholder).
login_url = "数据url"

# Columns of the target table, in INSERT order. Rows are mapped to these names
# explicitly instead of relying on the key order of the JSON objects.
# NOTE(review): assumes each JSON object uses these column names as its keys —
# confirm against a real response.
columns = (
    "loadSo2", "code", "outfallId", "dataCountO3", "chromaSo2", "chromaO3",
    "dataCountPm25", "invalidSo2", "invalidNo2", "apiO3", "invalidCo", "maxApi",
    "loadNo2", "dataCountCo", "chromaPm25", "invalidPm10", "loadPm10", "apiPm10",
    "invalidO3", "totalApi", "dataCountNo2", "outfallName", "loadCo", "apiCo",
    "orderzh", "apiPm25", "dataCountPm10", "chromaNo2", "chromaCo", "apiNo2",
    "rgnName", "chromaPm10", "invalidPm25", "loadPm25", "mainPollution",
    "apiSo2", "dataCountSo2", "loadO3",
)

# Query window: from the first day of yesterday's month up to yesterday.
# Deriving the start from the end date keeps start <= end even on the 1st of a
# month, when "first day of the current month" would be later than yesterday.
end_date = getYesterday()
start_date = end_date.replace(day=1)

# NOTE(review): monthTime / yearTime / quarterTime / halfYearTime are fixed
# values; with dateRange "day" they are presumably ignored — confirm.
payload = {
    "startTime": start_date.isoformat(),
    "endTime": end_date.isoformat(),
    "monthTime": "2022-01",
    "dateRange": "day",
    "outfallId": "",
    "rgnCode": "",
    "yearTime": 2022,
    "isShowAllData": 1,
    "quarterTime": 1,
    "orderType": "fromSmallToBig",
    "outfallIds": "177511,21,176152,177508,177503,176151,177506,177510,177504,177505,177501,177509,177500,177512",
    "halfYearTime": 1
}
print(payload)

# POST the query through the shared session; the timeout prevents an
# unresponsive server from blocking the script forever.
login_response = login_session.post(url=login_url, headers=headerss, data=payload, timeout=30)
print(login_response.text)

# The insert loop below expects a JSON array of objects; anything else (for
# example an error object) is rejected up front with a clear message.
data = json.loads(login_response.text)
if not isinstance(data, list):
    raise ValueError("unexpected response: expected a JSON array of rows")

val = []
for index, row in enumerate(data, start=1):
    print("序号%s   值%s" % (index, row))
    missing = [name for name in columns if name not in row]
    if missing:
        raise KeyError("row %s is missing columns: %s" % (index, ", ".join(missing)))
    record = tuple(row[name] for name in columns)
    val.append(record)
    print(record)
print(val)

# Build the INSERT from the column tuple so the column list and the number of
# placeholders cannot drift apart.
sql = "INSERT INTO 表名称 (%s) VALUES (%s)" % (
    ",".join(columns),
    ", ".join(["%s"] * len(columns)),
)
if val:
    mycursor.executemany(sql, val)
    mydb.commit()  # required for the inserted rows to be persisted

在这里插入图片描述

参考链接
https://www.cnblogs.com/chenlove/p/14038580.html
https://blog.csdn.net/qiushi_1990/article/details/78041375
https://www.jb51.net/article/187678.htm
https://blog.csdn.net/xiaxianba/article/details/89450855
https://blog.csdn.net/purvispanwu/article/details/107099452
https://www.polarxiong.com/archives/python-tesseract-verification-code.html
https://www.cnblogs.com/zhangb8042/articles/10410263.html