一、项目描述
获取用户权限,爬取网页信息
二、操作步骤
1、模拟登录
使用笨办法:先人为登录成功一次,查看抓包数据,复制登录提交表单数据与请求标头中的cookie值,伪装用户登录,post请求页面;请求之后解析页面至text文档。
import http.cookiejar as cookielib
import requests
def askURL(postUrl):
    """POST the login form to *postUrl* and return the response HTML as text.

    Replays a captured login: the form fields and the Cookie header below were
    copied from a successful manual login (packet capture), so the server
    treats this session as an authenticated user.

    Args:
        postUrl: URL the login form submits to.

    Returns:
        The response body decoded as text.
    """
    # One session object carries cookies across this connection.
    huihuSession = requests.session()
    # The default session.cookies has no save() method; LWPCookieJar from
    # http.cookiejar does, so swap it in to persist cookies to disk.
    huihuSession.cookies = cookielib.LWPCookieJar(filename="huihuCookies.txt")
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
        # NOTE(review): a hard-coded session cookie expires quickly — refresh it
        # from a new capture when requests start failing.
        'Cookie': 'PHPSESSID=121fce7emebi8tkr33r014b6p5; __guid=148770295.1606112073382355700.1623761038150.45; hd_sid=raJE0r; hd_auth=8442jh37mG5b8qtOCCpJnny8USliN5Rx%2F4%2BrZAnJ5mu1XFpIKPXgLj%2BJY6oKQ9R398fhsrx4c8YAgPPg%2FESvKEU; monitor_count=18',
    }
    account = "输入用户名"
    password = "输入密码"
    code = "输入你的验证码"
    postData = {
        "username": account,
        "password": password,
        "code": code,
        # NOTE(review): the CSRF token below was captured once — presumably it
        # must match the current session; verify it is still accepted.
        "_token": "QtL6IdSG8rlrSLk2iQTlIXA713c7Shy4",
        "submit": "登录",
        "indexlogin": "1",
    }
    # POST through the session; the timeout keeps a dead server from
    # hanging the scraper forever.
    responseRes = huihuSession.post(postUrl, data=postData, headers=header, timeout=30)
    # Persist the (possibly refreshed) cookies for later runs.
    huihuSession.cookies.save()
    return responseRes.text
2、爬取页面信息
获取页面之后,使用正则表达式获取对应信息,如果页面不存在就print("error!")
from bs4 import BeautifulSoup
import re
# Compile every pattern once, outside the loop — the original recompiled all
# six regexes for every matched element on every one of the 1500 pages.
_catenavi_re = re.compile(r'<a .*?>(.*?)</a>')
_doctitle_re = re.compile(r'<h1 id="doctitle">(.*)</h1>')
_editor_name_re = re.compile(r'<dt><a .*? target="_blank">(.*)</a> <em class="f12"><img .*? .*?/> .*?</em></dt>')
_views_re = re.compile(r'<li><span class="gray">浏览次数:</span>(\d*) 次</li>')
_edit_count_re = re.compile(r'<li><span class="gray">编辑次数:</span>(\d*)次 <a class="clink" .*? target="_blank">历史版本</a></li>')
# Renamed from `time` so the stdlib module name is not shadowed.
_updated_re = re.compile(r'<li><span class="gray">更新时间:</span>(.*)</li>')

for i in range(0, 1500):  # fetch and parse each page
    try:
        # NOTE(review): postUrl is not defined in this snippet — it must be
        # set before this loop runs; verify against the full project.
        url = postUrl + str(i + 120)
        html = askURL(url)
        bs = BeautifulSoup(html, "html.parser")
        data = []  # all fields extracted from one page
        # Category (first <a> inside the breadcrumb span)
        for item in bs.find_all('span', id="catenavi"):
            data.append(_catenavi_re.findall(str(item))[0])
        # Title
        for item in bs.find_all('div', class_="title_thema"):
            data.append(_doctitle_re.findall(str(item))[0])
        # Editor, view count, edit count, update time
        for item in bs.find_all('div', class_="columns ctxx"):
            item = str(item)
            data.append(_editor_name_re.findall(item))  # editor
            data.append(_views_re.findall(item))        # view count
            edits = _edit_count_re.findall(item)        # edit count (may be absent)
            data.append(edits if edits else " ")
            data.append(_updated_re.findall(item))      # update time
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; missing pages and parse failures land here.
        print("error!")
3、存储数据:创建工作表,存储列名,将获取的网页数据存储至excel表格中
import xlwt

# Write the scraped fields into an .xls workbook.
savepath = "爬虫text.xls"
book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # workbook object
sheet = book.add_sheet('爬虫', cell_overwrite_ok=True)        # worksheet
col = ("分类", "名称", "编辑者", "浏览次数", "编辑次数", "更新日期")
for i in range(6):
    sheet.write(0, i, col[i])  # header row

# BUG FIX: the comma used to be appended to data[2] *inside* the row loop,
# so every successive row accumulated one more trailing comma than the last.
# Apply the separator exactly once, before any row is written. The l > 1
# guard also avoids indexing an empty editor list.
l = len(data[2])
if l > 1:
    data[2][l - 2] = data[2][l - 2] + ','

# NOTE(review): every row writes the same module-level `data` (only the last
# scraped page survives the scrape loop) — presumably rows were meant to be
# per-page; confirm against the scraping step.
for i in range(0, 1500):
    for j in range(6):
        sheet.write(i + 1, j, data[j])  # data row (row 0 is the header)
    print(i)
book.save(savepath)  # flush the workbook to disk
4、项目连接:https://download.csdn.net/download/m0_48673670/19696042?spm=1001.2014.3001.5503