Python 学习笔记
案例练习
爬虫精选案例
网站信息爬取
- 遇到的问题:
- 登录成功后获取到的token只缓存到内存中,并没有序列化到本地;这种情况下内存回收token丢失后,访问接口就没有传递token才导致的422 Unprocessable Entity
解决方法:我是通过 cookies 来实现登录的
# 从浏览器 拿到request head 中的 cookies
# session 使用 session 会话存储状态,存储cookies
session = requests.session()
session.headers = headers
# 添加cookies
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
url = "http://www.olympedia.org/athletes/quick_search"
cookies_dict = requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
代码展示
"""
爬虫 程序
search: http://www.olympedia.org/athletes/quick_search
"""
import re # 正则表达式,进行文字匹配
from bs4 import BeautifulSoup # 网页解析,获取数据
import urllib.request, urllib.error # 制定URL,获取网页数据
import requests
import xlwt # 进行excel操作
def main():
    """Entry point: crawl the Olympedia site and save athlete data to Excel."""
    baseurl = "http://www.olympedia.org"  # root URL of the site being scraped
    savepath = "运动员信息表.xls"  # output workbook, created in the current directory
    # dbpath = "athletes.db"  # alternative: persist into a local database instead

    # Step 1: crawl and parse the site into a list of rows.
    records = getData(baseurl)
    # Step 2: persist the rows (only one of the two storage options is used).
    saveData(records, savepath)
    # saveData2DB(records, dbpath)
# 爬取网页
def getData(baseurl):
    """Scrape athlete data from Olympedia's quick-search form.

    For every name read from the local ``test.txt`` file, POST it to the
    quick-search endpoint, follow the first result link and pull the birth
    date and measurements off the athlete page.

    :param baseurl: site root, e.g. ``http://www.olympedia.org``
    :return: list of ``[name, date_of_birth, measurements]`` rows
    """
    datalist = []  # accumulated rows for all athletes
    search_url = baseurl + "/athletes/quick_search"

    # Request setup is loop-invariant, so build it once up front.
    agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"
    headers = {
        "HOST": "www.olympedia.org",
        "Referer": "http://www.olympedia.org/athletes/quick_search",
        "User-Agent": agent
    }
    # Session cookie captured from the browser's request headers; without it
    # the search endpoint rejects the POST (422 Unprocessable Entity).
    # NOTE(review): hard-coded cookies expire — confirm it is still valid.
    cookies = {
        '_olympedia_session': 'Nzh0VE5CNEM4WkhBanI4NkZKUWc4NFB2NGc5TDVGVHJZK3BZN3lIbXBhT3BZVDJZUjVRN0F2V0xCUEtRcjVXNTJpSGl4R0NUTHpWdzBHMVRBZityWHNjV2QrVU9OQndVTnBVK3JtWlh4elFqZ1J0ZXVQRWgwbWtVN1liaXNoWVU5aldUY3RmWmNScXBtMDNFNzBCeXl3PT0tLS9ic1dRSEhxeTJlQW1aVGtmUWFFdmc9PQ%3D%3D--f4cbbf0f48cfb56940330c1a11552d8e226685d0'
    }

    # Athlete names to search for, read once (not per form item).
    queries = readLocalFile()

    # Fetch the landing page once and parse out the search form's hidden fields.
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('form', class_="navbar-form navbar-left"):
        item = str(item)
        params = {}
        # Hidden form fields required by the Rails backend (CSRF token etc.),
        # extracted with regexes from the rendered form markup.
        params["utf8"] = re.findall(
            re.compile(r'<input name="utf8" type="hidden" value="(.*?)"/>'), item)[0]
        params["authenticity_token"] = re.findall(
            re.compile(r'<input name="authenticity_token" type="hidden" value="(.*?)"/>'), item)[0]
        params["commit"] = re.findall(re.compile(
            r'<input class="btn btn-default" data-disable-with="Go" name="commit" type="submit" value="(.*?)"/>'),
            item)[0]

        for query in queries:
            # Best-effort per athlete: a missing result or regex miss just
            # skips that name instead of aborting the whole crawl.
            try:
                currentResult = [query]  # row: name, birth date, measurements
                params["query"] = query
                # requests url-encodes the dict itself, so no manual
                # urllib.parse.urlencode/bytes round-trip is needed.
                respost = requests.post(search_url, data=params,
                                        headers=headers, cookies=cookies)
                soupOne = BeautifulSoup(respost.text, "html.parser")
                # First search hit links to the athlete's detail page.
                link = re.findall(re.compile(r'<a href="(.*?)">'),
                                  str(soupOne.select("tbody>tr a")[0]))[0]
                resget = requests.get(baseurl + link)
                dateOfBirth = re.findall(
                    re.compile(r'<th>Born</th><td>(.*?) in'), resget.text)[0]
                measurements = re.findall(
                    re.compile(r'<th>Measurements</th><td>(.*?)</td>'), resget.text)[0]
                currentResult.append(dateOfBirth)
                currentResult.append(measurements)
                print(currentResult)
                datalist.append(currentResult)
            except (IndexError, requests.RequestException):
                # IndexError: no search hit / regex miss; RequestException:
                # network failure. Either way, move on to the next name.
                continue
    return datalist
# 读取本地文件 test.txt
def readLocalFile():
    """Read athlete names from ``test.txt``, one per line.

    Duplicate lines are dropped while preserving first-seen order; trailing
    newlines are stripped. Prints the unique count and the raw line count.

    :return: list of unique names in file order
    """
    data = []
    seen = set()  # O(1) membership test replaces the old O(n) inner scan
    print("正在读取内容中...")
    length = 0  # total lines read, including duplicates
    with open("test.txt", "r") as f:
        for line in f:
            length += 1
            name = line.strip('\n')
            if name not in seen:
                seen.add(name)
                data.append(name)
    print(len(data), length)
    return data
# 得到指定一个URL的网页内容
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Sends browser-like headers (cookie + user agent) so the server treats
    the request as a logged-in browser session. On any URL error the HTTP
    code/reason are printed and an empty string is returned.

    :param url: absolute URL to fetch
    :return: decoded HTML, or ``""`` if the request failed
    """
    # Headers mimic a real browser; the session cookie keeps us "logged in".
    # NOTE(review): hard-coded cookies expire — confirm it is still valid.
    head = {
        'Cookie': '_olympedia_session=WTQwKzluZTJiVlplRElBTzIvTkdld2lzZVJtZDY2ajlwbGFTNm1BZkdQZm5rd0ZpbEdpZ2dnMG02YkE0Tk01TGJqcnhwTkhkSjZZclBqR0ptZkFtaG82NWR3cDVKMHVhcmo0Ni9JRk9OcFdzcGQvOUhnbk1MNFgzZUg5WWx5MTZFTCt1TXEzalB2VnppS3VvWk5LdTRnPT0tLWhrUUFKcW9CRkUyQU9ZdmFWV0E4MFE9PQ==--17ca54d2d1c3747df425b7106ca3b285d145ae14',
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the response socket is closed even if
        # read()/decode() raises (the original leaked the connection).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# 保存数据到表格
def saveData(datalist, savepath):
    """Write the scraped athlete rows into an .xls workbook at *savepath*.

    Row 0 is the header; each entry of *datalist* becomes one data row.

    :param datalist: list of rows, each ``[name, birth date, measurements]``
    :param savepath: target .xls file path
    """
    print("保存运动员信息到表格!")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = workbook.add_sheet('运动员基本信息表', cell_overwrite_ok=True)

    # Header row.
    header = ("姓名", "出生年月", "身高/体重")
    for col_idx, title in enumerate(header):
        sheet.write(0, col_idx, title)

    # Data rows start at row 1, just below the header.
    for row_idx, record in enumerate(datalist):
        print("第%d条" % (row_idx + 1))  # progress output
        for col_idx, value in enumerate(record):
            sheet.write(row_idx + 1, col_idx, value)

    workbook.save(savepath)
if __name__ == "__main__":  # run only when executed as a script, not on import
    # Kick off the crawl-and-save pipeline.
    main()
    print("爬取完毕!")
效果演示
- 查看输出的
Excel
学习中的问题
- python 程序打包 成
.exe
应用程序报错了
Traceback (most recent call last):
File "AutoUpdateHostFile.py", line 1, in <module>
ModuleNotFoundError: No module named 'easygui'
Command "python setup.py egg_info" failed with error code
-
问题二:UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
-
分析:解码失败;VPN 开启的缘故,关闭即可
-
问题三:ValueError: check_hostname requires server_hostname
-
分析:可能是本地开启了
VPN
的缘故,关掉运行如下: