1. UA spoofing
The User-Agent (UA) identifies the client that sends a request, e.g. Mozilla/5.0 ....
The server reads the UA to tell what kind of client is making the request.
A request coming from a browser is treated by a portal site as normal traffic, but a request sent by a program (such as a crawler) carries a tell-tale UA, is easy to identify, and is likely to be refused.
So the first step of a crawler is UA spoofing: disguise the request as one coming from a browser.
# UA spoofing: pretend to be Chrome
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# Send the request with the spoofed headers
response = requests.get(url=url, params=param, headers=headers)
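To confirm which User-Agent is actually sent, httpbin.org can echo the request headers back; a minimal sketch (assumes network access to httpbin.org):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# httpbin.org/headers returns the headers it received, so the spoofed UA should show up in the output
print(requests.get('https://httpbin.org/headers', headers=headers).json())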
Baidu Translate example:
import json
import requests

if __name__ == '__main__':
    # Read the keyword to translate
    keyword = input('Enter a keyword: ')
    # POST parameters
    param = {
        'kw': keyword
    }
    # Target URL
    url = 'https://fanyi.baidu.com/sug'
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Send the request
    response = requests.post(url=url, data=param, headers=headers)
    # Parse the JSON response
    page_obj = response.json()
    print(page_obj)
    # Persist the result (the ./reptile/ directory must already exist)
    fileName = keyword + '.json'
    with open('./reptile/' + fileName, 'w', encoding='utf-8') as fp:
        json.dump(page_obj, fp=fp, ensure_ascii=False)
        print('Saved!')
Douban movies example:
import requests
import json

if __name__ == '__main__':
    # Query parameters
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '40',
        'limit': '20'
    }
    # Target URL
    url = 'https://movie.douban.com/j/chart/top_list'
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Send the request
    response = requests.get(url=url, params=param, headers=headers)
    # Parse the JSON response
    page_obj = response.json()
    # Persist the result (the ./reptile/ directory must already exist)
    fileName = 'douban_movies' + '.json'
    with open('./reptile/' + fileName, 'w', encoding='utf-8') as fp:
        json.dump(page_obj, fp=fp, ensure_ascii=False)
    print(fileName, 'saved!')
2. Data parsing
Once the data has been fetched it needs to be parsed. There are three common ways to parse it: regular expressions, bs4 and xpath.
Parsing always follows the same two steps: 1. locate the target tag; 2. extract the text or attribute inside that tag (a short comparison of the three approaches follows below).
Regular-expression parsing works in every language, not just Python.
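A minimal sketch of those two steps with all three approaches, on a made-up HTML fragment (assumes bs4 and lxml are installed):

import re
from bs4 import BeautifulSoup
from lxml import etree

html = '<div class="item"><a href="/detail/1">first</a></div>'

# Regex: the pattern locates the tag and extracts the attribute/text in one go
print(re.findall(r'<a href="(.*?)">(.*?)</a>', html))   # [('/detail/1', 'first')]

# bs4: locate with select()/find(), then read .text or ['attr']
a = BeautifulSoup(html, 'lxml').select('.item > a')[0]
print(a['href'], a.text)                                 # /detail/1 first

# xpath: locate with a path expression, extract with text() / @attr
tree = etree.HTML(html)
print(tree.xpath('//div[@class="item"]/a/@href')[0],
      tree.xpath('//div[@class="item"]/a/text()')[0])    # /detail/1 first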
Regex parsing example: images from wallhaven.cc
import re
import requests

if __name__ == '__main__':
    # Crawl image data from wallhaven.cc, one page per loop iteration
    # (raise the limit in the while condition to crawl more pages)
    # 1. Build the URL
    url = "https://wallhaven.cc/search?q=id%3A37&sorting=random&ref=fp&seed=tf50Sz&page="
    i = 1
    while i <= 1:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        ex = '<img alt="loading" class="lazyload" data-src="(.*?)" src="" >'
        # 2. Send the request
        page_text = requests.get(url=url + str(i), headers=headers).text
        # 3. Parse the data
        # re.S lets "." match newlines; re.M makes "^" and "$" match at every line instead of only the whole string
        imgs = re.findall(ex, page_text, re.S)
        for img in imgs:
            # Rewrite the thumbnail URL into the full-size image URL
            img = img.replace('small', 'full')
            img = img.replace('th.wallhaven.cc', 'w.wallhaven.cc')
            img = img.replace(
                img.split('/')[-1], 'wallhaven-' + img.split('/')[-1])
            # 4. Save the data (the ./imgs/ directory must already exist)
            with open('./imgs/' + img.split('/')[-1], 'wb') as fp:
                # .text returns the response as a string, .content as bytes, .json() as a parsed JSON object
                img_data = requests.get(url=img, headers=headers).content
                fp.write(img_data)
                print(img, 'downloaded!')
        # 5. Build the next page's URL and repeat steps 2-5
        i += 1
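A small illustration of what the re.S and re.M flags change, on a toy string (not part of the wallhaven code):

import re

html = '<div>\n  hello\n</div>'
# Without re.S, "." stops at the newline and nothing matches
print(re.findall('<div>(.*)</div>', html))        # []
# With re.S, "." also matches "\n", so the whole body is captured
print(re.findall('<div>(.*)</div>', html, re.S))  # ['\n  hello\n']
# With re.M, "^" and "$" match at every line, not only at the ends of the whole string
print(re.findall(r'^\s*(\w+)$', html, re.M))      # ['hello']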
bs4 parsing is specific to Python.
Its basic principle: instantiate a BeautifulSoup object for parsing HTML/XML; BeautifulSoup turns the HTML into a tree structure and provides methods for walking and querying that tree.
# bs4 basics
# bf is a BeautifulSoup object; here it is assumed to be built from the Baidu home page,
# whose navigation links use the .mnav class
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 ...'}  # same UA spoofing as above
page_text = requests.get('https://www.baidu.com', headers=headers).text
bf = BeautifulSoup(page_text, 'lxml')

# First element of a tag
print(bf.a)
print(bf.find('a'))
# All elements of a tag
print(bf.find_all('a'))
# Filter by tag attribute
print(bf.find_all('a', class_='mnav'))
# CSS selector
print(bf.select('.mnav'))
# Hierarchical selector: ">" means direct child, a space means any descendant
print(bf.select('.mnav > .c-color-t'))
# .text returns the text of the tag and all of its children
print(bf.select('div')[1].text)
# .string only returns the text directly inside the tag itself
print(bf.select('.mnav > .c-color-t')[0].string)
# Read an attribute of the tag
print(bf.select('.mnav > .c-color-t')[0]['href'])
bs4 crawling demo:
# encoding=utf-8
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Crawl the chapters of "Romance of the Three Kingdoms" from shicimingju.com
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    # The site declares the wrong encoding, so re-decode the body as UTF-8
    page_text = requests.get(url=url, headers=headers).text.encode(
        'iso-8859-1').decode('utf-8')
    # Parse with bs4
    soup = BeautifulSoup(page_text, 'lxml')
    a_list = soup.select('.book-mulu li a')
    for a in a_list:
        # Chapter title
        title = a.string
        # Chapter URL
        detail_url = 'https://www.shicimingju.com' + a['href']
        # Request the detail page and parse out the chapter text
        detail_page_text = requests.get(url=detail_url, headers=headers).text.encode(
            "iso-8859-1").decode("utf-8")
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        # Chapter content
        content = div_tag.text
        # Append to the output file
        with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
            fp.write(title + ':' + content + '\n')
xpath: the most convenient, most commonly used and efficient parsing method.
Principle: 1. instantiate an etree object and load the page source into it;
2. call the xpath() method on that object to locate tags and extract their attributes or text.
from lxml import etree
import requests

if __name__ == '__main__':
    # etree / xpath basics
    # Target page
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    page_text = requests.get(url=url, headers=headers).text.encode(
        'iso-8859-1').decode('utf-8')
    # Parse the page
    tree = etree.HTML(page_text)
    print(tree)
    # "//" matches any number of levels and can start from anywhere, "/" matches a direct child,
    # [@class="book-mulu"] filters by attribute value,
    # //li selects li tags, [3] is a 1-based index, /a/text() returns the text of the a tag
    text = tree.xpath('//div[@class="book-mulu"]//li[3]/a/text()')[0]
    print(text)
    # @attrName returns an attribute value
    img = tree.xpath('//div[@class="card bookmark-list"]//img/@src')[0]
    print(img)
58.com crawling demo:
from lxml import etree
import requests

if __name__ == '__main__':
    # Crawl second-hand housing listings from 58.com with xpath
    url = 'https://wz.58.com/ershoufang'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    page_text = requests.get(url=url, headers=headers).text
    # Parse the page
    tree = etree.HTML(page_text)
    # A list of element objects, one per listing
    div_list = tree.xpath(
        '//section[@class="list"]')[0].xpath('./div[@class="property"]')
    with open('./58.txt', 'a', encoding='utf-8') as fp:
        for div in div_list:
            # "./" makes the expression relative to the current element instead of the document root
            title = div.xpath(
                './a//div[@class="property-content-detail"]//h3/@title')[0]
            print(title)
            fp.write(title + '\n')
Crawling images:
from lxml import etree
import os
import requests

if __name__ == '__main__':
    # Crawl 5 pages of image data from pic.netbian.com
    url = "https://pic.netbian.com/4kfengjing/index.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    page = 1
    while page <= 5:
        # 1. Build this page's URL (page 1 has no "_n" suffix)
        if page == 1:
            new_url = url
        else:
            new_url = url.replace('.html', '_' + str(page) + '.html')
        print(new_url)
        page = page + 1
        page_text = requests.get(url=new_url, headers=headers).text
        tree = etree.HTML(page_text)
        link_list = tree.xpath('//div[@class="slist"]/ul/li//a/@href')
        for link in link_list:
            link = 'https://pic.netbian.com' + link
            # 2. Request the detail page
            page_text = requests.get(url=link, headers=headers).text
            tree2 = etree.HTML(page_text)
            # 3. Parse out the image URL
            img = tree2.xpath('//div[@class="photo-pic"]/a/img/@src')[0]
            # 4. Save the data (the ./imgs2/ directory must already exist)
            # Skip files that were already downloaded
            if os.path.exists('./imgs2/' + img.split('/')[-1]):
                print(img, 'already exists!')
                continue
            with open('./imgs2/' + img.split('/')[-1], 'wb') as fp:
                # .text returns a string, .content returns bytes, .json() returns a parsed JSON object
                down_url = 'https://pic.netbian.com' + img
                img_data = requests.get(url=down_url, headers=headers).content
                fp.write(img_data)
                print(img, 'downloaded!')
Logging in to a site that uses a CAPTCHA:
from chaojiying.chaojiying import Chaojiying_Client
import requests
from lxml import etree

if __name__ == '__main__':
    # CAPTCHA login test on gushiwen.cn
    url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    # Plain HTTP requests are stateless; to keep the login state we have to carry cookies,
    # so we create a session object that attaches the cookies to every request it sends
    session = requests.session()
    page_text = session.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # src of the CAPTCHA image
    code_img_src = 'https://so.gushiwen.cn' + \
        tree.xpath('//*[@id="imgCode"]/@src')[0]
    print(code_img_src)
    viwcode = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    with open('./yzm/yzm.jpg', 'wb') as fp:
        img_data = session.get(url=code_img_src, headers=headers).content
        fp.write(img_data)
    chaojiying = Chaojiying_Client(
        '15257741312', 'p6666666', '947550')  # Chaojiying account, password and the software ID generated in the user centre
    im = open('./yzm/yzm.jpg', 'rb').read()  # path to the local CAPTCHA image (on Windows the separators sometimes need to be //)
    obj = chaojiying.PostPic(im, 1004)  # 1004 is the Chaojiying CAPTCHA type code
    data = {
        '__VIEWSTATE': viwcode,
        '__VIEWSTATEGENERATOR': 'C93BE1AE',
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': '15257741312@163.com',
        'pwd': 'p6666666',
        'code': obj.get('pic_str'),
        'denglu': '登录'  # literal text of the login button
    }
    page_text = session.post(url=url, headers=headers, data=data).text
    with open('./yzm/1.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(page_text)
3. Proxy servers
Crawling one site too frequently is easy for the site to detect, and it may ban your IP. A proxy server helps here: our request goes to an intermediate server, which forwards it to the target server on our behalf; the target server returns the data to the intermediate server, which relays it back to us. That intermediate server is the proxy server.
Proxy IP types:
An HTTP request can only go through an HTTP proxy, and an HTTPS request only through an HTTPS proxy.
Proxy anonymity levels (a usage sketch follows this list):
Transparent: the target server knows the request came through a proxy and also knows the original client IP.
Anonymous: the target server knows the request came through a proxy but does not know the original client IP.
Elite (high anonymity): the target server neither knows a proxy is being used nor knows the original client IP.
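requests takes the proxy through its proxies parameter; a minimal sketch (the proxy address below is a placeholder, not a real server):

import requests

# Placeholder proxy address; replace it with a proxy you actually have access to
proxies = {
    'http': 'http://12.34.56.78:8888',
    'https': 'http://12.34.56.78:8888',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
# The target site sees the proxy's IP instead of ours; httpbin.org/ip echoes the caller's IP back
response = requests.get('https://httpbin.org/ip', headers=headers, proxies=proxies, timeout=5)
print(response.text)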
4. Asynchronous crawling
A plain crawler is synchronous and therefore slow; crawling asynchronously raises the throughput.
Ways to make a crawler asynchronous:
1. Multithreading / multiprocessing: threads and processes cannot be spawned without limit, so this is not recommended on its own.
2. Process pools / thread pools: a pool lets us control how often the system creates and destroys processes or threads, which lowers overhead; the drawback is that the pool size is fixed. Use in moderation.
3. Single thread + async coroutines: built on event_loop, coroutine and task.
event_loop: the event loop, an endless loop; we register functions on it and the loop runs them when their conditions are met.
coroutine: a coroutine object; we register coroutine objects on the event loop and the loop calls them.
task: a task, a further wrapper around a coroutine object that also tracks its state.
future: represents work that will run or has not run yet; in practice there is little difference from a task.
async: defines a coroutine; a method defined with the async keyword is not executed when called, it returns a coroutine object instead.
await: suspends execution at a blocking call so the event loop can run something else (a minimal example follows; the full demos are further below).
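A minimal sketch tying these terms together (same older get_event_loop style as the demos below; asyncio.sleep stands in for a real network call):

import asyncio


async def fetch(name):
    # await suspends this coroutine so the event loop can run the others meanwhile
    await asyncio.sleep(1)
    return name + ' done'


loop = asyncio.get_event_loop()
# Wrap the coroutine objects into tasks and register them on the loop
tasks = [asyncio.ensure_future(fetch(n)) for n in ['a', 'b', 'c']]
loop.run_until_complete(asyncio.wait(tasks))
for t in tasks:
    # The three tasks finish in about 1 second in total instead of 3, because they ran concurrently
    print(t.result())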
Thread pool video crawling example
import os
import random
import time
import requests
# Thread pool class
from multiprocessing.dummy import Pool
from lxml import etree


# Crawl one list page of videos from pearvideo.com
def do_get_video(i):
    # 1. Build the list-page URL
    url = "https://www.pearvideo.com/panorama_loading.jsp?start="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    num = 24 * i
    page_text = requests.get(url=url + str(num), headers=headers).text
    tree = etree.HTML(page_text)
    link_list = tree.xpath(
        '//li[@class="categoryem"]//div[@class="vervideo-bd"]/a/@href')
    for link in link_list:
        # Random number between 0 and 1, required by the videoStatus API
        rd = random.random()
        cont_id = link.split('_')[1]
        video_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + \
            cont_id + '&mrd=' + str(rd)
        # 2. Request the video status API (the Referer header is required)
        headers2 = {
            "Referer": "https://www.pearvideo.com/video_" + cont_id,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        # 3. Parse the data: the response is JSON, so parse it directly
        detail = requests.get(url=video_url, headers=headers2).json()
        # e.g. https://image.pearvideo.com/cont/20200622/cont-1681503-12410697.png
        img_src = detail['videoInfo']['video_image']
        # Last "/"-separated segment of the cover image URL
        img_name = img_src.split('/')[-1]
        # Second "-"-separated field, i.e. the content id
        img_name = img_name.split('-')[1]
        mid_src = "cont-" + img_name
        # e.g. https://video.pearvideo.com/mp4/adshort/20200622/1684135200625-15217507_adpkg-ad_hd.mp4
        video_src = detail['videoInfo']['videos']['srcUrl']
        # Take the file-name part of the URL, then its first "-"-separated field,
        # and replace that prefix with "cont-<id>" to get the real video URL
        replace_str = video_src.split('/')[-1]
        replace_str = replace_str.split('-')[0]
        tar_src = video_src.replace(replace_str, mid_src)
        # 4. Save the data
        if not os.path.exists('./video'):
            os.mkdir('./video')
        video_data = requests.get(url=tar_src, headers=headers2).content
        video_path = './video/' + img_name + '.mp4'
        with open(video_path, 'wb') as fp:
            fp.write(video_data)
            print(img_name, 'downloaded')


if __name__ == '__main__':
    start_time = time.time()
    # Thread pool with 3 worker threads
    pool = Pool(3)
    # Run do_get_video(1), (2), (3) in the pool; map blocks until all of them finish
    pool.map(do_get_video, [1, 2, 3])
    # Shut the pool down and wait for the workers to exit
    pool.close()
    pool.join()
    end_time = time.time()
    print("Total time:", end_time - start_time)
Thread pool image crawling example
import os
import re
import time
import requests
# Thread pool class
from multiprocessing.dummy import Pool


# Crawl page i of image data from wallhaven.cc
def do_get_img(i):
    # 1. Build the URL
    url = "https://wallhaven.cc/search?q=id%3A37&sorting=random&ref=fp&seed=tf50Sz&page="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    ex = '<img alt="loading" class="lazyload" data-src="(.*?)" src="" >'
    # 2. Send the request
    page_text = requests.get(url=url + str(i), headers=headers).text
    # 3. Parse the data
    # re.S lets "." match newlines; re.M makes "^" and "$" match at every line instead of only the whole string
    imgs = re.findall(ex, page_text, re.S)
    for img in imgs:
        # Rewrite the thumbnail URL into the full-size image URL
        img = img.replace('small', 'full')
        img = img.replace('th.wallhaven.cc', 'w.wallhaven.cc')
        img = img.replace(
            img.split('/')[-1], 'wallhaven-' + img.split('/')[-1])
        # 4. Save the data (the ./imgs/ directory must already exist)
        # Skip files that were already downloaded
        if os.path.exists('./imgs/' + img.split('/')[-1]):
            print(img, 'already exists!')
            continue
        with open('./imgs/' + img.split('/')[-1], 'wb') as fp:
            # .text returns a string, .content returns bytes, .json() returns a parsed JSON object
            print("downloading...")
            img_data = requests.get(url=img, headers=headers).content
            fp.write(img_data)
            print(img, 'downloaded!')


if __name__ == '__main__':
    start_time = time.time()
    # Thread pool with 6 worker threads
    pool = Pool(6)
    # Crawl pages 1-6 concurrently; map blocks until all of them finish
    pool.map(do_get_img, [1, 2, 3, 4, 5, 6])
    end_time = time.time()
    print("Total time:", end_time - start_time)
Coroutine notes:
import asyncio


async def req():
    print("req start")
    print("waiting......")
    return "return value"


test = req()
# Create the event loop
loop = asyncio.get_event_loop()
# Register the coroutine object on the loop and start the loop
# loop.run_until_complete(test)

# A task is a further wrapper around the coroutine object
# task = loop.create_task(test)
# print(task)
# print("before run.....")
# # Run the loop
# loop.run_until_complete(task)
# print("after run.....")
# print(task)

# A future only differs from a task in how it is created
# future = asyncio.ensure_future(test)
# print(future)
# print("future before run...")
# loop.run_until_complete(future)
# print("future after run...")
# print(future)


# Bind a callback to the task object
def callback_func(task):
    print("print the coroutine's return value")
    print(task.result())


# Wrap the coroutine object into a task
task = loop.create_task(test)
# Bind the callback; it runs when the task finishes
task.add_done_callback(callback_func)
# Register the task on the loop and run it
loop.run_until_complete(task)
Coroutine demo (aiohttp)
import asyncio
import time
import aiohttp


async def req(url):
    print("preparing...")
    # requests.get(url) is a synchronous call: it would block the whole thread and stop
    # the coroutines from running concurrently, so an async HTTP client must be used instead.
    # aiohttp is such an async network request module.
    async with aiohttp.ClientSession() as session:
        # session supports get()/post(), e.g. get(url, params={}, headers={})
        async with session.get(url) as res:
            # text() returns the response as a string
            # read() returns the response as bytes
            # json() returns a parsed JSON object
            # Note: reading the response must be awaited explicitly
            page_text = await res.text()
            print("done..." + page_text)


tasks = []
# URLs served by a local test server (not included here)
urls = ['http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom', 'http://127.0.0.1:5000/jerry']
time1 = time.time()
for url in urls:
    c = req(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
time2 = time.time()
print(time2 - time1)