Simple Crawlers
These techniques target sites whose anti-crawling measures are immature or absent, so they are good for practice.
Libraries used
- urllib
- json: for handling JSON
Page requests
- GET
- POST
- AJAX
- simulated login
Exception handling
- HTTPError
- URLError
Simple examples:
Baidu Translate
GET and POST requests
import urllib.parse
import urllib.request
import json

url = 'http://fanyi.baidu.com/?aldtype=16047#en/zh/'
# Pretend to be a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2626.76 Safari/537.36'
}
# Simple GET request
# response = urllib.request.urlopen(url=url)
#
# content = response.read().decode('utf-8')
# # Parse the returned data
# # xpath / regex / BeautifulSoup
# print(content)

# Simple POST request
post_url = 'http://fanyi.baidu.com/sug'
# The form fields of a POST request go in the data parameter
data = {
    'kw': 'baby'
}
# POST data must be url-encoded with urlencode and then converted to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# Send the request disguised as a browser
request = urllib.request.Request(url=post_url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
json_obj = json.loads(content)
print(json_obj)
# Save the response to result.json
# A with context manager is recommended; see the sketch below
json.dump(json_obj, open('result.json', 'w', encoding='utf-8'), ensure_ascii=False)
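The one-liner above never closes the file handle explicitly. A minimal with-based version of the same save, reusing json_obj from the block above, looks like this:

import json

# The with block closes the file even if json.dump raises
with open('result.json', 'w', encoding='utf-8') as fp:
    json.dump(json_obj, fp, ensure_ascii=False)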
Images, videos, and pages
import urllib.request

# Save a page with urlretrieve
url = 'http://www.baidu.com'
urllib.request.urlretrieve(url, 'baidu.html')
# Image
img_url = 'http://img4.imgtn.bdimg.com/it/u=2813434517,83142011&fm=200&gp=0.jpg'
urllib.request.urlretrieve(img_url, 'love.jpg')
# Video download
video_url = 'http://vliveachy.tc.qq.com/om.tc.qq.com/Az8VBMvmOaZ0-uMvijuncOt2WA-hjsfSbUUQjtYZv4Oo/j06552sg6he.p712.1.mp4?sdtfrom=v1105&guid=dabdf0bb81db817f535700efd44a086c&vkey=8722EBFA35A66E250EF94BA72CFA5FC0776AAA68D4855DA156731D9A83B1B6CA3B664F810509D918F5BD577EDFB07C612A8F2760966F776A10554826996CB3A0FBBE8EA6B17450FF173DA43B607B14A3DE1EB9B1EF12D93A1526FE7F8D6571B06C534E987D251DAB8B2B2B7F6D903997DD1D8AC267D77FEF&ocid=2603685292&ocid=760288684'
urllib.request.urlretrieve(video_url, 'video.mp4')
# Save a page manually with urlopen
response = urllib.request.urlopen(url=url)
content = response.read().decode('utf-8')
with open('baidu2.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
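urlretrieve also accepts a reporthook callback, which is useful for large downloads such as the video above. A minimal progress sketch, reusing img_url from the block above ('love2.jpg' is an arbitrary output name):

def report(block_num, block_size, total_size):
    # Called once per downloaded chunk; total_size is -1 if the
    # server sends no Content-Length header
    if total_size > 0:
        percent = min(block_num * block_size * 100 / total_size, 100)
        print('\rdownloaded {:.1f}%'.format(percent), end='')

urllib.request.urlretrieve(img_url, 'love2.jpg', reporthook=report)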
KFC (AJAX request)
import urllib.request
import urllib.parse
import json
import os

post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
# Goal: for a user-supplied range of page numbers, save each page of
# restaurant data as its own JSON file
start_page = int(input('Start page: '))
end_page = int(input('End page: '))
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
# Make sure the output directory exists
os.makedirs('KFC', exist_ok=True)
for page in range(start_page, end_page + 1):
    p = str(page)
    data = {
        'cname': '北京',  # Beijing
        'pid': '',
        'pageIndex': p,
        'pageSize': '10',
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url=post_url, data=data, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    # Parse first, then dump, so the file holds a JSON object
    # rather than one big escaped string
    json_obj = json.loads(content)
    filename = 'KFC/{}.json'.format(p)
    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(json_obj, fp, ensure_ascii=False)
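The field names inside the response are not documented here, so before writing any parsing code it helps to load one saved page and print its top-level structure. A quick exploration sketch, assuming page 1 was in the crawled range:

import json

with open('KFC/1.json', encoding='utf-8') as fp:
    page_data = json.load(fp)
# Print the top-level keys (or the first element, if it is a list)
# to discover the actual field names
print(list(page_data) if isinstance(page_data, dict) else page_data[:1])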
Baidu Tieba (a more involved GET request)
import urllib.request
import urllib.parse
import os

def create_request(barname, page, base_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    # Tieba paginates 50 posts per page via the pn parameter
    p = (page - 1) * 50
    data = {
        'kw': barname,
        'pn': p
    }
    # GET parameters are url-encoded and appended to the URL
    data = urllib.parse.urlencode(data)
    url = base_url + data
    request = urllib.request.Request(url=url, headers=headers)
    return request

def request_data(request):
    return urllib.request.urlopen(request)

def parse_response(response):
    return response.read().decode('utf-8')

def save_data(content, file_path, file_name):
    path = os.path.join(file_path, file_name)
    try:
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(content)
    except Exception as e:
        print(e)
    else:
        print(file_name + ' saved')

if __name__ == '__main__':
    base_url = 'http://tieba.baidu.com/f?'
    barname = input('Forum name: ')
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))
    file_path = 'tieba'
    # Make sure the output directory exists
    os.makedirs(file_path, exist_ok=True)
    for page in range(start_page, end_page + 1):
        # Build a request object
        request = create_request(barname, page, base_url)
        # Send the request
        response = request_data(request)
        # Decode the response
        content = parse_response(response)
        # Save locally
        file_name = str(page) + '.html'
        save_data(content, file_path, file_name)
    print('all pages saved')
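create_request uses urlencode, which encodes a whole dict of parameters into a query string; for a single value, urllib.parse.quote does the percent-encoding directly. A quick comparison, reusing '北京' from the KFC example as a sample non-ASCII value:

import urllib.parse

# urlencode: dict -> 'kw=%E5%8C%97%E4%BA%AC&pn=0'
print(urllib.parse.urlencode({'kw': '北京', 'pn': 0}))
# quote: single string -> '%E5%8C%97%E4%BA%AC'
print(urllib.parse.quote('北京'))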
Exception handling
Handle the common errors that crawler requests raise, so the script can keep running instead of crashing.
import urllib.request
from urllib.error import HTTPError, URLError

# A deliberately bad path, so the request raises HTTPError (404)
url = "http://maoyan.com/films1"
try:
    response = urllib.request.urlopen(url)
    content = response.read().decode('utf-8')
    print(content)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first
    print('HTTPError')
    print(e)
except URLError as e:
    print('URLError')
    print(e)
except Exception as e:
    print('unknown exception')
    print(e)
else:
    print('ok')
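HTTPError carries more detail than its string form; e.code, e.reason, and e.headers are part of the standard urllib.error API. A small sketch, reusing the same bad URL:

import urllib.request
from urllib.error import HTTPError

try:
    urllib.request.urlopen('http://maoyan.com/films1')
except HTTPError as e:
    # Status code (e.g. 404), reason phrase, and the response headers
    print(e.code, e.reason)
    print(e.headers)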
# A second example: opening a file read-only and then writing to it
# deliberately raises an error for the except branch to catch
try:
    fp = open('1.txt', 'r', encoding='utf-8')
    fp.write('hahahaha')
except Exception as e:
    print(e)
Cookie login
About cookies
A cookie is a session mechanism stored on the client. It can record user behavior, and sites also use it as an anti-crawling measure.
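A minimal sketch of the mechanism: send one request through a cookie-aware opener and print whatever cookies the server set (http://www.baidu.com is just a convenient target):

import urllib.request
from http.cookiejar import CookieJar

jar = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
opener.open('http://www.baidu.com')
# Each Cookie object records a name/value pair the server wrote
for c in jar:
    print(c.name, '=', c.value)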
Consider: some pages require a login. Copying the cookie from a packet capture into headers does let us fetch the page, but capturing and copying the cookie by hand before every run defeats the point of a crawler. Instead we can use a handler.
handler
urllib.request provides several handler classes, including HTTPHandler, HTTPCookieProcessor, and ProxyHandler.
Conclusion: whenever a request needs cookies or a proxy configured, build an opener from the appropriate handler and send the request through it.
import urllib.request
from http.cookiejar import CookieJar

# A CookieJar automatically stores the cookies a server writes back;
# it is put to work in the Renren example below
cookie = CookieJar()

# 1. The basic HTTPHandler flow
# Build a handler object
handler = urllib.request.HTTPHandler()
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
# Send a request through the opener; opener.open plays the role of
# urllib.request.urlopen, but routes the request through the configured handlers
request = urllib.request.Request(url='http://www.baidu.com')
response = opener.open(request)
content = response.read().decode('utf-8')
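The conclusion above also mentions proxies; ProxyHandler follows the same handler-to-opener pattern. A minimal sketch, assuming a hypothetical HTTP proxy listening at 127.0.0.1:8888:

# ProxyHandler maps a URL scheme to a proxy address;
# the address here is a placeholder, not a real proxy
proxy_handler = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8888'})
proxy_opener = urllib.request.build_opener(proxy_handler)
# Every request sent through proxy_opener now goes via the proxy
response = proxy_opener.open('http://www.baidu.com')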
Renren
import urllib.request
import urllib.parse
from http.cookiejar import CookieJar

# A CookieJar automatically stores the cookies the server writes back
cookie = CookieJar()
# HTTPCookieProcessor turns the CookieJar into a handler
handler = urllib.request.HTTPCookieProcessor(cookie)
# Send every Renren request through opener.open, since the opener carries the cookies
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
}
# Handle the login first
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201842926368'
data = {
    # Fill in your own account details here
    "email": "dqsygcz@126.com",
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "bc95dbb4e9cfbd8f3b84e6cd1820d210bf6953f4d70630f73a750369c166e61d",
    "rkey": "932a8efd469514758b8787a3631dd539",
}
data = urllib.parse.urlencode(data).encode('utf-8')
post_request = urllib.request.Request(url=post_url, data=data, headers=headers)
# This call logs in and records the session cookie in the jar
opener.open(post_request)

url = 'http://www.renren.com/224549540/profile'
request = urllib.request.Request(url=url, headers=headers)
# If you skip the login request and use the opener to open the profile
# page directly, the request fails, which shows no cookie was saved
response = opener.open(request)
content = response.read().decode('utf-8')
with open('xiaonei2.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
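A plain CookieJar lives only in memory, so the login above must be repeated on every run. http.cookiejar.MozillaCookieJar can persist the cookies to disk instead. A minimal sketch ('cookies.txt' is an arbitrary filename):

import urllib.request
from http.cookiejar import MozillaCookieJar

jar = MozillaCookieJar('cookies.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
# ... log in through this opener as above, then persist the session:
jar.save(ignore_discard=True, ignore_expires=True)
# On the next run, restore the session instead of logging in again:
jar.load(ignore_discard=True, ignore_expires=True)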