来自北京图灵学院刘英。
参考资料:
Python网络数据采集,图灵工业出版
精通Python爬虫框架Scrapy,人民邮电出版社
Python3网络爬虫:http://blog.csdn.net/c406495762/article/details/72858983
Scrapy官方教程:http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
预备知识:
URL
HTTP协议
web前端:html,css,js
ajax
re,xpath
xml
一、爬虫简介:
爬虫定义:网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或脚本。
另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。
两大特征:
能按作者要求下载数据或内容
能自动在网上流窜
三大步骤:
下载网页
提取正确的信息
根据一定规则自动跳到另外网页上执行上两步
爬虫分类:
通用爬虫
专用爬虫(聚焦爬虫)
Python网络包简介:
Python2:urllib、urllib2、urllib3、httplib、httplib2、requests
Python3:urllib、urllib3、httplib2、requests
Python2:urllib和urllib2配合使用,或requests
Python3:urllib、requests
二、urllib
包含模块:
urllib.request:打开和读取urls
urllib.error:包含urllib.request产生的常见的错误,使用try捕捉
urllib.parse:包含解析url的方法(见案例4)
urllib.robotparser:解析robots.txt文件
案例1:
# Case 1: fetch a page with urllib.request and decode the response body.
from urllib import request


def main():
    url = 'http://sogo.com'
    rsp = request.urlopen(url)
    print(type(rsp))   # <class 'http.client.HTTPResponse'>
    html = rsp.read()
    print(type(html))  # <class 'bytes'>
    html = html.decode()
    print(type(html))  # <class 'str'>
    print(html)


if __name__ == '__main__':
    main()
案例2:使用chardet自动检测字符编码
# Case 2: auto-detect the page's character encoding with chardet.
from urllib import request

import chardet  # third-party: pip install chardet


def main():
    url = 'http://sogo.com'
    rsp = request.urlopen(url)
    html = rsp.read()

    charset = chardet.detect(html)
    print(type(charset))  # <class 'dict'>
    print(charset)        # e.g. {'encoding': 'utf-8', 'confidence': ...}
    print(charset.get('encoding'))

    # dict.get() with a default never raises KeyError if detection failed.
    html = html.decode(charset.get('encoding', 'utf-8'))
    print(html)


if __name__ == '__main__':
    main()
案例3:
# Case 3: inspect the response object: final URL, status code, headers.
from urllib import request


def main():
    url = 'http://sogo.com'
    rsp = request.urlopen(url)
    print(rsp.geturl())   # https://www.sogo.com/  (final URL after redirects)
    print(rsp.getcode())  # 200
    print(rsp.info())     # response headers; sample output below
    '''
    Server: nginx
    Date: Sun, 01 Dec 2019 14:53:13 GMT
    Content-Type: text/html; charset=utf-8
    Transfer-Encoding: chunked
    Connection: close
    Vary: Accept-Encoding
    Set-Cookie: ABTEST=3|1575211993|v17; expires=Tue, 31-Dec-19 14:53:13 GMT; path=/
    P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
    Set-Cookie: IPLOC=CN3201; expires=Mon, 30-Nov-20 14:53:13 GMT; domain=.sogo.com; path=/
    P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
    Set-Cookie: SUID=B2E70270541C940A000000005DE3D3D9; expires=Sat, 26-Nov-2039 14:53:13 GMT; domain=.sogo.com; path=/
    P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
    x-log-ext: nodejs=1
    Set-Cookie: black_passportid=; path=/; expires=Thu, 01 Jan 1970 00:00:00 GMT; domain=.sogo.com
    Pragma: No-cache
    Cache-Control: max-age=0
    Expires: Sun, 01 Dec 2019 14:53:13 GMT
    '''


if __name__ == '__main__':
    main()
案例4:parse解析url参数(下面两个例子打印的url可以在浏览器访问,在代码中运行无结果,可能网站采用了反爬虫技术)
# Case 4a: build a GET query string with urllib.parse and search sogou.
from urllib import request, parse


def main():
    url = 'https://www.sogou.com/web?'
    wd = input('请输入要搜索的关键字:')  # example input: 黄山
    # Define the query parameters as a dict, then URL-encode them.
    data = {
        'query': wd
    }
    data = parse.urlencode(data)
    print(type(data))  # <class 'str'>
    print(data)        # query=%E9%BB%84%E5%B1%B1
    url += data
    print(url)         # https://www.sogou.com/web?query=%E9%BB%84%E5%B1%B1
    rsp = request.urlopen(url)
    html = rsp.read().decode()
    print(html)


if __name__ == '__main__':
    main()
# Case 4b: same technique against baidu (parameter name is 'wd').
from urllib import request, parse


def main():
    url = 'https://www.baidu.com/s?'
    wd = input('请输入要搜索的关键字:')  # example input: 长城
    # Define the query parameters as a dict, then URL-encode them.
    data = {
        'wd': wd
    }
    data = parse.urlencode(data)
    print(type(data))  # <class 'str'>
    print(data)        # wd=%E9%95%BF%E5%9F%8E
    url += data
    print(url)         # https://www.baidu.com/s?wd=%E9%95%BF%E5%9F%8E
    rsp = request.urlopen(url)
    html = rsp.read()
    html = html.decode()
    print(html)


if __name__ == '__main__':
    main()
案例5:post请求:参数放在请求体中而不是URL里,相对get更不易泄露(但并未加密)
# Case 5a: POST request — parameters travel in the request body.
from urllib import request, parse
import json


def main():
    base_url = 'http://fanyi.baidu.com/sug'
    data = {'kw': 'girl'}
    data = parse.urlencode(data)
    print(type(data))  # <class 'str'>
    data = data.encode('utf-8')  # POST data must be bytes, not str
    print(type(data))  # <class 'bytes'>
    rsp = request.urlopen(base_url, data=data)
    html = rsp.read().decode()
    print(type(html))  # <class 'str'>
    html = json.loads(html)
    print(type(html))  # <class 'dict'>
    print(html)  # observed: {'errmsg': '参数错误', 'errno': 1001}


if __name__ == '__main__':
    main()
# Case 5b: same POST, but via a Request object carrying explicit headers.
from urllib import request, parse
import json


def main():
    base_url = 'http://fanyi.baidu.com/sug'
    data = {'kw': 'girl'}
    data = parse.urlencode(data)
    print(type(data))  # <class 'str'>
    data = data.encode('utf-8')  # POST data must be bytes
    print(type(data))  # <class 'bytes'>
    headers = {'Content-Length': len(data)}
    req = request.Request(base_url, data=data, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(type(html))  # <class 'str'>
    html = json.loads(html)
    print(type(html))  # <class 'dict'>
    print(html)  # observed: {'errmsg': '参数错误', 'errno': 1001}


if __name__ == '__main__':
    main()
error模块:HTTPError,URLError
# urllib.error demo: HTTPError (server replied with an error status)
# vs URLError (request never completed, e.g. DNS failure).
from urllib import request, error


def main():
    # url = 'http://sogo.com'                       # succeeds
    # url = 'http://sogooooooooooooooooo.com'       # triggers URLError
    url = 'http://www.sipo.gov.cn/www'              # triggers HTTPError (404)
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        # Observed output for the 404 URL:
        #   HTTPError:Not Found
        #   HTTPError:HTTP Error 404: Not Found
        print('HTTPError:{}'.format(e.reason))
        print('HTTPError:{}'.format(e))
    except error.URLError as e:
        print('URLError:{}'.format(e.reason))  # e.g. [Errno 11001] getaddrinfo failed
        print('URLError:{}'.format(e))
    except Exception as e:
        print(e)


if __name__ == '__main__':
    main()
User-Agent
# Setting a User-Agent so the request looks like a real browser.
from urllib import request, error


def main():
    url = 'http://sogo.com'
    try:
        # Option 1: pass a headers dict to Request.
        '''
        headers = {}
        headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
        req = request.Request(url, headers=headers)
        '''
        # Option 2: add the header after creating the Request.
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0')

        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
    print('Done!')


if __name__ == '__main__':
    main()
代理 proxy
# Routing requests through an HTTP proxy.
from urllib import request, error


def main():
    url = 'http://sogo.com'

    # Free public proxy lists: www.xicidaili.com, www.goubanjia.com

    # Steps to use a proxy:
    # 1. proxy address
    proxy = {'http': '223.199.31.116:9999'}
    # 2. create a ProxyHandler
    proxy_handler = request.ProxyHandler(proxy)
    # 3. build an opener that routes through it
    opener = request.build_opener(proxy_handler)
    # 4. install it globally so plain request.urlopen() uses the proxy
    request.install_opener(opener)

    try:
        rsp = request.urlopen(url)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        # Original caught BaseException, which would also swallow
        # KeyboardInterrupt; Exception is the safer last resort.
        print(e)


if __name__ == '__main__':
    main()
cookie和session
由于http协议的无记忆性,人们为了弥补这个缺憾,所采用的一个补充协议。
cookie是发给用户(即浏览器)的一段信息,session是保存在服务器上的另一半信息,用来记录用户信息。
cookie和session区别:存放位置不同,cookie在用户端,session在服务器端。cookie不安全
# Fetch a renren profile page (no login) and save the HTML to disk.
from urllib import request


def main():
    url = 'http://www.renren.com/973090887/profile'

    # Option 1: go through a Request object.
    # req = request.Request(url)
    # rsp = request.urlopen(req)

    # Option 2: pass the URL directly.
    rsp = request.urlopen(url)

    html = rsp.read().decode('utf-8')
    # encoding='utf-8' keeps the write from failing on platforms whose
    # default locale encoding (e.g. GBK on Windows) can't represent the page.
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)


if __name__ == '__main__':
    main()
直接复制cookie,注:headers={'Cookie':''}也可小写的cookie如headers={'cookie':''}
# Reuse a logged-in session by pasting the browser's Cookie header verbatim.
from urllib import request


def main():
    url = 'http://www.renren.com/973090887/profile'
    headers = {'Cookie': 'anonymid=k46zbise-53dqfd; depovince=GW; jebecookies=b42c5959-ff61-401a-9163-06036762bca2|||||; _r01_=1; ick_login=cc725645-63f4-402a-86e5-6ec548cdf7db; t=83b415286308687c31207afe721529db7; societyguester=83b415286308687c31207afe721529db7; id=973090887; xnsid=f66a859a; JSESSIONID=abc9MR0JPSynRciKxLj8w; ver=7.0; loginfrom=null; jebe_key=8757b4d5-eed1-4a7a-b767-a881d17faa92%7Ca98b442c97be77cf055d58a8be6ee9ca%7C1576413790880%7C1%7C1576413789040; jebe_key=8757b4d5-eed1-4a7a-b767-a881d17faa92%7Ca98b442c97be77cf055d58a8be6ee9ca%7C1576413790880%7C1%7C1576413789050; wp_fold=0'}
    req = request.Request(url, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    # encoding='utf-8' keeps the write from failing on platforms whose
    # default locale encoding can't represent the page.
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)


if __name__ == '__main__':
    main()
http模块包含一些关于cookie的模块,通过它们可以自动使用cookie
CookieJar
管理存储cookie,向传出的http请求添加cookie
cookie存储在内存中,CookieJar实例回收后cookie将消失
FileCookieJar(filename,delayload=None,policy=None)
使用文件管理cookie
filename是保存cookie的文件
MozillaCookieJar(filename,delayload=None,policy=None)
创建与mozilla浏览器cookie.txt兼容的FileCookieJar实例
LwpCookieJar
创建与libwww-perl标准兼容的Set-Cookie3格式的FileCookieJar实例
关系:CookieJar -> FileCookieJar -> MozillaCookieJar & LwpCookieJar
# Log in with a CookieJar: the opener automatically stores the session
# cookies from login() and sends them back in getHomePage().
from urllib import request, parse
from http import cookiejar

# Create the CookieJar instance (cookies live only in memory).
cookie = cookiejar.CookieJar()
# Cookie handler: stores/attaches cookies on requests made via the opener.
cookie_handler = request.HTTPCookieProcessor(cookie)
# HTTP request handler.
http_handler = request.HTTPHandler()
# HTTPS request handler.
https_handler = request.HTTPSHandler()
# Build the opener from all three handlers.
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def login():
    """POST the login form; the opener captures the session cookies."""
    url = 'http://www.renren.com/PLogin.do'
    data = {'email': '13119144223', 'password': '123456'}
    data = parse.urlencode(data)
    print('dataAfterUrlEncode=', data)
    req = request.Request(url, data=data.encode())
    rsp = opener.open(req)
    print('cookie===>>>', cookie)
    print('cookie dir--->>>', dir(cookie))
    for i in cookie:
        print(i)


def getHomePage():
    """Fetch the profile page using the cookies captured by login()."""
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)


def main():
    login()
    getHomePage()


if __name__ == '__main__':
    main()
cookie保存到文件
# Same login flow, but persist the cookies to cookie.txt with
# MozillaCookieJar so a later run can reuse them.
from urllib import request, parse
from http import cookiejar

# FileCookieJar bound to cookie.txt (Mozilla cookies.txt format).
cookie = cookiejar.MozillaCookieJar('cookie.txt')
# Cookie handler: stores/attaches cookies on requests made via the opener.
cookie_handler = request.HTTPCookieProcessor(cookie)
# HTTP request handler.
http_handler = request.HTTPHandler()
# HTTPS request handler.
https_handler = request.HTTPSHandler()
# Build the opener from all three handlers.
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def login():
    """POST the login form, then save the captured cookies to disk."""
    url = 'http://www.renren.com/PLogin.do'
    data = {'email': '13119144223', 'password': '123456'}
    data = parse.urlencode(data)
    print('dataAfterUrlEncode=', data)
    req = request.Request(url, data=data.encode())
    rsp = opener.open(req)
    print('cookie===>>>', cookie)
    print('cookie dir--->>>', dir(cookie))
    for i in cookie:
        print(i)

    # Save cookies even if they are flagged to be discarded or are expired.
    cookie.save(ignore_discard=True, ignore_expires=True)


def getHomePage():
    """Fetch the profile page using the cookies captured by login()."""
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)


def main():
    login()
    getHomePage()


if __name__ == '__main__':
    main()
cookie.txt文件内容:
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.renren.comTRUE/FALSE1607525840_de420A8DC764CD1624FC7C8526DA9A3A25
.renren.comTRUE/FALSE1734101840anonymidk474v1fq-iqokun
.renren.comTRUE/FALSEfirst_login_flag1
.renren.comTRUE/FALSEid965187997
.renren.comTRUE/FALSE1579013840ln_hurlhttp://head.xiaonei.com/photos/0/0/men_main.gif
.renren.comTRUE/FALSE1579013840ln_uact13119144223
.renren.comTRUE/FALSEloginfromnull
.renren.comTRUE/FALSEpf089872e2b5af59fd90305191668f7c27
.renren.comTRUE/FALSEsocietyguester93d42a597819c5e50b77a188d04bf43d7
.renren.comTRUE/FALSEt93d42a597819c5e50b77a188d04bf43d7
.renren.comTRUE/FALSEver7.0
.renren.comTRUE/FALSExnsidf5efa6dd
.renren.comTRUE/xtalk/FALSEt20f31c075aa6b30a902235ff55699182
www.renren.comFALSE/FALSEJSESSIONIDabcvLdUTFcjFkh53fek8w
cookie.load()
# Reuse a previous session: load cookies from cookie.txt instead of
# logging in again.
from urllib import request, parse
from http import cookiejar

# Create the jar, then load the cookies saved by the previous script.
cookie = cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
# Cookie handler: attaches the loaded cookies to outgoing requests.
cookie_handler = request.HTTPCookieProcessor(cookie)
# HTTP request handler.
http_handler = request.HTTPHandler()
# HTTPS request handler.
https_handler = request.HTTPSHandler()
# Build the opener from all three handlers.
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def getHomePage():
    """Fetch the profile page; the loaded cookies authenticate us."""
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)


def main():
    getHomePage()


if __name__ == '__main__':
    main()
简单的访问
# Plain fetch of the 12306 site (fails if its SSL cert can't be verified).
from urllib import request


def main():
    url = 'https://www.12306.cn/mormhweb'
    rsp = request.urlopen(url)
    html = rsp.read().decode()
    print(html)


if __name__ == '__main__':
    main()
若有SSL未认证问题用这个:
# Workaround for SSL certificate-verification failures.
from urllib import request
import ssl


def main():
    # Replace the default (verifying) HTTPS context with an unverified one.
    # WARNING: this disables certificate checks — use only for testing.
    ssl._create_default_https_context = ssl._create_unverified_context

    url = 'https://www.12306.cn/mormhweb'
    rsp = request.urlopen(url)
    html = rsp.read().decode()
    print(html)


if __name__ == '__main__':
    main()
复制浏览器F12相关信息,访问有道(youdao)post请求:
# POST to Youdao translate, replaying form data and headers captured from
# the browser's F12 network panel (salt/sign/ts are hard-coded snapshots).
from urllib import request, parse


def youdao():
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'

    # Form fields copied verbatim from the browser request.
    data = {
        'i': 'girl',
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15764998714040',
        'sign': '6d154aa4d240a327c1b00a7265ee0c42',
        'ts': '1576499871404',
        'bv': 'e2a78ed30c66e16a857c5b6486a1d326',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }

    data = parse.urlencode(data).encode()

    # Headers copied verbatim from the browser request.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Content-Length': '237',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; OUTFOX_SEARCH_USER_ID=878677551@112.20.83.126; JSESSIONID=abccfVgNRwMree-0iOo8w; OUTFOX_SEARCH_USER_ID_NCOO=629431719.4616601; _ntes_nnid=cdfe6631d454b95eb3fe744ddcd37a9d,1576498433752; ___rl__test__cookies=1576499871397',
        'Host': 'fanyi.youdao.com',
        'Origin': 'http://fanyi.youdao.com',
        'Referer': 'http://fanyi.youdao.com/?keyfrom=dict2.top',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    req = request.Request(url, data=data, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)


if __name__ == '__main__':
    youdao()
修改上面代码,获取加密算法:
获取JS代码:
粘贴到在线工具:https://tool.oschina.net/codeformat/js,或换个格式化好了的且可复制的浏览器(如智慧联想浏览器)
找到salt,sign:
通过浏览器控制台查看JS代码功能:
# POST to Youdao translate, recomputing salt/sign in Python by porting the
# site's JavaScript signing code.
from urllib import request, parse
import hashlib
import random
import time


def getSalt():
    """Port of the JS salt: current millis plus a random digit.

    JS source:
        salt: i,
        i = r + parseInt(10 * Math.random(), 10);
        r = "" + (new Date).getTime(),
    """
    return int(time.time() * 1000) + random.randint(0, 10)


def getMD5(v):
    """Return the hex MD5 digest of string v."""
    md5 = hashlib.md5()
    md5.update(v.encode())
    return md5.hexdigest()


def getSign(key, salt):
    """Port of the JS sign: md5(client + key + salt + secret).

    JS source:
        sign: n.md5("fanyideskweb" + e + i + "n%A-rKaT5fb[Gy?;N5@Tj")
    """
    return getMD5(''.join(('fanyideskweb', key, str(salt), 'n%A-rKaT5fb[Gy?;N5@Tj')))


def youdao(key):
    """Translate `key` via the Youdao endpoint with a fresh salt/sign pair."""
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'

    salt = getSalt()
    data = {
        'i': key,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': str(salt),
        'sign': getSign(key, salt),
        'ts': '1576499871404',
        'bv': 'e2a78ed30c66e16a857c5b6486a1d326',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }

    data = parse.urlencode(data).encode()

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Content-Length': len(data),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; OUTFOX_SEARCH_USER_ID=878677551@112.20.83.126; JSESSIONID=abccfVgNRwMree-0iOo8w; OUTFOX_SEARCH_USER_ID_NCOO=629431719.4616601; _ntes_nnid=cdfe6631d454b95eb3fe744ddcd37a9d,1576498433752; ___rl__test__cookies=1576499871397',
        'Host': 'fanyi.youdao.com',
        'Origin': 'http://fanyi.youdao.com',
        'Referer': 'http://fanyi.youdao.com/?keyfrom=dict2.top',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    req = request.Request(url, data=data, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)


if __name__ == '__main__':
    youdao('boy')
……