spider (small web-crawler demos)
Please do not crawl the same site heavily or for long stretches; doing so puts enormous load on the target site's servers.
-
Requests (recommended for small-scale crawling; a requests version is sketched after the urllib example below)
- Basic usage
```python
''' Fetch a page with the urllib.request module '''
from urllib import request

url = "https://www.douban.com"
req = request.urlopen(url)   # open the URL and get a response object with the page source
html = req.read().decode()   # read() returns bytes; decode() defaults to UTF-8
print(html)
```
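If the heading refers to the third-party requests library, here is a minimal sketch of the same fetch with it (assumes `pip install requests`; the User-Agent value is just an example):

```python
import requests   # third-party library: pip install requests

url = "https://www.douban.com"
header = {"User-Agent": "Mozilla/5.0"}    # many sites reject the default python-requests User-Agent
resp = requests.get(url, headers=header)  # one call sends the request and reads the response
print(resp.text)                          # the decoded page source
```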
-
data (passing parameters with a request)
- get
```python
''' Simulate a search with the GET method '''
from urllib import request, parse

url = "https://www.douban.com/search?"
q = input("Enter what you want to search for: ")
kv = {'q': q}
kv = parse.urlencode(kv)   # URL-encode the query parameters
url = url + kv             # append them to the query string
req = request.urlopen(url)
html = req.read().decode()
print(html)
```
- post
```python
''' Simulate a search with the POST method '''
from urllib import request, parse

url = "https://www.douban.com/search"
q = input("Enter what you want to search for: ")
kv = {'q': q}
kv = parse.urlencode(kv).encode()    # data must be bytes, so encode it
req = request.urlopen(url, data=kv)  # passing data makes urlopen send a POST request
html = req.read().decode()
print(html)
```
-
Exception handling
- Causes of URLError
- No network connection
- The target server does not exist
- The connection to the server failed
- An HTTPError was raised
- HTTPError: a subclass of URLError that mainly corresponds to HTTP status-code errors in the response; catch it before URLError (see the sketch after this list)
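A minimal sketch of the catch order: because HTTPError is a subclass of URLError, it must come first or the URLError branch would swallow it (the 404 path below is hypothetical):

```python
from urllib import request, error

try:
    rsp = request.urlopen("https://www.douban.com/no-such-page")  # hypothetical 404 URL
    print(rsp.read().decode())
except error.HTTPError as e:   # subclass first: HTTP status-code errors (403, 404, 500, ...)
    print("HTTPError:", e.code, e.reason)
except error.URLError as e:    # no network, unknown host, connection failure, ...
    print("URLError:", e.reason)
```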
-
User-Agent: the server uses the user agent to judge who is visiting; it is sent as part of the request headers.
```python
''' Fetch a page while masquerading as a Windows browser user '''
from urllib import request

url = "https://www.douban.com"
header = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
req = request.Request(url, headers=header)  # attach the headers via a Request object
resp = request.urlopen(req).read().decode()
print(resp)
-
```

Proxy servers: use proxy IPs to keep your own IP from being banned and locked out of the site. When a single IP sends a large number of requests to one site, that IP may be blocked. Sites that list free proxies include www.xicidaili.com, www.data5u.com, www.goubanjia.com, etc.
```python
from urllib import request, error
import random

url = "https://www.douban.com"
header = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# it's best to keep a generous pool of proxies (real entries normally need a port, e.g. "ip:port")
pro_list = [
    {"http": '1.20.100.134'},
    {"http": '27.191.234.69'},
    {"http": '183.88.140.212'},
    {"http": '125.39.9.34'},
    {"http": '110.164.197.99'},
]
proxy = random.choice(pro_list)              # pick one proxy at random
proxy_handler = request.ProxyHandler(proxy)  # handler that routes requests through the proxy
opener = request.build_opener(proxy_handler)
request.install_opener(opener)               # all subsequent urlopen calls now use the proxy

try:
    req = request.Request(url, headers=header)
    rsp = request.urlopen(req).read().decode()
    print(rsp)
except error.HTTPError as e:   # subclass first
    print(e)
except error.URLError as e:
    print(e)
except Exception as e:
    print(e)
```
-
Using cookies in place of repeated logins
```python
from urllib import request, parse
from http import cookiejar

cookie = cookiejar.CookieJar()                         # create a CookieJar instance
cookie_handler = request.HTTPCookieProcessor(cookie)   # handler that manages the cookies
http_handler = request.HTTPHandler()                   # HTTP handler
https_handler = request.HTTPSHandler()                 # HTTPS handler
opener = request.build_opener(http_handler, https_handler, cookie_handler)  # build the opener

def login():
    ''' Log in once so the CookieJar records the session cookie '''
    url = "https://accounts.douban.com/j/mobile/login/basic"
    data = {
        "name": "username",
        "password": "password"
    }
    data = parse.urlencode(data).encode()
    req = request.Request(url, data=data)
    rsp = opener.open(req)   # the response sets the cookie, captured by the CookieJar

def getPage():
    ''' After login() runs, the opener automatically sends the recorded cookie '''
    url = "https://www.douban.com"
    rsp = opener.open(url)
    html = rsp.read().decode()
    print(html)

if __name__ == '__main__':
    login()
    getPage()
```
How to find the URL and form fields the cookie login posts to: open the login page of the site you want to crawl, submit any username and password, and look for them in the captured request and response data (e.g. in the browser's developer tools).
-
JS encryption: some sites transform the data they submit with JavaScript before sending it (most often an MD5 hash), so to submit valid data we have to reproduce the signing step ourselves.
```python
''' Crawl the Youdao translation API '''
from urllib import request, parse
import time, random, hashlib, json

def getMD5(sign):
    md5 = hashlib.md5()
    md5.update(sign.encode("utf-8"))
    return md5.hexdigest()

# the salt/sign formula was worked out by reading Youdao's JS file
def getSalt(i):
    ts = str(int(time.time() * 1000))
    salt = ts + str(int(10 * random.random()))
    sign = "fanyideskweb" + i + salt + "1L5ja}w$puC.v_Kz3@yYn"
    return salt, sign, ts

# url, data and header were captured from the browser, as in the earlier examples
def spider(i, salt, sign, ts):
    url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
    data = {
        "i": i,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": salt,
        "sign": sign,
        "ts": ts,
        "bv": "d675629694aa9348a06d778e004b8221",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTlME",
        "typoResult": "false"
    }
    data = parse.urlencode(data).encode()
    header = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        # "Accept-Encoding": "gzip, deflate",  # would make the server return compressed data, so leave it out
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Length": len(data),
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-487474219@10.169.0.84; OUTFOX_SEARCH_USER_ID_NCOO=465299883.3284524; JSESSIONID=aaaCYpy26Ao8dhNGKkEMw; ___rl__test__cookies=1553136805657",
        "Host": "fanyi.youdao.com",
        "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    req = request.Request(url, data=data, headers=header)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    return html

if __name__ == '__main__':
    i = input("Enter the text to translate: ")
    salt, sign, ts = getSalt(i)
    sign = getMD5(sign)
    answer = json.loads(spider(i, salt, sign, ts))  # parse the JSON response instead of eval()
    for entry in answer['smartResult']['entries']:
        print(entry)
```
-
Extracting data
- BeautifulSoup
```python
from urllib import request
from bs4 import BeautifulSoup

url = "https://www.douban.com"
req = request.urlopen(url)
html = req.read().decode()
soup = BeautifulSoup(html, 'lxml')   # parse the HTML with the lxml parser
print(soup.title)                    # the first <title> tag
print(soup.title.string)             # the text inside the first <title> tag
print(soup.a.attrs['class'])         # the class attribute of the first <a> tag
print(soup.find_all(name='a'))       # every tag whose name is 'a'
print(soup.select('a'))              # every <a> tag, via a CSS selector
```
- XPath: in Scrapy you usually write `response.xpath("//tag[@attr='xxx']/child").extract()`. `/` steps down one level, `@` selects an attribute, `text()` selects the text inside a tag, and `extract()` unwraps the selectors into a plain list of strings (see the sketch below).
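Outside Scrapy, the same XPath expressions can be tried with the lxml library; a minimal sketch (assumes `pip install lxml`; the HTML snippet is made up for illustration):

```python
''' Try XPath expressions on a small hand-written HTML snippet '''
from lxml import etree

html = '''
<div class="list">
  <a class="item" href="/a">first</a>
  <a class="item" href="/b">second</a>
</div>
'''
tree = etree.HTML(html)                           # parse the HTML into an element tree
print(tree.xpath("//a[@class='item']/@href"))     # attribute values -> ['/a', '/b']
print(tree.xpath("//a[@class='item']/text()"))    # tag text -> ['first', 'second']
```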