# urllib: the stdlib web-scraping toolkit
import urllib.parse  # URL parsing/encoding helpers

# Percent-encoding non-ASCII (e.g. Chinese) text for use in URLs
res = urllib.parse.quote('你好')  # encode into the %XX hex form a URL can carry
res = urllib.parse.urljoin('https://www.xxxx.com/?a=', res)  # join onto a base URL; note: the base URL's query string is dropped
res = urllib.parse.urlencode({'wd': '你好'})  # encode a dict into a percent-encoded query string
res = urllib.parse.unquote('%E4%BD%A0%E5%A5%BD')  # decode %XX escapes back to text ('你好')
res = urllib.parse.parse_qs('wd=hello&wd=heeeee')  # parse a query string into a dict whose values are lists
res = urllib.parse.urlparse(url='https://www.baidu.com/?wd=hello')
# ParseResult(scheme='https', netloc='www.baidu.com', path='/', params='', query='wd=hello', fragment='')
# splits a URL into its named components
import urllib.request  # HTTP request helpers

# NOTE(review): assumes `url` was defined earlier — confirm before running
response = urllib.request.urlopen(url, timeout=0.01)  # fetch the URL; raises if the server doesn't answer within the timeout
response.read().decode()  # read the raw bytes and decode them to the HTML text
response.info()  # response headers as an email.message.Message object
response.getheaders()  # headers as a list of (name, value) tuples
response.geturl()  # the URL that was actually fetched
from urllib.request import Request  # Request object: lets us attach custom headers

headers = {
    'User-Agent': '',  # fill in a real browser UA string (browser name/version etc.) to masquerade as a browser
    'Accept-Encoding': '',  # empty value: ask the server not to compress the response
}
req = Request(url=url, headers=headers)  # wrap the URL together with the forged request headers
response = urllib.request.urlopen(req)  # send the request carrying those headers
url = "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%88%B1%E6%83%85"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# POST and GET are not strictly separated; parameters for a GET endpoint can also be sent via POST
formdata = {
    "start": "0",
    "limit": '2'
}
# urlencode the dict into query-string form, then encode to bytes:
# urlopen's data= parameter requires bytes (equivalently: bytes(s, encoding='utf-8'))
data = urllib.parse.urlencode(formdata).encode('utf-8')
request = urllib.request.Request(url, data=data, headers=headers)
# BUG FIX: was urlopen(req) — that sent the stale Request from the earlier
# example instead of the POST request just built above
response = urllib.request.urlopen(request)
print(response.read().decode())
# Visiting a site while preserving its cookies
from urllib.request import HTTPCookieProcessor
from http.cookiejar import CookieJar
from urllib.request import build_opener

# Build a cookie handler first, then use build_opener to open URLs through it
cookies = CookieJar()
handler = HTTPCookieProcessor(cookiejar=cookies)
opener = build_opener(handler)
url = ''
response = opener.open(url)
# the jar starts empty; after the visit it holds whatever cookies the server set
for cookie in cookies:
    print(cookie)
# Requesting through a proxy
from urllib.request import ProxyHandler, build_opener

proxies = {
    'http': '125.123.127.253:9999',
    'https': '60.167.159.236:808'
}
handler = ProxyHandler(proxies=proxies)  # routes each scheme through its proxy
opener = build_opener(handler)
resp = opener.open('https://www.baidu.com')
import urllib.error  # exception types raised by urllib

try:
    urllib.request.urlopen(url)
except urllib.error.HTTPError as e:  # HTTP status errors such as 404; caught first since it subclasses URLError
    print(e.reason)
except urllib.error.URLError as e:  # parent of HTTPError; covers network-level failures
    print('URLError')
import ssl

# Globally skip TLS certificate verification (scraping/dev only — insecure).
# BUG FIX: assign the factory *function* itself, without calling it; the
# original called it, installing a single SSLContext object where urllib
# expects a callable factory, which breaks subsequent HTTPS requests.
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.robotparser  # parses robots.txt — the "gentleman's agreement" defining what a crawler may fetch