urllib.request 模块
处理:基本和摘要式身份验证,重定向,cookies等等。
1.导入:
import urllib.request
2.urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
打开网址URL,url参数可以是一个字符串或一个 Request 对象;返回一个类文件的响应对象(response)。
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com") # returns a response object
- data(为字节类型) 有值时,HTTP请求将会执行POST请求而不是GET请求。
- context 如被指定,它必须是一个 ssl.SSLContext实例描述各种SSL选项。点击HTTPSConnection查看更多细节。
- 可选cafile和capath参数指定一组被HTTPS请求信任的CA证书。cafile应该指向一个文件包含CA证书的包,而capath应该指向一个散列的证书文件的目录。点击ssl.SSLContext.load_verify_locations()查看更多的信息。
- cadefault参数被忽略。
# response.geturl(): the URL this response was actually fetched from
print(response.geturl()) # http://www.baidu.com
# response.info(): the page's meta-information (response headers)
print(response.info()) # Bdpagetype: 1 Bdqid: 0xfb1ff00500002b3b Cache-Control: private Content-Type: text/html etc.
# response.getcode(): the numeric HTTP status code of the response
print(response.getcode()) # 200
# response.read(): read the HTML body; returns bytes
print(response.read()) # body content as bytes
# response.getheaders(): the response headers
print(response.getheaders()) # returns a list whose elements are tuples
# response.readlines(): read line by line; returns a list of bytes lines
print(response.readlines())
3.urllib.request.urlretrieve(url, filename=None) 下载url指向的资源并保存为本地文件;参数为文件链接和保存的文件名(可选,省略时保存到临时文件)。
response = urllib.request.urlretrieve("http://www.baidu.com") # download the URL to a local (temp) file
print(response)
# returns a tuple of (local file path, HTTP message object), e.g. ('C:\\Users\\XIN\\AppData\\Local\\Temp\\tmphnkdp400', <http.client.HTTPMessage object at 0x0000021C735C4FD0>)
4.urllib.request.Request(url, data=None, headers={}, origin_req_host=None,unverifiable=False, method=None) 返回一个request对象
response = urllib.request.Request("http://www.baidu.com") # builds a Request object (not sent yet)
print(response) # <urllib.request.Request object at 0x00000245ED1ECA58>
例子:
get请求:
构建请求头信息(反爬第一步)
构建请求对象:urllib.request.Request(url,headers = headers)
import urllib.request
import urllib.parse

# Target page plus a browser-like User-Agent header (anti-scraping step 1).
target_url = "https://www.tianyancha.com/company/2343981465"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build a Request carrying the headers, send it, and print the decoded body.
req = urllib.request.Request(target_url, headers=ua_headers)
resp = urllib.request.urlopen(req)
print(resp.read().decode())
post请求:
注意:表单数据需要处理 form_data = urllib.parse.urlencode(form_data).encode()
import urllib.request as request
import urllib.parse
import json

# Ask the user for a word and POST it to Baidu's suggestion endpoint.
word = input("请输入你要翻译的单词:")
api_url = "https://fanyi.baidu.com/sug"

ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build the request object with the anti-scraping headers.
req = request.Request(api_url, headers=ua_headers)
# POST form data must be urlencoded and then encoded to bytes.
payload = urllib.parse.urlencode({"kw": word}).encode()
# Passing data= makes urlopen issue a POST instead of a GET.
resp = request.urlopen(req, data=payload)
# The endpoint answers JSON; print the first suggestion's translation.
result = json.loads(resp.read().decode())
print(result["data"][0]["v"])
附录:
# POST 请求
# POST request example: KFC store-list endpoint.
import urllib.request
import urllib.parse

api_url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
form_fields = {
    'cname': '北京',
    'pid': '',
    'pageIndex': '2',
    'pageSize': '10',
}
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build the request first; the urlencoded body is attached at send time,
# which is what makes urlopen issue a POST.
req = urllib.request.Request(api_url, headers=ua_headers)
body = urllib.parse.urlencode(form_fields).encode()
resp = urllib.request.urlopen(req, data=body)
print(resp.read().decode())
urllib.request模块_复杂的get(拼接查询参数)
# -*- coding:utf-8 -*-
# GET request with user-supplied query parameters (Baidu Tieba search).
import urllib.request
import urllib.parse

url = "http://tieba.baidu.com/f?ie=utf-8"
data = {
    "kw": input("请输入吧名"),
    # Tieba pages by 50 posts: page N starts at pn = (N - 1) * 50.
    "pn": str((int(input("请输入页码")) - 1) * 50),
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
# BUG FIX: the original did `url += fromdata`, gluing "kw=..." directly onto
# "...?ie=utf-8" with no separator; a "&" is required between parameters.
url += "&" + urllib.parse.urlencode(data)
request = urllib.request.Request(url, headers=headers)
# BUG FIX: the original also passed data= to urlopen, which silently turned
# this "complex GET" example into a POST; a GET sends no request body.
response = urllib.request.urlopen(request)
print(response.read().decode())
urllib.request模块_cookie模拟登陆
# -*- coding:utf-8 -*-
# Simulated login that keeps the session cookie for a follow-up request.
import urllib.request
import urllib.parse
import http.cookiejar  # used to store cookies between requests

# Create a CookieJar object that will hold the session cookies.
cj = http.cookiejar.CookieJar()
# Create a handler that reads/writes cookies from/to cj on every request.
handler = urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler; use opener.open() instead of urlopen().
opener = urllib.request.build_opener(handler)

login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019122253442"
from_data = {
    'email': '账号',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '密码',
    'rkey': 'd45f5d17e437f1e50eace68adaa5ed89',
    'f': 'http%3A%2F%2Fwww.renren.com%2F969798592',
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
# POST form data must be urlencoded and then encoded to bytes.
from_data = urllib.parse.urlencode(from_data).encode()
# FIX: the original built the login Request without headers, so the login
# POST went out with the default Python-urllib User-Agent while only the
# follow-up GET used the browser UA; send the same headers on both.
request = urllib.request.Request(url=login_url, headers=headers)
response = opener.open(request, data=from_data)  # the cookies are saved into cj
print(response.read().decode())

# Second request: cookies stored in cj are attached automatically by the opener.
get_url = "http://www.renren.com/969798592/profile"
request = urllib.request.Request(url=get_url, headers=headers)
response = opener.open(request)
print(response.read().decode())
urllib.request模块_使用handler/opener发送请求
# -*- coding:utf-8 -*-
# Sending a request through an explicit HTTPHandler + opener.
import urllib.request
import urllib.parse

target = "http://www.baidu.com/"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# An HTTPHandler wrapped by build_opener gives an opener object; from then on
# every request (GET and POST alike) goes through opener.open(), not urlopen().
http_handler = urllib.request.HTTPHandler()
custom_opener = urllib.request.build_opener(http_handler)

req = urllib.request.Request(target, headers=ua_headers)
resp = custom_opener.open(req)
print(resp.read().decode())
handler_代码配置代理,验证
# Routing a request through an HTTP proxy via ProxyHandler.
import urllib.request
import urllib.parse

url = "http://www.baidu.com/s?wd=ip"
# ProxyHandler takes a dict of {"scheme": "ip:port"}.
# BUG FIX: the original used the key "http:" (with a trailing colon); the
# scheme key must be exactly "http", otherwise the proxy is never applied
# and the request goes out directly.
handler = urllib.request.ProxyHandler({"http": "113.79.75.104:9797"})
# Build an opener from the proxy handler.
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
request = urllib.request.Request(url, headers=headers)
response = opener.open(request)
print(response.read().decode())
urllib.parse 模块 (可以用于url 中文转换)
# urllib.parse utilities: percent-encode/decode URLs and build query strings.
# FIX: import moved above first use (the original called urllib.parse on the
# line before its own `import urllib.parse`).
import urllib.parse

# urllib.parse.quote(url): percent-encode the non-ASCII (e.g. Chinese) parts of a URL.
response = urllib.parse.quote("http://www.mmthat.com/index.html?name=测试者")
print(response)  # http%3A//www.mmthat.com/index.html%3Fname%3D%E6%B5%8B%E8%AF%95%E8%80%85
# urllib.parse.unquote(url): decode a percent-encoded URL.
# BUG FIX: the original called urllib.parse.upquote, which does not exist
# (AttributeError); the correct name is unquote.
response = urllib.parse.unquote("http%3A//www.mmthat.com/index.html%3Fname%3D%E6%B5%8B%E8%AF%95%E8%80%85")
print(response)  # http://www.mmthat.com/index.html?name=测试者
# urllib.parse.urlencode(dict): turn a dict into "k1=v1&k2=v2" form,
# ready to append to a URL as a query string.
response = urllib.parse.urlencode({"key1": "value1", "key2": "value2"})
print(response)  # key1=value1&key2=value2