文章目录
urlopen函数
python3中,所有和网络请求相关的方法,都被集中到urllib.request模块下面。
# Fixed from the original snippet: "form ... import requeste" was a
# SyntaxError (should be "from urllib import request"), and "rest.read()"
# referenced an undefined name (the response variable is "resp").
from urllib import request

# urlopen sends a GET request and returns an http.client.HTTPResponse,
# a file-like object whose read() yields the raw response bytes.
resp = request.urlopen("http://www.baidu.com")
print(resp.read())
- url:请求的url
- data:请求的data,如果设置了这个值,将变成post请求
- 返回值:返回值是一个http.client.HTTPResponse对象,它是一个类文件句柄对象,有read(size)、readline、readlines以及getcode等方法。
urlretrieve函数
将网页上的一个文件保存到本地
from urllib import request

# urlretrieve downloads the resource at a URL directly into a local file:
# first the Baidu homepage as HTML, then an image from duitang.
page_url = "http://www.baidu.com"
image_url = ("http://b-ssl.duitang.com/uploads/item"
             "/201410/04/20141004203543_jCvhC.png")

request.urlretrieve(page_url, "baidu.html")
request.urlretrieve(image_url, "shaosiming.png")
urlencode函数
将字典形式的参数编码成 url 查询字符串(对中文等特殊字符做百分号编码)。编码、解码相关的函数都在 parse 模块中。
from urllib import parse

# parse.urlencode turns a dict into a percent-encoded query string;
# non-ASCII values (like the Chinese name) come out as %XX escapes.
person = {"name": "张三", "age": 18, "gender": "Women"}
result = parse.urlencode(person)
print(result)
name=%E5%BC%A0%E4%B8%89&age=18&gender=Women
from urllib import parse
from urllib import request

# Build https://www.baidu.com/s?wd=少司命 — the Chinese keyword cannot go
# into the URL as-is, so urlencode it into the query string first.
base = "http://www.baidu.com/s"
query = parse.urlencode({"wd": "少司命"})
url = base + "?" + query
# request.urlretrieve(url, "shaosimin.html")
resp = request.urlopen(url)
print(resp.read())
parse_qs函数
解码
# Round-trip demo: encode a dict to a query string, then decode it back.
# Note that parse_qs wraps every value in a list and numbers come back
# as strings.
payload = {"name": "许许", "age": 24}
qs = parse.urlencode(payload)
print(qs)
result = parse.parse_qs(qs)
print(result)
urlparse和urlsplit
对url的各个组成部分进行分割。
from urllib import parse

# Both urlparse and urlsplit break a URL into named components
# (scheme, netloc, path, query, fragment, ...); urlparse additionally
# reports a params field.
url = "http://www.baidu.com/s?python&username=xuxu#1"
result1 = parse.urlparse(url)
result2 = parse.urlsplit(url)
print(result1)
print("scheme:", result1.scheme)
print(result2)
print("path:", result2.path)
urlsplit 的返回结果没有 params 属性,其余组成部分与 urlparse 相同。
request.Request类
在请求的时候增加一些请求头
from urllib import request

# request.Request lets us attach custom request headers (here a browser
# User-Agent, so the site does not reject us as a script) before opening
# the URL with urlopen.
url = "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput="
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (K"
                   "HTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36")
}
req = request.Request(url, headers=headers)
resp = request.urlopen(req)
print(resp.read())
python字符串默认是unicode字符,用encode("utf-8")方法编码成bytes,解码回str用decode("utf-8")方法。
from urllib import request
from urllib import parse

# POST to Lagou's AJAX position-search endpoint with browser-like headers
# and session cookies copied from a logged-in browser.
url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    # The original snippet also sent "Accept-Encoding: gzip, deflate, br",
    # but urllib does not decompress responses, so resp.read() would have
    # returned raw gzip bytes. Omitting it asks for an uncompressed reply.
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    # Content-Length is left out on purpose: urllib computes it from the
    # actual body. The original hard-coded 26 (the real body is 29 bytes),
    # which would truncate the POST data.
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "user_trace_token=20200316223918-b61b8c2e-dadd-4e75-a585-60fe642815d1; LGUID=20200316223918-e838ab5b-9be6-4a8b-9e17-f0f17dc8a6b8; _ga=GA1.2.276883624.1584369554; _gid=GA1.2.743011689.1584369555; index_location_city=%E5%85%A8%E5%9B%BD; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22170e3c997376ee-078369d031d861-4313f6a-921600-170e3c997383f2%22%2C%22%24device_id%22%3A%22170e3c997376ee-078369d031d861-4313f6a-921600-170e3c997383f2%22%7D; JSESSIONID=ABAAABAABGGAAFD6C2CEA10DB2CD1F8E40B00FF03C1342F; WEBTJ-ID=20200317211338-170e8a0d1981d3-0bebcd448b5889-4313f6a-921600-170e8a0d19975c; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1584369554,1584450819; lagou_utm_source=C; X_MIDDLE_TOKEN=b360671ea2c2703938e52507628de5ff; TG-TRACK-CODE=search_code; LGSID=20200317222217-cb504d37-95f8-4d38-b280-3cd717ae9492; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fc%252B%252B%3FlabelWords%3Dsug%26fromSearch%3Dtrue%26suginput%3Dc; _gat=1; X_HTTP_TOKEN=868f9b3405facccb17155448517540aefdac905056; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1584455164; LGRID=20200317222611-9e7e9d19-16ff-4d53-aae2-d03f92359f4d; SEARCH_ID=e437a05818cd45d8b810296200c30acc",
    "Host": "www.lagou.com",
    "Origin": "https://www.lagou.com",
    "Referer": "https://www.lagou.com/jobs/list_c%2B%2B?labelWords=sug&fromSearch=true&suginput=c",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
    # Header values must be strings: the original used the int 0 and the
    # object None — None raises TypeError inside http.client when the
    # request is actually sent. The browser literally sends "None" here.
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest"
}
data = {
    "first": "true",
    "pn": 1,
    # urlencode percent-encodes the value itself, so pass the plain keyword.
    # The original's pre-encoded "c%2B%2B" was double-encoded to "c%252B%252B".
    "kd": "c++"
}
req = request.Request(url, headers=headers,
                      data=parse.urlencode(data).encode("utf-8"),
                      method="POST")
resp = request.urlopen(req)
print(resp.read())
ProxyHandler实现代理ip
- 使用ProxyHandler,传入代理构建一个handler
- 使用上面创建的handler构建一个opener
- 使用opener去发送一个请求
from urllib import request

# Route the request through an HTTP proxy instead of connecting directly:
#   1. build a ProxyHandler with the proxy address,
#   2. build an opener from that handler,
#   3. open the URL with the opener (not request.urlopen).
# httpbin.org/ip echoes the caller's IP, so it shows the proxy working.
url = "http://httpbin.org/ip"
handler = request.ProxyHandler({"http": "183.166.102.237:9999"})
opener = request.build_opener(handler)
resp = opener.open(url)
print(resp.read())
cookie
Set-Cookie: _m7e_session_core=73640e733e4c7a044a09e0ac723c9f8c; domain=.jianshu.com; path=/; expires=Wed, 18 Mar 2020 19:34:20 -0000; secure; HttpOnly
from urllib import request

# Dapeng's profile page: http://www.renren.com/880151247/profile
# Renren login url: http://renren.com/
dapeng_url = "http://www.renren.com/880151247/profile"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
}
req = request.Request(url=dapeng_url, headers=headers)
resp = request.urlopen(req)
# write() needs str, and resp.read() returns bytes:
#   bytes -> decode -> str
#   str  -> encode -> bytes
# encoding="utf-8" is passed explicitly (matching the other snippet below):
# without it, a platform default such as GBK on Chinese Windows cannot
# encode every character of the page and write() raises UnicodeEncodeError.
with open("renren.html", "w", encoding="utf-8") as fp:
    fp.write(resp.read().decode("utf-8"))
将Cookie信息放在请求头中
from urllib import request

# Dapeng's profile page: http://www.renren.com/880151247/profile
# Renren login url: http://renren.com/
# Same request as the previous snippet, but here the login session cookie
# is sent manually via the "Cookie" request header, so urlopen can fetch
# a page that normally requires being logged in.
dapeng_url = "http://www.renren.com/880151247/profile"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleW"
"ebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
# Session cookie copied from a logged-in browser; the adjacent string
# literals below are implicitly concatenated into one header value.
"Cookie": "anonymid=k7xcv8micmqvdd; depovince=GW; _r01_=1; JSESSIONID=abcIY6xmxGVSnf"
"UV1ZTdx; ick_login=e5583469-0480-4789-a43f-17f2957c192c; taihe_bi_sdk_uid="
"b772231939df21ddc6a482d477e89995; taihe_bi_sdk_session=b22040fa38f07a1d62871"
"854fbce174f; _de=DEA807B1AA8865CCECF6A8284EAE1BA26DEBB8C2103DE356; id=974008"
"774; ver=7.0; jebe_key=3ab7af8b-a867-493e-a137-6284f4fc2f68%7Cee315652f2565be"
"baf9cc6ef50905b96%7C1584539247483%7C1%7C1584539237761; jebe_key=3ab7af8b-a867-"
"493e-a137-6284f4fc2f68%7Cee315652f2565bebaf9cc6ef50905b96%7C1584539247483%7C1"
"%7C1584539237763; wp=0; ick=f29e2409-7ec9-472d-ac5c-74fa844ddee5; p=9cd0546c63"
"d35de4675d7a176752a8e04; first_login_flag=1; ln_uact=1366409222@qq.com; ln_hurl"
"=http://hdn.xnimg.cn/photos/hdn121/20200315/1430/h_main_AlBB_0c1a00033b6c195a.jpg"
"; t=1e9a86e70907eebb80c83b2d938a2a5c4; societyguester=1e9a86e70907eebb80c83b2d93"
"8a2a5c4; xnsid=1212fbe6; loginfrom=null; wp_fold=0; jebecookies=52a8880d-b3f6-46"
"0f-99d1-c3d3401eb6bb|||||; XNESSESSIONID=1bda876d9609"
}
req = request.Request(url=dapeng_url, headers=headers)
resp = request.urlopen(req)
# resp.read() is bytes; decode to str before writing the text file.
with open("dapengrere.html", "w", encoding="utf-8") as fp:
    fp.write(resp.read().decode("utf-8"))