urllib详解
1. urlopen函数—打开一个网页
from urllib import request

# urlopen: fetch a URL and return a file-like response object.
# Read only the first 2000 bytes of the body and decode them as UTF-8.
response = request.urlopen('http://www.baidu.com')
first_chunk = response.read(2000)
print(first_chunk.decode("utf-8"))
2. urlretrieve函数—保存网页文件
# urlretrieve: download a URL straight into a local file.
target_file = 'baidu.html'
request.urlretrieve('http://www.baidu.com', target_file)
# The same call works for binary resources (images, etc.), e.g.:
#request.urlretrieve('https://ss3.bdstatic.com/70cFv8Sh_Q1YnxGkpoWK1HF6hhy/it/u=3254061209,1473183314&fm=11&gp=0.jpg','biaoqing.jpg')
3. 参数编码与解码
#urlencode函数---可以把字典转换成url编码的数据(编码)
from urllib import parse

# urlencode: percent-encode a dict into an application/x-www-form-urlencoded
# query string (non-ASCII text is UTF-8 encoded, spaces become '+').
params = dict(name='张三', age=18, grat='hello world')
result = parse.urlencode(params)
print(result)
# Build a search URL by appending an encoded query string, then fetch it.
url = 'http://www.baidu.com/s'
qs = parse.urlencode({"wd":"刘德华"})
url = url + "?" + qs
r = request.urlopen(url)
print(r.read().decode("utf-8"))
#parse_qs函数 (urlencode函数的反向操作,解码)
# parse_qs is the inverse of urlencode: it decodes a query string back
# into a dict, with every value wrapped in a list of strings.
params = dict(name='张三', age=18, grat='hello world')
qs = parse.urlencode(params)
result = parse.parse_qs(qs)
print(qs)
print(result)
4.urlparse函数
# urlparse splits a URL into six named components:
# scheme, netloc, path, params, query, fragment.
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlparse(url)
print(result)
for label, value in [
    ('scheme:', result.scheme),
    ('netloc:', result.netloc),
    ('path', result.path),
    ('params', result.params),
    ('query', result.query),
    ('fragment', result.fragment),
]:
    print(label, value)
5.urlsplit函数(同urlparse)
# urlsplit is like urlparse but returns five components — it does not
# separate the rarely-used `params` part (hence no result.params).
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlsplit(url)
print(result)
for label, value in [
    ('scheme:', result.scheme),
    ('netloc:', result.netloc),
    ('path', result.path),
    # SplitResult has no .params attribute
    ('query', result.query),
    ('fragment', result.fragment),
]:
    print(label, value)
6.request.Request类 ----增加请求头
from urllib import request

# Fetch lagou without custom headers first: with urllib's default
# User-Agent the site serves its anti-crawler response.
url = 'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py'
r = request.urlopen(url)
# NOTE(review): this prints the bound method object, not the body —
# the original lesson is missing the call parentheses (r.read()).
print(r.read)
headers = {
'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# request.Request lets us attach custom headers so the request
# looks like it comes from a real browser.
req = request.Request(url, headers=headers)
resp = request.urlopen(req)
print(resp.read(3000).decode('utf-8'))
实例: 爬取拉勾网
from urllib import request, parse

# Example: POST to lagou's Ajax search endpoint. The Referer header is
# required, or the site rejects the request as a crawler.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
'User-Agent':' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=py',
}
# Form fields for the first page of python results.
data = {
'first': 'true',
'pn': 1,
'kd': 'python'
}
# POST bodies must be bytes: urlencode the dict, then UTF-8 encode it.
body = parse.urlencode(data).encode('utf-8')
req = request.Request(url, headers=headers, data=body, method='POST')
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
作业:爬取庆余年的短评
from urllib import request

# Exercise: fetch the Douban short-comments page for a movie.
# A browser User-Agent is set because Douban rejects urllib's default UA.
url = 'https://movie.douban.com/subject/25853071/comments?status=P'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
page_req = request.Request(url, headers=headers)
page = request.urlopen(page_req)
print(page.read().decode('utf-8'))
7.ProxyHandler处理器(IP设置)
西刺免费代理IP:http://www.xicidaili.com/
快代理:http://www.kuaidaili.com/
代理云:http://www.dailiyun.com/
from urllib import request

# Without a proxy: httpbin echoes back the caller's own IP.
url = 'http://httpbin.org/ip'
r = request.urlopen(url)
print(r.read())

# With a proxy: httpbin should now report the proxy's IP instead.
# NOTE(review): free proxies like this one die quickly, and the entry
# has no port — confirm the "host:port" form against a live proxy.
url = 'http://httpbin.org/ip'
# 1. Build a ProxyHandler that routes http traffic through the proxy.
handler = request.ProxyHandler({"http":"163.204.244.84"})
# 2. Build an opener from that handler.
opener = request.build_opener(handler)
# 3. Send the request through the opener (not request.urlopen).
r = opener.open(url)
print(r.read())
8.cookie存储用户ID
#cookie的格式:
Set-Cookie: NAME=VALUE; Expires/Max-age=DATE; Path=PATH; Domain=DOMAIN_NAME; SECURE
NAME:cookie的名字
VALUE:cookie的值
Expires:cookie的过期时间
Path:cookie作用的路径
Domain:cookie作用的域名
#使用cookie模拟登录
from urllib import request

# 1. Request the zhihu follow page WITHOUT a login cookie: the server
#    returns the logged-out version of the page.
zhihu_url = "https://www.zhihu.com/follow"
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
req = request.Request(url=zhihu_url, headers=headers)
resp = request.urlopen(req)
#print(resp.read().decode('utf-8'))
# write() needs a str; resp.read() returns bytes, so decode first.
# BUG FIX: zhihu serves UTF-8, so decode as utf-8 (the original used
# gbk, which garbles the page) and open the file with an explicit
# encoding — consistent with the zhihu0.html block below.
with open('zhihu.html', 'w', encoding='utf-8') as fp:
    fp.write(resp.read().decode('utf-8', 'ignore'))
实例:爬取知乎
#http.cookiejar模块
from urllib import request,parse
from http.cookiejar import CookieJar
# Browser-like request headers shared by the login and page-fetch
# helpers below; zhihu rejects urllib's default User-Agent.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
def get_opener():
    """Build a urllib opener that keeps cookies across requests.

    Returns an OpenerDirector whose HTTPCookieProcessor stores every
    cookie the server sets in an in-memory CookieJar, so a login cookie
    from one request is sent automatically on the next.
    (Fix: the pasted original had lost its function-body indentation.)
    """
    # 1.1 In-memory cookie store.
    cookiejar = CookieJar()
    # 1.2 Handler that reads/writes that jar on every request.
    handler = request.HTTPCookieProcessor(cookiejar)
    # 1.3 Opener that routes all requests through the handler.
    opener = request.build_opener(handler)
    return opener
def login_zhihu(opener):
    """POST the login form so the opener's cookie jar receives session cookies.

    Uses the module-level `headers` dict. NOTE(review): zhihu's real login
    flow is JavaScript-driven; this plain POST to the signin URL probably
    does not authenticate — kept as in the lesson.
    (Fix: the pasted original had lost its function-body indentation.)
    """
    # 1.4 Send the login request through the cookie-aware opener.
    data = {
        "username":"19912456595",
        "password":"pythoncookie0"
    }
    login_url = "https://www.zhihu.com/signin?next=%2Fsettings%2Faccount"
    req = request.Request(login_url, data=parse.urlencode(data).encode('utf-8'), headers=headers)
    opener.open(req)
def visit_zhihu(opener):
    """Fetch the follow page with the cookie-carrying opener and save it.

    Writes the decoded page to zhihu0.html in UTF-8.
    (Fix: the pasted original had lost its function-body indentation.)
    """
    # 2. Visit zhihu with the cookies acquired at login time.
    zhihu_url = "https://www.zhihu.com/follow"
    resp = opener.open(zhihu_url)
    with open('zhihu0.html', 'w', encoding='utf-8') as fp:
        fp.write(resp.read().decode("utf-8"))
if __name__ == '__main__':
    # Log in first so the follow page is fetched with session cookies.
    # (Fix: the pasted original had lost the guard-body indentation.)
    opener = get_opener()
    login_zhihu(opener)
    visit_zhihu(opener)
9.cookie信息的加载与保存
#MozillaCookieJar
from urllib import request
from http.cookiejar import MozillaCookieJar

# MozillaCookieJar persists cookies to disk in Netscape cookies.txt format.
# NOTE(review): load() raises FileNotFoundError unless 'cookie.txt' was
# written by a previous run — uncomment the save() line below first.
cookiejar = MozillaCookieJar('cookie.txt')
cookiejar.load(ignore_discard=True)  # ignore_discard=True keeps session cookies too
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)
# httpbin sets the cookie named in the query string on the response.
resp = opener.open('http://httpbin.org/cookies/set?freeform=spider')
# (Fix: the loop body below had lost its indentation in the paste.)
for cookie in cookiejar:
    print(cookie)
#cookiejar.save(ignore_discard=True)
10.requests库的使用
#发送GET请求
import requests

# GET with the requests library: .text is the decoded str (using
# response.encoding), .content is the raw bytes.
response = requests.get("https://www.baidu.com/")
#print(type(response.text))
#print(response.text)
#print(type(response.content))
#print(response.content.decode('utf-8'))
for value in (response.url, response.encoding, response.status_code):
    print(value)
import requests

# GET with query parameters: requests builds and percent-encodes the
# query string from `params`; a browser UA keeps Baidu from blocking us.
params = {
'wd':'中国'
}
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
response = requests.get("https://www.baidu.com/s", params=params, headers=headers)
# Save the result page; response.url shows the final encoded URL.
# (Fix: the with-block body had lost its indentation in the paste.)
with open('baidu.html', 'w', encoding='utf-8') as fp:
    fp.write(response.content.decode('utf-8'))
print(response.url)
#发送POST请求
import requests

# POST form data to lagou's Ajax search endpoint; response.json()
# parses the JSON body into a dict.
data = {
'first':'true',
'pn':'1',
'kw':'python'
}
headers = {
'Referer':'https://www.lagou.com/jobs/list_python?labelWords=sug&fromSearch=true&suginput=p',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# BUG FIX: the original URL had the text "Request Method: POST" pasted
# into the query string; the endpoint ends at needAddtionalResult=false.
response = requests.post('https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false', data=data, headers=headers)
print(type(response.json()))
print(response.text)
requests使用代理
import requests

# Route the request through an HTTP proxy; httpbin echoes back the IP
# it sees, so the output shows the proxy address instead of ours.
# (Free proxies like this one expire quickly.)
proxy = {
'http':'123.149.38.52'
}
response = requests.get('http://httpbin.org/ip', proxies=proxy)
print(response.text)
requests处理cookie信息
#Session 爬取知乎
# requests.Session keeps cookies across calls, so the login cookie set
# by the POST is automatically sent with the later GET.
url = 'https://www.zhihu.com/signin?next=%2Fsettings%2Faccount'
data = { "username":"19912456595","password":"pythoncookie0" }
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
session = requests.Session()
session.post(url, data=data, headers=headers)
response = session.get('https://www.zhihu.com/follow')
# (Fix: the with-block body had lost its indentation in the paste.)
with open('zhihu1.html', 'w', encoding='utf-8') as f:
    f.write(response.text)