Python Web Scraping Learning Series: urllib (1)

urllib.request

import urllib.request

url='http://www.baidu.com'
response=urllib.request.urlopen(url)
print(response.read().decode())

# Converting between str and bytes:
# encode() converts a str to bytes; with no argument it defaults to utf-8, or pass another codec such as gbk
# decode() converts bytes to a str; with no argument it also defaults to utf-8
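
# A quick round-trip sketch of the conversion (the byte values shown are what these codecs produce):
s = '百度'
b = s.encode()            # str -> bytes, utf-8 by default: b'\xe7\x99\xbe\xe5\xba\xa6'
print(b.decode())         # bytes -> str: 百度
print(s.encode('gbk'))    # another codec: b'\xb0\xd9\xb6\xc8'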

# Get the URL of the response
print(response.geturl())
# Get the headers: getheaders() returns a list of tuples, which dict() turns into a dictionary
print(dict(response.getheaders()))
# Get the status code
print(response.getcode())
# Read line by line; returns a list of bytes objects
# (note: an HTTP response body can only be read once, so after the read() above this returns [])
print(response.readlines())

# Save the page to a local file as text
response=urllib.request.urlopen(url)   # re-open: the previous body has already been consumed
with open('baidu.html','w',encoding='utf8') as fp:
	fp.write(response.read().decode())
# Or store the raw bytes directly
response=urllib.request.urlopen(url)
with open('baidu1.html','wb') as fp:
	fp.write(response.read())

# Download an image and save it locally
image_url='http://www.zyglz.com/public/static/home/img/banner/a2.jpg'
# Method 1: read the bytes and write them out
response=urllib.request.urlopen(image_url)
# Images must be written in binary mode
with open('1.jpg','wb') as fp:
	fp.write(response.read())

# Method 2: urlretrieve downloads straight to a file
urllib.request.urlretrieve(image_url,'2.jpg')
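
# urlretrieve also accepts an optional reporthook callback that urllib calls after
# each block arrives, which is handy for showing progress. A minimal sketch (the
# callback name is my own choice):
def show_progress(block_num, block_size, total_size):
	# urllib passes the block count, block size in bytes, and total file size
	if total_size > 0:
		percent = min(100, block_num * block_size * 100 / total_size)
		print('downloaded %.1f%%' % percent)

urllib.request.urlretrieve(image_url, '3.jpg', show_progress)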

urllib.parse

import urllib.parse
import urllib.request

url='http://www.baidu.com/index.html?name=狗蛋&pwd=123456'

# A URL may only contain a restricted set of characters: letters, digits, underscores, etc.
# Anything else, such as $, spaces, or Chinese characters, must be percent-encoded

# URL encoding function
ret=urllib.parse.quote(url)
print(ret)
#http%3A//www.baidu.com/index.html%3Fname%3D%E7%8B%97%E8%9B%8B%26pwd%3D123456

# URL decoding function
ret=urllib.parse.unquote(ret)
print(ret)
# http://www.baidu.com/index.html?name=狗蛋&pwd=123456
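
# Note that quote leaves '/' untouched by default (its safe parameter defaults to '/'),
# which is why the slashes survived above while ':' became %3A; pass safe='' to encode everything:
print(urllib.parse.quote('a b/c'))            # a%20b/c
print(urllib.parse.quote('a b/c', safe=''))   # a%20b%2Fc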

url1='http://www.baidu.com/index.html'
# Suppose the URL takes parameters: name, age, sex, height
# url='http://www.baidu.com/index.html?name=goudan&age=18&sex=nv&height=180'
name='goudan'
age=18
sex='nv'
height='180'

data={
	'name':name,
	'age':age,
	'sex':sex,
	'height':height,
	'weight':180,
}

# Build the query string by hand: walk the dict and join key=value pairs with '&'
lt = []
for k , v in data.items():
	lt.append(k + '=' + str(v))
query_string='&'.join(lt)
print(url1+'?'+query_string)

# urlencode(data): data is a dict; urlencode joins it into a query_string, percent-encoding any special characters in the parameters
query_string=urllib.parse.urlencode(data)
url1=url1+'?'+query_string
response=urllib.request.urlopen(url1)
print(response.read())
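
# urllib.parse can also take a URL apart again; a small sketch with urlparse and
# parse_qs (note that parse_qs returns every value as a list):
parts = urllib.parse.urlparse(url1)
print(parts.netloc)                         # www.baidu.com
print(urllib.parse.parse_qs(parts.query))   # {'name': ['goudan'], 'age': ['18'], ...}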

GET requests

import urllib.parse
import urllib.request

# word=input('Enter a search term: ')
word="zyglz"
url='http://www.baidu.com/s?'
# List the parameters separately as a dictionary
data={
	'ie':'utf-8',
	'wd':word,
}
query_string=urllib.parse.urlencode(data)
url += query_string
print(url)
response=urllib.request.urlopen(url)
filename=word +'.html'
with open(filename,'wb') as fp:
	fp.write(response.read())

Request headers

import urllib.parse
import urllib.request

url='http://www.baidu.com/'
# Define the headers to spoof; faking the User-Agent makes the server think a real browser is browsing
headers={
	'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# Build a Request object carrying the custom headers
request=urllib.request.Request(url=url,headers=headers)
# Send the request
response=urllib.request.urlopen(request)
print(response.read().decode())
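
# To confirm the spoofed header actually went out, httpbin.org echoes the request's
# User-Agent back (a quick sanity-check sketch, assuming httpbin.org is reachable):
request = urllib.request.Request('http://httpbin.org/user-agent', headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode())   # {"user-agent": "Mozilla/5.0 ..."}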

POST requests

import urllib.parse
import urllib.request

url='https://fanyi.baidu.com/v2transapi'
word='wolf'
headers={
	'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
	'Cookie':'PSTM=1547277712; BIDUPSID=0E95D79222272DA55C610BA0B5F94018; BAIDUID=A91FE67E81B916D9B10F1356BA4542AD:FG=1; BDUSS=ZoaU9FSTJoNlpBLTJmYn5aZDdqNjQ5V2dSUXJPdlZtU1lWdElYT3pIODRKR0ZjQVFBQUFBJCQAAAAAAAAAAAEAAABG~UECaG9vY2hvbjEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADiXOVw4lzlcLV; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1549804685; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1549804685; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D',
}
# Form fields copied from the browser's network panel; sign and token are generated
# per word and session by Baidu's page JS, so reuse values captured in devtools
form_data={
	'from':'en',
	'to':'zh',
	'query':word,
	'transtype':'realtime',
	'simple_means_flag':3,
	'sign':'275695.55262',
	'token':'ec367a8c3f87c38b837b3e51934a4351',
}
form_data=urllib.parse.urlencode(form_data).encode()
request=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(request,form_data)
content=response.read().decode()   # the API returns utf-8 JSON, so decode with the default codec
print(content)
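
# Since the body is JSON, json.loads turns it into a dict for easier access
# (a sketch; the exact keys depend on the API version, so inspect the result first):
import json

result = json.loads(content)
print(type(result))    # <class 'dict'>
print(result.keys())   # see which keys this API version returns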

handler and opener

import urllib.parse
import urllib.request

url='http://www.baidu.com'
headers={
	'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# Create a handler
handler=urllib.request.HTTPHandler()
# Build an opener from the handler
# The opener is the object that actually sends requests; from here on use its
# open() method instead of urlopen
opener=urllib.request.build_opener(handler)
# Build the request object
request=urllib.request.Request(url,headers=headers)
# Send the request through the opener
response=opener.open(request)
print(response.read().decode())
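
# If the same opener should serve the whole program, install_opener makes it the
# default so that plain urlopen routes through it:
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)
print(response.getcode())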

Proxy settings

import urllib.parse
import urllib.request

# Create a ProxyHandler; the dict maps scheme to proxy address
handler=urllib.request.ProxyHandler({'http':'112.85.175.77:9999'})
# Build an opener from it
opener=urllib.request.build_opener(handler)
url='http://www.baidu.com/s?wd=ip'
headers={
	'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
request=urllib.request.Request(url,headers=headers)
response=opener.open(request)
with open('ip.html','wb') as fp:
	fp.write(response.read())
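
# Free proxies die constantly, so it is worth guarding the call
# (a minimal sketch; the 10-second timeout is an arbitrary choice):
import urllib.error

try:
	response = opener.open(request, timeout=10)
	print(response.getcode())
except urllib.error.URLError as e:
	print('proxy failed:', e.reason)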

cookiejar

import urllib.request
import urllib.parse
import http.cookiejar

post_url='http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019131833818'
# Create a CookieJar object to hold the session cookies
cj=http.cookiejar.CookieJar()
# Create a handler from the cookiejar
handler=urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler; every request sent through it shares the jar
opener=urllib.request.build_opener(handler)

form_data={
	'email':'hoochon@163.com',
	'icode':'',
	'origURL':'http://www.renren.com/home',
	'domain':'renren.com',
	'key_id':'1',
	'captcha_type':'web_login',
	'password':'************************************',
	'rkey':'39b392090c635431e86ef76d46f31f40',
	'f':'http%3A%2F%2Fwww.renren.com%2F1868758764',
}
headers={
	'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# Build the request
request=urllib.request.Request(url=post_url,headers=headers)
form_data=urllib.parse.urlencode(form_data).encode()
response=opener.open(request,data=form_data)
print(response.read().decode())
# {"code":true,"homeUrl":"http://www.renren.com/home"}
print('*' *50)
get_url='http://www.renren.com/1868758764/profile'
request=urllib.request.Request(url=get_url,headers=headers)
response=opener.open(request)
print(response.read().decode())
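
# To keep the login across runs, http.cookiejar can persist cookies to disk;
# a sketch with MozillaCookieJar (the filename is my own choice):
cj = http.cookiejar.MozillaCookieJar('cookies.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# ... log in through this opener as above, then persist the session:
cj.save(ignore_discard=True, ignore_expires=True)
# in a later run, restore it:
cj.load(ignore_discard=True, ignore_expires=True)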