# Issue requests with different HTTP verbs; each call returns a Response.
import requests

r = requests.get('https://api.github.com/events')
r = requests.post('http://httpbin.org/post', data={'key': 'value'})
r = requests.put('http://httpbin.org/put', data={'key': 'value'})
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')

# The params argument takes a dict and encodes it into the query string.
# Keys whose value is None are NOT added to the query string.
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get('http://httpbin.org/get', params=payload)
# Pass a list of items as a value: the key is repeated in the query string.
import requests

payload = {'key1': 'value1', 'key2': ['value2', 'value3']}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)  # http://httpbin.org/get?key1=value1&key2=value2&key2=value3

r.text      # body decoded with the encoding guessed by Requests
r.encoding  # the encoding currently in use; assign to it to override
r.content   # raw body bytes — inspect these to work out the real encoding
r.encoding  # set the encoding you need before reading r.text again
r.text      # now decoded with the chosen encoding
r.content   # access the response body as bytes (binary response content)

# r.json() parses the body as JSON.  Note that a successful parse does NOT
# mean the request itself succeeded: check r.status_code is what you expect,
# or call r.raise_for_status().
r.json()
# To get at the raw socket response, set stream=True on the request.
import requests

r = requests.get('https://api.github.com/events', stream=True)
r.raw           # the raw socket response object
r.raw.read(10)  # read a few raw bytes

# Typical pattern for saving a streamed body to a file
# (filename/chunk_size were undefined in the original notes).
filename = 'events.json'
chunk_size = 128
with open(filename, 'wb') as fd:
    for chunk in r.iter_content(chunk_size):
        fd.write(chunk)

# We did not specify a user-agent in the previous examples.  To add HTTP
# headers, pass a dict to the headers parameter.  All header values must be
# a string, bytestring, or unicode; avoid unicode values where possible.
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
# Send form-encoded data: pass a dict as the data parameter.
import requests
import json

payload = {'key1': 'value1', 'key2': 'value2'}  # the form fields
r = requests.post("http://httpbin.org/post", data=payload)
print(r.text)
# Expected output fragment:
# {
#     "form": {"key2": "value2", "key1": "value1"},
#     ...
# }

# The GitHub API v3 accepts JSON-encoded POST/PATCH data, so serialize the
# payload yourself with json.dumps before sending it.
url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))
# Download an image with urllib.
from urllib import request

url = ('http://imgpoobbs.b0.upaiyun.com/uploadfile/photo/2016/8/'
       '201608051206091841435218.jpg!photo.middle.jpg')
resp = request.urlopen(url)  # issue the request
body = resp.read()           # read the response body bytes
# The with-statement guarantees the file is closed even if the write fails.
with open('1.png', 'wb') as f:
    f.write(body)
# The data parameter of urlopen.
import urllib.request
import urllib.parse

# bytes() turns the url-encoded string into a byte stream: the first
# argument is the string (built by urllib.parse.urlencode from a dict),
# the second is the character encoding.
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
# Supplying the data parameter makes the request a POST.
response = urllib.request.urlopen('http://www.python.org', data=data)
print(response.read())
# The timeout parameter of urlopen: with try...except a page that takes
# too long to respond can simply be skipped.
import urllib.request
import urllib.error
import socket

try:
    # If the server has not responded after `timeout` seconds,
    # urlopen raises URLError.
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# Signature of urllib.request.Request (reference, not runnable code):
#   urllib.request.Request(url, data=None, headers={},
#                          origin_req_host=None, unverifiable=False,
#                          method=None)
# headers is the request-header dict.  Its most common use is overriding
# User-Agent to masquerade as a browser: the default is Python-urllib.
# For example, set it to
#   Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11
# to pretend to be Firefox.
# Using urllib.request.Request, which accepts more options than urlopen.
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org',
}
# Renamed from `dict` to avoid shadowing the builtin.
form = {'name': 'Germey'}
data = bytes(parse.urlencode(form), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# Send the Request object through urlopen to get the response.
response = request.urlopen(req)
print(response.read().decode('utf-8'))
# Handler example: HTTP basic authentication.
import urllib.request

auth_handler = urllib.request.HTTPBasicAuthHandler()
# Register the credentials — the handler now takes care of authentication.
auth_handler.add_password(realm='PDQ Application',
                          uri='http://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
# build_opener wraps the handler in an Opener whose requests authenticate.
opener = urllib.request.build_opener(auth_handler)
urllib.request.install_opener(opener)  # make it the process-wide default
urllib.request.urlopen('http://www.example.com/login.html')
# Adding a proxy: the dict maps protocol names to proxy URLs.
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://218.202.111.10:80',
    'https': 'http://180.250.163.34:8888',
})
# The usual three steps: 1. Handler  2. build_opener  3. open
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('https://www.baidu.com')
print(response.read())
# Exception handling.
# error.URLError catches every exception raised by the request module.
from urllib import request, error
import socket

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)  # reason holds the cause of the failure

# error.HTTPError is a subclass of URLError, so catch the subclass first
# and fall back to the parent for non-HTTP errors.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    # code is the HTTP status code, headers the response headers.
    print(e.reason, e.code, e.headers)
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# reason is not always a string: on a timeout it is a socket.timeout
# instance, so test it with isinstance.
try:
    response = request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# Output:
# <class 'socket.timeout'>
# TIME OUT
# Parsing links.
# API: urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# ParseResult is a named tuple:
# ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html',
#             params='user', query='id=5', fragment='comment')
# Standard link format: scheme://netloc/path;parameters?query#fragment
print(type(result), result)
# urlunparse rebuilds a URL from any iterable of exactly six components
# (the six parts of the standard link format).
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# urlsplit is like urlparse except it returns no params component —
# params stays inside path.
# urlunsplit mirrors urlunparse but takes five components (no params).
from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# SplitResult(scheme='http', netloc='www.baidu.com',
#             path='/index.html;user', query='id=5', fragment='comment')
# urljoin(base, url): base supplies scheme, netloc and path; whichever of
# those three the second link lacks is filled in from base.
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html',
              'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu,com', '?categoty=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
# Output:
# http://www.baidu.com/FAQ.html
# https://cuiqingcai.com/FAQ.html
# http://cuiqingcai.com/FAQ.html
# https://cuiqingcai.com/FAQ.html?question=2
# https://cuiqingcai.com/index.php
# http://www.baidu.com?category=2#comment
# www.baidu,com?categoty=2#comment
# www.baidu.com?category=2
# The robots.txt protocol.
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()  # create a parser instance
rp.set_url('http://www.jianshu.com/robots.txt')
# read() fetches and parses robots.txt — without this call nothing is
# loaded and every can_fetch answer is meaningless.
rp.read()
# can_fetch(agent, url) -> whether that user agent may crawl the URL.
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))