First Steps in Python Web Scraping

import requests  # issue requests in different ways to get a response

r = requests.get('https://api.github.com/events')

r = requests.post('http://httpbin.org/post', data={'key': 'value'})

r = requests.put('http://httpbin.org/put', data={'key': 'value'})

r = requests.delete('http://httpbin.org/delete')

r = requests.head('http://httpbin.org/get')

r = requests.options('http://httpbin.org/get')

payload = {'key1': 'value1', 'key2': 'value2'}

r = requests.get('http://httpbin.org/get', params=payload)
# params takes a dict and appends it to the URL as a query string
# keys whose value is None are not added to the query string

# pass a list of items as the value

payload = {'key1': 'value1', 'key2': ['value2', 'value3']}

r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)
# Output: http://httpbin.org/get?key1=value1&key2=value2&key2=value3
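The comment above notes that None-valued keys are dropped; a minimal sketch confirming this (the key names are arbitrary):

payload = {'key1': 'value1', 'key2': None}
r = requests.get('http://httpbin.org/get', params=payload)
print(r.url)  # http://httpbin.org/get?key1=value1 -- key2 is omitted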

r.text      # the response body as text, decoded with the encoding Requests guessed

r.encoding  # find out which encoding Requests is using
# you can change it whenever you need to apply your own logic to work out
# what the encoding of the content should be

r.content   # inspect the raw bytes to determine the right encoding

r.encoding = 'utf-8'  # then set the encoding you worked out ('utf-8' is just an example)

r.text      # r.text is now decoded with the encoding you set

r.content   # access the response body as bytes (binary response content)

r.json()
# Note that a successful call to r.json() does not indicate the success of the
# response. To check that a request succeeded, use r.raise_for_status() or
# check that r.status_code is what you expect.
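A minimal sketch of the success check recommended above (httpbin's /status endpoint returns whatever status code you ask for):

r = requests.get('http://httpbin.org/status/404')
print(r.status_code)   # 404
r.raise_for_status()   # raises requests.exceptions.HTTPError for 4xx/5xx responses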

r = requests.get('https://api.github.com/events', stream=True)
# if you'd like to get the raw socket response, make sure you set the
# stream parameter to True

r.raw  # the raw socket response

r.raw.read(10)  # read the first 10 bytes of the raw stream

# to save what is being streamed to a file:
with open(filename, 'wb') as fd:
    for chunk in r.iter_content(chunk_size):
        fd.write(chunk)
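As a concrete version of the loop above, here is a minimal sketch where the filename ('events.json') and chunk_size (128) are arbitrary choices, not values fixed by the requests API:

r = requests.get('https://api.github.com/events', stream=True)
with open('events.json', 'wb') as fd:
    for chunk in r.iter_content(chunk_size=128):  # iterate over the body in 128-byte pieces
        fd.write(chunk)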

# we didn't specify a user-agent in the previous examples; if you'd like to
# add HTTP headers to a request, simply pass a dict to the headers parameter

url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}

r = requests.get(url, headers=headers)
# Note: all header values must be a string, bytestring, or unicode.
# While permitted, it's advised to avoid passing unicode header values.

# sending form-encoded data

payload = {'key1': 'value1', 'key2': 'value2'}  # the form data

r = requests.post('http://httpbin.org/post', data=payload)
# pass the dict you defined as the data parameter

>>> print(r.text)
{
  ...
  "form": {
    "key2": "value2",
    "key1": "value1"
  },
  ...
}

# the GitHub API v3 accepts JSON-encoded POST/PATCH data

import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}

r = requests.post(url, data=json.dumps(payload))
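Note: newer versions of requests can do the JSON encoding for you; passing the dict via the json parameter (instead of data=json.dumps(...)) serializes it and sets the Content-Type header automatically:

r = requests.post(url, json=payload)  # requests serializes payload to JSON itself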

# Download a single image

from urllib import request

url = 'http://imgpoobbs.b0.upaiyun.com/uploadfile/photo/2016/8/201608051206091841435218.jpg!photo.middle.jpg'
response = request.urlopen(url)  # send the request

content = response.read()  # read the returned bytes

f = open('1.png', 'wb')  # create an image file

f.write(content)  # write the response body into f

f.close()  # close the file
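For comparison, the same download written with requests, using the r.content bytes shown earlier (a sketch; the with-block closes the file automatically):

import requests

r = requests.get(url)
with open('1.png', 'wb') as f:
    f.write(r.content)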

# The data parameter of urlopen

import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
# bytes() converts to a byte stream: the first argument is a string, produced
# here by urllib.parse.urlencode() from the dict; the second is the encoding

response = urllib.request.urlopen('http://www.python.org', data=data)
# pass the extra data parameter; when data is given, the request method is POST
# print(type(response))

print(response.read())
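Since passing data turns the request into a POST, a clearer sanity check is to POST to httpbin.org, which echoes the form fields back (a sketch under that assumption):

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')  # b'word=hello'
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())  # the echoed JSON contains "form": {"word": "hello"}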

# The timeout parameter of urlopen

import urllib.request
import urllib.error
import socket

try:
    # (with try...except, a page that takes too long to respond can be skipped)
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
    # set the timeout parameter; if the server has not responded when the time
    # limit is reached, a URLError is raised
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# The signature of urllib.request.Request

class urllib.request.Request(url, data=None, headers={},
                             origin_req_host=None, unverifiable=False,
                             method=None)
# headers={} is the request headers; their most common use is to masquerade as
# a browser by changing User-Agent. The default User-Agent is Python-urllib;
# to pose as Firefox, for example, set it to
# Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11

# Using urllib.request.Request

from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}

data_dict = {'name': 'Germey'}

data = bytes(parse.urlencode(data_dict), encoding='utf8')

req = request.Request(url=url, data=data, headers=headers, method='POST')
# request.Request() accepts more parameters than urlopen

response = request.urlopen(req)  # then send the request with urlopen() and get the response

print(response.read().decode('utf-8'))
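Headers can also be attached after constructing the Request, via its add_header method:

req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')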

# Handler

import urllib.request

auth_handler = urllib.request.HTTPBasicAuthHandler()  # instantiate an HTTPBasicAuthHandler

auth_handler.add_password(realm='PDQ Application',
                          uri='http://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
# register the username and password with the handler, which now handles authentication

opener = urllib.request.build_opener(auth_handler)
# build_opener turns the handler into an Opener, so the Opener performs
# authentication when it sends requests

urllib.request.install_opener(opener)  # install the opener globally

urllib.request.urlopen('http://www.example.com/login.html')
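install_opener makes the opener global, so every later urlopen call uses it; if you only need the authenticated opener once, you can skip the installation and call it directly (a sketch):

response = opener.open('http://www.example.com/login.html')  # uses auth_handler without installing globally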

# Adding a proxy

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://218.202.111.10:80',
    'https': 'http://180.250.163.34:8888'
})
# the argument is a dict: keys are protocol types, values are proxy URLs

opener = urllib.request.build_opener(proxy_handler)  # build an Opener from this handler

response = opener.open('https://www.baidu.com')  # send the request

print(response.read())
# the pattern: 1. create a Handler  2. build_opener  3. open

# Exception handling
# error.URLError

from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:  # exceptions raised by request are all caught by URLError
    print(e.reason)  # URLError's reason attribute gives the cause of the error

#error.HTTPError

from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers)
    # three attributes: code is the status code, headers is the response headers
except error.URLError as e:
    # URLError is the parent class of HTTPError, so catch the subclass first
    # and fall back to the parent URLError if the error is not an HTTPError
    print(e.reason)
else:
    print('Request Successfully')

from urllib import request, error
import socket

try:
    response = request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# Output:
# <class 'socket.timeout'>
# TIME OUT
# reason does not always return a string; here e.reason is a
# socket.timeout object rather than a string

# Parsing URLs

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# the urlparse API: urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

print(type(result), result)
# Output:
# <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com',
#   path='/index.html', params='user', query='id=5', fragment='comment')
# ParseResult is a named tuple; the standard URL format is
# scheme://netloc/path;parameters?query#fragment
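Because ParseResult is a named tuple, the six components can be read by attribute or by index:

print(result.scheme, result[0])  # http http
print(result.netloc, result[1])  # www.baidu.com www.baidu.com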

#urlunparse

from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# the argument is an iterable of six components (the six parts of the standard URL format)

#urlsplit

from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# Output: SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user',
#                     query='id=5', fragment='comment')
# same as urlparse, except params is not returned separately but kept as part of path

# urlunsplit works like urlunparse, except it takes only five components (no params)
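A minimal sketch of urlunsplit with its five components (scheme, netloc, path, query, fragment):

from urllib.parse import urlunsplit

data = ['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']
print(urlunsplit(data))  # http://www.baidu.com/index.html?a=6#comment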

# Joining URLs with urljoin

from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu,com', '?categoty=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
# given two URLs, the first is the base_url, which supplies scheme, netloc and path;
# whichever of those three the second URL lacks is filled in from the base_url
# Output:
# http://www.baidu.com/FAQ.html
# https://cuiqingcai.com/FAQ.html
# http://cuiqingcai.com/FAQ.html
# https://cuiqingcai.com/FAQ.html?question=2
# https://cuiqingcai.com/index.php
# http://www.baidu.com?category=2#comment
# www.baidu,com?categoty=2#comment
# www.baidu.com?category=2

# The robots protocol

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()  # create an instance

rp.set_url('http://www.jianshu.com/robots.txt')

rp.read()  # fetch and parse robots.txt; this call is required, otherwise nothing is read!

print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))  # check whether fetching is allowed

print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
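A typical way to use can_fetch is to gate the crawl on its result (a sketch; the user agent string 'MyCrawler' is an arbitrary example):

target = 'http://www.jianshu.com/p/b67554025d7d'
if rp.can_fetch('MyCrawler', target):
    print('allowed to fetch', target)
else:
    print('disallowed by robots.txt')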
