Python：一些参数、方法的解析总结

最新推荐文章于 2021-02-09 19:16:43 发布

今天写点啥

最新推荐文章于 2021-02-09 19:16:43 发布

阅读量114

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/weixin_44396481/article/details/108994492

版权

python 专栏收录该内容

44 篇文章 1 订阅

订阅专栏

#urlopen()   模拟浏览器发起请求的过程，同时还可以处理授权认证、重定向、浏览器cookie以及其他内容
# import urllib.request
# response=urllib.request.urlopen('https://www.python.org')    #python官网的抓取
# # print(response.read().decode('utf-8'))
# print(type(response))    #查看响应的类型
# print(response.status)
# print(response.getheaders())
# print(response.getheader('Server'))    #查询服务器

#data参数（添加该参数，请求方式为post，不再是get）
# import urllib.parse
# import urllib.request
# data=bytes(urllib.parse.urlencode({'word': 'hello'}),encoding='utf8')    #传递的参数名是word，值是hello
# response=urllib.request.urlopen("http://httpbin.org/post",data=data)   #注：链接为双引号
# print(response.read())


#timeout参数     用来设置超时时间，即如果请求超过了这个设置的超时时间还没有响应，就会抛出异常
# import urllib.request
# response=urllib.request.urlopen('http://httpbin.org/get',timeout=1)   #设置超时时间为1秒
# print(response.read())

# import socket
# import urllib.request
# import urllib.error
# try:
#     response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
# except urllib.error.URLError as e:
#     if isinstance(e.reason,socket.timeout):
#         print('TIME OUT')



#Request构造方法：class urllib.request.Request(url,data=None,headers={},origin_req_host=None,
# unverifiable=False,method=None)
#url:请求的URL，必需参数
#data:可选，传的是bytes()类型的
#headers：请求头，在构造请求时通过headers参数直接构造，也可以通过调用请求实例的add_header()方法添加
#origin_req_host：请求方的host名称或者IP地址
#unverifiable：请求是否是无法验证的，默认是False
#method：请求使用的方法 post或get或put
# from urllib import request,parse
# url='http://httpbin.org/post'
# headers={
#     'User-Agent':'Mozilla/4.0 (compatible;MSIE 5.5;windows NT)',
#     'Host':'httpbin.org'
# }
# dict={
#     'name':'Germey'
# }
# data=bytes(parse.urlencode(dict),encoding='utf8')   #参数data用urlencode()和bytes()方法转换成字节流
# req=request.Request(url=url,data=data,headers=headers,method='POST')    #请求方式为post
# response=request.urlopen(req)
# print(response.read().decode('utf-8'))



#urlparse()方法可以实现URL的识别和分段
# from urllib.parse import urlparse
# result=urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# print(type(result),result)
#结果：scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment'
# 解析：://前面是scheme，代表协议；//之后、第一个/之前是netloc，即域名；后面是path，即访问路径；
# 分号后面是params，代表参数；问号后面是查询条件query，一般用作GET类型的URL；#后面是锚点，用于直接定位页面内部的下拉位置

#标准的链接格式：scheme://netloc/path;param?query#fragment

#scheme参数只有在URL中不含scheme信息时才有效，如果URL中含有scheme信息，就会返回解析的scheme
# from urllib.parse import urlparse
# result=urlparse('www.baidu.com/index.html;user?id=5#comment',scheme='https')    #不含scheme
# print(result)
#结果scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment'

# from urllib.parse import urlparse
# result=urlparse('http://www.baidu.com/index.html;user?id=5#comment',scheme='https')
# print(type(result),result)
#结果：scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment'

#urlparse()的API用法：urllib.parse.urlparse(urlstring,scheme='',allow_fragments=True)
#urlstring:必选项，待解析的URL
#scheme:它是默认的协议（比如http或https等）
#allow_fragments:即是否忽略fragment，如果它被设置为False,fragment部分就会被忽略，它会被解析为path、parameters
# 或者query的一部分，而fragment部分为空
# from urllib.parse import urlparse
# result=urlparse('http://www.baidu.com/index.html;user?id=5#comment',allow_fragments=False)
# print(result)
#结果：scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment=''

# from urllib.parse import urlparse
# result=urlparse('http://www.baidu.com/index.html#comment',allow_fragments=False)
# print(result)
#结果：scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment=''

#返回的结果是一个元组，可以使用索引顺序或属性名来获取
# from urllib.parse import urlparse
# result=urlparse('http://www.baidu.com/index.html#comment',allow_fragments=False)
# print(result.scheme,result[0],result.netloc,result[1],sep='\n')
#结果：
# http
# http
# www.baidu.com
# www.baidu.com

# urlunparse()接受的参数是可迭代对象，但长度必须是6
# from urllib.parse import urlunparse
# data=['http','www.baidu.com','index.html','user','a=6','comment']
# print(urlunparse(data))
# 结果：http://www.baidu.com/index.html;user?a=6#comment   成功实现URL构造


#urlsplit()方法不会单独解析params，它只返回5个部分
# from urllib.parse import urlsplit
# result=urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
# print(result)
#结果：scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment'


#urlunsplit()是将链接的各部分拼接成一个完整的链接,传入的可迭代对象长度必须为5
# from urllib.parse import urlunsplit
# data=['http','www.baidu.com','index.html','a=6','comment']
# print(urlunsplit(data))
#结果：http://www.baidu.com/index.html?a=6#comment


# 注：urlunparse()和urlunsplit()方法拼接链接需要特定长度的对象，链接的每一部分都要清晰地分开

# urljoin()同样能生成链接，需要提供一个base_url(基础链接)和新的链接，该方法会分析基础链接的scheme、netloc、path
#这三个部分，并对新链接缺失的部分进行补充，最后返回结果

# from urllib.parse import urljoin
# print(urljoin('http://www.baidu.com','index.html'))
# print(urljoin('http://www.baidu.com','?category=2#comment'))
#结果：
# http://www.baidu.com/index.html
# http://www.baidu.com?category=2#comment
#在新链接中如果不存在scheme、netloc、path，就会补充，若存在，则直接使用新链接的


#urlencode():构造get请求
# from urllib.parse import urlencode
# params={
#     'name':'germey',
#     'age':12
# }
# base_url='http://baidu.com?'
# url=base_url+urlencode(params)
# print(url)

#首先声明一个字典将参数列出来，然后调用urlencode()方法将其序列化为get请求参数


#parse_qs()将get请求参数转换回字典
# from urllib.parse import parse_qs
# query='name=germey&age=22'
# print(parse_qs(query))
#结果:{'name': ['germey'], 'age': ['22']}


# parse_qsl()：将参数转化为元组组成的列表
# from urllib.parse import parse_qsl
# query='name=germey&age=22'
# print(parse_qsl(query))
#结果：[('name', 'germey'), ('age', '22')]


#quote():将内容转化为URL编码的格式。URL带有中文参数的时候，可能会出现乱码的问题，该方法可以将中文字符转化为URL编码
# from urllib.parse import quote
# keyword='壁纸'
# url='http://www.baidu.com/s?wd='+quote(keyword)
# print(url)
#结果：http://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8

#unquote():进行URL解码
# from urllib.parse import unquote
#
# url='http://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
# print(unquote(url))
#结果：http://www.baidu.com/s?wd=壁纸


#Robots协议：也称为爬虫协议、机器人协议，告诉爬虫和搜索引擎哪些页面可以抓取，哪些页面不可以抓取

#当搜索爬虫访问一个站点时，首先会检查这个站点根目录下是否存在robots.txt文件，如果存在，就会根据这个
# 文件定义的爬取范围来爬取；如果没有找到这个文件，搜索爬虫便会访问所有可直接访问的页面
#例如：
# User-agent: *       搜索爬虫的名称，*表示对所有爬虫有效
#Disallow: /          不允许抓取的目录
#Allow: /public/      只允许抓取public目录
# Allow和Disallow一起使用，一般不会单独使用，用来排除某些限制


#禁止所有爬虫访问任何目录：User-agent: *    Disallow: /
#允许所有爬虫访问任何目录：User-agent: *    Disallow:
#禁止所有爬虫访问网站某些目录：User-agent: *    Disallow: /private/  Disallow: /tmp/
#只允许某个爬虫访问：User-agent: WebCrawler  Disallow:  User-agent: *  Disallow: /


#爬虫的名字已经固定
#robots.txt文件留空也是可以的

#robotparser模块提供的类RobotFileParser,可以根据某网站的robots.txt文件来判断一个爬虫是否有权限来爬取这个网页
# 方法：
# set_url():用来设置robots.txt文件的链接
#read():读取robots.txt文件并进行分析
#parse():解析robots.txt文件，传入的参数是robots.txt文件的某些行的内容
#can_fetch():该方法传入两个参数。一个是User-agent，一个是URL
#mtime():返回上次抓取和分析robots.txt的时间
#modified():将当前的时间设置为上次抓取和分析的时间


from urllib.robotparser import RobotFileParser
# Download and parse jianshu's robots.txt, then report whether a crawler
# matching the wildcard user agent '*' is permitted to fetch each page below.
# One True/False line is printed per page, in the order listed.
rp = RobotFileParser('http://www.jianshu.com/robots.txt')  # constructor records the robots.txt URL
rp.read()  # performs the network request and parses the rules
for page in (
    'http://www.jianshu.com/p/b67554025d7d',
    "http://www.jianshu.com/search?q=python&page=1&type=collections",
):
    print(rp.can_fetch('*', page))

今天写点啥

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python：一些参数、方法的解析总结

#urlopen() 模拟浏览器的发起过程，同时还带有处理授权认证、重定向、浏览器cookie以及其他内容# import urllib.request# response=urllib.request.urlopen('https://www.python.org') #python官网的抓取# # print(response.read().decode('utf-8'))# print(type(response)) #查看响应的类型# print(response.stat
复制链接

扫一扫