使用urllib (下)

最新推荐文章于 2023-12-21 10:01:37 发布

敲代码的小风

最新推荐文章于 2023-12-21 10:01:37 发布

阅读量93

点赞数

分类专栏：崔庆才《Python3网络爬虫开发实战》阅读笔记

本文链接：https://blog.csdn.net/m0_46653437/article/details/119246294

版权

崔庆才《Python3网络爬虫开发实战》阅读笔记专栏收录该内容

6 篇文章 1 订阅

订阅专栏

3.1-使用urllib

test_21.py

# urlunparse is the counterpart of urlparse: it assembles a URL from its parts.
# It accepts an iterable whose length must be exactly 6; otherwise it raises
# an error about too few or too many values.
from urllib.parse import urlunparse

data = [
    'http',           # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'user',           # params
    'a=6',            # query
    'comment',        # fragment
]
assembled = urlunparse(data)
print(assembled)

test_22.py

# urlsplit breaks a URL into five components; unlike urlparse it does not
# split out the params part, so ";user" stays inside the path.
from urllib.parse import urlsplit

# Parse once; the same result object serves both demonstrations below.
result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)

# The result is tuple-like: a field can be read by attribute name or by index.
print(result.scheme, result[0])

test_23.py

# urlunsplit is the counterpart of urlsplit: it joins five parts into one URL.
from urllib.parse import urlunsplit

data = [
    'http',           # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'a=6',            # query
    'comment',        # fragment
]
joined = urlunsplit(data)
print(joined)

test_24.py

# urljoin resolves a link (second argument) against a base URL (first
# argument) and returns the combined URL.
from urllib.parse import urljoin

# (base, link) pairs; one result line is printed per pair, in this order.
cases = [
    ('http://www.baidu.com', 'FAQ.html'),
    ('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'),
    ('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'),
    ('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'),
    ('http://www.baidu.com', '?category=2#comment'),
    ('www.baidu.com', '?category=2#comment'),
    ('www.baidu.com#comment', '?category=2'),
]
for base, link in cases:
    print(urljoin(base, link))

test_25.py

# urlencode serializes a dict of parameters into a GET query string.
from urllib.parse import urlencode

base_url = 'http://www.baidu.com?'
params = dict(name='germey', age=22)
# Append the encoded query string to the base URL.
url = f'{base_url}{urlencode(params)}'
print(url)



# parse_qs is the reverse of urlencode: it turns a query string into a dict
# mapping each parameter name to a list of its values.
from urllib.parse import parse_qs

# Pairs must be separated by a plain ampersand. The HTML-escaped form of the
# ampersand is not a separator and would corrupt the second key.
query = 'name=germey&age=22'
print(parse_qs(query))

test_26.py

# parse_qsl parses a query string into a list of (name, value) tuples.
from urllib.parse import parse_qsl

# Pairs must be separated by a plain ampersand. The HTML-escaped form of the
# ampersand is not a separator and would corrupt the second key.
query = 'name=germey&age=22'
print(parse_qsl(query))

test_27.py

# quote percent-encodes text (here Chinese characters) so that it can be
# placed safely inside a URL.
from urllib.parse import quote

# Another keyword to try: '刘德华'
keyword = ' 壁纸 '
url = f'https://www.baidu.com/s?wd={quote(keyword)}'
print(url)


# unquote reverses quote: it decodes percent-escapes back into readable text.
from urllib.parse import unquote

url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
decoded = unquote(url)
print(decoded)

test_28.py

# Check crawl permissions against a site's robots.txt.
from urllib.robotparser import RobotFileParser

# Passing the URL to the constructor has the same effect as creating an
# empty parser and then calling rp.set_url(...) on it.
rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read()  # fetches robots.txt over the network and parses it

# Ask whether any user agent ('*') is allowed to fetch each page.
for page in (
    'http://www.jianshu.com/p/b67554025d7d',
    "http://www.jianshu.com/search?q=python&page=1&type=collections",
):
    print(rp.can_fetch('*', page))

test_29.py

# Check crawl permissions against robots.txt, downloading the file ourselves
# and handing its lines to RobotFileParser.parse().
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen('http://www.jianshu.com/robots.txt') as response:
    robots_text = response.read().decode('utf-8')
# splitlines() handles both "\n" and "\r\n" line endings.
rp.parse(robots_text.splitlines())

# Ask whether any user agent ('*') is allowed to fetch each page.
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))