python模块(六)------JSON模块及其常用方法
urllib模块
>>> import urllib.request
>>> resp = urllib.request.urlopen('http://www.baidu.com')
>>> resp
<http.client.HTTPResponse object at 0x000001F02F4273C8>
>>> resp.url
'http://www.baidu.com'
>>> resp.headers
<http.client.HTTPMessage object at 0x000001F02F43B108>
>>> resp.status
200
>>> resp.read()
>>> resp.read(10)
b'<!DOCTYPE '
>>> resp.readline()
>>> import urllib.request
>>> import urllib.parse
>>> params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
>>> with urllib.request.urlopen(url) as f:
... print(f.read().decode('utf-8'))
...
>>> import urllib.request
>>> import urllib.parse
>>> data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> data = data.encode('ascii')
>>> with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
... print(f.read().decode('utf-8'))
...
>>> resp = urllib.request.Request('http://www.baidu.com')
>>> resp
<urllib.request.Request object at 0x000001F02F41CCC8>
Request.full_url
Request.type
Request.host
Request.origin_req_host
Request.selector
Request.data
Request.unverifiable
Request.method
Request.get_method()
Request.add_header(key, val)
Request.add_unredirected_header(key, header)
Request.has_header(header)
Request.remove_header(header)
Request.get_full_url()
Request.set_proxy(host, type)
Request.get_header(header_name, default=None)
Request.header_items()
>>> import urllib.request
>>> req = urllib.request.Request(url='https://localhost/cgi-bin/test.cgi',
... data=b'This data is passed to stdin of the CGI')
>>> with urllib.request.urlopen(req) as f:
... print(f.read().decode('utf-8'))
...
Got Data: "This data is passed to stdin of the CGI"
import urllib.request
DATA = b'some data'
req = urllib.request.Request(url='http://localhost:8080', data=DATA, method='PUT')
with urllib.request.urlopen(req) as f:
pass
print(f.status)
print(f.reason)
import urllib.request
req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
req.add_header('User-Agent', 'urllib-example/0.1 (Contact: . . .)')
r = urllib.request.urlopen(req)
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open('http://www.example.com/')
urllib.parse.quote(string, safe='/', encoding=None, errors=None)
例如: quote('/El Niño/') 将产生 '/El%20Ni%C3%B1o/'。
urllib.parse.unquote(string, encoding='utf-8', errors='replace')
>>> urllib.parse.unquote("https%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735072616%2F1658823828%5F1265602313%5F1808%5FsProdImgNo%5F1%2Ejpg%2F200")
'https://shp.qpic.cn/ishow/2735072616/1658823828_1265602313_1808_sProdImgNo_1.jpg/200'
>>> urllib.request.unquote("https%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735072616%2F1658823828%5F1265602313%5F1808%5FsProdImgNo%5F1%2Ejpg%2F200")
'https://shp.qpic.cn/ishow/2735072616/1658823828_1265602313_1808_sProdImgNo_1.jpg/200'
注意: urllib.request 内部从 urllib.parse 导入了 unquote 等函数, 因此 urllib.request.unquote 也能得到相同结果, 但规范写法是使用 urllib.parse.unquote。
urllib.parse.quote_plus(string, safe='', encoding=None, errors=None)
例如: quote_plus('/El Niño/') 将产生 '%2FEl+Ni%C3%B1o%2F'。
urllib.parse.unquote_plus(string, encoding='utf-8', errors='replace')
例如: unquote_plus('/El+Ni%C3%B1o/') 将产生 '/El Niño/'。
urllib.parse.quote_from_bytes(bytes, safe='/')
例如: quote_from_bytes(b'a&\xef') 将产生 'a%26%EF'。
urllib.parse.unquote_to_bytes(string)
例如: unquote_to_bytes('a%26%EF') 将产生 b'a&\xef'。
urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)
>>> urllib.parse.urlencode({"name":"zs"})
'name=zs'
>>> data = urllib.parse.urlencode({"name":"zs","age":25})
>>> data
'name=zs&age=25'
>>> data.encode('ascii')
b'name=zs&age=25'
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
>>> from urllib.parse import urlparse
>>> urlparse("scheme://netloc/path;parameters?query#fragment")
ParseResult(scheme='scheme', netloc='netloc', path='/path;parameters', params='',
query='query', fragment='fragment')
>>> o = urlparse("http://docs.python.org:80/3/library/urllib.parse.html?"
... "highlight=params#url-parsing")
>>> o
ParseResult(scheme='http', netloc='docs.python.org:80',
path='/3/library/urllib.parse.html', params='',
query='highlight=params', fragment='url-parsing')
>>> o.scheme
'http'
>>> o.netloc
'docs.python.org:80'
>>> o.hostname
'docs.python.org'
>>> o.port
80
>>> o._replace(fragment="").geturl()
'http://docs.python.org:80/3/library/urllib.parse.html?highlight=params'
>>> from urllib.parse import urlparse
>>> urlparse('//www.cwi.nl:80/%7Eguido/Python.html')
ParseResult(scheme='', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
params='', query='', fragment='')
>>> urlparse('www.cwi.nl/%7Eguido/Python.html')
ParseResult(scheme='', netloc='', path='www.cwi.nl/%7Eguido/Python.html',
params='', query='', fragment='')
>>> urlparse('help/Python.html')
ParseResult(scheme='', netloc='', path='help/Python.html', params='',
query='', fragment='')
>>> from urllib.parse import urlparse
>>> u = urlparse('//www.cwi.nl:80/%7Eguido/Python.html')
>>> u
ParseResult(scheme='', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
params='', query='', fragment='')
>>> u._replace(scheme='http')
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
params='', query='', fragment='')
urllib.parse.urljoin(base, url, allow_fragments=True)
>>> from urllib.parse import urljoin
>>> urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
'http://www.cwi.nl/%7Eguido/FAQ.html'
class urllib.robotparser.RobotFileParser(url='')
set_url(url)
read()
parse(lines)
can_fetch(useragent, url)
mtime()
modified()
crawl_delay(useragent)
request_rate(useragent)
site_maps()
>>> import urllib.robotparser
>>> rp = urllib.robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rrate = rp.request_rate("*")
>>> rrate.requests
3
>>> rrate.seconds
20
>>> rp.crawl_delay("*")
6
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
True
python爬虫(二)------requests模块及其常用方法-get()、post()