1、urllib库
1.1、get请求
import urllib.request

# Plain GET request: urlopen returns an http.client.HTTPResponse object.
resp = urllib.request.urlopen('http://httpbin.org/')
# read() yields the raw response body as bytes (b'...').
resp.read()
1.2、post请求
import urllib.request
import urllib.parse

# Encode the form fields to a query string, then to bytes (urlopen requires bytes).
payload = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
# Passing data= switches the request method to POST.
resp = urllib.request.urlopen('http://httpbin.org/post', data=payload)
# Decode the body bytes back into text before printing.
print(resp.read().decode())
1.3、request请求
from urllib import request, parse

# POST via an explicit Request object so headers and method can be set.
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE S.S; Windows NT)',
    'Host': 'httpbin.org'
}
# Renamed from ``dict`` — the original shadowed the builtin ``dict`` type.
form_fields = {
    'name': 'tom'
}
data = bytes(parse.urlencode(form_fields), encoding='utf-8')
# Build the Request with url/body/headers/method, then send it.
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
2、requests库
2.1.1、request请求,文本
import requests

# GET with a custom User-Agent header; .text decodes the body into a str.
url = 'https://github.com/favicon.ico'
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE S.S; Windows NT)'}
resp = requests.get(url=url, headers=headers)
print(resp.text)
2.1.2、request请求,音频图片
import requests

# Binary resources (images, audio) may be requested like any other URL;
# headers/params are optional here, not forbidden.
url = 'https://dss0.bdstatic.com/5aV1bjqh_Q23odCf/static/newmusic/img/default_b91837ae.png'
r = requests.get(url=url)
# .text decodes the bytes as text — mojibake for an image, shown only for contrast.
print(r.text)
# .content is the raw binary payload of the image.
print(r.content)
# Fixed: the URL is a .png, so save with a matching extension (was 'image.jpg').
with open('image.png', 'wb') as f:
    f.write(r.content)
2.1.3、request请求,制作cookie
import requests

# Raw Cookie header text copied from the browser (multiline for readability).
Cookies = '''Hm_lvt_3eec0b7da6548cf07db3bc477ea905ee=1605540848,1605699228,1605796015,1605889462;
__gads=ID=8f7f1d94aac31670-22c8680b0cc40050:T=1602669898:RT=1602669898:S=ALNI_MbJoiXyh7qpesB_FMo0Di8whk3euA;
_ga=GA1.2.1489454111.1604571205; SERVERID=6930fc440219e0310834d3d7076971ef|1605889461|1605889460;
Hm_lpvt_3eec0b7da6548cf07db3bc477ea905ee=1605889462; _gid=GA1.2.1826454918.1605889463'''
jar = requests.cookies.RequestsCookieJar()
for cookie in Cookies.split(';'):
    # Split only on the FIRST '=' — cookie values may themselves contain '='.
    key, value = cookie.split('=', 1)
    # Fixed: strip() removes the newlines/spaces left over from the multiline
    # literal, which the original embedded into the cookie names.
    jar.set(key.strip(), value.strip())
print(type(jar), '\n', jar)
----------------------------------------------------------------------------------------
<class 'requests.cookies.RequestsCookieJar'>
<RequestsCookieJar[<Cookie
Hm_lpvt_3eec0b7da6548cf07db3bc477ea905ee=1605889462 for />, <Cookie
__gads=ID=8f7f1d94aac31670-22c8680b0cc40050:T=1602669898:RT=1602669898:S=ALNI_MbJoiXyh7qpesB_FMo0Di8whk3euA for />, <Cookie
_ga=GA1.2.1489454111.1604571205 for />, <Cookie SERVERID=6930fc440219e0310834d3d7076971ef|1605889461|1605889460 for />, <Cookie _gid=GA1.2.1826454918.1605889463 for />, <Cookie Hm_lvt_3eec0b7da6548cf07db3bc477ea905ee=1605540848,1605699228,1605796015,1605889462 for />]>
2.2、post请求
import requests

url = 'http://httpbin.org/post'
# Fixed: the original was missing the comma after 'Host', which made the
# dict literal a SyntaxError (adjacent string literals concatenate).
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE S.S; Windows NT)',
    'Host': 'httpbin.org',
    'cookie': ''
}
data = {
    'name': 'tom',
    'age': 22
}
# POST the form fields; httpbin echoes them back in the response body.
r = requests.post(url=url, headers=headers, data=data)
print(r.text)
3、解析
3.1、css选择器
#选择 class="intro"的所有节点 .intro
#选择 id="firstname"的所有节点 #firstname
#选择所有节点 *
#选择所有 div 节点和所有 p 节点 div,p
#选择 div 节点内部的所有 p节点 div p
#选择父节点为 div 节点的所有 p节点 div>p
#选择带有 target 属性的所有节点 [target]
#选择 target="blank"的所有节点 [target="blank"]
#选择活动链接 a:active
#选择没有子节点的所有 p节点 p:empty
#选择被用户选取的节点部分 ::selection
3.2、正则表达式
#匹配字母、数字及下划线 \w
#匹配任意空白字符,等价于[ \t\n\r\f\v] \s
#匹配任意数字,等价于[0-9] \d
#匹配一行字符串的开头 ^
#匹配一行字符串的结尾 $
#匹配任意字符 .
#用来表示一组字符,单独列出 [....]
#匹配0个或多个表达式 *
#匹配 1 个或多个表达式 +
#匹配0个或1个前面的正则表达式定义的片段,非贪婪方式 ?
#匹配 n到 m次由前面正则表达式定义的片段,贪婪方式 {n,m}
#匹配 a 或 b a|b
#匹配括号内的表达式,也表示一个组 ()
3.3、XPath
nodename #选取此节点的所有子节点
/ #从当前节点选取直接子节点
// #从当前节点选取子孙节点
. #选取当前节点
.. #选取当前节点的父节点
@ #选取属性
from lxml import etree
import requests

# Fetch the Douban Top-250 page and explore it with XPath.
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0',
}
response = requests.get(url=url, headers=headers)
# etree.HTML parses the markup and returns the root element for XPath queries.
html = etree.HTML(response.text)
nodes = html.xpath('//div[@class="hd"]')
for i in nodes:
    # text of the first <span> inside the <a>
    print(i.xpath('./a/span[1]/text()'))
    # text of every <span> inside the <a>
    print(i.xpath('./a/span/text()'))
    # attribute value
    print(i.xpath('./a/@href'))
    # up to the parent, then into a sibling subtree
    print(i.xpath('./../div[2]/div/span[2]/text()'))
    # last matching node
    print(i.xpath('./../div[2]/div/span[last()]/text()'))
    # first two nodes only
    print(i.xpath('./a/span[position()<3]/text()'))
# ancestor axis — fixed: original '//li[l]' used the letter l instead of the
# digit 1, which is invalid XPath and raises XPathEvalError in lxml.
result = html.xpath('//li[1]/ancestor::div')
# following siblings of the first <li>
result = html.xpath('//li[1]/following-sibling::*')
3.4、json-Ajax
import requests

# Start URL discovered by inspecting the site's XHR (Ajax) requests.
start_url = 'https://www.ofweek.com/ai/CATList-201700-{}-.htm'
url_list = [start_url.format(i) for i in range(1, 10)]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0',
}
for url in url_list:
    # Each Ajax page responds with JSON; .json() parses it straight into a dict.
    payload = requests.get(url=url, headers=headers).json()
    for item in payload["newsList"]:
        print(item)
        print(item["title"])
        print(item["htmlpath"])