# 一些常用方法,敲敲就明白了 (common lxml/XPath methods -- type them out and they become clear)
from lxml import etree

# A small XML document for practicing basic XPath expressions.
xml = """
<book>
<id>1</id>
<name>野花遍地香</name>
<price>1.23</price>
<author>
<nick>Zhou Daqiang</nick>
<nick>Zhou Zhiruo</nick>
<div>
<nick>Zhou JieLun</nick>
</div>
</author>
</book>
"""
tree = etree.XML(xml)

# Each expression below demonstrates one XPath building block:
#   "/"      -- walks one level of the hierarchy (leading "/" = root)
#   text()   -- extracts a node's text content
#   "//"     -- matches descendants at any depth below the node
#   "*"      -- wildcard matching any single element at that level
for expr in (
    "/book",                       # the root element itself
    "/book/id/text()",             # text inside <id>
    "/book/author//nick/text()",   # every <nick> anywhere under <author>
    "/book/author/*/nick/text()",  # <nick> exactly one wildcard level down
):
    result = tree.xpath(expr)
# Second practice document: an HTML page, for indexing, attribute
# filters, and relative lookups.
xml = """
<html>
<head>
<meta charset="UTF-8" />
<title>Title</title>
</head>
<body>
<ul>
<li><a href="http://www.baidu.com">百度</a></li>
<li><a href="http://www.google.com">谷歌</a></li>
<li><a href="http://www.sogou.com">搜狗</a></li>
</ul>
<ol>
<li><a href="feiji">飞机</a></li>
<li><a href="dapao">大炮</a></li>
<li><a href="huoche">火车</a></li>
</ol>
<div class="job">李嘉诚</div>
<div class="common">胡辣汤</div>
</body>
</html>
"""
tree = etree.XML(xml)

# XPath indices are 1-based: li[1] selects the FIRST <li>; @href reads
# the attribute value instead of the element.
result = tree.xpath("/html/body/ul/li[1]/a/@href")
# [@attr='value'] keeps only elements whose attribute matches.
result = tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")

# Relative lookup: a path starting with "./" continues from the node
# you already hold, so each <li> is searched independently.
ol_li_list = tree.xpath("/html/body/ol/li")
for li in ol_li_list:
    la = li.xpath("./a/text()")  # text of the <a> inside this <li>
    print(la)
# Get comfortable with the browser's developer tools -- that is where
# these paths come from.
# print(result)
# 爬取猪八戒网站信息 (scrape service listings from zbj.com)
import requests
from lxml import etree

# Scrape the "saas" search results from zbj.com and print each
# listing's title and price.
url = "https://www.zbj.com/fw/?k=saas"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    "cookie": ""
}
resp = requests.get(url, headers=headers, verify=False)
try:
    # print(resp.text)
    # Parse the page; this absolute path to the listing grid was copied
    # from the browser's developer tools.
    html = etree.HTML(resp.text)
    divs = html.xpath('//*[@id="__layout"]/div/div[3]/div[1]/div[4]/div/div[2]/div[1]/div[2]/div')
    for div in divs:
        price = div.xpath('./div/div[3]/div[1]/span/text()')
        # The search keyword is rendered as its own <span>, so the title
        # arrives in fragments -- re-join them around "SAAS".
        title = "SAAS".join(div.xpath('./div/div[3]/div[2]/a/span/text()'))
        # Guard: a card without a price node would raise IndexError on
        # price[0]; print the title alone in that case.
        if price:
            print(title + price[0])
        else:
            print(title)
finally:
    resp.close()  # release the connection even if parsing fails
# 对于需要登录的网站来说,cookie是个很好的选择 (for sites that require login, carrying the cookie is the way in)
# To scrape pages behind a login:
#   1. log in and obtain the cookie
#   2. request the bookshelf URL carrying that cookie
# A requests session chains these steps: cookies set by one request are
# replayed automatically on every later request in the same session.
import requests

# Option 1: use a session object.
session = requests.session()

# Step 1: log in -- the server attaches the auth cookie to this session.
data = {
    "loginName": "18614075987",
    "password": ""
}
resp = session.post("https://passport.17k.com/ck/user/login", data=data)

# Step 2: fetch the bookshelf; the session sends the cookie for us.
resp = session.get("https://user.17k.com/ck/author/shelf")
# print(resp)

# Option 2: copy the cookie by hand and attach it to the first request.
# 某些网站的反爬手段是检查你从哪个页面进来的,这时就要加入referer信息 (some sites check the Referer header as an anti-scraping measure -- supply it)
"""
开发者工具看到的是实时页面的数据
想要的页面不在源代码里那他肯定在二次请求里
"""
import requests

# The video source is NOT in the page HTML; it comes from a secondary
# JSON request (videoStatus.jsp) found via the browser's network panel.
url = "https://www.pearvideo.com/video_1693606"
# The contId is the numeric suffix of the page URL (".../video_<contId>").
contId = url.split('_')[1]
videoStatus = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.3755057662779513'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
    # Anti-hotlinking (Referer check): the server verifies this request
    # came from its own video page; without it the API refuses to answer.
    "Referer": url
}
resp = requests.get(videoStatus, headers=headers, verify=False)
try:
    info = resp.json()  # parse the JSON once instead of once per field
finally:
    resp.close()  # release the connection even if decoding fails
systemTime = info['systemTime']
srcUrl = info['videoInfo']['videos']['srcUrl']
# The API embeds systemTime in a decoy URL; the real file path uses
# "cont-<contId>" in that position instead.
srcUrl = srcUrl.replace(systemTime, f'cont-{contId}')
# Download the video to disk.
with open("video/a.mp4", mode="wb") as f:
    f.write(requests.get(srcUrl, verify=False).content)
# 还有可能进行封ip操作,这时我们可以使用代理 (sites may also ban your IP -- route requests through a proxy)
"""
快速,大量请求去爬取网站数据,会被网站封IP,所以需要代理IP
法律问题
"""
import requests

# Route the request through a proxy so the target site sees the proxy's
# IP, not ours. Keys map the target URL's scheme to a proxy endpoint.
proxies = {
    "https" : "http://121.40.253.153:80",
}
# Fix: the original called resp.close() BEFORE reading resp.text, which
# only worked because requests pre-fetches the body; a context manager
# closes the response after it has been consumed.
with requests.get('https://www.baidu.com', proxies=proxies) as resp:
    resp.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
    print(resp.text)