# Precisely select the <div> elements whose class attribute is "song".
# Fix: the original selector read @class="ong" (typo) and matched nothing.
r = tree.xpath('//div[@class="song"]')
索引定位
# Positional indexing: returns the n-th matching element. XPath indices are
# 1-based, so p[3] selects the third <p> child, not the fourth.
r = tree.xpath('//div[@class="song"]/p[3]')
取直系文本
# Direct text: text() returns a list containing only the element's own
# (direct-child) text nodes, not text from nested descendants.
r = tree.xpath('//div[@class="song"]//li[5]/a/text()')
取非直系文本
# Non-direct text: //text() collects text from the element AND all of its
# descendants, as a list of strings.
r = tree.xpath('//li[7]//text()')
取属性值
# Attribute extraction: @src yields the attribute value. xpath() still
# returns a list, hence the [0] to take the first match.
r = tree.xpath('//div[@class="song"]/img/@src')[0]
以上所有xpath方法返回的都是列表
xpath实战之爬取58二手房
import requests
from lxml import etree

if __name__ == '__main__':
    # Fetch the page source of the 58.com second-hand housing listing.
    url = 'https://bj.58.com/ershoufang/'
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text

    # Parse the HTML and locate every listing-detail container.
    tree = etree.HTML(page_text)
    # Renamed from `list` — the original shadowed the builtin.
    div_list = tree.xpath('//div[@class="property-content-detail"]')

    # `with` guarantees the file is closed even if an xpath lookup raises;
    # the original relied on a manual fp.close() that an exception could skip.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for div in div_list:
            # './' anchors the sub-query to this specific <div> element.
            title = div.xpath('.//text()')[0]
            print(title)
            fp.write(title + '\n')
xpath实战之4k图片解析下载
import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Fetch the 4K wallpaper listing page.
    url = 'https://pic.netbian.com/4kmeinv/'
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    response = requests.get(url=url, headers=head)
    # The site serves GBK but requests guesses ISO-8859-1. Instead of forcing
    # response.encoding = 'gbk' globally, only the file names are re-decoded
    # below, which is sufficient for the xpath queries used here.
    page_text = response.text

    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    # exist_ok avoids the race between the original exists() check and mkdir().
    os.makedirs('./picLibs', exist_ok=True)

    for li in li_list:
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # Fix mojibake: requests decoded GBK bytes as ISO-8859-1, so round-trip
        # the name back to bytes and decode it with the correct codec.
        img_name = img_name.encode('iso-8859-1').decode('gbk')

        # Download the binary image payload and write it under picLibs/.
        img_data = requests.get(url=img_src, headers=head).content
        img_path = os.path.join('picLibs', img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name + '下载完成!!')
xpath实战之全国城市名称爬取
import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Fetch the historical air-quality landing page, which lists all cities.
    url = 'https://www.aqistudy.cn/historydata/'
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)

    all_city_names = []

    # Hot (popular) cities sit directly under the "bottom" div's <ul>.
    # (Original variable was misspelled "host_li_list".)
    hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    all_city_names.extend(li.xpath('./a/text()')[0] for li in hot_li_list)

    # The full city list lives in the second nested <div> of the same <ul>.
    # Both groups could also be fetched in a single union query:
    # tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
    city_li_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    all_city_names.extend(li.xpath('./a/text()')[0] for li in city_li_list)

    print(all_city_names, len(all_city_names))
xpath实战之图片爬取
import requests
import os
import re
from lxml import etree

if __name__ == '__main__':
    # Running counter appended to file names to keep them unique.
    img_count = 0
    url = 'https://www.vilipix.com/ranking'
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)

    # Each ranked work's title links to its own detail page.
    link_list = tree.xpath('//div[@class="title"]/a')

    # exist_ok avoids the race between the original exists() check and mkdir().
    os.makedirs('./p站', exist_ok=True)

    for link in link_list:
        # Follow the relative href to the work's detail page.
        img_url = 'https://www.vilipix.com' + link.xpath('./@href')[0]
        detail_text = requests.get(url=img_url, headers=head).text
        detail_tree = etree.HTML(detail_text)
        img_list = detail_tree.xpath('//a[@href="javascript: void(0)"]/img')

        for img in img_list:
            img_count += 1
            img_src = img.xpath('./@src')[0]
            img_data = requests.get(url=img_src, headers=head).content
            img_name = img.xpath('./@alt')[0] + str(img_count) + '.jpg'
            # The original replaced only '?'; any character Windows forbids in
            # file names (\ / : * ? " < > |) would crash open(), so strip all.
            img_name = re.sub(r'[\\/:*?"<>|]', 'L', img_name)
            img_path = 'p站/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_name + '下载完成!!')
    print("over!!!!!")