Xpath
先来一个无关的小练习–求和求平均数
# Warm-up exercise: read N integers from standard input (one per line),
# then print their sum and their average.
# int(input()) raises ValueError if a line is not a valid integer.
N = 10
total = 0  # named `total` so the built-in sum() is not shadowed
count = 0
while count < N:
    number = int(input())
    total = total + number
    count = count + 1
average = total / N
print("N={},sum={}".format(N, total))
print("Average={}".format(average))
XPATH解析HTML字符串
# Parse the HTML string held in `text` into an lxml element tree.
html = etree.HTML(text)
# Serialize the tree back to markup; with encoding='utf-8' the result is bytes.
result = etree.tostring(html, encoding='utf-8')
XPATH解析HTML文件
# Load and parse the file 'text.xml' into an element tree.
htmlemt = etree.parse('text.xml')
# pretty_print=True asks lxml for indented, human-readable output.
result = etree.tostring(htmlemt, pretty_print=True)
获得所有 <li> 标签
# Parse the document, then select every <li> element anywhere in it.
htmlemt = etree.parse('text.xml')
result = htmlemt.xpath('//li')
# Show the whole list of matches, followed by the first match on its own
# (indexing raises IndexError when nothing matched).
print(result)
print(result[0])
change a little
获得所有 <li> 标签的所有 class
# Collect the class attribute value of every <li> element.
# The XPath must be delimited by plain ASCII quotes; typographic quotes are a syntax error.
result = htmlemt.xpath('//li/@class')
获得 <li> 标签下 <a> 标签及其后代的 class(不包括 <li> 标签本身的 class)
# Collect class attribute values from the <a> children of each <li> and from
# anything nested inside those <a> elements; the <li> elements' own class is excluded.
# The XPath must be delimited by plain ASCII quotes; typographic quotes are a syntax error.
result = htmlemt.xpath('//li/a//@class')
实战练习.爬取豆瓣网站
import requests
from lxml import etree
# Request headers sent with the page request: a browser-like User-Agent and a Referer.
headers = {
    "User-Agent": "Mozilla/5.0(windows NT 6.1;WOW64)AppleWebkit/537.36(khtml,like Gecko) Chrome/71.0.3554.0 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}
# Page to scrape.
url = "https://movie.douban.com/cinema/nowplaying/chongqing/"

# A timeout keeps the script from hanging forever if the server never answers.
rep = requests.get(url, headers=headers, timeout=10)
text = rep.text  # response body decoded to str
html = etree.HTML(text)

# Take the first <ul class="lists"> on the page (IndexError if the page has none),
# then every <li> that is a direct child of it.
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")

movies = []
for li in lis:
    # Each value is read from an attribute on the <li> itself; indexing [0]
    # raises IndexError if the attribute is missing.
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    region = li.xpath("@data-region")[0]
    actors = li.xpath("@data-actors")[0]
    director = li.xpath("@data-director")[0]
    # src of every <img> nested under this <li>; kept as a list, not a single value.
    liimg = li.xpath(".//img/@src")
    movie = {
        "title": title,
        "score": score,
        "region": region,
        "actors": actors,
        "directors": director,
        "liimg": liimg,
    }
    # Add this movie's record to the result list.
    movies.append(movie)

print(movies)