xpath简介
- lxml是一个第三方框架,用于对xml文件进行格式化操作(html文件是一种特殊的xml文件)
- xpath是一种根据xml文件的文档结构来提取目标元素或者属性的语法,它的基本依赖工具就是lxml
将本地的test.html文件格式化成一个节点树对象
from lxml import etree # etree是lxml中的一种格式化工具,用于将html文件格式化成一个节点树结构 html_tree = etree.parse("./test.html") # 将本地的test.html文件格式化成一个节点树对象 print(html_tree) # 输出结果:<lxml.etree._ElementTree object at 0x0000028A81E566C8>
xpath功能介绍
- 1、获取节点
ret = html_tree.xpath("/html/body/ol/li[1]") # 里面用xpath路径来定位目标节点。如果以“/”代表从根节点开始查找
# xpath语法中 "/"代表当前节点的子节点 ,"//"代表当前节点的后代节点
# xpath函数,传入一个字符串参数,代表的是xpath路径,用于定位目标节点,返回值是一个列表,列表中定位到检测的那些节点
# 【注意】在xpath语法中数字都是从1开始数,没有0序号也没有负数
ret = html_tree.xpath("/html/body/div/div[1]/a")
- 2、提取节点的属性和内容
1 ret = html_tree.xpath("/html/body/div/div[1]/a/text()") # 提取标签的内容 3 ret = html_tree.xpath("/html/body/div/div[1]/a/@href") # 提取href属性,【注意】xpath语法中所有的节点属性要在前面加上“@ ”符号
- 3、定位
1 层级定位 2 ret = html_tree.xpath("/html/body//li/text()") # 获取页面上的所有的li
1 属性定位 3 ret = html_tree.xpath("/html/body//li[@id]/text()") # 查找页面上所有带有id属性的li 4 ret = html_tree.xpath("/html/body//li[@class='dudu']/text()") # 查找页面上所有class属性为dudu的li 5 ret = html_tree.xpath("/html/body//li[@class='tanshui taohua']/text()") # 属性的值一定要写全
- 4、模糊匹配
1 ret = html_tree.xpath("/html/body//li[contains(@class,'he')]/text()") # 包含:查找所有class值包含he的li 2 ret = html_tree.xpath("/html/body//li[starts-with(@class,'h')]/text()") # 开头:查找所有class值以h开头的li
- 5、逻辑匹配
1 ret = html_tree.xpath("/html/body//li[@class and @id]/text()") # 与:查找所有的li中包含id属性和class属性的li 2 ret = html_tree.xpath("//li[@class='nene' or @id='hh']/text()") # 或:查找所有的id值为hh或class值为neme的li 3 print(ret)
- 6、相对定位
ol = html_tree.xpath("//ol[2]")[0] # 查找第二个ol # 从上面查找到的ol中提取li ret = ol.xpath("//li/text()") # 用绝对路径来提取,无论xpath函数前面用谁调用,都是从文档的跟节来提取点 ret = ol.xpath("..//li/text()") # 用相对路径来提取,从xpath前面调用对象来查找。"."代表当前 ;".."代表当前的上一级 print(ret)
实例
1 from urllib import request,parse 2 from time import sleep 3 from lxml import etree 4 import re 5 import json 6 import csv 7 import redis 8 9 # 1、【数据的获取】 10 def request_from(url,page,city): 11 page_url = url%(city,page) 12 req = request.Request(headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'},url=page_url) 13 return req 14 def get_pages(url,start,end,city): 15 # 创建请求对象 16 for page in range(start,end+1): 17 req = request_from(url=url,page=page,city=city) 18 # 发起请求 19 res = request.urlopen(req) 20 sleep(1) 21 html = res.read().decode("utf-8") 22 23 yield html 24 25 # 2、【数据的解析】 26 def anylasis_data(pages): 27 for page in pages: 28 # 用etree将页面转成节点树 29 page_tree = etree.HTML(page) 30 house_list = page_tree.xpath("//ul[@class='sellListContent']/li") 31 # print(house_list) 32 # 迭代每一个li(每一个房屋信息内容) 33 for house in house_list: 34 # 提取内容 35 # 创建一个item字典,用于整合每一个房屋信息 36 item = {} 37 item["title"] = house.xpath(".//div[@class='title']//a/text()")[0] 38 item["houseInfo"] = "".join(house.xpath(".//div[@class='houseInfo']//text()")) 39 item["positionInfo"] = "".join(house.xpath(".//div[@class='positionInfo']//text()")) 40 item["unitPrice"] = re.findall(pattern=r'[0-9]+',string=house.xpath(".//div[@class='unitPrice']//text()")[0])[0] 41 item["totalPrice"] = house.xpath(".//div[@class='totalPrice']//text()")[0] 42 item["picUrl"] = house.xpath(".//img[@class='lj-lazy']/@data-original")[0] 43 44 yield item 45 46 # 3、【数据的存储】 47 def write_to_json(houses): 48 # 整合json数据 49 # 创建一个字典用于整合所有的房屋数据 50 hd = {} 51 # 创建一个列表,用于存储每一个房屋的信息 52 hl = [] 53 for house in houses: 54 hl.append(house) 55 hd["house"] = hl 56 # print(hd) 57 with open("house.json",'w',encoding='utf-8') as fp: 58 fp.write(json.dumps(hd)) 59 60 def write_to_redis(houses): 61 # 创建redis数据库连接 62 rds = redis.StrictRedis(host="www.fanjianbo.com",port=6379,db=6) 63 for house in houses: 64 rds.lpush("ershoufang",house) 65 66 def 
write_to_csv(houses): 67 # 打开一个csv文件 68 fp = open("ershoufang.csv","a+") 69 # 创建一个写对象 70 writer = csv.writer(fp) 71 # 写表头 72 writer.writerow(["title","houseInfo","positionInfo","unitPrice","totalPrice","picUrl"]) 73 for house in houses: 74 # csv二维表的每一行是一个列表 75 values = [] 76 for k,v in house.items(): 77 values.append(v) 78 writer.writerow(values) 79 fp.close() 80 81 if __name__ == '__main__': 82 url = "https://%s.lianjia.com/ershoufang/pg%d/" 83 city = input("请输入城市简称:") 84 start = int(input("请输入起始页:")) 85 end = int(input("请输入终止页:")) 86 pages = get_pages(url=url,city=city,start=start,end=end) 87 # print(pages) 88 houses = anylasis_data(pages) 89 # 存入json 90 write_to_csv(houses)