Web scraping: learning and practicing XPath

XPath learning

from lxml import etree, html
# data = open("../python/aa.html", "r", encoding="utf-8").read()
# parse the data
data = open("./bb.html", "r", encoding="utf-8").read()
root_ele = html.fromstring(data)
print(root_ele, type(root_ele))

# extract the data
# ele_list = root_ele.xpath('//tag[@attr="value"]')
# root_ele.xpath('//div[@class]/li[@class="nav"]/text()')
# root_ele.xpath('//div[@class]/li[@class="nav"]/@attrname')
# /nodename    select from the root node downward
# ./nodename   select child nodes of the current node
# //nodename   match all descendant nodes starting from the root
# .//nodename  match all descendant nodes of the current node
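# A quick check of the four forms against a tiny inline snippet (an assumed
# example, since bb.html's contents are not shown in these notes):
demo = etree.fromstring("<html><body><div id='a'><div id='b'/></div></body></html>")
print(demo.xpath("/html/body/div/@id"))  # ['a']       absolute path from the root
print(demo.xpath("./body/div/@id"))      # ['a']       children of the current node
print(demo.xpath("//div/@id"))           # ['a', 'b']  all descendants of the root
inner = demo.xpath("./body/div")[0]
print(inner.xpath(".//div/@id"))         # ['b']       descendants of the current node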

# select all div nodes in the document
# result = root_ele.xpath('//div')
# print('1', result, len(result))
# print(2, result[0].xpath(".."))      # parent of the current node
# print(3, result[0].xpath(".//div"))  # 3 results: descendant divs of the current node
# print(3, result[0].xpath("//div"))   # 5 results: descendant divs of the root (// stays absolute even on a sub-element)
# print(4, result[0].xpath("./div/@id"))  # 1 result: id of a direct child div
# print(5, root_ele.xpath("./body"))   # 1 result
# print(5, root_ele.xpath("/html/body/div[1]//a/@href"))  # absolute path; [1] picks the first div
# print(5, root_ele.xpath("./body/div"))  # 2 results
# print(6, root_ele.xpath("/html/body/div[1]//a/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[1]/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[last()]/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[position()<3]/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[position()>=3]/text()"))  # XPath indexing starts at 1
# print(6, root_ele.xpath("/html/body/div[1]//a[@href]/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[@class='bri']/text()"))
# print(6, root_ele.xpath("/html/body/div[1]//a[starts-with(@href,'https')]/text()"))  # starts-with: attribute begins with the given prefix
# print(6, root_ele.xpath("/html/body/div[1]//a[contains(@href,'baidu')]/text()"))  # contains: substring match
# print(6, root_ele.xpath("/html/body/div[1]//a[contains(@href,'baidu')]/text()|/html/body/div[1]//a[starts-with(@href,'https')]/text()"))  # | unions the two result sets
# print(7, result[0].xpath("//*[@id]/@id"))  # every id attribute in the document

# result = root_ele.xpath('//body')
# print(1,root_ele.xpath("//html//body//ul//a//text()"))
# print(2,root_ele.xpath("//html//body//ul//a[contains(@href,'baidu')]"))
# print(3,root_ele.xpath("//html//body//ol//a"))
# print(4,root_ele.xpath("//html//body//ol//a/text()"))
'''XPath exercises (answered in the commented lines above):
1. Print all the text inside every ul tag.
2. Find the a tags whose href contains "baidu".
3. Find all a tags under the ol tag.
4. Find all a tags under the ol tag and print their text content.'''

On the target site, use right-click → Inspect to locate the elements, then extract each Douban Top 250 book's title, cover link, and rating with XPath and save them to a file
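The download module imported below is a local helper that is not shown in these notes. A minimal sketch of what get_text and get_content might look like, assuming the requests library (the User-Agent value is an assumption; Douban tends to reject the library's default one):

import requests

HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumed UA; Douban blocks bare requests

def get_text(url):
    # fetch a URL and return the body as text, or None on any failure
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None

def get_content(url):
    # fetch a URL and return the raw bytes (e.g. for saving cover images)
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        return resp.content
    except requests.RequestException:
        return None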

from download import get_text
from download import get_content
# from 暑假 import download

# create the output directory (the commented lines below also need "import os")
# path = "douban_top_250_xpath_imgs"
# if not os.path.isdir(path):
#     os.mkdir(path)  # create the directory only if it does not already exist
# file = open("douban_top_250.csv", "w", encoding="utf-8")

base_url = "https://book.douban.com/top250?start=0"
text = get_text(base_url)
if text:
    from lxml import html
    root = html.fromstring(text)
    all_books = root.xpath("//table")  # each book sits in its own table
    # print(all_books, len(all_books))
    for book in all_books:
        # join the text nodes, strip internal whitespace, normalize the full-width colon
        book_title = "".join("".join(book.xpath(".//td[2]//a//text()")).split()).replace("：", ":")
        book_info = book.xpath(".//td[2]//p[@class='pl']/text()")
        book_img = book.xpath(".//td[1]//img/@src")
        book_rate = book.xpath(".//td[2]//div[2]/span[2]/text()")
        print(book_title, book_info, book_img, book_rate)
    #     file.write(f"{book_title},{book_info},{book_img},{book_rate}\n")
    # file.close()
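The commented-out directory code above never actually saves the covers. A sketch of how get_content could do that; save_cover is a hypothetical helper, not part of the original notes:

import os

def save_cover(url, title, path="douban_top_250_xpath_imgs"):
    # hypothetical helper: download one cover and write it into the image directory
    os.makedirs(path, exist_ok=True)  # create the directory if it does not exist
    img_bytes = get_content(url)
    if img_bytes:
        with open(os.path.join(path, f"{title}.jpg"), "wb") as img_file:
            img_file.write(img_bytes)

# inside the loop above: if book_img: save_cover(book_img[0], book_title)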

Right-click → Inspect shows the served HTML is incomplete (parts are rendered by JavaScript), so open the Network panel's XHR tab to find the real source request and URL, then query that response instead

from bs4 import BeautifulSoup
import download

text = download.get_text("https://movie.douban.com/")
root = BeautifulSoup(text, "lxml")
# print(root)
result = root.find_all("a", attrs={"class": "item"})
print(result)  # likely incomplete or empty: these nodes are rendered by JavaScript
# browser:  web -> html -> script -> scripts run and fill in the remaining content
# requests: url -> html             embedded scripts are never executed
# crawler pipeline: crawl -> parse -> extract -> store -> analyze -> visualize -> present
file = open("movie2.csv", "w", encoding="utf-8")
hot_movie = download.get_text("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0")  # JSON API endpoint found in the XHR tab
# print(hot_movie)
import json
hot_movie_obj = json.loads(hot_movie)
# print(hot_movie_obj,type(hot_movie_obj))
for movie in hot_movie_obj["subjects"]:
    print(movie["title"],movie["rate"],movie["cover"])
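# The endpoint paginates via page_limit/page_start; a sketch of fetching further
# pages (assuming every page returns the same JSON shape as above):
for start in range(50, 200, 50):
    page = download.get_text(f"https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start={start}")
    if page:
        for movie in json.loads(page)["subjects"]:
            print(movie["title"], movie["rate"], movie["cover"])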

# js <-> json <-> python
# JSON is a lightweight data-interchange format: a string that follows the JSON
# spec (keys and strings must use double quotes)
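# Quick check of the double-quote rule (demo values, not real data): the first
# string parses, the single-quoted one raises
print(json.loads('{"title": "demo", "rate": "9.0"}'))
try:
    json.loads("{'title': 'demo'}")
except json.JSONDecodeError as err:
    print("not valid JSON:", err)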
hot_dianshi = download.get_text("https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0")
hot_dianshi_obj = json.loads(hot_dianshi)
for dianshi in hot_dianshi_obj["subjects"]:
    print(dianshi["title"],dianshi["rate"],dianshi["cover"])
    file.write(f"{dianshi['title']},{dianshi['rate']},{dianshi['cover']}\n")
file.close()
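A note on the hand-rolled CSV above: a title that itself contains a comma corrupts the row. The stdlib csv module quotes such fields automatically; a sketch reusing hot_dianshi_obj from above:

import csv

with open("movie2.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "rate", "cover"])  # header row
    for dianshi in hot_dianshi_obj["subjects"]:
        writer.writerow([dianshi["title"], dianshi["rate"], dianshi["cover"]])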
