lxml with XPath, Example 1 (data extraction)

I'm a big fan of Crayon Shin-chan, so this example batch-downloads Crayon Shin-chan images.
 
Source analysis: all the images sit inside divs whose class is searchbqppdiv tagbqppdiv, and the real image URL is in the data-original attribute of each img tag (the site lazy-loads images, so the src attribute only holds a placeholder).
 
Approach: fetch the page source, extract the data, download the images.
 
Target site: https://www.fabiaoqing.com/
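Before the full scripts, here is a minimal sketch of how that XPath behaves. The HTML snippet is a made-up stand-in that mirrors the structure described above, not the site's actual markup:

from lxml import etree

# hypothetical snippet mirroring the structure described above; not the site's real markup
sample = '''
<div class="searchbqppdiv tagbqppdiv">
    <a href="/detail/1"><img class="lazy" src="placeholder.gif"
        data-original="http://example.com/xiaoxin_1.jpg"></a>
</div>
'''
html = etree.HTML(sample)
print(html.xpath("//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"))
# ['http://example.com/xiaoxin_1.jpg']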
requests + lxml, extracting just one page
 
import requests
from lxml import etree
from urllib import request  # urllib's built-in helper is used to save the files
import os

images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/1.html'
head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
req_url_images=requests.get(url=images_url,headers=head)

html_images=etree.HTML(req_url_images.text)
images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
images_last=html_images.xpath(images_get)

# the image/ directory must exist before urlretrieve can write into it
os.makedirs('image', exist_ok=True)

# running index used as the image file name
Indexes=1
for images_save in images_last:
    # every image on this site ends in .jpg, which keeps things simple
    request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    Indexes+=1
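With requests, the response decodes itself: req_url_images.text guesses the text encoding from the response headers, so the string can go straight into etree.HTML. The os.makedirs call matters because urlretrieve will not create the image/ directory on its own.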

 

urllib + lxml, same single page
from urllib import request
from lxml import etree
import os

images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/1.html'
head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
new_way=request.Request(url=images_url,headers=head)  # the Request object carries the custom headers
req_url_images=request.urlopen(new_way)

# urlopen returns bytes, so decode before handing the markup to lxml
html_images=etree.HTML(req_url_images.read().decode('utf-8'))

images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
images_last=html_images.xpath(images_get)

# the image/ directory must exist before urlretrieve can write into it
os.makedirs('image', exist_ok=True)

# running index used as the image file name
Indexes=1
for images_save in images_last:
    # every image on this site ends in .jpg, which keeps things simple
    request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    Indexes+=1
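Functionally this is identical to the requests version; the differences are that urllib needs a Request object to carry the custom User-Agent, and urlopen returns raw bytes, which is why the body has to be decoded (UTF-8 here) before lxml parses it.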

 

Extracting multiple pages with requests + lxml
import requests
from lxml import etree
from urllib import request  # urllib's built-in helper is used to save the files
import os

os.makedirs('image', exist_ok=True)

pages = 1
Indexes = 1
while pages < 11:
    images_url = 'https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/' + '%s.html' % pages
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    req_url_images = requests.get(url=images_url, headers=head)

    html_images = etree.HTML(req_url_images.text)
    images_get = "//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
    images_last = html_images.xpath(images_get)

    # Indexes keeps counting across pages, so file names never collide
    for images_save in images_last:
        # every image on this site ends in .jpg, which keeps things simple
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
        print('%s images downloaded so far' % Indexes)
        Indexes += 1
    pages += 1
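Nothing clever is needed for pagination: the page number is spliced straight into the URL (.../page/1.html, .../page/2.html, ...), and while pages < 11 walks the first ten result pages. Indexes is deliberately not reset inside the loop, so the file names keep counting up across pages instead of overwriting each other.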

  

Extracting multiple pages with urllib + lxml
from urllib import request
from lxml import etree
import os

os.makedirs('image', exist_ok=True)

pages=1
Indexes=1
while pages<11:
    images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/'+'%s.html'%pages
    head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    new_way=request.Request(url=images_url,headers=head)
    req_url_images=request.urlopen(new_way)

    # urlopen returns bytes, so decode before handing the markup to lxml
    html_images=etree.HTML(req_url_images.read().decode('utf-8'))

    images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
    images_last=html_images.xpath(images_get)

    # Indexes keeps counting across pages, so file names never collide
    for images_save in images_last:
        # every image on this site ends in .jpg, which keeps things simple
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
        print('%s images downloaded so far' % Indexes)
        Indexes+=1
    pages+=1
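One possible refinement, my own suggestion rather than part of the original post: a single dead link currently aborts the whole crawl, because urlretrieve raises on any failed download. Wrapping the download step in a try/except lets the loop skip failures and keep going. A minimal sketch, written as a drop-in replacement for the inner for loop of either multi-page script (it reuses images_last, request, and Indexes from above):

# hypothetical hardening of the download step (my addition, not in the original post)
for images_save in images_last:
    try:
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    except Exception as e:
        # skip broken links instead of crashing the whole run
        print('skipping %s: %s' % (images_save, e))
        continue
    print('%s images downloaded so far' % Indexes)
    Indexes += 1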

Reposted from: https://www.cnblogs.com/lcyzblog/p/11285962.html
