lxml with XPath, Example 1 (data extraction)

I'm a big fan of Crayon Shin-chan, so this example batch-downloads Crayon Shin-chan images.
 
Source analysis: all the images sit inside divs whose class is searchbqppdiv tagbqppdiv, and the real image URL is in the data-original attribute of each img tag (the site lazy-loads images, so the src attribute only holds a placeholder).
 
Approach: fetch the page source, extract the data, download the images.
 
Target site: https://www.fabiaoqing.com/
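Before the full scripts, here is a minimal sketch of how that XPath behaves. The HTML snippet is a made-up stand-in that mirrors the structure described above, not the site's actual markup:

from lxml import etree

# hypothetical snippet mirroring the structure described above; not the site's real markup
sample = '''
<div class="searchbqppdiv tagbqppdiv">
    <a href="/detail/1"><img class="lazy" src="placeholder.gif"
        data-original="http://example.com/xiaoxin_1.jpg"></a>
</div>
'''
html = etree.HTML(sample)
print(html.xpath("//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"))
# ['http://example.com/xiaoxin_1.jpg']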
requests + lxml, extracting just one page
 
import requests
from lxml import etree
from urllib import request  # urllib's built-in helper is used to save the files
import os

images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/1.html'
head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
req_url_images=requests.get(url=images_url,headers=head)

html_images=etree.HTML(req_url_images.text)
images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
images_last=html_images.xpath(images_get)

# the image/ directory must exist before urlretrieve can write into it
os.makedirs('image', exist_ok=True)

# running index used as the image file name
Indexes=1
for images_save in images_last:
    # every image on this site ends in .jpg, which keeps things simple
    request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    Indexes+=1
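With requests, the response decodes itself: req_url_images.text guesses the text encoding from the response headers, so the string can go straight into etree.HTML. The os.makedirs call matters because urlretrieve will not create the image/ directory on its own.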

 

urllib + lxml, same single page
from urllib import request
from lxml import etree
import os

images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/1.html'
head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
new_way=request.Request(url=images_url,headers=head)  # the Request object carries the custom headers
req_url_images=request.urlopen(new_way)

# urlopen returns bytes, so decode before handing the markup to lxml
html_images=etree.HTML(req_url_images.read().decode('utf-8'))

images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
images_last=html_images.xpath(images_get)

# the image/ directory must exist before urlretrieve can write into it
os.makedirs('image', exist_ok=True)

# running index used as the image file name
Indexes=1
for images_save in images_last:
    # every image on this site ends in .jpg, which keeps things simple
    request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    Indexes+=1
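Functionally this is identical to the requests version; the differences are that urllib needs a Request object to carry the custom User-Agent, and urlopen returns raw bytes, which is why the body has to be decoded (UTF-8 here) before lxml parses it.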

 

Extracting multiple pages with requests + lxml
import requests
from lxml import etree
from urllib import request  # urllib's built-in helper is used to save the files
import os

os.makedirs('image', exist_ok=True)

pages = 1
Indexes = 1
while pages < 11:
    images_url = 'https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/' + '%s.html' % pages
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    req_url_images = requests.get(url=images_url, headers=head)

    html_images = etree.HTML(req_url_images.text)
    images_get = "//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
    images_last = html_images.xpath(images_get)

    # Indexes keeps counting across pages, so file names never collide
    for images_save in images_last:
        # every image on this site ends in .jpg, which keeps things simple
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
        print('%s images downloaded so far' % Indexes)
        Indexes += 1
    pages += 1
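Nothing clever is needed for pagination: the page number is spliced straight into the URL (.../page/1.html, .../page/2.html, ...), and while pages < 11 walks the first ten result pages. Indexes is deliberately not reset inside the loop, so the file names keep counting up across pages instead of overwriting each other.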

  

Extracting multiple pages with urllib + lxml
from urllib import request
from lxml import etree
import os

os.makedirs('image', exist_ok=True)

pages=1
Indexes=1
while pages<11:
    images_url='https://www.fabiaoqing.com/search/search/keyword/%E8%9C%A1%E7%AC%94%E5%B0%8F%E6%96%B0/type/bq/page/'+'%s.html'%pages
    head={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    new_way=request.Request(url=images_url,headers=head)
    req_url_images=request.urlopen(new_way)

    # urlopen returns bytes, so decode before handing the markup to lxml
    html_images=etree.HTML(req_url_images.read().decode('utf-8'))

    images_get="//div[@class='searchbqppdiv tagbqppdiv']//img/@data-original"
    images_last=html_images.xpath(images_get)

    # Indexes keeps counting across pages, so file names never collide
    for images_save in images_last:
        # every image on this site ends in .jpg, which keeps things simple
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
        print('%s images downloaded so far' % Indexes)
        Indexes+=1
    pages+=1
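One possible refinement, my own suggestion rather than part of the original post: a single dead link currently aborts the whole crawl, because urlretrieve raises on any failed download. Wrapping the download step in a try/except lets the loop skip failures and keep going. A minimal sketch, written as a drop-in replacement for the inner for loop of either multi-page script (it reuses images_last, request, and Indexes from above):

# hypothetical hardening of the download step (my addition, not in the original post)
for images_save in images_last:
    try:
        request.urlretrieve(images_save, 'image/%s.jpg' % Indexes)
    except Exception as e:
        # skip broken links instead of crashing the whole run
        print('skipping %s: %s' % (images_save, e))
        continue
    print('%s images downloaded so far' % Indexes)
    Indexes += 1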

Reposted from: https://www.cnblogs.com/lcyzblog/p/11285962.html
