1. Project Introduction
This project uses Python 3.7 and a few common libraries to crawl images from mzitu.com: the requests library for HTTP, the os module for file handling, and lxml's XPath support for extraction. Images are stored by writing the response bytes with write(); alternatively, the urlretrieve() method from urllib.request can be used to save them.
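As a minimal sketch of that urlretrieve() alternative (the image URL and filename below are hypothetical placeholders): urlretrieve() takes no headers argument, so the Referer header this site demands (see section 2) has to be installed through a global opener first.
```
from urllib import request

# urlretrieve() cannot take per-request headers, so install a global
# opener that carries the Referer the site expects.
opener = request.build_opener()
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0'),
    ('Referer', 'http://www.mzitu.com/'),
]
request.install_opener(opener)

# Hypothetical image URL; the real URLs are extracted with XPath below.
request.urlretrieve('http://i.meizitu.net/2019/01/example.jpg', 'example.jpg')
```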
2. Preparation
Before crawling a site, we must analyze it first: figure out where the content we want lives, and decide which method to use to extract it. In this article I extract content with XPath, because it is noticeably simpler than bs4 or regular expressions for this kind of job.
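For example, pulling a gallery title out of the listing markup takes a single XPath expression; the snippet below runs against a simplified stand-in for the real page structure:
```
from lxml import etree

# Simplified stand-in for the site's listing markup.
html = etree.HTML('<ul id="pins"><li><span><a>Some Gallery</a></span></li></ul>')

# One XPath expression reaches the node directly; with bs4 or re the
# same extraction would take several chained calls or a fragile pattern.
print(html.xpath('//ul[@id="pins"]/li/span/a/text()')[0])  # -> 'Some Gallery'
```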
Also note: the images on this site cannot be downloaded directly. Each image request must carry the Referer of its own gallery page, or the anti-hotlinking check rejects the request and you will not get the intact file. The details are explained in the code below.
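In outline, a download request with the Referer attached looks like this (both URLs are hypothetical placeholders; the real ones are extracted further down):
```
import requests

# Without the per-image Referer, the site's anti-hotlinking check
# rejects the request; with it, the original file comes back intact.
img_headers = {
    'user-agent': 'Mozilla/5.0',
    'referer': 'http://www.mzitu.com/12345/1',  # the gallery page the image sits on
}
resp = requests.get('http://i.meizitu.net/2019/01/example.jpg', headers=img_headers)
with open('example.jpg', 'wb') as fp:
    fp.write(resp.content)
```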
3. Analyzing the Page
Opening the home page, we can see that the images are grouped under several different themes. Let's warm up by crawling the home page first:
```
import requests
import os
from lxml import etree

headers = {
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1556805731; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1556805746',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
base_url = 'http://www.mzitu.com/'
response = requests.get(base_url, headers=headers)
# print(response.status_code)
response.encoding = response.apparent_encoding
html = response.text
html_x_1 = etree.HTML(html)
# with open('zhuye.html', 'w', encoding='utf-8') as fp:
#     fp.write(html)

# Grab every gallery entry on the listing page ('.' selects the current node)
link_url = html_x_1.xpath('.//ul[@id="pins"]/li')
# print(link_url)
for info in link_url:
    title = info.xpath('./span[1]/a')[0].text  # gallery title, used as the folder name
    # Create the storage folder for this gallery
    if not os.path.exists(title):
        os.mkdir(title)
    detail_url = info.xpath('./a/@href')[0]  # URL of the gallery's detail page
    print(title, detail_url)
    # Visit the gallery detail page
    response = requests.get(detail_url, headers=headers)
    print(response.status_code)
    response.encoding = response.apparent_encoding  # decode with the detected encoding
    html_det = response.text
    html_x = etree.HTML(html_det)
    # with open('detail.html', 'w', encoding='utf-8') as fp:
    #     fp.write(html_det)

    # Number of image pages (one image per page) under detail_url
    total = html_x.xpath('//div[@class="pagenavi"]/a/span')[-2].text
    print(total)
    # Visit each page, extract the image URL, and download it
    for i in range(1, int(total) + 1):
        # Build the URL of page i
        pj_url = detail_url + '/' + str(i)
        print('Visiting page %s: ' % i + pj_url)
        response = requests.get(pj_url, headers=headers)
        response.encoding = response.apparent_encoding
        html_end = response.text
        html_x_end = etree.HTML(html_end)
        pic_url = html_x_end.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
        print(pic_url)
        # Download the image; a separate dict keeps the page-level
        # headers (with the cookie) intact for later page requests
        img_headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'referer': pj_url,  # each image needs its own page as Referer
        }
        filename = pic_url.split('/')[-1]  # name the file after the last segment of its URL
        response = requests.get(pic_url, headers=img_headers)
        with open(title + '/' + filename, 'wb') as fp:
            fp.write(response.content)
```
The code above crawls the home page.
To crawl the different categories, we use a dictionary to map each category name to its URL path:
```
import requests
import os
from lxml import etree

# Crawl the listing pages of the chosen category
def get_type(url, headers):
    first_url = url + 'page/{}/'  # paginate within the selected category
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    with open("1.html", "w", encoding="utf-8") as fp:
        fp.write(html)
    html_x = etree.HTML(html)
    # Total number of listing pages
    total = html_x.xpath('//div[@class="nav-links"]/a/text()')[-2]
    for i in range(1, int(total) + 1):
        end_url = first_url.format(i)
        print("Visiting: " + end_url)
        xiangqqing(end_url)
        break  # stop after the first listing page while testing; remove to crawl them all

# Parse one listing page and walk into each gallery
def xiangqqing(url):
    print("Entering the listing page...")
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    link_url = html_x.xpath('.//ul[@id="pins"]/li')  # '.' selects the current node
    # print(link_url)
    # a bit rough here; needs some tidying...
    for link_info in link_url:
        title = link_info.xpath('./span[1]/a')[0].text
        if not os.path.exists(title):
            os.mkdir(title)
        tuurl = link_info.xpath('./a/@href')[0]
        print(tuurl)
        get_downurl(tuurl, title)
        # break

# Find out how many image pages a gallery has
def get_downurl(url, title):
    print('Entering the gallery page...')
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    # Number of image pages in this gallery
    total = html_x.xpath('//div[@class="pagenavi"]/a/span')[-2].text
    print(total)
    # Visit each page, extract the image URL, and download it
    for i in range(1, int(total) + 1):
        # Build the URL of page i
        endurl = url + '/' + str(i)
        print(endurl)
        downloadtu(endurl, title)

# Extract the image URL from a single page
def downloadtu(url, title):
    print('Extracting the image link...')
    # html_x = get_url(url)
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    # print(html)
    # with open("2.html", "w", encoding="utf-8") as fp:
    #     fp.write(html)
    tupian_url = html_x.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    down_photo(tupian_url, url, title)

# Download one image
def down_photo(tupian_url, url, title):
    img_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'referer': url,  # the gallery page this image belongs to, as a plain string
    }
    filename = tupian_url.split('/')[-1]
    response = requests.get(tupian_url, headers=img_headers)
    with open(title + '/' + filename, 'wb') as fp:
        fp.write(response.content)

if __name__ == '__main__':
    category = input("Enter the category to crawl (性感, 日本, 台湾, 清纯): ")
    type_map = {"性感": "xinggan/", "日本": "japan/", "台湾": "taiwan/", "清纯": "mm/"}
    pj = type_map[category]
    print(pj)
    base_url = 'http://www.mzitu.com/' + pj
    print(base_url)
    headers = {
        'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1556805731; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1556805746',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    get_type(base_url, headers)
```
Note: every request must include the request headers…
If you run into any problems, I'd be happy to discuss them together. Thanks!