爬虫（一）——必应排行榜壁纸图片下载

最新推荐文章于 2024-02-10 22:09:24 发布

ccc_irene

最新推荐文章于 2024-02-10 22:09:24 发布

阅读量426

点赞数

分类专栏： Python数据分析

本文链接：https://blog.csdn.net/qq_41433316/article/details/104988074

版权

Python数据分析专栏收录该内容

4 篇文章 1 订阅

订阅专栏

实现的是一个爬取必应每日壁纸排行榜上的壁纸图片，下载图片，保存图片信息的一个简单的过程。
代码可以直接使用，缺少的模块直接安装即可：pandas、requests 和 bs4 必须安装（写入 .xlsx 文件还需要 openpyxl，数据分析部分需要 pyecharts）。

一、代码

先给出完整版：
crawler_picture.py

import argparse
import os
import time

import pandas
import requests
from bs4 import BeautifulSoup
from proxy_available import proxy_available

class Crawler():
    """Crawler for the Bing wallpaper ranking pages served at ``self.url``.

    Starting at ``args.start_page`` and stopping after ``args.end_page``, it
    collects one row of metadata per wallpaper and, depending on the flags,
    downloads the images, counts the shooting locations per continent and
    writes all rows to an Excel sheet.

    ``args`` is any object exposing ``start_page``, ``end_page``, ``write``,
    ``download`` and ``analysis`` (for example an ``argparse.Namespace``).
    """

    # Seconds a request may stay silent before it is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, args):
        """Store the options and prepare the output folder / result table."""
        self.headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36'}

        self.start_page = args.start_page                       # first page to crawl
        self.end_page = args.end_page                           # last page to crawl
        self.url = 'https://bing.ioliu.cn/ranking'              # ranking page, paged with ?p=<n>

        self.page_num = self.start_page                         # page currently being processed

        self.if_write = args.write
        self.if_download = args.download
        self.download_num = 0
        self.if_analysis = args.analysis

        self.proxy_available = proxy_available

        # Always defined so that download()/analysis()/data_analysis() can
        # also be called directly, whatever the flags are.
        self.datatype = 'images'                                # folder receiving the images
        self.location_name = ['亚洲','欧洲','非洲','美洲','大洋洲']
        self.location_times = [0,0,0,0,0]

        self.pic_result_list = []                               # one row per wallpaper
        if self.if_write:
            # Header row of the Excel sheet.
            self.pic_result_list.append([
                'name/图片名',
                'description/描述',
                'calendar/日期',
                'location/地点',
                'view/查看次数',
                'like/点赞',
                'download_times/下载次数',
            ])
        if self.if_download:
            if not os.path.exists(self.datatype):
                os.makedirs(self.datatype)
            else:
                # Start from an empty folder: remove files left by earlier runs.
                for i in os.listdir(self.datatype):
                    path_file = os.path.join(self.datatype,i)
                    if os.path.isfile(path_file):
                        os.remove(path_file)

    def get_page(self, url):
        """Crawl from ``url`` until the end page; an empty ``url`` means the start page.

        Pages are processed in a loop (no recursion). Whatever ends the crawl,
        the rows collected so far are finalised as soon as at least one page
        has been parsed, so an error on a late page no longer discards them.
        """
        if not url:
            url = self.url+'?p='+str(self.start_page)
        pages_done = 0
        while url:
            try:
                html = self._fetch(url)
                if html is None:
                    break
                url = self.decode(html)
                pages_done += 1
            except Exception as e:                              # crawl boundary: report and stop
                print("出现错误")
                print(e)
                break
        if pages_done:
            self._finish()

    def _fetch(self, url):
        """Return the body of ``url``, retrying through the proxies on a non-200 answer."""
        self.req = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if self.req.status_code == 200:
            return self.req.content
        for proxy in self.proxy_available:
            try:
                self.req = requests.get(url, headers=self.headers, proxies=proxy,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                print("该代理ip无效")
                continue
            if self.req.status_code == 200:
                return self.req.content
        print("出现错误")
        print("状态码：" + str(self.req.status_code))
        return None

    def decode(self, html):
        """Parse one ranking page and return the URL of the page to fetch next.

        Every wallpaper card adds a row to ``self.pic_result_list`` and is
        downloaded / analysed according to the flags. ``None`` is returned
        when ``end_page`` has been reached or the page offers no further
        page. Fetching the returned URL and finalising the results is done
        by :meth:`get_page`.
        """
        soup = BeautifulSoup(html, "html.parser")
        mask = soup.find('body').find('div', attrs={'class':'mask'})
        container = mask.next_sibling                           # node following the mask holds the cards

        start_time = time.time()

        for picture in container.find_all('div', attrs={'class':'item'}):
            pic_url, row = self._parse_item(picture)
            self.pic_result_list.append(row)
            if self.if_download:
                self.download(pic_url, self.datatype)
            if self.if_analysis:
                self.analysis(row[3])                           # row[3] is the location

        if self.if_download:
            print('\n'+"第"+str(self.page_num)+"页下载完毕，该页耗时："+str(round(time.time()-start_time,2))+"s")

        # ">=" also stops cleanly when end_page is smaller than start_page.
        if self.page_num >= self.end_page:
            return None
        next_url = self._next_page_url(soup, self.page_num)
        if next_url is not None:
            self.page_num += 1
        return next_url

    def _parse_item(self, picture):
        """Return ``(image_url, row)`` for one wallpaper card."""
        card = picture.find('div', attrs={'class':'card progressive'})
        pic_url = card.find('img')['src'].split('?')[0]         # drop the resize query string
        description = card.find('div', attrs={'class':'description'})
        options = card.find('div', attrs={'class':'options'})
        row = [
            pic_url.split('/')[-1],
            description.find('h3').getText(),
            self._field_text(description, 'p', 'calendar', "0000-00-00"),
            self._field_text(description, 'p', 'location', "未知"),
            self._field_text(description, 'p', 'view', 0),
            self._field_text(options, 'span', 'ctrl heart', 0),
            self._field_text(options, 'a', 'ctrl download', 0),
        ]
        return pic_url, row

    @staticmethod
    def _field_text(parent, tag, css_class, default):
        """Return the text of ``<em class="t">`` inside ``parent``'s ``tag.css_class`` node, else ``default``."""
        node = parent.find(tag, attrs={'class':css_class})
        if node is None:
            return default
        value = node.find('em', attrs={'class':'t'})
        if value is None:
            return default
        return value.getText()

    @staticmethod
    def _page_number(query):
        """Return the integer value of the ``p`` parameter in ``query``, or ``None``."""
        for pair in query.split('&'):
            key, _, value = pair.partition('=')
            if key == 'p':
                try:
                    return int(value)
                except ValueError:
                    return None
        return None

    def _next_page_url(self, soup, current_page):
        """Return the URL behind the "下一页" link, or ``None`` if it does not advance."""
        pager = soup.find('div', attrs={'class':'page'})
        if pager is None:
            return None
        for link in pager.find_all('a'):
            if link.getText() != '下一页':
                continue
            href = link.get('href')
            if not href:
                return None
            query = href.split('?')[-1]
            target = self._page_number(query)
            # A link that points at the current (or an earlier) page means
            # the ranking has no further page.
            if target is not None and target <= current_page:
                return None
            return self.url + '?' + query
        return None

    def _finish(self):
        """Write the sheet, report the download count and render the chart, as enabled."""
        if self.if_write:
            self.write()                                        # first, so a chart failure cannot lose the rows
        if self.if_download:
            print("共下载 "+str(self.download_num)+" 张图片")
        if self.if_analysis:
            self.data_analysis()

    def download(self, pic_url, datatype):
        """Download ``pic_url`` into folder ``datatype`` while printing a progress bar.

        Responses other than HTTP 200 are skipped silently. Request errors
        propagate to the caller.
        """
        file_name = pic_url.split('/')[-1]
        # stream=True defers the body, so the chunks below really arrive one
        # by one and the progress bar reflects the transfer.
        with requests.get(pic_url, stream=True, timeout=self.REQUEST_TIMEOUT) as img:
            if img.status_code != 200:
                return
            # Content-Length is optional; 0 stands for "total size unknown".
            try:
                content_size = int(img.headers.get('content-length', 0))
            except ValueError:
                content_size = 0
            data_count = 0
            with open(os.path.join(datatype, file_name), "wb") as file:
                print('\n'+file_name)
                for data in img.iter_content(chunk_size=1024):  # 1 KB per chunk
                    file.write(data)
                    data_count = data_count + len(data)
                    size_text = " 【"+str(round(data_count/1024/1024,2))+"MB】"
                    if content_size > 0:
                        ratio = data_count / content_size
                        print('\r'+"已经下载："+int(ratio*50)*"="+size_text+"【"+str(round(ratio*100,2))+"%"+"】", end='')
                    else:
                        print('\r'+"已经下载："+size_text, end='')
        self.download_num += 1

    def result(self):
        """Print the collected rows."""
        print(self.pic_result_list)

    def write(self):
        """Write the collected rows to ``summary_pictures_biying.xlsx``."""
        summaryDataFrame = pandas.DataFrame(self.pic_result_list)   # 2-D list -> DataFrame
        # No encoding= argument: to_excel() dropped that keyword in pandas 2.0.
        summaryDataFrame.to_excel("summary_pictures_biying.xlsx", index=False, header=False)

    def analysis(self, location):
        """Count ``location`` towards the first continent whose name it contains."""
        for index, continent in enumerate(self.location_name):
            if continent in location:
                self.location_times[index] += 1
                break

    def data_analysis(self):
        """Render the per-continent counts as a pie chart in an HTML file."""
        # Imported here so pyecharts is only required when the analysis runs.
        from pyecharts import options as opts
        from pyecharts.charts import Pie
        c = Pie()
        c.add(
            "",
            [list(z) for z in zip(self.location_name, self.location_times)],
            radius=["40%", "75%"],                              # inner and outer radius
        )
        c.set_global_opts(title_opts=opts.TitleOpts(title="壁纸拍摄地址分布图"),)
        c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        c.render('壁纸拍摄地址分布图.html')
        # A PNG export would go through pyecharts.render.make_snapshot, which
        # additionally needs a browser driver (chromedriver) to be installed.


def str2bool(value):
    """Convert a command-line token ("True", "false", "1", "no", ...) to a bool.

    Without this converter argparse stores the raw string, and any non-empty
    string - including "False" - is truthy.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', 'off', '0'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


def build_parser():
    """Return the argument parser for the crawler's command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-s","--start_page",type=int,default=1,help="起始页")
    parser.add_argument("-e","--end_page",type=int,default=1,help="终止页")
    # nargs='?' with const=True lets a bare "-d" switch the option on, while
    # "-d True" / "-d False" keep working.
    parser.add_argument("-d","--download",type=str2bool,nargs='?',const=True,default=False,help="是否下载图片")
    parser.add_argument("-w","--write",type=str2bool,nargs='?',const=True,default=True,help="是否写入excel")
    parser.add_argument("-a","--analysis",type=str2bool,nargs='?',const=True,default=False,help="是否进行数据分析")
    return parser


def main(argv=None):
    """Parse ``argv`` (the process arguments when ``None``) and run the crawl."""
    args = build_parser().parse_args(argv)
    crawler = Crawler(args)
    crawler.get_page('')        # an empty URL starts at args.start_page


# Guarded so that importing this module neither parses argv nor starts crawling.
if __name__ == "__main__":
    main()

proxy_available.py
（该文件内ip地址无法适用于爬取必应网站，只能用于http开头的网站，放在这里只是为了学习如何使用代理ip）

# Sample proxy endpoints, keyed by the URL scheme they are used for.
_HTTP_PROXY = "http://211.149.172.228:9999"
_HTTPS_PROXY = "https://116.196.90.181:3128"

# One mapping per candidate proxy. The two entries are currently identical.
proxy_available = [
    {"http": _HTTP_PROXY, "https": _HTTPS_PROXY},
    {"http": _HTTP_PROXY, "https": _HTTPS_PROXY},
]

下面是详细的讲述：

1.1、参数传入

因为最近写了好多代码都用命令行传入参数，直接传感觉有点low low的，就稍微学了一下argparse模块，主要是三个步骤：

创建ArgumentParser()对象
调用add_argument()方法添加命令行输入的参数
使用parse_args()解析输入的参数

代码如下所示：

# Three steps: create the parser, declare the options, parse the command line.
parser = argparse.ArgumentParser()
parser.add_argument("-s","--start_page",type=int,default=1,help="起始页")        # first page to crawl
parser.add_argument("-e","--end_page",type=int,default=1,help="终止页")          # last page to crawl
# NOTE(review): the three switches below declare no type= or action=, so a value
# typed on the command line presumably arrives as a string (e.g. "False") and is
# truthy whenever it is non-empty - confirm before relying on "-d False".
parser.add_argument("-d","--download",default=False,help="是否下载图片")         # download the images?
parser.add_argument("-w","--write",default=True,help="是否写入excel")            # write the Excel sheet?
parser.add_argument("-a","--analysis",default=False,help="是否进行数据分析")      # run the location analysis?
args = parser.parse_args()

假如后面想调用起始页的变量，那就args.start_page就好了

1.2、发送请求

向服务器发送请求使用到的是python的requests库，通过Chrome的开发者工具可以看到，必应网使用的是GET请求方式：

因此调用的是requests库的get()方法，获取到返回的包含服务器资源的Response对象后，需要判断请求是否成功，也就是通过查看状态码是否为200来进行判断。如果状态码正常，那就去解析网页。

但是大多数网站都有反爬虫机制，所以需要使用一些策略来绕过这些机制。本代码中主要采用伪装成浏览器的方式（在请求头中设置 User-Agent）来应对。

同样在开发者工具中可以看到浏览器的 User-Agent（用户代理）字符串：

通过字典的方式将其写入到get()方法的headers参数中去，这样就可成功打开网页并获得内容。代码如下：

# Set in __init__: a browser-like User-Agent header and the ranking page URL.
self.headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36'}
self.url = 'https://bing.ioliu.cn/ranking'

def get_page(self, url):
    """Fetch ``url`` (the start page when ``url`` is empty) and parse it on HTTP 200."""
    if not url:
        url = self.url+'?p='+str(self.start_page)       # start page
    # The body is indented with spaces only; mixing tabs and spaces is an
    # error in Python 3.
    self.req = requests.get(url, headers=self.headers)
    if self.req.status_code == 200:
        self.decode(self.req.content)                   # parse the page

除此之外，也可以使用代理ip的方式绕过反爬虫机制：

# Try each proxy in turn until one of them answers with HTTP 200.
for proxy in self.proxy_available:
    try:
        self.req = requests.get(url, headers=self.headers, proxies=proxy)
    except requests.RequestException:
        # Only request failures mean "bad proxy"; a bare except would also
        # hide parsing errors and swallow Ctrl-C.
        print("该代理ip无效")
        continue
    if self.req.status_code == 200:
        self.decode(self.req.content)
        break

其中proxy为字典，这里会依次从代理ip的列表中选取代理ip并传给proxies参数。鉴于免费且稳定的ip地址并不多，所以这一部分代码在本次爬虫实验中不会被调用到。

1.3、获取并保存图片信息

解析网页用到的是BeautifulSoup库（bs4）。这里大多数时候用到的是find()、find_all()、getText()等方法。其中find()返回第一个满足条件的子节点，find_all()返回所有满足条件的子节点，而getText()可以获取节点中的文本内容。每一个壁纸图像涉及的信息在下图的节点中：

依次找到需要的节点，再获取包含的信息即可。
但是，鉴于不是所有图片都包含了所有信息，存在着信息缺失的情况，所以这里还需要进行判断：

# The node that follows the <div class="mask"> element is used as the
# container of the wallpaper items.
mask = soup.find('body').find('div', attrs={'class':'mask'})
container = mask.next_sibling

# One <div class="item"> per wallpaper.
for picture in container.find_all('div', attrs={'class':'item'}):
    card = picture.find('div', attrs={'class':'card progressive'})
    pic = card.find('img')
    pic_url = pic['src'].split('?')[0]                  # image URL without its query string

    description = card.find('div', attrs={'class':'description'})
    description_text = description.find('h3').getText()
    # Some fields may be absent, so each one falls back to a default value.
    if description.find('p', attrs={'class':'calendar'}):
        calendar = description.find('p', attrs={'class':'calendar'}).find('em', attrs={'class':'t'}).getText()
    else:
        calendar = "0000-00-00"
    if description.find('p', attrs={'class':'location'}):
        location = description.find('p', attrs={'class':'location'}).find('em', attrs={'class':'t'}).getText()
    else:
        location = "未知"
    ......
    # the remaining fields are obtained in the same way

图片信息获取完毕后，将其保存至excel表格里，用到了pandas库。图片的信息被保存到一个二维列表中，最后将汇总后的二维列表转换成DataFrame数据类型，并使用to_excel()方法就可以写入excel表中：

# Turn the 2-D list (one row per picture) into a DataFrame and write it out.
# No encoding= argument: to_excel() dropped that keyword in pandas 2.0.
summaryDataFrame = pandas.DataFrame(self.pic_result_list)
summaryDataFrame.to_excel("summary_pictures_biying.xlsx", index=False, header=False)

1.4、下载图片

通过解析数据，可以找到图片保存的地址，获取到地址后，使用get()方法获取图片的数据并写入文件。

图片的数据用的是二进制表示的方法，写入到.jpg文件里即为下载成功。

为了能够看到图片的下载进度，通过响应的headers属性中的content-length字段获取图片的总大小，每一块下载的大小为1KB（chunk_size=1024字节），最后将已下载的大小与总大小的占比展示在终端上。

def download(self, pic_url, datatype):
    """Download ``pic_url`` into folder ``datatype`` while printing a progress bar.

    Responses other than HTTP 200 are skipped silently.
    """
    file_name = pic_url.split('/')[-1]
    # stream=True defers the body, so the chunks below really arrive one by
    # one and the progress bar reflects the transfer.
    with requests.get(pic_url, stream=True, timeout=30) as img:
        if img.status_code != 200:
            return
        # Content-Length is optional; 0 stands for "total size unknown".
        try:
            content_size = int(img.headers.get('content-length', 0))
        except ValueError:
            content_size = 0
        data_count = 0
        with open(datatype+'/'+file_name, "wb") as file:
            print('\n'+file_name)
            for data in img.iter_content(chunk_size=1024):      # 1 KB per chunk
                file.write(data)
                data_count = data_count + len(data)
                size_text = " 【"+str(round(data_count/1024/1024,2))+"MB】"
                if content_size > 0:
                    ratio = data_count / content_size           # share downloaded so far
                    print('\r'+"已经下载："+int(ratio*50)*"="+size_text+"【"+str(round(ratio*100,2))+"%"+"】", end='')
                else:
                    print('\r'+"已经下载："+size_text, end='')
    self.download_num += 1

1.5、更新网址

如果尚未到达指定的截止页，需要更新网页地址获取新的资源数据。

这里依旧用find()和getText()方法来获取到下一页的网址。

# Locate the pager and follow its "下一页" (next page) link.
next_page = soup.find('div', attrs={'class': 'page'})       # attrs maps attribute name -> value
for page in next_page.find_all('a'):
    if page.getText() != '下一页':
        continue
    query = page['href'].split('?')[-1]                      # presumably of the form "p=<number>"
    target = query.split('=')[-1]
    # In the full listing self.page_num has already been incremented before
    # this point, so a link that points below it does not move forward:
    # the last page has been reached. The number is parsed first because an
    # int never equals the raw query string.
    if target.isdigit() and int(target) < self.page_num:
        self.write()
        if self.if_download:
            print("共下载 "+str(self.download_num)+" 幅图片")
        if self.if_analysis:
            self.data_analysis()
    else:
        next_url = self.url + '?' + query                    # avoids shadowing the built-in next()
        self.get_page(next_url)
    break                                                    # only one "next page" link is followed

1.6、数据分析

对于抓取到的数据，可以简单地进行一个分析。

在本次爬虫实验中，图片的信息里包含了拍摄地址，可以对拍摄地址来进行一个比较。可以将拍摄地址大体分为亚洲、欧洲、非洲、美洲、大洋洲五类，然后再根据获取到的地址分别进行计数，最后使用pyecharts模块里的Pie()方法，绘制饼状图。

def data_analysis(self):
    """Render the per-category location counts as a pie chart in an HTML file."""
    # Pair every category name with its count: [[name, count], ...].
    slices = []
    for name, count in zip(self.location_name, self.location_times):
        slices.append([name, count])

    title = opts.TitleOpts(title="壁纸拍摄地址分布图")
    labels = opts.LabelOpts(formatter="{b}: {c}")

    chart = Pie()
    chart.add("", slices, radius=["40%", "75%"])    # inner and outer radius
    chart.set_global_opts(title_opts=title)
    chart.set_series_opts(label_opts=labels)
    chart.render('壁纸拍摄地址分布图.html')
    # make_snapshot(driver, chart.render(), "图片地理位置分布图.png")

其中self.location_name和self.location_times为列表，分别保存了类别和每一类的数量。

二、结果

首先输入参数：
在这里插入图片描述
这里设置了参数，从第一页到第四页，图片下载选项设置为True
图片的信息保存在summary_pictures_biying.xlsx中：

图片下载进度显示如下图所示：

图片保存在同目录下的images文件夹中：

拍摄地址的占比结果如下图所示（爬取页数为1-20）：