Python learning notes: scraping images

Goal: given a navigation-bar category name entered by the user, download the matching images and save them to local folders, organized by category.

First, import the modules we need. If an import fails because a package isn't installed, just install it from cmd with pip (e.g. `pip install requests beautifulsoup4 lxml fake-useragent`).

import requests,os,shutil
import re
import time
from fake_useragent import UserAgent  # pick a random User-Agent
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

I've been practicing how to use classes lately, so this time the script is written as a class. The full code comes first; the explanation follows.



"""根据输入的类别,获取www.169we.com中的图片"""

# coding:utf-8
import requests,os,shutil
import re
import time
from fake_useragent import UserAgent  # pick a random User-Agent
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

# base_url = "http://www.169we.com/diannaobizhi/list_7_1.html"

class Pic169Spider(object):
    def __init__(self,category):
        self.category = category
        self.base_url = "http://www.169we.com/{}/".format(self.category)
        self.headers = {
            "User-Agent":UserAgent().random,
            "Host":"www.169we.com",
        }
        self.img_headers = {
            "User-Agent": UserAgent().random,
            "Host": "724.169pp.net",
        }
        # Map each category name to the list id used in the list_{id}_{page}.html URLs.
        self.dic = {
            "diannaobizhi": "7",
            "shoujibizhi": "6",
            "wangyouzipai": "2",
            "gaogensiwa": "3",
            "xiyangmeinv": "4",
            "guoneimeinv": "5",
            "xingganmeinv": "1",
        }
    def getIndexPage(self, url):
        """Return the total number of list pages for the category."""
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        page_all = soup.select('body div.page ul li a')
        # The third-from-last link in the pagination bar is expected to hold the last page number.
        return page_all[-3].get_text()

    def getPage(self, url):
        """Return the total number of pages in a single image gallery."""
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        page_all = soup.select('body div ul.pagelist li a')
        # The second-to-last pagination link is expected to hold the page count.
        return page_all[-2].get_text()

    def getIndexImgHref(self, url):
        """Return (detail-page URL, title) pairs scraped from a list page."""
        response = requests.get(url, headers=self.headers)
        response.encoding = "gbk"
        data = re.findall(r'<li><a href="(.*?)" class="pic".*?>.*?<p>(.*?)</p>', response.text, re.S)
        return data

    def getImgUrl(self, url, dirname, index):
        """Download every image on one gallery page; return the next free index."""
        if not os.path.exists(self.category + "/" + dirname):
            os.makedirs(self.category + "/" + dirname)
        response = requests.get(url, headers=self.headers)
        # response.encoding = "gbk"
        img_urls = re.findall(r'<p align="center">.*?<img src="(.*?)".*?</p>', response.text, re.S)
        for img_url in img_urls:
            self.saveImg(img_url, dirname, index)
            index += 1
        return index

    def saveImg(self, img_url, dirname, index):
        """Download one image and write it to <category>/<dirname>/<index>.jpg."""

        result = requests.get(img_url,headers=self.img_headers)
        with open(self.category + "/" + dirname +"/"+ str(index) + ".jpg", "wb") as f:
            f.write(result.content)
            # time.sleep(1)

    def main(self):
        """Entry point: walk every list page of the category and download each gallery."""
        if not os.path.exists(self.category):
            os.makedirs(self.category)
        pages = int(self.getIndexPage(self.base_url))
        for page in range(1, pages + 1):
            print(page)
            num = self.dic[self.category]
            url = "http://www.169we.com/{}/list_{}_{}.html".format(self.category, num, page)
            for data in self.getIndexImgHref(url):
                old_url = data[0]
                # Strip any HTML tags from the title and use it as the folder name.
                pattern = re.compile(r'<.*?>', re.S)
                dirname = re.sub(pattern, '', data[1])
                index = 1  # running image counter, so later pages don't overwrite earlier files
                for i in range(1, int(self.getPage(old_url)) + 1):
                    if i == 1:
                        new_url = old_url
                    else:
                        # Page i of a gallery: .../39113.html -> .../39113_i.html
                        lista = old_url.split(".")
                        lista[-2] = lista[-2] + "_{}".format(i)
                        new_url = ".".join(lista)
                    index = self.getImgUrl(new_url, dirname, index)

class SingleUrl(object):
    def __init__(self):
        self.headers = {
            "User-Agent": UserAgent().random
        }
        self.total_page = 1

    def spider(self, url):
        """Download every image of the single gallery at the given url."""
        self.getPage(url)
        # Use the numeric part of the URL (e.g. 38997 from .../38997.html) as the folder name.
        dirname = url.split('/')[-1].split('.')[0]
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
        cwd = os.getcwd()
        os.chdir(dirname)
        self.getImgUrl(url)
        for i in range(2, self.total_page + 1):
            # Page i of the gallery: .../38997.html -> .../38997_i.html
            lista = url.split(".")
            lista[-2] = lista[-2] + "_{}".format(i)
            new_url = ".".join(lista)
            self.getImgUrl(new_url)
        os.chdir(cwd)  # restore the original working directory
    def getPage(self, url):
        """Parse the page count from the '共N页' (N pages in total) text on the gallery page."""
        response = requests.get(url, headers=self.headers)
        response.encoding = "gbk"
        self.total_page = int(re.findall(r'.*?共(.*?)页', response.text, re.S)[0])
    def getImgUrl(self, url):
        """Download every image on one gallery page into the current directory."""
        response = requests.get(url, headers=self.headers)
        # response.encoding = "gbk"
        img_urls = re.findall(r'<p align="center">.*?<img src="(.*?)".*?</p>', response.text, re.S)
        for img_url in img_urls:
            urlretrieve(img_url, img_url.split('/')[-1])

def all_img():
    msg = """
               Categories:
               diannaobizhi  -> desktop wallpapers
               shoujibizhi   -> phone wallpapers
               wangyouzipai  -> user selfies
               gaogensiwa    -> high heels & stockings
               xiyangmeinv   -> western girls
               guoneimeinv   -> Chinese girls
               xingganmeinv  -> glamour girls

       """
    print(msg)
    category = input("Enter the category name to download: ")
    spider = Pic169Spider(category)
    spider.main()

def single_url_img():
    url = input("Paste the gallery URL here: ")
    single = SingleUrl()
    single.spider(url)

if __name__ == "__main__":
    all_img()
    #single_url_img()


Now, the methods of the Pic169Spider class.

__init__ sets up the instance attributes of Pic169Spider: the base URL for the chosen category, the request headers, and the mapping from category name to the list id used in the URLs.

getIndexPage fetches a category list page and returns the total number of list pages, read from the pagination bar.

getPage returns the total number of pages inside a single image gallery.
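To see what getPage relies on, here is a minimal sketch. The HTML snippet is made up (the real markup on the site may differ); it only illustrates why the second-to-last link of the pagelist is taken as the page count:

from bs4 import BeautifulSoup

# Hypothetical pagination snippet for a gallery -- not copied from the site.
html = """
<body><div><ul class="pagelist">
  <li><a href="39113.html">1</a></li>
  <li><a href="39113_2.html">2</a></li>
  <li><a href="39113_3.html">3</a></li>
  <li><a href="39113_2.html">下一页</a></li>
</ul></div></body>
"""

soup = BeautifulSoup(html, "lxml")
page_all = soup.select('body div ul.pagelist li a')
print(page_all[-2].get_text())   # prints 3, the gallery's page count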

getIndexImgHref extracts the detail-page links (and their titles) from a category list page.

For example, from http://www.169we.com/diannaobizhi/list_7_2.html it picks out links such as http://www.169we.com/diannaobizhi/2017/0709/39113.html.
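For every detail page found this way, main() then builds the URLs of the gallery's remaining pages by inserting "_2", "_3", ... before the ".html" extension. A standalone sketch of that URL scheme (the helper name gallery_page_urls is mine, just for illustration):

def gallery_page_urls(detail_url, total_pages):
    """Page 1 is the detail URL itself; page i gets "_i" inserted before ".html"."""
    urls = [detail_url]
    for i in range(2, total_pages + 1):
        parts = detail_url.split(".")
        parts[-2] = parts[-2] + "_{}".format(i)
        urls.append(".".join(parts))
    return urls

print(gallery_page_urls("http://www.169we.com/diannaobizhi/2017/0709/39113.html", 3))
# ['http://www.169we.com/diannaobizhi/2017/0709/39113.html',
#  'http://www.169we.com/diannaobizhi/2017/0709/39113_2.html',
#  'http://www.169we.com/diannaobizhi/2017/0709/39113_3.html']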

getImgUrl extracts the individual image URLs from a gallery page and saves each one locally.

saveImg downloads one image URL and writes it to disk.

main() is the method that runs when you start the program; follow the prompt and the scraping begins.

For example, to grab desktop wallpapers, type diannaobizhi in the console and press Enter.
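You can also skip the prompt and drive the spider directly, for instance:

spider = Pic169Spider("diannaobizhi")   # same effect as typing diannaobizhi at the prompt
spider.main()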

 

The SingleUrl class is for downloading a single URL. Say I only want the images at http://www.169we.com/diannaobizhi/2017/0630/38997.html: just call the single_url_img() function and paste that URL at the prompt.
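If you prefer to skip the input() prompt here too, calling the class directly works the same way:

single = SingleUrl()
# downloads every image of this gallery into a folder named "38997"
single.spider("http://www.169we.com/diannaobizhi/2017/0630/38997.html")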

 

My prose isn't great and my writing is weak; the code is also imperfect and still needs work.

 

 
