Getting started with python_urllib: scraping images

A few blog posts, prompted by experience:

"While we teach we learn."
Questions are welcome!

Change the file directory and it runs directly ---- 2021.12.09

'''
2021-12-09
1. Program structure
.0 Request headers use a user-agent pool (no IP proxy is used)
.1 Scrape the home page for the major image-category names and write them to 5.天堂图片网url爬取(目录).txt
.2 Pick one of those names, fetch the subcategory names and urls under it, and append them to 6.天堂图片网url爬取(分目录).txt
.3 Pick a random subcategory url and download its images to the local disk

'''
import time
from urllib import request
import re
import random
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
]
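# Any list of genuine browser UA strings works here; the three above are just
# desktop Chrome / Edge / IE examples, and adding more widens the rotation.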
path = 'C:\\Users\\Administrator\\Desktop\\Python基础与应用\\爬虫\\爬到的东西\\'

def ua(uapool):  # user-agent pool
    useragent = random.choice(uapool)
    head = ('User-Agent', useragent)
    opener = request.build_opener()
    opener.addheaders = [head]
    request.install_opener(opener)
    print('Global user agent installed. Current User-Agent:', head)
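
# Note: request.install_opener() registers the opener process-wide, so every
# later request.urlopen() / request.urlretrieve() call in this script sends
# the randomly chosen User-Agent with no per-call setup.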

# Fetch the ivsky top-level categories and write them to a txt file
def ivskyoneurl():
    url = 'https://www.ivsky.com/tupian/'
    pet = '"><a href="/(.*?)" title=".*?">(.*?)</a></li><li '
    data1 = request.urlopen(url).read().decode('utf-8', 'ignore')
    imagurl = re.compile(pet).findall(data1)
    print('Number of category names:', len(imagurl))
    with open(path + '5.天堂图片网url爬取(目录).txt', 'w+', encoding='utf-8') as f:
        for item in imagurl:
            f.write("  ".join(item) + '\n')
        print('Wrote 5.天堂图片网url爬取(目录).txt successfully!!!')
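
# Each regex hit is a (relative_url, category_name) tuple, so every line in
# 5.天堂图片网url爬取(目录).txt has the form "<relative-url>  <name>" with a
# two-space separator; the functions below rely on that layout.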

# Pick one random entry from 5.天堂图片网url爬取(目录).txt
def daleiurlget():
    with open(path + '5.天堂图片网url爬取(目录).txt', 'r', encoding='utf-8') as f:
        read = f.readlines()
        print("Read 5.天堂图片网url爬取(目录).txt successfully!!!")
        urlandname = random.choice(read)
        print('Major category name and url to scrape:', urlandname)
        return urlandname
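
# The returned line still ends with '\n' and keeps the two-space separator,
# which is why the searches below only take the text before the first space.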


# Get the subcategory urls under the chosen major category and write them to a txt file
def xiaoleiurlget(urlandname):
    urle = re.search('(.*?) ', urlandname)  # everything before the first space is the relative url
    url = 'https://www.ivsky.com/' + urle.group(1)  # group(1) drops the trailing space that group(0) would keep
    print('Major-category url:', url)
    pet = '</a> <a href="(.*?)"  ti.*?>(.*?)</a>'
    dataurl = request.urlopen(url).read().decode('utf-8', 'ignore')
    imagurl = re.compile(pet).findall(dataurl)
    print(imagurl)
    with open(path + '6.天堂图片网url爬取(分目录).txt', 'a', encoding='utf-8') as f:
        for item in imagurl:
            f.write("  ".join(item) + '\n')
        print('Wrote 6.天堂图片网url爬取(分目录).txt successfully!!!')
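
# Opened in 'a' (append) mode on purpose: every run adds the subcategories of
# one more randomly chosen major category, so 6.天堂图片网url爬取(分目录).txt
# accumulates entries across runs.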

# Pick a random subcategory url from the txt and return it
def takeouturl():
    with open(path + '6.天堂图片网url爬取(分目录).txt', 'r', encoding='utf-8') as f:
        read = f.readlines()
        print("Read 6.天堂图片网url爬取(分目录).txt successfully!!!")
        urlandname2 = random.choice(read)
    print('Subcategory line to scrape:', urlandname2)
    urle = re.search('(.*?) ', urlandname2)
    urlpage = 'https://www.ivsky.com' + urle.group(1)  # url of the subcategory's page list
    print('Concrete subcategory url to scrape:', urlpage)
    return urlpage

# Pattern notes for downloadimg() below: pet2 extracts the link to each
# picture's detail page, and pet1 extracts the full-size image address and
# caption from inside that page; the actual download url is rebuilt from them.

# Download the images to the local disk
def downloadimg(urlpage):
    pet1 = r'</script><img id="imgis".*?//(.*?)\.jpg.*? alt="(.*?)"></a>'
    pet2 = '</div><p><a href="(.*?)" '
    pa = int(input("Enter the start page: "))
    pe = int(input("Enter the end page: "))
    print(urlpage)
    urlpage = urlpage.strip()
    imgtxt = []
    for page in range(pa, pe + 1):
        try:
            url = urlpage + 'index_' + str(page) + '.html'
            print(url)
            # collect every detail-page link on this list page
            databigimaghtml = request.urlopen(url).read().decode('utf-8', 'ignore')
            databigimaghtmlset = re.compile(pet2).findall(databigimaghtml)
            print(databigimaghtmlset)
            for i in range(len(databigimaghtmlset)):
                try:
                    time.sleep(random.uniform(0.1, 0.5))  # polite random delay between requests
                    urlbigimg = 'https://www.ivsky.com' + databigimaghtmlset[i]
                    # fetch the detail page that holds the full-size image address
                    urlbigimg1 = request.urlopen(urlbigimg).read().decode('utf-8', 'ignore')
                    # extract the image path and its caption
                    databigimagimgset = re.compile(pet1).findall(urlbigimg1)
                    strdatabigimagimgset = ' '.join(databigimagimgset[0])
                    petwangzhi = '/(.*?) '
                    pethanzi = ' (.*)'
                    wangzhi = re.search(petwangzhi, strdatabigimagimgset).group(0).strip()  # image path
                    # the caption names the saved file
                    hanzi = re.search(pethanzi, strdatabigimagimgset).group(0).strip()
                    print(wangzhi)
                    print(hanzi)
                    # local path to save the image to
                    downloadname = path + str(page) + hanzi + str(i) + '.jpg'
                    imgtxt.append(str(page) + hanzi + str(i) + '.jpg')
                    # full-size image url to download
                    urlimg = 'https://img-pre.ivsky.com' + wangzhi + '.jpg'
                    print(urlimg)
                    request.urlretrieve(urlimg, downloadname)
                except Exception as err:
                    print('--------- inner error: -----------', err)
        except Exception as err:
            print('--------- outer error: -----------', err)
    print('Downloaded image names:', imgtxt)
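
# In Python 3, request.urlretrieve() wraps urlopen(), so downloads also go
# through the opener installed by ua() and carry the same User-Agent. Note the
# hard-coded 'https://img-pre.ivsky.com' host: the code assumes every full-size
# image is served from there and discards the host found in the matched path.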

# Leftover experiment (kept disabled): for pages below 25, keep only every
# fifth detail-page link to thin out the downloads.
#     if page < 25:
#         databigimaghtmlset = databigimaghtmlset[::5]
def main():
    ua(uapools)  # install the user-agent pool
    ivskyoneurl()  # fetch the ivsky top-level categories
    xiaoleiurlget(daleiurlget())  # pick a major category from the txt, then fetch and save its subcategory urls
    downloadimg(takeouturl())  # pick a subcategory url, grab the image links, and download them locally


if __name__ == '__main__':
    main()
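
By the way, installing a global opener is not the only urllib idiom: you can also attach the header per request with request.Request. Here is a minimal sketch of that alternative, reusing the same user-agent-pool idea; the fetch() helper is hypothetical, not part of the script above.

import random
from urllib import request

uas = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
]

def fetch(url):
    # hypothetical helper: build a Request with a random User-Agent each call,
    # instead of installing one opener for the whole process
    req = request.Request(url, headers={'User-Agent': random.choice(uas)})
    return request.urlopen(req).read().decode('utf-8', 'ignore')

if __name__ == '__main__':
    print(len(fetch('https://www.ivsky.com/tupian/')))

Per-request headers make it easy to rotate the User-Agent on every single call, while install_opener() fixes one choice for the whole run; both approaches are standard urllib.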




