Python 爬虫抓取天堂图片网(ivsky.com)图片的进阶版

闲话不多说,直接上代码:

import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
def cbk(a, b, c):
    """Progress-report hook for ``urllib.request.urlretrieve``.

    Args:
        a: Number of blocks transferred so far.
        b: Size of each block, in bytes.
        c: Total size of the remote file, in bytes. May be ``-1`` (or 0)
           when the server sends no ``Content-Length`` header.

    Returns:
        The percentage that was printed, clamped to [0, 100]. urlretrieve
        ignores the hook's return value, so returning it is a
        backward-compatible aid for testing.
    """
    if c <= 0:
        # Unknown/invalid total size: report 0% instead of dividing by
        # zero or printing a negative percentage.
        per = 0.0
    else:
        per = min(100.0 * a * b / c, 100.0)
    print('%.2f%%' % per)
    print(" ")
    return per

# Crawl the paginated food-image gallery on ivsky.com and download every
# .jpg found on the first 12 listing pages into E:/img/.
url = 'http://www.ivsky.com/tupian/meishishijie/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400',
    'Referer': 'http://www.ivsky.com/tupian/qita/index_11.html'}

save_dir = "E:/img"
os.makedirs(save_dir, exist_ok=True)  # create the target folder up front

# Compile once, outside the loop, instead of once per <img> tag.
imgre = re.compile(r'src="(.*?\.jpg)" alt')

# Single global counter: the original reset x to 0 inside the innermost
# loop, so every batch of downloads overwrote 0.jpg, 1.jpg, ...
x = 0
for i in range(0, 12):
    # url already ends with '/', so don't add another one (the original
    # built '...meishishijie//index_0.html').
    link = url + 'index_' + str(i) + '.html'
    html = requests.get(link, headers=headers)
    # One findall per page is sufficient. The original re-ran findall over
    # the whole page text once per <img> element BeautifulSoup found,
    # re-downloading the same image list many times per page.
    for imgurl in imgre.findall(html.text):
        work_path = save_dir + "/" + str(x) + ".jpg"
        # NOTE(review): assumes the scraped src values are absolute URLs —
        # confirm; relative ones would need urllib.parse.urljoin(link, imgurl).
        # Python 3 location of urlretrieve (urllib.urlretrieve is Python 2
        # and raises AttributeError here).
        urllib.request.urlretrieve(imgurl, work_path, cbk)
        x += 1
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值