python爬虫自动搜索下载游民今日搞笑图集_csdn爬虫自动搜索下载-CSDN博客

本文链接：https://blog.csdn.net/andylou_1/article/details/38235359

游民的今日搞笑图集每页只排 3 张图，一连 40 多页，翻起页来着实麻烦。如果有个脚本能把这些图片都下载到本地，然后再大快朵颐，想必是极好的。

游民每日图片发布网址：http://www.gamersky.com/ent/

代码如下：

#-*-coding:UTF-8-*-
'''首先在python文件下创建一个叫youmin的文件夹'''
import re
import urllib
import threading
import time
import Image


# Pattern for one gallery image: starting at a "<p " tag, lazily skip to
# center", then to the next <img, and capture the value of its src attribute
# (which must be immediately followed by ">"). No re.DOTALL flag is passed,
# so the whole match has to sit on a single line of the page source.
src='<p .*?center".*?<img.*?src="(.*?)">'

img=re.compile(src)
# Weekday labels indexed by the weekday number that today() reads from
# time.localtime() (0 = Monday ... 6 = Sunday); the entry for index 6 is the
# label for "weekend" rather than "Sunday".
datadict=['周一','周二','周三','周四','周五','周六','周末']
def today():
    """Return the weekday label for the current local date.

    The numeric weekday (0 = Monday ... 6 = Sunday) is printed first, then
    used as an index into the module-level ``datadict`` list.
    """
    weekday_index = time.localtime().tm_wday
    print(weekday_index)
    return datadict[weekday_index]
     



def getbaseurl():
    """Return the URL of today's gallery post, or None if none is found.

    Downloads the listing page, extracts the ``<li class="lii">`` entries and
    inspects the first four of them.  The first entry whose markup contains
    today's weekday label (see ``today``) and an ``http:...shtml`` link
    decides the result.

    Returns:
        The matched link as a string, or None when none of the inspected
        entries carries both the label and a link.
    """
    todaydata = today()
    # The label is a UTF-8 byte string; decode it before echoing it.
    print(todaydata.decode('utf8'))

    url = 'http://www.gamersky.com/ent/'  # listing page of the daily posts
    # Close the connection explicitly instead of leaving it to the garbage
    # collector.
    response = urllib.urlopen(url)
    try:
        s = response.read()
    finally:
        response.close()

    entry_re = re.compile('<li class="lii"><a.*?>.*?</a></li>')  # one post entry
    link_re = re.compile('http:.*?shtml')  # address of the post itself
    label_re = re.compile(todaydata)

    # Only the newest four entries are considered.
    for entry in entry_re.findall(s)[:4]:
        if not label_re.search(entry):
            continue
        link = link_re.search(entry)
        # An entry without a usable link is skipped rather than crashing on
        # ``None.group()``.
        if link:
            return link.group()
    return None
def gethtml(url):
    """Download *url* and return the raw response body.

    The connection is closed explicitly once the body has been read, so a
    worker that fetches many pages does not rely on garbage collection to
    release sockets.  Errors raised by ``urllib.urlopen`` or ``read``
    propagate to the caller.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

# Resolved once when the module is loaded; getmy.run() slices this value to
# build the per-page URLs. It is None when getbaseurl() finds no matching post.
baseurl=getbaseurl()# URL of today's gallery post

def getimg(html,s):
    """Download every image referenced in *html* into the ``youmin`` folder.

    Args:
        html: Source of one gallery page.
        s: Page number, used as the first part of each file name.

    Each file is named ``<s><index><last four characters of the URL>``, where
    ``index`` counts the images of this page from 0.  Page number and index
    are concatenated without a separator.  The ``youmin`` directory must
    already exist in the current working directory.

    A failed download is reported and skipped, so the remaining images of the
    page are still fetched.
    """
    import os  # local import: the module header does not import os

    for index, image_url in enumerate(img.findall(html)):
        filename = str(s) + str(index) + image_url[-4:]
        # Yields '.\\youmin\\<name>' on Windows (same as before) and a real
        # sub-directory path on other platforms.
        target = os.path.join('.', 'youmin', filename)
        try:
            urllib.urlretrieve(image_url, target)
        except Exception as err:
            # Best effort: one broken image must not abort the whole page.
            print('failed to download %s: %s' % (image_url, err))

class getmy(threading.Thread):
    """Worker thread that downloads the images of a contiguous page range.

    Attributes:
        begin: First page number handled by this thread (inclusive).
        end: Last page number handled by this thread (inclusive).
    """

    def __init__(self,begin,end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def run(self):
        """Fetch each page in ``begin..end`` and save its images.

        Page 1 lives at the module-level ``baseurl`` itself; page N (N > 1)
        has ``_N`` inserted before the last six characters of ``baseurl``
        (the ``.shtml`` suffix matched by ``getbaseurl``).  A page that fails
        is reported and skipped so the rest of the range is still processed.
        """
        if not baseurl:
            # No gallery URL was resolved, so there is nothing to download.
            return
        for page in range(self.begin, self.end + 1):
            suffix = '' if page == 1 else '_' + str(page)
            url = baseurl[:-6] + suffix + baseurl[-6:]
            try:
                getimg(gethtml(url), page)
            except Exception as err:
                # Best effort: keep going with the next page of the range.
                print('failed to process %s: %s' % (url, err))

# Ten workers, each responsible for five consecutive pages
# (1-5, 6-10, ..., 46-50), i.e. 50 pages in total.
threads = [getmy(first, first + 4) for first in range(1, 51, 5)]

# Start every worker before waiting on any of them so the downloads overlap.
for t in threads:
    t.start()
for t in threads:
    t.join()