import os
import threading
import urllib.request

from pyquery import PyQuery as pq

headers = {
    'Referer': 'http://www.mm131.com/1/1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}


def getSiteSource(url):
    """Fetch a page and return its raw bytes, or b'' on any error."""
    try:
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        return response.read()
    except Exception:
        return b''


# Downloads every image of one list page.
class downLoadImg(threading.Thread):
    dir = 'c:/mm131'

    def __init__(self, catSiteUrl):
        threading.Thread.__init__(self)
        self.catSiteUrl = catSiteUrl

    def run(self):
        # Fetch the list page source.
        listPageCode = getSiteSource(self.catSiteUrl)
        doc = pq(listPageCode)
        dls = pq(doc('dl').filter('.public-box'))
        hrefs = dls('dl a')
        goodUrls = []
        for i in range(hrefs.length):
            if hrefs.eq(i).attr('target') == '_blank':
                goodUrls.append(hrefs.eq(i).attr('href'))
        for goodUrl in goodUrls:
            baseUrl = goodUrl.replace('.html', '')
            for k in range(1, 70):
                if k == 1:
                    picInurl = goodUrl
                else:
                    picInurl = baseUrl + '_' + str(k) + '.html'
                # Fetch the picture page and pull out the image URL.
                pcode = getSiteSource(picInurl)
                if len(pcode) == 0:
                    continue
                pcodepq = pq(pcode)
                img = pcodepq('div .content-pic a img').eq(0).attr('src')
                if img:
                    imgInfo = img.split('/')
                    parent = imgInfo[4]
                    # Write the image into a folder named after its album id.
                    imgFold = self.dir + '/' + str(parent)
                    if not os.path.exists(imgFold):
                        os.makedirs(imgFold)
                    req = urllib.request.Request(img, headers=headers)
                    response = urllib.request.urlopen(req)
                    with open(imgFold + '/' + str(imgInfo[5]), 'wb') as f:
                        f.write(response.read())


# Collects the URLs of all top-level categories on the site.
class getSiteCat():
    siteUrl = 'http://www.mm131.com/'

    def getCat(self):
        urlList = []
        result = []
        html = getSiteSource(self.siteUrl)
        doc = pq(html)
        hrefs = doc('div').filter('.nav ul li a')
        for i in range(0, hrefs.length):
            href = hrefs.eq(i).attr('href')
            if self.siteUrl != href:
                urlList.append(href)
        for i in range(0, len(urlList)):
            sourceCode = getSiteSource(urlList[i])
            catDoc = pq(sourceCode)
            allHref = catDoc('a')
            for k in range(allHref.length):
                # '末页' is the "last page" link; its href encodes the catId and the max page number.
                if allHref.eq(k).text() == '末页':
                    catMaxPageNumInfo = str(allHref.eq(k).attr('href').replace('list_', '').replace('.html', ''))
                    catMaxPageNumArr = catMaxPageNumInfo.split('_')
                    temp = {'maxPageNum': catMaxPageNumArr[1], 'url': urlList[i], 'catId': catMaxPageNumArr[0]}
                    result.append(temp)
                    break
        return result

    def buildAllUrl(self, allSiteCat: list):
        allUrlList = []
        for cat in allSiteCat:
            allUrlList.append(cat['url'])
            # Pages 2..maxPageNum follow the pattern list_<catId>_<page>.html.
            for j in range(2, int(cat['maxPageNum']) + 1):
                allUrlList.append(cat['url'] + 'list_' + str(cat['catId']) + '_' + str(j) + '.html')
        return allUrlList


site = getSiteCat()
catList = site.getCat()
urlList = site.buildAllUrl(catList)
thList = []
# Start with 50 threads for now; launching all 203 at once fails to start (worth
# investigating why). The site currently has about 203 list pages in total, so
# running 50 at a time in several batches also works.
for i in range(0, 50):
    threadtemp = downLoadImg(urlList[i])
    threadtemp.start()
    thList.append(threadtemp)
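
# A minimal follow-up sketch (an addition, not part of the original script):
# joining the threads collected in thList so the main program waits until every
# download thread has finished before it exits.
for t in thList:
    t.join()
print('all download threads finished')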