__author__ = 'Administrator'
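# Crawl http://www.mzitu.com/ and save every picture under D:/MeiZiTu,
# grouped by listing page and by category (photo set).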
import urllib.request
import os
import random
import re
from bs4 import BeautifulSoup
import multiprocessing
import time
import threading
import socket
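# Fetch a URL and return the raw HTML bytes, picking a random User-Agent
# so the requests look less uniform to the server.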
def getHtml(url):
    my_headers = [
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_0) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13',
        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30',
        'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)',
        'Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)'
    ]
    header = {"User-Agent": random.choice(my_headers)}
    req = urllib.request.Request(url, headers=header)
    html = urllib.request.urlopen(req).read()
    return html
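# Read the total page count from the pagination links on the front page.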
def getPages(soup):
    pages = soup.find_all('a', class_="page-numbers")
    # The second-to-last pagination link carries the largest page number
    return int(pages[-2].text)
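# Collect [title, url] pairs for every category linked from a listing page;
# only anchors that wrap a lazy-loaded thumbnail are real categories.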
def getKinds(pageSoup):
    kinds = []
    links = pageSoup.find_all('a', target="_blank")
    for link in links:
        imgs = link.find_all("img", class_='lazy')
        if imgs:
            kinds.append([imgs[0]['alt'], link['href']])
    return kinds
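# Read how many pictures a category holds from its own pagination bar.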
def getPicNumbers(kindSoup):
    navLinks = kindSoup.find_all('div', class_="pagenavi")[0].find_all('a')
    return int(navLinks[-2].text)
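# Pull the full-size image URL out of a single picture page.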
def getPicTrueUrl(picSoup):
    return picSoup.find_all('div', class_="main-image")[0].find_all("img")[0]['src']
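# Download one listing page: make a directory for it, then walk every category
# on the page, fetching each picture in its own worker thread.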
def func(page):
    print("***********************************Now downloading page %d***********************************" % page)
    pageDir = Dir + "/" + str(page)
    if not os.path.exists(pageDir):
        os.mkdir(pageDir)
    pageUrl = url + "page/" + str(page)
    pageHtml = getHtml(pageUrl)
    pageSoup = BeautifulSoup(pageHtml, "html.parser")
    kinds = getKinds(pageSoup)
    print("------------------------------Page %d has %d categories------------------------------" % (page, len(kinds)))
    kindX = 0
    for kind in kinds:
        kindX += 1
        print("*******************Now downloading page %d, category %d*******************" % (page, kindX))
        # Strip characters that are invalid in Windows directory names
        kind[0] = re.sub(r'[\\/:*?"<>|]', "", kind[0])
        kindDir = pageDir + "/" + kind[0]
        if not os.path.exists(kindDir):
            os.mkdir(kindDir)
        kindUrl = kind[1]
        kindHtml = getHtml(kindUrl)
        kindSoup = BeautifulSoup(kindHtml, "html.parser")
        picNumbers = getPicNumbers(kindSoup)
        threads = []
        for picNumber in range(1, picNumbers + 1):
            #print("------------Now downloading page %d, category %d, picture %d-------------" % (page, kindX, picNumber))
            picUrl = kindUrl + "/" + str(picNumber)
            picHtml = getHtml(picUrl)
            picSoup = BeautifulSoup(picHtml, "html.parser")
            picTrueUrl = getPicTrueUrl(picSoup)
            # Skip pictures that were already downloaded on a previous run
            if not os.path.exists(kindDir + '/%d.jpg' % picNumber):
                t = threading.Thread(target=funcc, args=(picTrueUrl, kindDir + '/%d.jpg' % picNumber, picNumber))
                threads.append(t)
        for t in threads:
            t.daemon = True
            t.start()
        for t in threads:
            t.join()
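# Worker routine: download a single picture to disk.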
def funcc(picTrueUrl, picDir, picNumber):
    #print("Now downloading picture %d" % picNumber)
    urllib.request.urlretrieve(picTrueUrl, picDir)
    #print("Finished downloading picture %d" % picNumber)
if __name__ == "__main__":
    # Give up on any connection that stalls for more than 5 seconds
    socket.setdefaulttimeout(5.0)
    startTime = time.time()
    Dir = "D:/MeiZiTu"
    if not os.path.exists(Dir):
        os.mkdir(Dir)
    url = "http://www.mzitu.com/"
    html = getHtml(url)
    soup = BeautifulSoup(html, "html.parser")
    pages = getPages(soup)
    print("---------------------------------------------There are %d pages in total---------------------------------------------" % pages)
    for page in range(1, pages + 1):
        func(page)
    endTime = time.time()
    print("Elapsed: %.1f seconds" % (endTime - startTime))