from urllib import request
import re,time
class spidermain(object):
    """Crawler entry point: wires together the URL manager, the page
    parser and the output collector, and drives the crawl loop."""

    def __init__(self):
        self.urls = urldownload()    # pending/visited URL bookkeeping
        self.parser = htmlparser()   # page download + link/image extraction
        self.output = output()       # accumulated image URLs + disk writer

    def craw(self, root_url):
        """Crawl starting from root_url, stopping after 3 pages.

        Each successfully parsed page contributes new page URLs to the
        queue and image URLs to the output collector; when done, all
        collected images are written/downloaded once.
        """
        count = 0
        self.urls.add_newurl(root_url)
        while self.urls.has_newurl():
            try:
                newurl = self.urls.get_new_url()
                print('个数%d:URL%s' % (count, newurl))
                urls, imgurls = self.parser.parser(newurl)
                self.urls.add_newurls(urls)
                self.output.collect_data(imgurls)
                count += 1
            except Exception:
                # Bug fix: the original used `finally`, so the error marker
                # was printed on every iteration even on success; report
                # only when the fetch/parse actually failed.
                print('错误')
            if count == 3:
                break
            time.sleep(1)  # be polite to the server between requests
        # Bug fix: the original called dataimg() twice, downloading every
        # image a second time; once is enough.
        self.output.dataimg()
class urldownload(object):
    """Keeps track of URLs still to visit and URLs already handed out."""

    def __init__(self):
        self.newurls = set()  # pending, not yet crawled
        self.oldurls = set()  # already returned by get_new_url()

    def get_new_url(self):
        """Pop an arbitrary pending URL and mark it as visited."""
        url = self.newurls.pop()
        self.oldurls.add(url)
        return url

    def add_newurl(self, url):
        """Queue one URL unless it is None or has been seen before."""
        if url is None:
            return
        if url not in self.oldurls and url not in self.newurls:
            self.newurls.add(url)

    def has_newurl(self):
        """Return True while at least one URL is still pending."""
        return bool(self.newurls)

    def add_newurls(self, urls):
        """Queue every URL from an iterable; None/empty input is a no-op."""
        if not urls:
            return
        for candidate in urls:
            self.add_newurl(candidate)
class htmlparser(object):
    """Downloads one page and extracts follow-up links and image URLs."""

    # Browser-like UA so the site does not reject the request.
    _UA = {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"}

    def _extract(self, data):
        """Return (page_urls, img_url_tuples) scraped from the page text.

        img_url_tuples elements are regex-group tuples of the form
        (full_match, http_url, extension).
        """
        imgurls = set(re.findall(r'(img.*?(http:.*?(jpg|png|gif)))', str(data)))
        urls = set(re.findall(r'http://www.meizitu.com/.*?\.html', str(data)))
        return urls, imgurls

    def parser(self, newurl):
        """Fetch newurl and return (page_urls, img_url_tuples).

        Bug fix: the original returned None for an empty/None URL, which
        crashed the caller's `urls, imgurls = ...` unpacking; empty sets
        are returned instead.
        """
        if newurl is None or len(newurl) == 0:
            return set(), set()
        print(newurl)
        req = request.Request(url=newurl, headers=self._UA)
        # Resource fix: the original never closed the HTTP response.
        with request.urlopen(req) as res:
            data = res.read()
        try:
            data = data.decode('gbk')  # the site serves GBK-encoded pages
        except UnicodeDecodeError:
            # Bug fix: was a bare `except:`; only decoding errors are
            # expected here. Keep raw bytes; _extract str()-ifies them.
            print('00')
        return self._extract(data)
class output(object):
    """Accumulates scraped image-URL tuples and downloads/logs them."""

    def __init__(self):
        # Each element is a regex-group tuple: (full_match, http_url, ext).
        self.imgurls = set()

    def collect_data(self, imgurls):
        """Merge a batch of image-URL tuples; None/empty input is a no-op."""
        if not imgurls:
            return
        self.imgurls = self.imgurls | imgurls

    def dataimg(self):
        """Download every collected image to d:\\1\\<n> and append each
        URL to d://1.txt."""
        headers = {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"}
        opener = request.build_opener()
        # Bug fix: addheaders must be a list of (name, value) tuples,
        # not a dict.
        opener.addheaders = list(headers.items())
        request.install_opener(opener)
        # Bug fix: the counter was reset to 0 inside the loop, so every
        # download overwrote the same file; hoist it outside.
        x = 0
        # Bug fix: the original called `f.close` without parentheses, so
        # the file was never closed; use a context manager instead.
        with open('d://1.txt', 'a') as f:
            for entry in self.imgurls:
                # Keep only the plain http URL from the group tuple.
                # (The original's `entry[1:][::3]` selected the same field.)
                url = entry[1]
                f.write(str(url) + '\n')
                # Bug fix: urlretrieve takes a URL string, not a Request
                # object; the installed opener supplies the UA header.
                request.urlretrieve(url, 'd:\\1\\%s' % x)
                x = x + 1
                print(url)
# Script entry point: start crawling from the site root.
# NOTE(review): this runs at import time — consider guarding it with
# `if __name__ == "__main__":`. Also note the whole file appears to be
# pasted twice; a second identical copy follows and re-runs the crawl.
root_url='http://meizitu.com/'
spidermain().craw(root_url)
import re,time
class spidermain(object):
    """Crawler entry point: wires together the URL manager, the page
    parser and the output collector, and drives the crawl loop.

    NOTE(review): duplicate of the identical class defined earlier in this
    file — the whole file appears to have been pasted twice.
    """

    def __init__(self):
        self.urls = urldownload()    # pending/visited URL bookkeeping
        self.parser = htmlparser()   # page download + link/image extraction
        self.output = output()       # accumulated image URLs + disk writer

    def craw(self, root_url):
        """Crawl starting from root_url, stopping after 3 pages.

        Each successfully parsed page contributes new page URLs to the
        queue and image URLs to the output collector; when done, all
        collected images are written/downloaded once.
        """
        count = 0
        self.urls.add_newurl(root_url)
        while self.urls.has_newurl():
            try:
                newurl = self.urls.get_new_url()
                print('个数%d:URL%s' % (count, newurl))
                urls, imgurls = self.parser.parser(newurl)
                self.urls.add_newurls(urls)
                self.output.collect_data(imgurls)
                count += 1
            except Exception:
                # Bug fix: the original used `finally`, so the error marker
                # was printed on every iteration even on success; report
                # only when the fetch/parse actually failed.
                print('错误')
            if count == 3:
                break
            time.sleep(1)  # be polite to the server between requests
        # Bug fix: the original called dataimg() twice, downloading every
        # image a second time; once is enough.
        self.output.dataimg()
class urldownload(object):
    """Keeps track of URLs still to visit and URLs already handed out.

    NOTE(review): duplicate of the identical class defined earlier in this
    file — the whole file appears to have been pasted twice.
    """

    def __init__(self):
        self.newurls = set()  # pending, not yet crawled
        self.oldurls = set()  # already returned by get_new_url()

    def get_new_url(self):
        """Pop an arbitrary pending URL and mark it as visited."""
        url = self.newurls.pop()
        self.oldurls.add(url)
        return url

    def add_newurl(self, url):
        """Queue one URL unless it is None or has been seen before."""
        if url is None:
            return
        if url not in self.oldurls and url not in self.newurls:
            self.newurls.add(url)

    def has_newurl(self):
        """Return True while at least one URL is still pending."""
        return bool(self.newurls)

    def add_newurls(self, urls):
        """Queue every URL from an iterable; None/empty input is a no-op."""
        if not urls:
            return
        for candidate in urls:
            self.add_newurl(candidate)
class htmlparser(object):
    """Downloads one page and extracts follow-up links and image URLs.

    NOTE(review): duplicate of the identical class defined earlier in this
    file — the whole file appears to have been pasted twice.
    """

    # Browser-like UA so the site does not reject the request.
    _UA = {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"}

    def _extract(self, data):
        """Return (page_urls, img_url_tuples) scraped from the page text.

        img_url_tuples elements are regex-group tuples of the form
        (full_match, http_url, extension).
        """
        imgurls = set(re.findall(r'(img.*?(http:.*?(jpg|png|gif)))', str(data)))
        urls = set(re.findall(r'http://www.meizitu.com/.*?\.html', str(data)))
        return urls, imgurls

    def parser(self, newurl):
        """Fetch newurl and return (page_urls, img_url_tuples).

        Bug fix: the original returned None for an empty/None URL, which
        crashed the caller's `urls, imgurls = ...` unpacking; empty sets
        are returned instead.
        """
        if newurl is None or len(newurl) == 0:
            return set(), set()
        print(newurl)
        req = request.Request(url=newurl, headers=self._UA)
        # Resource fix: the original never closed the HTTP response.
        with request.urlopen(req) as res:
            data = res.read()
        try:
            data = data.decode('gbk')  # the site serves GBK-encoded pages
        except UnicodeDecodeError:
            # Bug fix: was a bare `except:`; only decoding errors are
            # expected here. Keep raw bytes; _extract str()-ifies them.
            print('00')
        return self._extract(data)
class output(object):
    """Accumulates scraped image-URL tuples and downloads/logs them.

    NOTE(review): duplicate of the identical class defined earlier in this
    file — the whole file appears to have been pasted twice.
    """

    def __init__(self):
        # Each element is a regex-group tuple: (full_match, http_url, ext).
        self.imgurls = set()

    def collect_data(self, imgurls):
        """Merge a batch of image-URL tuples; None/empty input is a no-op."""
        if not imgurls:
            return
        self.imgurls = self.imgurls | imgurls

    def dataimg(self):
        """Download every collected image to d:\\1\\<n> and append each
        URL to d://1.txt."""
        headers = {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"}
        opener = request.build_opener()
        # Bug fix: addheaders must be a list of (name, value) tuples,
        # not a dict.
        opener.addheaders = list(headers.items())
        request.install_opener(opener)
        # Bug fix: the counter was reset to 0 inside the loop, so every
        # download overwrote the same file; hoist it outside.
        x = 0
        # Bug fix: the original called `f.close` without parentheses, so
        # the file was never closed; use a context manager instead.
        with open('d://1.txt', 'a') as f:
            for entry in self.imgurls:
                # Keep only the plain http URL from the group tuple.
                # (The original's `entry[1:][::3]` selected the same field.)
                url = entry[1]
                f.write(str(url) + '\n')
                # Bug fix: urlretrieve takes a URL string, not a Request
                # object; the installed opener supplies the UA header.
                request.urlretrieve(url, 'd:\\1\\%s' % x)
                x = x + 1
                print(url)
# Script entry point (second, duplicated copy): the whole file appears to
# be pasted twice, so the crawl runs a second time here with the
# re-defined classes. NOTE(review): runs at import time — consider an
# `if __name__ == "__main__":` guard and removing the duplicate copy.
root_url='http://meizitu.com/'
spidermain().craw(root_url)