相对于上一次的Python爪巴虫1.0版,2.0版完全是昨晚花了半个下午+大半个晚上敲出来的。。。耗时6个小时,写了好多个文件,最后发现文件之间的参数没办法链接。。。。好吧,其实应该归结到我对python的不了解= =||
最后仍然是以hdu的OJ作为测试的网站,扒了几秒钟就over了,明显就是不够给力啊!!!!应该是没办法分清楚网址指的是一个页面呢还是一个网站,不过实现了下载图片的功能,相对于上一个1.0版的,只能说是有很大的进步了,上一个1.0版,基本上是不把机器搞死机就已经很不错了。。。不过,后期测试的时候,发现图片很多下下来居然打不开- - 真心怀疑网上给的那个函数了。。。
对于网页方面的知识还是有待巩固。
最近给自己定了一个目标,每周至少要交出一个Python的小程序,每个程序都必须涉及到多方面的知识。希望这样子能够提升我的Python水平。
当然,也不能把时间都交给了Python,我还是得花点时间学点新东西的。
后来向大牛请教,预计错误点应该在于我的header头部未加,导致服务器端自动结束我的程序,好无奈- - 以后不加头部的毛病一定要改改了,下学期回学校以后,重新搞起,求大神保佑!!!
Main.py
__author__ = 'glcsnz123'
import urllib2, urllib
import os
import threading, thread, time
import DownLoadHtml,DownLoadImage
import Analyze_Robot,ReSize
# Shared crawler state, mutated by the other modules via "import Main".
mompath = os.getcwd()  # crawl root: all downloaded files are saved under here
momdict = {}  # page url -> the url of the page it was discovered on
momlist = []  # FIFO queue of page urls still waiting to be downloaded
momdown = []  # urls (pages and images) already fetched, to avoid repeats
momname = "http://acm.hdu.edu.cn"  # seed url the crawl starts from
def Init():
    """Validate and normalize the seed url, then queue it for crawling.

    Fixes two defects in the original: the scheme check tested for the
    misspelled prefix "hdttp://", and the normalized value was assigned
    to a dead local (`hostname`) so the fix-ups never took effect.
    """
    global momname
    #momname = raw_input("Input the url please:")
    # ensure the url carries a scheme
    if not momname.startswith("http://"):
        momname = "http://" + momname
    # drop a single trailing slash so later path joins don't double up
    if momname[-1] == '/':
        momname = momname[:-1]
    try:
        # probe the url once so an unreachable seed fails fast
        urllib2.urlopen(momname)
    except Exception:
        print("url error!")
        exit(1)
    momlist.append(momname)
    momdict[momname] = momname
def FindAllResource(data, hostname):
    """Scan downloaded page text for links and images.

    Unseen page links are queued on momlist (and remembered in momdict);
    unseen images are downloaded immediately.
    """
    images = Analyze_Robot.FindImg(data)
    pages = Analyze_Robot.FindHtml(data)
    for link in pages:
        absolute = ReSize.ReFactUrl(link, hostname)
        if absolute not in momdict:
            momdict[absolute] = hostname
            momlist.append(absolute)
    for img in images:
        absolute = ReSize.ReFactUrl(img, hostname)
        if absolute not in momdown:
            DownLoadImage.DownLoadImage(absolute, hostname)
def Robot_Work():
    """Main crawl loop: pop the next queued page url, download and save it,
    then scan it for further pages/images; stop once the queue stays empty."""
    while True:
        if len(momlist) > 0:
            print "now download the page: " + momlist[0]
            # DownLoadHtml returns (page_text, resolved_url)
            data, url = DownLoadHtml.DownLoadHtml(momlist[0], momdict[momlist[0]])
            del momlist[0]
            FindAllResource(data, url)
        else:
            # queue empty: wait a moment, then stop if nothing new arrived
            time.sleep(3)
            # NOTE(review): the source's indentation was lost; this re-check
            # most plausibly belongs inside the empty-queue branch -- confirm
            # against the original file
            if len(momlist):
                continue
            else:
                break
    #print momlist
if __name__ == '__main__':
    Init()        # validate the seed url and queue it
    Robot_Work()  # crawl until the queue drains
Analyze_Robot.py
__author__ = 'glcsnz123'
import sys, urllib, urllib2, os
import re, Main
import DownLoadImage
import ReSize
#<img\ssrc=".*"\s>
def FindImg(data):
    """Return the src attribute value of every <img> tag in *data*.

    Improvements over the original: src no longer has to be the first
    attribute after <img, matching is case-insensitive, and unquoted
    src values are returned instead of being silently dropped.
    """
    reant = re.compile(
        r"<img\s+[^>]*?\bsrc\s*=\s*(\"([^\"]*)\"|'([^']*)'|[^\s>\"']+)",
        re.IGNORECASE)
    ansimg = []
    for m in reant.finditer(data):
        if m.group(2) is not None:      # double-quoted value
            ansimg.append(m.group(2))
        elif m.group(3) is not None:    # single-quoted value
            ansimg.append(m.group(3))
        else:                           # unquoted value
            ansimg.append(m.group(1))
    return ansimg
#<a\shref=[\'\"]
def FindHtml(data):
    """Return the href attribute value of every <a> tag in *data*.

    Improvements over the original: href no longer has to be the first
    attribute after <a, and matching is case-insensitive. Only quoted
    values are matched, as in the original.
    """
    reant = re.compile(r"<a\s+[^>]*?\bhref\s*=\s*[\"']([^\"']+)[\"']",
                       re.IGNORECASE)
    return reant.findall(data)
def FindAllResource(data, hostname):
    """Scan page text for links and images; queue unseen pages on
    Main.momlist (remembering them in Main.momdict) and download each
    unseen image immediately.

    NOTE(review): duplicates Main.FindAllResource -- only one copy should
    survive, and the circular Main <-> Analyze_Robot import is fragile.
    """
    ansimg, anshtml = FindImg(data), FindHtml(data)
    for htmlsrc in anshtml:
        #print htmlsrc+"***"
        htmlsrc = ReSize.ReFactUrl(htmlsrc, hostname)  # make the link absolute
        #print htmlsrc
        if not Main.momdict.has_key(htmlsrc):  # not yet seen (py2 has_key)
            #print htmlsrc
            Main.momdict[htmlsrc] = hostname
            Main.momlist.append(htmlsrc)
    for imgsrc in ansimg:
        imgsrc = ReSize.ReFactUrl(imgsrc, hostname)  # make the img url absolute
        if imgsrc not in Main.momdown:
            DownLoadImage.DownLoadImage(imgsrc, hostname)
if __name__ == "__main__":
    import DownLoadHtml
    # DownLoadHtml returns a (data, url) tuple; the original passed the whole
    # tuple to FindAllResource, which expects the raw page text -- crash.
    data, url = DownLoadHtml.DownLoadHtml("http://acm.hdu.edu.cn", "http://acm.hdu.edu.cn")
    FindAllResource(data, "http://acm.hdu.edu.cn")
DownLoadHtml.py
__author__ = 'glcsnz123'
import urllib2, urllib, os, sys
import ReSize
import Save_File
import Main
import Analyze_Robot
"""
give me the url and hostname, I can download this html file but without image file!
"""


def DownLoadHtml(url, hostname):
    """Download the page at *url* (resolved against *hostname*), save it under
    Main.mompath, and return (page_text, resolved_url).

    Fixes: the original called exit() for an already-seen url, which raised
    SystemExit and killed the whole crawl after a few pages; it also sent no
    User-Agent header (the author's own post-mortem for the server dropping
    the crawler) and never closed the response.
    """
    url, fpath = ReSize.ReFact(url, hostname)
    if url in Main.momdown:
        # already fetched: return empty data so the caller finds nothing new
        return "", url
    Main.momdown.append(url)
    # identify as a browser; anonymous requests were cut off by the server
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        response.close()
    os.chdir(Main.mompath)  # SaveFile walks directories relative to the root
    Save_File.SaveFile(fpath.split("/"), data)
    return data, url
if __name__ == '__main__':
    #Main.mompath = os.getcwd()
    # standalone smoke test: fetch one page relative to the contest listing
    print DownLoadHtml("passed.php", "http://acm.hdu.edu.cn/recentcontest/")
DownLoadImage.py
__author__ = 'glcsnz123'
import urllib, urllib2
import Image, os, sys
import ReSize, time
import Main, string
def DownLoadImage(url, hostname):
url, fpath = ReSize.ReFact(url, hostname)
print "**********", url, "++++", fpath, "**************"
if url in Main.momdown:
exit(0);
else:
Main.momdown.append(url)
pathlist = fpath.split('/')
#print fpath
os.chdir(Main.mompath)
for path in pathlist[0:-1:1]:
try:
table = string.maketrans(r'/\?*:<>|"', r'_________')
path = path.translate(table)
if not os.path.isdir(path):
os.mkdir(path)
if os.getcwd() == Main.mompath and path == '..':
continue
os.chdir(path)
except Exception, e:
print "dir create failed in downloadimage!---" + path
#print pathlist[-1]
try:
urllib.urlretrieve(url, pathlist[-1])
#time.sleep(0.5)
except Exception, e:
print "DownLoad Image Error!---" + url
#print os.getcwd()
if __name__ == '__main__':
    # standalone smoke test: fake the Main module's globals, then fetch one image
    Main.mompath = os.getcwd()
    Main.momdict = {}
    Main.momname = "http://acm.hdu.edu.cn"
    DownLoadImage("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")
Save_File.py
__author__ = 'glcsnz123'
import sys, os
import Main
import string
def SaveFile(pathlist, data):
#print "*********", pathlist, "*****"
for path in pathlist[0:-1:1]:
if len(pathlist) == 1:
break
try:
table = string.maketrans(r'/\?*:<>|"', r'.........')
path = path.translate(table)
if not os.path.isdir(path):
os.mkdir(path)
if os.getcwd() == Main.mompath and path == '..':
continue
os.chdir(path)
except Exception, e:
print "dir create failed in save_file!---" + path
try:
table = string.maketrans(r'/\?*:<>|"', r'.........')
pathlist[-1] = pathlist[-1].translate(table)
#print pathlist[-1], "is create"
f = open(pathlist[-1], "w")
f.write(data)
f.close()
except Exception, e:
print e;
print "Save File Error!----" + "\\".join(pathlist)
#print data
if __name__ == "__main__":
    # The original passed three arguments to the two-parameter SaveFile
    # (TypeError); fold the file name into the path list instead.
    SaveFile("test/test/test/status.php".split("/"), "thank you")
__author__ = 'glcsnz123'
import os
import sys
import Main
def ReFactUrl(url, hostname):
    """Resolve *url*, as found on the page *hostname*, to an absolute url.

    hostname is assumed to start with "http://" (7 characters are sliced
    off) -- TODO confirm https never appears. Each leading "../" removes one
    path component from hostname; leading "./" segments are dropped.

    Fixes over the original: fully-qualified urls pointing at other hosts
    are returned unchanged (the original prepended hostname to them), and a
    server-absolute path ("/x") is joined to the host alone rather than to
    the page's full directory path.
    """
    scheme = hostname[:7]
    parts = hostname[7:].split("/")
    # one "../" pops one directory off the base path
    while url.startswith("../"):
        url = url[3:]
        if len(parts) > 1:
            del parts[-1]
    while url.startswith("./"):
        url = url[2:]
    base = scheme + "/".join(parts)
    if url.startswith("http://") or url.startswith("https://"):
        return url  # already absolute
    if url.startswith("/"):
        return scheme + parts[0] + url  # server-absolute: join to host only
    return base + "/" + url  # relative: join to the page's directory
def ReFactPath(url, hostname):
    """Return *url* relative to *hostname* (hostname prefix plus one slash
    stripped off). Prints an error and returns None when url does not live
    under hostname.

    NOTE(review): callers never check for None -- a url outside hostname
    will crash later when the path is used.
    """
    if not url.startswith(hostname):
        print("ReFactPath Error!---" + url)
        return None
    return url[len(hostname) + 1:]
def ReFact(url, hostname):
    """Return (absolute_url, save_path) for *url* found on *hostname*.

    The save path is the url made relative to the crawl seed Main.momname;
    an empty path (the site root itself) becomes "index.html".
    """
    print url,hostname
    url = ReFactUrl(url, hostname)
    path = ReFactPath(url, Main.momname)
    # NOTE(review): ReFactPath returns None for urls outside momname; the
    # None then crashes callers that split the path -- confirm intent
    if path == '':
        path = "index.html"
    return url, path
if __name__ == "__main__":
    # standalone smoke test: resolve a relative image path against the seed
    Main.momname="http://acm.hdu.edu.cn"
    print ReFact("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")