Python Crawler 2.0

Compared with the earlier Python crawler 1.0, version 2.0 was hammered out entirely yesterday, in half an afternoon plus most of the evening, about six hours in all. I split the code across quite a few files, only to find at the end that the shared parameters couldn't be wired up between them. Embarrassing... honestly, it comes down to how little I know about Python.
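One pattern I want to try for this, sketched below (the config.py module is my own invention, not something in the code yet): keep all the shared state in one small module that every file imports, instead of having each helper file import Main.

# config.py  -- hypothetical shared-state module
momname = "http://acm.hdu.edu.cn"   # root url of the crawl
mompath = ""                        # directory the pages get saved into
momdict = {}                        # url -> the page it was found on
momlist = []                        # queue of pages still to crawl
momdown = []                        # urls that have already been downloaded

# every other file would then just "import config" and read or write
# config.momlist, config.momdown, ... instead of importing Main.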

The test site was once again the HDU online judge. The crawler ran for only a few seconds before it was over, which is clearly not good enough; it probably still can't tell whether a URL refers to a single page or a whole site. On the plus side, it can now download images, which is a big step up from 1.0, where not hanging the machine already counted as a success. That said, during later testing I found that a lot of the downloaded images wouldn't even open, which makes me seriously doubt that download function I copied off the internet...
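A quick way to check what those broken "images" really are before blaming the download function (just a quick sketch; the file name is one of my test images):

# peek at the first bytes of a downloaded file: a real JPEG/PNG/GIF starts with
# a binary signature, an html error page starts with plain text
with open("merry2012-4.JPG", "rb") as f:
    head = f.read(16)
print repr(head)  # '\xff\xd8\xff...' means a real JPEG; '<html' or '<!DOC' means the server sent back an error page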

My knowledge of how web pages work clearly still needs shoring up.

I've recently set myself a goal: finish at least one small Python program every week, and each one has to touch on several different areas. Hopefully that will raise my Python level.

Of course, I can't hand all my time over to Python either; I still need to spend some of it learning other new things.


Update: after asking someone more experienced, the likely culprit is that I never set any request headers, so the server simply cut my program off. Frustrating. I really have to break the habit of sending requests without headers; I'll pick this back up once I'm back at school next semester.
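For the record, this is the kind of fix I have in mind (only a sketch, not tested against HDU yet, and the User-Agent string is just a placeholder): build a urllib2.Request with a browser-like header instead of calling urlopen on the bare url.

import urllib2

def fetch_with_header(url):
    # send a User-Agent so the server is less likely to drop the connection
    headers = {"User-Agent": "Mozilla/5.0 (compatible; my-crawler/2.0)"}
    req = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(req, timeout=10).read()

# DownLoadHtml.py below could then call fetch_with_header(url) in place of
# urllib.urlopen(url).read().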



Main.py

__author__ = 'glcsnz123'
import urllib2
import os
import time
import DownLoadHtml, DownLoadImage
import Analyze_Robot, ReSize

mompath = os.getcwd()
momdict = {}
momlist = []
momdown = []
momname = "http://acm.hdu.edu.cn"

def Init():
    global momname
    #momname = raw_input("Input the url please:")
    if not momname.startswith("http://"):
        momname = "http://" + momname
    if momname[-1] == '/':
        momname = momname[0:-1:1]
    try:
        urllib2.urlopen(momname)
    except Exception, e:
        print "url error!"
        exit(1)
        #print momname

    momlist.append(momname)
    momdict[momname] = momname


def FindAllResource(data, hostname):
    ansimg, anshtml = Analyze_Robot.FindImg(data), Analyze_Robot.FindHtml(data)
    for htmlsrc in anshtml:
        #print htmlsrc+"***"
        htmlsrc = ReSize.ReFactUrl(htmlsrc, hostname)
        #print htmlsrc
        if htmlsrc not in momdict:
            #print htmlsrc
            momdict[htmlsrc] = hostname
            momlist.append(htmlsrc)
    for imgsrc in ansimg:
        imgsrc = ReSize.ReFactUrl(imgsrc, hostname)
        if imgsrc not in momdown:
            DownLoadImage.DownLoadImage(imgsrc, hostname)


def Robot_Work():
    while True:
        if len(momlist) > 0:
            print "now download the page: " + momlist[0]
            data, url = DownLoadHtml.DownLoadHtml(momlist[0], momdict[momlist[0]])
            del momlist[0]
            FindAllResource(data, url)
        else:
            time.sleep(3)
            if len(momlist):
                continue
            else:
                break
                #print momlist


if __name__ == '__main__':
    Init()
    Robot_Work()

Analyze_Robot.py

__author__ = 'glcsnz123'
import re
import Main
import DownLoadImage
import ReSize

#<img\ssrc=".*"\s>

def FindImg(data):
    #print data
    reant = re.compile("<img\ssrc=[\"\']?[^\'\"]+[\"\']?")
    imglist = reant.findall(data)
    #print imglist
    #print len(imglist)
    ansimg = []
    for imgsrc in imglist:
        for x in range(len(imgsrc)):
            if imgsrc[x] == '"' or imgsrc[x] == "'":
                ansimg.append(imgsrc[x + 1:-1:1])
                break
    #print ansimg
    return ansimg

#<a\shref=[\'\"]
def FindHtml(data):
    #print data
    reant = re.compile("<a\shref=[\'\"][^\'\"]+[\'\"]")
    htmllist = reant.findall(data)
    anshtml = []
    #print htmllist
    for htmlsrc in htmllist:
        for x in range(len(htmlsrc)):
            if htmlsrc[x] == '"' or htmlsrc[x] == "'":
                anshtml.append(htmlsrc[x + 1:-1:1])
                break
    #print anshtml
    return anshtml


def FindAllResource(data, hostname):
    ansimg, anshtml = FindImg(data), FindHtml(data)
    for htmlsrc in anshtml:
        #print htmlsrc+"***"
        htmlsrc = ReSize.ReFactUrl(htmlsrc, hostname)
        #print htmlsrc
        if htmlsrc not in Main.momdict:
            #print htmlsrc
            Main.momdict[htmlsrc] = hostname
            Main.momlist.append(htmlsrc)
    for imgsrc in ansimg:
        imgsrc = ReSize.ReFactUrl(imgsrc, hostname)
        if imgsrc not in Main.momdown:
            DownLoadImage.DownLoadImage(imgsrc, hostname)


if __name__ == "__main__":
    import DownLoadHtml

    data, url = DownLoadHtml.DownLoadHtml("http://acm.hdu.edu.cn", "http://acm.hdu.edu.cn")
    FindAllResource(data, url)


DownLoadHtml.py

__author__ = 'glcsnz123'
import urllib, os
import ReSize
import Save_File
import Main

"""
    give me the url and hostname, I can download this html file but without image file!

"""

def DownLoadHtml(url, hostname):
    #print url,hostname
    url, fpath = ReSize.ReFact(url, hostname)
    if url in Main.momdown:
        # already fetched: return empty data instead of killing the whole crawl
        return "", url
    Main.momdown.append(url)

    #print url
    # urllib.urlopen sends no custom headers, which is probably why the server
    # cuts the crawl short (see the header note above)
    data = urllib.urlopen(url).read()
    #print data
    os.chdir(Main.mompath)
    #print fpath
    Save_File.SaveFile(fpath.split("/"), data)
    return data, url

if __name__ == '__main__':
    #Main.mompath = os.getcwd()
    print DownLoadHtml("passed.php", "http://acm.hdu.edu.cn/recentcontest/")



DownLoadImage.py

__author__ = 'glcsnz123'
import urllib
import os
import ReSize, time
import Main, string

def DownLoadImage(url, hostname):
    url, fpath = ReSize.ReFact(url, hostname)
    print "**********", url, "++++", fpath, "**************"
    if url in Main.momdown:
        # already downloaded: skip it instead of exiting the whole program
        return
    Main.momdown.append(url)
    pathlist = fpath.split('/')
    #print fpath
    os.chdir(Main.mompath)
    for path in pathlist[0:-1:1]:
        try:
            table = string.maketrans(r'/\?*:<>|"', r'_________')
            path = path.translate(table)
            if not os.path.isdir(path):
                os.mkdir(path)
            if  os.getcwd() == Main.mompath and path == '..':
                continue
            os.chdir(path)
        except  Exception, e:
            print "dir create failed in downloadimage!---" + path
            #print pathlist[-1]
    try:
        # urllib.urlretrieve saves whatever the server sends back, even an
        # error page, which may be why some downloaded "images" will not open
        urllib.urlretrieve(url, pathlist[-1])
        #time.sleep(0.5)
    except Exception, e:
        print "DownLoad Image Error!---" + url
        #print os.getcwd()

if __name__ == '__main__':
    Main.mompath = os.getcwd()
    Main.momdict = {}
    Main.momname = "http://acm.hdu.edu.cn"
    DownLoadImage("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")





Save_File.py

__author__ = 'glcsnz123'
import os
import Main
import string

def SaveFile(pathlist, data):
    #print "*********", pathlist, "*****"
    for path in pathlist[0:-1:1]:
        try:
            table = string.maketrans(r'/\?*:<>|"', r'.........')
            path = path.translate(table)
            if not os.path.isdir(path):
                os.mkdir(path)
            if  os.getcwd() == Main.mompath and path == '..':
                continue
            os.chdir(path)
        except  Exception, e:
            print "dir create failed in save_file!---" + path

    try:
        table = string.maketrans(r'/\?*:<>|"', r'.........')
        pathlist[-1] = pathlist[-1].translate(table)
        #print pathlist[-1], "is create"
        f = open(pathlist[-1], "wb")  # binary mode keeps the saved bytes intact
        f.write(data)
        f.close()
    except Exception, e:
        print e
        print "Save File Error!----" + "\\".join(pathlist)
        #print data
        #print data


if __name__ == "__main__":
    SaveFile("test/test/test/status.php".split("/"), "thank you")



ReSize.py
__author__ = 'glcsnz123'
import Main

def ReFactUrl(url, hostname):
    # note: this assumes hostname starts with "http://" (7 characters) and that
    # url is relative; absolute links pointing at other sites are not handled yet
    #print hostname
    tmpname = hostname[7:].split("/")
    #print tmpname
    while url.startswith("../"):
        url = url[3:]
        if len(tmpname) > 1:
            del tmpname[-1]
    while url.startswith("./"):
        url = url[2:]
    hostname = hostname[:7] + "/".join(tmpname)
    #print hostname
    if not url.startswith(hostname):
        if url[0] != '/':
            return hostname + "/" + url
        else:
            return hostname + url
    return url


def ReFactPath(url, hostname):
    if url.startswith(hostname):
        #print hostname
        #print url
        return url[len(hostname) + 1:]
    else:
        print "ReFactPath Error!---" + url


def ReFact(url, hostname):
    print url,hostname
    url = ReFactUrl(url, hostname)
    path = ReFactPath(url, Main.momname)
    if path == '':
        path = "index.html"
    return url, path


if __name__ == "__main__":
    Main.momname="http://acm.hdu.edu.cn"
    print ReFact("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")








