Python Crawler 2.0

Compared with the earlier Python crawler 1.0, version 2.0 was hammered out entirely yesterday, in half an afternoon plus most of the evening, about six hours in all. I split the code across quite a few files, only to find at the end that the shared parameters couldn't be wired up between them. Embarrassing... honestly, it comes down to how little I know about Python.
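One pattern I want to try for this, sketched below (the config.py module is my own invention, not something in the code yet): keep all the shared state in one small module that every file imports, instead of having each helper file import Main.

# config.py  -- hypothetical shared-state module
momname = "http://acm.hdu.edu.cn"   # root url of the crawl
mompath = ""                        # directory the pages get saved into
momdict = {}                        # url -> the page it was found on
momlist = []                        # queue of pages still to crawl
momdown = []                        # urls that have already been downloaded

# every other file would then just "import config" and read or write
# config.momlist, config.momdown, ... instead of importing Main.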

The test site was once again the HDU online judge. The crawler ran for only a few seconds before it was over, which is clearly not good enough; it probably still can't tell whether a URL refers to a single page or a whole site. On the plus side, it can now download images, which is a big step up from 1.0, where not hanging the machine already counted as a success. That said, during later testing I found that a lot of the downloaded images wouldn't even open, which makes me seriously doubt that download function I copied off the internet...
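A quick way to check what those broken "images" really are before blaming the download function (just a quick sketch; the file name is one of my test images):

# peek at the first bytes of a downloaded file: a real JPEG/PNG/GIF starts with
# a binary signature, an html error page starts with plain text
with open("merry2012-4.JPG", "rb") as f:
    head = f.read(16)
print repr(head)  # '\xff\xd8\xff...' means a real JPEG; '<html' or '<!DOC' means the server sent back an error page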

My knowledge of how web pages work clearly still needs shoring up.

I've recently set myself a goal: finish at least one small Python program every week, and each one has to touch on several different areas. Hopefully that will raise my Python level.

Of course, I can't hand all my time over to Python either; I still need to spend some of it learning other new things.


Update: after asking someone more experienced, the likely culprit is that I never set any request headers, so the server simply cut my program off. Frustrating. I really have to break the habit of sending requests without headers; I'll pick this back up once I'm back at school next semester.
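For the record, this is the kind of fix I have in mind (only a sketch, not tested against HDU yet, and the User-Agent string is just a placeholder): build a urllib2.Request with a browser-like header instead of calling urlopen on the bare url.

import urllib2

def fetch_with_header(url):
    # send a User-Agent so the server is less likely to drop the connection
    headers = {"User-Agent": "Mozilla/5.0 (compatible; my-crawler/2.0)"}
    req = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(req, timeout=10).read()

# DownLoadHtml.py below could then call fetch_with_header(url) in place of
# urllib.urlopen(url).read().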



Main.py

__author__ = 'glcsnz123'
import urllib2
import os
import time
import DownLoadHtml, DownLoadImage
import Analyze_Robot, ReSize

mompath = os.getcwd()
momdict = {}
momlist = []
momdown = []
momname = "http://acm.hdu.edu.cn"

def Init():
    global momname
    #momname = raw_input("Input the url please:")
    if not momname.startswith("http://"):
        momname = "http://" + momname
    if momname[-1] == '/':
        momname = momname[0:-1:1]
    try:
        urllib2.urlopen(momname)
    except Exception, e:
        print "url error!"
        exit(1)
        #print momname

    momlist.append(momname)
    momdict[momname] = momname


def FindAllResource(data, hostname):
    ansimg, anshtml = Analyze_Robot.FindImg(data), Analyze_Robot.FindHtml(data)
    for htmlsrc in anshtml:
        #print htmlsrc+"***"
        htmlsrc = ReSize.ReFactUrl(htmlsrc, hostname)
        #print htmlsrc
        if htmlsrc not in momdict:
            #print htmlsrc
            momdict[htmlsrc] = hostname
            momlist.append(htmlsrc)
    for imgsrc in ansimg:
        imgsrc = ReSize.ReFactUrl(imgsrc, hostname)
        if imgsrc not in momdown:
            DownLoadImage.DownLoadImage(imgsrc, hostname)


def Robot_Work():
    while True:
        if len(momlist) > 0:
            print "now download the page: " + momlist[0]
            data, url = DownLoadHtml.DownLoadHtml(momlist[0], momdict[momlist[0]])
            del momlist[0]
            FindAllResource(data, url)
        else:
            time.sleep(3)
            if len(momlist):
                continue
            else:
                break
                #print momlist


if __name__ == '__main__':
    Init()
    Robot_Work()

Analyze_Robot.py

__author__ = 'glcsnz123'
import re
import Main
import DownLoadImage
import ReSize

#<img\ssrc=".*"\s>

def FindImg(data):
    #print data
    reant = re.compile("<img\ssrc=[\"\']?[^\'\"]+[\"\']?")
    imglist = reant.findall(data)
    #print imglist
    #print len(imglist)
    ansimg = []
    for imgsrc in imglist:
        for x in range(len(imgsrc)):
            if imgsrc[x] == '"' or imgsrc[x] == "'":
                ansimg.append(imgsrc[x + 1:-1:1])
                break
    #print ansimg
    return ansimg

#<a\shref=[\'\"]
def FindHtml(data):
    #print data
    reant = re.compile("<a\shref=[\'\"][^\'\"]+[\'\"]")
    htmllist = reant.findall(data)
    anshtml = []
    #print htmllist
    for htmlsrc in htmllist:
        for x in range(len(htmlsrc)):
            if htmlsrc[x] == '"' or htmlsrc[x] == "'":
                anshtml.append(htmlsrc[x + 1:-1:1])
                break
    #print anshtml
    return anshtml


def FindAllResource(data, hostname):
    ansimg, anshtml = FindImg(data), FindHtml(data)
    for htmlsrc in anshtml:
        #print htmlsrc+"***"
        htmlsrc = ReSize.ReFactUrl(htmlsrc, hostname)
        #print htmlsrc
        if htmlsrc not in Main.momdict:
            #print htmlsrc
            Main.momdict[htmlsrc] = hostname
            Main.momlist.append(htmlsrc)
    for imgsrc in ansimg:
        imgsrc = ReSize.ReFactUrl(imgsrc, hostname)
        if imgsrc not in Main.momdown:
            DownLoadImage.DownLoadImage(imgsrc, hostname)


if __name__ == "__main__":
    import DownLoadHtml

    data, url = DownLoadHtml.DownLoadHtml("http://acm.hdu.edu.cn", "http://acm.hdu.edu.cn")
    FindAllResource(data, url)


DownLoadHtml.py

__author__ = 'glcsnz123'
import urllib, os
import ReSize
import Save_File
import Main

"""
    give me the url and hostname, I can download this html file but without image file!

"""

def DownLoadHtml(url, hostname):
    #print url,hostname
    url, fpath = ReSize.ReFact(url, hostname)
    if url in Main.momdown:
        # already fetched: return empty data instead of killing the whole crawl
        return "", url
    Main.momdown.append(url)

    #print url
    # urllib.urlopen sends no custom headers, which is probably why the server
    # cuts the crawl short (see the header note above)
    data = urllib.urlopen(url).read()
    #print data
    os.chdir(Main.mompath)
    #print fpath
    Save_File.SaveFile(fpath.split("/"), data)
    return data, url

if __name__ == '__main__':
    #Main.mompath = os.getcwd()
    print DownLoadHtml("passed.php", "http://acm.hdu.edu.cn/recentcontest/")



DownLoadImage.py

__author__ = 'glcsnz123'
import urllib
import os
import ReSize, time
import Main, string

def DownLoadImage(url, hostname):
    url, fpath = ReSize.ReFact(url, hostname)
    print "**********", url, "++++", fpath, "**************"
    if url in Main.momdown:
        # already downloaded: skip it instead of exiting the whole program
        return
    Main.momdown.append(url)
    pathlist = fpath.split('/')
    #print fpath
    os.chdir(Main.mompath)
    for path in pathlist[0:-1:1]:
        try:
            table = string.maketrans(r'/\?*:<>|"', r'_________')
            path = path.translate(table)
            if not os.path.isdir(path):
                os.mkdir(path)
            if  os.getcwd() == Main.mompath and path == '..':
                continue
            os.chdir(path)
        except  Exception, e:
            print "dir create failed in downloadimage!---" + path
            #print pathlist[-1]
    try:
        # urllib.urlretrieve saves whatever the server sends back, even an
        # error page, which may be why some downloaded "images" will not open
        urllib.urlretrieve(url, pathlist[-1])
        #time.sleep(0.5)
    except Exception, e:
        print "DownLoad Image Error!---" + url
        #print os.getcwd()

if __name__ == '__main__':
    Main.mompath = os.getcwd()
    Main.momdict = {}
    Main.momname = "http://acm.hdu.edu.cn"
    DownLoadImage("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")





Save_File.py

__author__ = 'glcsnz123'
import os
import Main
import string

def SaveFile(pathlist, data):
    #print "*********", pathlist, "*****"
    for path in pathlist[0:-1:1]:
        try:
            table = string.maketrans(r'/\?*:<>|"', r'.........')
            path = path.translate(table)
            if not os.path.isdir(path):
                os.mkdir(path)
            if  os.getcwd() == Main.mompath and path == '..':
                continue
            os.chdir(path)
        except  Exception, e:
            print "dir create failed in save_file!---" + path

    try:
        table = string.maketrans(r'/\?*:<>|"', r'.........')
        pathlist[-1] = pathlist[-1].translate(table)
        #print pathlist[-1], "is create"
        f = open(pathlist[-1], "wb")  # binary mode keeps the saved bytes intact
        f.write(data)
        f.close()
    except Exception, e:
        print e
        print "Save File Error!----" + "\\".join(pathlist)
        #print data
        #print data


if __name__ == "__main__":
    SaveFile("test/test/test/status.php".split("/"), "thank you")



ReSize.py
__author__ = 'glcsnz123'
import Main

def ReFactUrl(url, hostname):
    # note: this assumes hostname starts with "http://" (7 characters) and that
    # url is relative; absolute links pointing at other sites are not handled yet
    #print hostname
    tmpname = hostname[7:].split("/")
    #print tmpname
    while url.startswith("../"):
        url = url[3:]
        if len(tmpname) > 1:
            del tmpname[-1]
    while url.startswith("./"):
        url = url[2:]
    hostname = hostname[:7] + "/".join(tmpname)
    #print hostname
    if not url.startswith(hostname):
        if url[0] != '/':
            return hostname + "/" + url
        else:
            return hostname + url
    return url


def ReFactPath(url, hostname):
    if url.startswith(hostname):
        #print hostname
        #print url
        return url[len(hostname) + 1:]
    else:
        print "ReFactPath Error!---" + url


def ReFact(url, hostname):
    print url,hostname
    url = ReFactUrl(url, hostname)
    path = ReFactPath(url, Main.momname)
    if path == '':
        path = "index.html"
    return url, path


if __name__ == "__main__":
    Main.momname="http://acm.hdu.edu.cn"
    print ReFact("../../../data/images/merry2012-4.JPG", "http://acm.hdu.edu.cn")








