关闭

python爬虫

标签: python
121人阅读 评论(0) 收藏 举报
分类:

再不写点东西就真不会写代码了
最近一直在学习云计算的东西,慢慢的也准备整理一下。
今天试试手,写一个简单的爬虫,之前就写了,但一直没有实现。很简单,正好熟悉一下python

# coding=utf-8
import sys
import os
import urllib
import urllib2
import threading

# Seed page the crawl starts from.
base_uri = "http://www.csdn.net/"
# True until the first request completes; drives the "grabbing" spinner thread.
connect_flag = True
# Becomes True after the first successful fetch; drives the progress-counter thread.
running = False
from time import sleep
import re

# Regexes for harvesting hyperlinks and <img src=...> values from a page body.
LINK_RE = r'(http://[\w,\b,/,\.,-]*)'
IMAGE_RE = r"<img.*src\s?=\s?\"[^\w]*([^>]*?)\""
link_set = set()   # URIs still waiting to be fetched

# Initialise the crawl state
link_set_copy = set()   # URIs already taken off the queue (visited)
link_set.add(base_uri)
link_set_copy.add(base_uri)
# NOTE(review): SUCCESS_COUNT starts at 1, so the reported success total
# is one higher than the number of pages actually fetched — confirm intent.
SUCCESS_COUNT=1
ERROR_COUNT=0    # pages that failed to fetch or decode
IMAGE_COUNT=0    # sequential file-name counter for saved images


# Crawler: pops URIs off the shared link_set, fetches each page, saves
# every image it finds and queues any new links for later fetching.
class Crawler(object):
    def __init__(self):
        # Compile the patterns once instead of on every method call.
        self.link_re = re.compile(LINK_RE)
        self.image_re = re.compile(IMAGE_RE)

    def writeLinks(self, link):
        """Append one crawled URI to the log file ./links/links.txt."""
        # "with" guarantees the handle is closed even if write() raises.
        with open("./links/links.txt", "a") as file_obj:
            file_obj.write(link + "\n")

    def getImages(self, contents):
        """Download every <img src=...> found in *contents* into ./images/.

        Files are named sequentially from the global IMAGE_COUNT counter.
        Downloads are best-effort: any failure just skips that image.
        """
        global IMAGE_COUNT
        for image in self.image_re.findall(contents):
            try:
                # BUG FIX: the original tested `image.index("http") == -1`,
                # but str.index() raises ValueError instead of returning -1,
                # so scheme-less URLs always raised and were silently
                # dropped by the blanket except.  Use a membership test.
                if "http" not in image:
                    image = "http://" + image
                data = urllib.urlopen(image).read()
                # open() instead of the deprecated file() builtin; "with"
                # closes the handle even on a failed write.
                with open("./images/" + str(IMAGE_COUNT) + ".jpg", 'wb') as f:
                    f.write(data)
                IMAGE_COUNT += 1
            except Exception:
                # Best-effort download: ignore broken/unreachable images.
                pass

    def getLink(self, contents):
        """Extract links from a fetched page.

        :param contents: the fetched page body
        :return: None — new, not-yet-visited links are added to link_set
        """
        global link_set, link_set_copy
        link_list = self.link_re.findall(contents)
        # Only queue links that have not already been visited.
        for link in link_list:
            if link not in link_set_copy:
                link_set.add(link)

    def req_contents(self):
        """Fetch one pending URI: log it, save its images, queue its links."""
        global link_set, link_set_copy, ERROR_COUNT, SUCCESS_COUNT, connect_flag, running

        uri = link_set.pop()
        link_set_copy.add(uri)
        self.writeLinks(uri)

        try:
            request = urllib2.Request(uri)
            response = urllib2.urlopen(request)
            result = response.read().decode('utf8')
            self.getImages(result)
            SUCCESS_COUNT += 1
            # First successful fetch: stop the "grabbing" spinner and
            # start the progress-counter thread.
            if connect_flag:
                connect_flag = False
            if not running:
                running = True
            self.getLink(result)
        except UnicodeDecodeError:
            # Page body was not valid UTF-8.
            ERROR_COUNT += 1
        # BUG FIX: URLError is a subclass of Exception, so in the original
        # this clause came after `except Exception` and was unreachable.
        except urllib2.URLError:
            ERROR_COUNT += 1
        except Exception:
            ERROR_COUNT += 1

    def start(self):
        """Keep fetching until the pending link_set is exhausted."""
        global link_set
        while len(link_set) > 0:
            self.req_contents()



# Spinner thread: animates a one-line "grabbing" indicator while the very
# first request is in flight, so a slow initial fetch is visibly alive.
class IsGrabbig(threading.Thread):
    # The four animation frames, indexed by tick % 4.
    FRAMES = ("||", "/\\", "--", "\\/")

    def run(self):
        global connect_flag
        tick = 1
        while connect_flag:
            frame = self.FRAMES[tick % 4]
            # \r moves the cursor back to the start of the line so each
            # frame overwrites the previous one — a simple one-line
            # progress animation without any curses machinery.
            sys.stdout.write("\r " + frame + " grabbing " + frame + " \r")
            sys.stdout.flush()
            sleep(0.5)
            tick += 1
        # Pad with spaces to blot out whatever the last frame left behind.
        sys.stdout.write("\r 已获取到数据" + " " * 100 + "\n")
        sys.stdout.flush()
# Progress thread: once crawling is under way, refreshes a one-line
# success/failure counter once per second (overwritten in place via \r).
class LinkCount(threading.Thread):

    def run(self):
        global running, SUCCESS_COUNT, ERROR_COUNT
        # Wait until the first page has been fetched before printing.
        while not running:
            sleep(0.1)
        while running:
            line = ("\r 已读取的网页数量: " + str(SUCCESS_COUNT)
                    + "\t失败数量:" + str(ERROR_COUNT) + "\t")
            sys.stdout.write(line)
            sys.stdout.flush()
            sleep(1)



def generateReport():
    global link_set_copy, link_set
    print "\n"+"*" * 10 + "Report" + "*" * 10
    print " " * 10 + "over" + " " * 10
    print "总共识别出了"+str(len(link_set_copy)+len(link_set))+"个网页"
    print "已经读取"+str(SUCCESS_COUNT)+"个网页"
    print "读取出错的数量" + str(ERROR_COUNT) + "个网页"
    print "*" * 26


if __name__ == "__main__":
    # Optional interactive confirmation of the target URI (disabled):
    # raw_input("please input the uri")
    # uri_confirm = raw_input("confirm the uri is : " + base_uri + " (Y/N) \n")
    # if uri_confirm is "N" or uri_confirm is "n":
    #     sys.exit(0)
    print "program is starting ... "
    # Spinner runs until the first fetch flips connect_flag to False.
    IsGrabbig().start()

    crawler = Crawler()
    try:

        # Progress counter waits for `running` to become True, then
        # reports success/error totals once per second.
        LinkCount().start()
        crawler.start()
        print
    except KeyboardInterrupt,e:
        # Ctrl-C is the normal way to stop the crawl early.
        print
    except UnicodeDecodeError,e:
        print
    finally:
        # Reset both flags so the helper threads' loops terminate,
        # then print the final summary.
        running = False
        connect_flag = True
        generateReport()

很简单,输出的结果也就是这样

sh-3.2# python main.py
program is starting ... 
 已获取到数据                                                                                                    
 已读取的网页数量: 11   失败数量:5  ^C

**********Report**********
          over          
总共识别出了739个网页
已经读取11个网页
读取出错的数量5个网页
**************************
0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:43840次
    • 积分:1144
    • 等级:
    • 排名:千里之外
    • 原创:73篇
    • 转载:10篇
    • 译文:0篇
    • 评论:2条
    文章分类
    最新评论