A Little Weekend Crawler

I'm on duty today and there doesn't seem to be much to do, so I'm taking the chance to learn some Python.

Target site: http://wooyun.tangscan.cn (I remember there were no little ads on this site when I last looked; now it's full of them).

import requests
import re
import threading

class Collector(object):
    def __init__(self):
        self.pageindex=[]
        self.pagecontent=[]

class ContentParse(object):
    def __init__(self):
        self.pagerule=re.compile('totalPages:\s*(\d+)')
        self.pageurlrule=re.compile(r'<a href=\"(?=static)([^"]+)\"\starget=\"_blank\">([\s\S]+?)</a>')
    def getpage(self,strings):
        return self.pagerule.findall(strings)
    def getpageurl(self,strings):
        return self.pageurlrule.findall(strings)

class Spider(threading.Thread):
    def __init__(self,prefix,page='1'):
        super(Spider, self).__init__()
        self.prefix=prefix
        self.page=page
    def run(self):
        strings=requests.get(self.prefix+self.page)
        return strings.content

def main():
    threads = []
    prefix="http://wooyun.tangscan.cn/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page="
    spider=Spider(prefix)
    collector=Collector()
    contentparse=ContentParse()
    collector.pagecontent.append(spider.run())
    pages=contentparse.getpage(collector.pagecontent[0])
    for i in range(2,int(pages[0])+1):
        threads.append(Spider(prefix,str(i)))
    for thread in threads:
        # run() is called directly (not start()), so these downloads still happen serially
        collector.pagecontent.append(thread.run())
    for strings in collector.pagecontent:
        collector.pageindex.extend(contentparse.getpageurl(strings))
    for index in collector.pageindex:
        print "url=%s,title=%s" % (index[0],index[1].strip())

main()

That's all for now; the functionality and performance still have a long way to go →_→:

(1) The Collector class should implement a collect method, rather than having the parsed content stuffed into it directly.

(2) Page downloading (ideally filtering out the messy HTML tags) and logging are not implemented yet (a rough tag-stripping sketch follows this list).

(3) Key content is only extracted after the whole site has been crawled, and in practice this turned out to be quite slow; it would be better to parse out the URLs and titles and write logs while the pages are being downloaded.
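
For item (2), here is a minimal sketch of the tag filtering I have in mind. The regex and the strip_tags helper name are my own illustrative choices, not anything the target site requires; a real HTML parser would be more robust:

import re

TAG_RULE = re.compile(r'<[^>]+>')  # crude: matches anything that looks like an HTML tag

def strip_tags(html):
    # drop the tags, then collapse the leftover whitespace
    text = TAG_RULE.sub('', html)
    return re.sub(r'\s+', ' ', text).strip()

# strip_tags('<a href="static/x">some   title</a>')  ->  'some title'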

That's all I can think of for now; I'll finish this crawler tonight or tomorrow. Also, OSChina's code color scheme is really ugly; does anyone know where to change it?

I've revised the code; it can now parse the target URLs and titles out of a page while it is being downloaded. Logging and downloading of the target pages are still missing.

import requests
import re
import threading
from Queue import Queue

class Collector(object):
    def __init__(self):
        self.url2title={}
    def storage(self,name,strings):
        for i in strings:
            self.url2title[r"http://www.anquan.us/"+i[0]]=i[1].strip()

class ContentParse(threading.Thread):
    def __init__(self,queue,collector):
        super(ContentParse,self).__init__()
        self.pageurlrule=re.compile(r'<a href=\"(?=static)([^"]+)\"\starget=\"_blank\">([\s\S]+?)</a>')
        self.queue=queue
        self.collector=collector
    def getpageurl(self,strings=None):
        return self.pageurlrule.findall(strings)
    def run(self):
        while True:
            data=self.queue.get()
            if data=='final':
                break
            else:
                print "parseing..........."
            self.collector.storage('url2title',self.pageurlrule.findall(data))

class Spider(threading.Thread):
    def __init__(self,prefix,queue,page='1'):
        super(Spider, self).__init__()
        self.prefix=prefix
        self.page=page
        self.queue=queue
    def run(self):
        content=requests.get(self.prefix+self.page)
        print "dowlanding.............."
        self.queue.put(content.text)

def getpage(strings):
    return re.findall('totalPages:\s*(\d+)',strings)

def main():
    prefix=r"http://wooyun.tangscan.cn/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page="
    queue=Queue()
    collector = Collector()
    spider=Spider(prefix,queue)
    spider.run()
    pages=getpage(queue.get())
    contentparse = ContentParse(queue,collector)
    threads=[]
    for i in range(2,int(pages[0])+1):
        threads.append(Spider(prefix,queue,str(i)))
    for thread in threads:
        thread.start()
    contentparse.start()
    for thread in threads:
        thread.join()
    queue.put('final')

main()

The character-encoding issues are a real pain, so I'll stop here for now. I'll pick this up again after learning a bit of mongo.
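
(For the record, most of the encoding pain comes from guessing what bytes requests hands back. Below is a minimal sketch of the approach I'm aiming for; fetch_utf8 is an illustrative helper name of my own, not part of the crawler code.)

import requests

def fetch_utf8(url):
    r = requests.get(url)
    # requests guesses the charset from the HTTP headers; if that guess is missing or
    # just the ISO-8859-1 default, fall back to its content-based guess instead
    if not r.encoding or r.encoding.lower() == 'iso-8859-1':
        r.encoding = r.apparent_encoding
    return r.text  # a proper unicode object

# writing it out as UTF-8 bytes:
# fp = open('page.html', 'wb'); fp.write(fetch_utf8(url).encode('utf-8')); fp.close()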

#-*- encoding:utf-8 -*-
import requests
import re
import threading
import logging
from Queue import Queue

class Collector(object):
    def __init__(self):
        self.url2title={}
    def storage(self,name,strings):
        for i in strings:
            self.url2title[r"http://www.anquan.us/"+i[0]]=i[1].strip()

class ContentParse(threading.Thread):
    def __init__(self,queue,collector):
        super(ContentParse,self).__init__()
        self.pageurlrule=re.compile(r'<a href=\"(?=static)([^"]+)\"\starget=\"_blank\">([\s\S]+?)</a>')
        self.queue=queue
        self.collector=collector
    def getpageurl(self,strings=None):
        return self.pageurlrule.findall(strings)
    def run(self):
        while True:
            try:
                data,url=self.queue.get()
                if data=='final':
                    break
                else:
                    writelog("pasing %s" % (url))
                self.collector.storage('url2title',self.pageurlrule.findall(data))
            except Exception,e:
                writelog(str(e))

class Spider(threading.Thread):
    def __init__(self,prefix,queue,page='1'):
        super(Spider, self).__init__()
        self.prefix=prefix
        self.page=page
        self.queue=queue
    def run(self):
        try:
            content=requests.get(self.prefix+self.page)
            writelog("downloading %s" % (self.prefix+self.page))
            # only queue the page if the request actually succeeded
            self.queue.put((content.text,self.prefix+self.page))
        except Exception,e:
            writelog(str(e))

def getpage(strings):
    return re.findall('totalPages:\s*(\d+)',strings)

def writelog(message):
    # basicConfig is a no-op after the first call, so calling it on every log line is harmless
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%a %d %b %Y %H:%M:%S',
                        )
    logging.info(message)

def main():
    prefix=r"http://wooyun.tangscan.cn/search?keywords=app&&content_search_by=by_bugs&&search_by_html=False&&page="
    queue=Queue()
    collector = Collector()
    spider=Spider(prefix,queue)
    spider.run()
    pages=getpage(queue.get()[0])
    contentparse = ContentParse(queue,collector)
    threads=[]
    for i in range(2,int(pages[0])+1):
        threads.append(Spider(prefix,queue,str(i)))
    for thread in threads:
        thread.start()
    contentparse.start()
    for thread in threads:
        thread.join()
    # the parser unpacks (data,url) tuples, so the sentinel must be a tuple too
    queue.put(('final',''))
    # wait until the parser has drained the queue before fetching the detail pages
    contentparse.join()
    for key in collector.url2title.keys():
        try:
            r = requests.get(key)
            r.close()
            # the vulnerability title (GBK-encoded for the local filesystem) becomes the file name
            fp=open(collector.url2title[key].encode('gb2312')+'.html','w')
            fp.write(r.text.encode('utf8'))
            fp.close()
        except Exception,e:
            writelog(str(e))
main()

 

Reposted from: https://my.oschina.net/0eb1/blog/867264
