FreeBuf Crawler

A small crawler that walks the paginated topic sections on FreeBuf and saves each section's article listing to a local HTML file, one file per topic.

#!C:\Python27\python.exe
# coding: utf-8

import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack: make implicit str/unicode conversions use UTF-8

import re
import os
import urllib
import requests
from multiprocessing import Pool


# Topic name -> FreeBuf section URL; the key doubles as the output filename
subject_dict = {u'漏洞':'http://www.freebuf.com/vuls', u'安全工具':'http://www.freebuf.com/sectool',
                u'WEB安全':'http://www.freebuf.com/articles/web', u'系统安全':'http://www.freebuf.com/articles/system',
                u'网络安全':'http://www.freebuf.com/articles/network', u'无线安全':'http://www.freebuf.com/articles/wireless',
                u'终端安全':'http://www.freebuf.com/articles/terminal', u'数据安全':'http://www.freebuf.com/articles/database',
                u'安全管理':'http://www.freebuf.com/articles/security-management', u'企业安全':'http://www.freebuf.com/articles/es',
                u'极客':'http://www.freebuf.com/geek'}


def spider(filename, url):
    print "Crawling subject: %s" % filename
    # Start each run with a fresh output file
    if os.path.isfile(filename + ".html"):
        os.remove(filename + ".html")
    with open(filename + ".html", 'a') as f:
        page = 0
        error_counter = 0
        while True:
            page += 1
            try:
                resp = requests.get(url + '/page/' + str(page))
                if resp.status_code == 404:
                    error_counter += 1
                    if error_counter == 1:
                        print "Subject %s may only have %s pages." % (filename, str(page - 1))
                    if error_counter <= 3:
                        print "Retrying %s: 404 Not Found!" % str(error_counter)
                        page -= 1  # re-request the same page instead of skipping ahead
                        continue
                    else:
                        print "Subject %s finished!" % filename
                        print "#################################"
                        break
                else:
                    error_counter = 0  # a successful page resets the retry budget
                    print "Parsing page: " + str(page)
                    # Page 1 has no leading timeline <div>, so its pattern differs slightly
                    if page == 1:
                        site = re.findall('([\s\S]*)      </div>\n      <div class="news-more" id="pagination">', resp.text, re.S)
                    else:
                        site = re.findall('<div id="timeline" class="news-detial">([\s\S]*?)      </div>\n      <div class="news-more" id="pagination">', resp.text, re.S)
                    for each in site:
                        f.write(urllib.unquote(each.encode('utf-8')))
            except Exception as e:
                # On a network error, report it and move on to the next page
                print e


def main():
    for key,value in subject_dict.items():
        spider(key, value)

    # Parallel alternative: calling .get() right after apply_async blocks until
    # that one job finishes, which would serialize the pool. Submit every job
    # first, then close and join.
    # pool = Pool(processes=4)
    # for key, value in subject_dict.items():
    #     pool.apply_async(spider, (key, value))
    # pool.close()
    # pool.join()


if __name__ == '__main__':
    main()
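
The script above is tied to Python 2.7 (the reload(sys)/setdefaultencoding hack and print statements do not exist in Python 3). As a minimal sketch, the same crawl loop on Python 3 might look like the following, assuming FreeBuf still serves the same paginated layout; the URL scheme and regex are carried over from the script above rather than re-verified against the live site, and the page-1 special case is omitted for brevity:

import re
import requests
from urllib.parse import unquote

def spider3(filename, url, max_retries=3):
    # Same crawl loop as spider() above, ported to Python 3
    with open(filename + ".html", "w", encoding="utf-8") as f:
        page, errors = 0, 0
        while True:
            page += 1
            resp = requests.get("%s/page/%d" % (url, page))
            if resp.status_code == 404:
                errors += 1
                if errors > max_retries:
                    print("Subject %s finished!" % filename)
                    break
                page -= 1  # retry the same page
                continue
            errors = 0
            print("Parsing page: %d" % page)
            # Pattern assumed unchanged from the Python 2 version above
            pattern = (r'<div id="timeline" class="news-detial">([\s\S]*?)'
                       r'      </div>\n      <div class="news-more" id="pagination">')
            for block in re.findall(pattern, resp.text):
                f.write(unquote(block))

# Example: spider3(u'漏洞', 'http://www.freebuf.com/vuls')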

