Using Python to scrape all posts from the cnblogs homepage, and periodically fetch newly published content into MongoDB


Dependencies:

1. jieba (used to segment post titles into keywords; see the sketch after this list)

2. pymongo

3. HTMLParser (part of the Python 2 standard library, not a separate package)
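jieba appears only once in the script below: it segments each post title into a comma-separated keyword string before the document is stored. A minimal sketch of that call, with a made-up sample title:

# -*- coding: utf-8 -*-
import jieba

title = u"运用python抓取博客园首页的全部数据"
# jieba.cut returns a generator of tokens; joining them produces the
# keyword field that gets stored in MongoDB below
keywords = ",".join(jieba.cut(title))
print keywords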

# -*- coding: utf-8 -*-
"""
@author: jiangfuqiang
"""

from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba

# Python 2 hack: force utf-8 as the process-wide default encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# set to True once a post that is already in MongoDB is seen
isExist = False

class FetchCnblog(HTMLParser):
    """Parses one cnblogs list page, collecting posts whose id is newer than `id`."""

    def __init__(self, id):
        HTMLParser.__init__(self)
        self.result = []
        self.data = {}
        self.isTitleLink = False
        self.id = id               # largest post id stored on a previous run
        self.isSummary = False
        self.isPostItem = False
        self.isArticleView = False

    def handle_data(self, data):
        # text inside the title link is the post title; text inside the
        # post_item_summary <p> is the description
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        global isExist          # written when an already-stored post is seen
        if tag == 'a':
            for key, value in attrs:
                if key == 'class':
                    if value == 'titlelnk':
                        self.isTitleLink = True
                    elif value == 'gray' and self.isArticleView:
                        self.isArticleView = False
                        for key, value in attrs:
                            if key == 'href':
                                self.data['readmoreLink'] = value
                                # the post id is the run of digits in the comment link
                                reg = r'\d+'
                                result = re.search(reg, value)
                                self.isPostItem = False

                                if result:
                                    self.data['id'] = int(result.group())
                                else:
                                    self.data = {}
                                    return
                                if self.data['id'] <= self.id:
                                    # already stored on an earlier run: stop paging
                                    self.data = {}
                                    isExist = True
                                    return
                                else:
                                    self.data['source'] = "www.cnblogs.com"
                                    self.data['source_key'] = 'cnblogs'
                                    self.data['fetchTime'] = str(date.today())
                                    # segment the title with jieba for the keyword field
                                    self.data['keyword'] = ",".join(jieba.cut(self.data['title']))
                                    self.result.append(self.data)
                                    self.data = {}

        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'img':
            # image attached to the post item
            for key, value in attrs:
                if key == 'class' and value == 'pfs':
                    for key, value in attrs:
                        if key == 'src':
                            self.data['imgSrc'] = value
        elif tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_foot':
                    self.isSummary = False
                elif key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True


    def getResult(self):
        return self.result


if __name__ == "__main__":
    con = pymongo.Connection('localhost', 27017)  # renamed MongoClient in newer pymongo
    db = con.blog
    fetchblog = db.fetch_blog      # scraped posts
    record = db.record             # remembers the largest post id already stored
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    flag = False
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    reco = record.find_one({"type": 'cnblogs'})
    id = 0
    if reco:
        id = reco['maxId']
    while not isExist:
        try:
            req = urllib2.Request(url % count, headers=headers)
            request = urllib2.urlopen(req)
            data = request.read()
            fj = FetchCnblog(id)
            fj.feed(data)
            result = fj.getResult()
            if not result:
                isExist = True
            else:
                if not flag:
                    flag = True
                    # remember the newest id for the NEXT run; this run keeps
                    # comparing against the previous maxId
                    newestId = int(result[0]['id'])
                    record.update({"type": 'cnblogs'}, {"$set": {'maxId': newestId}}, True, False)  # upsert
                result.reverse()   # insert oldest first
                for doc in result:
                    fetchblog.insert(doc)
                print "page is %d" % count
                count += 1
                time.sleep(5)      # be polite between page fetches
        except Exception, e:
            traceback.print_exc()
            print "parse error", e
            time.sleep(5)          # back off before retrying the same page

If the program runs on Linux or macOS, you can schedule it from crontab -e; a line such as */30 * * * * python fetch_cnblogs.py (the script name being whatever you saved the code as) runs it every 30 minutes. On Windows, add a timer inside the program itself, as sketched below.
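
A minimal in-process timer, assuming the whole __main__ block above is wrapped in a function named fetch_once (a hypothetical name, not in the original script), with isExist, count and flag reset at the start of every call:

import time
import traceback

INTERVAL = 60 * 30   # run every 30 minutes

def fetch_once():
    # hypothetical wrapper: move the body of the __main__ block here,
    # resetting isExist, count and flag before each run
    pass

while True:
    try:
        fetch_once()
    except Exception:
        traceback.print_exc()
    time.sleep(INTERVAL)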

