二、Scrapy 抓取百度新闻排行榜,并推送到指定邮箱

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/beyond_f/article/details/73967603
#encoding=utf-8
import scrapy
import requests
from pymongo import MongoClient
from ..items import FirstoneItem
import smtplib
from email.mime.text import MIMEText
from .. import settings
import time
from scrapy.http import Request
from scrapy.spiders import Spider
class MaimaiSpider(Spider):
    """Crawl the Baidu top-news ranking boards and e-mail a digest afterwards.

    ``parse`` follows the category links found on the start page,
    ``parse_item`` turns each ranking row into a ``FirstoneItem``, and
    ``close`` reads the MongoDB collection named in ``settings`` and mails
    the best-ranked entries as an HTML table.

    NOTE(review): nothing in this class writes to MongoDB; presumably an item
    pipeline stores the yielded items in ``tb`` -- confirm against the project.
    """

    # MongoDB handles; they are created when the class body is executed.
    cn = MongoClient(settings.MONGODB_HOST, settings.MONGODB_PORT)
    db = cn[settings.MONGODB_DB]
    tb = db[settings.MONGODB_TABLE]

    name = 'baidunews'
    allowed_domains = ['baidu.com']
    start_urls = ['http://top.baidu.com/buzz?b=341']
    # No longer used by parse() (links are resolved with response.urljoin);
    # kept so that external code reading this attribute keeps working.
    mainurl = 'http://top.baidu.com/'

    # Entries whose rank is greater than this are left out of the digest.
    digest_max_rank = 3

    def parse(self, response):
        """Schedule one request per category link of the start page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            Request: handled by ``parse_item``; ``meta['news_type']`` carries
            the ``title`` attribute of the link that was followed.
        """
        links = response.xpath('//div[@class="hblock"]/ul/li/a[@href]')
        # The first link is skipped, exactly as before.  Reading href and
        # title from the same <a> node keeps them paired even when two links
        # share an href, and avoids a list.index() scan per link.
        for link in links[1:]:
            href = link.xpath('@href').extract_first()
            news_type = link.xpath('@title').extract_first()
            yield Request(
                # urljoin resolves relative hrefs such as "./buzz?b=1"
                # without producing a doubled slash in the path.
                url=response.urljoin(href),
                callback=self.parse_item,
                meta={'news_type': news_type},
            )

    def parse_item(self, response):
        """Extract one item per ranking row of a board page.

        Args:
            response: Board page response; ``response.meta['news_type']`` is
                copied into every item.

        Yields:
            FirstoneItem: with ``num``, ``_id`` (the entry title),
            ``news_type``, ``baidu_url`` and ``focus_num`` filled in.
        """
        for row in response.xpath('//table[@class="list-table"]/tr'):
            # Rows without a td.first cell carry no ranking entry.
            if not row.xpath('.//td[@class="first"]').extract():
                continue
            keyword_link = row.xpath('.//td[@class="keyword"]/a')
            item = FirstoneItem()
            item['num'] = row.xpath('.//td[@class="first"]/span/text()').extract_first()
            item['_id'] = keyword_link.xpath('text()').extract_first()
            item['news_type'] = response.meta['news_type']
            item['baidu_url'] = keyword_link.xpath('@href').extract_first()
            item['focus_num'] = row.xpath('.//td[@class="last"]/span/text()').extract_first()
            yield item

    @staticmethod
    def _parse_rank(value):
        """Return ``value`` as an int, or None when it is missing or not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def _build_digest_html(self):
        """Return the digest e-mail body built from the documents in ``tb``.

        Only documents whose ``num`` parses to a rank of at most
        ``digest_max_rank`` are included; documents with an unusable ``num``
        are skipped instead of aborting the whole digest.
        """
        # Local import keeps the block self-contained; this stdlib helper is
        # available on both Python 2 and Python 3.
        from xml.sax.saxutils import escape

        # The original header never opened <body> although the tail closes it.
        header = ('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
                  '<body><table border="0" cellspacing="0" cellpadding="3" align="left" >')
        tail = '</table></body></html>'
        row = '<tr align="left"><td colspan="6">%s</td></tr>'

        parts = [header]
        for data in self.tb.find():
            rank = self._parse_rank(data.get('num'))
            if rank is None or rank > self.digest_max_rank:
                continue
            parts.append(row % ('*' * 10))
            for key in ('news_type', 'num', '_id', 'baidu_url'):
                # Values were scraped from a web page: escape &, < and > so
                # they cannot break the markup ("&" is common in URLs).
                parts.append(row % escape('%s' % (data.get(key),)))
        parts.append(tail)
        return ''.join(parts)

    def close(self, reason):
        """Send the digest e-mail when the crawl ended normally.

        Args:
            reason: Close reason; nothing is sent unless it is ``'finished'``.

        SMTP errors are reported on stdout and not re-raised.
        """
        if reason != 'finished':
            return

        msg = MIMEText(self._build_digest_html(), 'html', 'utf-8')
        msg["Subject"] = "[%s]BaiduTopNews" % time.strftime('%Y-%m-%d %H:%M:%S')
        msg["From"] = settings.email_From
        msg["To"] = ','.join(settings.email_To)
        try:
            server = smtplib.SMTP_SSL(settings.smtpHost, settings.smtpPort)
            try:
                server.login(settings.email_From, settings.email_pwd)
                # sendmail() treats a bare string as ONE address, so the
                # comma-joined header value must not be used here.
                server.sendmail(settings.email_From, list(settings.email_To), msg.as_string())
                server.quit()
            finally:
                # Releases the socket when login/sendmail raised; harmless
                # after a successful quit().
                server.close()
            print("Success!")
        except smtplib.SMTPException as e:
            print("sendemail_Falied,the reson is %s" % e)
阅读更多
换一批

没有更多推荐了,返回首页