项目4:新闻聚合-Python基础教程

问题
1. 只能匹配一段
原因在于 zip(titles, bodies):zip 以较短的序列为准,而 titles 只有 1 个元素,所以最多迭代 1 次,不可能迭代到 bodies 那么多次。如果改成嵌套循环:
for title in titles:
    for body in bodies:
        yield NewsItem(title, wrap(body))
这样就会有个问题,就是出现很多对title,body
而实际上是一个title对应一个bodies,很多个body
或者这个题目用来提取title和body,而不是展示整个新闻,描述为一个网页上有多个新闻,提取每个title和对应的body,但是一个html只有一个title,还是不对。
说明要修改NewsItem这个类才行
2. 打印中文时,某些字符无法正常显示,比如逗号(可能原因:textwrap 按字节数折行,在第70个字节处把一个多字节的UTF-8字符拆成了两半,见下面运行结果中的乱码)
3. 没有找到可用的NNTP服务器,相关代码暂时注释掉
新闻网址:http://m.cnr.cn/news/20150706/t20150706_519082828_tt.html?tt_group_id=4658836325

代码为:

#!/usr/bin/env python
# -*- coding: utf-8  -*-

from nntplib import NNTP
from time import time , strftime, localtime
from email import message_from_string
from urllib import urlopen
import textwrap
import re

# Number of seconds in one day.
day = 24 * 60 * 60


def wrap(string, max=70):
    """Wrap *string* into lines and return them joined by newlines.

    Parameters:
        string: the text to wrap.
        max: maximum line width handed to ``textwrap.wrap`` (default 70).

    Returns:
        The wrapped lines joined with ``'\\n'`` plus one trailing newline.
        An empty input yields just ``'\\n'``.

    NOTE: the width is measured in ``len()`` units of *string*. For a
    Python 2 byte string holding UTF-8 text, a long run without whitespace
    can therefore be broken in the middle of a multi-byte character.
    """
    # The width must be forwarded explicitly; otherwise ``max`` has no effect.
    return '\n'.join(textwrap.wrap(string, max)) + '\n'

class NewAgent(object):
    """Moves news items from registered sources to registered destinations."""

    def __init__(self):
        self.sources, self.destinations = [], []

    def addsource(self, source):
        """Register *source*; ``distribute`` will call its ``getItems()``."""
        self.sources.append(source)

    def adddestination(self, dest):
        """Register *dest*; ``distribute`` will call its ``receiveItems()``."""
        self.destinations.append(dest)

    def distribute(self):
        """Collect the items of every source and pass the list to each destination."""
        # Sources are drained in registration order into one flat list, so
        # every destination receives the same complete collection.
        collected = []
        for provider in self.sources:
            for entry in provider.getItems():
                collected.append(entry)
        for consumer in self.destinations:
            consumer.receiveItems(collected)

class NewsItem(object):
    """A single news entry consisting of a title and a body."""

    def __init__(self, title, body):
        self.title, self.body = title, body

class NNTPSource(object):
    """News source that reads recent articles from an NNTP newsgroup.

    Parameters:
        servername: host name handed to ``NNTP(...)``.
        group: newsgroup name handed to ``newnews``.
        window: how far back to look, in days (multiplied by ``day``).
    """

    def __init__(self, servername, group, window):
        self.servername = servername
        self.group = group
        self.window = window

    def getItems(self):
        """Yield a ``NewsItem`` for each article posted within the window.

        This is a generator, so the server connection is only opened once
        iteration starts. The connection is closed in a ``finally`` clause,
        which also runs when an NNTP call raises or the generator is closed
        before it is exhausted.
        """
        start = localtime(time() - self.window * day)
        date = strftime('%y%m%d', start)
        hour = strftime('%H%M%S', start)

        server = NNTP(self.servername)
        try:
            ids = server.newnews(self.group, date, hour)[1]
            for article_id in ids:
                lines = server.article(article_id)[3]
                message = message_from_string('\n'.join(lines))
                title = message['subject']
                body = message.get_payload()
                if message.is_multipart():
                    # A multipart payload is a list of parts; keep the first.
                    body = body[0]
                yield NewsItem(title, body)
        finally:
            server.quit()

class SimpleWebSource(object):
    """News source that extracts items from a web page with regular expressions.

    Parameters:
        url: address of the page to download.
        titlePattern: regular expression whose matches are the item titles.
        bodyPattern: regular expression whose matches are the item bodies.
    """

    def __init__(self, url, titlePattern, bodyPattern):
        self.url = url
        self.titlePattern = re.compile(titlePattern)
        self.bodyPattern = re.compile(bodyPattern)

    def getItems(self):
        """Yield a ``NewsItem`` for each (title, body) pair found on the page.

        Titles and bodies are paired positionally with ``zip``, which stops
        at the shorter list: a page with one title match and many body
        matches produces exactly one item.
        """
        page = urlopen(self.url)
        try:
            text = page.read()
        finally:
            # Release the connection explicitly instead of relying on
            # garbage collection.
            page.close()
        titles = self.titlePattern.findall(text)
        bodies = self.bodyPattern.findall(text)
        for title, body in zip(titles, bodies):
            yield NewsItem(title, wrap(body))

class PlainDesination(object):
    def receiveItems(self,items):
        for item in items:
            print item.title
            print '-'*len(item.title)
            print item.body

class HTMLDeatination(object):
    """Destination that writes all items into a single HTML file.

    Parameters:
        filename: path of the HTML file to (over)write.
    """

    def __init__(self, filename):
        self.filename = filename

    def receiveItems(self, items):
        """Write an index of linked titles followed by every item's body.

        The page consists of a bulleted list of titles linking to anchors,
        then one ``<h2>`` heading and ``<pre>`` body per item.

        NOTE(review): titles and bodies are inserted without HTML escaping;
        confirm whether the sources already deliver HTML-safe text.
        """
        # Two passes are made over the items (index, then bodies), so
        # materialise them in case a one-shot iterator was passed in.
        items = list(items)
        out = open(self.filename, 'w')
        try:
            # <body> has to open inside <html>; the document is closed once,
            # at the very end.
            print >> out, '''
       <html>
          <head>
            <title>Today's new</title>
          </head>
       <body>
       <h1>Today's News</h1>
       '''

            print >> out, '<ul>'
            for number, item in enumerate(items, 1):
                print >> out, '<li><a href="#%i">%s</a></li>' % (number, item.title)
            print >> out, '</ul>'

            for number, item in enumerate(items, 1):
                # The anchor name carries no '#', so that href="#N" in the
                # index resolves to it.
                print >> out, '<h2><a name="%i">%s</a></h2>' % (number, item.title)
                print >> out, '<pre>%s</pre>' % item.body

            print >> out, '''
        </body>
       </html>
       '''
        finally:
            # Always flush and release the file, even if formatting fails.
            out.close()


def runDefaultSetup():
    """Build the default agent (one web source, two destinations) and run it."""
    page_url = 'http://m.cnr.cn/news/20150706/t20150706_519082828_tt.html?tt_group_id=4658836325'
    title_regex = r'<title>(.+?)</title>'
    body_regex = r'<p.*>(.+?)</p>'

    agent = NewAgent()
    agent.addsource(SimpleWebSource(page_url, title_regex, body_regex))

    # Items are printed to standard output and also written to new.html.
    agent.adddestination(PlainDesination())
    agent.adddestination(HTMLDeatination('new.html'))

    agent.distribute()
# Disabled NNTP source setup, kept as an inert module-level string.
'''
    clap_server  = ''
    clap_group = ''
    clap_window = 1
    clap = NNTPSource(clap_server, clap_group, clap_window)
    agent.addsource(clap)
'''


# Run the default configuration only when executed as a script.
if __name__ == '__main__':
    runDefaultSetup()

运行结果:

男子19万买二手宝马车 开出不到100米被”收回”

  看着眼前这辆19万买来的宝马5系车,江西人小王�
��里那叫一个开心。如果不出意外,车子开回江西后�
�手一卖,还能再赚个两三万。

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页