问题
1. 只能匹配一段
发现是因为zip(titles, bodies)函数只会迭代到较短序列的长度:titles只有1个元素,所以最多迭代1次,不可能迭代到bodies那么多次
for title in titles:
for body in bodies:
yield NewsItem(title,wrap(body))
这样就会有个问题,就是出现很多对title,body
而实际上是一个title对应一个bodies,很多个body
或者这个题目用来提取title和body,而不是展示整个新闻,描述为一个网页上有多个新闻,提取每个title和对应的body,但是一个html只有一个title,还是不对。
说明要修改NewsItem这个类才行
2. 打印中文符号时,某些符号无法正常显示(出现乱码),比如逗号","
3. NNTP未找到服务器,暂时注释掉
新闻网址:http://m.cnr.cn/news/20150706/t20150706_519082828_tt.html?tt_group_id=4658836325
代码为:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nntplib import NNTP
from time import time , strftime, localtime
from email import message_from_string
from urllib import urlopen
import textwrap
import re
# Length of one day in seconds (60 s * 60 min * 24 h).
day = 60 * 60 * 24
def wrap(string, max=70):
    """Re-flow *string* into lines of at most *max* characters.

    The wrapped lines are joined with newlines and a trailing newline is
    appended, so an empty input yields ``'\\n'``.

    A byte string holding valid UTF-8 is decoded before wrapping and
    re-encoded afterwards, so the width is counted in characters and a
    multi-byte character is never cut in half.  Byte strings that are not
    valid UTF-8 are passed to ``textwrap`` unchanged.

    The parameter is called ``max`` (shadowing the builtin) only to keep
    the existing keyword interface.
    """
    if isinstance(string, bytes):
        try:
            decoded = string.decode('utf-8')
        except UnicodeDecodeError:
            decoded = None
        if decoded is not None:
            # Return the same type the caller passed in.
            return wrap(decoded, max).encode('utf-8')
    # Pass the width explicitly; previously ``max`` was silently ignored.
    return '\n'.join(textwrap.wrap(string, max)) + '\n'
class NewAgent(object):
    """Gathers news items from registered sources and passes them on.

    Sources are objects with a ``getItems()`` method; destinations are
    objects with a ``receiveItems(items)`` method.
    """

    def __init__(self):
        self.sources = []
        self.destinations = []

    def addsource(self, source):
        """Register *source* as a provider of news items."""
        self.sources.append(source)

    def adddestination(self, dest):
        """Register *dest* as a receiver of news items."""
        self.destinations.append(dest)

    def distribute(self):
        """Collect the items of every source and give them to every destination."""
        # Each registered source instance is asked for its items through
        # its own getItems() method; the results are gathered in one list
        # so that every destination receives the same, complete sequence.
        collected = [
            entry
            for provider in self.sources
            for entry in provider.getItems()
        ]
        for receiver in self.destinations:
            receiver.receiveItems(collected)
class NewsItem(object):
    """A single news item made of a title and a body."""

    def __init__(self, title, body):
        self.title, self.body = title, body
class NNTPSource(object):
    """News source that reads recent articles of a group from an NNTP server."""

    def __init__(self, servername, group, window):
        """Store the connection parameters.

        servername -- host name passed to ``NNTP``
        group      -- newsgroup name passed to ``newnews``
        window     -- how far back to look, in days
        """
        self.servername = servername
        self.group = group
        self.window = window

    def getItems(self):
        """Yield a NewsItem for every article newer than ``window`` days.

        This is a generator: the server connection is opened when
        iteration starts and is always closed again, even if an error
        occurs or the caller stops iterating early.
        """
        start = localtime(time() - self.window * day)
        date = strftime('%y%m%d', start)
        hour = strftime('%H%M%S', start)

        server = NNTP(self.servername)
        try:
            article_ids = server.newnews(self.group, date, hour)[1]
            for article_id in article_ids:
                lines = server.article(article_id)[3]
                message = message_from_string('\n'.join(lines))
                title = message['subject']
                body = message.get_payload()
                if message.is_multipart():
                    # For multipart messages the payload is a list of
                    # parts; only the first part is used.
                    body = body[0]
                yield NewsItem(title, body)
        finally:
            # Previously quit() ran only after a complete, error-free
            # iteration, leaking the connection otherwise.
            server.quit()
class SimpleWebSource(object):
    """News source that extracts items from a web page with regular expressions."""

    def __init__(self, url, titlePattern, bodyPattern):
        """Store the page URL and compile the two extraction patterns.

        Each pattern is expected to contain one capturing group, so that
        ``findall`` returns the captured strings.
        """
        self.url = url
        self.titlePattern = re.compile(titlePattern)
        self.bodyPattern = re.compile(bodyPattern)

    def getItems(self):
        """Yield NewsItem objects built from the title and body matches.

        When the page has exactly one title but several body matches
        (a single article split into paragraphs), all bodies are merged
        into one item instead of dropping everything after the first
        paragraph.  Otherwise titles and bodies are paired in order and
        surplus matches on either side are ignored.
        """
        page = urlopen(self.url)
        try:
            text = page.read()
        finally:
            # Release the connection as soon as the page has been read.
            page.close()

        titles = self.titlePattern.findall(text)
        bodies = self.bodyPattern.findall(text)

        if len(titles) == 1 and len(bodies) > 1:
            # zip() would stop after the single title and lose every
            # paragraph but the first one.
            merged = '\n'.join(wrap(body) for body in bodies)
            yield NewsItem(titles[0], merged)
            return

        for title, body in zip(titles, bodies):
            yield NewsItem(title, wrap(body))
class PlainDesination(object):
    """Destination that prints every item as plain text on standard output."""

    def receiveItems(self, items):
        """Print title, a dashed underline of the same length, and body of each item."""
        for entry in items:
            heading = entry.title
            # A single parenthesised argument prints the same with the
            # print statement and with the print function.
            print(heading)
            print(len(heading) * '-')
            print(entry.body)
class HTMLDeatination(object):
    """Destination that writes all items into a single HTML file.

    The page consists of a linked list of titles followed by one section
    per item holding its title and its body inside ``<pre>``.
    """

    def __init__(self, filename):
        """Remember the path of the HTML file to write."""
        self.filename = filename

    def receiveItems(self, items):
        """Write *items* to ``self.filename``, replacing any existing file.

        Titles and bodies are inserted as they are, without HTML escaping.
        """
        # The items are traversed twice (index, then sections); take a
        # list so that a one-shot iterator is not exhausted by the first.
        items = list(items)

        # ``with`` guarantees the file is flushed and closed.
        with open(self.filename, 'w') as out:
            out.write('''
<html>
<head>
<title>Today's new</title>
</head>
<body>
<h1>Today's News</h1>
''' + '\n')

            # Index: one link per item, pointing at the anchor below.
            out.write('<ul>\n')
            for number, item in enumerate(items, 1):
                out.write('<li><a href="#%i">%s</a></li>\n' % (number, item.title))
            out.write('</ul>\n')

            # Sections: the anchor name carries no '#', so that the
            # href="#N" links in the index actually resolve to it.
            for number, item in enumerate(items, 1):
                out.write('<h2><a name="%i">%s</a></h2>\n' % (number, item.title))
                out.write('<pre>%s</pre>\n' % item.body)

            out.write('''
</body>
</html>
''' + '\n')
def runDefaultSetup():
    """Fetch one news page and send it to stdout and to ``new.html``.

    Builds an agent with a single web source and two destinations (plain
    text and HTML), then distributes the items.
    """
    agent = NewAgent()

    url = 'http://m.cnr.cn/news/20150706/t20150706_519082828_tt.html?tt_group_id=4658836325'
    title_pattern = r'<title>(.+?)</title>'
    # Match only a real <p> tag (optionally with attributes).  The former
    # greedy '<p.*>' could swallow preceding tags and text on the same
    # line and also matched tags such as <pre>.
    body_pattern = r'<p(?:\s[^>]*)?>(.+?)</p>'
    agent.addsource(SimpleWebSource(url, title_pattern, body_pattern))

    agent.adddestination(PlainDesination())
    agent.adddestination(HTMLDeatination('new.html'))

    agent.distribute()

    # NNTP source, disabled for now because no news server was found.
    # To use it, fill in the server and group and add the source before
    # the call to agent.distribute() above.
    #
    # clap_server = ''
    # clap_group = ''
    # clap_window = 1
    # clap = NNTPSource(clap_server, clap_group, clap_window)
    # agent.addsource(clap)
# Run the default setup only when this file is executed as a script.
if __name__ == '__main__':
    runDefaultSetup()
运行结果:
男子19万买二手宝马车 开出不到100米被”收回”
看着眼前这辆19万买来的宝马5系车,江西人小王�
��里那叫一个开心。如果不出意外,车子开回江西后�
�手一卖,还能再赚个两三万。