# -*- coding: utf-8 -*-
import re
import urllib.request  # Python 3 replacement for the original urllib2
class Tool:
    """Convert scraped HTML fragments to plain text.

    Strips image/anchor tags, turns table/paragraph/line-break markup into
    newlines and tabs, then removes any tags that remain.
    """

    # NOTE(review): the tag patterns below were destroyed in the original
    # source (the HTML literals were eaten during publication); they are
    # reconstructed from the surviving replacement logic on the right-hand
    # sides — confirm against the pages actually scraped.
    removeImg = re.compile(r'<img.*?>')                   # images -> removed
    removeAddr = re.compile(r'<a.*?>|</a>')               # links -> keep inner text
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')   # row/div/para end -> newline
    replaceTD = re.compile(r'<td>')                       # table cell -> tab
    replacePara = re.compile(r'<p.*?>')                   # paragraph start -> newline + indent
    replaceBR = re.compile(r'<br><br>|<br>')              # line breaks -> newline
    removeExtraTag = re.compile(r'<.*?>')                 # anything left -> removed

    def replace(self, text):
        """Return *text* with HTML markup translated/stripped as above."""
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + "    ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()


class WYXW:
    """Crawler for NetEase news (news.163.com).

    Fetches the homepage, extracts article URLs, then scrapes each article's
    title / time / source / body and appends them to a UTF-8 text file.
    """

    def __init__(self, baseUrl):
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.fileName = u'网易新闻'  # default output file name ("NetEase News")
        self.tool = Tool()

    def get_homepage(self):
        """Download and return the homepage as a str (bad bytes ignored)."""
        request = urllib.request.Request(self.baseURL, headers=self.headers)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8', 'ignore')

    def extract_url(self, homepage):
        """Return all article URLs found in *homepage*.

        Articles look like http://news.163.com/YY/MMDD/HH/<16 word chars>.html.
        The dots are escaped here; the original pattern left them as wildcards.
        """
        pattern = r"http://news\.163\.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"
        return re.findall(pattern, homepage)

    def extract_sub_web_time(self, sub_web):
        """Return the first 'YYYY-MM-DD HH:MM:SS' timestamp, or None."""
        pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
        found = re.findall(pattern, sub_web)
        # Original indexed found[0] blindly -> IndexError when absent.
        return found[0] if found else None

    def extract_sub_web_source(self, sub_web):
        """Return the article source (publisher) text, or None."""
        # NOTE(review): pattern lost in the original; reconstructed from the
        # anchor id NetEase used for the source — confirm against live pages.
        pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>', re.S)
        found = re.findall(pattern, sub_web)
        return found[0] if found else None

    def extract_sub_web_title(self, sub_web):
        """Return the article title, or None."""
        # NOTE(review): pattern lost in the original; reconstructed — confirm.
        pattern = re.compile(r'<h1 id="h1title" class="ep-h1">(.*?)</h1>', re.S)
        found = re.findall(pattern, sub_web)
        return found[0] if found else None

    def extract_sub_web_content(self, sub_web):
        """Return the raw HTML of the article body, or None."""
        # NOTE(review): pattern lost in the original; reconstructed — confirm.
        pattern = re.compile(r'<div id="endText".*?>(.*?)</div>', re.S)
        found = re.findall(pattern, sub_web)
        return found[0] if found else None

    def writeData(self, fName):
        """Scrape every article linked from the homepage into <name>.txt.

        *fName* overrides the default output file name when not None.
        Articles missing any field are skipped instead of crashing.
        """
        name = fName if fName is not None else self.fileName
        # 'with' fixes the original's leaked file handle (it was never closed).
        with open(name + '.txt', "w+", encoding='utf-8') as out:
            homepage = self.get_homepage()
            for url in self.extract_url(homepage):
                print(url)
                web = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
                title = self.extract_sub_web_title(web)
                content = self.extract_sub_web_content(web)
                time = self.extract_sub_web_time(web)
                source = self.extract_sub_web_source(web)
                # The original tested `findall(...) is not None`, which is
                # always true for a list; test each field for a real miss.
                if title is None or content is None or time is None or source is None:
                    continue
                content = self.tool.replace(content)
                news = (title.strip() + "\n\n" + time.strip() + "\t"
                        + source.strip() + "\n\n" + content + "\n")
                out.write(news)
                out.write("\n" + "-" * 73 + "\n")
                print(u"新闻写入成功" + "\n")


if __name__ == "__main__":
    # Guarded so importing this module performs no network I/O.
    baseUrl = "http://news.163.com"
    wyxw = WYXW(baseUrl)
    wyxw.writeData(None)