Python Crawler in Practice (3): Scraping NetEase News

This post walks through scraping news from NetEase News (news.163.com) with a Python crawler: setting a User-Agent, parsing the HTML with regular expressions, extracting each article's title, body, publish time, and source, and writing the results to a TXT file.
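Article links on the news.163.com homepage followed a fixed shape at the time, so the crawler collects them with a single regular expression: two digits, four digits, two digits (roughly year / month-day / hour), then a 16-character id. A minimal sketch of that extraction, run against a made-up homepage fragment (the URL and headline below are invented purely for illustration):

# -*- coding: utf-8 -*-
import re

# Assumed link shape: /YY/MMDD/HH/ followed by a 16-character article id.
pattern = r"http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"

# Invented homepage fragment, only to show what the pattern picks up.
sample = '<a href="http://news.163.com/16/0430/09/ABCDEFGH12345678.html">some headline</a>'

print(re.findall(pattern, sample))
# ['http://news.163.com/16/0430/09/ABCDEFGH12345678.html']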

# -*- coding: utf-8 -*-
# Python 2 script (urllib2, print statements).

import urllib2
import re
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')


class Tool:
    """Strips HTML markup from the extracted article body."""
    # remove <img> tags and runs of 7 spaces
    removeImg = re.compile(r'<img.*?>| {7}')
    # remove hyperlink tags
    removeAddr = re.compile(r'<a.*?>|</a>')
    # turn block-level tags into line breaks
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # turn table cells into tabs
    replaceTD = re.compile(r'<td>')
    # start each paragraph on a new, indented line
    replacePara = re.compile(r'<p.*?>')
    # turn <br> tags into line breaks
    replaceBR = re.compile(r'<br><br>|<br>')
    # drop any remaining tags
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, text):
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + "    ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()


class WYXW:
    def __init__(self, baseUrl):
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        # self.file = None
        self.fileName = u'网易新闻'
        self.tool = Tool()

    def get_homepage(self):
        url = self.baseURL
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8', 'ignore')
        # print content  # .encode('gbk', 'ignore')
        return content

    def extract_url(self, homepage):
        # collect article links of the form /YY/MMDD/HH/<16-char id>.html
        pattern = r"http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"
        news_url = re.findall(pattern, homepage)
        # print news_url
        return news_url

    def extract_sub_web_time(self, sub_web):
        pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
        time = re.findall(pattern, sub_web)
        if not time:
            return None
        print time[0]
        return time[0]

    def extract_sub_web_source(self, sub_web):
        # the source sat in <a id="ne_article_source" ...> on article pages of the time;
        # adjust to the actual markup if the page layout has changed
        pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>', re.S)
        source = re.findall(pattern, sub_web)
        if not source:
            return None
        print source[0]
        return source[0]

    def extract_sub_web_title(self, sub_web):
        # the headline sat in <h1 id="h1title" class="ep-h1">...</h1>; adjust if the markup changes
        pattern = re.compile(r'<h1 id="h1title".*?>(.*?)</h1>', re.S)
        title = re.findall(pattern, sub_web)
        if title:
            print title[0]
            return title[0]
        return None

    def extract_sub_web_content(self, sub_web):
        # the article body lived in <div id="endText">; the non-greedy match stops at the
        # first closing div, so adjust the pattern to the actual markup if needed
        pattern = re.compile(r'<div id="endText".*?>(.*?)</div>', re.S)
        content = re.findall(pattern, sub_web)
        # print content[0]
        if content:
            return content[0]
        return None

    def writeData(self, fName):
        if fName is not None:
            f = open(fName + '.txt', "w+")
        else:
            f = open(self.fileName + '.txt', "w+")
        homepage = self.get_homepage()
        news_urls = self.extract_url(homepage)
        for url in news_urls:
            print url
            web = urllib2.urlopen(url).read()
            title = self.extract_sub_web_title(web)
            content = self.extract_sub_web_content(web)
            time = self.extract_sub_web_time(web)
            source = self.extract_sub_web_source(web)
            # skip pages where any field could not be found
            if title is None or content is None or time is None or source is None:
                continue
            content = self.tool.replace(content)
            news = title.strip() + "\n\n" + time.strip() + "\t" + source.strip() + "\n\n" + content + "\n"
            f.write(news)
            sep = "\n" + "-------------------------------------------------------------------------" + "\n"
            f.write(sep)
            print u"新闻写入成功" + "\n"
        f.close()


baseUrl = "http://news.163.com"
wyxw = WYXW(baseUrl)
wyxw.writeData(None)
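The script above is Python 2 code (urllib2, print statements). If you want to try the same approach on Python 3, the fetching step maps onto urllib.request; a minimal sketch of get_homepage under that assumption, keeping the same User-Agent and base URL:

# -*- coding: utf-8 -*-
# Rough Python 3 equivalent of get_homepage(); the rest of the script would also
# need print() calls and decoded responses before the regex parsing.
import urllib.request

def get_homepage(base_url="http://news.163.com",
                 user_agent="Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"):
    request = urllib.request.Request(base_url, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8", "ignore")

The regular-expression parsing itself works the same way in both versions once each response has been decoded to a string.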
