Written in Python 2.7, mainly as an example program for a lecture on web crawling I was preparing for my workplace. The full code is as follows:
# coding:UTF-8
'''
Running this script crawls every news article from Zhejiang Rural Credit Union and saves them
in the script's directory, in a file named <current time>.news.
Open it in Python IDLE and run it.
'''
from lxml import etree  # lxml's etree provides XPath support
import urllib2
import time
class Spider:
    '''
    A class that crawls every news article from Zhejiang Rural Credit Union
    '''
    def __init__(self):
        # Instance attributes that act as "globals" for the class
        self.nongxin = "http://www.zj96596.com/"
        # Current time, accurate to the minute, used as the output file name
        thetime = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
        self.file_name = thetime + ".news"
    # Return an XPath selector for the given URL
    def load_page(self, url):
        # print "loading page now:", url
        user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        headers = {"User-Agent": user_agent}
        req = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(req)
        except urllib2.URLError:
            print "Connection failed, retrying in 3 seconds..."
            for second in range(3):
                print "%ds left.." % (3 - second)
                time.sleep(1)
            return self.load_page(url)  # retry after sleeping
        html = response.read()
        # The site is served as GBK and has plenty of encoding quirks,
        # so decode the GBK bytes to unicode and re-encode them as UTF-8
        html = html.decode('gbk').encode('utf8')
        # Parse the HTML with an explicit encoding and return an XPath selector
        selector = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        return selector
    # Collect the URL of every news listing page and return them as a list
    def get_all_page(self, url):
        page_list = []
        selector = self.load_page(url)
        option_list = selector.xpath('//select[@name="select"]/option/@value')
        for option in option_list:
            op = self.nongxin + option
            page_list.append(op)
        return page_list
    # Collect every article link on the current listing page
    def get_title_list(self, url):
        newurl_list = []
        selector = self.load_page(url)
        newsListc = selector.xpath('//dl[@class="newsListc pBno"]/dd/a/@href')
        for oneurl in newsListc:
            newsurl = self.nongxin + oneurl
            newurl_list.append(newsurl)
            # print 'news link:', newsurl
        return newurl_list
    # Fetch one article's title and body, then save them
    def get_one_news(self, url):
        selector = self.load_page(url)
        # Extract the title
        title = selector.xpath('//div[@class="newsDth"]/text()')[0]
        title = title.replace(' ', '').replace('\t', '')  # strip stray spaces and tabs from the title
        # Extract the article body as plain text
        content = selector.xpath('//div[@class="newsDthd"]')[0]
        content = content.xpath('string(.)')
        # print url, title, content
        self.write_to_file(url, title, content)
    # Append one article to the local file
    def write_to_file(self, url, title, content):
        f = open(self.file_name, 'a')
        f.write(title.encode('utf-8') + '\n')
        f.write(url.encode('utf-8') + '\n')
        f.write(content.encode('utf-8') + '\n')
        f.write('---------------------------------------------------------------------------------------------\n')
        f.close()
    # Export to Word (not implemented yet)
    def write_to_word(self, url, title, content):
        pass
    # Run the whole crawl
    def do_work(self, url):
        # Get the URLs of all the news listing pages
        page_list = self.get_all_page(url)
        # Then pull the article list from every listing page
        # Number of articles downloaded so far
        count = 0
        # Listing-page index
        pagenum = 0
        for onepage in page_list:
            pagenum += 1
            print 'get page:%d' % pagenum
            # Sleep three seconds so we do not hit the site too often
            print "sleeping.."
            for second in range(3):
                print "%ds..." % (3 - second)
                time.sleep(1)
            title_list = self.get_title_list(onepage)
            for title_url in title_list:
                count += 1
                print 'download No.%d' % count
                self.get_one_news(title_url)
if __name__ == "__main__":
    # URL of the news index page
    url = "http://www.zj96596.com/sy/nxxw/default.shtml"
    mySpider = Spider()
    mySpider.do_work(url)
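
If you want to check the GBK-decoding step and the XPath expressions before running the full crawl, a small standalone snippet like the one below can help. The HTML fragment is made up for illustration and only mimics the listing structure that get_title_list expects, not the real zj96596.com page:

# coding:UTF-8
from lxml import etree

# A made-up listing fragment shaped like the one get_title_list parses
sample = u'''
<dl class="newsListc pBno">
  <dd><a href="sy/nxxw/news1.shtml">第一条新闻</a></dd>
  <dd><a href="sy/nxxw/news2.shtml">第二条新闻</a></dd>
</dl>
'''.encode('gbk')

# Same decode step as load_page: GBK bytes -> unicode -> UTF-8 bytes
html = sample.decode('gbk').encode('utf8')
selector = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
for href in selector.xpath('//dl[@class="newsListc pBno"]/dd/a/@href'):
    print "http://www.zj96596.com/" + href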