# coding=utf-8
# To crawl a different site, change the URL and the regular expression below to match that site.
import urllib2
import random
import re
class Spider(object):
def __init__(self, page):
        # controls whether the spider keeps crawling
        self.enable = True
        # the page number to start crawling from
        self.page = page

    # fetch the raw HTML of one listing page
def loadPage(self, page):
url = "http://www.neihan8.com/article/list_5_" + str(page) + ".html"
ua_list = [
"Mozilla/5.0 (Windows NT 6.1; ) Apple.... ",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0)... ",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X.... ",
"Mozilla/5.0 (Macintosh; Intel Mac OS... "
]
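        # pick a User-Agent at random for each request so the traffic looks less uniform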
ua_header = random.choice(ua_list)
header = {'User-Agent': ua_header}
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request)
html = response.read()
        # the response body is GBK-encoded, so decode it first, then re-encode as UTF-8
        gbk_html = html.decode('gbk').encode('utf-8')
return gbk_html
    # filter the fetched HTML down to the joke text
    def screenHTML(self, html):
        pattern = re.compile(r'<div.*?class="f18 mb20">(.*?)</div>', re.S)
        item_list = pattern.findall(html)
for item in item_list:
print '--'*50
item = item.replace(r'<p>', '').replace(r'</p>', '').replace('<br />', '')
self.writeHTML(item)
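    # Worked example (hypothetical input, not taken from the site): for the
    # fragment '<div class="f18 mb20"><p>some joke</p><br /></div>', findall()
    # returns ['<p>some joke</p><br />'], and the replace() chain above strips
    # the <p>, </p> and <br /> tags before the item is written out.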
    # append one screened item to a local file
    def writeHTML(self, item):
        # open in append mode, so items accumulate across pages (and across runs)
        myFile = open('./duanzi.txt', 'a')
        myFile.write(item)
        myFile.close()
def doWork(self):
while self.enable:
try:
                gbk_html = self.loadPage(self.page)
            except urllib2.URLError, e:
                # on a network error, report it and retry the same page
                print e.reason
                continue
            self.screenHTML(gbk_html)
            self.page += 1
            print 'Press Enter to continue..'
            print 'Press q to quit..'
            command = raw_input()
            if command.strip().lower() == 'q':
self.enable = False
break
if __name__ == '__main__':
ss = Spider(1)
ss.doWork()
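
    # Hypothetical offline check (a sketch, not part of the original flow):
    # the sample string below is made up to match the regex in screenHTML.
    # Uncomment to exercise the screening step without any network access;
    # the stripped text will be appended to ./duanzi.txt.
    # sample = '<div class="f18 mb20"><p>sample text</p><br /></div>'
    # ss.screenHTML(sample)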