啊啊啊啊啊 话不多说直接上代码
#coding=utf-8
import urllib2
import re
class Spider:
"""
内涵段子爬虫类
"""
def __init__(self, page, enable):
self.page = page
self.enable = enable
def loadPage(self, page):
url = 'http://www.neihan8.com/article/list_5_' + str(page) + '.html'
#User-Agent头
user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT6.1; Trident/5.0'
headers = {'User-Agent': user_agent}
req = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(req)
html = response.read()
gbk_html = html.decode('gbk').encode('utf-8')
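# Find all of the joke entries in the page.
# Without re.S, "." does not match a newline, so a single match cannot extend across line breaks.
# With re.S (re.DOTALL), "." also matches newlines, so the pattern can match across the whole multi-line string.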
pattern = re.compile(r'
(.*?)