You have probably heard about what happened to 内涵段子 (Neihan Duanzi) a while back; I am quite a fan of jokes myself, and that incident will not stop me from reading them. Fortunately, there is a site called 内涵吧 (neihan8.com) with plenty of joke content. Below, we take 内涵吧 as an example and crawl all of its jokes for offline reading.
First, analyze the request URLs
URL of page 1: https://www.neihan8.com/wenzi/index.html
URL of page 2: https://www.neihan8.com/wenzi/index_2.html
URL of page 3: https://www.neihan8.com/wenzi/index_3.html
So the rule is: page 1 uses index.html as a special case, and every page n after that is https://www.neihan8.com/wenzi/index_n.html.
For a friendlier interface, let the user choose the start and end pages to crawl. The relevant code looks like this:

start_page = int(raw_input("Please enter the start page to download: "))
end_page = int(raw_input("Please enter the end page to download: "))
for page in range(start_page, end_page + 1):
    if page == 1:
        url = "https://www.neihan8.com/wenzi/index.html"
    else:
        url = "https://www.neihan8.com/wenzi/index_" + str(page) + ".html"
Organize the logic
Step 1 gives us the URL of every listing page. Each listing page holds at least 20 jokes, and each entry carries a joke's title and its URL, so we extract those (url, title) pairs, fetch each joke page in turn, join the title and the body into one string, and write it out with ordinary file I/O, as sketched below.
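A minimal sketch of the resulting driver loop (url_for is a hypothetical helper that applies the URL rule above; the three Spider methods are defined in the sections that follow):

# Sketch of the per-page pipeline: download -> parse -> write.
for page in range(start_page, end_page + 1):
    html = spider.loadPage(page, url_for(page))    # fetch the listing page
    duanzi_dict = spider.dealPage(page, html)      # map title -> joke body
    spider.writePage(page, duanzi_dict)            # one file per page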
Download all the jokes on each page

def loadPage(self, page, url):
    """
    Purpose: download all the jokes on the page
    page: the page number being downloaded
    url: the URL of the page
    """
    print "*" * 50
    print "Downloading page %s" % page
    request = urllib2.Request(url, headers=self.headers)
    response = urllib2.urlopen(request)
    html = response.read()
    return html
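As written, a single network error raises an exception and aborts the whole run. A hedged variant, not part of the original script, that adds a timeout and bounded retries could look like this:

def loadPageSafe(self, page, url, retries=2):
    # Hypothetical variant of loadPage: retry a few times with a
    # timeout, and return None instead of raising on failure.
    for attempt in range(retries + 1):
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request, timeout=10)
            return response.read()
        except urllib2.URLError as e:
            print "Failed to fetch page %s (attempt %d): %s" % (page, attempt + 1, e)
    return None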
Extract the URLs and titles

def dealPage(self, page, html):
    """
    Purpose: process the jokes on the current page
    page: the page number being processed
    html: the HTML content of the page
    """
    print "*" * 50
    print "Processing page %s" % page
    pattern = re.compile(r'<h3><a\shref="(.*?)"\s.*?>(.*?)</a></h3>', re.S)
    urls_names = pattern.findall(html)
    duanzi_dict = self.dealEach(urls_names)
    return duanzi_dict
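To see what this pattern captures, here is a quick check against a made-up listing entry (the href and title are illustrative, not taken from the real site):

import re

pattern = re.compile(r'<h3><a\shref="(.*?)"\s.*?>(.*?)</a></h3>', re.S)
sample = '<h3><a href="/wenzi/12345.html" title="demo">A sample title</a></h3>'
print pattern.findall(sample)  # [('/wenzi/12345.html', 'A sample title')]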
Parse each joke's page with a regex
The listing only gives a relative URL for each joke, so dealEach prefixes the site root, downloads the joke page, pulls the text out of its <p> tags, and strips leftover punctuation and whitespace from the markup:

def dealEach(self, urls_names):
    """
    Purpose: download and clean the body of each joke
    urls_names: (url, title) pairs for the jokes on the page
    """
    duanzi_names = []
    duanzi_contents = []
    for url, name in urls_names:
        duanzi_url = "https://www.neihan8.com" + url
        request = urllib2.Request(duanzi_url, headers=self.headers)
        response = urllib2.urlopen(request)
        html = response.read()
        pattern = re.compile(r'<p>(.*?)</p>')
        # Join the paragraphs, then strip quotes, ellipses, spaces and
        # dashes left over from the page markup.
        duanzi_content = ''.join(pattern.findall(html)).replace("“", "").replace("”", "").replace("…", "").replace('"', "").replace(" ", "").replace("—", "")
        duanzi_contents.append(duanzi_content)
        duanzi_names.append(name)
    duanzi_dict = dict(zip(duanzi_names, duanzi_contents))
    return duanzi_dict
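Note that dealEach builds each joke URL by plain string concatenation, which assumes every href is site-absolute (it begins with /). If the listing ever mixed relative or fully qualified links, urlparse.urljoin from the standard library would handle every case:

from urlparse import urljoin

base = "https://www.neihan8.com/wenzi/index.html"
print urljoin(base, "/wenzi/12345.html")  # https://www.neihan8.com/wenzi/12345.html
print urljoin(base, "12345.html")         # https://www.neihan8.com/wenzi/12345.html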
Join the jokes and write them to a file

def writePage(self, page, duanzi_dict):
    """
    Purpose: write all the jokes on the page to one file
    duanzi_dict: the titles and bodies of the page's jokes
    """
    print "*" * 50
    print "Writing page %s" % page
    print "*" * 50
    num = 1
    with open("page_%s.txt" % page, "w") as f:
        for duanzi_name in duanzi_dict:
            print "Writing joke %s" % num
            f.write(duanzi_name + '\n' + duanzi_dict[duanzi_name] + '\n\n\n')
            num += 1
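writePage stores the bytes exactly as the site served them, so the files come out in whatever encoding the site uses (presumably UTF-8). To pin the output encoding down explicitly, a hedged variant using the standard codecs module (assuming the scraped bytes really are UTF-8) could decode first:

import codecs

def writePageUtf8(self, page, duanzi_dict):
    # Hypothetical variant of writePage: decode the scraped bytes as
    # UTF-8 and write the file with an explicit UTF-8 encoding.
    with codecs.open("page_%s.txt" % page, "w", encoding="utf-8") as f:
        for name, content in duanzi_dict.items():
            f.write(name.decode("utf-8") + u'\n' + content.decode("utf-8") + u'\n\n\n')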
The complete code
#!/usr/bin/env python
# coding=utf-8

import urllib2
import re


class Spider(object):
    def __init__(self):
        # Pretend to be an ordinary browser so the site serves normal pages.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"}

    def loadPage(self, page, url):
        """
        Purpose: download all the jokes on the page
        page: the page number being downloaded
        url: the URL of the page
        """
        print "*" * 50
        print "Downloading page %s" % page
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        html = response.read()
        return html

    def dealPage(self, page, html):
        """
        Purpose: process the jokes on the current page
        page: the page number being processed
        html: the HTML content of the page
        """
        print "*" * 50
        print "Processing page %s" % page
        # Each listing entry looks like <h3><a href="...">title</a></h3>.
        pattern = re.compile(r'<h3><a\shref="(.*?)"\s.*?>(.*?)</a></h3>', re.S)
        urls_names = pattern.findall(html)
        duanzi_dict = self.dealEach(urls_names)
        return duanzi_dict

    def dealEach(self, urls_names):
        """
        Purpose: download and clean the body of each joke
        urls_names: (url, title) pairs for the jokes on the page
        """
        duanzi_names = []
        duanzi_contents = []
        for url, name in urls_names:
            duanzi_url = "https://www.neihan8.com" + url
            request = urllib2.Request(duanzi_url, headers=self.headers)
            response = urllib2.urlopen(request)
            html = response.read()
            pattern = re.compile(r'<p>(.*?)</p>')
            # Join the paragraphs, then strip quotes, ellipses, spaces and
            # dashes left over from the page markup.
            duanzi_content = ''.join(pattern.findall(html)).replace("“", "").replace("”", "").replace("…", "").replace('"', "").replace(" ", "").replace("—", "")
            duanzi_contents.append(duanzi_content)
            duanzi_names.append(name)
        duanzi_dict = dict(zip(duanzi_names, duanzi_contents))
        return duanzi_dict

    def writePage(self, page, duanzi_dict):
        """
        Purpose: write all the jokes on the page to one file
        duanzi_dict: the titles and bodies of the page's jokes
        """
        print "*" * 50
        print "Writing page %s" % page
        print "*" * 50
        num = 1
        with open("page_%s.txt" % page, "w") as f:
            for duanzi_name in duanzi_dict:
                print "Writing joke %s" % num
                f.write(duanzi_name + '\n' + duanzi_dict[duanzi_name] + '\n\n\n')
                num += 1

    def neihan(self):
        start_page = int(raw_input("Please enter the start page to download: "))
        end_page = int(raw_input("Please enter the end page to download: "))
        if start_page <= 0 or end_page <= 0:
            print "Invalid input"
            return
        for page in range(start_page, end_page + 1):
            # Page 1 has its own URL; every later page is index_<n>.html.
            if page == 1:
                url = "https://www.neihan8.com/wenzi/index.html"
            else:
                url = "https://www.neihan8.com/wenzi/index_" + str(page) + ".html"
            html = self.loadPage(page, url)
            duanzi_dict = self.dealPage(page, html)
            self.writePage(page, duanzi_dict)


if __name__ == "__main__":
    neihan_Spider = Spider()
    neihan_Spider.neihan()
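Assuming the script is saved as neihan_spider.py (the filename is arbitrary) and run under Python 2, a session looks roughly like this:

$ python neihan_spider.py
Please enter the start page to download: 1
Please enter the end page to download: 2
**************************************************
Downloading page 1
**************************************************
Processing page 1
**************************************************
Writing page 1
**************************************************
Writing joke 1
...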