脑袋大大,爬得都迷路了~~
各种段子等你来拿哦~
#导入相应的包
from urllib.request import Request,urlopen,URLError
import re
#定义一个类
class Spider(object):
    """Scrape joke posts from neihan8.com and append them to a local text file.

    Workflow: ``start`` walks list pages 1-10; ``load_page`` downloads one
    page; ``handle_data`` cleans each extracted post and ``save_text``
    appends it to the output file.
    """

    def save_text(self, text):
        """Append one cleaned post to the output .txt file.

        Fix: open with an explicit UTF-8 encoding so writing the decoded
        Chinese text does not depend on the platform's default codec.
        """
        with open('内涵段子.txt', "a", encoding="utf-8") as f:
            f.write(text)

    @staticmethod
    def _clean_item(item):
        """Strip the HTML wrapper tags and normalize punctuation in one post."""
        replacements = (
            ('<div>', ''), ('</div>', ''),
            ('<p>', ''), ('</p>', ''),
            ('“', '""'), ('”', '""'),
            ('…', '...'), ('<br />', ''),
        )
        for old, new in replacements:
            item = item.replace(old, new)
        return item.strip()

    def handle_data(self, items):
        """Clean every scraped post, echo it to stdout and persist it.

        The parameter was renamed from ``list`` to stop shadowing the builtin.
        """
        for raw in items:
            cleaned = self._clean_item(raw)
            print(cleaned)
            self.save_text(cleaned)

    def load_page(self, page):
        """Download one list page and return its raw bytes.

        Returns ``b""`` on a network failure.  Fix: the original returned the
        *str* ``""`` here, which made the caller's ``.decode("GBK")`` raise
        AttributeError exactly when the network was down.
        """
        url = "https://www.neihan8.com/article/list_5_" + str(page) + ".html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/61.0"
        }
        try:
            request = Request(url, headers=headers)
            response = urlopen(request)
            return response.read()
        except URLError as e:
            print("报错喽~", e)
            return b""  # bytes, so the caller's .decode() still works

    def start(self):
        """Crawl list pages 1..10, extracting and saving every post."""
        print("爬取内涵吧开始喽~")
        # Hoisted out of the loop: the pattern never changes between pages.
        pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
        self.page = 1
        while self.page <= 10:  # bounded loop instead of while True + break
            print("正则爬取...", self.page, "页")
            # errors="replace" keeps one badly-encoded page from aborting the run
            html = self.load_page(self.page).decode("GBK", errors="replace")
            self.handle_data(pattern.findall(html))
            self.page += 1
#函数的入口,也是一切的开端
if __name__ == "__main__":
    # Script entry point: build a crawler instance and run the full scrape.
    crawler = Spider()
    crawler.start()
效果图:
每天都有小小的案例,希望可以帮到你们,有兴趣的话,扫个码,关注一下公众号呗~~