1. 概述
本篇文章中将分享使用Python爬虫爬取糗事百科热门笑话(注:其中不包含任何针对糗事百科的恶意攻击和其它商业行为),爬取的主要步骤是:1. 模拟浏览器抓取糗事百科的网页;2. 对网页内容使用正则表达式解析,得到想要的内容;3. 再将这些内容分类保存和整理。
在书写匹配用的正则表达式时,可以使用Chrome浏览器的F12开发者工具对页面元素进行快速定位,这为书写表达式提供了方便。
2. 实现
# -*- coding=utf-8 -*-
# Module setup for the qiushibaike joke scraper (Python 2 code: urllib2,
# reload(sys), and setdefaultencoding below do not exist in Python 3).
import urllib as url_lib
import urllib2 as url_lib2
import bs4 as BS4  # NOTE(review): imported but never used in this file
import re
import os
import sys
# Python-2 hack: force the process default encoding to UTF-8 so that the
# decoded unicode joke text can be written to a plain file without explicit
# encoding. This is widely discouraged, but the code below relies on it.
reload(sys)
sys.setdefaultencoding('utf8')
class get_joke(object):
    """Scrape text jokes (author, content, vote count, comment count) from
    qiushibaike.com listing pages and save them to a local file."""

    def __init__(self, url_str):
        # Starting URL of the joke listing.
        self.url_str = url_str
        # Links to other listing pages discovered while crawling.
        self.page_href = set()
        # Link to the next listing page, if any.
        self.next_page = ""
        # The site blocks obvious crawlers, so pose as a desktop browser.
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
        self.headers = {"User-Agent": self.user_agent}

    def getwebpage(self, url_str=""):
        """Fetch self.url_str (falling back to url_str when it is empty) and
        return the page as a decoded unicode string, or None on failure."""
        if "" == self.url_str:  # fixed: compare strings with ==, not identity `is`
            if "" == url_str:
                return None
            self.url_str = url_str
        try:
            request = url_lib2.Request(self.url_str, headers=self.headers)
            # urlopen().read() yields bytes; decode exactly once here.  The
            # original decoded a second time afterwards, and on the error path
            # fell through to an unbound `webpage` variable.
            return url_lib2.urlopen(request).read().decode('utf-8')
        except url_lib2.URLError as ex:
            if hasattr(ex, "code"):
                print(ex.code)
            if hasattr(ex, "reason"):
                print(ex.reason)
            return None  # fixed: explicit failure result instead of NameError

    def getjoke_text(self, webpage):
        """Parse jokes out of a listing page.

        Returns a list of [author, content, vote_count, comment_count] lists,
        or None when webpage is None.
        """
        if webpage is None:
            return None
        # fixed: the original literal was split across raw newlines, which is a
        # SyntaxError; implicit string concatenation keeps the pattern intact.
        re_str = ('<div.*?>.*?<div class="author.*?>.*?<img.*?alt="(.*?)"/>.*?<div class="content">'
                  '.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(.*?)</i>.*?<span class="dash">'
                  '.*?<i class="number">(.*?)</i>.*?</div>')
        re_obj = re.compile(re_str, re.S)  # re.S: `.` also matches newlines
        jokes = re.findall(re_obj, webpage)
        print("获取到的笑话的个数:%d" % len(jokes))
        joke_list = []
        for joke in jokes:
            # Replace HTML line breaks with spaces for console display only;
            # the stored record keeps the raw captured content.
            temp_str = re.sub("(<br/>)", " ", "%s" % joke[1])
            print("发表者:%s" % joke[0] + "\n内容:%s" % temp_str + "\n点赞数目:%s" % joke[2] + "\n评论数目:%s\n" % joke[3])
            joke_list.append([joke[0], joke[1], joke[2], joke[3]])
        return joke_list

    def getnext_page(self, webpage):
        """Find the 'next page' link on a listing page and return it as an
        absolute URL, or None when webpage is None or no link is present."""
        if webpage is None:
            return None
        print('爬虫开始获得下一页的链接。。。')
        re_str = '<li>.*?<a href="(.*?/page/.*?)".*?>.*?<!--<.*?>-->.*?<span class="next">.*?</span>.*?</a>.*?</li>'
        re_obj = re.compile(re_str, re.S)
        next_href = re.findall(re_obj, webpage)
        print("下一页链接的个数:%d" % len(next_href))
        if len(next_href) > 0:
            # [6:] drops the leading "/text/" category prefix of the captured
            # href before re-joining it onto self.url_str — TODO confirm
            # against the live markup; other category prefixes would break it.
            nexthref = self.get_absoluteurl(next_href[0][6:])
            print("得到的下一页的链接为:%s" % nexthref)
            return nexthref
        return None  # fixed: explicit None (original implicitly fell off the end)

    def get_absoluteurl(self, url_str):
        """Turn a scraped href into an absolute URL rooted at self.url_str."""
        if url_str.startswith("http://"):
            # NOTE(review): this also catches "http://www." URLs, so the
            # original www-stripping branch below it was unreachable and has
            # been removed (behavior unchanged); same for the dead code after
            # the if/else in which every path already returned.
            return url_str
        if url_str.startswith("www."):
            # Bare host: strip "www." and prepend the scheme.
            return "http://" + url_str[4:]
        # Relative path: join onto the base URL.
        return self.url_str + "/" + url_str

    def write2file(self, joke_list):
        """Write each joke as a tab-separated line to ./jokes/joke.txt,
        creating the directory first if needed. Returns None."""
        if joke_list is None:
            return None
        dir_str = os.path.abspath('.') + "/jokes/"
        print("文件保存路径:%s" % dir_str)
        if not os.path.exists(dir_str):
            os.makedirs(dir_str)
        dir_str += "joke.txt"
        try:
            # fixed: `with` closes the file on every path — the original hit an
            # unbound `my_file` in `finally` when open() itself failed, used the
            # Python-2-only `except IOError,ex` syntax, and shadowed the builtin
            # `list` with its parameter name.
            with open(dir_str, 'w') as my_file:
                for item in joke_list:
                    my_file.writelines(item[0] + "\t" + item[1] + "\t" + item[2] + "\t" + item[3] + "\t\n")
        except IOError as ex:
            print(ex)
            return None
# 主函数部分
url_str = "http://www.qiushibaike.com/text"
print("需要采集的网页为:" + url_str)
my_obj = get_joke(url_str)
webpage = my_obj.getwebpage()
my_obj.getnext_page(webpage)
joke_list = my_obj.getjoke_text(webpage)
my_obj.write2file(joke_list)