Python语法简单、框架多、代码少,用于爬虫是一个不错的选择。爬虫是模拟HTTP请求获取静态网页,并解析网页的HTML得到相应信息的一种方法。文章针对糗事百科的段子做爬虫练习,并用Pandas作简单统计后写入CSV文件。
请求网页
通过urllib模块模拟HTTP请求,获取HTML,分析网页的HTML标签,针对性地提取对应信息。
import urllib
import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Index of the hot-posts page to fetch; the site paginates via the URL path.
page_index = 1
target_url = 'http://www.qiushibaike.com/hot/page/' + str(page_index)
# Spoof a browser User-Agent so the site does not reject the request
browser_ua = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
request_headers = {'User-Agent': browser_ua}
# Build a Request object carrying the custom headers
request = urllib.request.Request(target_url, headers=request_headers)
# Open the request and decode the response body as UTF-8
html_text = urlopen(request).read().decode('utf-8')
# print(html_text)
soup = BeautifulSoup(html_text, features='lxml')
# Pretty-print the parsed page to inspect its structure
print(soup.prettify())
页面打印如下,基本可以看出网页结构:每个 class 为 "article block" 的 div 对应一个段子,因此可以先解析出 div 列表,再逐个提取信息。
解析html标签
- 通过每个div解析作者、内容及评论数,代码片段如下
# Each joke lives in a <div class="article block ..."> element; match on the
# class prefix with a regex because the full class attribute carries extra tokens.
articles = soup.findAll('div', class_=re.compile('article block'))
contents = []
authors = []
vote_nums = []
for article in articles:
    if article.children:
        # BUG FIX: the selectors were swapped — the author's <h2> was being
        # collected into `contents` and the joke text into `authors`, so the
        # DataFrame columns below came out reversed.
        authors.append(article.select('div[class="author clearfix"] h2'))
        contents.append(article.select('div[class="content"] span'))
        vote_nums.append(article.select('span[class="stats-vote"] i'))
print("{}, {}, {}".format(len(contents), len(authors), len(vote_nums)))
# select() returns a list of matches; take the first match and its first text node.
res_pd = pd.DataFrame({'作者': [txt[0].contents[0].strip() for txt in authors],
                       '内容': [txt[0].contents[0].strip() for txt in contents],
                       '评论数': [int(txt[0].contents[0].strip()) for txt in vote_nums]})
# Show the 10 jokes with the highest vote counts
res_pd.sort_values('评论数', ascending=False).head(10)
- 结果
python面向对象整合
- 根据页数爬取当前页,返回段子DataFrame
- 将每页段子以追加方式写入到文件
import urllib
import re
import os
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
class QsbkText:
    """Crawler for qiushibaike.com hot jokes.

    Fetches pages one at a time, parses each joke's author / content /
    vote count, and appends every page's results to a local CSV file.
    """

    def __init__(self):
        # Page index to start crawling from
        self.pageIndex = 1
        # Spoof a browser User-Agent so the site accepts the request
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        # Flag controlling the crawl loop in start()
        self.enable = False

    def getPage(self, pageIndex):
        """Fetch one page of jokes as a DataFrame with columns 作者/内容/评论数.

        Returns None when the request or the parse fails.
        """
        try:
            fetch_url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            req = urllib.request.Request(fetch_url, headers=self.headers)
            # Fetch and decode the page body
            fetch_html = urlopen(req).read().decode('utf-8')
            # print(fetch_html)
            soup = BeautifulSoup(fetch_html, features='lxml')
            # Each joke is a <div class="article block ...">; match on the prefix
            articles = soup.findAll('div', class_=re.compile('article block'))
            contents = []
            authors = []
            vote_nums = []
            for article in articles:
                if article.children:
                    authors.append(article.select_one('div[class="author clearfix"] h2'))
                    contents.append(article.select_one('div[class="content"] span'))
                    vote_nums.append(article.select_one('span[class="stats-vote"] i'))
            res_pd = pd.DataFrame({'作者': [txt.contents[0].strip() for txt in authors],
                                   '内容': [txt.contents[0].strip() for txt in contents],
                                   '评论数': [int(txt.contents[0].strip()) for txt in vote_nums]})
            return res_pd
        except Exception as e:
            print('连接失败错误原因', e)
            return None

    # NOTE(review): despite the name, this method WRITES the CSV file;
    # the name is kept unchanged so existing callers keep working.
    def readCsv(self, pd_data, fileName):
        """Append pd_data to fileName; write the header only when creating the file."""
        if not os.path.isfile(fileName):
            pd_data.to_csv(fileName, header=True, index=False)
        else:
            pd_data.to_csv(fileName, mode='a', header=False, index=False)

    def readFromClient(self):
        """Prompt the user: Enter continues the crawl, 'Q' stops it."""
        inputStr = input('please input, continue--"Enter"; exit--"Q" \n')
        if inputStr == "Q":
            self.enable = False

    def start(self):
        """Crawl pages one by one, appending each page's jokes to the CSV,
        until the user quits."""
        self.enable = True
        index_of_page = 1
        while self.enable:
            pageData = self.getPage(index_of_page)
            # BUG FIX: getPage returns None on failure; passing None to
            # readCsv raised AttributeError and killed the loop. Skip the
            # failed page and keep crawling.
            if pageData is not None:
                self.readCsv(pageData, '1.pachong_pract/qsbk_res.csv')
            index_of_page += 1
            self.readFromClient()
# Run the crawler only when executed as a script — without this guard,
# importing the module would block on input() and start network requests.
if __name__ == '__main__':
    QsbkText().start()
- 运行过程,Enter表示继续,Q表示退出,得到的CSV文件如下
- 读取CSV文件,获取评论数最多的20个段子
import pandas as pd
# BUG FIX: the crawler wrote this CSV with pandas' default UTF-8 encoding,
# so it must be read back as UTF-8 — reading it as GBK garbles the Chinese
# 作者/内容 columns (or raises UnicodeDecodeError).
read_data = pd.read_csv('1.pachong_pract/qsbk_res.csv', encoding='utf-8', delimiter=",")
# Top 20 jokes by vote count
read_data.sort_values('评论数', ascending=False).head(20)