Python web scraping ---- a 糗事百科 (Qiushibaike) scraper
First, open the 糗事百科 official homepage: https://www.qiushibaike.com/
The goal of this scraper is to crawl 糗事百科 page by page and collect, for each post, the title, link, author name, and laugh & comment counts.
Next, right-click the page and choose Inspect to study the page's HTML and locate the elements that hold the data we want to extract.
Below is the data-extraction code, which uses BeautifulSoup to parse the HTML:
from bs4 import BeautifulSoup

def parse_data(html_data):
    soup = BeautifulSoup(html_data, 'lxml')
    # each recommended post exposes its title/link, author, and counts in these three elements
    title = soup.find_all('a', {'class': 'recmd-content'})
    author = soup.find_all('a', {'class': 'recmd-user'})
    num = soup.find_all('div', {'class': 'recmd-num'})
    for titles, authors, nums in zip(title, author, num):
        tit = titles.get_text()                                # post title
        link = "https://www.qiushibaike.com" + titles['href']  # build the absolute link
        aut = authors.img['alt']                               # author name lives in the avatar's alt attribute
        n = nums.get_text()                                    # laugh & comment counts
        print("Title: " + tit)
        print("Link: " + link)
        print("Author: " + aut)
        print("Laughs & comments: " + n)
        print('--------' * 10)
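To make the assumed page structure explicit, here is a minimal, hand-written HTML fragment run through parse_data. The class names recmd-content, recmd-user, and recmd-num are the ones the selectors above target; everything else in the markup (the tag nesting, the href value, the author name) is a simplified assumption for illustration, not copied from the real site.

# A minimal local test of parse_data (defined above).
# The markup is a simplified assumption of the page structure, not real site data.
sample_html = '''
<div class="recmd-right">
  <a class="recmd-content" href="/article/123456789">An example post title</a>
  <div class="recmd-detail">
    <a class="recmd-user" href="/users/1/"><img src="avatar.jpg" alt="example_author"></a>
    <div class="recmd-num"><span>100</span> laughs · <span>5</span> comments</div>
  </div>
</div>
'''

parse_data(sample_html)
# Expected output (roughly):
# Title: An example post title
# Link: https://www.qiushibaike.com/article/123456789
# Author: example_author
# Laughs & comments: 100 laughs · 5 comments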
Full code and results:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

wb = Workbook()      # create the Excel workbook object
ws = wb.active       # get the active worksheet
ws.append(['Title', 'Link', 'Author', 'Laughs & Comments'])   # header row


def get_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None
    # print(response.status_code)
    # print(response.text)


def parse_data(html_data):
    soup = BeautifulSoup(html_data, 'lxml')
    title = soup.find_all('a', {'class': 'recmd-content'})
    author = soup.find_all('a', {'class': 'recmd-user'})
    num = soup.find_all('div', {'class': 'recmd-num'})
    for titles, authors, nums in zip(title, author, num):
        tit = titles.get_text()
        link = "https://www.qiushibaike.com" + titles['href']
        aut = authors.img['alt']
        n = nums.get_text()
        print("Title: " + tit)
        print("Link: " + link)
        print("Author: " + aut)
        print("Laughs & comments: " + n)
        print('--------' * 10)
        my_list = [tit, link, aut, n]   # bundle one row of data
        ws.append(my_list)              # write the row to the worksheet


if __name__ == '__main__':
    for i in range(1, 14):
        url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(i)
        # print(url)
        html_data = get_html(url)
        if html_data:                   # skip pages whose request failed
            parse_data(html_data)
    wb.save('t.xlsx')                   # save the workbook
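As an optional sanity check, you can reopen the saved workbook with openpyxl and print a few rows; this sketch assumes the script above ran successfully and produced t.xlsx in the current directory.

# Optional sanity check: reopen t.xlsx and print the first few data rows.
from openpyxl import load_workbook

wb_check = load_workbook('t.xlsx')
ws_check = wb_check.active
print(ws_check.max_row - 1, "data rows written")   # minus the header row
for row in ws_check.iter_rows(min_row=2, max_row=4, values_only=True):
    print(row)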
The extracted results are shown below: