Python知乎热榜爬虫
环境
re
urllib
bs4==0.0.1
numpy==1.18.5
pandas==0.25.1
热搜URL
- 找到指定URL可以区别于其他热搜项目的关键值,需要观察并多次测试
- 转到知乎热榜页面,按 Ctrl + Shift + I 打开开发者工具,切换到 Element 面板,再按 Ctrl + F 搜索关键值,可以看到目标内容位于 id 为 js-initialData 的 script 标签内
- 以 UTF-8 格式解码后,标题和摘要都能正确解析,但链接没有正确解析
- 基于得到的内容,设计正确的正则表达式匹配模式
self.titlePattern = re.compile(r'{"titleArea":{"text":"(.*?)"}')
self.excerptPattern = re.compile(r'"excerptArea":{"text":"(.*?)"}')
- 提取完标题和摘要后,先以 UTF-8 格式编码,再以 unicode-escape 解码
frame = str(frame).encode('utf-8').decode('unicode-escape')
- 得到正确解析的链接
- 基于得到的内容,设计正确的正则表达式匹配模式
self.urlPattern = re.compile(r'"link":{"url":"(.*?)"}', re.S)
核心代码
从抓取到的页面提取信息,利用正则表达式进行提取,对热搜链接的提取需要进行特殊处理
def extractData(self):
    # Fetch the billboard page and locate the js-initialData <script> blob,
    # then pull titles/excerpts/links out of it with the precompiled regexes.
    page = self.crawlPage()
    beautifulSoup = BeautifulSoup(page, 'html.parser')
    linkList = []
    titleList = []
    excerptList = []
    for frame in beautifulSoup.find_all('script', id = 'js-initialData'):
        frame = str(frame)
        # titles and excerpts are readable directly in the raw script text
        titleList = re.findall(self.titlePattern, str(frame))
        excerptList = re.findall(self.excerptPattern, str(frame))
        # encode string to utf-8 and decode to unicode-escape to get links
        # (links are stored as \uXXXX escapes in the raw blob)
        frame = str(frame).encode('utf-8').decode('unicode-escape')
        linkList = re.findall(self.urlPattern, str(frame))
    billboardList = []
    # generate list of list: one [title, excerpt, link] row per hot search
    for i in range(0, len(titleList)):
        billboardList.append([titleList[i], excerptList[i], linkList[i]])
    return billboardList
pandas导出为excel文件
def export2Excel(self, data, index, columns, path):
    # Wrap the extracted rows in a DataFrame and write them to an .xlsx file.
    dataFrame = pd.DataFrame(data, index = index, columns = columns)
    print("Exporting...")
    dataFrame.to_excel(path)
    print("Export successfully")
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
# urllib.request must be imported explicitly: `import urllib` alone does not
# load the submodule, so urllib.request.Request would raise AttributeError
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd
class Spider():
    '''
    Description:
        Spider to crawl page and extract top hot searches from https://www.zhihu.com/billboard
    Attributes:
        url: billboard page URL to crawl
        headers: HTTP request headers carrying a desktop-browser User-Agent
        urlPattern: compiled regex capturing hot-search links from the blob
        titlePattern: compiled regex capturing hot-search titles
        excerptPattern: compiled regex capturing hot-search excerpts
    '''
    def __init__(self):
        self.url = 'https://www.zhihu.com/billboard'
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
        # regular expressions targeting the js-initialData <script> payload;
        # compiled once here so extractData can reuse them
        self.urlPattern = re.compile(r'"link":{"url":"(.*?)"}', re.S)
        self.titlePattern = re.compile(r'{"titleArea":{"text":"(.*?)"}')
        self.excerptPattern = re.compile(r'"excerptArea":{"text":"(.*?)"}')

    def crawlPage(self):
        '''
        Description:
            crawl page from https://www.zhihu.com/billboard
        Args:
            None
        Returns:
            page:
                the decoded HTML of the Zhihu top hot searches page
        '''
        request = urllib.request.Request(headers = self.headers, url = self.url)
        # context manager closes the connection even on error
        # (the original left the response unclosed — a resource leak)
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def extractData(self):
        '''
        Description:
            extract data from the page crawled
        Args:
            None
        Returns:
            billboardList:
                list of [title, excerpt, link] rows, one per hot search
        '''
        page = self.crawlPage()
        beautifulSoup = BeautifulSoup(page, 'html.parser')
        linkList = []
        titleList = []
        excerptList = []
        for frame in beautifulSoup.find_all('script', id = 'js-initialData'):
            frame = str(frame)
            # titles and excerpts are readable directly in the raw script text
            titleList = self.titlePattern.findall(frame)
            excerptList = self.excerptPattern.findall(frame)
            # links are stored as \uXXXX escapes in the blob; round-trip
            # through unicode-escape to recover the literal URLs
            decoded = frame.encode('utf-8').decode('unicode-escape')
            linkList = self.urlPattern.findall(decoded)
        # zip stops at the shortest list, so a mismatched extraction can no
        # longer raise IndexError (the original indexed all three by i)
        return [[title, excerpt, link]
                for title, excerpt, link in zip(titleList, excerptList, linkList)]

    def export2Excel(self, data, index, columns, path):
        '''
        Description:
            export data to .xlsx format into given path according to given index and column name
        Args:
            data:
                the data to be exported
            index:
                the index of data
            columns:
                the names of columns
            path:
                path to save the .xlsx file
        Returns:
            None
        '''
        dataFrame = pd.DataFrame(data, index = index, columns = columns)
        print("Exporting...")
        dataFrame.to_excel(path)
        print("Export successfully")
if __name__ == "__main__":
    # Crawl the billboard, then dump the rows to an .xlsx in the working dir.
    spider = Spider()
    billboard = spider.extractData()
    rowLabels = list(range(1, len(billboard) + 1))
    columnNames = ['title', 'excerpt', 'link']
    spider.export2Excel(billboard, rowLabels, columnNames, './ZhihuBillboard.xlsx')
测试结果
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!