爬虫思路分析
-
爬取的内容为简书某专题页面(https://www.jianshu.com/c/22f2ca261b85)下的文章信息,如图所示。
-
当首次打开该网页URL,选择热门,会发现网页URL并没有发生变化,如图所示,所以判断该网页采用了异步加载技术。
-
打开 Chrome 浏览器的开发者工具(按 F12 键),单击 Network 选项卡,选中 XHR 项,可发现网页加载了用户信息的文件(如图所示),但这并不是我们想获取的信息。
-
依次点击“最新评论”、“最新收录”和“热门”,在开发者工具中会看到有新的 XHR 请求产生,这些请求返回的内容就是我们想要获取的文章信息。
-
观察该文件的 Response,发现返回的是 HTML 片段,内容正是专题下的文章列表(如图),每个 li 标签对应一篇文章。
-
通过下滑浏览,会发现分页也是通过异步加载技术实现的,如图所示。依次记录前几页的 URL:
-
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=2
-
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=3
-
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=4
示例代码
核心代码
# encoding=utf-8
"""
Date:2019-08-15 16:30
User:LiYu
Email:liyu_5498@163.com
"""
import re
import requests
from fake_useragent import UserAgent
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
def get_page(url, timeout=10):
    """
    Download a page and return its HTML source.

    URLs containing ``page=`` are requested with only a random User-Agent;
    every other URL additionally gets a set of browser-like headers.

    :param url: absolute URL to download.
    :param timeout: seconds to wait for the server before giving up.
    :return: the response body as text when the server answers 200;
        None on any other status code or when the request itself fails.
    """
    print('[+] 正在抓取', url)
    # ``ua`` is the module-level UserAgent instance created in the
    # ``__main__`` guard of this script.
    headers = {'User-Agent': ua.random}
    if 'page=' not in url:
        headers.update({
            'Scheme': 'https',
            'Authority': 'www.jianshu.com',
            'Method': 'GET',
            # Last 15 characters of the URL; presumably the "/p/<12-char id>"
            # path of an article page -- TODO confirm against real URLs.
            'Path': url[-15:],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Upgrade-Insecure-Requests': '1'
        })
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError does not cover requests.exceptions.ConnectionError.
        print('[-] 抓取失败', url, err)
        return None
    if response.status_code == 200:
        print('[+] 抓取成功', url, response.status_code)
        return response.text
    print('[-] 抓取失败', url, response.status_code)
    return None
def getArticleHrefs(html):
    """
    Yield the ``href`` value of every article title link found in *html*.

    Each value is the text captured between ``href=`` and the next ``>``,
    so surrounding quote characters present in the markup are kept.
    Every value is printed before it is yielded. Nothing is yielded when
    *html* is empty or None.
    """
    if not html:
        return
    titleLink = re.compile(r'<a class="title" target="_blank" href=(.*?)>')
    for found in titleLink.finditer(html):
        href = found.group(1)
        print(href)
        yield href
# Patterns for the article fields, compiled once at import time.
_TITLE_RE = re.compile(r'<h1 class="title">(.*?)</h1>')
_AUTHOR_RE = re.compile(r'<span class="name"><a href=(.*?)>(.*?)</a></span>')
_DATE_RE = re.compile(r'<span class="publish-time" data-toggle="tooltip" '
                      r'data-placement="bottom" title=(.*?)>(.*?)</span>')
_WORDAGE_RE = re.compile(r'<span class="wordage">(.*?)</span>')


def _firstGroup(pattern, html, group=1):
    """Return capture *group* of the first match of *pattern* in *html*, or '' if none."""
    found = pattern.search(html)
    return found.group(group) if found else ''


def analyseHtml(html):
    """
    Extract title, author, publish date and word count from an article page.

    :param html: HTML source of one article page; may be empty or None.
    :return: a one-row DataFrame with the columns ``title``, ``author``,
        ``date`` and ``fontCount`` (all strings; a field that is not found
        becomes ''), or an empty DataFrame when *html* is empty or None.
    """
    if not html:
        return pd.DataFrame({})
    record = {
        'title': _firstGroup(_TITLE_RE, html),
        # Group 1 of these two patterns is an attribute value; the visible
        # text is group 2.
        'author': _firstGroup(_AUTHOR_RE, html, 2),
        'date': _firstGroup(_DATE_RE, html, 2),
        # lstrip removes any leading '字', '数' or space characters (it is a
        # character-set strip, not a prefix strip), leaving the number.
        'fontCount': _firstGroup(_WORDAGE_RE, html).lstrip('字数 ')
    }
    return pd.DataFrame([record])
def task(page):
    """
    Scrape one listing page and every article linked from it.

    The work is done eagerly and a list is returned. A generator function
    would only create a generator object inside the worker thread, and all
    downloads would then run sequentially in whichever thread iterates it.

    :param page: listing page number appended to the collection URL.
    :return: list with one DataFrame per article link found on the page
        (as produced by ``analyseHtml``); empty when the listing page could
        not be fetched or contains no links.
    """
    url = 'https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=' + str(page)
    listingHtml = get_page(url)
    frames = []
    for href in getArticleHrefs(listingHtml):
        # The captured href still carries its surrounding quote characters;
        # drop the first and last character before building the URL.
        articleUrl = 'https://www.jianshu.com' + href[1:-1]
        frames.append(analyseHtml(get_page(articleUrl)))
    return frames
if __name__ == '__main__':
    # get_page() reads ``ua`` as a module-level name, so it has to be
    # assigned here at module scope rather than inside a function.
    ua = UserAgent()
    pages = 10
    CsvFileName = 'jianshu.csv'
    with ThreadPoolExecutor(100) as pool:
        # Listing pages 2..pages are scraped.
        # NOTE(review): page 1 is never requested -- confirm this is intended.
        results = pool.map(task, range(2, pages + 1))
        # Collect every non-empty per-article frame, then concatenate once;
        # concatenating inside the loop copies the growing frame each time.
        frames = [df for result in results for df in result if not df.empty]
    totalDf = pd.concat(frames, axis=0) if frames else pd.DataFrame({})
    totalDf.to_csv(CsvFileName, sep=',', header=True, index=False)
    print('文件%s存储成功' % CsvFileName)
数据分析模块
结果演示:
# encoding=utf-8
"""
Date:2019-08-16 15:02
User:LiYu
Email:liyu_5498@163.com
"""
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# CSV file that holds the scraped article data.
CsvFileName = 'jianshu.csv'
# Font setup: a Chinese-capable font (SimHei) and font size for the plots,
# and axes.unicode_minus=False so the minus sign is displayed correctly.
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.family': 'sans-serif',
    'font.size': 12,
    'axes.unicode_minus': False,
})
df = pd.read_csv(CsvFileName)
def showFontCount(threshold=1000):
    """
    Show a pie chart of articles above versus at-or-below a word count.

    Reads the ``fontCount`` column of the module-level ``df``. Values that
    cannot be parsed as numbers are counted in neither slice.

    :param threshold: word count separating the two slices (default 1000).
    :return: None; the chart is displayed with ``plt.show()``.
    """
    # Coerce to numbers so the comparisons also work when the column was
    # read as text; unparseable entries become NaN and match neither test.
    fontCount = pd.to_numeric(df['fontCount'], errors='coerce')
    largeCount = int((fontCount > threshold).sum())
    smallCount = int((fontCount <= threshold).sum())
    # The second slice counts "<= threshold", so its label must say so.
    labels = [u'字数大于%s' % threshold, u'字数小于等于%s' % threshold]
    sizes = [largeCount, smallCount]
    colors = ['red', 'yellowgreen']
    plt.figure(figsize=(10, 5))
    plt.pie(sizes, labels=labels, colors=colors,
            labeldistance=1.1, autopct='%3.1f%%', shadow=False,
            startangle=90, pctdistance=0.6)
    plt.show()


if __name__ == '__main__':
    showFontCount()