爬虫项目实战：简书用户动态信息_基于爬虫的用户行为动态标签-CSDN博客

本文链接：https://blog.csdn.net/Dreaming5498/article/details/99684568

文章目录

爬虫思路分析
示例代码
- 核心代码
- 数据分析模块

爬虫思路分析

爬取的内容为简书笔者用户动态的信息（https://www.jianshu.com/c/22f2ca261b85），如图
当首次打开该网页URL，选择热门，会发现网页URL并没有发生变化，如图所示，所以判断该网页采用了异步加载技术。
打开Chrome浏览器的开发者工具（按 F12 键），单击Network选项卡，选中 XHR 项，可发现网页加载了用户信息的文件，如图所示，但并不是我们想获取的信息。
依次点击“最新评论”、“最新收录”和“热门”，在开发者工具中会看到新的 XHR 产生，这个网址对应的信息就是我们想要获取的文章信息。
观察该文件的 Response ，发现返回的是XML文件，内容也正是用户“动态”内容（如图），每个li 标签就是一个用户动态内容。
通过下滑浏览，会发现也是使用了异步加载技术进行分页处理的，如图所示，依次记录前几页的URL：
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=2
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=3
https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=4

在这里插入图片描述

示例代码

核心代码

# encoding=utf-8
"""
Date:2019-08-15 16:30
User:LiYu
Email:liyu_5498@163.com

"""
import re

import requests
from fake_useragent import UserAgent
import pandas as pd
from concurrent.futures import ThreadPoolExecutor


def get_page(url):
    """
    Download a URL and return the response body as text.

    The User-Agent is taken from the module-level ``ua`` object, which is
    created in the ``__main__`` block. URLs containing ``page=`` are requested
    with that header only; every other URL gets an extra set of browser-like
    headers.

    :param url: absolute URL to download
    :return: ``response.text`` when the status code is 200, otherwise ``None``
             (non-200 status, or the request itself failed)
    """
    print('[+] 正在抓取', url)
    headers = {'User-Agent': ua.random}
    if 'page=' not in url:
        headers.update({
            'Scheme': 'https',
            'Authority': 'www.jianshu.com',
            'Method': 'GET',
            # The last 15 characters of the URL are sent as the path;
            # presumably this matches '/p/<12-char id>' article URLs -- verify.
            'Path': url[-15:],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Upgrade-Insecure-Requests': '1'
        })
    try:
        # A timeout keeps one stalled connection from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError is not part of it and would never match here.
        print('[-] 抓取失败', url)
        return None
    print('[+] 抓取成功', url, response.status_code)
    if response.status_code == 200:
        return response.text
    return None


def getArticleHrefs(html):
    """
    Yield the href value of every article title link found in *html*.

    Each value is the raw text captured after ``href=`` up to the closing
    ``>`` of the tag, so surrounding quote characters are not removed here.
    Every value is printed before it is yielded. Nothing is yielded when
    *html* is empty or ``None``.
    """
    if not html:
        return
    linkPattern = re.compile(r'<a class="title" target="_blank" href=(.*?)>')
    for match in linkPattern.finditer(html):
        href = match.group(1)
        print(href)
        yield href


def _first_group(pattern, html, group):
    """Return *group* of the first match of *pattern* in *html*, or '' if none."""
    match = pattern.search(html)
    return match.group(group) if match else ''


def analyseHtml(html):
    """
    Extract title, author, publish date and word count from an article page.

    Each field is taken from the first match of its regular expression; a
    field whose pattern does not match is stored as an empty string.

    :param html: article page source, or a falsy value when the download failed
    :return: a one-row ``DataFrame`` with the columns ``title``, ``author``,
             ``date`` and ``fontCount``; an empty ``DataFrame`` when *html*
             is falsy
    """
    if not html:
        return pd.DataFrame({})

    findTitle = re.compile(r'<h1 class="title">(.*?)</h1>')
    findAuthor = re.compile(r'<span class="name"><a href=(.*?)>(.*?)</a></span>')
    findDate = re.compile(r'<span class="publish-time" data-toggle="tooltip" '
                          r'data-placement="bottom" title=(.*?)>(.*?)</span>')
    findFontCount = re.compile(r'<span class="wordage">(.*?)</span>')

    # For the two-group patterns the wanted text is the second group (the tag
    # content); the first group is the attribute value.
    result = [{
        'title': _first_group(findTitle, html, 1),
        'author': _first_group(findAuthor, html, 2),
        'date': _first_group(findDate, html, 2),
        # lstrip removes any leading run of the characters '字', '数' and ' ',
        # leaving just the number that follows the label.
        'fontCount': _first_group(findFontCount, html, 1).lstrip('字数 ')
    }]
    return pd.DataFrame(result)


def task(page):
    """
    Crawl one listing page and yield one DataFrame per article linked from it.

    The listing page is downloaded with ``get_page``, its article links are
    extracted with ``getArticleHrefs``, and each article page is downloaded
    and parsed with ``analyseHtml``. This is a generator, so no request is
    made until it is iterated.

    :param page: page number appended to the listing URL
    """
    listUrl = 'https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=' + str(page)
    listHtml = get_page(listUrl)
    for rawHref in getArticleHrefs(listHtml):
        # Drop the first and last character of the captured href (the quotes
        # around the attribute value) before building the absolute URL.
        articleHtml = get_page('https://www.jianshu.com' + rawHref[1:-1])
        yield analyseHtml(articleHtml)


if __name__ == '__main__':
    # get_page() reads this module-level object to pick a random User-Agent.
    ua = UserAgent()
    pages = 10
    CsvFileName = 'jianshu.csv'

    def _crawl_page(page):
        """Run task(page) to completion and return its DataFrames as a list."""
        # task() is a generator function: calling it only creates a generator.
        # Draining it here makes the downloads happen inside the worker thread
        # instead of sequentially in the main thread.
        return list(task(page))

    # Listing pages 2..pages are crawled; page 1 is not requested.
    with ThreadPoolExecutor(100) as pool:
        results = list(pool.map(_crawl_page, range(2, pages + 1)))

    # Concatenate once instead of growing the frame inside a loop.
    frames = [df for result in results for df in result]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    totalDf = pd.concat(frames, axis=0) if frames else pd.DataFrame({})
    totalDf.to_csv(CsvFileName, sep=',', header=True, index=False)
    print('文件%s存储成功' % CsvFileName)

数据分析模块

结果演示：
在这里插入图片描述

# encoding=utf-8
"""
Date:2019-08-16 15:02
User:LiYu
Email:liyu_5498@163.com

"""
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

CsvFileName = 'jianshu.csv'
# Configure a font for Chinese text and make sure the minus sign is displayed
# correctly, then set the font size.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
})
matplotlib.rcParams['font.size'] = 12

# Loaded at import time; showFontCount() reads this module-level frame.
df = pd.read_csv(CsvFileName)


def showFontCount():
    """
    Show a pie chart of articles above 1000 characters versus the rest.

    Reads the ``fontCount`` column of the module-level ``df`` and splits it
    into values greater than 1000 and values less than or equal to 1000. The
    chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Coerce to numbers so a column parsed as text does not break the
    # comparisons; unparseable or missing values become NaN and are counted
    # in neither slice.
    fontCount = pd.to_numeric(df['fontCount'], errors='coerce')
    fontCountLarge = fontCount[fontCount > 1000]
    fontCountSmall = fontCount[fontCount <= 1000]
    # The second slice holds the counts <= 1000, so it needs its own label.
    labels = [u'字数大于1000', u'字数小于等于1000']
    sizes = [len(fontCountLarge), len(fontCountSmall)]
    colors = ['red', 'yellowgreen']
    plt.figure(figsize=(10, 5))
    plt.pie(sizes, labels=labels, colors=colors,
            labeldistance=1.1, autopct='%3.1f%%', shadow=False,
            startangle=90, pctdistance=0.6)
    plt.show()


# Draw the chart only when this file is executed as a script.
if __name__ == '__main__':
    showFontCount()