爬虫项目实战:简书用户动态信息

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/Dreaming5498/article/details/99684568

爬虫思路分析

  • 爬取的内容为简书笔者用户动态的信息(https://www.jianshu.com/c/22f2ca261b85),如图
    在这里插入图片描述

  • 当首次打开该网页URL,选择热门,会发现网页URL并没有发生变化,如图所示,所以判断该网页采用了异步加载技术。
    在这里插入图片描述

  • 打开Chrome浏览器的开发者工具(按 F12 键),单击Network选项卡,选中 XHR 项,可发现网页加载了用户信息的文件,如图所示, 但并不是我们想获取的信息.
    在这里插入图片描述

  • 依次点击最新评论, 最新收录和热门, 在开发者工具中会看到新的 XHR 产生,这个网址对应的信息就是我们想要获取的文章信息;
    在这里插入图片描述

  • 观察该文件的 Response ,发现返回的是XML文件,内容也正是用户“动态”内容(如图),每个li 标签就是一个用户动态内容。
    在这里插入图片描述

  • 通过下滑浏览,会发现也是使用了异步加载技术进行分页处理的,如图所示,依次记录前几页的URL:

  • https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=2

  • https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=3

  • https://www.jianshu.com/c/22f2ca261b85?order_by=commented_at&page=4

在这里插入图片描述

示例代码

核心代码

# encoding=utf-8
"""
Date:2019-08-15 16:30
User:LiYu
Email:liyu_5498@163.com

"""
import re

import requests
from fake_useragent import UserAgent
import pandas as pd
from concurrent.futures import ThreadPoolExecutor


def get_page(url):
    """
    Fetch a page and return its HTML source.

    Relies on the module-level ``ua`` (fake_useragent.UserAgent) for a
    random User-Agent on every request.

    :param url: absolute URL to fetch
    :return: response body text on HTTP 200, otherwise None
    """
    print('[+] 正在抓取', url)
    # Tail of the URL, sent as a pseudo 'Path' header for article pages
    path = url[-15:]
    try:
        if 'page=' in url:
            # Listing pages only need a randomized User-Agent
            headers = {
                'User-Agent': ua.random
            }
        else:
            # Article pages get the fuller header set the author recorded
            headers = {
                'User-Agent': ua.random,
                'Scheme': 'https',
                'Authority': 'www.jianshu.com',
                'Method': 'GET',
                'Path': path,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Upgrade-Insecure-Requests': '1'

            }
        # timeout keeps a hung connection from stalling a pool worker forever
        response = requests.get(url, headers=headers, timeout=10)
        print('[+] 抓取成功', url, response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # The original caught the *builtin* ConnectionError, which requests'
        # exceptions do not inherit from, so network failures escaped uncaught.
        # RequestException covers connection errors, timeouts, etc.
        print('[-] 抓取失败', url)
        return None


def getArticleHrefs(html):
    """Yield the (still-quoted) href of each article title link in *html*.

    Yields nothing when *html* is None or empty (e.g. a failed fetch);
    each yielded href is also echoed to stdout.
    """
    if not html:
        return
    href_pattern = re.compile(r'<a class="title" target="_blank" href=(.*?)>')
    for href in href_pattern.findall(html):
        print(href)
        yield href


def analyseHtml(html):
    """
    Extract title, author, publish date and word count from an article page.

    :param html: article page source; may be None/empty when the fetch failed
    :return: a one-row pandas DataFrame with columns
             title / author / date / fontCount, or an empty DataFrame
             when *html* is falsy. Missing fields default to ''.
    """
    if not html:
        return pd.DataFrame({})

    findTitle = re.compile(r'<h1 class="title">(.*?)</h1>')
    findAuthor = re.compile(r'<span class="name"><a href=(.*?)>(.*?)</a></span>')
    findDate = re.compile(r'<span class="publish-time" data-toggle="tooltip" '
                          r'data-placement="bottom" title=(.*?)>(.*?)</span>')
    findFontCount = re.compile(r'<span class="wordage">(.*?)</span>')

    # Each field falls back to '' when its pattern has no match. Only
    # IndexError (empty findall result) is expected here; the original
    # bare `except:` clauses would also have hidden genuine bugs.
    try:
        title = findTitle.findall(html)[0]
    except IndexError:
        title = ''
    try:
        author = findAuthor.findall(html)[0][1]
    except IndexError:
        author = ''
    try:
        date = findDate.findall(html)[0][1]
    except IndexError:
        date = ''
    try:
        # lstrip('字数 ') removes any leading '字', '数' or space characters,
        # leaving just the digits of the word count
        fontCount = findFontCount.findall(html)[0].lstrip('字数 ')
    except IndexError:
        fontCount = ''

    return pd.DataFrame([{
        'title': title,
        'author': author,
        'date': date,
        'fontCount': fontCount
    }])


def task(page):
    """Crawl one listing page and yield one DataFrame per article found.

    :param page: 1-based page number of the collection listing
    """
    listing_url = ('https://www.jianshu.com/c/22f2ca261b85'
                   '?order_by=commented_at&page=' + str(page))
    listing_html = get_page(listing_url)
    for quoted_href in getArticleHrefs(listing_html):
        # hrefs are captured with their surrounding quotes — strip them
        article_url = 'https://www.jianshu.com' + quoted_href[1:-1]
        yield analyseHtml(get_page(article_url))


if __name__ == '__main__':
    ua = UserAgent()
    pages = 10
    CsvFileName = 'jianshu.csv'
    # BUG FIX: task() is a generator function, so pool.map(task, ...) only
    # *created* generator objects in the workers; iterating them afterwards
    # in the main thread did all the crawling serially. Wrapping each call
    # in list() forces the work to run inside the pool, and materializing
    # the outer map inside the `with` keeps everything within the pool's
    # lifetime.
    with ThreadPoolExecutor(100) as pool:
        results = list(pool.map(lambda p: list(task(p)), range(2, pages + 1)))
    # Concatenate every per-article DataFrame into one table
    totalDf = pd.DataFrame({})
    for page_dfs in results:
        for df in page_dfs:
            totalDf = pd.concat([totalDf, df], axis=0)
    totalDf.to_csv(CsvFileName, sep=',', header=True, index=False)
    print('文件%s存储成功' % CsvFileName)

数据分析模块

结果演示:
在这里插入图片描述

# encoding=utf-8
"""
Date:2019-08-16 15:02
User:LiYu
Email:liyu_5498@163.com

"""
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

CsvFileName = 'jianshu.csv'
# Configure a Chinese-capable font (SimHei) and the base font size
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.size'] = 12
# Render the minus sign correctly while a CJK font is active
plt.rcParams['axes.unicode_minus'] = False

# Dataset produced by the crawler script (columns: title, author, date, fontCount)
df = pd.read_csv(CsvFileName)


def showFontCount():
    """Draw a pie chart of articles above vs. at/below 1000 characters.

    Reads the module-level ``df`` loaded from the crawler's CSV output.
    """
    fontCount = df['fontCount']
    # Split the word counts at the 1000-character threshold
    fontCountLarge = fontCount[fontCount > 1000]
    fontCountSmall = fontCount[fontCount <= 1000]
    # BUG FIX: both wedge labels previously read '字数大于1000'; the second
    # wedge holds the <=1000 articles, so label it accordingly.
    labels = [u'字数大于1000', u'字数小于等于1000']
    sizes = [len(fontCountLarge), len(fontCountSmall)]
    colors = ['red', 'yellowgreen']
    plt.figure(figsize=(10, 5))
    plt.pie(sizes, labels=labels, colors=colors,
            labeldistance=1.1, autopct='%3.1f%%', shadow=False,
            startangle=90, pctdistance=0.6)
    plt.show()


if __name__ == '__main__':
    # Entry point: draw the word-count distribution pie chart
    showFontCount()
展开阅读全文

没有更多推荐了,返回首页