Scraping Comment Data from a MOOC Course Discussion Board

Scrape the comments from the discussion board of a course on icourse163 (MOOC), using "Fundamentals of Computing for College Students" (《大学生计算机基础》) as the example.

1. Crawling workflow

[Figure: crawling workflow diagram]
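The diagram itself is not reproduced here; roughly, the pipeline is: log in once, persist the cookies, read the total page count, scrape every page, and save the results. A minimal sketch in terms of the functions defined in section 4 below:

# Overall pipeline (a sketch; login, getCommentPageNum and getpageInfo
# are the functions defined in GetComment.py in section 4)
def crawl_course_comments(login_url, forum_url):
    login(login_url)                           # step 1: log in, persist cookies.txt
    pages = int(getCommentPageNum(forum_url))  # step 2: read the total page count
    df = getpageInfo(forum_url, pages)         # step 3: scrape every page
    df.to_excel('comment.xlsx')                # step 4: save the results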

2. Scraped fields

Username (namesList), user ID (user_ID), comment text (commentList),
comment time (commentTime), view count (watch_numList), reply count (reply_numList),
user profile page (user_indexList), and user role information (user_infoList).
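For illustration, a single scraped row might look like the following (a sketch; every value is invented):

# One hypothetical scraped record (values invented for illustration;
# list-valued fields come straight from the XPath queries)
record = {
    'username':      ['ZhangSan'],
    'ID':            '1028283590',
    'user_role':     ['学生'],
    'comment':       ['老师讲得很好!'],
    'comment_time':  ['2020-03-01 12:34'],
    'views':         ['123'],
    'replies':       ['4'],
    'user_homepage': 'https://www.icourse163.org/home.htm?userId=1028283590#/home/discuss?page=1',
}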

[Figure: the course discussion board page]

[Figure: a commenter's profile page]

3. Data visualization

1. Save the scraped data as an .xlsx file

For example:
[Figure: sample of the saved spreadsheet]
2. Line charts of reply counts and view counts

For example:
[Figure: reply-count / view-count line chart]

4. Code implementation

1. First open the MOOC login page and log in, then enter the course discussion board and scrape the data.
GetComment.py

from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.chrome.options import Options
import re
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # explicit-wait helper
from selenium.webdriver.support import expected_conditions as EC
import json

def login(base_url):
    driver = webdriver.Chrome(executable_path=r"path\to\your\chromedriver")  # path to the downloaded chromedriver
    driver.get(base_url)
    time.sleep(1)
    driver.execute_script('window.scrollTo(0, 200)')  # the site blocks naive scraping; scrolling the page works around it

    # locate the "other login methods" button
    jump = driver.find_element_by_class_name('ux-login-set-scan-code_ft_back')
    # click it to switch to the username/password login view
    jump.click()

    # the login form lives inside an iframe
    iframe = driver.find_elements_by_tag_name("iframe")[0]
    driver.switch_to.frame(iframe)

    email = "your_account"
    password = "your_password"

    driver.find_element_by_name("email").send_keys(email)
    time.sleep(1)
    driver.find_element_by_name("password").send_keys(password)
    time.sleep(1)
    driver.find_element_by_id("dologin").click()  # click the login button
    time.sleep(1)

    driver.switch_to.default_content()  # leave the iframe

    cookies = driver.get_cookies()  # list of cookie dicts
    with open('cookies.txt', 'w') as f1:
        f1.write(json.dumps(cookies))
    print(cookies)
    driver.close()

def getpageInfo(url_head, pagenum):
    df_all = pd.DataFrame()
    for page_num in range(0, pagenum):
        try:
            # build the URL for each page of the discussion board
            url_head = 'https://www.icourse163.org/learn/CAU-23004?tid=1002299017#/learn/forumindex?t=0&p={}'.format(page_num + 1)
            print(url_head)
            # scrape one page of comments
            tmp = get_comment_detail(url_head)
            df_all = pd.concat([df_all, tmp], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
            # print progress
            # print('fetching page {}'.format(page_num + 1), end='\r')
            # sleep one second between pages
            time.sleep(1)
        except Exception:
            break
    return df_all

def getCommentPageNum(url_head):
    driver = webdriver.Chrome(executable_path=r"path\to\your\chromedriver")
    driver.get(url_head)
    content = driver.page_source
    dom = etree.HTML(content, etree.HTMLParser(encoding='utf-8'))
    page_list = dom.xpath('//*[@id="courseLearn-inner-box"]/div/div[7]/div/div[2]/div/div[1]/div[2]/div/a')  # pagination links
    page_Num = page_list[-2].text  # the second-to-last link holds the total page count
    driver.quit()  # remember to close the browser
    return page_Num

def get_comment_detail(url_head):
    namesList = []       # usernames of the commenters
    commentTime = []     # comment timestamps
    commentList = []     # comment text
    watch_numList = []   # view counts
    reply_numList = []   # reply counts
    comment_href = []    # links to each comment thread
    user_href = []       # relative links to each commenter's forum page
    user_indexList = []  # commenters' profile pages
    user_infoList = []   # commenters' role information
    user_ID = []         # commenters' IDs

    driver = webdriver.Chrome(executable_path=r"path\to\your\chromedriver")
    driver.get(url_head)
    f2 = open("cookies.txt")
    cookies = json.loads(f2.read())
    # log in by replaying the saved cookies
    for cook in cookies:
        driver.add_cookie(cook)
    # refresh so the cookies take effect
    driver.refresh()
    time.sleep(2)

    print("Entered the course discussion board")
    content = driver.page_source
    dom = etree.HTML(content, etree.HTMLParser(encoding='utf-8'))
    comment_l = dom.xpath('//*[@id="courseLearn-inner-box"]/div/div[7]/div/div[2]/div/div[1]/div[1]/li')
    # walk the list of comments on this page
    for li in comment_l:
        commentList.append(li.xpath('./div/a/text()'))
        comment_href.append(li.xpath('./div/a/@href'))
        watch_numList.append(li.xpath('./p[1]/text()'))
        reply_numList.append(li.xpath('./p[2]/text()'))
        commentTime.append(li.xpath('./span/span[1]/span[2]/text()'))
        name = li.xpath('./span/span[1]/span[1]/span/span[2]/a/@title')
        # some comments are posted anonymously
        if name == []:
            namesList.append(['Anonymous'])
        else:
            namesList.append(name)
        ushref = li.xpath('./span/span[1]/span[1]/span/span[2]/a/@href')
        if ushref == []:
            ID = 'Anonymous'
        else:
            for j in ushref:
                s = ' '.join(str(i) for i in j)      # j is e.g. "/learn/forumpersonal?uid=1028283590"
                st = re.findall('([0-9]{1,15})', s)  # the spaced-out string yields one digit per match
                ID = ''.join(st)                     # join the digits back into the ID, e.g. "1028283590"
        user_href.append(ushref)
        user_ID.append(ID)


    user_hrefList = []
    for i in user_href:  # build absolute links to each user's forum page
        if i == []:
            user_hrefList.append('Anonymous')
        else:
            s = ' '.join(str(j) for j in i)
            user_hrefList.append(url_head.split('#')[0] + s)
    print(user_hrefList)

    # build each user's profile-page URL (user_indexList) from the user ID, e.g.
    #   forum page:   https://www.icourse163.org/learn/CAU-23004?tid=1002299017#/learn/forumpersonal?uid=1028283590
    #   profile page: https://www.icourse163.org/home.htm?userId=1028283590#/home/discuss?page=1
    for i in user_hrefList:
        if i == 'Anonymous':
            user_indexList.append('Anonymous')
        else:
            s = ''.join(str(j) for j in i)
            t = s.split('uid=')[1]
            print(t)
            user_link = s.split('/learn')[0] + '/home.htm?userId=' + t + '#/home/discuss?page=1'  # assemble the profile URL
            user_indexList.append(user_link)
    # print(user_indexList)


    # visit every profile page to read the user's role information
    for i in user_indexList:
        if i == 'Anonymous':
            user_infoList.append('Anonymous')
        else:
            driver = webdriver.Chrome(executable_path=r"path\to\your\chromedriver")
            driver.get(i)
            time.sleep(2)
            f2 = open("cookies.txt")
            cookies = json.loads(f2.read())
            # log in by replaying the saved cookies
            for cook in cookies:
                driver.add_cookie(cook)
            # refresh so the cookies take effect
            driver.refresh()
            time.sleep(2)
            driver.execute_script('window.scrollTo(0, 200)')  # scroll to get past the anti-scraping check
            print('Entered the user profile page')
            content = driver.page_source
            dom2 = etree.HTML(content, etree.HTMLParser(encoding='utf-8'))
            user_infoList.append(dom2.xpath('//*[@id="j-self-content"]/div/div[3]/span/text()'))
            driver.quit()
    # print(user_infoList)

    tmp = pd.DataFrame({
        'username': namesList,
        'ID': user_ID,
        'user_role': user_infoList,
        'comment': commentList,
        'comment_time': commentTime,
        'views': watch_numList,
        'replies': reply_numList,
        'user_homepage': user_indexList,
    })
    driver.quit()  # remember to close the browser
    return tmp

if __name__ == '__main__':

    '''
        step 1: simulate login and save the cookies --
        the comment details are only visible to logged-in users
    '''
    base_url = "https://www.icourse163.org/member/login.htm#/webLoginIndex"
    login(base_url)
    print("Logged in!")

    '''
        step 2: enter the course discussion board as a logged-in user
    '''
    # scrape the discussion board of "Fundamentals of Computing for College Students"
    url_head = 'https://www.icourse163.org/learn/CAU-23004?tid=1002299017#/learn/forumindex'

    '''
        step 3: page through the board and collect every comment
    '''
    pagenum = getCommentPageNum(url_head)
    print('This course has ' + pagenum + ' pages of comments')
    pagenum = int(pagenum)
    df_all = getpageInfo(url_head, pagenum)

    '''
        step 4: persist the scraped comments
    '''
    df_all.to_excel('comment.xlsx')
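Note that GetComment.py imports WebDriverWait and expected_conditions but never uses them; the fixed time.sleep calls could be replaced with explicit waits, which poll until a condition holds instead of always sleeping. A minimal sketch (the locator is an assumption; adapt it to the actual login form):

# A sketch of replacing fixed sleeps with an explicit wait.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_login_form(driver, timeout=10):
    # block until the email field inside the login iframe is visible,
    # or raise TimeoutException after `timeout` seconds
    return WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.NAME, "email"))
    )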

2. Basic formatting of the scraped data

Before:
[Figure: data before formatting]
After:
[Figure: data after formatting]
Data格式处理.py (the data-formatting script)

import xlrd
import pandas as pd

# format the view-count column
def extractwatch_num(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    watch_num = []
    watch_numList = []
    for i in range(1, nrows):  # row 0 is the header
        areaAFpiece = table.row_values(i)  # read one row
        result = areaAFpiece[6]  # the view-count column
        watch_num.append(result)
    for i in watch_num:
        num = ''.join(str(a) for a in i)
        num = num[5:-2]  # strip the fixed characters around the number
        watch_numList.append(num)
    return watch_numList

# format the reply-count column
def extractreply_num(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    reply_num = []
    reply_numList = []
    for i in range(1, nrows):  # row 0 is the header
        areaAFpiece = table.row_values(i)  # read one row
        result = areaAFpiece[7]  # the reply-count column
        reply_num.append(result)
    for i in reply_num:
        num = ''.join(str(a) for a in i)
        num = num[5:-2]  # strip the fixed characters around the number
        reply_numList.append(num)
    return reply_numList

# format the user profile-page column (kept as-is)
def extractuser_index(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    userindex_List = []
    for i in range(1, nrows):  # row 0 is the header
        userindex = table.row_values(i)  # read one row
        result = userindex[8]  # the profile-page column
        userindex_List.append(result)
    return userindex_List

# format the username column
def extractusername(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    usernameList = []
    username_List = []
    for i in range(1, nrows):  # row 0 is the header
        username = table.row_values(i)  # read one row
        result = username[1]  # the username column
        usernameList.append(result)
    for i in usernameList:
        name = ''.join(str(a) for a in i)
        name = name[2:-2]  # strip the "['...']" wrapper
        username_List.append(name)
    return username_List

# format the user-ID column (kept as-is)
def extractuserid(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    userIDList = []
    for i in range(1, nrows):  # row 0 is the header
        id = table.row_values(i)  # read one row
        result = id[2]  # the user-ID column
        userIDList.append(result)
    return userIDList

# format the user-role column
def extractuserinfor(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    userinfoList = []
    user_infoList = []
    for i in range(1, nrows):  # row 0 is the header
        userinfo = table.row_values(i)  # read one row
        result = userinfo[3]  # the user-role column
        userinfoList.append(result)
    for i in userinfoList:
        name = ''.join(str(a) for a in i)
        name = name[2:-2]  # strip the "['...']" wrapper
        user_infoList.append(name)
    return user_infoList

# format the comment-text column
def extractcommentlist(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    commentList = []
    comment_List = []
    for i in range(1, nrows):  # row 0 is the header
        comment = table.row_values(i)  # read one row
        result = comment[4]  # the comment-text column
        commentList.append(result)
    for i in commentList:
        name = ''.join(str(a) for a in i)
        name = name[2:-2]  # strip the "['...']" wrapper
        comment_List.append(name)
    return comment_List

# format the comment-time column
def extractcommenttime(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    commentTime = []
    commentTimeList = []
    for i in range(1, nrows):  # row 0 is the header
        time = table.row_values(i)  # read one row
        result = time[5]  # the comment-time column
        commentTime.append(result)
    for i in commentTime:
        name = ''.join(str(a) for a in i)
        name = name[2:-4]  # strip the wrapper and the trailing characters
        commentTimeList.append(name)
    return commentTimeList

# assemble the formatted columns into a new DataFrame
def dataProcess(commentpath):
    watch_numList = extractwatch_num(commentpath)
    reply_numList = extractreply_num(commentpath)
    userindex_List = extractuser_index(commentpath)
    userIDList = extractuserid(commentpath)
    usernameList = extractusername(commentpath)
    user_infoList = extractuserinfor(commentpath)
    comment_List = extractcommentlist(commentpath)
    commentTimeList = extractcommenttime(commentpath)
    tf = pd.DataFrame({
        'username': usernameList,
        'ID': userIDList,
        'user_role': user_infoList,
        'comment': comment_List,
        'comment_time': commentTimeList,
        'views': watch_numList,
        'replies': reply_numList,
        'user_homepage': userindex_List,
    })
    return tf


if __name__ == '__main__':
    tf = dataProcess('comment.xlsx')
    tf.to_excel('comment_formatted.xlsx')
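As an aside, the xlrd boilerplate above could be condensed with pandas. A sketch, under the assumption that comment.xlsx was written by GetComment.py so most cells hold the string form of a one-element list (e.g. "['张三']"):

# A compact pandas alternative (a sketch, not the script used above).
import pandas as pd

def clean_comment_file(path='comment.xlsx'):
    df = pd.read_excel(path, index_col=0)
    # strip the "['...']" decoration from both ends of every cell
    return df.applymap(lambda v: str(v).strip("[]'\" "))

if __name__ == '__main__':
    clean_comment_file().to_excel('comment_formatted.xlsx')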

3. Visualization
Draw频数折线图.py (the frequency line chart script)

import xlrd
import re
import matplotlib.pyplot as plt
import numpy as np

# parse the view-count column
def extractwatch_num(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    watch_num = []
    watch_numList = []
    for i in range(1, nrows):  # row 0 is the header
        areaAFpiece = table.row_values(i)  # read one row
        result = areaAFpiece[6]  # the view-count column
        watch_num.append(result)
    for i in watch_num:
        num = ''.join(str(a) for a in i)
        num = num[5:-2]  # strip the fixed characters around the number
        watch_numList.append(num)
    return watch_numList

# parse the reply-count column
def extractreply_num(commentpath):
    data = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    table = data.sheets()[0]  # select the first sheet
    nrows = table.nrows  # row count
    ncols = table.ncols  # column count
    reply_num = []
    reply_numList = []
    for i in range(1, nrows):  # row 0 is the header
        areaAFpiece = table.row_values(i)  # read one row
        result = areaAFpiece[7]  # the reply-count column
        reply_num.append(result)
    for i in reply_num:
        num = ''.join(str(a) for a in i)
        num = num[5:-2]  # strip the fixed characters around the number
        reply_numList.append(num)
    return reply_numList

# plot the view-count frequency distribution
def drawatch(freq):
    fig = plt.figure()
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK characters in labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly
    x = list(freq.keys())
    x = list(map(str, x))  # make sure every key is a string
    # convert the numeric strings to ints (np.int was removed from numpy; use plain int)
    x = np.array(x).astype(int).tolist()
    y = list(freq.values())
    # sort the points by x so the line reads left to right
    x, y = zip(*sorted(zip(x, y)))
    plt.subplot(111)

    plt.xticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 300, 400, 500, 600, 700, 800])
    plt.yticks([2, 4, 6, 8, 10, 12])
    plt.plot(x, y, label='views')  # the label appears once legend() is called
    # plt.scatter(x, y, c='r', marker='o')
    plt.xlabel('views')
    plt.ylabel('frequency')
    # plt.title('Scatter plot of comment view counts')
    plt.title('Line chart of comment view counts')

    # plt.grid(axis='y')
    # annotate each point with its value:
    # for a, b in zip(x, y):
    #     plt.text(a, b, '%d' % b, ha='center', va='bottom', fontsize=9)

    plt.legend()  # required for the label to appear
    plt.show()

# plot the reply-count frequency distribution
def drawreply(freq):
    fig = plt.figure()
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK characters in labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly
    x = list(freq.keys())
    x = list(map(str, x))  # make sure every key is a string
    # convert the numeric strings to ints (np.int was removed from numpy; use plain int)
    x = np.array(x).astype(int).tolist()
    y = list(freq.values())
    # sort the points by x so the line reads left to right
    x, y = zip(*sorted(zip(x, y)))
    print(x)
    print(y)
    plt.subplot(111)

    plt.xticks([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700])
    plt.yticks([2, 4, 6, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150])

    plt.plot(x, y, label='replies')  # the label appears once legend() is called
    # plt.scatter(x, y, c='r', marker='o')

    plt.xlabel('replies')
    plt.ylabel('frequency')

    # plt.title('Scatter plot of comment reply counts')
    plt.title('Line chart of comment reply counts')

    # plt.grid(axis='y')
    # annotate each point with its value:
    # for a, b in zip(x, y):
    #     plt.text(a, b, '%d' % b, ha='center', va='bottom', fontsize=9)

    plt.legend()  # required for the label to appear
    plt.show()

# tally the view/reply counts into a frequency table
def Count(List):
    counts = {}
    for area in List:
        keys = area.split(",")
        for key in keys:
            if key in counts.keys():
                counts[key] = counts[key] + 1
            else:
                counts[key] = 1
    return counts

if __name__ == '__main__':
    watch_numList=extractwatch_num('comment.xlsx')
    reply_numList=extractreply_num('comment.xlsx')
    dict1=Count(watch_numList)
    dict2=Count(reply_numList)
    print(dict1)
    print(dict2)
    drawatch(dict1)
    drawreply(dict2)
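The hand-rolled Count() above is equivalent to collections.Counter from the standard library; a one-line sketch:

# equivalent frequency tally with the standard library
from collections import Counter

watch_counts = dict(Counter(extractwatch_num('comment.xlsx')))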

5. Summary

Key techniques in this crawler:

  • Simulated login with selenium
    selenium is a browser-automation/testing library; paired with the webdriver matching your browser, it can drive a real login flow, which greatly simplifies authentication. Chrome was used for this project, so the chromedriver version must match the installed Chrome version (see the Selenium 4 sketch below for the current way to pass the driver path).
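# With Selenium 4+, the driver path is passed through a Service object instead
# of executable_path (a sketch; the path is a placeholder for wherever your
# chromedriver is installed):
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(r"path\to\your\chromedriver"))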