爬取MOOC上某一课程评论区的评论数据(例如:爬取《大学生计算机基础》课程的评论)
1、爬取流程框架
2、爬取数据
** 用户名-namesList、用户ID-user_ID、评论内容-commentList、
评论时间-commentTime、浏览次数-watch_numList、回复次数-reply_numList、
用户个人主页user_indexList、用户的身份信息 -user_infoList
该课程评论区界面
该评论者个人页面
3、数据可视化
1.将爬取的数据以xlsx保存下来
For example:
2.回复次数、浏览次数折线图
For example:
4、具体代码实现
1、先进入MOOC登录页面,进行登录,再进入该课程评论区爬取数据
GetComment.py
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.chrome.options import Options
import re
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait#显示等待函数
from selenium.webdriver.support import expected_conditions as EC
import json
def login(base_url):
    """Log in to the MOOC site with Selenium and persist the session cookies.

    Opens the login page, switches to the email/password form inside its
    iframe, submits the credentials, and dumps the resulting cookies to
    ``cookies.txt`` so later scraping sessions can restore the login.

    Args:
        base_url: URL of the MOOC login page.
    """
    driver = webdriver.Chrome(executable_path=r"下载的chromedriver.位置")
    driver.get(base_url)
    time.sleep(1)
    # Scroll slightly to defeat a simple anti-scraping check.
    driver.execute_script('window.scrollTo(0, 200)')
    # Click the "other login methods" button to reach the email login form.
    # (find_element_by_* was removed in Selenium 4; use the By locators.)
    jump = driver.find_element(By.CLASS_NAME, 'ux-login-set-scan-code_ft_back')
    jump.click()
    # The login form lives inside the first iframe on the page.
    iframe = driver.find_elements(By.TAG_NAME, "iframe")[0]
    driver.switch_to.frame(iframe)
    email = "你的账号"
    password = "你的密码"
    driver.find_element(By.NAME, "email").send_keys(email)
    time.sleep(1)
    driver.find_element(By.NAME, "password").send_keys(password)
    time.sleep(1)
    driver.find_element(By.ID, "dologin").click()  # submit the login form
    time.sleep(1)
    driver.switch_to.default_content()  # leave the iframe
    cookies = driver.get_cookies()  # list of cookie dicts
    # Persist the cookies so later drivers can reuse the session;
    # the context manager guarantees the file is closed.
    with open('cookies.txt', 'w') as f1:
        f1.write(json.dumps(cookies))
    print(cookies)
    print(type(cookies))
    driver.close()
def getpageInfo(url_head, pagenum):
    """Scrape every page of the course forum and collect the comments.

    Args:
        url_head: Base forum URL (e.g. ``...#/learn/forumindex``).
        pagenum: Total number of comment pages to fetch.

    Returns:
        pandas.DataFrame with one row per comment across all pages
        (empty DataFrame if the very first page fails).
    """
    frames = []
    for page_num in range(pagenum):
        try:
            # The pager appends '?t=0&p=<page>' to the forum URL.
            # The original hardcoded a full URL here, silently ignoring
            # the url_head parameter.
            page_url = '{}?t=0&p={}'.format(url_head, page_num + 1)
            print(page_url)
            frames.append(get_comment_detail(page_url))
            time.sleep(1)  # be polite between page fetches
        except Exception as e:
            # Stop on the first failing page, but report why instead of
            # swallowing the error silently.
            print('第{}页爬取失败: {}'.format(page_num + 1, e))
            break
    if not frames:
        return pd.DataFrame()
    # pd.concat replaces DataFrame.append, removed in pandas 2.0.
    return pd.concat(frames, ignore_index=True)
def getCommentPageNum(url_head):
    """Return the total number of comment pages shown in the forum pager.

    Args:
        url_head: URL of the course forum index page.

    Returns:
        str: text of the next-to-last pager link, i.e. the highest page
        number (caller converts it with int()).
    """
    # The original line contained a malformed, doubly-nested path string
    # (a syntax error); replace the placeholder with your local
    # chromedriver path.
    driver = webdriver.Chrome(executable_path=r"下载的chromedriver.位置")
    driver.get(url_head)
    content = driver.page_source
    dom = etree.HTML(content, etree.HTMLParser(encoding='utf-8'))
    # Pager <a> elements; the next-to-last one holds the last page number.
    page_list = dom.xpath('//*[@id="courseLearn-inner-box"]/div/div[7]/div/div[2]/div/div[1]/div[2]/div/a')
    page_Num = page_list[-2].text
    driver.quit()  # always release the browser
    return page_Num
def get_comment_detail(url_head):
    """Scrape one forum page: comments plus each commenter's profile info.

    Restores the cookies saved by login(), parses the comment list on
    *url_head*, then visits every non-anonymous commenter's homepage to
    read their identity string.

    Args:
        url_head: URL of one page of the course forum.

    Returns:
        pandas.DataFrame with username, ID, identity, comment text/time,
        view count, reply count and homepage-URL columns.
    """
    namesList = []       # commenter usernames ('匿名' for anonymous posts)
    commentTime = []     # comment timestamps
    commentList = []     # comment bodies
    watch_numList = []   # view counts
    reply_numList = []   # reply counts
    comment_href = []    # links into each comment thread
    user_href = []       # raw hrefs to commenter forum-personal pages
    user_indexList = []  # commenter homepage URLs
    user_infoList = []   # commenter identity strings
    user_ID = []         # numeric commenter IDs

    # Load the saved cookies once, up front (the original re-opened and
    # never closed cookies.txt inside the per-user loop).
    with open("cookies.txt") as f2:
        cookies = json.loads(f2.read())

    # NOTE: the original line held a malformed nested-quote path string
    # (a syntax error); substitute your local chromedriver path.
    driver = webdriver.Chrome(executable_path=r"下载的chromedriver.位置")
    driver.get(url_head)
    for cook in cookies:
        driver.add_cookie(cook)
    driver.refresh()  # reload so the cookies take effect
    time.sleep(2)
    print("成功进入该课程讨论区")
    content = driver.page_source
    dom = etree.HTML(content, etree.HTMLParser(encoding='utf-8'))
    comment_l = dom.xpath('//*[@id="courseLearn-inner-box"]/div/div[7]/div/div[2]/div/div[1]/div[1]/li')
    # Pull the per-comment fields out of every <li> on this page.
    for li in comment_l:
        commentList.append(li.xpath('./div/a/text()'))
        comment_href.append(li.xpath('./div/a/@href'))
        watch_numList.append(li.xpath('./p[1]/text()'))
        reply_numList.append(li.xpath('./p[2]/text()'))
        commentTime.append(li.xpath('./span/span[1]/span[2]/text()'))
        name = li.xpath('./span/span[1]/span[1]/span/span[2]/a/@title')
        # Anonymous posters have no @title attribute on the author link.
        if not name:
            namesList.append(['匿名'])
        else:
            namesList.append(name)
        ushref = li.xpath('./span/span[1]/span[1]/span/span[2]/a/@href')
        if not ushref:
            ID = '匿名'
        else:
            # href looks like /learn/forumpersonal?uid=1028283590 —
            # pull out the numeric uid.
            for j in ushref:
                s = ' '.join(str(i) for i in j)
                st = re.findall('([0-9]{1,15})', s)
                ID = ''.join(st)
        user_href.append(ushref)
        user_ID.append(ID)

    # Build absolute links to each commenter's forum-personal page.
    user_hrefList = []
    for href in user_href:
        if not href:
            user_hrefList.append('匿名')
        else:
            s = ' '.join(str(j) for j in href)
            user_hrefList.append(url_head.split('#')[0] + s)
    print(user_hrefList)

    # Turn forum-personal links into homepage links, e.g.
    # .../learn/...?uid=123  ->  .../home.htm?userId=123#/home/discuss?page=1
    for link in user_hrefList:
        if link == '匿名':
            user_indexList.append('匿名')
        else:
            uid = link.split('uid=')[1]
            print(uid)
            user_link = link.split('/learn')[0] + '/home.htm?userId=' + uid + '#/home/discuss?page=1'
            user_indexList.append(user_link)

    # Visit each homepage (fresh browser per user, like the original)
    # to grab the identity text. A dedicated variable plus try/finally
    # fixes the original's driver leak: it rebound `driver`, so the
    # page-level browser was never closed.
    for index_url in user_indexList:
        if index_url == '匿名':
            user_infoList.append('匿名')
            continue
        sub_driver = webdriver.Chrome(executable_path=r"下载的chromedriver.位置")
        try:
            sub_driver.get(index_url)
            time.sleep(2)
            for cook in cookies:
                sub_driver.add_cookie(cook)
            sub_driver.refresh()
            time.sleep(2)
            # Scroll slightly to defeat a simple anti-scraping check.
            sub_driver.execute_script('window.scrollTo(0, 200)')
            print('成功进入用户个人主页')
            dom2 = etree.HTML(sub_driver.page_source, etree.HTMLParser(encoding='utf-8'))
            user_infoList.append(dom2.xpath('//*[@id="j-self-content"]/div/div[3]/span/text()'))
        finally:
            sub_driver.quit()  # never leak a browser, even on error

    tmp = pd.DataFrame({
        '用户名': namesList,
        'ID': user_ID,
        '用户身份': user_infoList,
        '评论内容': commentList,
        '评论时间': commentTime,
        '浏览次数': watch_numList,
        '回复次数': reply_numList,
        '用户主页网址:': user_indexList,
    })
    driver.quit()  # close the page-level browser
    return tmp
if __name__ == '__main__':
    # Step 1: log in once and save the session cookies to cookies.txt —
    # the forum content is only visible to logged-in users.
    login_page = "https://www.icourse163.org/member/login.htm#/webLoginIndex"
    login(login_page)
    print("登陆成功!")

    # Steps 2-3: while logged in, discover how many comment pages the
    # course (《大学生计算机基础》) has, then scrape them all.
    forum_url = 'https://www.icourse163.org/learn/CAU-23004?tid=1002299017#/learn/forumindex'
    total_pages = getCommentPageNum(forum_url)
    print('该课程一共有' + total_pages + '页评论')
    df_all = getpageInfo(forum_url, int(total_pages))

    # Step 4: persist everything to an Excel workbook.
    df_all.to_excel('comment.xlsx')
2、对爬取的数据进行简单格式化处理
处理前:
处理后:
Data格式处理.py
import xlrd
import pandas as pd
#格式处理浏览次数数据
def extractwatch_num(commentpath):
    """Return the cleaned view-count strings from the comment workbook.

    Reads column 6 ('浏览次数') of the first sheet, skipping the header
    row, and slices off the fixed wrapper the scraper left around each
    number.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    # Column 6 holds the raw view-count cell; row 0 is the header.
    raw = [sheet.row_values(r)[6] for r in range(1, sheet.nrows)]
    # Keep characters 5..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[5:-2] for cell in raw]
#格式处理回复次数数据
def extractreply_num(commentpath):
    """Return the cleaned reply-count strings from the comment workbook.

    Reads column 7 ('回复次数') of the first sheet, skipping the header
    row, and strips the fixed wrapper around each number.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    # Column 7 holds the raw reply-count cell; row 0 is the header.
    raw = [sheet.row_values(r)[7] for r in range(1, sheet.nrows)]
    # Keep characters 5..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[5:-2] for cell in raw]
#格式处理用户个人主页数据(保持原样即可)
def extractuser_index(commentpath):
    """Return the user-homepage URLs from the comment workbook, unchanged.

    Reads column 8 ('用户主页网址') of the first sheet; these cells are
    already usable URLs, so no cleaning is applied.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    return [sheet.row_values(r)[8] for r in range(1, sheet.nrows)]
#格式处理用户名数据
def extractusername(commentpath):
    """Return the cleaned usernames from the comment workbook.

    Reads column 1 ('用户名') of the first sheet, skipping the header
    row, and strips the list-literal wrapper around each name.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    raw = [sheet.row_values(r)[1] for r in range(1, sheet.nrows)]
    # Keep characters 2..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[2:-2] for cell in raw]
#格式处理用户ID数据(保持原样即可)
def extractuserid(commentpath):
    """Return the user IDs from the comment workbook, unchanged.

    Reads column 2 ('ID') of the first sheet; the IDs were stored as
    plain values, so no cleaning is applied.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    return [sheet.row_values(r)[2] for r in range(1, sheet.nrows)]
#格式处理用户身份数据
def extractuserinfor(commentpath):
    """Return the cleaned user-identity strings from the comment workbook.

    Reads column 3 ('用户身份') of the first sheet, skipping the header
    row, and strips the list-literal wrapper around each value.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    raw = [sheet.row_values(r)[3] for r in range(1, sheet.nrows)]
    # Keep characters 2..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[2:-2] for cell in raw]
#格式处理评论内容数据
def extractcommentlist(commentpath):
    """Return the cleaned comment bodies from the comment workbook.

    Reads column 4 ('评论内容') of the first sheet, skipping the header
    row, and strips the list-literal wrapper around each comment.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    raw = [sheet.row_values(r)[4] for r in range(1, sheet.nrows)]
    # Keep characters 2..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[2:-2] for cell in raw]
#格式处理评论时间数据
def extractcommenttime(commentpath):
    """Return the cleaned comment timestamps from the comment workbook.

    Reads column 5 ('评论时间') of the first sheet, skipping the header
    row, and strips the wrapper around each timestamp.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    raw = [sheet.row_values(r)[5] for r in range(1, sheet.nrows)]
    # Keep characters 2..-4 of each cell (this column's wrapper has a
    # longer tail), matching the original slicing.
    return [''.join(str(ch) for ch in cell)[2:-4] for cell in raw]
#对数据进行格式处理形成新的pd数据
def dataProcess(commentpath):
    """Assemble every cleaned column into a single DataFrame.

    Args:
        commentpath: Path to the raw comment workbook (comment.xlsx).

    Returns:
        pandas.DataFrame with the same column labels as the raw export.
    """
    columns = {
        '用户名': extractusername(commentpath),
        'ID': extractuserid(commentpath),
        '用户身份': extractuserinfor(commentpath),
        '评论内容': extractcommentlist(commentpath),
        '评论时间': extractcommenttime(commentpath),
        '浏览次数': extractwatch_num(commentpath),
        '回复次数': extractreply_num(commentpath),
        '用户主页网址:': extractuser_index(commentpath),
    }
    return pd.DataFrame(columns)
if __name__ == '__main__':
    # Clean the raw scrape and write the formatted workbook.
    cleaned = dataProcess('comment.xlsx')
    cleaned.to_excel('格式处理后comment.xlsx')
3、可视化
Draw频数折线图.py
import xlrd
import re
import matplotlib.pyplot as plt
import numpy as np
#处理浏览次数数据
def extractwatch_num(commentpath):
    """Return the cleaned view-count strings from the comment workbook.

    Reads column 6 ('浏览次数') of the first sheet, skipping the header
    row, and slices off the fixed wrapper around each number.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    # Column 6 holds the raw view-count cell; row 0 is the header.
    raw = [sheet.row_values(r)[6] for r in range(1, sheet.nrows)]
    # Keep characters 5..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[5:-2] for cell in raw]
#处理回复次数数据
def extractreply_num(commentpath):
    """Return the cleaned reply-count strings from the comment workbook.

    Reads column 7 ('回复次数') of the first sheet, skipping the header
    row, and slices off the fixed wrapper around each number.
    """
    book = xlrd.open_workbook(commentpath, encoding_override='utf-8')
    sheet = book.sheets()[0]
    # Column 7 holds the raw reply-count cell; row 0 is the header.
    raw = [sheet.row_values(r)[7] for r in range(1, sheet.nrows)]
    # Keep characters 5..-2 of each cell, as the original slicing did.
    return [''.join(str(ch) for ch in cell)[5:-2] for cell in raw]
#可视化浏览次数数据
def drawatch(dict):
    """Plot a line chart of view-count frequencies.

    Args:
        dict: mapping of view count (numeric string) -> frequency.
            (Keeps its historical name for compatibility even though it
            shadows the builtin.)
    """
    plt.figure()
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    plt.rcParams['axes.unicode_minus'] = False    # render minus signs
    # Convert the string keys straight to int. The original went
    # list -> str -> np.array -> np.int, and np.int was removed in
    # NumPy 1.24, so that chain now raises AttributeError.
    x = [int(k) for k in dict.keys()]
    y = list(dict.values())
    plt.subplot(111)
    plt.xticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 300, 400, 500, 600, 700, 800])
    plt.yticks([2, 4, 6, 8, 10, 12])
    # Give the line a label so plt.legend() has something to show
    # (an empty legend triggers a warning).
    plt.plot(x, y, label='频数')
    plt.xlabel('浏览次数')
    plt.ylabel('频数')
    plt.title('评论浏览次数折线图')
    plt.legend()
    plt.show()
#可视化回复次数数据
def drawreply(dict):
    """Plot a line chart of reply-count frequencies.

    Args:
        dict: mapping of reply count (numeric string) -> frequency.
            (Keeps its historical name for compatibility even though it
            shadows the builtin.)
    """
    plt.figure()
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    plt.rcParams['axes.unicode_minus'] = False    # render minus signs
    # np.int was removed in NumPy 1.24; plain int() is all that's needed.
    x = [int(k) for k in dict.keys()]
    y = list(dict.values())
    print(x)
    print(y)
    plt.subplot(111)
    plt.xticks([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700])
    plt.yticks([2, 4, 6, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150])
    # Label the line so plt.legend() has an artist to display.
    plt.plot(x, y, label='频数')
    plt.xlabel('回复次数')
    plt.ylabel('频数')
    plt.legend()
    plt.show()
#将浏览\回复次数数据进行计数处理
def Count(List):
    """Tally how often each count value occurs.

    Each element of *List* may itself be a comma-separated run of
    values (e.g. "1,076"); every comma-separated piece is counted as a
    separate key, exactly as the original did.

    Args:
        List: iterable of count strings.

    Returns:
        dict mapping each piece -> number of occurrences.
    """
    # Use a non-shadowing name (the original called this `dict`) and
    # dict.get to avoid the double lookup of `if key in d.keys()`.
    freq = {}
    for area in List:
        for key in area.split(","):
            freq[key] = freq.get(key, 0) + 1
    return freq
if __name__ == '__main__':
    # Extract the two numeric columns, tally their frequencies, and
    # plot each as a line chart.
    views = extractwatch_num('comment.xlsx')
    replies = extractreply_num('comment.xlsx')
    view_freq = Count(views)
    reply_freq = Count(replies)
    print(view_freq)
    print(reply_freq)
    drawatch(view_freq)
    drawreply(reply_freq)
5、总结:
此爬虫涉及关键技术:
- 使用selenium库实现模拟登陆
selenium是进行自动化测试的一种库,配合浏览器相对应的webdriver,可以模拟浏览器行为登录,大大方便、简化了登录操作。因为我用的是Chrome浏览器做的测试,所以要下载与Chrome对应的Chrome webdriver版本