1.采集
import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
# Data collection: walk the Huya live-room listing page by page with Selenium
# and store one CSV row per room card.
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version accepting it -- confirm before upgrading Selenium.
driver = webdriver.Edge('C:/msedgedriver.exe')
url = "https://www.huya.com/l"
columns = ['room_type', 'room_name', 'room_viewer', 'room_url', 'room_description']
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; this avoids the private DataFrame._append API and the per-row copy of
# the whole frame that it implies.
rows = []
try:
    driver.get(url)
    driver.maximize_window()
    for page in range(1, 101):
        # Report the page currently being scraped.
        print(f'正在爬取第{page}页...')
        # Type the page number into the pager's "skip to" input.
        elem_page_input = driver.find_element(By.CSS_SELECTOR, 'input.laypage_skip')
        elem_page_input.clear()
        elem_page_input.send_keys(str(page))
        # Click the pager's jump button.
        elem_submit = driver.find_element(By.CSS_SELECTOR, 'button.laypage_btn')
        elem_submit.click()
        # Fixed pause to give the new page time to load.
        time.sleep(5)
        anchors = driver.find_elements(By.CSS_SELECTOR, '.live-list>li')
        # No restarts at 1 on every page, as in the original numbering.
        for No, anchor in enumerate(anchors, start=1):
            name = anchor.find_element(By.CLASS_NAME, 'nick').text
            # Named room_type so the builtin type() is not shadowed.
            room_type = anchor.find_element(By.CLASS_NAME, 'game-type').text
            num = anchor.find_element(By.CLASS_NAME, 'js-num').text
            href = anchor.find_element(By.CSS_SELECTOR, 'a').get_attribute("href")
            dc = anchor.find_element(By.CLASS_NAME, 'title').text
            # An f-string keeps the output text unchanged but does not raise
            # when get_attribute() returns None for a missing href.
            print(f"{No}.主播名:{name}, 游戏类型:{room_type}, 人气:{num}, 房间描述:{dc}, 网址:{href}")
            rows.append({
                'room_type': room_type,
                'room_name': name,
                'room_viewer': num,
                'room_url': href,
                'room_description': dc,
            })
finally:
    # Always close the browser, even when a page fails to load or parse.
    driver.quit()
# Passing columns explicitly keeps the header row even when nothing was scraped.
data = pd.DataFrame(rows, columns=columns)
data.to_csv('E:/python/huya.csv', index=False, encoding='utf-8-sig')
2.清洗
import pandas as pd
# Data cleaning: drop rows with missing values, convert the viewer count to a
# number, drop invalid and duplicate rows, then save the result sorted by
# viewer count.
datafile = 'E:/python/huya.csv'
cleandata = 'E:/python/cleanhuya.csv'
data = pd.read_csv(datafile, encoding='utf-8-sig')

print('\n')
print('检查整个data是否存在缺失值:(如果有会自动删除)')
print(data.isnull().any().any())
# Keep only rows that have both a viewer count and a room type.
e1 = pd.notnull(data['room_viewer'])
e2 = pd.notnull(data['room_type'])
# .copy() so the column assignment below modifies an independent frame rather
# than a slice of `data`.
data_notnull = data.loc[e1 & e2, :].copy()

# Convert the viewer count to a float expressed in units of 万 (10,000), the
# unit the visualization step appends to its totals. Values that carry the 万
# suffix are already in that unit; plain values are divided by 10,000 so both
# forms end up on the same scale. Text that cannot be parsed becomes NaN.
viewer_text = data_notnull['room_viewer'].astype(str).str.strip()
in_wan = viewer_text.str.endswith('万')
viewer_num = pd.to_numeric(viewer_text.str.replace('万', '', regex=False), errors='coerce')
data_notnull['room_viewer'] = viewer_num.where(in_wan, viewer_num / 10000)

# Outlier handling: the viewer count must be a number >= 0. The comparison is
# numeric now (not a string comparison) and is False for NaN, so unparseable
# values are removed here as well.
e3 = data_notnull['room_viewer'] >= 0
data_notnull = data_notnull.loc[e3, :]

print('检查整个 DataFrame 是否存在重复行:(如果有会自动删除)')
print(data_notnull.duplicated().any())
# Reassign instead of inplace=True to avoid mutating a slice.
data_notnull = data_notnull.drop_duplicates()

# Sort the cleaned rows (not the raw frame) by viewer count, highest first,
# and save that sorted result.
data_sorted = data_notnull.sort_values(by='room_viewer', ascending=False)
data_sorted.to_csv(cleandata, index=False, encoding='utf-8-sig')
3.数据可视化
from tkinter import messagebox
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
# Visualization 1: bar chart of the number of rooms per live category,
# restricted to categories with more than 100 rooms.
datafile = 'E:/python/cleanhuya.csv'
data = pd.read_csv(datafile, encoding='utf-8-sig')

# Rooms per category, and the number of distinct categories.
room_type_counts = data["room_type"].value_counts()
k = data["room_type"].nunique()
print("数据集中共有 %d 种不同的游戏类型,但只筛选数量大于100的类型" % k)

# Plotting every category would be too crowded, so keep only those with more
# than 100 rooms.
has_over_100_rooms = room_type_counts > 100
room_type_counts_top100 = room_type_counts[has_over_100_rooms]

# Print each kept category with its room count.
for game_type, count in room_type_counts_top100.items():
    print(f"{game_type}: {count}")

# Use the SimHei font for sans-serif text (the chart labels are Chinese).
plt.rcParams['font.sans-serif'] = ['SimHei']

bar_labels = room_type_counts_top100.index
bar_heights = room_type_counts_top100.values
plt.bar(bar_labels, bar_heights, width=0.7)
# Write each bar's count centred just above the bar.
for bar_label, bar_height in zip(bar_labels, bar_heights):
    plt.text(bar_label, bar_height, str(bar_height), ha="center", va="bottom")

plt.xlabel("直播类型")
plt.ylabel("数量")
plt.title("每种类型主播数大于100的统计")

# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45, ha='center', va='top')

# Resize the current figure to 20 x 10 inches.
figure = plt.gcf()
figure.set_size_inches(20, 10)

# Set the default font size to 4.
# NOTE(review): this runs after the chart text above was created -- confirm
# it takes effect the way it was intended.
mpl.rcParams['font.size'] = 4

# Not referenced again in the visible code.
x = np.arange(k)
xticks = np.arange(0, 1000, 50)
# Mouse-click handler for the bar chart.
def on_click(event):
    """Show the room count of the bar nearest to a left click.

    The click's x coordinate is rounded to the nearest integer and used as a
    position into ``room_type_counts_top100``. Clicks that are not left
    clicks, that fall outside the axes, or whose rounded position does not
    correspond to any bar are ignored.

    Args:
        event: Mouse event passed in by matplotlib; ``button``, ``xdata`` and
            ``ydata`` are read from it.

    Returns:
        None.
    """
    # Only react to the left mouse button.
    if event.button != 1:
        return
    x, y = event.xdata, event.ydata
    # xdata / ydata are None when the click is not inside the axes.
    if x is None or y is None:
        return
    index = int(round(x))
    # Without this check a click to the right of the last bar raises
    # IndexError, and a click to the left of the first bar yields a negative
    # index that silently reports the last bar instead.
    if not 0 <= index < len(room_type_counts_top100):
        return
    label = room_type_counts_top100.index[index]
    value = room_type_counts_top100.iloc[index]
    messagebox.showinfo("数量提示", f"{label}: {value}个主播")
# Register the mouse-click handler on the current figure and display it.
cid = plt.gcf().canvas.mpl_connect('button_press_event', on_click)
plt.show()

# Total viewer count per live category.
total_sum = data.groupby('room_type')['room_viewer'].sum()
# Totals for only the categories that have more than 100 rooms. This is taken
# from the numeric totals before they are turned into text; previously the
# second CSV was written from the room counts themselves, which are not
# viewer totals.
total_sum_top100 = total_sum[total_sum.index.isin(room_type_counts_top100.index)]

# Format as whole numbers with the 万 suffix and save all categories.
total_sum = total_sum.apply(lambda x: f'{x:.0f}万')
total_sum.to_csv('E:/python/totalhuya.csv', header=['人气总和'])

# Same format for the categories with more than 100 rooms.
# room_type_counts_top100 itself is left as numeric counts.
total_sum_top100 = total_sum_top100.apply(lambda x: f'{x:.0f}万')
total_sum_top100.to_csv('E:/python/totalhuya100.csv', header=['人气总和'])
print("数据集中共有 %d 种不同的游戏类型,但只筛选数量大于100的类型" % k)