1. 引言
1.1 为什么要抓取游戏数据?
随着游戏产业的快速发展,玩家和开发者都希望获得最新的游戏数据,例如:
- 游戏排行榜:了解热门游戏趋势,如 Steam、TapTap、App Store、Google Play 等平台上的榜单变化。
- 玩家数据:分析玩家行为,如在线时长、胜率、最受欢迎的游戏等。
- 市场研究:帮助开发者优化游戏设计,调整运营策略。
1.2 本文目标
本教程将使用 Python 爬虫抓取游戏排行榜数据,并使用数据分析和可视化技术进行深度分析,具体内容包括:
- 爬取数据:使用 requests 和 BeautifulSoup 抓取游戏排行榜及玩家数据。
- 数据清洗与存储:使用 pandas 处理数据,并存入 SQLite 数据库。
- 数据分析:分析热门游戏趋势、玩家偏好等。
- 数据可视化:利用 matplotlib 和 seaborn 绘制排行榜变化趋势。
- 预测未来趋势:使用机器学习模型预测未来热门游戏。
2. 环境准备
2.1 安装所需库
我们需要以下 Python 库:
bash
复制编辑
# sqlite3 属于 Python 标准库,无需(也无法)通过 pip 安装
pip install requests beautifulsoup4 pandas matplotlib seaborn numpy scikit-learn
2.2 导入依赖库
python
复制编辑
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
3. 数据爬取
3.1 目标网站分析
我们以 Steam 游戏排行榜 为例,爬取热门游戏排行榜的数据,包括:
- 排名(Rank)
- 游戏名称(Game Name)
- 当前在线玩家数(Current Players)
- 峰值在线人数(Peak Players)
- 游戏价格(Price)
3.2 发送请求获取网页内容
python
复制编辑
def fetch_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or a network-level failure).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code, so callers only ever need to check for None.
        print(f"请求异常:{exc}")
        return None
    if response.status_code != 200:
        print(f"请求失败,状态码:{response.status_code}")
        return None
    return response.text
3.3 解析 HTML 获取数据
python
复制编辑
def parse_game_data(html):
    """Extract one dict per ``<tr class="game-row">`` found in *html*.

    Each returned dict has the keys ``rank`` (int), ``name`` (str),
    ``current_players`` (int), ``peak_players`` (int) and ``price`` (str).
    Thousands separators (``,``) in the player counts are removed before
    conversion.

    Rows that lack one of the expected cells, or whose rank / player counts
    are not integers, are skipped instead of aborting the whole parse.

    Args:
        html: HTML markup of the ranking page.

    Returns:
        A list of game dicts in document order (possibly empty).
    """
    soup = BeautifulSoup(html, "html.parser")
    cell_classes = ("rank", "game-name", "current-players", "peak-players", "price")
    games = []
    for row in soup.find_all("tr", class_="game-row"):
        cells = [row.find("td", class_=css_class) for css_class in cell_classes]
        # ``find`` returns None when a cell is absent; such a row cannot be
        # turned into a complete record.
        if any(cell is None for cell in cells):
            continue
        rank, name, current_players, peak_players, price = (
            cell.text.strip() for cell in cells
        )
        try:
            rank_value = int(rank)
            current_value = int(current_players.replace(",", ""))
            peak_value = int(peak_players.replace(",", ""))
        except ValueError:
            # Non-numeric placeholder in a numeric cell: skip this row.
            continue
        games.append({
            "rank": rank_value,
            "name": name,
            "current_players": current_value,
            "peak_players": peak_value,
            "price": price,
        })
    return games
3.4 抓取多天的数据
python
复制编辑
def fetch_game_rankings(url, days=7):
    """Fetch and parse *url* ``days`` times and pool every parsed row.

    The fetches are issued one after another in a single call. A fetch that
    yields no HTML contributes nothing to the result.

    Args:
        url: Ranking page to download.
        days: Number of fetch/parse rounds to perform.

    Returns:
        A flat list containing the game dicts from every successful round.
    """
    collected = []
    for _round in range(days):
        page = fetch_page(url)
        if not page:
            continue
        collected += parse_game_data(page)
    return collected
4. 数据存储
4.1 存入 SQLite 数据库
python
复制编辑
def save_to_database(game_data, db_name="game_rankings.db"):
    """Append *game_data* to the ``game_rankings`` table of a SQLite file.

    The table is created on first use. Every row is stamped with today's
    date (``YYYY-MM-DD``), computed once so that all rows of one call share
    the same value.

    Args:
        game_data: Iterable of dicts with the keys ``rank``, ``name``,
            ``current_players``, ``peak_players`` and ``price``.
        db_name: Path of the SQLite database file.

    Raises:
        KeyError: If a record is missing one of the required keys (raised
            before the database is touched).
        sqlite3.Error: If the database operation fails; the transaction is
            rolled back and the connection is still closed.
    """
    snapshot_date = datetime.today().strftime('%Y-%m-%d')
    rows = [
        (
            game['rank'],
            game['name'],
            game['current_players'],
            game['peak_players'],
            game['price'],
            snapshot_date,
        )
        for game in game_data
    ]

    conn = sqlite3.connect(db_name)
    try:
        # The connection context manager commits on success and rolls back
        # on an exception; it does not close the connection, hence finally.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS game_rankings (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    rank INTEGER,
                    name TEXT,
                    current_players INTEGER,
                    peak_players INTEGER,
                    price TEXT,
                    date TEXT
                )
            """)
            conn.executemany("""
                INSERT INTO game_rankings (rank, name, current_players, peak_players, price, date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, rows)
    finally:
        conn.close()
5. 数据分析
5.1 读取数据
python
复制编辑
def load_data(db_name="game_rankings.db"):
    """Read the whole ``game_rankings`` table into a pandas DataFrame.

    Args:
        db_name: Path of the SQLite database file.

    Returns:
        A DataFrame with one row per stored ranking entry.

    Raises:
        Whatever ``pd.read_sql`` raises when the query fails (for example
        when the table does not exist yet); the connection is closed in
        that case as well.
    """
    conn = sqlite3.connect(db_name)
    try:
        return pd.read_sql("SELECT * FROM game_rankings", conn)
    finally:
        # Close even when the query raises so the file handle is not leaked.
        conn.close()
# Load every stored ranking row into the module-level frame used by the
# sections that follow, then print the first five rows as a quick check.
df = load_data()
print(df.head(n=5))
5.2 游戏在线人数趋势
python
复制编辑
def plot_player_trend(df, game_name):
    """Draw a line chart of ``current_players`` against ``date`` for one game.

    Only the rows of *df* whose ``name`` equals *game_name* are plotted. The
    figure is displayed with ``plt.show()``; nothing is returned.
    """
    is_selected = df['name'] == game_name
    history = df[is_selected]

    plt.figure(figsize=(10, 5))
    sns.lineplot(data=history, x="date", y="current_players", marker="o")

    # Labelling: title, axis captions, and x tick labels rotated by 45 degrees.
    plt.ylabel("当前玩家数")
    plt.xlabel("日期")
    plt.title(f"{game_name} 在线玩家趋势")
    plt.xticks(rotation=45)
    plt.show()
# Example: chart the stored player counts for a single title.
plot_player_trend(df, game_name="Counter-Strike 2")
5.3 热门游戏分析
python
复制编辑
def popular_games(df, top_n=10):
    """Plot a bar chart of the games with the highest average player count.

    Args:
        df: DataFrame with at least the columns ``name`` and
            ``current_players``.
        top_n: Number of games to show. Defaults to 10.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Average only the numeric column of interest. Calling ``.mean()`` on the
    # whole grouped frame also tries to average text columns such as
    # ``price``, which raises TypeError on pandas >= 2.0.
    avg_players = (
        df.groupby("name")["current_players"]
        .mean()
        .sort_values(ascending=False)
        .head(top_n)
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(x=avg_players.index, y=avg_players, palette="coolwarm")
    plt.xticks(rotation=45)
    plt.title("最受欢迎游戏(平均在线玩家)")
    plt.xlabel("游戏名称")
    plt.ylabel("平均在线玩家数")
    plt.show()
# Example: show the top games by average concurrent players.
popular_games(df=df)
6. 预测未来游戏趋势
6.1 训练模型
python
复制编辑
# Parse the stored date strings, then expose day / month / year as separate
# numeric columns; the regression model below uses them as features.
df['date'] = pd.to_datetime(df['date'])
df['day'], df['month'], df['year'] = (
    df['date'].dt.day,
    df['date'].dt.month,
    df['date'].dt.year,
)
python
复制编辑
def train_prediction_model(df, game_name):
    """Fit a linear regression of player count on calendar date for one game.

    The ``day``, ``month`` and ``year`` columns of the rows matching
    *game_name* are the features and ``current_players`` is the target.
    20% of those rows are held out (``random_state=42``); the mean squared
    error and mean absolute error on the held-out rows are printed.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    rows = df.loc[df['name'] == game_name]
    features = rows[['day', 'month', 'year']]
    target = rows['current_players']

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    train_features, test_features, train_target, test_target = split

    regressor = LinearRegression().fit(train_features, train_target)
    predictions = regressor.predict(test_features)

    print("均方误差:", mean_squared_error(test_target, predictions))
    print("平均绝对误差:", mean_absolute_error(test_target, predictions))
    return regressor
# Train on one title; the resulting model is reused by the prediction step.
model = train_prediction_model(df, game_name="Counter-Strike 2")
6.2 预测未来在线玩家数
python
复制编辑
def predict_players(model, day, month, year):
    """Return the model's predicted player count for a single date.

    Args:
        model: Fitted estimator exposing ``predict``.
        day, month, year: Calendar components of the date to predict.

    Returns:
        The first (only) element of the prediction array.
    """
    # The model is trained on a DataFrame with the columns day / month /
    # year. Passing a frame with the same column names (instead of a bare
    # nested list) avoids scikit-learn's "X does not have valid feature
    # names" warning.
    sample = pd.DataFrame([[day, month, year]], columns=['day', 'month', 'year'])
    return model.predict(sample)[0]
# Example: ask the trained model for its estimate on 15 March 2025.
predicted_players = predict_players(model, day=15, month=3, year=2025)
print(f"预测 2025-03-15 的在线玩家数: {predicted_players:.0f}")