什么是同构图(homogeneous graph)?同构图是指图中所有节点属于同一种类型、所有边也属于同一种类型的图,节点和边各自具有一致的特征和属性。
特点:
1、节点类型统一:图中的所有节点类型相同。
2、边类型统一:图中的所有边类型相同。
3、一致的特征空间:节点和边的特征位于同一个特征空间。
例如:
Alice
|
|
Bob—Charlie
|
|
David
那异构图呢?异构图(heterogeneous graph)中的节点和边可以有多种类型,例如节点可以代表不同种类的实体(人、公司、城市、国家),边可以代表不同种类的关系:
Alice --works_at--> Company
  |                    |
  |                    |
lives_in           located_in
  |                    |
  |                    |
 City               Country
导包:
import itertools
import os

# Data loading and graph construction
import numpy as np
import pandas as pd
import requests
import torch
from torch_geometric.data import Data
下载文件
使用的是 FIFA 21 球员数据集,其中包含球员 id、技能、球队名、位置等信息。
# Download the three FIFA 21 CSV tables (players, player skills, teams)
# into a local directory.

# Source URLs of the raw CSV files.
file_urls = [
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player_skill.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_team.csv"
]
# Directory the files are saved to; created if it does not exist yet.
save_path ='data/fifa21_dataset'
os.makedirs(save_path,exist_ok=True)
# Fetch every file and store it under its original file name.
for url in file_urls:
    # Local target path: <save_path>/<last component of the URL>.
    filename = os.path.join(save_path, os.path.basename(url))
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as a ".csv" file.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'文件:{os.path.basename(url)}下载完成,保存在{save_path}' )
读取CSV文件数据
# Load the three downloaded tables (players, player skills, teams) into
# DataFrames, in that order.
player_df, skill_df, team_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/fifa21_dataset/tbl_player.csv',
        'data/fifa21_dataset/tbl_player_skill.csv',
        'data/fifa21_dataset/tbl_team.csv',
    )
)
提取出需要的子集并合并。
# Keep only the columns that are needed from each table.
player_df = player_df[["int_player_id", "str_player_name", "str_positions", "int_overall_rating", "int_team_id"]]
skill_df = skill_df[["int_player_id", "int_long_passing", "int_ball_control", "int_dribbling"]]
team_df = team_df[["int_team_id", "str_team_name", "int_overall"]]
# Join skills onto players by player id, then team data by team id.
player_df = player_df.merge(skill_df, on='int_player_id')
fifa_df = player_df.merge(team_df, on='int_team_id')
# Order players by overall rating, best first.
fifa_df = fifa_df.sort_values(by='int_overall_rating', ascending=False)
print('Players:', fifa_df.shape[0])
# Keep the ten best-rated players. Take an explicit copy: the frame gets a
# column reassigned later on, which must not happen on a slice of the full
# table.
fifa_df = fifa_df[:10].copy()
# Notebook-style preview; has no effect when run as a plain script.
fifa_df.head(10)
# Every player becomes exactly one graph node, so a repeated player id would
# corrupt the node numbering. Enforce uniqueness instead of only computing
# the maximum occurrence count and discarding it.
if fifa_df['int_player_id'].duplicated().any():
    raise ValueError('duplicate int_player_id values found; each player must map to exactly one node')
# Copy of the selection ordered by ascending player id.
sorted_df = fifa_df.sort_values(by='int_player_id')
处理数据:
# Build the node feature matrix `x`: three numeric skill columns plus a
# one-hot encoding of each player's first listed position.

# Silence pandas' chained-assignment warning for the column assignment below.
pd.set_option('mode.chained_assignment', None)

# Raw feature columns, e.g.
#     str_positions  int_long_passing  int_ball_control  int_dribbling
# 0      RW, ST, CF                91                96             96
# 33         ST, LW                77                92             88
# 57             GK                40                30             12
feature_columns = ["str_positions", "int_long_passing", "int_ball_control", "int_dribbling"]
node_features = sorted_df[feature_columns]
# print(node_features)

# The position column is text: split the comma-separated list and keep only
# the first entry as the player's position.
positions = node_features['str_positions'].str.split(",", expand=True)
node_features['first_position'] = positions[0]

# One-hot encode the first position and append the indicator columns, e.g.
#     str_positions  int_long_passing  int_ball_control  ...     LW     RW     ST
# 0      RW, ST, CF                91                96  ...  False   True  False
# 33         ST, LW                77                92  ...  False  False   True
# 57             GK                40                30  ...  False  False  False
position_dummies = pd.get_dummies(node_features['first_position'])
node_features = pd.concat([node_features, position_dummies], axis=1, join='inner')
# print(node_features)

# The two text columns are no longer needed once the encoding is in place.
node_features = node_features.drop(columns=['str_positions', 'first_position'])

# Cast everything to int so the True/False indicators become 1/0, e.g.
#     int_long_passing  int_ball_control  int_dribbling  CAM  CB  GK  LW  RW  ST
# 0                 91                96             96    0   0   0   0   1   0
# 33                77                92             88    0   0   0   0   0   1
# 57                40                30             12    0   0   1   0   0   0
node_features = node_features.astype(int)
# print(node_features.head(10))

# Final feature matrix as a NumPy array, e.g.
# [[91 96 96  0  0  0  0  1  0]
#  [77 92 88  0  0  0  0  0  1]
#  [40 30 12  0  0  1  0  0  0]]
x = node_features.to_numpy()
搞标签咯:
# Build the label array `y` and re-number the players as graph nodes.

# Order rows by ascending player id -- the same ordering the node feature
# matrix is built from -- so that label row i describes the same player as
# feature row i.
sorted_df = fifa_df.sort_values(by='int_player_id')
# The label of a player node is the `int_overall` column merged in from the
# team table.
labels = sorted_df[['int_overall']]
# print(labels.head(10))
# Convert to a NumPy array of shape (num_players, 1).
y = labels.to_numpy()

# Replace the raw player ids with consecutive node indices 0..N-1.
# The new index of a player is its position in the id-sorted frame, i.e. its
# row in the feature/label arrays. Numbering the rows of `fifa_df` in their
# current order instead (it is sorted by rating, not by id) would let edges
# built from these ids point at the wrong feature rows whenever the two
# orderings differ.
node_index_by_player_id = {
    player_id: node_idx
    for node_idx, player_id in enumerate(sorted_df['int_player_id'])
}
fifa_df['int_player_id'] = fifa_df['int_player_id'].map(node_index_by_player_id)
# Illustrative result (actual values depend on the data):
#     int_player_id  ...  int_overall
# 0               0  ...           84
# 33              1  ...           83
# 57              2  ...           83
# print(fifa_df.head(10))
整合数据:
# Connect every pair of players that belong to the same team and assemble
# the PyTorch Geometric `Data` object.

# Show how many of the selected players each team has.
print(fifa_df["str_team_name"].value_counts())
# Distinct team names among the selected players.
teams = fifa_df['str_team_name'].unique()
# Accumulator for (source, target) pairs; starts with zero rows, two columns.
# int64 matches the integer (long) dtype PyG expects for edge indices.
all_edges = np.empty((0, 2), dtype=np.int64)
for team in teams:
    # Boolean-mask selection of the rows belonging to this team. A separate
    # name is used so the module-level `team_df` table is not overwritten.
    team_players = fifa_df[fifa_df['str_team_name'] == team]
    # Node ids of the players to be linked with each other.
    players = team_players['int_player_id'].values
    # A team with a single player contributes no edges. Skipping it also
    # avoids stacking an empty float array, which would silently upcast
    # every edge index to float.
    if len(players) < 2:
        continue
    # All unordered pairs of team mates; each row is (source, target).
    pairs = list(itertools.combinations(players, 2))
    # print('=========')
    # print(pairs)
    team_edges = np.array(pairs, dtype=np.int64)
    all_edges = np.vstack([all_edges, team_edges])
# PyG layout: shape (2, num_edges), row 0 = sources, row 1 = targets, e.g.
# [[0 6 6 8]
#  [7 8 9 9]]
edge_index = all_edges.transpose()
# print(edge_index)
# Append every edge in the opposite direction so the graph is undirected.
edge_index = np.concatenate([edge_index, edge_index[::-1]], axis=1)
# Drop duplicate columns so that every directed edge appears exactly once.
edge_index = np.unique(edge_index, axis=1)
# PyG operates on torch tensors: float node features, integer (long) edge
# indices. The NumPy arrays `x`, `y` and `edge_index` themselves are left
# untouched.
data = Data(
    x=torch.as_tensor(x, dtype=torch.float),
    edge_index=torch.as_tensor(edge_index, dtype=torch.long),
    y=torch.as_tensor(y),
)
print(data)