利用PYG搭建数据集(同构图)

什么是同构图?同构图是指图中所有节点属于同一种类型、所有边也属于同一种类型,并且具有一致的特征和属性的图。
特点:
1、节点类型统一:图中的所有节点类型相同。
2、边类型统一:图中的所有边类型相同。
3、一致的特征空间:节点和边的特征位于同一个特征空间。
例如:
Alice
|
|
Bob—Charlie
|
|
David
那异构图呢?(节点可以代表不同的实体)
Alice --works_at--> Company
  |                    |
  |                    |
lives_in           located_in
  |                    |
 City               Country

导包:

import itertools
import os

import numpy as np
# data loading
import pandas as pd
import requests
import torch
from torch_geometric.data import Data

下载文件

使用的是 FIFA 21 球员数据,其中包含球员 id、技能、球队名、位置等信息。

# Raw CSV files of the FIFA 21 dataset that the rest of the script reads.
file_urls = [
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player_skill.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_team.csv",
]
# Directory the downloaded files are written to.
save_path = 'data/fifa21_dataset'

os.makedirs(save_path, exist_ok=True)

# Download each file and store it under the base name taken from its URL.
for url in file_urls:
    filename = os.path.join(save_path, os.path.basename(url))
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as a CSV.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'文件:{os.path.basename(url)}下载完成,保存在{save_path}')

读取CSV文件数据

# Load the player, player-skill and team tables into DataFrames, in that order.
player_df, skill_df, team_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/fifa21_dataset/tbl_player.csv',
        'data/fifa21_dataset/tbl_player_skill.csv',
        'data/fifa21_dataset/tbl_team.csv',
    )
)

提取出需要的子集并合并。

# Keep only the columns that are needed from each table.
player_df = player_df[["int_player_id", "str_player_name", "str_positions", "int_overall_rating", "int_team_id"]]
skill_df = skill_df[["int_player_id", "int_long_passing", "int_ball_control", "int_dribbling"]]
team_df = team_df[["int_team_id", "str_team_name", "int_overall"]]

# Attach the skill columns to every player (joined on int_player_id), then
# the team columns (joined on int_team_id).
player_df = player_df.merge(skill_df, on='int_player_id')
fifa_df = player_df.merge(team_df, on='int_team_id')

# Order players from the highest to the lowest overall rating.
fifa_df = fifa_df.sort_values(by='int_overall_rating', ascending=False)
print('Players:', fifa_df.shape[0])
# Keep the ten best-rated players. The copy makes this an independent frame,
# so later column assignments do not act on a slice of the full table.
fifa_df = fifa_df.head(10).copy()
fifa_df.head(10)  # notebook-style preview; has no effect when run as a script

# Every player must appear exactly once, otherwise the graph would contain
# duplicate nodes. Enforce it instead of computing a value that is discarded.
if fifa_df['int_player_id'].duplicated().any():
    raise ValueError('duplicate int_player_id values found; each player must map to exactly one node')

# Rows ordered by ascending int_player_id; this is the node order used for
# the feature matrix.
sorted_df = fifa_df.sort_values(by='int_player_id')

处理数据:

# Build the node feature matrix: three numeric skill columns followed by a
# one-hot encoding of each player's first listed position.
#
# Illustrative rows of the inputs (index, positions, skills):
#     0     RW, ST, CF    91  96  96
#     33    ST, LW        77  92  88
#     57    GK            40  30  12

# Process-wide pandas setting that silences chained-assignment warnings.
# It is kept because it affects the whole session, not only this step.
pd.set_option('mode.chained_assignment', None)

# Numeric skill columns are used as they are.
skill_columns = sorted_df[["int_long_passing", "int_ball_control", "int_dribbling"]]

# "RW, ST, CF" -> "RW": only the first listed position is encoded.
primary_position = sorted_df['str_positions'].str.split(",").str[0]

# One indicator column per distinct first position (e.g. CAM, CB, GK, LW, RW, ST).
position_one_hot = pd.get_dummies(primary_position)

# Place skills and indicators side by side and cast everything to integers,
# which turns the True/False indicators into 1/0.
#
# Illustrative rows of the result:
#     int_long_passing  int_ball_control  int_dribbling  CAM  CB  GK  LW  RW  ST
#     91                96                96             0    0   0   0   1   0
#     77                92                88             0    0   0   0   0   1
#     40                30                12             0    0   1   0   0   0
node_features = pd.concat([skill_columns, position_one_hot], axis=1, join='inner').astype(int)

# Plain NumPy matrix with one row per player, e.g.
#     [[91 96 96  0  0  0  0  1  0]
#      [77 92 88  0  0  0  0  0  1]
#      [40 30 12  0  0  1  0  0  0]]
x = node_features.to_numpy()

搞标签咯:

# Build the labels in ascending int_player_id order, the same order in which
# the node feature rows were produced.
sorted_df = fifa_df.sort_values(by='int_player_id')

# The label of each node is the overall score of the player's team.
labels = sorted_df[['int_overall']]
# NumPy array of shape (num_players, 1).
y = labels.to_numpy()

# Re-map the arbitrary player ids to consecutive node indices 0..N-1.
# The new ids are assigned in the id-sorted order, so node index i refers to
# row i of both the feature matrix and the label array. Numbering the rows in
# fifa_df's previous order (rating-descending) would make edge endpoints
# point at the wrong feature rows whenever the two orders differ.
# Working on a copy also avoids assigning into a slice of the merged table.
fifa_df = sorted_df.copy()
fifa_df['int_player_id'] = np.arange(len(fifa_df))
# Illustrative rows after the re-mapping (index, int_player_id, ..., int_overall):
#     0     0  ...  84
#     33    1  ...  83
#     57    2  ...  83

整合数据:

# Show how many of the selected players belong to each team.
print(fifa_df["str_team_name"].value_counts())

# Distinct team names; players of the same team get connected to each other.
teams = fifa_df['str_team_name'].unique()

# One (num_pairs, 2) integer array per team. Starting with an empty (0, 2)
# array keeps the concatenation valid even when no team has two players.
edge_chunks = [np.empty((0, 2), dtype=np.int64)]

for team in teams:
    # Node ids of all players on this team.
    teammates = fifa_df.loc[fifa_df['str_team_name'] == team, 'int_player_id'].to_numpy()
    # A team with a single player contributes no edge. Skipping it also keeps
    # an empty float array from promoting the whole edge list to float.
    if len(teammates) < 2:
        continue
    # Every unordered pair of teammates becomes one (source, target) row.
    pairs = np.array(list(itertools.combinations(teammates, 2)), dtype=np.int64)
    edge_chunks.append(pairs)

# Concatenate once instead of growing the array inside the loop.
all_edges = np.concatenate(edge_chunks, axis=0)

# PyTorch Geometric layout: shape (2, num_edges), row 0 = sources, row 1 = targets.
edge_index = all_edges.transpose()

# Add the reverse direction of every edge (the two rows swapped) so that the
# graph is undirected, then drop duplicate columns so each directed edge
# appears exactly once.
edge_index = np.concatenate([edge_index, edge_index[::-1]], axis=1)
edge_index = np.unique(edge_index, axis=1)

# Data expects tensors: floating-point node features and integer (long) edge
# indices. The labels keep the dtype of the NumPy array.
data = Data(
    x=torch.as_tensor(x, dtype=torch.float),
    edge_index=torch.as_tensor(edge_index, dtype=torch.long),
    y=torch.as_tensor(y),
)
print(data)


  • 10
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值