I want to train a model but facing a problem: in every batch I will select some node for training, and the id of nodes range from 0 to 999. While training I find the code use the id of the node as the index and try to get the other data of the node so the index is out of range. my code is here
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from torch_geometric.data import DataLoader as loader
import pandas as pd
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
# feature
class GCN(nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels):
super(GCN, self).__init__()
self.conv1 = GCNConv(in_channels, hidden_channels)
self.conv2 = GCNConv(hidden_channels, out_channels)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv2(x, edge_index)
return x
class MyDataset(Dataset):
def __init__(self, nodes_file, edges_file, communities_file, features_file):
self.nodes_data = pd.read_csv(nodes_file)
self.edges_data = pd.read_csv(edges_file)
self.communities_data = pd.read_csv(communities_file)
self.features_data = pd.read_csv(features_file)
def __len__(self):
return len(self.nodes_data)
def __getitem__(self, idx):
node_id = self.nodes_data.iloc[idx]['ID']
node_category = self.nodes_data.iloc[idx]['Category']
node_community = self.communities_data[self.communities_data['ID'] == node_id]['Community'].values[0]
edge_source = self.nodes_data.iloc[idx]['ID']
edge_targets = eval(self.edges_data.iloc[idx]['neighbor'])
# Get the corresponding features for the current node_id
node_features = self.features_data[self.features_data['NodeID'] == node_id].iloc[:, 1:].values
node_features = torch.tensor(node_features, dtype=torch.float)
# You can process the node, edge, and community data as per your requirements
# and return them as tensors
return node_id, node_category, node_community, edge_source, edge_targets, node_features
def custom_collate_fn(batch):
node_ids, node_categories, node_communities, edge_sources, edge_targets, node_features = zip(*batch)
# Convert to PyTorch tensors
node_ids = torch.tensor(node_ids, dtype=torch.long)
node_categories = torch.tensor(node_categories, dtype=torch.long)
node_communities = torch.tensor(node_communities, dtype=torch.long)
edge_targets = [torch.tensor(targets) for targets in edge_targets]
# Replicate edge_sources to have the same length as edge_targets
edge_sources_replicated = [torch.tensor([source] * len(targets), dtype=torch.long) for source, targets in zip(edge_sources, edge_targets)]
return node_ids, node_categories, node_communities, edge_sources_replicated, edge_targets, node_features
nodes_file = 'nodes.csv'
edges_file = 'RWR_features.csv'
communities_file = 'community.csv'
feature_file = 'node_features.csv'
dataset = MyDataset(nodes_file, edges_file, communities_file, feature_file)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
input_dim = 1000 # Replace with the actual input dimension
hidden_dim = 64
output_dim = 32
num_categories = 3 # Replace with the actual number of node categories
num_communities = 5 # Replace with the actual number of communities
num_epochs = 300
# GCN
gcn_model = GCN(input_dim, hidden_dim, output_dim)
# classify
...
# train loop
for epoch in range(num_epochs):
for batch_data in dataloader:
node_ids, node_categories, node_communities, edge_sources_list, edge_targets_list, node_feature = batch_data
# print('node_ids:', node_ids)
# print('len:', len(node_ids))
# print('node_categories:', node_categories)
# print('len:', len(node_categories))
# print('node_communities:', node_communities)
# print('len:', len(node_communities))
# print('edge_sources:', edge_sources)
# print('len:', len(edge_sources))
# print('edge_targets', edge_targets)
# print('len:', len(edge_targets))
node_feature = torch.cat(node_feature, dim=0)
# print('node_feature', node_feature)
# print('len:', len(node_feature))
edge_sources = torch.cat(edge_sources_list, dim=0)
edge_targets = torch.cat(edge_targets_list, dim=0)
# print(edge_sources)
# print(len(edge_sources))
# print(edge_targets)
# print(len(edge_targets))
edge_index = torch.stack([edge_sources, edge_targets], dim=0)
# print(edge_index)
x = gcn_model(node_feature, edge_index)
How can I fix this? At first, I want to find his index in batch through the node's ID, and use this index to find the rest data I need, but I don't know how to achieve this, is this method feasible? Thanks a lot.
The error info is below:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[2], line 62
57 edge_index = torch.stack([edge_sources, edge_targets], dim=0)
59 # print(edge_index)
---> 62 x = gcn_model(node_feature, edge_index)
64 # A
65 classify_logits = classify_model(x)
File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
Cell In[1], line 21, in GCN.forward(self, x, edge_index)
20 def forward(self, x, edge_index):
---> 21 x = self.conv1(x, edge_index)
22 x = F.relu(x)
23 x = F.dropout(x, p=0.5, training=self.training)
File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/gcn_conv.py:210, in GCNConv.forward(self, x, edge_index, edge_weight)
208 cache = self._cached_edge_index
209 if cache is None:
--> 210 edge_index, edge_weight = gcn_norm( # yapf: disable
211 edge_index, edge_weight, x.size(self.node_dim),
212 self.improved, self.add_self_loops, self.flow, x.dtype)
213 if self.cached:
214 self._cached_edge_index = (edge_index, edge_weight)
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/gcn_conv.py:100, in gcn_norm(edge_index, edge_weight, num_nodes, improved, add_self_loops, flow, dtype)
98 row, col = edge_index[0], edge_index[1]
99 idx = col if flow == 'source_to_target' else row
--> 100 deg = scatter(edge_weight, idx, dim=0, dim_size=num_nodes, reduce='sum')
101 deg_inv_sqrt = deg.pow_(-0.5)
102 deg_inv_sqrt.masked_fill_(deg_inv_sqrt == float('inf'), 0)
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/utils/scatter.py:74, in scatter(src, index, dim, dim_size, reduce)
72 if reduce == 'sum' or reduce == 'add':
73 index = broadcast(index, src, dim)
---> 74 return src.new_zeros(size).scatter_add_(dim, index, src)
76 if reduce == 'mean':
77 count = src.new_zeros(dim_size)
RuntimeError: index 332 is out of bounds for dimension 0 with size 64
the example of the data is below:
node.csv(total: 1000 node): enter image description here
RWR_features.csv(each node have 30 important neighbor, sampled from the edges file, the class is the class of the source node): enter image description here
community.csv: enter image description here
node_features.csv: enter image description here
07/30/2023 edit
Already fixed. The problem lies in the input of the GCN, I rewrote the code.
import pandas as pd
import numpy as np
def load_data():
# Load nodes.csv
nodes_df = pd.read_csv('nodes.csv')
# Load community.csv
community_df = pd.read_csv('community.csv')
# Load node_features.csv
node_features_df = pd.read_csv('node_features.csv')
# Load RWR_features.csv
rwr_features_df = pd.read_csv('RWR_features.csv', usecols=['node', 'neighbor'])
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(eval)
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(lambda x: [int(i) for i in x])
# Create mapping from node ID to index
node_to_index = {node_id: index for index, node_id in enumerate(nodes_df['ID'])}
# Create adjacency matrix
num_nodes = len(nodes_df)
adj = np.zeros((num_nodes, num_nodes))
for _, row in rwr_features_df.iterrows():
node_id = row['node']
neighbor_ids = row['neighbor']
for neighbor_id in neighbor_ids:
adj[node_to_index[node_id]][node_to_index[neighbor_id]] = 1
adj[node_to_index[neighbor_id]][node_to_index[node_id]] = 1 # Assuming undirected graph
# Create feature matrix
feature_columns = node_features_df.columns[1:] # Exclude the first column (ID)
feature = node_features_df[feature_columns].values
# Create class labels
class_label = nodes_df['Category'].values
# Create community labels
com_label = community_df['Community'].values
# Split indices for train, val, and test sets (You can modify this split ratio as needed)
num_train = int(num_nodes * 0.6)
num_val = int(num_nodes * 0.2)
num_test = num_nodes - num_train - num_val
# Generate random indices for train, val, and test sets
indices = np.random.permutation(num_nodes)
train_idx = indices[:num_train]
val_idx = indices[num_train:num_train + num_val]
test_idx = indices[num_train + num_val:]
return adj, feature, class_label, com_label, train_idx, val_idx, test_idx
# Usage
adj, feature, class_label, com_label, train_idx, val_idx, test_idx = load_data()
class MyDataset(Dataset):
def __init__(self, adj, feature, class_label, com_label):
self.adj = adj
self.feature = feature
self.class_label = class_label
self.com_label = com_label
def __len__(self):
return len(self.class_label)
def __getitem__(self, idx):
return idx, self.adj[idx] , self.feature[idx], self.class_label[idx], self.com_label[idx]
Then, I input the adjacent matrix and the feature matrix into the GCN instead of the edge_index.
gcn_feature = gcn_model(batch_adj, batch_feature)
Already fixed. Thanks for all readers time. The problem lies in the input of the GCN, I need to input the adjacent matrix and the feature matrix into the GCN instead of the edge_index. First I rewrote the load_data part:
import pandas as pd
import numpy as np
def load_data():
# Load nodes.csv
nodes_df = pd.read_csv('nodes.csv')
# Load community.csv
community_df = pd.read_csv('community.csv')
# Load node_features.csv
node_features_df = pd.read_csv('node_features.csv')
# Load RWR_features.csv
rwr_features_df = pd.read_csv('RWR_features.csv', usecols=['node', 'neighbor'])
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(eval)
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(lambda x: [int(i) for i in x])
# Create mapping from node ID to index
node_to_index = {node_id: index for index, node_id in enumerate(nodes_df['ID'])}
# Create adjacency matrix
num_nodes = len(nodes_df)
adj = np.zeros((num_nodes, num_nodes))
for _, row in rwr_features_df.iterrows():
node_id = row['node']
neighbor_ids = row['neighbor']
for neighbor_id in neighbor_ids:
adj[node_to_index[node_id]][node_to_index[neighbor_id]] = 1
adj[node_to_index[neighbor_id]][node_to_index[node_id]] = 1 # Assuming undirected graph
# Create feature matrix
feature_columns = node_features_df.columns[1:] # Exclude the first column (ID)
feature = node_features_df[feature_columns].values
# Create class labels
class_label = nodes_df['Category'].values
# Create community labels
com_label = community_df['Community'].values
# Split indices for train, val, and test sets (You can modify this split ratio as needed)
num_train = int(num_nodes * 0.6)
num_val = int(num_nodes * 0.2)
num_test = num_nodes - num_train - num_val
# Generate random indices for train, val, and test sets
indices = np.random.permutation(num_nodes)
train_idx = indices[:num_train]
val_idx = indices[num_train:num_train + num_val]
test_idx = indices[num_train + num_val:]
return adj, feature, class_label, com_label, train_idx, val_idx, test_idx
# Usage
adj, feature, class_label, com_label, train_idx, val_idx, test_idx = load_data()
class MyDataset(Dataset):
def __init__(self, adj, feature, class_label, com_label):
self.adj = adj
self.feature = feature
self.class_label = class_label
self.com_label = com_label
def __len__(self):
return len(self.class_label)
def __getitem__(self, idx):
return idx, self.adj[idx] , self.feature[idx], self.class_label[idx], self.com_label[idx]
The return of the idx is needed to cut the feature matrix. Then, input the data.
for epoch in range(epochs):
...
for ... in progress_bar:
...
gcn_feature = gcn_model(batch_adj, batch_feature)