I am using deepgraph in python to compute correlation coefficients for large matrices. The output gives a multi-index data frame:
s t
0 1 -0.006066
2 0.094063
3 -0.025529
4 0.074080
5 0.035490
6 0.005221
7 0.032064
I want to add a column with corresponding p-values. The original code with input example is obtained from https://deepgraph.readthedocs.io/en/latest/tutorials/pairwise_correlations.html The code surrounded by hashtags is my approach to get p-values. I want to merge the separate edge lists later on.
#!/bin/python
import os
from multiprocessing import Pool
import numpy as np
import pandas as pd
import deepgraph as dg
from numpy.random import RandomState
from scipy.stats import pearsonr, spearmanr
prng = RandomState(0)
n_features = int(5e3)
n_samples = int(1e2)
X = prng.randint(100, size=(n_features, n_samples)).astype(np.float64)
# Spearman's correlation coefficients
X = X.argsort(axis=1).argsort(axis=1)
# whiten variables for fast parallel computation later on
X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
# save in binary format
np.save('samples', X)
# parameters (change these to control RAM usage)
step_size = 1e5
n_processes = 100
# load samples as memory-map
X = np.load('samples.npy', mmap_mode='r')
# create node table that stores references to the mem-mapped samples
v = pd.DataFrame({'index': range(X.shape[0])})
# connector function to compute pairwise pearson correlations
def corr(index_s, index_t):
features_s = X[index_s]
features_t = X[index_t]
corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
return corr
#################################
def p_Val(index_s, index_t):
features_s = X[index_s]
features_t = X[index_t]
p = spearmanr(features_s, features_t)[1]
return p
#################################
# index array for parallelization
pos_array = np.array(np.linspace(0, n_features*(n_features-1)//2, n_processes), dtype=int)
# parallel computation
def create_ei(i):
from_pos = pos_array[i]
to_pos = pos_array[i+1]
# initiate DeepGraph
g = dg.DeepGraph(v)
# create edges
g.create_edges(connectors=corr, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
# store edge table
g.e.to_pickle('tmp/correlations/{}_corr.pickle'.format(str(i).zfill(3)))
#################################
gp = dg.DeepGraph(v)
# create edges
gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
# store edge table
gp.e.to_pickle('tmp/correlations/{}_pval.pickle'.format(str(i).zfill(3)))
#################################
# computation
if __name__ == '__main__':
os.makedirs("tmp/correlations", exist_ok=True)
indices = np.arange(0, n_processes - 1)
p = Pool()
for _ in p.imap_unordered(create_ei, indices):
pass
# store correlation values
files = os.listdir('tmp/correlations/')
files.sort()
for f in files:
et = pd.read_pickle('tmp/correlations/{}'.format(f))
print(et)
store.close()
I get the following error:
Traceback (most recent call last):
File "/lib/python3.9/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "pairwise_corr.py", line 64, in create_ei
gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 616, in create_edges
self.e = _matrix_iterator(
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 4875, in _matrix_iterator
ei = _select_and_return(vi, sources_k, targets_k, ft_feature,
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 5339, in _select_and_return
ei = pd.DataFrame({col: data[col] for col in coldtypedic})
File "/lib/python3.9/site-packages/pandas/core/frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 464, in dict_to_mgr
return arrays_to_mgr(
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 124, in arrays_to_mgr
arrays = _homogenize(arrays, index, dtype)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 589, in _homogenize
val = sanitize_array(
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 576, in sanitize_array
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 627, in _sanitize_ndim
raise ValueError("Data must be 1-dimensional")
ValueError: Data must be 1-dimensional
Any suggestions? Thanks!
I was able to solve it with
def p_Val(index_s, index_t):
features_s = X[index_s]
features_t = X[index_t]
p = [pearsonr(features_s[i, :], features_t[i, :])[1] for i in range(len(features_s))]
p_val = np.asarray(p)
return p_val