I have the following dataframe, merged_dft, and I want
to make a scatter plot of two of its columns, e.g. snv vs snv-drg.
samples snv het-hom ti-tv snv-drg het-hom-drg ti-tv-drg insertion-drg deletion-drg insertion deletion ins-del-ratio-drg ins-del-ratio Sample_name Sex Superpopulation_code
0 NA20126 4592368 2.14 1.97 4770140 2.26 1.96 523917 536443 472931 494200 0.98 0.96 NA20126 male AFR
1 NA20127 4699751 2.04 1.97 4918959 2.18 1.97 562430 572733 485645 505302 0.98 0.96 NA20127 female AFR
2 NA20128 4636463 2.09 1.97 4854107 2.22 1.97 552634 566283 478801 500632 0.98 0.96 NA20128 female AFR
3 NA20129 4638940 2.11 1.97 4863336 2.23 1.97 552984 565534 478078 499867 0.98 0.96 NA20129 female AFR
4 NA20274 4339811 2.10 1.96 4554995 2.23 1.96 524046 530728 456420 471116 0.99 0.97 NA20274 female AFR
....
....
--
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import scipy.stats as stats
# Scatter plot of merged_dft['snv'] (x axis, labelled NPM) against
# merged_dft['snv-drg'] (y axis, labelled Drgen), coloured by
# Superpopulation_code, with a dashed y = x reference line and Pearson's r.

# Figure defaults are only read when a figure is created, so they must be set
# before the first plotting call; updating them afterwards leaves the current
# figure at its old size.
plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

x = merged_dft['snv']
y = merged_dft['snv-drg']
x_min = x.min()
x_max = x.max()
y_min = y.min()
y_max = y.max()

# Both axes share one range so the y = x line runs corner to corner.
lineStart = min(x_min, y_min)
lineEnd = max(x_max, y_max)

# Create a scatter plot
sns.scatterplot(data=merged_dft, x='snv', y='snv-drg', hue='Superpopulation_code')
plt.xlabel('NPM')
plt.ylabel('Drgen')
plt.title('Count_SNVs')
plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color='r', linestyle='dashed')
plt.xlim(lineStart, lineEnd)
plt.ylim(lineStart, lineEnd)

# Only r is displayed; p is returned by pearsonr but not used.
r, p = stats.pearsonr(x, y)
plt.annotate('r = {:.2f}'.format(r), xy=(0.1, 0.95), xycoords='axes fraction')
# Optional, to move the legend outside the axes:
# plt.legend(bbox_to_anchor=(1.025,1), loc='upper left', borderaxespad=0.)
I want to make a scatter plot and compute the Pearson correlation for each pair of columns in npm_col
vs drg_col,
one pair after another. I couldn't get it to work with the code below.
Example
snv vs snv-drg
het-hom vs het-hom-drg
ti-tv vs ti-tv-drg
Code:
# Non-working attempt at plotting each npm_col column against its drg_col
# counterpart; the NOTE(review) comments mark where it goes wrong.
# set 1 columns: a DataFrame with the columns 'snv', 'het-hom', 'ti-tv'
# (a DataFrame, not a list of column names)
npm_col = merged_dft[['snv', 'het-hom', 'ti-tv']]
npm_col
# set 2 columns: a DataFrame with the columns 'snv-drg', 'het-hom-drg', 'ti-tv-drg'
drg_col = merged_dft[['snv-drg', 'het-hom-drg', 'ti-tv-drg']]
drg_col
# NOTE(review): len() of a DataFrame is its number of rows, so both ranges run
# over rows rather than over the three columns. Nesting the loops also visits
# every (i, j) combination instead of only the matched pairs.
for i in range(len(npm_col)):
for j in range(len(drg_col)):
plt.figure()
# NOTE(review): i and j are never used below. npm_col and drg_col are whole
# DataFrames, so they are not valid single-column keys for merged_dft[...];
# a pair of column names is needed here instead.
plt.scatter(merged_dft[npm_col], merged_dft[drg_col])
# NOTE(review): the labels and title receive the whole DataFrames, not a column name.
plt.xlabel(npm_col)
plt.ylabel(drg_col)
plt.title(f'Scatter plot between {npm_col} and {drg_col}')
# NOTE(review): figure defaults are read when a figure is created, so this
# update does not resize the figure opened by plt.figure() above.
plt.rcParams.update({'figure.figsize':(10,8), 'figure.dpi':100})
# NOTE(review): lineStart / lineEnd are not recomputed in this block, so they
# still hold whatever range was computed earlier, not this pair's range.
plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color = 'r', linestyle = 'dashed')
plt.xlim(lineStart, lineEnd)
plt.ylim(lineStart, lineEnd)
# r, p = stats.pearsonr(x, y)
# NOTE(review): same DataFrame-as-key problem as the plt.scatter call above.
r, p = stats.pearsonr(merged_dft[npm_col], merged_dft[drg_col])
plt.annotate('r = {:.2f}'.format(r), xy=(0.1, 0.95), xycoords='axes fraction')
# plt.legend(bbox_to_anchor=(1.025,1), loc='upper left', borderaxespad=0.)
plt.show()
Thanks for any help!
Answer:
# Plot each column of npm_col against the column of drg_col in the same
# position: one figure per pair, with a dashed y = x reference line and
# Pearson's r annotated in the top-left corner.
xcols = npm_col.columns.tolist()
ycols = drg_col.columns.tolist()

# Figure defaults are only read when a figure is created, so set them once
# before the loop; otherwise the first figure keeps the old size.
plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})

# Columns are paired purely by position (first with first, and so on).
for xcol, ycol in zip(xcols, ycols):
    x_vals = npm_col[xcol]
    y_vals = drg_col[ycol]

    plt.figure()
    plt.scatter(x_vals, y_vals)

    # Recompute the range for every pair: the metrics are on very different
    # scales, so a range from one pair would hide the points of another.
    x_min = x_vals.min()
    x_max = x_vals.max()
    y_min = y_vals.min()
    y_max = y_vals.max()
    lineStart = min(x_min, y_min)
    lineEnd = max(x_max, y_max)

    plt.xlabel('NPM')
    plt.ylabel('DRG')
    # Both halves of the title use the npm_col column name.
    plt.title(f'Scatter plot between NPM_{xcol} and DRG_{xcol}')

    # Only r is displayed; p is returned by pearsonr but not used.
    r, p = stats.pearsonr(x_vals, y_vals)

    plt.plot([lineStart, lineEnd], [lineStart, lineEnd], color='r', linestyle='dashed')
    plt.xlim(lineStart, lineEnd)
    plt.ylim(lineStart, lineEnd)
    plt.annotate('r = {:.2f}'.format(r), xy=(0.1, 0.95), xycoords='axes fraction')
    plt.show()
If you are just trying to get x and y, and they are in the same relative position in the new dataframes, you should be able to use this. You can play around with the rest of the items in the loop to get the right variables for labels, etc.
# Minimal version: walk the npm_col columns by position, look up the drg_col
# column at the same index, then scatter the pair and compute Pearson's r.
xcols = npm_col.columns.tolist()
ycols = drg_col.columns.tolist()
for i, xcol in enumerate(xcols):
    npm_values = npm_col[xcol]
    drg_values = drg_col[ycols[i]]
    plt.scatter(npm_values, drg_values)
    r, p = stats.pearsonr(npm_values, drg_values)