I'm working with Neo4j from Python and I compare two graphs at the node level in this way:
query = """
MATCH (n)-[r1]-(), (n)-[r2]-()
WHERE r1.disease = 'MyLabel1'
AND r2.disease = 'MyLabel2'
RETURN DISTINCT n
"""
results = driver.execute_query(query)
results[0]
The results of the query are stored in one at the first index of a list, I would like to show the results using a Pandas Dataframe;
I would like to plot the count of nodes in each graph and the nodes in common using a Venn diagramm in matplotlib
import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
from neo4j import GraphDatabase
uri = "bolt://localhost:7687"
user = "neo4j"
password = "xxxxxx"
driver = GraphDatabase.driver(uri, auth=(user, password))
query = """
MATCH (n1)-[r1]-()
WHERE r1.disease = 'MyLabel1'
WITH COLLECT(DISTINCT n1) AS n1s
MATCH (n2)-[r2]-()
WHERE r2.disease = 'MyLabel2'
WITH n1s, COLLECT(DISTINCT n2) AS n2s
RETURN SIZE(n1s) AS count1, SIZE(n2s) AS count2, SIZE([n IN n1s WHERE n IN n2s]) AS countIntersection
"""
with driver.session() as session:
results = session.run(query)
data = tuple(record.values() for record in results)
df = pd.DataFrame(data, columns=('count1', 'count2', 'countIntersection'))
if not df.empty:
venn2(subsets=df.iloc[0], set_labels=('MyLabel1 nodes', 'MyLabel2 nodes'))
plt.show()
else:
print("no results")
And here is a sample resulting Venn diagram:
]
Notes:
count1
and count2
.MATCH
into 2 MATCH
es in the way shown above (where the results from the first MATCH
are aggregated before doing the second MATCH
) avoids producing a cartesian product.This query assumes that every node has a unique id
property.
import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
from neo4j import GraphDatabase
uri = "bolt://localhost:7687"
user = "neo4j"
password = "xxxxxx"
driver = GraphDatabase.driver(uri, auth=(user, password))
query = """
MATCH (n1)-[r1]-()
WHERE r1.disease = 'MyLabel1'
WITH COLLECT(DISTINCT {labels: labels(n1), properties: properties(n1)}) AS n1s
MATCH (n2)-[r2]-()
WHERE r2.disease = 'MyLabel2'
RETURN n1s, COLLECT(DISTINCT {labels: labels(n2), properties: properties(n2)}) AS n2s
"""
with driver.session() as session:
results = session.run(query)
data = tuple(record.values() for record in results)
# Create list of dictionaries for every node.
n1s_data = []
for node in data[0][0]:
node_dict = node['properties']
node_dict['labels'] = node['labels']
n1s_data.append(node_dict)
n2s_data = []
for node in data[0][1]:
node_dict = node['properties']
node_dict['labels'] = node['labels']
n2s_data.append(node_dict)
df_n1s = pd.DataFrame(n1s_data)
df_n2s = pd.DataFrame(n2s_data)
n1s_ids = set(node['id'] for node in df_n1s.to_dict('records'))
n2s_ids = set(node['id'] for node in df_n2s.to_dict('records'))
venn2(subsets=(len(n1s_ids), len(n2s_ids), len(n1s_ids & n2s_ids)), set_labels=('MyLabel1 nodes', 'MyLabel2 nodes'))
plt.show()