diff --git a/README.md b/README.md index 32fd749b7..cd7dd5856 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ Full end-to-end examples in Jupyter ready-to-run notebooks can be found in the [ * [Load data to a projected graph via graph construction](examples/load-data-via-graph-construction.ipynb) * [Heterogeneous Node Classification with HashGNN and Autotuning](https://github.com/neo4j/graph-data-science-client/tree/main/examples/heterogeneous-node-classification-with-hashgnn.ipynb) * [Perform inference using pre-trained KGE models](examples/kge-predict-transe-pyg-train.ipynb) +* [Visualize GDS Projections](examples/visualize.ipynb) ## Documentation diff --git a/doc/modules/ROOT/pages/tutorials/visualize.adoc b/doc/modules/ROOT/pages/tutorials/visualize.adoc new file mode 100644 index 000000000..eb7722004 --- /dev/null +++ b/doc/modules/ROOT/pages/tutorials/visualize.adoc @@ -0,0 +1,266 @@ +// DO NOT EDIT - AsciiDoc file generated automatically + += Visualizing GDS Projections + + +https://colab.research.google.com/github/neo4j/graph-data-science-client/blob/main/examples/import-sample-export-gnn.ipynb[image:https://colab.research.google.com/assets/colab-badge.svg[Open +In Colab]] + + +This Jupyter notebook is hosted +https://github.com/neo4j/graph-data-science-client/blob/main/examples/visualize-with-pyvis.ipynb[here] +in the Neo4j Graph Data Science Client Github repository. + +The notebook exemplifies how to visualize a graph projection in the GDS +Graph Catalog using the `graphdatascience` +(https://neo4j.com/docs/graph-data-science-client/current/[docs]) and +`pyvis` (https://pyvis.readthedocs.io/en/latest/index.html[docs]) +libraries. + +== Prerequisites + +Running this notebook requires a Neo4j server with GDS installed. We +recommend using Neo4j Desktop with GDS, or AuraDS. 
+ +Also required are of course the Python libraries `graphdatascience` and +`pyvis`: + +[source, python, role=no-test] +---- +%pip install graphdatascience pyvis +---- + +== Setup + +We start by importing our dependencies and setting up our GDS client +connection to the database. + +[source, python, role=no-test] +---- +from graphdatascience import GraphDataScience +import os +from pyvis.network import Network +---- + +[source, python, role=no-test] +---- +# Get Neo4j DB URI, credentials and name from environment if applicable +NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687") +NEO4J_AUTH = None +NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j") +if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"): + NEO4J_AUTH = ( + os.environ.get("NEO4J_USER"), + os.environ.get("NEO4J_PASSWORD"), + ) +gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB) +---- + +== Sampling Cora + +Next we use the +https://neo4j.com/docs/graph-data-science-client/current/common-datasets/#_cora[built-in +Cora loader] to get the data into GDS. The nodes in the Cora dataset is +represented by academic papers, and the relationships connecting them +are citations. + +We will then sample a smaller representative subgraph from it that is +more suitable for visualization. + +[source, python, role=no-test] +---- +G = gds.graph.load_cora() +---- + +Let’s make sure we constructed the correct graph. + +[source, python, role=no-test] +---- +print(f"Metadata for our loaded Cora graph `G`: {G}") +print(f"Node labels present in `G`: {G.node_labels()}") +---- + + +Metadata for our loaded Cora graph `G`: Graph(name=cora, node_count=2708, relationship_count=5429) +Node labels present in `G`: ['Paper'] + +It’s looks correct! Now let’s go ahead and sample the graph. + +We use the random walk with restarts sampling algorithm to get a smaller +graph that structurally represents the full graph. 
In this example we +will use the algorithm’s default parameters, but check out +https://neo4j.com/docs/graph-data-science/current/management-ops/graph-creation/sampling/rwr/[the +algorithm’s docs] to see how you can for example specify the size of the +subgraph, and choose which start node around which the subgraph will be +sampled. + +[source, python, role=no-test] +---- +G_sample, _ = gds.graph.sample.rwr("cora_sample", G, randomSeed=42, concurrency=1) +---- + +We should have somewhere around 0.15 * 2708 ~ 406 nodes in our sample. +And let’s see how many relationships we got. + +[source, python, role=no-test] +---- +print(f"Number of nodes in our sample: {G_sample.node_count()}") +print(f"Number of relationships in our sample: {G_sample.relationship_count()}") +---- + + +Number of nodes in our sample: 406 +Number of relationships in our sample: 532 + +Let’s also compute +https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/[PageRank] +on our sample graph, in order to get an importance score that we call +``rank'' for each node. It will be interesting for context when we +visualize the graph. + +[source, python, role=no-test] +---- +gds.pageRank.mutate(G_sample, mutateProperty="rank") +---- + +---- +mutateMillis 0 +nodePropertiesWritten 406 +ranIterations 20 +didConverge False +centralityDistribution {'min': 0.14999961853027344, 'max': 2.27294921... +postProcessingMillis 1 +preProcessingMillis 0 +computeMillis 7 +configuration {'mutateProperty': 'rank', 'jobId': '5ca450ff-... +Name: 0, dtype: object +---- + +== Exporting the sampled Cora graph + +We can now export the topology and node properties of our sampled graph +that we want to visualize. + +Let’s start by fetching the relationships. 
+ +[source, python, role=no-test] +---- +sample_topology_df = gds.graph.relationships.stream(G_sample) +display(sample_topology_df) +---- + +[cols=",,,",options="header",] +|=== +| |sourceNodeId |targetNodeId |relationshipType +|0 |31336 |31349 |CITES +|1 |31336 |686532 |CITES +|2 |31336 |1129442 |CITES +|3 |31349 |686532 |CITES +|4 |31353 |31336 |CITES +|... |... |... |... +|527 |34961 |31043 |CITES +|528 |34961 |22883 |CITES +|529 |102879 |9513 |CITES +|530 |102884 |9513 |CITES +|531 |767763 |1136631 |CITES +|=== + +532 rows × 3 columns + +We get the right amount of rows, one for each expected relationship. So +that looks good. + +Next we should fetch the node properties we are interested in. Each node +will have a ``subject'' property which will be an integer 0,…,6 that +indicates which of seven academic subjects the paper represented by the +nodes belong to. We will also fetch the PageRank property ``rank'' that +we computed above. + +[source, python, role=no-test] +---- +sample_node_properties_df = gds.graph.nodeProperties.stream( + G_sample, + ["subject", "rank"], + separate_property_columns=True, +) +display(sample_node_properties_df) +---- + +[cols=",,,",options="header",] +|=== +| |nodeId |rank |subject +|0 |164 |0.245964 |4.0 +|1 |434 |0.158500 |2.0 +|2 |1694 |0.961240 |5.0 +|3 |1949 |0.224912 |6.0 +|4 |1952 |0.150000 |6.0 +|... |... |... |... +|401 |1154103 |0.319498 |3.0 +|402 |1154124 |0.627706 |0.0 +|403 |1154169 |0.154784 |0.0 +|404 |1154251 |0.187675 |0.0 +|405 |1154276 |0.277500 |0.0 +|=== + +406 rows × 3 columns + +Now that we have all the data we want to visualize, we can create a +network with PyVis. We color each node according to its ``subject'', and +size it according to its ``rank''. 
+ +[source, python, role=no-test] +---- +net = Network(notebook = True, +cdn_resources="remote", + bgcolor = "#222222", + font_color = "white", + height = "750px", # Modify according to your screen size + width = "100%", +) + +# Seven suitable light colors, one for each "subject" +subject_to_color = ["#80cce9", "#fbd266", "#a9eebc", "#e53145", "#d2a6e2", "#f3f3f3", "#ff91af"] + +# Add all the nodes +for _, node in sample_node_properties_df.iterrows(): + net.add_node(int(node["nodeId"]), color=subject_to_color[int(node["subject"])], value=node["rank"]) + +# Add all the relationships +net.add_edges(zip(sample_topology_df["sourceNodeId"], sample_topology_df["targetNodeId"])) + +net.show("cora-sample.html") +---- + + +ifdef::backend-html5[] +++++ +include::ROOT:partial$/cora-sample.html[] +++++ +endif::[] + + +Unsurprisingly we can see that papers largely seem clustered by academic +subject. We also note that some nodes appear larger in size, indicating +that they have a higher centrality score according to PageRank. + +We can scroll over the graphic to zoom in/out, and ``click and drag'' +the background to navigate to different parts of the network. If we +click on a node, it will be highlighted along with the relationships +connected to it. And if we ``click and drag'' a node, we can move it. + +Additionally one could enable more sophisticated navigational features +for searching and filtering by providing `select_menu = True` and +`filter_menu = True` respectively to the PyVis `Network` constructor +above. Check out the +https://pyvis.readthedocs.io/en/latest/index.html[PyVis documentation] +for this. + +== Cleanup + +We remove the Cora graphs from the GDS graph catalog to free up memory. 
+ +[source, python, role=no-test] +---- +_ = G_sample.drop() +_ = G.drop() +---- diff --git a/doc/modules/ROOT/partials/cora-sample.html b/doc/modules/ROOT/partials/cora-sample.html new file mode 100644 index 000000000..0f2bc651e --- /dev/null +++ b/doc/modules/ROOT/partials/cora-sample.html @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + +
+

+
+ + + + + + +
+

+
+ + + + + +
+ + +
+
+ + +
+
+
0%
+
+
+
+
+
+ + + + + + \ No newline at end of file diff --git a/doc/modules/ROOT/partials/tutorial-list.adoc b/doc/modules/ROOT/partials/tutorial-list.adoc index d3faa903f..15c1c2cba 100644 --- a/doc/modules/ROOT/partials/tutorial-list.adoc +++ b/doc/modules/ROOT/partials/tutorial-list.adoc @@ -7,5 +7,6 @@ * xref:tutorials/node-regression-with-subgraph-and-graph-sample.adoc[] * xref:tutorials/heterogeneous-node-classification-with-hashgnn.adoc[] * xref:tutorials/kge-predict-transe-pyg-train.adoc[] +* xref:tutorials/visualize.adoc[] * xref:tutorials/gds-sessions.adoc[] (Beta) * xref:tutorials/gds-sessions-self-managed.adoc[] (Beta) diff --git a/examples/cora-sample.html b/examples/cora-sample.html new file mode 100644 index 000000000..0f2bc651e --- /dev/null +++ b/examples/cora-sample.html @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + +
+

+
+ + + + + + +
+

+
+ + + + + +
+ + +
+
+ + +
+
+
0%
+
+
+
+
+
+ + + + + + \ No newline at end of file diff --git a/examples/visualize.ipynb b/examples/visualize.ipynb new file mode 100644 index 000000000..4ac4e1d18 --- /dev/null +++ b/examples/visualize.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "82568b27", + "metadata": {}, + "source": [ + "# Visualizing GDS Projections" + ] + }, + { + "cell_type": "markdown", + "id": "c4b7883a", + "metadata": { + "colab_type": "text" + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "922d0fbb", + "metadata": {}, + "source": [ + "This Jupyter notebook is hosted [here](https://github.com/neo4j/graph-data-science-client/blob/main/examples/visualize-with-pyvis.ipynb) in the Neo4j Graph Data Science Client Github repository.\n", + "\n", + "The notebook exemplifies how to visualize a graph projection in the GDS Graph Catalog using the `graphdatascience` ([docs](https://neo4j.com/docs/graph-data-science-client/current/)) and `pyvis` ([docs](https://pyvis.readthedocs.io/en/latest/index.html)) libraries." + ] + }, + { + "cell_type": "markdown", + "id": "c64321df", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "Running this notebook requires a Neo4j server with GDS installed.\n", + "We recommend using Neo4j Desktop with GDS, or AuraDS.\n", + "\n", + "Also required are of course the Python libraries `graphdatascience` and `pyvis`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc4c3baf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install graphdatascience pyvis" + ] + }, + { + "cell_type": "markdown", + "id": "3bdf33d6", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We start by importing our dependencies and setting up our GDS client connection to the database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26fb276b", + "metadata": {}, + "outputs": [], + "source": [ + "from graphdatascience import GraphDataScience\n", + "import os\n", + "from pyvis.network import Network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937f1476", + "metadata": {}, + "outputs": [], + "source": [ + "# Get Neo4j DB URI, credentials and name from environment if applicable\n", + "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "NEO4J_AUTH = None\n", + "NEO4J_DB = os.environ.get(\"NEO4J_DB\", \"neo4j\")\n", + "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", + " NEO4J_AUTH = (\n", + " os.environ.get(\"NEO4J_USER\"),\n", + " os.environ.get(\"NEO4J_PASSWORD\"),\n", + " )\n", + "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)" + ] + }, + { + "cell_type": "markdown", + "id": "a96b84b5", + "metadata": {}, + "source": [ + "## Built-in visualization" + ] + }, + { + "cell_type": "markdown", + "id": "aa1f778c", + "metadata": {}, + "source": [ + "### IMDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68125987", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb = gds.graph.load_imdb()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acc92487", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21b17ba8", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2f6d726", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False, theme=\"light\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29534589", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False, node_count=200)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "b751ec2c", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.drop()" + ] + }, + { + "cell_type": "markdown", + "id": "c6713a9d", + "metadata": {}, + "source": [ + "### Cora" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe00c465", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora = gds.graph.load_cora()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "258747fc", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5a0704", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dfe9e57", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\", include_node_properties=[\"subject\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db635dd6", + "metadata": {}, + "outputs": [], + "source": [ + "gds.degree.mutate(G_cora, mutateProperty=\"deg\")\n", + "G_cora.visualize(color_property=\"subject\", size_property=\"deg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6bc16f4", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\", center_nodes=[12350])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "447f1a2d", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.drop()" + ] + }, + { + "cell_type": "markdown", + "id": "56dc02c4", + "metadata": {}, + "source": [ + "## Using PyVis" + ] + }, + { + "cell_type": "markdown", + "id": "b1a8ab78", + "metadata": {}, + "source": [ + "### Sampling Cora\n", + "\n", + "Next we use the [built-in Cora loader](https://neo4j.com/docs/graph-data-science-client/current/common-datasets/#_cora) to get the data into GDS.\n", + "The nodes in the Cora dataset is 
represented by academic papers, and the relationships connecting them are citations.\n", + "\n", + "We will then sample a smaller representative subgraph from it that is more suitable for visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a575da60", + "metadata": {}, + "outputs": [], + "source": [ + "G = gds.graph.load_cora()" + ] + }, + { + "cell_type": "markdown", + "id": "723d6457", + "metadata": {}, + "source": [ + "Let's make sure we constructed the correct graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "428ac7b8", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Metadata for our loaded Cora graph `G`: {G}\")\n", + "print(f\"Node labels present in `G`: {G.node_labels()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6aceb0a", + "metadata": {}, + "outputs": [], + "source": [ + "G.visualize(color_property=\"subject\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e8b8a83", + "metadata": {}, + "source": [ + "It looks correct!\n", + "Now let's go ahead and sample the graph.\n", + "\n", + "We use the random walk with restarts sampling algorithm to get a smaller graph that structurally represents the full graph.\n", + "In this example we will use the algorithm's default parameters, but check out [the algorithm's docs](https://neo4j.com/docs/graph-data-science/current/management-ops/graph-creation/sampling/rwr/) to see how you can for example specify the size of the subgraph, and choose the start node around which the subgraph will be sampled." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d116738", + "metadata": {}, + "outputs": [], + "source": [ + "G_sample, _ = gds.graph.sample.rwr(\"cora_sample\", G, randomSeed=42, concurrency=1)" + ] + }, + { + "cell_type": "markdown", + "id": "324e0d4c", + "metadata": {}, + "source": [ + "We should have somewhere around 0.15 * 2708 ~ 406 nodes in our sample.\n", + "And let's see how many relationships we got." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d66ea2d5", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Number of nodes in our sample: {G_sample.node_count()}\")\n", + "print(f\"Number of relationships in our sample: {G_sample.relationship_count()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3dd66079", + "metadata": {}, + "source": [ + "Let's also compute [PageRank](https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/) on our sample graph, in order to get an importance score that we call \"rank\" for each node.\n", + "It will be interesting for context when we visualize the graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d80d653", + "metadata": {}, + "outputs": [], + "source": [ + "gds.pageRank.mutate(G_sample, mutateProperty=\"rank\")" + ] + }, + { + "cell_type": "markdown", + "id": "722caa7b", + "metadata": {}, + "source": [ + "### Exporting the sampled Cora graph\n", + "\n", + "We can now export the topology and node properties of our sampled graph that we want to visualize.\n", + "\n", + "Let's start by fetching the relationships." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3c76b25", + "metadata": {}, + "outputs": [], + "source": [ + "sample_topology_df = gds.graph.relationships.stream(G_sample)\n", + "display(sample_topology_df)" + ] + }, + { + "cell_type": "markdown", + "id": "a4ad967b", + "metadata": {}, + "source": [ + "We get the right number of rows, one for each expected relationship.\n", + "So that looks good.\n", + "\n", + "Next we should fetch the node properties we are interested in.\n", + "Each node will have a \"subject\" property which will be an integer 0,...,6 that indicates which of seven academic subjects the paper represented by the node belongs to.\n", + "We will also fetch the PageRank property \"rank\" that we computed above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c52e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "sample_node_properties_df = gds.graph.nodeProperties.stream(\n", + " G_sample,\n", + " [\"subject\", \"rank\"],\n", + " separate_property_columns=True,\n", + ")\n", + "display(sample_node_properties_df)" + ] + }, + { + "cell_type": "markdown", + "id": "ff81f977", + "metadata": {}, + "source": [ + "### Render visualization\n", + "\n", + "Now that we have all the data we want to visualize, we can create a network with PyVis.\n", + "We color each node according to its \"subject\", and size it according to its \"rank\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93cb3bdf", + "metadata": {}, + "outputs": [], + "source": [ + "net = Network(notebook = True,\n", + "cdn_resources=\"remote\",\n", + " bgcolor = \"#222222\",\n", + " font_color = \"white\",\n", + " height = \"750px\", # Modify according to your screen size\n", + " width = \"100%\",\n", + ")\n", + "\n", + "# Seven suitable light colors, one for each \"subject\"\n", + "subject_to_color = [\"#80cce9\", \"#fbd266\", \"#a9eebc\", \"#e53145\", \"#d2a6e2\", \"#f3f3f3\", \"#ff91af\"]\n", + "\n", + "# Add all the nodes\n", + "for _, node in sample_node_properties_df.iterrows():\n", + " net.add_node(int(node[\"nodeId\"]), color=subject_to_color[int(node[\"subject\"])], value=node[\"rank\"])\n", + "\n", + "# Add all the relationships\n", + "net.add_edges(zip(sample_topology_df[\"sourceNodeId\"], sample_topology_df[\"targetNodeId\"]))\n", + "\n", + "net.show(\"cora-sample.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f8bc4a2", + "metadata": {}, + "source": [ + "Unsurprisingly we can see that papers largely seem clustered by academic subject.\n", + "We also note that some nodes appear larger in size, indicating that they have a higher centrality score according to PageRank.\n", + "\n", + "We can scroll over the graphic to zoom in/out, and \"click and drag\" the background to navigate to different parts of the network.\n", + "If we click on a node, it will be highlighted along with the relationships connected to it.\n", + "And if we \"click and drag\" a node, we can move it.\n", + "\n", + "Additionally one could enable more sophisticated navigational features for searching and filtering by providing `select_menu = True` and `filter_menu = True` respectively to the PyVis `Network` constructor above.\n", + "Check out the [PyVis documentation](https://pyvis.readthedocs.io/en/latest/index.html) for this." 
# Methods added to the graph class in graphdatascience/graph/graph_object.py,
# together with the standard-library imports they rely on.
# NOTE(review): `CallParameters` is not imported here; presumably it is already
# in scope in that module -- confirm against the file's import block.
import colorsys
import random
from itertools import chain
from typing import Any, Dict, List, Optional
from uuid import uuid4


def visualize(
    self,
    node_count: int = 100,
    directed: bool = True,
    center_nodes: Optional[List[int]] = None,
    color_property: Optional[str] = None,
    size_property: Optional[str] = None,
    include_node_properties: Optional[List[str]] = None,
    rel_weight_property: Optional[str] = None,
    notebook: bool = True,
    px_height: int = 750,
    theme: str = "dark",
) -> Any:
    """
    Visualize the `Graph` in an interactive graphical interface.
    The graph will be sampled down to the specified `node_count` to limit computationally expensive rendering.

    Args:
        node_count: maximum number of nodes in the graph to be visualized. Must be positive
        directed: whether or not to display relationships as directed
        center_nodes: nodes around which the subgraph will be sampled, if sampling is necessary
        color_property: node property that determines node categories for coloring. Default is to use node labels
        size_property: node property that determines the size of nodes. Default is to compute a page rank for this
        include_node_properties: node properties to include for mouse-over inspection
        rel_weight_property: relationship property that determines width of relationships
        notebook: whether or not the code is run in a notebook
        px_height: the height in pixels of the graphic containing the visualization
        theme: coloring theme for the visualization. "light" or "dark"

    Returns:
        an interactive graphical visualization of the specified graph

    Raises:
        ValueError: if `node_count` is not positive, if a requested node or relationship
            property does not exist in the graph, or if `theme` is not "light" or "dark"
    """
    if node_count < 1:
        raise ValueError(f"`node_count` must be a positive integer, got {node_count}")

    # Validate every requested node property up front (color, then size, then mouse-over ones).
    known_node_properties = set(chain.from_iterable(self.node_properties().to_dict().values()))
    for prop in [color_property, size_property, *(include_node_properties or [])]:
        if prop is not None and prop not in known_node_properties:
            raise ValueError(f"There is no node property '{prop}' in graph '{self._name}'")

    if rel_weight_property is not None:
        known_rel_properties = set(chain.from_iterable(self.relationship_properties().to_dict().values()))
        if rel_weight_property not in known_rel_properties:
            raise ValueError(f"There is no relationship property '{rel_weight_property}' in graph '{self._name}'")

    if theme not in {"light", "dark"}:
        raise ValueError(f"Color `theme` '{theme}' is not allowed. Must be either 'light' or 'dark'")

    # Each of these is a server round trip, so fetch them once.
    labels = self.node_labels()
    total_nodes = self.node_count()

    visual_graph = self._name
    sampled = False
    temp_size_property = False
    try:
        if total_nodes > node_count:
            # Sample into a temporary, uniquely named graph that is dropped again below.
            visual_graph = str(uuid4())
            config: Dict[str, Any] = {"samplingRatio": float(node_count) / total_nodes}
            if center_nodes is not None:
                config["startNodes"] = center_nodes

            self._query_runner.call_procedure(
                endpoint="gds.graph.sample.rwr",
                params=CallParameters(graph_name=visual_graph, fromGraphName=self._name, config=config),
                custom_error=False,
            )
            sampled = True

        # Make sure we always have at least a size property so that we can run `gds.graph.nodeProperties.stream`
        if size_property is None:
            size_property = str(uuid4())
            self._query_runner.call_procedure(
                endpoint="gds.pageRank.mutate",
                params=CallParameters(graph_name=visual_graph, config=dict(mutateProperty=size_property)),
                custom_error=False,
            )
            temp_size_property = True

        # Collect the properties to stream; `dict.fromkeys` drops duplicates while keeping order.
        requested = [size_property, *(include_node_properties or [])]
        if color_property is not None:
            requested.append(color_property)
        node_properties = list(dict.fromkeys(requested))

        node_properties_df = self._query_runner.call_procedure(
            endpoint="gds.graph.nodeProperties.stream",
            params=CallParameters(
                graph_name=visual_graph,
                properties=node_properties,
                nodeLabels=labels,
                config=dict(listNodeLabels=True),
            ),
            custom_error=False,
        )

        # new format was requested, but the query was run via Cypher
        if "propertyValue" in node_properties_df.keys():
            wide_result = node_properties_df.pivot(
                index=["nodeId"], columns=["nodeProperty"], values="propertyValue"
            )
            # nodeLabels cannot be an index column of the pivot as its not hashable
            # so we need to manually join it back in
            labels_df = node_properties_df[["nodeId", "nodeLabels"]].set_index("nodeId")
            # The long format has one row per (node, property); keep a single labels row
            # per node so the join does not duplicate nodes.
            labels_df = labels_df[~labels_df.index.duplicated(keep="first")]
            wide_result = wide_result.join(labels_df, on="nodeId")
            node_properties_df = wide_result.reset_index()
            node_properties_df.columns.name = None

        if rel_weight_property is None:
            relationships_df = self._query_runner.call_procedure(
                endpoint="gds.graph.relationships.stream",
                params=CallParameters(graph_name=visual_graph),
                custom_error=False,
            )
        else:
            relationships_df = self._query_runner.call_procedure(
                endpoint="gds.graph.relationshipProperty.stream",
                params=CallParameters(graph_name=visual_graph, properties=rel_weight_property),
                custom_error=False,
            )
    finally:
        # Clean up temporary server-side state even if streaming failed.
        if sampled:
            self._query_runner.call_procedure(
                endpoint="gds.graph.drop",
                params=CallParameters(graph_name=visual_graph),
                custom_error=False,
            )
        elif temp_size_property:
            self._query_runner.call_procedure(
                endpoint="gds.graph.nodeProperties.drop",
                params=CallParameters(graph_name=visual_graph, nodeProperties=size_property),
                custom_error=False,
            )

    # Imported lazily so that pyvis is only required when visualizing.
    from pyvis.network import Network

    dark = theme == "dark"
    net = Network(
        notebook=bool(notebook),
        cdn_resources="remote" if notebook else "local",
        directed=directed,
        bgcolor="#222222" if dark else "#F2F2F2",
        font_color="white" if dark else "black",
        height=f"{px_height}px",
        width="100%",
    )

    # One random color per category: node label by default, else distinct property value.
    if color_property is None:
        categories = labels
    else:
        categories = node_properties_df[color_property].unique()
    color_map = {category: self._random_themed_color(theme) for category in categories}

    # Add all the nodes
    for _, node in node_properties_df.iterrows():
        title = f"Node ID: {node['nodeId']}\nLabels: {node['nodeLabels']}"
        if include_node_properties is not None:
            title += "\nNode properties:"
            for prop in include_node_properties:
                title += f"\n{prop} = {node[prop]}"

        if color_property is None:
            # Nodes with several labels are colored by their first label.
            color = color_map[node["nodeLabels"][0]]
        else:
            color = color_map[node[color_property]]

        net.add_node(
            int(node["nodeId"]),
            value=node[size_property],
            color=color,
            title=title,
        )

    # `gds.graph.relationshipProperty.stream` reports the value in a `propertyValue` column;
    # fall back to a column named after the property in case the runner returns that instead.
    weight_column = None
    if rel_weight_property is not None:
        if "propertyValue" in relationships_df.columns:
            weight_column = "propertyValue"
        else:
            weight_column = rel_weight_property

    # Add all the relationships
    for _, rel in relationships_df.iterrows():
        # Cast to int so that edge endpoints match the node IDs added above.
        source = int(rel["sourceNodeId"])
        target = int(rel["targetNodeId"])
        title = f"Type: {rel['relationshipType']}"
        if weight_column is None:
            net.add_edge(source, target, title=title)
        else:
            weight = rel[weight_column]
            title += f"\n{rel_weight_property} = {weight}"
            net.add_edge(source, target, title=title, value=weight)

    return net.show(f"{self._name}.html")


@staticmethod
def _random_themed_color(theme: str) -> str:
    """
    Pick a fully saturated color with a random hue.

    Args:
        theme: "dark" yields lighter colors (lightness 0.7); anything else yields darker ones (lightness 0.4)

    Returns:
        the color as an uppercase "#RRGGBB" hex string
    """
    lightness = 0.7 if theme == "dark" else 0.4
    red, green, blue = colorsys.hls_to_rgb(random.random(), lightness, 1.0)
    return "#%02X%02X%02X" % (int(red * 255), int(green * 255), int(blue * 255))