diff --git a/Countries_of_the_world/Countries of the world analysis.ipynb b/Countries_of_the_world/Countries of the world analysis.ipynb index daf45b6..e51546b 100644 --- a/Countries_of_the_world/Countries of the world analysis.ipynb +++ b/Countries_of_the_world/Countries of the world analysis.ipynb @@ -1,3253 +1,2159 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Countries of the world\n", - "\n", - "- Updated to GDS 2.0 version\n", - "- Link to original blog post: https://towardsdatascience.com/community-detection-of-the-countries-of-the-world-with-neo4j-graph-data-science-4d3a022f8399" - ], - "metadata": { - "id": "qugkv-nB4gc3" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install neo4j" - ], - "metadata": { - "id": "8-RMR2Nv4fiD", - "outputId": "208d660f-1069-4e80-b541-a3fd340b1d32", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting neo4j\n", - " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", - "\u001b[?25l\r\u001b[K |███▋ | 10 kB 19.4 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 25.4 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 13.4 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 10.3 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 6.1 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 7.2 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 7.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 7.4 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 1.3 MB/s \n", - "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=43c3ddc1dbfa97e8aa3c34730811c26ad1d4e51d904fffe4d9c5c3b838379b45\n", - " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j\n", - "Successfully installed neo4j-4.4.2\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "I recommend you setup [a blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" - ], - "metadata": { - "id": "drvMogZL5Ex6" - } - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "5DeXOj_o25i8" - }, - "outputs": [], - "source": [ - "# Define Neo4j connections\n", - "from neo4j import GraphDatabase\n", - "host = 'bolt://3.235.2.228:7687'\n", - "user = 'neo4j'\n", - "password = 'seats-drunks-carbon'\n", - "driver = GraphDatabase.driver(host,auth=(user, password))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "iSosOYR-25jA" - }, - "outputs": [], - "source": [ - "def drop_graph(name):\n", - " with driver.session() as session:\n", - " drop_graph_query = \"\"\"\n", - " CALL gds.graph.drop('{}');\n", - " \"\"\".format(name)\n", - " session.run(drop_graph_query)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "9ekQ13NM25jA" - }, - "outputs": [], - "source": [ - "# Import libraries\n", - "import pandas as pd\n", - "\n", - "def read_query(query, params={}):\n", - " with driver.session() as session:\n", - " result = session.run(query, params)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UKpLZvAj25jB" - }, - "source": [ - "### Graph schema\n", - "We will be using the Countries of the world dataset made available on 
Kaggle by Fernando Lasso. Looking at the acknowledgments, the data originates from the CIA's World Factbook. Unfortunately, the contributor did not provide the year the dataset was compiled. My guess is the year 2013, but I might be wrong. The dataset contains various metrics such as area size, population, infant mortality, and more about 227 countries of the world." - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qugkv-nB4gc3" + }, + "source": [ + "# Countries of the world\n", + "\n", + "- Updated to GDS 2.3 version and Neo4j v5\n", + "- Link to original blog post: https://towardsdatascience.com/community-detection-of-the-countries-of-the-world-with-neo4j-graph-data-science-4d3a022f8399" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "8-RMR2Nv4fiD", + "outputId": "208d660f-1069-4e80-b541-a3fd340b1d32" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "PyDKm5dR25jD" - }, - "source": [ - "### Graph import\n", - "\n", - "For some reason, the numbers in the CSV file use a comma as a floating point instead of a dot (0,1 instead of 0.1). We need to preprocess the data to be able to cast the numbers to float in Neo4j. With the help of an APOC procedure apoc.cypher.run, we can preprocess and store the data in a single cypher query. apoc.cypher.run allows us to run independent subqueries within the main cypher query and is excellent for various use cases." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[?25l\r", + "\u001b[K |███▋ | 10 kB 19.4 MB/s eta 0:00:01\r", + "\u001b[K |███████▎ | 20 kB 25.4 MB/s eta 0:00:01\r", + "\u001b[K |███████████ | 30 kB 13.4 MB/s eta 0:00:01\r", + "\u001b[K |██████████████▋ | 40 kB 10.3 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████▎ | 51 kB 6.1 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████████ | 61 kB 7.2 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████▋ | 71 kB 7.7 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████████▎ | 81 kB 7.4 MB/s eta 0:00:01\r", + "\u001b[K |████████████████████████████████| 89 kB 1.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=43c3ddc1dbfa97e8aa3c34730811c26ad1d4e51d904fffe4d9c5c3b838379b45\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j\n", + "Successfully installed neo4j-4.4.2\n" + ] + } + ], + "source": [ + "!pip install neo4j" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "drvMogZL5Ex6" + }, + "source": [ + "I recommend you setup [a blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "5DeXOj_o25i8" + }, + "outputs": [], + "source": [ + "# Define Neo4j connections\n", + "from neo4j import GraphDatabase\n", + "host = 'bolt://3.231.25.240:7687'\n", + "user = 'neo4j'\n", + "password = 'hatchets-visitor-axes'\n", 
+ "driver = GraphDatabase.driver(host,auth=(user, password))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "iSosOYR-25jA" + }, + "outputs": [], + "source": [ + "def drop_graph(name):\n", + " with driver.session() as session:\n", + " drop_graph_query = \"\"\"\n", + " CALL gds.graph.drop('{}');\n", + " \"\"\".format(name)\n", + " session.run(drop_graph_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "9ekQ13NM25jA" + }, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def read_query(query, params={}):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UKpLZvAj25jB" + }, + "source": [ + "### Graph schema\n", + "We will be using the Countries of the world dataset made available on Kaggle by Fernando Lasso. Looking at the acknowledgments, the data originates from the CIA's World Factbook. Unfortunately, the contributor did not provide the year the dataset was compiled. My guess is the year 2013, but I might be wrong. The dataset contains various metrics such as area size, population, infant mortality, and more about 227 countries of the world." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PyDKm5dR25jD" + }, + "source": [ + "### Graph import\n", + "\n", + "For some reason, the numbers in the CSV file use a comma as the decimal separator instead of a dot (0,1 instead of 0.1). We need to preprocess the data to be able to cast the numbers to float in Neo4j. With the help of an APOC procedure apoc.cypher.run, we can preprocess and store the data in a single cypher query. apoc.cypher.run allows us to run independent subqueries within the main cypher query and is excellent for various use cases." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "z93mpqXv25jD", + "outputId": "6ce3ebf0-aa95-4c10-d87d-690a3362c017" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "z93mpqXv25jD", - "outputId": "6ce3ebf0-aa95-4c10-d87d-690a3362c017", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "import_query = \"\"\"\n", - "\n", - "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/countries_of_the_world.csv\" as row\n", - "// cleanup the data and replace comma floating point with a dot\n", - "CALL apoc.cypher.run(\n", - " \"UNWIND keys($row) as key \n", - " WITH row,\n", - " key,\n", - " toFloat(replace(row[key],',','.')) as clean_value\n", - " // exclude string properties\n", - " WHERE NOT key in ['Country','Region'] \n", - " RETURN collect([key,clean_value]) as keys\", \n", - " {row:row}) YIELD value\n", - "MERGE (c:Country{name:trim(row.Country)})\n", - "SET c+= apoc.map.fromPairs(value.keys)\n", - "MERGE (r:Region{name:trim(row.Region)})\n", - "MERGE (c)-[:PART_OF]->(r)\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(import_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3J7I1k7b25jF" - }, - "source": [ - "### Identify missing values\n", - "Another useful APOC procedure is apoc.meta.nodeTypeProperties. With it, we can examine the node property schema of the graph. We will use it to identify how many missing values each feature of the country has." 
+ "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import_query = \"\"\"\n", + "\n", + "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/countries_of_the_world.csv\" as row\n", + "// cleanup the data and replace comma floating point with a dot\n", + "CALL apoc.cypher.run(\n", + " \"UNWIND keys($row) as key \n", + " WITH row,\n", + " key,\n", + " toFloat(replace(row[key],',','.')) as clean_value\n", + " // exclude string properties\n", + " WHERE NOT key in ['Country','Region'] \n", + " RETURN collect([key,clean_value]) as keys\", \n", + " {row:row}) YIELD value\n", + "MERGE (c:Country{name:trim(row.Country)})\n", + "SET c+= apoc.map.fromPairs(value.keys)\n", + "MERGE (r:Region{name:trim(row.Region)})\n", + "MERGE (c)-[:PART_OF]->(r)\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(import_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3J7I1k7b25jF" + }, + "source": [ + "### Identify missing values\n", + "Another useful APOC procedure is apoc.meta.nodeTypeProperties. With it, we can examine the node property schema of the graph. We will use it to identify how many missing values each feature of the country has." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "c4pDpXio25jF", + "outputId": "9b17ec6d-3e34-4f98-fd6a-de7a4141e702" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "c4pDpXio25jF", - "outputId": "9b17ec6d-3e34-4f98-fd6a-de7a4141e702", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " propertyName missing_value pct_missing_value\n", - "0 Climate 22 0.096916\n", - "1 Literacy (%) 18 0.079295\n", - "2 Industry 16 0.070485\n", - "3 Agriculture 15 0.066079\n", - "4 Service 15 0.066079\n", - "5 Phones (per 1000) 4 0.017621\n", - "6 Deathrate 4 0.017621\n", - "7 Net migration 3 0.013216\n", - "8 Infant mortality (per 1000 births) 3 0.013216\n", - "9 Birthrate 3 0.013216" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
propertyNamemissing_valuepct_missing_value
0Climate220.096916
1Literacy (%)180.079295
2Industry160.070485
3Agriculture150.066079
4Service150.066079
5Phones (per 1000)40.017621
6Deathrate40.017621
7Net migration30.013216
8Infant mortality (per 1000 births)30.013216
9Birthrate30.013216
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
propertyNamemissing_valuepct_missing_value
0Climate220.096916
1Literacy (%)180.079295
2Industry160.070485
3Agriculture150.066079
4Service150.066079
5Phones (per 1000)40.017621
6Deathrate40.017621
7Net migration30.013216
8Infant mortality (per 1000 births)30.013216
9Birthrate30.013216
\n", + "
" ], - "source": [ - "identify_missing_values_query = \"\"\"\n", - "\n", - "// Only look at properties of nodes labeled \"Country\"\n", - "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", - "YIELD propertyName, propertyObservations, totalObservations\n", - "RETURN propertyName,\n", - " (totalObservations - propertyObservations) as missing_value,\n", - " (totalObservations - propertyObservations) / toFloat(totalObservations) as pct_missing_value\n", - "ORDER BY pct_missing_value DESC LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(identify_missing_values_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V858FLaB25jG" - }, - "source": [ - "It looks like we don't have many missing values. However, we will disregard features with more than four missing values from our further analysis for the sake of simplicity.\n", - "### High correlation filter\n", - "High correlation filter is a simple data dimensionality reduction technique. Features with high correlation are likely to carry similar information and are more linearly dependant. Using multiple features with related information can bring down the performance of various models and can be avoided by dropping one of the two correlating features." 
+ "text/plain": [ + " propertyName missing_value pct_missing_value\n", + "0 Climate 22 0.096916\n", + "1 Literacy (%) 18 0.079295\n", + "2 Industry 16 0.070485\n", + "3 Agriculture 15 0.066079\n", + "4 Service 15 0.066079\n", + "5 Phones (per 1000) 4 0.017621\n", + "6 Deathrate 4 0.017621\n", + "7 Net migration 3 0.013216\n", + "8 Infant mortality (per 1000 births) 3 0.013216\n", + "9 Birthrate 3 0.013216" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_missing_values_query = \"\"\"\n", + "\n", + "// Only look at properties of nodes labeled \"Country\"\n", + "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", + "YIELD propertyName, propertyObservations, totalObservations\n", + "RETURN propertyName,\n", + " (totalObservations - propertyObservations) as missing_value,\n", + " (totalObservations - propertyObservations) / toFloat(totalObservations) as pct_missing_value\n", + "ORDER BY pct_missing_value DESC LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(identify_missing_values_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V858FLaB25jG" + }, + "source": [ + "It looks like we don't have many missing values. However, we will disregard features with more than four missing values from our further analysis for the sake of simplicity.\n", + "### High correlation filter\n", + "High correlation filter is a simple data dimensionality reduction technique. Features with high correlation are likely to carry similar information and are more linearly dependent. Using multiple features with related information can bring down the performance of various models and can be avoided by dropping one of the two correlating features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "avNWFhF925jH", + "outputId": "b7c88d8a-9f54-4884-c95a-4b73cb6c21a2" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "avNWFhF925jH", - "outputId": "b7c88d8a-9f54-4884-c95a-4b73cb6c21a2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " feature compare_feature \\\n", - "0 Birthrate Infant mortality (per 1000 births) \n", - "1 GDP ($ per capita) Phones (per 1000) \n", - "2 Deathrate Infant mortality (per 1000 births) \n", - "3 Area (sq. mi.) Population \n", - "4 Birthrate Deathrate \n", - "5 GDP ($ per capita) Net migration \n", - "6 Coastline (coast/area ratio) Crops (%) \n", - "7 Phones (per 1000) Pop. Density (per sq. mi.) \n", - "8 Coastline (coast/area ratio) Pop. Density (per sq. mi.) \n", - "9 Net migration Phones (per 1000) \n", - "\n", - " correlation \n", - "0 0.841210 \n", - "1 0.828151 \n", - "2 0.661350 \n", - "3 0.469985 \n", - "4 0.420948 \n", - "5 0.381256 \n", - "6 0.338594 \n", - "7 0.280954 \n", - "8 0.241690 \n", - "9 0.236930 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featurecompare_featurecorrelation
0BirthrateInfant mortality (per 1000 births)0.841210
1GDP ($ per capita)Phones (per 1000)0.828151
2DeathrateInfant mortality (per 1000 births)0.661350
3Area (sq. mi.)Population0.469985
4BirthrateDeathrate0.420948
5GDP ($ per capita)Net migration0.381256
6Coastline (coast/area ratio)Crops (%)0.338594
7Phones (per 1000)Pop. Density (per sq. mi.)0.280954
8Coastline (coast/area ratio)Pop. Density (per sq. mi.)0.241690
9Net migrationPhones (per 1000)0.236930
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 8 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featurecompare_featurecorrelation
0BirthrateInfant mortality (per 1000 births)0.841210
1GDP ($ per capita)Phones (per 1000)0.828151
2DeathrateInfant mortality (per 1000 births)0.661350
3Area (sq. mi.)Population0.469985
4BirthrateDeathrate0.420948
5GDP ($ per capita)Net migration0.381256
6Coastline (coast/area ratio)Crops (%)0.338594
7Phones (per 1000)Pop. Density (per sq. mi.)0.280954
8Coastline (coast/area ratio)Pop. Density (per sq. mi.)0.241690
9Net migrationPhones (per 1000)0.236930
\n", + "
" ], - "source": [ - "high_correlation_query = \"\"\"\n", - "\n", - "// Only look at properties of nodes labeled \"Country\"\n", - "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", - "YIELD propertyName, propertyObservations, totalObservations\n", - "WITH propertyName,\n", - " (totalObservations - propertyObservations) as missing_value\n", - "// filter our features with more than 5 missing values\n", - "WHERE missing_value < 5 AND propertyName <> 'name'\n", - "WITH collect(propertyName) as features\n", - "MATCH (c:Country)\n", - "UNWIND features as feature\n", - "UNWIND features as compare_feature\n", - "WITH feature,\n", - " compare_feature,\n", - " collect(coalesce(c[feature],0)) as vector_1,\n", - " collect(coalesce(c[compare_feature],0)) as vector_2\n", - "// avoid comparing with a feature with itself\n", - "WHERE feature < compare_feature\n", - "RETURN feature,\n", - " compare_feature,\n", - " gds.similarity.pearson(vector_1, vector_2) AS correlation\n", - "ORDER BY correlation DESC LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(high_correlation_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KqcIxTJP25jI" - }, - "source": [ - "Interesting to see that birth rate and infant mortality are very correlated. The death rate is also quite correlated with infant mortality, so we will drop the birth and death rate but keep the infant mortality. The number of phones and net migration seems to be correlated with the GDP. We will drop them both as well and keep the GDP. We will also cut the population and retain both the area and population density, which carry similar information.\n", - "### Feature statistics\n", - "At this point, we are left with eight features. We will examine their distributions with the apoc.agg.statistics function. It calculates numeric statistics such as minimum, maximum, and percentile ranks for a collection of values." 
+ "text/plain": [ + " feature compare_feature \\\n", + "0 Birthrate Infant mortality (per 1000 births) \n", + "1 GDP ($ per capita) Phones (per 1000) \n", + "2 Deathrate Infant mortality (per 1000 births) \n", + "3 Area (sq. mi.) Population \n", + "4 Birthrate Deathrate \n", + "5 GDP ($ per capita) Net migration \n", + "6 Coastline (coast/area ratio) Crops (%) \n", + "7 Phones (per 1000) Pop. Density (per sq. mi.) \n", + "8 Coastline (coast/area ratio) Pop. Density (per sq. mi.) \n", + "9 Net migration Phones (per 1000) \n", + "\n", + " correlation \n", + "0 0.841210 \n", + "1 0.828151 \n", + "2 0.661350 \n", + "3 0.469985 \n", + "4 0.420948 \n", + "5 0.381256 \n", + "6 0.338594 \n", + "7 0.280954 \n", + "8 0.241690 \n", + "9 0.236930 " ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_correlation_query = \"\"\"\n", + "\n", + "// Only look at properties of nodes labeled \"Country\"\n", + "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", + "YIELD propertyName, propertyObservations, totalObservations\n", + "WITH propertyName,\n", + " (totalObservations - propertyObservations) as missing_value\n", + "// filter our features with more than 5 missing values\n", + "WHERE missing_value < 5 AND propertyName <> 'name'\n", + "WITH collect(propertyName) as features\n", + "MATCH (c:Country)\n", + "UNWIND features as feature\n", + "UNWIND features as compare_feature\n", + "WITH feature,\n", + " compare_feature,\n", + " collect(coalesce(c[feature],0)) as vector_1,\n", + " collect(coalesce(c[compare_feature],0)) as vector_2\n", + "// avoid comparing with a feature with itself\n", + "WHERE feature < compare_feature\n", + "RETURN feature,\n", + " compare_feature,\n", + " gds.similarity.pearson(vector_1, vector_2) AS correlation\n", + "ORDER BY correlation DESC LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(high_correlation_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"KqcIxTJP25jI" + }, + "source": [ + "Interesting to see that birth rate and infant mortality are very correlated. The death rate is also quite correlated with infant mortality, so we will drop the birth and death rate but keep the infant mortality. The number of phones and net migration seems to be correlated with the GDP. We will drop them both as well and keep the GDP. We will also cut the population and retain both the area and population density, which carry similar information.\n", + "### Feature statistics\n", + "At this point, we are left with eight features. We will examine their distributions with the apoc.agg.statistics function. It calculates numeric statistics such as minimum, maximum, and percentile ranks for a collection of values." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "SfothkMb25jI", + "outputId": "62b7fd71-ae25-4aab-95a6-ef9336139876" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "SfothkMb25jI", - "outputId": "62b7fd71-ae25-4aab-95a6-ef9336139876", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " potential_feature min max mean \\\n", - "0 Other (%) 33.33 100.00 81.64 \n", - "1 Arable (%) 0.00 62.11 13.80 \n", - "2 Crops (%) 0.00 50.68 4.56 \n", - "3 Coastline (coast/area ratio) 0.00 870.66 21.17 \n", - "4 Infant mortality (per 1000 births) 2.29 191.19 35.51 \n", - "5 Pop. Density (per sq. mi.) 0.00 16271.50 379.05 \n", - "6 GDP ($ per capita) 500.00 55100.00 9689.85 \n", - "7 Area (sq. mi.) 
2.00 17075200.00 598227.59 \n", - "\n", - " stdev p50 p75 p95 p99 \n", - "0 16.10 85.70 95.44 99.81 100.00 \n", - "1 13.01 10.42 20.00 40.54 55.30 \n", - "2 8.34 1.03 4.44 20.00 45.71 \n", - "3 72.13 0.73 10.32 92.31 310.69 \n", - "4 35.31 20.97 55.51 103.32 143.64 \n", - "5 1656.53 78.80 188.50 838.60 6482.22 \n", - "6 10026.91 5500.03 15700.06 29600.12 37800.25 \n", - "7 1786336.93 86600.50 437074.00 2345424.00 9631424.00 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
potential_featureminmaxmeanstdevp50p75p95p99
0Other (%)33.33100.0081.6416.1085.7095.4499.81100.00
1Arable (%)0.0062.1113.8013.0110.4220.0040.5455.30
2Crops (%)0.0050.684.568.341.034.4420.0045.71
3Coastline (coast/area ratio)0.00870.6621.1772.130.7310.3292.31310.69
4Infant mortality (per 1000 births)2.29191.1935.5135.3120.9755.51103.32143.64
5Pop. Density (per sq. mi.)0.0016271.50379.051656.5378.80188.50838.606482.22
6GDP ($ per capita)500.0055100.009689.8510026.915500.0315700.0629600.1237800.25
7Area (sq. mi.)2.0017075200.00598227.591786336.9386600.50437074.002345424.009631424.00
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 9 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
potential_featureminmaxmeanstdevp50p75p95p99
0Other (%)33.33100.0081.6416.1085.7095.4499.81100.00
1Crops (%)0.0050.684.568.341.034.4420.0045.71
2Arable (%)0.0062.1113.8013.0110.4220.0040.5455.30
3Coastline (coast/area ratio)0.00870.6621.1772.130.7310.3292.31310.69
4Infant mortality (per 1000 births)2.29191.1935.5135.3120.9755.51103.32143.64
5Pop. Density (per sq. mi.)0.0016271.50379.051656.5378.80188.50838.606482.22
6GDP ($ per capita)500.0055100.009689.8510026.915500.0315700.0629600.1237800.25
7Area (sq. mi.)2.0017075200.00598227.591786336.9386600.50437074.002345424.009631424.00
\n", + "
" ], - "source": [ - "feature_stats_query = \"\"\"\n", - "\n", - "// define excluded features\n", - "WITH ['name', \n", - " 'Deathrate', \n", - " 'Birthrate',\n", - " 'Phones (per 1000)',\n", - " 'Net migration', \n", - " 'Population'] as excluded_features\n", - "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", - "YIELD propertyName, propertyObservations, totalObservations\n", - "WITH propertyName,\n", - " (totalObservations - propertyObservations) as missing_value\n", - "WHERE missing_value < 5 AND \n", - " NOT propertyName in excluded_features\n", - "// Reduce to a single row\n", - "WITH collect(propertyName) as potential_features\n", - "MATCH (c:Country)\n", - "UNWIND potential_features as potential_feature\n", - "WITH potential_feature, \n", - " apoc.agg.statistics(c[potential_feature],\n", - " [0.5,0.75,0.9,0.95,0.99]) as stats\n", - "RETURN potential_feature, \n", - " apoc.math.round(stats.min,2) as min, \n", - " apoc.math.round(stats.max,2) as max, \n", - " apoc.math.round(stats.mean,2) as mean, \n", - " apoc.math.round(stats.stdev,2) as stdev,\n", - " apoc.math.round(stats.`0.5`,2) as p50,\n", - " apoc.math.round(stats.`0.75`,2) as p75,\n", - " apoc.math.round(stats.`0.95`,2) as p95,\n", - " apoc.math.round(stats.`0.99`,2) as p99\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(feature_stats_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IPbg0N1f25jJ" - }, - "source": [ - "The Federated state of Micronesia has the ratio of coast to area at 870, which is pretty impressive. On the other hand, there are a total of 44 countries in the world with zero coastlines. Another fun fact is that Greenland has a population density rounded to 0 per square mile with its 56361 inhabitants and 2166086 square miles. It might be a cool place to perform social distancing.\n", - "We can observe that most of the features appear to be descriptive, except for the Other (%), which is mostly between 80 and 100. 
Due to the low variance, we will ignore it in our further analysis.\n", - "### Populate the missing values\n", - "We are left with seven features that we are going to use to infer a similarity network between countries. One thing we need to do before that is to populate the missing values. We will use a simple method and fill in the missing values of the features with the average value of the region the country is part of." + "text/plain": [ + " potential_feature min max mean \\\n", + "0 Other (%) 33.33 100.00 81.64 \n", + "1 Crops (%) 0.00 50.68 4.56 \n", + "2 Arable (%) 0.00 62.11 13.80 \n", + "3 Coastline (coast/area ratio) 0.00 870.66 21.17 \n", + "4 Infant mortality (per 1000 births) 2.29 191.19 35.51 \n", + "5 Pop. Density (per sq. mi.) 0.00 16271.50 379.05 \n", + "6 GDP ($ per capita) 500.00 55100.00 9689.85 \n", + "7 Area (sq. mi.) 2.00 17075200.00 598227.59 \n", + "\n", + " stdev p50 p75 p95 p99 \n", + "0 16.10 85.70 95.44 99.81 100.00 \n", + "1 8.34 1.03 4.44 20.00 45.71 \n", + "2 13.01 10.42 20.00 40.54 55.30 \n", + "3 72.13 0.73 10.32 92.31 310.69 \n", + "4 35.31 20.97 55.51 103.32 143.64 \n", + "5 1656.53 78.80 188.50 838.60 6482.22 \n", + "6 10026.91 5500.03 15700.06 29600.12 37800.25 \n", + "7 1786336.93 86600.50 437074.00 2345424.00 9631424.00 " ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_stats_query = \"\"\"\n", + "\n", + "// define excluded features\n", + "WITH ['name', \n", + " 'Deathrate', \n", + " 'Birthrate',\n", + " 'Phones (per 1000)',\n", + " 'Net migration', \n", + " 'Population'] as excluded_features\n", + "CALL apoc.meta.nodeTypeProperties({labels:['Country']})\n", + "YIELD propertyName, propertyObservations, totalObservations\n", + "WITH propertyName,\n", + " (totalObservations - propertyObservations) as missing_value\n", + "WHERE missing_value < 5 AND \n", + " NOT propertyName in excluded_features\n", + "// Reduce to a single row\n", + "WITH collect(propertyName) 
as potential_features\n", + "MATCH (c:Country)\n", + "UNWIND potential_features as potential_feature\n", + "WITH potential_feature, \n", + " apoc.agg.statistics(c[potential_feature],\n", + " [0.5,0.75,0.9,0.95,0.99]) as stats\n", + "RETURN potential_feature, \n", + " round(stats.min,2) as min, \n", + " round(stats.max,2) as max, \n", + " round(stats.mean,2) as mean, \n", + " round(stats.stdev,2) as stdev,\n", + " round(stats.`0.5`,2) as p50,\n", + " round(stats.`0.75`,2) as p75,\n", + " round(stats.`0.95`,2) as p95,\n", + " round(stats.`0.99`,2) as p99\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(feature_stats_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IPbg0N1f25jJ" + }, + "source": [ + "The Federated States of Micronesia has the ratio of coast to area at 870, which is pretty impressive. On the other hand, there are a total of 44 countries in the world with zero coastlines. Another fun fact is that Greenland has a population density rounded to 0 per square mile with its 56361 inhabitants and 2166086 square miles. It might be a cool place to perform social distancing.\n", + "We can observe that most of the features appear to be descriptive, except for the Other (%), which is mostly between 80 and 100. Due to the low variance, we will ignore it in our further analysis.\n", + "### Populate the missing values\n", + "We are left with seven features that we are going to use to infer a similarity network between countries. One thing we need to do before that is to populate the missing values. We will use a simple method and fill in the missing values of the features with the average value of the region the country is part of."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "b3r7_G9925jJ", + "outputId": "22ba09d6-ff26-40a0-c4c2-5d4b99aade6e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "b3r7_G9925jJ", - "outputId": "22ba09d6-ff26-40a0-c4c2-5d4b99aade6e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " 'missing values populated'\n", - "0 missing values populated" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
'missing values populated'
0missing values populated
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 10 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
'missing values populated'
0missing values populated
\n", + "
" ], - "source": [ - "populate_missing_values = \"\"\"\n", - "\n", - "UNWIND [\"Arable (%)\",\n", - " \"Crops (%)\",\n", - " \"Infant mortality (per 1000 births)\",\n", - " \"GDP ($ per capita)\"] as feature\n", - "MATCH (c:Country)\n", - "WHERE c[feature] IS null\n", - "MATCH (c)-[:PART_OF]->(r:Region)<-[:PART_OF]-(other:Country)\n", - "WHERE other[feature] IS NOT null\n", - "WITH c,feature,avg(other[feature]) as avg_value\n", - "CALL apoc.create.setProperty(c, feature, avg_value) \n", - "YIELD node\n", - "RETURN distinct 'missing values populated'\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(populate_missing_values)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Q9lmKAbQ25jL" - }, - "source": [ - "### Graph data science library\n", - "With Neo4j's Graph Data Science library, we can run more than 50 different graph algorithms directly in Neo4j. Algorithms are exposed as cypher procedures, similar to the APOC procedures we've seen above.\n", - "GDS uses a projection of the stored graph, that is entirely in-memory to achieve faster execution times." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R4GNE06m25jK" - }, - "source": [ - "### Similarity network with cosine similarity\n", - "First, we much project the in-memory graph with GDS 2.0\n" + "text/plain": [ + " 'missing values populated'\n", + "0 missing values populated" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "populate_missing_values = \"\"\"\n", + "\n", + "UNWIND [\"Arable (%)\",\n", + " \"Crops (%)\",\n", + " \"Infant mortality (per 1000 births)\",\n", + " \"GDP ($ per capita)\"] as feature\n", + "MATCH (c:Country)\n", + "WHERE c[feature] IS null\n", + "MATCH (c)-[:PART_OF]->(r:Region)<-[:PART_OF]-(other:Country)\n", + "WHERE other[feature] IS NOT null\n", + "WITH c,feature,avg(other[feature]) as avg_value\n", + "CALL apoc.create.setProperty(c, feature, avg_value) \n", + "YIELD node\n", + "RETURN distinct 'missing values populated'\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(populate_missing_values)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q9lmKAbQ25jL" + }, + "source": [ + "### Graph data science library\n", + "With Neo4j's Graph Data Science library, we can run more than 50 different graph algorithms directly in Neo4j. Algorithms are exposed as cypher procedures, similar to the APOC procedures we've seen above.\n", + "GDS uses a projection of the stored graph, that is entirely in-memory to achieve faster execution times." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R4GNE06m25jK" + }, + "source": [ + "### Similarity network with cosine similarity\n", + "First, we must project the in-memory graph with GDS 2.0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "PvPiKn2y6LrG", + "outputId": "455d5d9f-2720-493e-bcd7-2b08ad7a803f" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "project_graph_query = \"\"\"\n", - "CALL gds.graph.project('countries', 'Country', '*', \n", - " {nodeProperties:['Arable (%)', 'Crops (%)', 'Infant mortality (per 1000 births)', 'GDP ($ per capita)',\n", - " 'Coastline (coast/area ratio)', 'Pop. Density (per sq. mi.)', 'Area (sq. mi.)']})\n", - "\"\"\"\n", - "\n", - "read_query(project_graph_query)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Country': {'label': 'Country', 'properties':...{'__ALL__': {'orientation': 'NATURAL', 'indexI...countries227011592
\n", + "
" ], - "metadata": { - "id": "PvPiKn2y6LrG", - "outputId": "455d5d9f-2720-493e-bcd7-2b08ad7a803f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 13, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'Country': {'label': 'Country', 'properties':... \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'__ALL__': {'orientation': 'NATURAL', 'aggreg... countries 227 \n", - "\n", - " relationshipCount projectMillis \n", - "0 0 323 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Country': {'label': 'Country', 'properties':...{'__ALL__': {'orientation': 'NATURAL', 'aggreg...countries2270323
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 13 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dPQ2byIo25jJ" - }, - "source": [ - "### MinMax normalization\n", - "Last but not least, we have to normalize our features to prevent any single feature dominating over others due to a larger scale. We will use the simple MinMax method of normalization to rescale features between 0 and 1." + "text/plain": [ + " nodeProjection \\\n", + "0 {'Country': {'label': 'Country', 'properties':... \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'__ALL__': {'orientation': 'NATURAL', 'indexI... countries 227 \n", + "\n", + " relationshipCount projectMillis \n", + "0 0 11592 " ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project_graph_query = \"\"\"\n", + "CALL gds.graph.project('countries', 'Country', '*', \n", + " {nodeProperties:['Arable (%)', 'Crops (%)', 'Infant mortality (per 1000 births)', 'GDP ($ per capita)',\n", + " 'Coastline (coast/area ratio)', 'Pop. Density (per sq. mi.)', 'Area (sq. mi.)']})\n", + "\"\"\"\n", + "\n", + "read_query(project_graph_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dPQ2byIo25jJ" + }, + "source": [ + "### MinMax normalization\n", + "Last but not least, we have to normalize our features to prevent any single feature dominating over others due to a larger scale. We will use the simple MinMax method of normalization to rescale features between 0 and 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "A3cTmugM6_v1", + "outputId": "36c8ea12-2d03-4e34-c5db-eb21d5f52deb" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "minmax_normalization_query = \"\"\"\n", - "CALL gds.alpha.scaleProperties.mutate('countries', {\n", - " nodeProperties:['Arable (%)', 'Crops (%)', 'Infant mortality (per 1000 births)', 'GDP ($ per capita)',\n", - " 'Coastline (coast/area ratio)', 'Pop. Density (per sq. mi.)', 'Area (sq. mi.)'],\n", - " scaler: 'MINMAX',\n", - " mutateProperty: 'countryFeatures'\n", - "}) YIELD nodePropertiesWritten\n", - "\"\"\"\n", - "\n", - "read_query(minmax_normalization_query)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodePropertiesWritten
0227
\n", + "
" ], - "metadata": { - "id": "A3cTmugM6_v1", - "outputId": "36c8ea12-2d03-4e34-c5db-eb21d5f52deb", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodePropertiesWritten\n", - "0 227" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodePropertiesWritten
0227
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 16 - } + "text/plain": [ + " nodePropertiesWritten\n", + "0 227" ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "minmax_normalization_query = \"\"\"\n", + "CALL gds.alpha.scaleProperties.mutate('countries', {\n", + " nodeProperties:['Arable (%)', 'Crops (%)', 'Infant mortality (per 1000 births)', 'GDP ($ per capita)',\n", + " 'Coastline (coast/area ratio)', 'Pop. Density (per sq. mi.)', 'Area (sq. mi.)'],\n", + " scaler: 'MINMAX',\n", + " mutateProperty: 'countryFeatures'\n", + "}) YIELD nodePropertiesWritten\n", + "\"\"\"\n", + "\n", + "read_query(minmax_normalization_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vqHtYsb06k7d" + }, + "source": [ + "We have finished the data preprocessing and can focus on the data analysis part. The first step of the analysis is to infer a similarity network with the help of the cosine similarity algorithm. We build a vector for each country based on the selected features and compare the cosine similarity between each pair of countries. If the similarity is above the predefined threshold, we store back the results in the form of a relationship between the pair of similar nodes. Defining an optimal threshold is a mix of art and science, and you'll get better with practice. Ideally, you want to infer a sparse graph as community detection algorithms do not perform well on complete or dense graphs. In this example, we will use the similarityCutoff value of 0.8 (range between -1 and 1). Alongside the similarity threshold, we will also use the topK parameter to store only the top 10 similar neighbors. We do this to ensure a sparser graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 }, + "id": "fcLCEt5R25jK", + "outputId": "fcf2c331-a673-4370-9d56-00c953113be3" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "We have finished the data preprocessing and can focus on the data analysis part. The first step of the analysis is to infer a similarity network with the help of the cosine similarity algorithm. We build a vector for each country based on the selected features and compare the cosine similarity between each pair of countries. If the similarity is above the predefined threshold, we store back the results in the form of a relationship between the pair of similar nodes. Defining an optimal threshold is a mix of art and science, and you'll get better with practice. Ideally, you want to infer a sparse graph as community detection algorithms do not perform well on complete or dense graphs. In this example, we will use the similarityCutoff value of 0.8 (range between -1 and 1). Alongside the similarity threshold, we will also use the topK parameter to store only the top 10 similar neighbors. We do this to ensure a sparser graph." - ], - "metadata": { - "id": "vqHtYsb06k7d" - } - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "fcLCEt5R25jK", - "outputId": "fcf2c331-a673-4370-9d56-00c953113be3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " ranIterations nodePairsConsidered didConverge preProcessingMillis \\\n", - "0 7 105956 True 0 \n", - "\n", - " computeMillis mutateMillis postProcessingMillis nodesCompared \\\n", - "0 1084 203 -1 227 \n", - "\n", - " relationshipsWritten similarityDistribution \\\n", - "0 2257 {'p1': 0.8574447631835938, 'max': 0.9999618530... 
\n", - "\n", - " configuration \n", - "0 {'topK': 10, 'maxIterations': 100, 'randomJoin... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ranIterationsnodePairsConsidereddidConvergepreProcessingMilliscomputeMillismutateMillispostProcessingMillisnodesComparedrelationshipsWrittensimilarityDistributionconfiguration
07105956True01084203-12272257{'p1': 0.8574447631835938, 'max': 0.9999618530...{'topK': 10, 'maxIterations': 100, 'randomJoin...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 19 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ranIterationsnodePairsConsidereddidConvergepreProcessingMilliscomputeMillismutateMillispostProcessingMillisnodesComparedrelationshipsWrittensimilarityDistributionconfiguration
0578164True0507216-12272257{'p1': 0.8574447631835938, 'max': 0.9999618530...{'topK': 10, 'maxIterations': 100, 'randomJoin...
\n", + "
" ], - "source": [ - "cosine_similarity_query = \"\"\"\n", - "CALL gds.knn.mutate('countries', \n", - " {similarityCutoff:0.8, topK:10, nodeProperties: {countryFeatures: 'COSINE'},\n", - " mutateRelationshipType: 'SIMILAR', mutateProperty:'score'})\n", - "\"\"\"\n", - "\n", - "read_query(cosine_similarity_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aS3qw8QN25jM" - }, - "source": [ - "### Weakly connected components\n", - "More often than not, we start the graph analysis with the weakly connected components algorithm. It is a community detection algorithm used to find disconnected networks or islands within our graph. As we are only interested in the count of disconnected components, we can run the stats variant of the algorithm." + "text/plain": [ + " ranIterations nodePairsConsidered didConverge preProcessingMillis \\\n", + "0 5 78164 True 0 \n", + "\n", + " computeMillis mutateMillis postProcessingMillis nodesCompared \\\n", + "0 507 216 -1 227 \n", + "\n", + " relationshipsWritten similarityDistribution \\\n", + "0 2257 {'p1': 0.8574447631835938, 'max': 0.9999618530... \n", + "\n", + " configuration \n", + "0 {'topK': 10, 'maxIterations': 100, 'randomJoin... " ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cosine_similarity_query = \"\"\"\n", + "CALL gds.knn.mutate('countries', \n", + " {similarityCutoff:0.8, topK:10, nodeProperties: {countryFeatures: 'COSINE'},\n", + " mutateRelationshipType: 'SIMILAR', mutateProperty:'score'})\n", + "\"\"\"\n", + "\n", + "read_query(cosine_similarity_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aS3qw8QN25jM" + }, + "source": [ + "### Weakly connected components\n", + "More often than not, we start the graph analysis with the weakly connected components algorithm. It is a community detection algorithm used to find disconnected networks or islands within our graph. 
As we are only interested in the count of disconnected components, we can run the stats variant of the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "fMNY-jFF25jM", + "outputId": "679e7ebf-bf3d-4e3f-a7c4-888fb3b5830c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "fMNY-jFF25jM", - "outputId": "679e7ebf-bf3d-4e3f-a7c4-888fb3b5830c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " componentCount min max mean p50 p75 p90\n", - "0 1 227 227 227.0 227 227 227" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
componentCountminmaxmeanp50p75p90
01227227227.0227227227
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 20 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentCountminmaxmeanp50p75p90
01227227227.0227227227
\n", + "
" ], - "source": [ - "wcc_query = \"\"\"\n", - "\n", - "CALL gds.wcc.stats('countries', {relationshipTypes:['SIMILAR']})\n", - "YIELD componentCount, componentDistribution\n", - "RETURN componentCount, \n", - " componentDistribution.min as min,\n", - " componentDistribution.max as max,\n", - " componentDistribution.mean as mean,\n", - " componentDistribution.p50 as p50,\n", - " componentDistribution.p75 as p75,\n", - " componentDistribution.p90 as p90\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(wcc_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DVAmOn4D25jM" - }, - "source": [ - "The algorithm found only a single component within our graph. This is a favorable outcome as disconnected islands can skew the results of various other graph algorithms.\n", - "### Louvain algorithm\n", - "Another community detection algorithm is the Louvain algorithm. In basic terms, densely connected nodes are more likely to form a community. It relies on the modularity optimization to extract communities. The modularity optimization is performed in two steps. The first step involves optimizing the modularity locally. In the second step, it aggregates nodes belonging to the same community into a single node and builds a new network from those aggregated nodes. These two steps are repeated iteratively until a maximum of modularity is attained. A subtle side effect of these iterations is that we can take a look at the community structure at the end of each iteration, hence the Louvain algorithm is regarded as a hierarchical community detection algorithm. To include hierarchical community results, we must set the includeIntermediateCommunities parameter value to true." 
+ "text/plain": [ + " componentCount min max mean p50 p75 p90\n", + "0 1 227 227 227.0 227 227 227" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wcc_query = \"\"\"\n", + "\n", + "CALL gds.wcc.stats('countries', {relationshipTypes:['SIMILAR']})\n", + "YIELD componentCount, componentDistribution\n", + "RETURN componentCount, \n", + " componentDistribution.min as min,\n", + " componentDistribution.max as max,\n", + " componentDistribution.mean as mean,\n", + " componentDistribution.p50 as p50,\n", + " componentDistribution.p75 as p75,\n", + " componentDistribution.p90 as p90\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(wcc_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DVAmOn4D25jM" + }, + "source": [ + "The algorithm found only a single component within our graph. This is a favorable outcome as disconnected islands can skew the results of various other graph algorithms.\n", + "### Louvain algorithm\n", + "Another community detection algorithm is the Louvain algorithm. In basic terms, densely connected nodes are more likely to form a community. It relies on the modularity optimization to extract communities. The modularity optimization is performed in two steps. The first step involves optimizing the modularity locally. In the second step, it aggregates nodes belonging to the same community into a single node and builds a new network from those aggregated nodes. These two steps are repeated iteratively until a maximum of modularity is attained. A subtle side effect of these iterations is that we can take a look at the community structure at the end of each iteration, hence the Louvain algorithm is regarded as a hierarchical community detection algorithm. To include hierarchical community results, we must set the includeIntermediateCommunities parameter value to true." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "8tRYco1v25jN", + "outputId": "c7c471a4-aea0-4634-bef9-6a09f16d9d59" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "8tRYco1v25jN", - "outputId": "c7c471a4-aea0-4634-bef9-6a09f16d9d59", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " ranLevels communityCount modularity \\\n", - "0 2 8 0.734819 \n", - "\n", - " modularities \n", - "0 [0.696204139379107, 0.7348188052372484] " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ranLevelscommunityCountmodularitymodularities
0280.734819[0.696204139379107, 0.7348188052372484]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 22 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ranLevelscommunityCountmodularitymodularities
0280.734601[0.6954974323961155, 0.734601100224988]
\n", + "
" ], - "source": [ - "louvain_algo_query = \"\"\"\n", - "\n", - "CALL gds.louvain.write('countries', \n", - " {maxIterations:20,\n", - " relationshipTypes:['SIMILAR'],\n", - " includeIntermediateCommunities:true,\n", - " writeProperty:'louvain'})\n", - "YIELD ranLevels, communityCount,modularity,modularities\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(louvain_algo_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dAVvwSvr25jN" - }, - "source": [ - "We can observe by the ranLevels value that the Louvain algorithm found two levels of communities in our network. On the final level, it found eight g. We can now examine the extracted communities of the last level and compare their feature averages." + "text/plain": [ + " ranLevels communityCount modularity \\\n", + "0 2 8 0.734601 \n", + "\n", + " modularities \n", + "0 [0.6954974323961155, 0.734601100224988] " ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "louvain_algo_query = \"\"\"\n", + "\n", + "CALL gds.louvain.write('countries', \n", + " {maxIterations:20,\n", + " relationshipTypes:['SIMILAR'],\n", + " includeIntermediateCommunities:true,\n", + " writeProperty:'louvain'})\n", + "YIELD ranLevels, communityCount,modularity,modularities\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(louvain_algo_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dAVvwSvr25jN" + }, + "source": [ + "We can observe by the ranLevels value that the Louvain algorithm found two levels of communities in our network. On the final level, it found eight communities. We can now examine the extracted communities of the last level and compare their feature averages."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "Mb9vtbLa25jN", + "outputId": "7099c934-8b44-4c1c-f0eb-26ea6f919207" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "Mb9vtbLa25jN", - "outputId": "7099c934-8b44-4c1c-f0eb-26ea6f919207", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " community community_size pct_arable pct_crops infant_mortality \\\n", - "0 12 44 5.464773 0.976591 8.682955 \n", - "1 54 25 18.504933 3.923393 8.290000 \n", - "2 43 23 4.273043 0.563913 29.380261 \n", - "3 50 41 31.129268 3.028049 20.186341 \n", - "4 57 26 13.234231 22.198077 23.057591 \n", - "5 30 10 22.116000 10.184000 55.267000 \n", - "6 21 24 14.853575 1.090783 69.105833 \n", - "7 23 34 3.931765 1.443529 91.808529 \n", - "\n", - " gdp coastline population_density area_size \\\n", - "0 22600.000000 42.745682 1283.050000 3.476954e+05 \n", - "1 18768.000000 11.308400 236.540000 2.930858e+05 \n", - "2 8393.913043 0.330000 23.886957 3.101448e+06 \n", - "3 7509.756098 15.717561 194.495122 2.044609e+05 \n", - "4 4465.384615 66.923462 370.726923 3.770538e+04 \n", - "5 2100.000000 2.702000 180.400000 2.983568e+05 \n", - "6 1870.833333 3.340000 100.829167 3.194362e+05 \n", - "7 1435.294118 4.170882 37.926471 6.419174e+05 \n", - "\n", - " example_members \n", - "0 [Andorra, Anguilla, Aruba] \n", - "1 [Argentina, Belgium, British Virgin Is.] \n", - "2 [Algeria, Australia, Belize] \n", - "3 [Albania, Antigua & Barbuda, Armenia] \n", - "4 [American Samoa, Cook Islands, Dominica] \n", - "5 [Burundi, Comoros, Ecuador] \n", - "6 [Azerbaijan, Benin, Burkina Faso] \n", - "7 [Afghanistan, Angola, Bhutan] " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
communitycommunity_sizepct_arablepct_cropsinfant_mortalitygdpcoastlinepopulation_densityarea_sizeexample_members
012445.4647730.9765918.68295522600.00000042.7456821283.0500003.476954e+05[Andorra, Anguilla, Aruba]
1542518.5049333.9233938.29000018768.00000011.308400236.5400002.930858e+05[Argentina, Belgium, British Virgin Is.]
243234.2730430.56391329.3802618393.9130430.33000023.8869573.101448e+06[Algeria, Australia, Belize]
3504131.1292683.02804920.1863417509.75609815.717561194.4951222.044609e+05[Albania, Antigua & Barbuda, Armenia]
4572613.23423122.19807723.0575914465.38461566.923462370.7269233.770538e+04[American Samoa, Cook Islands, Dominica]
5301022.11600010.18400055.2670002100.0000002.702000180.4000002.983568e+05[Burundi, Comoros, Ecuador]
6212414.8535751.09078369.1058331870.8333333.340000100.8291673.194362e+05[Azerbaijan, Benin, Burkina Faso]
723343.9317651.44352991.8085291435.2941184.17088237.9264716.419174e+05[Afghanistan, Angola, Bhutan]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 23 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communitycommunity_sizepct_arablepct_cropsinfant_mortalitygdpcoastlinepopulation_densityarea_sizeexample_members
012465.5206521.4176098.67260922271.73913041.4463041235.8717393.337008e+05[Andorra, Anguilla, Aruba]
1542319.5271013.2976018.27652219091.30434811.173478239.8956523.163263e+05[Argentina, Belgium, British Virgin Is.]
243234.2730430.56391329.3802618393.9130430.33000023.8869573.101448e+06[Algeria, Australia, Belize]
3504131.1292683.02804920.1863417509.75609815.717561194.4951222.044609e+05[Albania, Antigua & Barbuda, Armenia]
4572613.23423122.19807723.0575914465.38461566.923462370.7269233.770538e+04[American Samoa, Cook Islands, Dominica]
5301022.11600010.18400055.2670002100.0000002.702000180.4000002.983568e+05[Burundi, Comoros, Ecuador]
6212414.8535751.09078369.1058331870.8333333.340000100.8291673.194362e+05[Azerbaijan, Benin, Burkina Faso]
723343.9317651.44352991.8085291435.2941184.17088237.9264716.419174e+05[Afghanistan, Angola, Bhutan]
\n", + "
" ], - "source": [ - "final_level_communities =\"\"\"\n", - "\n", - "MATCH (c:Country)\n", - "RETURN c.louvain[-1] as community,\n", - " count(*) as community_size,\n", - " avg(c['Arable (%)']) as pct_arable,\n", - " avg(c['Crops (%)']) as pct_crops, \n", - " avg(c['Infant mortality (per 1000 births)']) as infant_mortality,\n", - " avg(c['GDP ($ per capita)']) as gdp,\n", - " avg(c['Coastline (coast/area ratio)']) as coastline,\n", - " avg(c['Pop. Density (per sq. mi.)']) as population_density,\n", - " avg(c['Area (sq. mi.)']) as area_size,\n", - " collect(c['name'])[..3] as example_members\n", - "ORDER BY gdp DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(final_level_communities)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ioqDNQ2r25jN" - }, - "source": [ - "Louvain algorithm found eight distinct communities within the similarity network. The biggest group has 51 countries as members and has the largest average GDP at almost 22 thousand dollars. They are second in infant mortality and the coastline ratio but lead in population density by a large margin. There are two communities with an average GDP of around 20 thousand dollars, and then we can observe a steep drop to 7000 dollars in third place. With the decline in GDP, we can also find the rise of infant mortality almost linearly. Another fascinating insight is that most of the more impoverished communities have little to no coastline.\n", - "### Find representatives of communities with PageRank\n", - "We can assess the top representatives of the final level communities with the PageRank algorithm. If we assume that each SIMILAR relationship is a vote of similarity between countries, the PageRank algorithm will assign the highest score to the most similar countries within the community. We will execute the PageRank algorithm for each community separately and consider only nodes and relationships within the given community. 
This can be easily achieved with cypher projection without any additional transformations." + "text/plain": [ + " community community_size pct_arable pct_crops infant_mortality \\\n", + "0 12 46 5.520652 1.417609 8.672609 \n", + "1 54 23 19.527101 3.297601 8.276522 \n", + "2 43 23 4.273043 0.563913 29.380261 \n", + "3 50 41 31.129268 3.028049 20.186341 \n", + "4 57 26 13.234231 22.198077 23.057591 \n", + "5 30 10 22.116000 10.184000 55.267000 \n", + "6 21 24 14.853575 1.090783 69.105833 \n", + "7 23 34 3.931765 1.443529 91.808529 \n", + "\n", + " gdp coastline population_density area_size \\\n", + "0 22271.739130 41.446304 1235.871739 3.337008e+05 \n", + "1 19091.304348 11.173478 239.895652 3.163263e+05 \n", + "2 8393.913043 0.330000 23.886957 3.101448e+06 \n", + "3 7509.756098 15.717561 194.495122 2.044609e+05 \n", + "4 4465.384615 66.923462 370.726923 3.770538e+04 \n", + "5 2100.000000 2.702000 180.400000 2.983568e+05 \n", + "6 1870.833333 3.340000 100.829167 3.194362e+05 \n", + "7 1435.294118 4.170882 37.926471 6.419174e+05 \n", + "\n", + " example_members \n", + "0 [Andorra, Anguilla, Aruba] \n", + "1 [Argentina, Belgium, British Virgin Is.] \n", + "2 [Algeria, Australia, Belize] \n", + "3 [Albania, Antigua & Barbuda, Armenia] \n", + "4 [American Samoa, Cook Islands, Dominica] \n", + "5 [Burundi, Comoros, Ecuador] \n", + "6 [Azerbaijan, Benin, Burkina Faso] \n", + "7 [Afghanistan, Angola, Bhutan] " ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_level_communities =\"\"\"\n", + "\n", + "MATCH (c:Country)\n", + "RETURN c.louvain[-1] as community,\n", + " count(*) as community_size,\n", + " avg(c['Arable (%)']) as pct_arable,\n", + " avg(c['Crops (%)']) as pct_crops, \n", + " avg(c['Infant mortality (per 1000 births)']) as infant_mortality,\n", + " avg(c['GDP ($ per capita)']) as gdp,\n", + " avg(c['Coastline (coast/area ratio)']) as coastline,\n", + " avg(c['Pop. Density (per sq. 
mi.)']) as population_density,\n", + " avg(c['Area (sq. mi.)']) as area_size,\n", + " collect(c['name'])[..3] as example_members\n", + "ORDER BY gdp DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(final_level_communities)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ioqDNQ2r25jN" + }, + "source": [ + "The Louvain algorithm found eight distinct communities within the similarity network. The biggest group has 46 countries as members and has the largest average GDP at just over 22 thousand dollars. They have the second-lowest infant mortality and the second-highest coastline ratio, and they lead in population density by a large margin. There are two communities with an average GDP of around 20 thousand dollars, and then we can observe a steep drop to roughly 8,400 dollars in third place. With the decline in GDP, we can also find the rise of infant mortality almost linearly. Another fascinating insight is that most of the more impoverished communities have little to no coastline.\n", + "### Find representatives of communities with PageRank\n", + "We can assess the top representatives of the final level communities with the PageRank algorithm. If we assume that each SIMILAR relationship is a vote of similarity between countries, the PageRank algorithm will assign the highest score to the most similar countries within the community. We will execute the PageRank algorithm for each community separately and consider only nodes and relationships within the given community. This can be easily achieved with Cypher projection without any additional transformations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "s3jx3aLJ25jN", + "outputId": "157be139-2c30-4952-f92d-c018cb42617f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "s3jx3aLJ25jN", - "outputId": "157be139-2c30-4952-f92d-c018cb42617f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " community top_5_representatives\n", - "0 23 [Afghanistan, Angola, Bhutan, Bolivia, Central...\n", - "1 50 [Albania, Antigua & Barbuda, Armenia, Banglade...\n", - "2 43 [Algeria, Australia, Belize, Botswana, Brazil]\n", - "3 57 [American Samoa, Cook Islands, Dominica, Domin...\n", - "4 12 [Andorra, Anguilla, Aruba, Austria, Bahamas, The]\n", - "5 54 [Argentina, Belgium, British Virgin Is., Costa...\n", - "6 21 [Azerbaijan, Benin, Burkina Faso, Burma, Cambo...\n", - "7 30 [Burundi, Comoros, Ecuador, Ghana, Guatemala]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
communitytop_5_representatives
023[Afghanistan, Angola, Bhutan, Bolivia, Central...
150[Albania, Antigua & Barbuda, Armenia, Banglade...
243[Algeria, Australia, Belize, Botswana, Brazil]
357[American Samoa, Cook Islands, Dominica, Domin...
412[Andorra, Anguilla, Aruba, Austria, Bahamas, The]
554[Argentina, Belgium, British Virgin Is., Costa...
621[Azerbaijan, Benin, Burkina Faso, Burma, Cambo...
730[Burundi, Comoros, Ecuador, Ghana, Guatemala]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 36 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communitytop_5_representatives
023[Afghanistan, Angola, Bhutan, Bolivia, Central...
150[Albania, Antigua & Barbuda, Armenia, Banglade...
243[Algeria, Australia, Belize, Botswana, Brazil]
357[American Samoa, Cook Islands, Dominica, Domin...
412[Andorra, Anguilla, Aruba, Austria, Bahamas, The]
554[Argentina, Belgium, British Virgin Is., Eston...
621[Azerbaijan, Benin, Burkina Faso, Burma, Cambo...
730[Burundi, Comoros, Ecuador, Ghana, Guatemala]
\n", + "
" ], - "source": [ - "top_representatives_query = \"\"\"\n", - "\n", - "WITH 'MATCH (c:Country) WHERE c.louvain[-1] = $community \n", - " RETURN id(c) as id' as nodeQuery,\n", - " 'MATCH (s:Country)-[:SIMILAR]-(t:Country) \n", - " RETURN id(s) as source, id(t) as target' as relQuery\n", - "MATCH (c:Country)\n", - "WITH distinct c.louvain[-1] as community, nodeQuery, relQuery\n", - "CALL gds.graph.project.cypher(toString(community), nodeQuery, relQuery, {parameters:{community:community}})\n", - "YIELD nodeCount\n", - "CALL gds.pageRank.stream(toString(community))\n", - "YIELD nodeId, score\n", - "WITH community, nodeId,score\n", - "ORDER BY score DESC\n", - "RETURN community, \n", - " collect(gds.util.asNode(nodeId).name)[..5] as top_5_representatives\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(top_representatives_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8ybMwi_b25jO" - }, - "source": [ - "### Hierarchical communities based on the Louvain algorithm\n", - "We mentioned before that the Louvain algorithm can be used to find hierarchical communities with the includeIntermediateCommunities parameter and that in our example, it found two levels of communities. We will now examine the groups of countries on the first level. A rule of thumb is that communities on a lower level will be more granular and smaller." 
+ "text/plain": [ + " community top_5_representatives\n", + "0 23 [Afghanistan, Angola, Bhutan, Bolivia, Central...\n", + "1 50 [Albania, Antigua & Barbuda, Armenia, Banglade...\n", + "2 43 [Algeria, Australia, Belize, Botswana, Brazil]\n", + "3 57 [American Samoa, Cook Islands, Dominica, Domin...\n", + "4 12 [Andorra, Anguilla, Aruba, Austria, Bahamas, The]\n", + "5 54 [Argentina, Belgium, British Virgin Is., Eston...\n", + "6 21 [Azerbaijan, Benin, Burkina Faso, Burma, Cambo...\n", + "7 30 [Burundi, Comoros, Ecuador, Ghana, Guatemala]" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_representatives_query = \"\"\"\n", + "\n", + "WITH 'MATCH (c:Country) WHERE c.louvain[-1] = $community \n", + " RETURN id(c) as id' as nodeQuery,\n", + " 'MATCH (s:Country)-[:SIMILAR]-(t:Country) \n", + " RETURN id(s) as source, id(t) as target' as relQuery\n", + "MATCH (c:Country)\n", + "WITH distinct c.louvain[-1] as community, nodeQuery, relQuery\n", + "CALL gds.graph.project.cypher(toString(community), nodeQuery, relQuery, {parameters:{community:community}})\n", + "YIELD nodeCount\n", + "CALL gds.pageRank.stream(toString(community))\n", + "YIELD nodeId, score\n", + "WITH community, nodeId,score\n", + "ORDER BY score DESC\n", + "RETURN community, \n", + " collect(gds.util.asNode(nodeId).name)[..5] as top_5_representatives\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(top_representatives_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8ybMwi_b25jO" + }, + "source": [ + "### Hierarchical communities based on the Louvain algorithm\n", + "We mentioned before that the Louvain algorithm can be used to find hierarchical communities with the includeIntermediateCommunities parameter and that in our example, it found two levels of communities. We will now examine the groups of countries on the first level. A rule of thumb is that communities on a lower level will be more granular and smaller." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 520 }, + "id": "qJkNuPyq25jO", + "outputId": "6428afda-471f-4741-a4f7-5ce8a56e1da4" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "id": "qJkNuPyq25jO", - "outputId": "6428afda-471f-4741-a4f7-5ce8a56e1da4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 520 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " community community_size pct_arable pct_crops infant_mortality \\\n", - "0 12 18 11.282222 1.233889 5.994444 \n", - "1 54 14 20.745952 1.208201 6.735714 \n", - "2 38 26 1.437308 0.798462 10.544231 \n", - "3 7 11 15.652727 7.379091 10.268182 \n", - "4 50 12 26.865000 2.245000 8.755000 \n", - "5 43 16 5.606250 0.648750 25.319375 \n", - "6 30 12 38.811667 2.170000 13.029167 \n", - "7 39 7 1.225714 0.370000 38.662286 \n", - "8 57 15 17.153333 14.770000 22.656912 \n", - "9 56 11 7.890000 32.327273 23.603971 \n", - "10 9 17 28.716471 4.186471 33.307647 \n", - "11 44 10 22.116000 10.184000 55.267000 \n", - "12 21 16 17.386613 1.349300 66.530000 \n", - "13 0 8 9.787500 0.573750 74.257500 \n", - "14 23 34 3.931765 1.443529 91.808529 \n", - "\n", - " gdp coastline population_density area_size \\\n", - "0 27027.777778 15.930556 204.155556 6.800796e+05 \n", - "1 21335.714286 9.730714 296.542857 2.993842e+05 \n", - "2 19534.615385 61.310000 2029.976923 1.175833e+05 \n", - "3 15500.000000 13.316364 160.172727 2.850697e+05 \n", - "4 13275.000000 47.305000 268.258333 2.471858e+04 \n", - "5 9562.500000 0.282500 32.443750 4.310791e+06 \n", - "6 7425.000000 4.370000 130.316667 1.547768e+05 \n", - "7 5722.857143 0.438571 4.328571 3.372373e+05 \n", - "8 4800.000000 27.435333 537.733333 4.220207e+04 \n", - "9 4009.090909 120.770909 142.990909 3.157355e+04 \n", - "10 3500.000000 1.430588 187.729412 3.664089e+05 \n", - "11 2100.000000 
2.702000 180.400000 2.983568e+05 \n", - "12 1881.250000 4.710625 121.756250 2.988618e+05 \n", - "13 1850.000000 0.598750 58.975000 3.605852e+05 \n", - "14 1435.294118 4.170882 37.926471 6.419174e+05 \n", - "\n", - " example_members \n", - "0 [Aruba, Austria, Bermuda] \n", - "1 [Argentina, Belgium, Estonia] \n", - "2 [Andorra, Anguilla, Bahamas, The] \n", - "3 [British Virgin Is., Costa Rica, Greece] \n", - "4 [Antigua & Barbuda, Barbados, Croatia] \n", - "5 [Algeria, Australia, Brazil] \n", - "6 [Belarus, Bulgaria, Cuba] \n", - "7 [Belize, Botswana, Gabon] \n", - "8 [American Samoa, Cook Islands, Dominican Repub... \n", - "9 [Dominica, Grenada, Kiribati] \n", - "10 [Albania, Armenia, Bangladesh] \n", - "11 [Burundi, Comoros, Ecuador] \n", - "12 [Azerbaijan, Benin, Burma] \n", - "13 [Burkina Faso, East Timor, Ethiopia] \n", - "14 [Afghanistan, Angola, Bhutan] " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
communitycommunity_sizepct_arablepct_cropsinfant_mortalitygdpcoastlinepopulation_densityarea_sizeexample_members
0121811.2822221.2338895.99444427027.77777815.930556204.1555566.800796e+05[Aruba, Austria, Bermuda]
1541420.7459521.2082016.73571421335.7142869.730714296.5428572.993842e+05[Argentina, Belgium, Estonia]
238261.4373080.79846210.54423119534.61538561.3100002029.9769231.175833e+05[Andorra, Anguilla, Bahamas, The]
371115.6527277.37909110.26818215500.00000013.316364160.1727272.850697e+05[British Virgin Is., Costa Rica, Greece]
4501226.8650002.2450008.75500013275.00000047.305000268.2583332.471858e+04[Antigua & Barbuda, Barbados, Croatia]
543165.6062500.64875025.3193759562.5000000.28250032.4437504.310791e+06[Algeria, Australia, Brazil]
6301238.8116672.17000013.0291677425.0000004.370000130.3166671.547768e+05[Belarus, Bulgaria, Cuba]
73971.2257140.37000038.6622865722.8571430.4385714.3285713.372373e+05[Belize, Botswana, Gabon]
8571517.15333314.77000022.6569124800.00000027.435333537.7333334.220207e+04[American Samoa, Cook Islands, Dominican Repub...
956117.89000032.32727323.6039714009.090909120.770909142.9909093.157355e+04[Dominica, Grenada, Kiribati]
1091728.7164714.18647133.3076473500.0000001.430588187.7294123.664089e+05[Albania, Armenia, Bangladesh]
11441022.11600010.18400055.2670002100.0000002.702000180.4000002.983568e+05[Burundi, Comoros, Ecuador]
12211617.3866131.34930066.5300001881.2500004.710625121.7562502.988618e+05[Azerbaijan, Benin, Burma]
13089.7875000.57375074.2575001850.0000000.59875058.9750003.605852e+05[Burkina Faso, East Timor, Ethiopia]
1423343.9317651.44352991.8085291435.2941184.17088237.9264716.419174e+05[Afghanistan, Angola, Bhutan]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 37 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communitycommunity_sizepct_arablepct_cropsinfant_mortalitygdpcoastlinepopulation_densityarea_sizeexample_members
0122010.8290002.2225006.23950025830.00000015.623500203.5350006.146537e+05[Aruba, Austria, Bermuda]
1541420.7459521.2082016.73571421335.7142869.730714296.5428572.993842e+05[Argentina, Belgium, Estonia]
238261.4373080.79846210.54423119534.61538561.3100002029.9769231.175833e+05[Andorra, Anguilla, Bahamas, The]
37917.6311116.54777810.67333315600.00000013.417778151.7777783.426807e+05[British Virgin Is., Greece, Guadeloupe]
4501226.8650002.2450008.75500013275.00000047.305000268.2583332.471858e+04[Antigua & Barbuda, Barbados, Croatia]
543165.6062500.64875025.3193759562.5000000.28250032.4437504.310791e+06[Algeria, Australia, Brazil]
6301238.8116672.17000013.0291677425.0000004.370000130.3166671.547768e+05[Belarus, Bulgaria, Cuba]
73971.2257140.37000038.6622865722.8571430.4385714.3285713.372373e+05[Belize, Botswana, Gabon]
8571517.15333314.77000022.6569124800.00000027.435333537.7333334.220207e+04[American Samoa, Cook Islands, Dominican Repub...
956117.89000032.32727323.6039714009.090909120.770909142.9909093.157355e+04[Dominica, Grenada, Kiribati]
1091728.7164714.18647133.3076473500.0000001.430588187.7294123.664089e+05[Albania, Armenia, Bangladesh]
11441022.11600010.18400055.2670002100.0000002.702000180.4000002.983568e+05[Burundi, Comoros, Ecuador]
12211617.3866131.34930066.5300001881.2500004.710625121.7562502.988618e+05[Azerbaijan, Benin, Burma]
13089.7875000.57375074.2575001850.0000000.59875058.9750003.605852e+05[Burkina Faso, East Timor, Ethiopia]
1423343.9317651.44352991.8085291435.2941184.17088237.9264716.419174e+05[Afghanistan, Angola, Bhutan]
\n", + "
" ], - "source": [ - "first_level_communities = \"\"\"\n", - "\n", - "MATCH (c:Country)\n", - "RETURN c.louvain[0] as community,\n", - " count(*) as community_size,\n", - " avg(c['Arable (%)']) as pct_arable,\n", - " avg(c['Crops (%)']) as pct_crops, \n", - " avg(c['Infant mortality (per 1000 births)']) as infant_mortality,\n", - " avg(c['GDP ($ per capita)']) as gdp,\n", - " avg(c['Coastline (coast/area ratio)']) as coastline,\n", - " avg(c['Pop. Density (per sq. mi.)']) as population_density,\n", - " avg(c['Area (sq. mi.)']) as area_size,\n", - " collect(c['name'])[..3] as example_members\n", - "ORDER BY gdp DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(first_level_communities)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S20aUzOJ25jO" - }, - "source": [ - "As expected, there are almost twice as many communities on the first level compared to the second and final level. An exciting community formed in second place by the average GDP. It contains only five countries, which are quite tiny as their average area size is only 364 square miles. On the other hand, they have a very high population density of around 10000 people per square mile. Example members are Macau, Monaco, and Hong Kong." 
+ "text/plain": [ + " community community_size pct_arable pct_crops infant_mortality \\\n", + "0 12 20 10.829000 2.222500 6.239500 \n", + "1 54 14 20.745952 1.208201 6.735714 \n", + "2 38 26 1.437308 0.798462 10.544231 \n", + "3 7 9 17.631111 6.547778 10.673333 \n", + "4 50 12 26.865000 2.245000 8.755000 \n", + "5 43 16 5.606250 0.648750 25.319375 \n", + "6 30 12 38.811667 2.170000 13.029167 \n", + "7 39 7 1.225714 0.370000 38.662286 \n", + "8 57 15 17.153333 14.770000 22.656912 \n", + "9 56 11 7.890000 32.327273 23.603971 \n", + "10 9 17 28.716471 4.186471 33.307647 \n", + "11 44 10 22.116000 10.184000 55.267000 \n", + "12 21 16 17.386613 1.349300 66.530000 \n", + "13 0 8 9.787500 0.573750 74.257500 \n", + "14 23 34 3.931765 1.443529 91.808529 \n", + "\n", + " gdp coastline population_density area_size \\\n", + "0 25830.000000 15.623500 203.535000 6.146537e+05 \n", + "1 21335.714286 9.730714 296.542857 2.993842e+05 \n", + "2 19534.615385 61.310000 2029.976923 1.175833e+05 \n", + "3 15600.000000 13.417778 151.777778 3.426807e+05 \n", + "4 13275.000000 47.305000 268.258333 2.471858e+04 \n", + "5 9562.500000 0.282500 32.443750 4.310791e+06 \n", + "6 7425.000000 4.370000 130.316667 1.547768e+05 \n", + "7 5722.857143 0.438571 4.328571 3.372373e+05 \n", + "8 4800.000000 27.435333 537.733333 4.220207e+04 \n", + "9 4009.090909 120.770909 142.990909 3.157355e+04 \n", + "10 3500.000000 1.430588 187.729412 3.664089e+05 \n", + "11 2100.000000 2.702000 180.400000 2.983568e+05 \n", + "12 1881.250000 4.710625 121.756250 2.988618e+05 \n", + "13 1850.000000 0.598750 58.975000 3.605852e+05 \n", + "14 1435.294118 4.170882 37.926471 6.419174e+05 \n", + "\n", + " example_members \n", + "0 [Aruba, Austria, Bermuda] \n", + "1 [Argentina, Belgium, Estonia] \n", + "2 [Andorra, Anguilla, Bahamas, The] \n", + "3 [British Virgin Is., Greece, Guadeloupe] \n", + "4 [Antigua & Barbuda, Barbados, Croatia] \n", + "5 [Algeria, Australia, Brazil] \n", + "6 [Belarus, Bulgaria, Cuba] \n", + "7 
[Belize, Botswana, Gabon] \n", + "8 [American Samoa, Cook Islands, Dominican Repub... \n", + "9 [Dominica, Grenada, Kiribati] \n", + "10 [Albania, Armenia, Bangladesh] \n", + "11 [Burundi, Comoros, Ecuador] \n", + "12 [Azerbaijan, Benin, Burma] \n", + "13 [Burkina Faso, East Timor, Ethiopia] \n", + "14 [Afghanistan, Angola, Bhutan] " ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_level_communities = \"\"\"\n", + "\n", + "MATCH (c:Country)\n", + "RETURN c.louvain[0] as community,\n", + " count(*) as community_size,\n", + " avg(c['Arable (%)']) as pct_arable,\n", + " avg(c['Crops (%)']) as pct_crops, \n", + " avg(c['Infant mortality (per 1000 births)']) as infant_mortality,\n", + " avg(c['GDP ($ per capita)']) as gdp,\n", + " avg(c['Coastline (coast/area ratio)']) as coastline,\n", + " avg(c['Pop. Density (per sq. mi.)']) as population_density,\n", + " avg(c['Area (sq. mi.)']) as area_size,\n", + " collect(c['name'])[..3] as example_members\n", + "ORDER BY gdp DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(first_level_communities)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S20aUzOJ25jO" + }, + "source": [ + "As expected, there are almost twice as many communities on the first level compared to the second and final level. An exciting community formed in second place by the average GDP. It contains only five countries, which are quite tiny as their average area size is only 364 square miles. On the other hand, they have a very high population density of around 10000 people per square mile. Example members are Macau, Monaco, and Hong Kong. Note that this description reflects the run from the original blog post; the table above comes from a later run, so its community sizes and example members differ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "aI450Ksi25jO", + "outputId": "94299495-50cf-47a4-9ccb-885c3c687061" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "id": "aI450Ksi25jO", - "outputId": "94299495-50cf-47a4-9ccb-885c3c687061", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " 'dropped graph: ' + graphName\n", - "0 dropped graph: 30\n", - "1 dropped graph: 50\n", - "2 dropped graph: 23\n", - "3 dropped graph: 21\n", - "4 dropped graph: 43\n", - "5 dropped graph: 54\n", - "6 dropped graph: 57\n", - "7 dropped graph: 12" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
'dropped graph: ' + graphName
0dropped graph: 30
1dropped graph: 50
2dropped graph: 23
3dropped graph: 21
4dropped graph: 43
5dropped graph: 54
6dropped graph: 57
7dropped graph: 12
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 38 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0dropped graph: 57
1dropped graph: 23
2dropped graph: 12
3dropped graph: 43
4dropped graph: 54
5dropped graph: 21
6dropped graph: 30
7dropped graph: 50
8dropped graph: countries
\n", + "
" ], - "source": [ - "drop_all_graphs = \"\"\"\n", - "CALL gds.graph.list() YIELD graphName\n", - "CALL gds.graph.drop(graphName) YIELD graphName as t\n", - "RETURN 'dropped graph: ' + graphName AS result\n", - "\"\"\"\n", - "\n", - "read_query(drop_all_graphs)" + "text/plain": [ + " result\n", + "0 dropped graph: 57\n", + "1 dropped graph: 23\n", + "2 dropped graph: 12\n", + "3 dropped graph: 43\n", + "4 dropped graph: 54\n", + "5 dropped graph: 21\n", + "6 dropped graph: 30\n", + "7 dropped graph: 50\n", + "8 dropped graph: countries" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rNRoILif25jP" - }, - "outputs": [], - "source": [ - "" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - }, - "colab": { - "name": "Countries of the world analysis.ipynb", - "provenance": [], - "include_colab_link": true + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "drop_all_graphs = \"\"\"\n", + "CALL gds.graph.list() YIELD graphName\n", + "CALL gds.graph.drop(graphName) YIELD graphName as t\n", + "RETURN 'dropped graph: ' + graphName AS result\n", + "\"\"\"\n", + "\n", + "read_query(drop_all_graphs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rNRoILif25jP" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "name": "Countries of the world analysis.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/GDS_Multigraph/GDS multigraph.ipynb b/GDS_Multigraph/GDS multigraph.ipynb index ec33776..9bde7a6 100644 --- a/GDS_Multigraph/GDS multigraph.ipynb +++ b/GDS_Multigraph/GDS multigraph.ipynb @@ -1,3335 +1,3342 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Countries of the world\n", - "* Updated to GDS 2.0 version\n", - "* Link to original blog post: https://towardsdatascience.com/analyzing-multigraphs-in-neo4j-graph-data-science-library-35c9b6d20099" - ], - "metadata": { - "id": "LLgDODQj__no" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install neo4j" - ], - "metadata": { - "id": "IFUfNyE-AVsB", - "outputId": "819a09b9-f95f-4564-c9d2-c15b2180adfe", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting neo4j\n", - " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", - "\u001b[?25l\r\u001b[K |███▋ | 10 kB 19.7 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 11.7 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 9.4 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 8.6 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 4.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 5.3 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 5.4 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 6.0 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from 
neo4j) (2018.9)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=6c51df58ee1c80464d849e1d3c9a4220ade9ca8512ba25124abaa9900d8f2143\n", - " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j\n", - "Successfully installed neo4j-4.4.2\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "I recommend you setup a [blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" - ], - "metadata": { - "id": "MkzxLkbTAZ2V" - } - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "Alhkhjqf_9sQ" - }, - "outputs": [], - "source": [ - "# Define Neo4j connections\n", - "from neo4j import GraphDatabase\n", - "host = 'bolt://3.235.2.228:7687'\n", - "user = 'neo4j'\n", - "password = 'seats-drunks-carbon'\n", - "driver = GraphDatabase.driver(host,auth=(user, password))\n", - "\n", - "def drop_graph(name):\n", - " with driver.session() as session:\n", - " drop_graph_query = \"\"\"\n", - " CALL gds.graph.drop('{}');\n", - " \"\"\".format(name)\n", - " session.run(drop_graph_query)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LLgDODQj__no" + }, + "source": [ + "# Countries of the world\n", + "* Updated to GDS 2.0 version\n", + "* Link to original blog post: https://towardsdatascience.com/analyzing-multigraphs-in-neo4j-graph-data-science-library-35c9b6d20099" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "IFUfNyE-AVsB", + 
"outputId": "819a09b9-f95f-4564-c9d2-c15b2180adfe" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "yvbJf7zF_9sU" - }, - "outputs": [], - "source": [ - "# Import libraries\n", - "import pandas as pd\n", - "\n", - "def read_query(query):\n", - " with driver.session() as session:\n", - " result = session.run(query)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[?25l\r", + "\u001b[K |███▋ | 10 kB 19.7 MB/s eta 0:00:01\r", + "\u001b[K |███████▎ | 20 kB 11.7 MB/s eta 0:00:01\r", + "\u001b[K |███████████ | 30 kB 9.4 MB/s eta 0:00:01\r", + "\u001b[K |██████████████▋ | 40 kB 8.6 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████▎ | 51 kB 4.5 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████████ | 61 kB 5.3 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████▋ | 71 kB 5.4 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████████▎ | 81 kB 6.0 MB/s eta 0:00:01\r", + "\u001b[K |████████████████████████████████| 89 kB 3.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=6c51df58ee1c80464d849e1d3c9a4220ade9ca8512ba25124abaa9900d8f2143\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j\n", + "Successfully installed neo4j-4.4.2\n" + ] + } + ], + "source": [ + "!pip install neo4j" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MkzxLkbTAZ2V" + }, + "source": [ + "I recommend you setup a [blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Alhkhjqf_9sQ" + }, + "outputs": [], + "source": [ + "# Define Neo4j connections\n", + "from neo4j import GraphDatabase\n", + "host = 'bolt://3.235.2.228:7687'\n", + "user = 'neo4j'\n", + "password = 'seats-drunks-carbon'\n", + "driver = GraphDatabase.driver(host,auth=(user, password))\n", + "\n", + "def drop_graph(name):\n", + " with driver.session() as session:\n", + " drop_graph_query = \"\"\"\n", + " CALL gds.graph.drop('{}');\n", + " \"\"\".format(name)\n", + " session.run(drop_graph_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "yvbJf7zF_9sU" + }, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def read_query(query):\n", + " with driver.session() as session:\n", + " result = session.run(query)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "mhNpPk_D_9sU", + "outputId": "c95e1870-7e50-4fb0-d046-16b7df80761f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - 
"metadata": { - "id": "mhNpPk_D_9sU", - "outputId": "c95e1870-7e50-4fb0-d046-16b7df80761f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 4 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "# Import the graph\n", - "\n", - "import_query = \"\"\"\n", - "CREATE (t:Entity{name:'Tomaz'}),\n", - " (n:Entity{name:'Neo4j'})\n", - "CREATE (t)-[:LIKES{weight:1}]->(n),\n", - " (t)-[:LOVES{weight:2}]->(n),\n", - " (t)-[:PRESENTED_FOR{weight:0.5}]->(n),\n", - " (t)-[:PRESENTED_FOR{weight:1.5}]->(n);\n", - "\"\"\"\n", - "read_query(import_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M9KtdcIh_9sV" - }, - "source": [ - "## Relationships without own identity\n", - "\n", - "In the context of the GDS library, relationships without own identity imply that we ignore the type of relationships in the process of projecting the graph.\n", - "\n", - "### Native projection\n", - "\n", - "We will start with native projection examples. If we use the wildcard operator * to define the relationships we want to project, we ignore their type and bundle them all together. This can be understood as losing their own identity (type in the context of Neo4j).\n", - "\n", - "#### Default aggregation strategy\n", - "\n", - "In the first example, we will observe the default behavior of the graph projection process." 
+ "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Import the graph\n", + "\n", + "import_query = \"\"\"\n", + "CREATE (t:Entity{name:'Tomaz'}),\n", + " (n:Entity{name:'Neo4j'})\n", + "CREATE (t)-[:LIKES{weight:1}]->(n),\n", + " (t)-[:LOVES{weight:2}]->(n),\n", + " (t)-[:PRESENTED_FOR{weight:0.5}]->(n),\n", + " (t)-[:PRESENTED_FOR{weight:1.5}]->(n);\n", + "\"\"\"\n", + "read_query(import_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M9KtdcIh_9sV" + }, + "source": [ + "## Relationships without own identity\n", + "\n", + "In the context of the GDS library, relationships without own identity imply that we ignore the type of relationships in the process of projecting the graph.\n", + "\n", + "### Native projection\n", + "\n", + "We will start with native projection examples. If we use the wildcard operator * to define the relationships we want to project, we ignore their type and bundle them all together. This can be understood as losing their own identity (type in the context of Neo4j).\n", + "\n", + "#### Default aggregation strategy\n", + "\n", + "In the first example, we will observe the default behavior of the graph projection process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "lj3scEOR_9sX", + "outputId": "8f3978fe-380c-4181-a475-7fd2d0d4f0ff" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "lj3scEOR_9sX", - "outputId": "8f3978fe-380c-4181-a475-7fd2d0d4f0ff", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'__ALL__': {'orientation': 'NATURAL', 'aggreg... default_agg 2 \n", - "\n", - " relationshipCount projectMillis \n", - "0 4 80 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'__ALL__': {'orientation': 'NATURAL', 'aggreg...default_agg2480
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'__ALL__': {'orientation': 'NATURAL', 'aggreg...default_agg2480
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "default_agg_strategy = \"\"\"\n", - "\n", - "CALL gds.graph.project('default_agg','*','*',\n", - " {relationshipProperties: ['weight']})\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(default_agg_strategy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FwHVohfA_9sY" - }, - "source": [ - "The default aggregation strategy actually doesn't perform any aggregations and projects all the relationships from the stored graph to memory without any transformations. If we check the relationshipCount, we observe that four relationships have been projected. To double-check the projected graph, we can use the degree centrality." + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'__ALL__': {'orientation': 'NATURAL', 'aggreg... default_agg 2 \n", + "\n", + " relationshipCount projectMillis \n", + "0 4 80 " ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_agg_strategy = \"\"\"\n", + "\n", + "CALL gds.graph.project('default_agg','*','*',\n", + " {relationshipProperties: ['weight']})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(default_agg_strategy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FwHVohfA_9sY" + }, + "source": [ + "The default aggregation strategy actually doesn't perform any aggregations and projects all the relationships from the stored graph to memory without any transformations. If we check the relationshipCount, we observe that four relationships have been projected. To double-check the projected graph, we can use the degree centrality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "Kbj1-VQb_9sZ", + "outputId": "7ed9bc0d-54b5-4a63-d1dd-66ab731e2fee" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Kbj1-VQb_9sZ", - "outputId": "7ed9bc0d-54b5-4a63-d1dd-66ab731e2fee", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name degree\n", - "0 Tomaz 4.0\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedegree
0Tomaz4.0
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedegree
0Tomaz4.0
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "default_agg_strategy_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('default_agg')\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name, \n", - " score AS degree\n", - "ORDER BY degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(default_agg_strategy_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hhEtoaIP_9sZ" - }, - "source": [ - "As we expected, all four relationships have been projected. To have a reference for the future let's also calculate the weighted degree centrality. By adding the relationshipWeightProperty parameter, we indicate we want to use the weighted variant of the algorithm." + "text/plain": [ + " name degree\n", + "0 Tomaz 4.0\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_agg_strategy_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('default_agg')\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name, \n", + " score AS degree\n", + "ORDER BY degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(default_agg_strategy_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hhEtoaIP_9sZ" + }, + "source": [ + "As we expected, all four relationships have been projected. To have a reference for the future let's also calculate the weighted degree centrality. By adding the relationshipWeightProperty parameter, we indicate we want to use the weighted variant of the algorithm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "P68OfMz6_9sa", + "outputId": "bc0bce25-4a2e-4405-a271-6b923d1941b4" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "P68OfMz6_9sa", - "outputId": "bc0bce25-4a2e-4405-a271-6b923d1941b4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name weighted_degree\n", - "0 Tomaz 5.0\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameweighted_degree
0Tomaz5.0
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 7 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameweighted_degree
0Tomaz5.0
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "default_agg_strategy_weight_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('default_agg', \n", - " {relationshipWeightProperty:'weight'})\n", - "YIELD nodeId, score \n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS weighted_degree ORDER BY weighted_degree DESC\n", - "\"\"\"\n", - "\n", - "read_query(default_agg_strategy_weight_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ExWNKGBd_9sb" - }, - "source": [ - "The result is the sum of weights of all the considered relationships. We have no use of this projected graph anymore, so remember to release it from memory." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "W4mS6CEy_9sb" - }, - "outputs": [], - "source": [ - "drop_graph('default_agg')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g7hJLJFb_9sb" - }, - "source": [ - "#### Single-graph strategy\n", - "\n", - "Depending on the use case, we might want to reduce our multigraph to a single graph during the projection process. This can be easily achieved with the aggregation parameter. We have to use the configuration map variant for the relationship definition." + "text/plain": [ + " name weighted_degree\n", + "0 Tomaz 5.0\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_agg_strategy_weight_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('default_agg', \n", + " {relationshipWeightProperty:'weight'})\n", + "YIELD nodeId, score \n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS weighted_degree ORDER BY weighted_degree DESC\n", + "\"\"\"\n", + "\n", + "read_query(default_agg_strategy_weight_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ExWNKGBd_9sb" + }, + "source": [ + "The result is the sum of weights of all the considered relationships. 
We have no use of this projected graph anymore, so remember to release it from memory." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "W4mS6CEy_9sb" + }, + "outputs": [], + "source": [ + "drop_graph('default_agg')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g7hJLJFb_9sb" + }, + "source": [ + "#### Single-graph strategy\n", + "\n", + "Depending on the use case, we might want to reduce our multigraph to a single graph during the projection process. This can be easily achieved with the aggregation parameter. We have to use the configuration map variant for the relationship definition." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "NmUnMJLG_9sc", + "outputId": "535d53c3-fe5f-4765-f5cd-1526119a7c1d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "NmUnMJLG_9sc", - "outputId": "535d53c3-fe5f-4765-f5cd-1526119a7c1d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName \\\n", - "0 {'TYPE': {'orientation': 'NATURAL', 'aggregati... single_rel_strategy \n", - "\n", - " nodeCount relationshipCount projectMillis \n", - "0 2 1 93 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'TYPE': {'orientation': 'NATURAL', 'aggregati...single_rel_strategy2193
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 9 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'TYPE': {'orientation': 'NATURAL', 'aggregati...single_rel_strategy2193
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "single_rel_graph = \"\"\"\n", - "CALL gds.graph.project('single_rel_strategy','*', \n", - " {TYPE:{type:'*', aggregation:'SINGLE'}})\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(single_rel_graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zKpEMOIu_9sc" - }, - "source": [ - "We notice by looking at the relationshipCount, that only a single relationship has been projected. If we want to double-check with the degree centrality:" + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName \\\n", + "0 {'TYPE': {'orientation': 'NATURAL', 'aggregati... single_rel_strategy \n", + "\n", + " nodeCount relationshipCount projectMillis \n", + "0 2 1 93 " ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "single_rel_graph = \"\"\"\n", + "CALL gds.graph.project('single_rel_strategy','*', \n", + " {TYPE:{type:'*', aggregation:'SINGLE'}})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(single_rel_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zKpEMOIu_9sc" + }, + "source": [ + "We notice by looking at the relationshipCount, that only a single relationship has been projected. If we want to double-check with the degree centrality:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "ltHqaNHA_9sd", + "outputId": "6f88b61f-3fad-4107-c1c9-1aa0d3ddb495" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "ltHqaNHA_9sd", - "outputId": "6f88b61f-3fad-4107-c1c9-1aa0d3ddb495", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name degree\n", - "0 Tomaz 1.0\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedegree
0Tomaz1.0
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 10 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedegree
0Tomaz1.0
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "single_rel_graph_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('single_rel_strategy')\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS degree\n", - "ORDER BY degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(single_rel_graph_check)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "vhwpHLbC_9sd" - }, - "outputs": [], - "source": [ - "drop_graph('single_rel_strategy')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "akCjEafL_9sd" - }, - "source": [ - "#### Property aggregation strategies\n", - "\n", - "We have looked at the unweighted multigraph so far. Now it is time to look at what happens when we are dealing with a weighted multigraph and we want to reduce it to a single graph. There are three different strategies we can pick for property aggregations:\n", - "\n", - "* MIN: minimum value of all weights is projected\n", - "* MAX: maximum value of all weights is projected\n", - "* SUM: the sum of all weights is projected\n", - "\n", - "In our next example, we will use the MIN property aggregation strategy to reduce a weighted multigraph to a single graph. By providing the property aggregation parameter, we indicate we want to reduce the stored graph to a single graph in the projection process." 
+ "text/plain": [ + " name degree\n", + "0 Tomaz 1.0\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "single_rel_graph_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('single_rel_strategy')\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS degree\n", + "ORDER BY degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(single_rel_graph_check)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "vhwpHLbC_9sd" + }, + "outputs": [], + "source": [ + "drop_graph('single_rel_strategy')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "akCjEafL_9sd" + }, + "source": [ + "#### Property aggregation strategies\n", + "\n", + "We have looked at the unweighted multigraph so far. Now it is time to look at what happens when we are dealing with a weighted multigraph and we want to reduce it to a single graph. There are three different strategies we can pick for property aggregations:\n", + "\n", + "* MIN: minimum value of all weights is projected\n", + "* MAX: maximum value of all weights is projected\n", + "* SUM: the sum of all weights is projected\n", + "\n", + "In our next example, we will use the MIN property aggregation strategy to reduce a weighted multigraph to a single graph. By providing the property aggregation parameter, we indicate we want to reduce the stored graph to a single graph in the projection process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "I_u1X2ar_9sd", + "outputId": "962da3f6-68d8-41b5-91f9-52d7d6c2895c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "I_u1X2ar_9sd", - "outputId": "962da3f6-68d8-41b5-91f9-52d7d6c2895c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName \\\n", - "0 {'__ALL__': {'orientation': 'NATURAL', 'aggreg... min_aggregation \n", - "\n", - " nodeCount relationshipCount projectMillis \n", - "0 2 1 16 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'__ALL__': {'orientation': 'NATURAL', 'aggreg...min_aggregation2116
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 12 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'__ALL__': {'orientation': 'NATURAL', 'aggreg...min_aggregation2116
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "min_agg_strategy = \"\"\"\n", - "\n", - "CALL gds.graph.project('min_aggregation','*','*',\n", - " {relationshipProperties: {weight: {property: 'weight', aggregation: 'MIN'}}})\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(min_agg_strategy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oqbgaP9D_9se" - }, - "source": [ - "We can observe that the relationshipCount is 1, which means our multigraph has been successfully reduced to a single graph. To validate the MIN property aggregation, let's also calculate the weighted degree centrality." + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName \\\n", + "0 {'__ALL__': {'orientation': 'NATURAL', 'aggreg... min_aggregation \n", + "\n", + " nodeCount relationshipCount projectMillis \n", + "0 2 1 16 " ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min_agg_strategy = \"\"\"\n", + "\n", + "CALL gds.graph.project('min_aggregation','*','*',\n", + " {relationshipProperties: {weight: {property: 'weight', aggregation: 'MIN'}}})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(min_agg_strategy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oqbgaP9D_9se" + }, + "source": [ + "We can observe that the relationshipCount is 1, which means our multigraph has been successfully reduced to a single graph. To validate the MIN property aggregation, let's also calculate the weighted degree centrality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "WbePfg2Y_9se", + "outputId": "9285a320-9c2d-4dd9-849d-53e097f3d856" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "WbePfg2Y_9se", - "outputId": "9285a320-9c2d-4dd9-849d-53e097f3d856", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name weighted_degree\n", - "0 Tomaz 0.5\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameweighted_degree
0Tomaz0.5
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 13 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameweighted_degree
0Tomaz0.5
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "min_agg_strategy_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('min_aggregation', \n", - " {relationshipWeightProperty:'weight'})\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name, \n", - " score AS weighted_degree\n", - "ORDER BY weighted_degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(min_agg_strategy_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qk5MNNGf_9se" - }, - "source": [ - "As we expected with the MIN property aggregation strategy, the reduced single weight was the smallest one. Again, as we finished with the example,  don't forget to drop the projected graph." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "_gP6qZnd_9sf" - }, - "outputs": [], - "source": [ - "drop_graph('min_aggregation')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7lFgjXVq_9sf" - }, - "source": [ - "### Cypher projection\n", - "\n", - "Let's recreate the above examples with cypher projection. To lose the identity of the relationships and bundle them all together, we avoid providing the type column in the return of the relationship statement.\n", - "\n", - "#### Default aggregation strategy\n", - "\n", - "Similarly to native projection, the default setting in cypher projection is to project all the relationships without any transformation during the projection process." 
+ "text/plain": [ + " name weighted_degree\n", + "0 Tomaz 0.5\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min_agg_strategy_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('min_aggregation', \n", + " {relationshipWeightProperty:'weight'})\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name, \n", + " score AS weighted_degree\n", + "ORDER BY weighted_degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(min_agg_strategy_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qk5MNNGf_9se" + }, + "source": [ + "As we expected with the MIN property aggregation strategy, the reduced single weight was the smallest one. Again, now that we have finished with the example, don't forget to drop the projected graph." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "_gP6qZnd_9sf" + }, + "outputs": [], + "source": [ + "drop_graph('min_aggregation')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7lFgjXVq_9sf" + }, + "source": [ + "### Cypher projection\n", + "\n", + "Let's recreate the above examples with cypher projection. To lose the identity of the relationships and bundle them all together, we avoid providing the type column in the return of the relationship statement.\n", + "\n", + "#### Default aggregation strategy\n", + "\n", + "Similarly to native projection, the default setting in cypher projection is to project all the relationships without any transformation during the projection process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "hz0qkwTb_9sf", + "outputId": "78242376-a9ba-46b3-fd83-61c6e9de5339" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "hz0qkwTb_9sf", - "outputId": "78242376-a9ba-46b3-fd83-61c6e9de5339", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeQuery \\\n", - "0 MATCH (n:Entity) RETURN id(n) AS id \n", - "\n", - " relationshipQuery graphName \\\n", - "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... cypher_default_strategy \n", - "\n", - " nodeCount relationshipCount projectMillis \n", - "0 2 4 74 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_default_strategy2474
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 15 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_default_strategy2474
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "cypher_default_agg = \"\"\"\n", - "\n", - "CALL gds.graph.project.cypher('cypher_default_strategy', \n", - " 'MATCH (n:Entity) RETURN id(n) AS id', \n", - " 'MATCH (n:Entity)-[r]->(m:Entity)\n", - " RETURN id(n) AS source, id(m) AS target')\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(cypher_default_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOL1ttJ__9sf" - }, - "source": [ - "By looking at the relationshipCount, we observe that all four relationships have been projected as intended.To verify the projected graph, we run the degree centrality." + "text/plain": [ + " nodeQuery \\\n", + "0 MATCH (n:Entity) RETURN id(n) AS id \n", + "\n", + " relationshipQuery graphName \\\n", + "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... cypher_default_strategy \n", + "\n", + " nodeCount relationshipCount projectMillis \n", + "0 2 4 74 " ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cypher_default_agg = \"\"\"\n", + "\n", + "CALL gds.graph.project.cypher('cypher_default_strategy', \n", + " 'MATCH (n:Entity) RETURN id(n) AS id', \n", + " 'MATCH (n:Entity)-[r]->(m:Entity)\n", + " RETURN id(n) AS source, id(m) AS target')\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(cypher_default_agg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOL1ttJ__9sf" + }, + "source": [ + "By looking at the relationshipCount, we observe that all four relationships have been projected as intended. To verify the projected graph, we run the degree centrality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "Nx5mA6HB_9sg", + "outputId": "50f7e92a-30f7-4606-9045-cff5b00371ce" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "Nx5mA6HB_9sg", - "outputId": "50f7e92a-30f7-4606-9045-cff5b00371ce", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name degree\n", - "0 Tomaz 4.0\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedegree
0Tomaz4.0
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 16 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedegree
0Tomaz4.0
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "cypher_default_agg_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('cypher_default_strategy')\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS degree\n", - "ORDER BY degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(cypher_default_agg_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H9O2QwOh_9sg" - }, - "source": [ - "#### Single relationship strategy\n", - "\n", - "With cypher projection, we don't have access to relationship level aggregation strategies. This is no problem at all as it is very easy to reduce the multigraph to a single graph using only the cypher query language. We simply add the DISTINCT clause in the return of the relationship statement and it should be good to go." + "text/plain": [ + " name degree\n", + "0 Tomaz 4.0\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cypher_default_agg_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('cypher_default_strategy')\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS degree\n", + "ORDER BY degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(cypher_default_agg_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H9O2QwOh_9sg" + }, + "source": [ + "#### Single relationship strategy\n", + "\n", + "With cypher projection, we don't have access to relationship level aggregation strategies. This is no problem at all as it is very easy to reduce the multigraph to a single graph using only the cypher query language. We simply add the DISTINCT clause in the return of the relationship statement and it should be good to go." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "MwojiBmC_9sg", + "outputId": "3e0d5e51-3337-401e-d6ef-4e0c22b1f1ab" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "MwojiBmC_9sg", - "outputId": "3e0d5e51-3337-401e-d6ef-4e0c22b1f1ab", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeQuery \\\n", - "0 MATCH (n:Entity) RETURN id(n) AS id \n", - "\n", - " relationshipQuery graphName \\\n", - "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... cypher_single_strategy \n", - "\n", - " nodeCount relationshipCount projectMillis \n", - "0 2 1 11 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_single_strategy2111
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 17 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_single_strategy2111
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "cypher_single_agg = \"\"\"\n", - "\n", - "CALL gds.graph.project.cypher('cypher_single_strategy',\n", - " 'MATCH (n:Entity) RETURN id(n) AS id',\n", - " 'MATCH (n:Entity)-[r]->(m:Entity)\n", - " RETURN DISTINCT id(n) AS source, id(m) AS target' )\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(cypher_single_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueFYFcaD_9sg" - }, - "source": [ - "The relationship count is one, which means we have successfully reduced the multigraph. Remember to drop the projected graph." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "RFwJ-_wK_9sg" - }, - "outputs": [], - "source": [ - "drop_graph('cypher_single_strategy')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5uc5pkhm_9sh" - }, - "source": [ - "#### Property aggregation strategies\n", - "\n", - "On the other hand, with cypher projection, we do have access to property level aggregation strategies. We don't really \"need\" them as we can accomplish all the transformation using only cypher. To show you what I mean by that, we can apply the minimum property strategy aggregation using plain cypher like:" + "text/plain": [ + " nodeQuery \\\n", + "0 MATCH (n:Entity) RETURN id(n) AS id \n", + "\n", + " relationshipQuery graphName \\\n", + "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... 
cypher_single_strategy \n", + "\n", + " nodeCount relationshipCount projectMillis \n", + "0 2 1 11 " ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cypher_single_agg = \"\"\"\n", + "\n", + "CALL gds.graph.project.cypher('cypher_single_strategy',\n", + " 'MATCH (n:Entity) RETURN id(n) AS id',\n", + " 'MATCH (n:Entity)-[r]->(m:Entity)\n", + " RETURN DISTINCT id(n) AS source, id(m) AS target' )\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(cypher_single_agg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ueFYFcaD_9sg" + }, + "source": [ + "The relationship count is one, which means we have successfully reduced the multigraph. Remember to drop the projected graph." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "RFwJ-_wK_9sg" + }, + "outputs": [], + "source": [ + "drop_graph('cypher_single_strategy')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5uc5pkhm_9sh" + }, + "source": [ + "#### Property aggregation strategies\n", + "\n", + "On the other hand, with cypher projection, we do have access to property level aggregation strategies. We don't really \"need\" them as we can accomplish all the transformation using only cypher. 
To show you what I mean by that, we can apply the minimum property strategy aggregation using plain cypher like:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "wBSLIs1G_9sh", + "outputId": "a3147c91-3cef-49ea-8285-86f232bd01be" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "wBSLIs1G_9sh", - "outputId": "a3147c91-3cef-49ea-8285-86f232bd01be", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeQuery \\\n", - "0 MATCH (n:Entity) RETURN id(n) AS id \n", - "\n", - " relationshipQuery graphName \\\n", - "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... cypher_min_strategy \n", - "\n", - " nodeCount relationshipCount projectMillis \n", - "0 2 1 66 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_min_strategy2166
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 19 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Entity) RETURN id(n) AS idMATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ...cypher_min_strategy2166
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "cypher_min_agg = \"\"\"\n", - "\n", - "CALL gds.graph.project.cypher('cypher_min_strategy', \n", - " 'MATCH (n:Entity) RETURN id(n) AS id', \n", - " 'MATCH (n:Entity)-[r]->(m:Entity)\n", - " RETURN id(n) AS source, id(m) AS target, min(r.weight) as weight' )\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(cypher_min_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MJljp0hp_9si" - }, - "source": [ - "The relationshipCount is 1, which confirms our successful multigraph reduction. Just to make sure, we can run the weighted centrality and validate results." + "text/plain": [ + " nodeQuery \\\n", + "0 MATCH (n:Entity) RETURN id(n) AS id \n", + "\n", + " relationshipQuery graphName \\\n", + "0 MATCH (n:Entity)-[r]->(m:Entity)\\n RETURN ... cypher_min_strategy \n", + "\n", + " nodeCount relationshipCount projectMillis \n", + "0 2 1 66 " ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cypher_min_agg = \"\"\"\n", + "\n", + "CALL gds.graph.project.cypher('cypher_min_strategy', \n", + " 'MATCH (n:Entity) RETURN id(n) AS id', \n", + " 'MATCH (n:Entity)-[r]->(m:Entity)\n", + " RETURN id(n) AS source, id(m) AS target, min(r.weight) as weight' )\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(cypher_min_agg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MJljp0hp_9si" + }, + "source": [ + "The relationshipCount is 1, which confirms our successful multigraph reduction. Just to make sure, we can run the weighted centrality and validate results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "MJ6ufdjs_9si", + "outputId": "44bfb384-be37-486e-92ef-f59db29c544d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "MJ6ufdjs_9si", - "outputId": "44bfb384-be37-486e-92ef-f59db29c544d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name weighted_degree\n", - "0 Tomaz 0.5\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameweighted_degree
0Tomaz0.5
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 23 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameweighted_degree
0Tomaz0.5
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "cypher_min_agg_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('cypher_min_strategy',\n", - " {relationshipWeightProperty:'weight'})\n", - "YIELD nodeId, score \n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS weighted_degree\n", - "ORDER BY weighted_degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(cypher_min_agg_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yjuGG1yE_9si" - }, - "source": [ - "With everything in order, we can release both projected graphs from memory." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "QYpV1JVN_9si" - }, - "outputs": [], - "source": [ - "drop_graph('cypher_min_strategy')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "quzNqWxS_9si" - }, - "source": [ - "## Relationships with own identity\n", - "\n", - "We also have the option to retain the type of relationships during the projection process. Among other things, this allows us to perform additional filtering when executing graph algorithms. However, we have to be careful, as projecting relationships with a preserved type is a bit different in the context of multigraphs.\n", - "\n", - "### Native projection\n", - "\n", - "It is simple to declare that we want to preserve the type of relationships with the native projection. All we have to do is specify which relationship types we want to consider and the GDS engine will automatically bundle relationships under the specific relationship type. Let's take a look at some examples to gain a better understanding.\n", - "\n", - "#### Default aggregation strategy\n", - "\n", - "From previous examples we already know that the default aggregation strategy does not perform any transformations. By defining the relationship types we indicate to the GDS library we want to retain their type after the projection process." 
+ "text/plain": [ + " name weighted_degree\n", + "0 Tomaz 0.5\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cypher_min_agg_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('cypher_min_strategy',\n", + " {relationshipWeightProperty:'weight'})\n", + "YIELD nodeId, score \n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS weighted_degree\n", + "ORDER BY weighted_degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(cypher_min_agg_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yjuGG1yE_9si" + }, + "source": [ + "With everything in order, we can release both projected graphs from memory." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "QYpV1JVN_9si" + }, + "outputs": [], + "source": [ + "drop_graph('cypher_min_strategy')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "quzNqWxS_9si" + }, + "source": [ + "## Relationships with own identity\n", + "\n", + "We also have the option to retain the type of relationships during the projection process. Among other things, this allows us to perform additional filtering when executing graph algorithms. However, we have to be careful, as projecting relationships with a preserved type is a bit different in the context of multigraphs.\n", + "\n", + "### Native projection\n", + "\n", + "It is simple to declare that we want to preserve the type of relationships with the native projection. All we have to do is specify which relationship types we want to consider and the GDS engine will automatically bundle relationships under the specific relationship type. Let's take a look at some examples to gain a better understanding.\n", + "\n", + "#### Default aggregation strategy\n", + "\n", + "From previous examples we already know that the default aggregation strategy does not perform any transformations. 
By defining the relationship types we indicate to the GDS library we want to retain their type after the projection process." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "NR5j1_8b_9sj", + "outputId": "928299fd-fa9b-4a82-e1ad-39a81bfe17d8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "id": "NR5j1_8b_9sj", - "outputId": "928299fd-fa9b-4a82-e1ad-39a81bfe17d8", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_default 2 \n", - "\n", - " relationshipCount projectMillis \n", - "0 4 69 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_default2469
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 25 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_default2469
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "default_type = \"\"\"\n", - "\n", - "CALL gds.graph.project('type_default','*',\n", - " ['PRESENTED_FOR','LIKES','LOVES'])\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(default_type)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GpZed9nB_9sj" - }, - "source": [ - "As expected, the relationshipsCount is 4." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "k-xNnrcS_9sj" - }, - "outputs": [], - "source": [ - "drop_graph('type_default')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HtlqhAnt_9sj" - }, - "source": [ - "#### Single relationship strategy\n", - "\n", - "Like before, we can reduce our unweighted multigraph to a single graph with the relationship level aggregation parameter. We have to provide the aggregation parameter for each relationship type separately." + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_default 2 \n", + "\n", + " relationshipCount projectMillis \n", + "0 4 69 " ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "default_type = \"\"\"\n", + "\n", + "CALL gds.graph.project('type_default','*',\n", + " ['PRESENTED_FOR','LIKES','LOVES'])\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(default_type)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GpZed9nB_9sj" + }, + "source": [ + "As expected, the relationshipCount is 4." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "k-xNnrcS_9sj" + }, + "outputs": [], + "source": [ + "drop_graph('type_default')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HtlqhAnt_9sj" + }, + "source": [ + "#### Single relationship strategy\n", + "\n", + "Like before, we can reduce our unweighted multigraph to a single graph with the relationship level aggregation parameter. We have to provide the aggregation parameter for each relationship type separately." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "EOQJYnXo_9sj", + "outputId": "d0bed529-6f63-403d-9e5d-cbbf565eab3a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "id": "EOQJYnXo_9sj", - "outputId": "d0bed529-6f63-403d-9e5d-cbbf565eab3a", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_single 2 \n", - "\n", - " relationshipCount projectMillis \n", - "0 3 75 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_single2375
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 27 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_single2375
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "type_single_agg = \"\"\"\n", - "\n", - "CALL gds.graph.project('type_single','*',\n", - " {LIKES:{type:'LIKES',aggregation:'SINGLE'},\n", - " LOVES:{type:'LOVES',aggregation:'SINGLE'},\n", - " PRESENTED_FOR:{type:'PRESENTED_FOR',aggregation:'SINGLE'}})\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(type_single_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bM8ZNUR6_9sk" - }, - "source": [ - "Ok, so we reduced to a single graph, but the relationshipCount is 3. Why is it so? The multigraph reduction process works on the relationship type level and because we have three relationship types, a single relationship for each type has been projected. Let's calculate the degree centrality on the whole in-memory graph." + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_single 2 \n", + "\n", + " relationshipCount projectMillis \n", + "0 3 75 " ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_single_agg = \"\"\"\n", + "\n", + "CALL gds.graph.project('type_single','*',\n", + " {LIKES:{type:'LIKES',aggregation:'SINGLE'},\n", + " LOVES:{type:'LOVES',aggregation:'SINGLE'},\n", + " PRESENTED_FOR:{type:'PRESENTED_FOR',aggregation:'SINGLE'}})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(type_single_agg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bM8ZNUR6_9sk" + }, + "source": [ + "Ok, so we reduced to a single graph, but the relationshipCount is 3. Why is it so? The multigraph reduction process works on the relationship type level and because we have three relationship types, a single relationship for each type has been projected. Let's calculate the degree centrality on the whole in-memory graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "1bNhlwcX_9sk", + "outputId": "329ca35c-6238-438a-d834-5ee3db070a13" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "id": "1bNhlwcX_9sk", - "outputId": "329ca35c-6238-438a-d834-5ee3db070a13", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name degree\n", - "0 Tomaz 3.0\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedegree
0Tomaz3.0
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 28 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedegree
0Tomaz3.0
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "type_single_agg_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('type_single')\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS degree\n", - "ORDER BY degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(type_single_agg_check)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ue0b1p22_9sk" - }, - "source": [ - "As we explained, even though we have reduced each relationship type separately, we are still dealing with a multigraph on the whole. When running graph algorithms, you have to pay close attention to whether you are dealing with multigraph or not, have you projected multiple relationship types or just a single one and have you performed any transformations, as all of this will affect the algorithm results. We can now drop this graph." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "id": "35u12SFQ_9sk" - }, - "outputs": [], - "source": [ - "drop_graph('type_single')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2fiQSNz2_9sk" - }, - "source": [ - "#### Property aggregation strategies\n", - "\n", - "Property aggregation strategies are very similar to before when we were dealing with relationships without identity. The only change is that now the aggregations are grouped by the relationship type." 
+ "text/plain": [ + " name degree\n", + "0 Tomaz 3.0\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_single_agg_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('type_single')\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS degree\n", + "ORDER BY degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(type_single_agg_check)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ue0b1p22_9sk" + }, + "source": [ + "As we explained, even though we have reduced each relationship type separately, we are still dealing with a multigraph on the whole. When running graph algorithms, you have to pay close attention to whether you are dealing with multigraph or not, have you projected multiple relationship types or just a single one and have you performed any transformations, as all of this will affect the algorithm results. We can now drop this graph." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "35u12SFQ_9sk" + }, + "outputs": [], + "source": [ + "drop_graph('type_single')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2fiQSNz2_9sk" + }, + "source": [ + "#### Property aggregation strategies\n", + "\n", + "Property aggregation strategies are very similar to before when we were dealing with relationships without identity. The only change is that now the aggregations are grouped by the relationship type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "FeFVkJZU_9sl", + "outputId": "02951c4e-7992-4808-a5fd-90330c9b50eb" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "id": "FeFVkJZU_9sl", - "outputId": "02951c4e-7992-4808-a5fd-90330c9b50eb", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_min 2 \n", - "\n", - " relationshipCount projectMillis \n", - "0 3 113 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_min23113
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 31 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'__ALL__': {'label': '*', 'properties': {}}}{'LOVES': {'orientation': 'NATURAL', 'aggregat...type_min23113
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "type_min_agg = \"\"\"\n", - "\n", - "CALL gds.graph.project('type_min','*',\n", - " ['PRESENTED_FOR','LIKES','LOVES'], \n", - " {relationshipProperties: {weight: {property: 'weight',\n", - " aggregation: 'MIN'}}})\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(type_min_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gDsUSvVF_9sl" - }, - "source": [ - "We get 3 relationships projected as we have learned that the aggregations happen on the relationship type level. We will double-check the results with the weighted degree." + "text/plain": [ + " nodeProjection \\\n", + "0 {'__ALL__': {'label': '*', 'properties': {}}} \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'LOVES': {'orientation': 'NATURAL', 'aggregat... type_min 2 \n", + "\n", + " relationshipCount projectMillis \n", + "0 3 113 " ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_min_agg = \"\"\"\n", + "\n", + "CALL gds.graph.project('type_min','*',\n", + " ['PRESENTED_FOR','LIKES','LOVES'], \n", + " {relationshipProperties: {weight: {property: 'weight',\n", + " aggregation: 'MIN'}}})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(type_min_agg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gDsUSvVF_9sl" + }, + "source": [ + "We get 3 relationships projected as we have learned that the aggregations happen on the relationship type level. We will double-check the results with the weighted degree." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "yL9OMDT7_9sl", + "outputId": "91b71455-e1f8-4499-92d3-520359c442ec" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "id": "yL9OMDT7_9sl", - "outputId": "91b71455-e1f8-4499-92d3-520359c442ec", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " name weighted_degree\n", - "0 Tomaz 3.5\n", - "1 Neo4j 0.0" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameweighted_degree
0Tomaz3.5
1Neo4j0.0
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 32 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameweighted_degree
0Tomaz3.5
1Neo4j0.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "type_min_agg_check = \"\"\"\n", - "\n", - "CALL gds.degree.stream('type_min',\n", - " {relationshipWeightProperty:'weight'})\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name AS name,\n", - " score AS weighted_degree\n", - "ORDER BY weighted_degree DESC\n", - "\n", - "\"\"\"\n", - "\n", - "read_query(type_min_agg_check)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "id": "EWOo5l-3_9sl" - }, - "outputs": [], - "source": [ - "drop_graph('type_min')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7f-BZWXn_9sl" - }, - "outputs": [], - "source": [ - "" + "text/plain": [ + " name weighted_degree\n", + "0 Tomaz 3.5\n", + "1 Neo4j 0.0" ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - }, - "colab": { - "name": "GDS multigraph.ipynb", - "provenance": [], - "include_colab_link": true - } + ], + "source": [ + "type_min_agg_check = \"\"\"\n", + "\n", + "CALL gds.degree.stream('type_min',\n", + " {relationshipWeightProperty:'weight'})\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name AS name,\n", + " score AS weighted_degree\n", + "ORDER BY weighted_degree DESC\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(type_min_agg_check)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "EWOo5l-3_9sl" + }, + "outputs": [], + "source": [ + "drop_graph('type_min')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7f-BZWXn_9sl" + }, + "outputs": [], + "source": [] + } + ], + 
"metadata": { + "colab": { + "include_colab_link": true, + "name": "GDS multigraph.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Game_of_thrones_community_iteration/Game of thrones community iteration.ipynb b/Game_of_thrones_community_iteration/Game of thrones community iteration.ipynb index 0fe119c..68d8bc1 100644 --- a/Game_of_thrones_community_iteration/Game of thrones community iteration.ipynb +++ b/Game_of_thrones_community_iteration/Game of thrones community iteration.ipynb @@ -1,709 +1,655 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "* Updated to GDS 2.0 version\n", - "* Link to original blog post: https://towardsdatascience.com/community-detection-through-time-using-seed-property-in-neo4j-on-the-game-of-thrones-dataset-a2e520a6c79f" - ], - "metadata": { - "id": "_2CaCA2vDGbC" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install neo4j" - ], - "metadata": { - "id": "PG1voNyVDIfn", - "outputId": "ba912efc-e226-4e11-b840-a5959c3f7435", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting neo4j\n", - " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", - "\u001b[?25l\r\u001b[K |███▋ | 10 kB 23.3 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 12.3 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 8.8 MB/s eta 0:00:01\r\u001b[K 
|██████████████▋ | 40 kB 3.9 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 3.8 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 4.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 4.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 4.9 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=4b6b3195024550cad7621ab4012c1b3dc9630377b161e72e6b622ee512cff2c6\n", - " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j\n", - "Successfully installed neo4j-4.4.2\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "gOf9wc5_CvHx" - }, - "outputs": [], - "source": [ - "from neo4j import GraphDatabase\n", - "host = 'bolt://3.235.2.228:7687'\n", - "user = 'neo4j'\n", - "password = 'seats-drunks-carbon'\n", - "driver = GraphDatabase.driver(host,auth=(user, password))" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "GZG8ZuroCvH1" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "def run_query(query, params={}):\n", - " with driver.session() as session:\n", - " result = session.run(query, params)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "TfpDkC2aCvH2" - }, - "outputs": [], - "source": [ - "from IPython.display import IFrame, HTML\n", - "import json\n", - "import uuid\n", - "\n", - "\n", - "def generate_vis(host, user, 
password, cypher, labels_json, relationships_json):\n", - " html = \"\"\"\\\n", - "\n", - "\n", - " Neovis.js Simple Example\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \"\"\"\n", - "\n", - " html = html.format(\n", - " host=host,\n", - " user=user,\n", - " password=password,\n", - " cypher=cypher,\n", - " labels = json.dumps(labels_json),\n", - " relationships=json.dumps(relationships_json)\n", - " )\n", - "\n", - " unique_id = str(uuid.uuid4())\n", - " filename = \"graph-{}.html\".format(unique_id)\n", - "\n", - " with open(filename, \"w\") as f:\n", - " f.write(html)\n", - " return IFrame(src=filename, width=1000, height=800)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "7PBPz29RCvH3" - }, - "outputs": [], - "source": [ - "def visualize_level(level, community):\n", - " # Define cypher query\n", - " if level > 1:\n", - " cypher = \"\"\"MATCH (p1:Person)-[r:INTERACTS_{rel_level}|:INTERACTS_{prev_level}]-(p2:Person) \\\n", - " WHERE p1.community_{level} = {community} RETURN *\"\"\".format(\n", - " rel_level=level if level != 4 else 45,level=level, prev_level=level -1, community=community)\n", - " else:\n", - " cypher = \"\"\"MATCH (p1:Person)-[r:INTERACTS_{level}]-(p2:Person) \\\n", - " WHERE p1.community_{level} = {community} RETURN *\"\"\".format(level=level, community=community)\n", - " print(cypher)\n", - " # Define relationships_json\n", - " relationships_json = dict()\n", - " for l in [level-1,level]:\n", - " relationships_json[\"INTERACTS_{}\".format(l if l != 4 else 45)] = {\n", - " \"caption\": False\n", - " }\n", - " # Define labels_json \n", - " labels_json = {\n", - " \"Person\": {\n", - " \"caption\": \"id\",\n", - " \"community\": \"community_{}\".format(level)\n", - " }\n", - " }\n", - "\n", - " return generate_vis(host, user, password, cypher, labels_json, relationships_json)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_2CaCA2vDGbC" + }, + "source": [ + "* Updated to GDS 2.3 
version\n", + "* Link to original blog post: https://towardsdatascience.com/community-detection-through-time-using-seed-property-in-neo4j-on-the-game-of-thrones-dataset-a2e520a6c79f" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "PG1voNyVDIfn", + "outputId": "ba912efc-e226-4e11-b840-a5959c3f7435" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "SjP8vjfWCvH4" - }, - "source": [ - "# Import" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: neo4j in /home/tomaz/.local/lib/python3.8/site-packages (4.4.3)\r\n", + "Requirement already satisfied: pytz in /home/tomaz/anaconda3/lib/python3.8/site-packages (from neo4j) (2021.1)\r\n" + ] + } + ], + "source": [ + "!pip install neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "gOf9wc5_CvHx" + }, + "outputs": [], + "source": [ + "from neo4j import GraphDatabase\n", + "host = 'bolt://3.231.25.240:7687'\n", + "user = 'neo4j'\n", + "password = 'hatchets-visitor-axes'\n", + "driver = GraphDatabase.driver(host,auth=(user, password))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "GZG8ZuroCvH1" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "def run_query(query, params={}):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "TfpDkC2aCvH2" + }, + "outputs": [], + "source": [ + "from IPython.display import IFrame, HTML\n", + "import json\n", + "import uuid\n", + "\n", + "\n", + "def generate_vis(host, user, password, cypher, labels_json, relationships_json):\n", + " html = \"\"\"\\\n", + " \n", + " \n", + " Neovis.js Simple Example\n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + "
\n", + " \n", + " \n", + " \"\"\"\n", + "\n", + " html = html.format(\n", + " host=host,\n", + " user=user,\n", + " password=password,\n", + " cypher=cypher,\n", + " labels = json.dumps(labels_json),\n", + " relationships=json.dumps(relationships_json)\n", + " )\n", + "\n", + " unique_id = str(uuid.uuid4())\n", + " filename = \"graph-{}.html\".format(unique_id)\n", + "\n", + " with open(filename, \"w\") as f:\n", + " f.write(html)\n", + " return IFrame(src=filename, width=1000, height=800)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "7PBPz29RCvH3" + }, + "outputs": [], + "source": [ + "def visualize_level(level, community):\n", + " # Define cypher query\n", + " if level > 1:\n", + " cypher = \"\"\"MATCH (p1:Person)-[r:INTERACTS_{rel_level}|INTERACTS_{prev_level}]-(p2:Person) \\\n", + " WHERE p1.community_{level} = {community} RETURN *\"\"\".format(\n", + " rel_level=level if level != 4 else 45,level=level, prev_level=level -1, community=community)\n", + " else:\n", + " cypher = \"\"\"MATCH (p1:Person)-[r:INTERACTS_{level}]-(p2:Person) \\\n", + " WHERE p1.community_{level} = {community} RETURN *\"\"\".format(level=level, community=community)\n", + " print(cypher)\n", + " # Define relationships_json\n", + " relationships_json = dict()\n", + " for l in [level-1,level]:\n", + " relationships_json[\"INTERACTS_{}\".format(l if l != 4 else 45)] = {\n", + " \"caption\": False\n", + " }\n", + " # Define labels_json \n", + " labels_json = {\n", + " \"Person\": {\n", + " \"label\": \"id\",\n", + " \"group\": \"community_{}\".format(level)\n", + " }\n", + " }\n", + "\n", + " return generate_vis(host, user, password, cypher, labels_json, relationships_json)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SjP8vjfWCvH4" + }, + "source": [ + "# Import" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "udyBibdJCvH5", 
+ "outputId": "0cc93822-b5ac-4cc4-e60b-b1689ccfa7bd" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "id": "udyBibdJCvH5", - "outputId": "0cc93822-b5ac-4cc4-e60b-b1689ccfa7bd", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 31 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "constraint_query = \"\"\"CREATE CONSTRAINT IF NOT EXISTS ON (p:Person) ASSERT p.id IS UNIQUE;\"\"\"\n", - "run_query(constraint_query)" + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "DEcCdTlNCvH6" - }, - "outputs": [], - "source": [ - "# https://networkofthrones.wordpress.com/\n", - "import_networks = \"\"\"\n", - "\n", - "UNWIND ['1','2','3','45'] as book\n", - "LOAD CSV WITH HEADERS FROM \n", - "'https://raw.githubusercontent.com/mathbeveridge/asoiaf/master/data/asoiaf-book' + book + '-edges.csv' as value\n", - "MERGE (source:Person{id:value.Source})\n", - "MERGE (target:Person{id:value.Target})\n", - "WITH source,target,value.weight as weight,book\n", - "CALL apoc.merge.relationship(source,'INTERACTS_' + book, {}, {weight:toFloat(weight)}, target) YIELD rel\n", - "RETURN distinct 'done'\n", - "\n", - "\"\"\"\n", - "run_query(import_networks)" - ] - }, + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "constraint_query = \"\"\"CREATE CONSTRAINT IF NOT EXISTS FOR (p:Person) REQUIRE p.id IS UNIQUE;\"\"\"\n", + "run_query(constraint_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "DEcCdTlNCvH6" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "def write_louvain(book):\n", - " project_graph_query = f\"\"\"\n", - " CALL gds.graph.project.cypher('book',\n", - " 'MATCH (p:Person)\n", - " WHERE (p)-[:INTERACTS_{book}]-()\n", - " RETURN id(p) as id',\n", - " 'MATCH (p:Person)-[:INTERACTS_{book}]-(p1:Person)\n", - " RETURN id(p) as source, id(p1) as target')\n", - "\"\"\"\n", - "\n", - " louvain_book = f\"\"\"\n", - " CALL gds.louvain.write('book'\n", - " ,{{writeProperty:'community_{book}'}})\n", - " \"\"\"\n", - "\n", - " drop_graph = \"\"\"\n", - " CALL gds.graph.drop('book')\n", - " \"\"\"\n", - " 
run_query(project_graph_query)\n", - " run_query(louvain_book)\n", - " run_query(drop_graph)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
'done'
0done
\n", + "
" ], - "metadata": { - "id": "g4-CauAHEIIo" - }, - "execution_count": 32, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rtmxB-ytCvH7" - }, - "source": [ - "# Book 1" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "id": "ZTtOdPv-CvH8" - }, - "outputs": [], - "source": [ - "write_louvain(\"1\")" + "text/plain": [ + " 'done'\n", + "0 done" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# https://networkofthrones.wordpress.com/\n", + "import_networks = \"\"\"\n", + "\n", + "UNWIND ['1','2','3','45'] as book\n", + "LOAD CSV WITH HEADERS FROM \n", + "'https://raw.githubusercontent.com/mathbeveridge/asoiaf/master/data/asoiaf-book' + book + '-edges.csv' as value\n", + "MERGE (source:Person{id:value.Source})\n", + "MERGE (target:Person{id:value.Target})\n", + "WITH source,target,value.weight as weight,book\n", + "CALL apoc.merge.relationship(source,'INTERACTS_' + book, {}, {weight:toFloat(weight)}, target) YIELD rel\n", + "RETURN distinct 'done'\n", + "\n", + "\"\"\"\n", + "run_query(import_networks)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "g4-CauAHEIIo" + }, + "outputs": [], + "source": [ + "def write_louvain(book):\n", + " project_graph_query = f\"\"\"\n", + " CALL gds.graph.project.cypher('book',\n", + " 'MATCH (p:Person)\n", + " WHERE (p)-[:INTERACTS_{book}]-()\n", + " RETURN id(p) as id',\n", + " 'MATCH (p:Person)-[:INTERACTS_{book}]-(p1:Person)\n", + " RETURN id(p) as source, id(p1) as target')\n", + "\"\"\"\n", + "\n", + " louvain_book = f\"\"\"\n", + " CALL gds.louvain.write('book'\n", + " ,{{writeProperty:'community_{book}'}})\n", + " \"\"\"\n", + "\n", + " drop_graph = \"\"\"\n", + " CALL gds.graph.drop('book')\n", + " \"\"\"\n", + " run_query(project_graph_query)\n", + " run_query(louvain_book)\n", + " run_query(drop_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"rtmxB-ytCvH7" + }, + "source": [ + "# Book 1" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "ZTtOdPv-CvH8" + }, + "outputs": [], + "source": [ + "write_louvain(\"1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Hr2r-5U1CvH8" + }, + "outputs": [], + "source": [ + "# Get Daenerys' community id \n", + "get_daenerys_community_query = \"\"\"\n", + "MATCH (p:Person{id:'Daenerys-Targaryen'})\n", + "RETURN p.community_1 as community\n", + "\"\"\"\n", + "\n", + "daenerys_community = run_query(get_daenerys_community_query)['community'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 839 }, + "id": "jTi8TtCUCvH9", + "outputId": "2d6073df-7ed1-45e5-be44-2369660652cd" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "Hr2r-5U1CvH8" - }, - "outputs": [], - "source": [ - "# Get Daenerys' community id \n", - "get_daenerys_community_query = \"\"\"\n", - "MATCH (p:Person{id:'Daenerys-Targaryen'})\n", - "RETURN p.community_1 as community\n", - "\"\"\"\n", - "\n", - "daenerys_community = run_query(get_daenerys_community_query)['community'][0]" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "MATCH (p1:Person)-[r:INTERACTS_1]-(p2:Person) WHERE p1.community_1 = 52 RETURN *\n" + ] }, { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "id": "jTi8TtCUCvH9", - "outputId": "2d6073df-7ed1-45e5-be44-2369660652cd", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 839 - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "MATCH (p1:Person)-[r:INTERACTS_1]-(p2:Person) WHERE p1.community_1 = 52 RETURN *\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {}, - "execution_count": 38 - } + 
"data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "visualize_level(level=1,community=daenerys_community)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ye6-xWwdCvH-" - }, - "source": [ - "# Book 2" + "text/plain": [ + "" ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "visualize_level(level=1,community=daenerys_community)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ye6-xWwdCvH-" + }, + "source": [ + "# Book 2" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "SkCJduILCvH-" + }, + "outputs": [], + "source": [ + "write_louvain(\"2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "nTNS2AK1CvH-", + "outputId": "b155bc72-1b22-4609-b915-c34314c21408" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "id": "SkCJduILCvH-" - }, - "outputs": [], - "source": [ - "write_louvain(\"2\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "MATCH (p1:Person)-[r:INTERACTS_2|INTERACTS_1]-(p2:Person) WHERE p1.community_2 = 52 RETURN *\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nTNS2AK1CvH-", - "outputId": "b155bc72-1b22-4609-b915-c34314c21408" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MATCH (p1:Person)-[r:INTERACTS_2|:INTERACTS_1]-(p2:Person) WHERE p1.community_2 = 3 RETURN *\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "visualize_level(level=2,community=daenerys_community)" + "text/plain": [ + "" ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "niEQO5pwCvH_" - }, - "source": [ - "# Book 3" - ] - }, + }, + "execution_count": 
13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "visualize_level(level=2,community=daenerys_community)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "niEQO5pwCvH_" + }, + "source": [ + "# Book 3" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "MIITSfdACvH_" + }, + "outputs": [], + "source": [ + "write_louvain(\"3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "K0GJcsFqCvH_", + "outputId": "36d8e774-d0ec-4a4d-f9f0-898f867c851f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "id": "MIITSfdACvH_" - }, - "outputs": [], - "source": [ - "write_louvain(\"3\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "MATCH (p1:Person)-[r:INTERACTS_3|INTERACTS_2]-(p2:Person) WHERE p1.community_3 = 52 RETURN *\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K0GJcsFqCvH_", - "outputId": "36d8e774-d0ec-4a4d-f9f0-898f867c851f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MATCH (p1:Person)-[r:INTERACTS_3|:INTERACTS_2]-(p2:Person) WHERE p1.community_3 = 3 RETURN *\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "visualize_level(level=3,community=daenerys_community)" + "text/plain": [ + "" ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zCadSySZCvIA" - }, - "source": [ - "# Book 4" - ] - }, + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "visualize_level(level=3,community=daenerys_community)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zCadSySZCvIA" + }, + "source": [ + "# Book 4" + ] + }, + { + "cell_type": "code", + 
"execution_count": 16, + "metadata": { + "id": "B5wN4bd4CvIA" + }, + "outputs": [], + "source": [ + "write_louvain(\"45\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "Tc4FVKVTCvIA", + "outputId": "afc17880-1ac3-4428-83c5-fbe8eb332517" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "id": "B5wN4bd4CvIA" - }, - "outputs": [], - "source": [ - "write_louvain(\"45\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "MATCH (p1:Person)-[r:INTERACTS_45|INTERACTS_3]-(p2:Person) WHERE p1.community_4 = 52 RETURN *\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Tc4FVKVTCvIA", - "outputId": "afc17880-1ac3-4428-83c5-fbe8eb332517" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MATCH (p1:Person)-[r:INTERACTS_45|:INTERACTS_3]-(p2:Person) WHERE p1.community_4 = 3 RETURN *\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "visualize_level(level=4,community=daenerys_community)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uWD1_qV3CvIA" - }, - "outputs": [], - "source": [ - "" + "text/plain": [ + "" ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - }, - "colab": { - "name": "Game of thrones community iteration.ipynb", - "provenance": [], - "include_colab_link": true 
- } + ], + "source": [ + "visualize_level(level=4,community=daenerys_community)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uWD1_qV3CvIA" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "name": "Game of thrones community iteration.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Lord_of_the_wikidata/Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb b/Lord_of_the_wikidata/Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb index ebdcf1c..e5e93be 100644 --- a/Lord_of_the_wikidata/Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb +++ b/Lord_of_the_wikidata/Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb @@ -1,4401 +1,2702 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "* Updated to GDS 2.0 version\n", - "* Link to original blog post: https://towardsdatascience.com/lord-of-the-wiki-ring-importing-wikidata-into-neo4j-and-analyzing-family-trees-da27f64d675e" - ], - "metadata": { - "id": "Hwk0pHemHeWt" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install neo4j" - ], - "metadata": { - "id": "8aH7cn62Hn3g", - "outputId": "8625415d-c9c4-4e3c-85d9-9c3ae3055c43", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 1, - "outputs": [ - { - "output_type": 
"stream", - "name": "stdout", - "text": [ - "Collecting neo4j\n", - " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", - "\u001b[?25l\r\u001b[K |███▋ | 10 kB 24.3 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 14.7 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 10.6 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 9.2 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 4.6 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 5.4 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 5.8 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 5.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.3 MB/s \n", - "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=1263c50dc3a5bf370b9d96525936014a00881f6b44f5d41da992312297966fac\n", - " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j\n", - "Successfully installed neo4j-4.4.2\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "I recommend you setup a [blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" - ], - "metadata": { - "id": "l9PMy6mJHpvZ" - } - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "IDQFrF1OHa4C" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "# Define Neo4j connections\n", - "from neo4j import GraphDatabase\n", - "host = 'bolt://3.235.2.228:7687'\n", - "user = 'neo4j'\n", - "password = 'seats-drunks-carbon'\n", - "driver = GraphDatabase.driver(host,auth=(user, password))" - ] - }, - { - "cell_type": 
"code", - "execution_count": 20, - "metadata": { - "id": "OyQ7nwshHa4G" - }, - "outputs": [], - "source": [ - "# Import libraries\n", - "import pandas as pd\n", - "\n", - "def run_query(query, params={}):\n", - " with driver.session() as session:\n", - " result = session.run(query, params)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ] - }, - { - "cell_type": "code", - "source": [ - "# Fix default timeout query setting in Sandbox\n", - "\n", - "run_query(\"\"\"\n", - "CALL dbms.setConfigValue('dbms.transaction.timeout','0')\n", - "\"\"\")" - ], - "metadata": { - "id": "Yox2IsDmNuD3", - "outputId": "88ff6dba-15f2-4293-a5a3-91eef0c8ef06", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "execution_count": 25, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 25 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g_gvwAYQHa4H" - }, - "source": [ - "## Agenda\n", - "\n", - "* Import Wikipedia data to Neo4j\n", - "* Basic graph exploration\n", - "* Populate missing value\n", - "* Some more graph exploration\n", - "* Weakly connected component\n", - "* Betweenness centrality\n", - "\n", - "We have been using simple graph schemas for quite some time now. I am delighted to say that this time we have a bit more complicated schema. The graph schema revolves around the characters in the LOTR world. A character can be either a relative, father, mother, enemy, spouse, or sibling with another character. This represents a social network of characters with multiple types of relationships. We also have additional information about characters such as their race, country, and language. On top of that, we also know if they are part of any group or have participated in any event.\n", - "\n", - "## WikiData import\n", - "\n", - "As mentioned, we will fetch the data from the WikiData API with the help of the apoc.load.json procedure. If you don't know yet, APOC provides great support for importing data into Neo4j. Besides the ability to fetch data from any REST API, it also features integrations with other databases such as MongoDB or relational databases via the JDBC driver.\n", - "\n", - "P.s. You should check out Neosematics library if you work a lot with RDF data, I only noticed it after I have written the post\n", - "\n", - "We will start by importing all the races in the LOTR world. I have to admit I am a total noob when it comes to SPARQL, so I won't be explaining the syntax in depth. If you need a basic introduction on how to query WikiData, I suggest this tutorial on Youtube. Basically, all the races in the LOTR world are an instance of the Middle-earth races entity with id Q989255. 
To get the instances of a specific entity, we use the following SPARQL clause:\n", - "\n", - "?item wdt:P31 wd:Q989255\n", - "\n", - "This can be translated as \"We would like to fetch an item, which is an instance of (wdt:P31) an entity with an id Q989255\". After we have downloaded the data with APOC, we store the results to Neo4j." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "id": "6t_-UwQ4Ha4J" - }, - "outputs": [], - "source": [ - "import_races_query = \"\"\"\n", - "\n", - "// Prepare a SPARQL query \n", - "WITH 'SELECT ?item ?itemLabel WHERE{ ?item wdt:P31 wd:Q989255 . SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\" }}' AS sparql \n", - "// make a request to Wikidata\n", - "CALL apoc.load.jsonParams('https://query.wikidata.org/sparql?query=' + \n", - " sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null) \n", - "YIELD value \n", - "// Unwind results to row \n", - "UNWIND value['results']['bindings'] as row \n", - "// Prepare data \n", - "WITH row['itemLabel']['value'] as race, \n", - " row['item']['value'] as url, \n", - " split(row['item']['value'],'/')[-1] as id \n", - "// Store to Neo4j \n", - "CREATE (r:Race) SET r.race = race, \n", - " r.url = url, \n", - " r.id = id\n", - "\n", - "\"\"\"\n", - "\n", - "r = run_query(import_races_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SJSd4ON_Ha4L" - }, - "source": [ - "That was easy. The next step is to fetch the characters that are an instance of a given Middle-earth race. The SPARQL syntax is almost identical to the previous query, except this time we iterate over each race and find the characters that are an instance of a given race." 
- ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hwk0pHemHeWt" + }, + "source": [ + "* Updated to GDS 2.0 version\n", + "* Link to original blog post: https://towardsdatascience.com/lord-of-the-wiki-ring-importing-wikidata-into-neo4j-and-analyzing-family-trees-da27f64d675e" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "8aH7cn62Hn3g", + "outputId": "8625415d-c9c4-4e3c-85d9-9c3ae3055c43" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "Q6d0zuc7Ha4L" - }, - "outputs": [], - "source": [ - "import_characters_query = \"\"\"\n", - "\n", - "// Iterate over each race in graph\n", - "MATCH (r:Race)\n", - "// Prepare a SparQL query\n", - "WITH 'SELECT ?item ?itemLabel WHERE { ?item wdt:P31 wd:' + r.id + ' . 
SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\" } }' AS sparql, r \n", - "// make a request to Wikidata \n", - "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", - " sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null)\n", - "YIELD value \n", - "UNWIND value['results']['bindings'] as row \n", - "WITH row['itemLabel']['value'] as name, \n", - " row['item']['value'] as url, \n", - " split(row['item']['value'],'/')[-1] as id, r \n", - "// Store to Neo4j \n", - "CREATE (c:Character) \n", - "SET c.name = name, \n", - " c.url = url, \n", - " c.id = id \n", - "CREATE (c)-[:BELONG_TO]->(r)\n", - "\n", - "\"\"\"\n", - "\n", - "r = run_query(import_characters_query)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[?25l\r", + "\u001b[K |███▋ | 10 kB 24.3 MB/s eta 0:00:01\r", + "\u001b[K |███████▎ | 20 kB 14.7 MB/s eta 0:00:01\r", + "\u001b[K |███████████ | 30 kB 10.6 MB/s eta 0:00:01\r", + "\u001b[K |██████████████▋ | 40 kB 9.2 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████▎ | 51 kB 4.6 MB/s eta 0:00:01\r", + "\u001b[K |██████████████████████ | 61 kB 5.4 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████▋ | 71 kB 5.8 MB/s eta 0:00:01\r", + "\u001b[K |█████████████████████████████▎ | 81 kB 5.7 MB/s eta 0:00:01\r", + "\u001b[K |████████████████████████████████| 89 kB 3.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=1263c50dc3a5bf370b9d96525936014a00881f6b44f5d41da992312297966fac\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j\n", + "Successfully installed neo4j-4.4.2\n" + ] + } + ], + "source": [ + "!pip install neo4j" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l9PMy6mJHpvZ" + }, + "source": [ + "I recommend you setup a [blank project on Neo4j Sandbox environment](https://sandbox.neo4j.com/?usecase=blank-sandbox), but you can also use other environment versions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "IDQFrF1OHa4C" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "# Define Neo4j connections\n", + "from neo4j import GraphDatabase\n", + "host = 'bolt://44.193.28.203:7687'\n", + "user = 'neo4j'\n", + "password = 'combatants-coordinates-tugs'\n", + "driver = GraphDatabase.driver(host,auth=(user, password))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "OyQ7nwshHa4G" + }, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def run_query(query, params={}):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, - { - "cell_type": "markdown", - "metadata": { - "id": "m3vghrJ-Ha4M" - }, - "source": [ - "Did you know that there are at least 700 characters in the Middle-earth world? I would never guess there would be so many documented characters on WikiData. 
Our first exploratory cypher query will be to count the characters by race." - ] + "id": "Yox2IsDmNuD3", + "outputId": "88ff6dba-15f2-4293-a5a3-91eef0c8ef06" + }, + "outputs": [], + "source": [ + "# Fix default timeout query setting in Sandbox\n", + "\n", + "run_query(\"\"\"\n", + "CALL dbms.setConfigValue('dbms.transaction.timeout','0')\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g_gvwAYQHa4H" + }, + "source": [ + "## Agenda\n", + "\n", + "* Import WikiData data to Neo4j\n", + "* Basic graph exploration\n", + "* Populate missing value\n", + "* Some more graph exploration\n", + "* Weakly connected component\n", + "* Betweenness centrality\n", + "\n", + "We have been using simple graph schemas for quite some time now. I am delighted to say that this time we have a bit more complicated schema. The graph schema revolves around the characters in the LOTR world. A character can be either a relative, father, mother, enemy, spouse, or sibling with another character. This represents a social network of characters with multiple types of relationships. We also have additional information about characters such as their race, country, and language. On top of that, we also know if they are part of any group or have participated in any event.\n", + "\n", + "## WikiData import\n", + "\n", + "As mentioned, we will fetch the data from the WikiData API with the help of the apoc.load.jsonParams procedure. If you don't know yet, APOC provides great support for importing data into Neo4j. Besides the ability to fetch data from any REST API, it also features integrations with other databases such as MongoDB or relational databases via the JDBC driver.\n", + "\n", + "P.S. You should check out the Neosemantics library if you work a lot with RDF data; I only noticed it after I had written the post.\n", + "\n", + "We will start by importing all the races in the LOTR world. 
I have to admit I am a total noob when it comes to SPARQL, so I won't be explaining the syntax in depth. If you need a basic introduction on how to query WikiData, I suggest this tutorial on Youtube. Basically, all the races in the LOTR world are an instance of the Middle-earth races entity with id Q989255. To get the instances of a specific entity, we use the following SPARQL clause:\n", + "\n", + "?item wdt:P31 wd:Q989255\n", + "\n", + "This can be translated as \"We would like to fetch an item, which is an instance of (wdt:P31) an entity with an id Q989255\". After we have downloaded the data with APOC, we store the results to Neo4j." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "6t_-UwQ4Ha4J" + }, + "outputs": [], + "source": [ + "import_races_query = \"\"\"\n", + "\n", + "// Prepare a SPARQL query \n", + "WITH 'SELECT ?item ?itemLabel WHERE{ ?item wdt:P31 wd:Q989255 . SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\" }}' AS sparql \n", + "// make a request to Wikidata\n", + "CALL apoc.load.jsonParams('https://query.wikidata.org/sparql?query=' + \n", + " apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null) \n", + "YIELD value \n", + "// Unwind results to row \n", + "UNWIND value['results']['bindings'] as row \n", + "// Prepare data \n", + "WITH row['itemLabel']['value'] as race, \n", + " row['item']['value'] as url, \n", + " split(row['item']['value'],'/')[-1] as id \n", + "// Store to Neo4j \n", + "CREATE (r:Race) SET r.race = race, \n", + " r.url = url, \n", + " r.id = id\n", + "\n", + "\"\"\"\n", + "\n", + "r = run_query(import_races_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SJSd4ON_Ha4L" + }, + "source": [ + "That was easy. The next step is to fetch the characters that are an instance of a given Middle-earth race. 
The SPARQL syntax is almost identical to the previous query, except this time we iterate over each race and find the characters that are an instance of a given race." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "Q6d0zuc7Ha4L" + }, + "outputs": [], + "source": [ + "import_characters_query = \"\"\"\n", + "\n", + "// Iterate over each race in graph\n", + "MATCH (r:Race)\n", + "// Prepare a SparQL query\n", + "WITH 'SELECT ?item ?itemLabel WHERE { ?item wdt:P31 wd:' + r.id + ' . SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\" } }' AS sparql, r \n", + "// make a request to Wikidata \n", + "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", + " apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null)\n", + "YIELD value \n", + "UNWIND value['results']['bindings'] as row \n", + "WITH row['itemLabel']['value'] as name, \n", + " row['item']['value'] as url, \n", + " split(row['item']['value'],'/')[-1] as id, r \n", + "// Store to Neo4j \n", + "CREATE (c:Character) \n", + "SET c.name = name, \n", + " c.url = url, \n", + " c.id = id \n", + "CREATE (c)-[:BELONG_TO]->(r)\n", + "\n", + "\"\"\"\n", + "\n", + "r = run_query(import_characters_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m3vghrJ-Ha4M" + }, + "source": [ + "Did you know that there are at least 700 characters in the Middle-earth world? I would never guess there would be so many documented characters on WikiData. Our first exploratory cypher query will be to count the characters by race." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "QZYwABSvHa4N", + "outputId": "c0598e39-6598-4d63-9fd6-b57902e9b876" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "QZYwABSvHa4N", - "outputId": "c0598e39-6598-4d63-9fd6-b57902e9b876", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " race members\n", - "0 men in Tolkien's legendarium 345\n", - "1 Hobbit 150\n", - "2 Middle-earth elf 83\n", - "3 dwarves in Tolkien's legendarium 52\n", - "4 valar 16\n", - "5 half-elven 12\n", - "6 Maiar 10\n", - "7 Orcs in Tolkien's legendarium 9\n", - "8 Ent 5\n", - "9 dragons of Middle-earth 4" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
racemembers
0men in Tolkien's legendarium345
1Hobbit150
2Middle-earth elf83
3dwarves in Tolkien's legendarium52
4valar16
5half-elven12
6Maiar10
7Orcs in Tolkien's legendarium9
8Ent5
9dragons of Middle-earth4
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 23 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
racemembers
0Middle-earth man354
1Hobbit150
2Middle-earth elf86
3Middle-earth dwarf52
4Valar16
5half-elven11
6Maiar10
7Orcs in Tolkien's legendarium9
8Ent5
9dragons of Middle-earth3
\n", + "
" ], - "source": [ - "race_size_query = \"\"\"\n", - "\n", - "MATCH (r:Race) \n", - "RETURN r.race as race, \n", - " size((r)<-[:BELONG_TO]-()) as members \n", - "ORDER BY members DESC \n", - "LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(race_size_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UazugeKjHa4N" - }, - "source": [ - "The Fellowship of the Ring group is a somewhat representative sample of races in the Middle-earth. Most of the characters are either human or hobbits, with a couple of elves and dwarves strolling by. This is the first time I have heard of Valar and Maiar races though.\n", - "\n", - "Now it is time to enrich the graph with information about characters' gender, country, and manner of death. The SPARQL query will be a bit different than before. This time we will select a WikiData entity directly by its unique id and optionally fetch some of its properties. We can filter a specific entity by its id using the following SPARQL clause:\n", - "\n", - "filter (?item = wd:' + r.id + ')\n", - "\n", - "Similar to the cypher query language, SPARQL also differentiates between a MATCH and an OPTIONAL MATCH. When we want to return multiple properties of an entity, it is best to wrap each property into an OPTIONAL MATCH. This way we will get results if any of the properties exist. Without the OPTIONAL MATCH, we would only get results for entities where all three properties exist. This is an identical behavior to cypher.\n", - "\n", - "OPTIONAL{ ?item wdt:P21 [rdfs:label ?gender] . \n", - " filter (lang(?gender)=\"en\") }\n", - "\n", - "The wdt:P21 indicates we are interested in the gender property.  We also specify that we want to get the English label of an entity instead of its WikiData id. 
The easiest way to search for the desired property id is to inspect the entity on the WikiData web page and hover over a property name.\n", - "\n", - "Another way is to use the WikiData query editor, which has a great autocomplete function by using the CTRL+T command.\n", - "\n", - "To store the results back to Neo4j we will use the FOREACH trick. Because some of our results will contain null values, we have to wrap the MERGE statement into the FOREACH statement which supports conditional execution. Check the Tips and tricks blog post by Michael Hunger for more information." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "l3lj59OdHa4O" - }, - "outputs": [], - "source": [ - "import_gender_query = \"\"\"\n", - "\n", - "// Iterate over characters \n", - "MATCH (r:Character) \n", - "// Prepare a SparQL query \n", - "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", - " 'OPTIONAL{ ?item wdt:P21 [rdfs:label ?gender] . filter (lang(?gender)=\"en\") } ' + \n", - " 'OPTIONAL{ ?item wdt:P27 [rdfs:label ?country] . filter (lang(?country)=\"en\") } ' +\n", - " 'OPTIONAL{ ?item wdt:P1196 [rdfs:label ?death] . 
filter (lang(?death)=\"en\") }}' AS sparql, r \n", - "// make a request to Wikidata \n", - "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" \n", - " + sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null)\n", - "YIELD value \n", - "UNWIND value['results']['bindings'] as row \n", - "SET r.gender = row['gender']['value'], \n", - " r.manner_of_death = row['death']['value'] \n", - "// Execute FOREACH statement \n", - "FOREACH(ignoreme in case when row['country'] is not null then [1] else [] end | \n", - " MERGE (c:Country{name:row['country']['value']}) \n", - " MERGE (r)-[:IN_COUNTRY]->(c))\n", - "\n", - "\"\"\"\n", - "\n", - "r = run_query(import_gender_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GIpG7UxcHa4P" - }, - "source": [ - "We are connecting additional information to our graph bit by bit and slowly transforming it into a knowledge graph. Let's first look at the manner of death property." + "text/plain": [ + " race members\n", + "0 Middle-earth man 354\n", + "1 Hobbit 150\n", + "2 Middle-earth elf 86\n", + "3 Middle-earth dwarf 52\n", + "4 Valar 16\n", + "5 half-elven 11\n", + "6 Maiar 10\n", + "7 Orcs in Tolkien's legendarium 9\n", + "8 Ent 5\n", + "9 dragons of Middle-earth 3" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "race_size_query = \"\"\"\n", + "\n", + "MATCH (r:Race) \n", + "RETURN r.race as race, \n", + " count{ (r)<-[:BELONG_TO]-() } as members \n", + "ORDER BY members DESC \n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(race_size_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UazugeKjHa4N" + }, + "source": [ + "The Fellowship of the Ring group is a somewhat representative sample of races in the Middle-earth. Most of the characters are either human or hobbits, with a couple of elves and dwarves strolling by. 
This is the first time I have heard of the Valar and Maiar races, though.\n", + "\n", + "Now it is time to enrich the graph with information about characters' gender, country, and manner of death. The SPARQL query will be a bit different from before. This time we will select a WikiData entity directly by its unique id and optionally fetch some of its properties. We can filter a specific entity by its id using the following SPARQL clause:\n", + "\n", + "filter (?item = wd:' + r.id + ')\n", + "\n", + "Similar to the Cypher query language, SPARQL also differentiates between required and optional patterns: a plain triple pattern behaves like a MATCH, while an OPTIONAL block behaves like an OPTIONAL MATCH. When we want to return multiple properties of an entity, it is best to wrap each property into its own OPTIONAL block. This way we still get a result for an entity even when some of its properties are missing. Without the OPTIONAL blocks, we would only get results for entities where all three properties exist. This behavior is identical to Cypher's.\n", + "\n", + "OPTIONAL{ ?item wdt:P21 [rdfs:label ?gender] . \n", + " filter (lang(?gender)=\"en\") }\n", + "\n", + "The wdt:P21 predicate indicates we are interested in the gender property. We also specify that we want to get the English label of an entity instead of its WikiData id. The easiest way to search for the desired property id is to inspect the entity on the WikiData web page and hover over a property name.\n", + "\n", + "Another way is to use the WikiData query editor, which has a great autocomplete function that you can invoke with the CTRL+T command.\n", + "\n", + "To store the results back to Neo4j we will use the FOREACH trick. Because some of our results will contain null values, we have to wrap the MERGE statement in a FOREACH statement that iterates over either a one-element list or an empty list, which gives us conditional execution. Check the Tips and tricks blog post by Michael Hunger for more information."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "l3lj59OdHa4O" + }, + "outputs": [], + "source": [ + "import_gender_query = \"\"\"\n", + "\n", + "// Iterate over characters \n", + "MATCH (r:Character) \n", + "// Prepare a SparQL query \n", + "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", + " 'OPTIONAL{ ?item wdt:P21 [rdfs:label ?gender] . filter (lang(?gender)=\"en\") } ' + \n", + " 'OPTIONAL{ ?item wdt:P27 [rdfs:label ?country] . filter (lang(?country)=\"en\") } ' +\n", + " 'OPTIONAL{ ?item wdt:P1196 [rdfs:label ?death] . filter (lang(?death)=\"en\") }}' AS sparql, r \n", + "// make a request to Wikidata \n", + "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" \n", + " + apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null)\n", + "YIELD value \n", + "UNWIND value['results']['bindings'] as row \n", + "SET r.gender = row['gender']['value'], \n", + " r.manner_of_death = row['death']['value'] \n", + "// Execute FOREACH statement \n", + "FOREACH(ignoreme in case when row['country'] is not null then [1] else [] end | \n", + " MERGE (c:Country{name:row['country']['value']}) \n", + " MERGE (r)-[:IN_COUNTRY]->(c))\n", + "\n", + "\"\"\"\n", + "\n", + "r = run_query(import_gender_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GIpG7UxcHa4P" + }, + "source": [ + "We are connecting additional information to our graph bit by bit and slowly transforming it into a knowledge graph. Let's first look at the manner of death property." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 }, + "id": "f5s0jWxQHa4P", + "outputId": "4a7b9491-6932-4644-ba34-b0413c21bcf4" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "id": "f5s0jWxQHa4P", - "outputId": "4a7b9491-6932-4644-ba34-b0413c21bcf4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 143 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " manner_of_death count\n", - "0 homicide 3\n", - "1 death in battle 1\n", - "2 accident 1" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
manner_of_deathcount
0homicide3
1death in battle1
2accident1
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 27 - } + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
manner_of_deathcount
0homicide3
1death in battle1
2accident1
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "source": [ - "manner_of_death_query = \"\"\"\n", - "\n", - "MATCH (n:Character) \n", - "WHERE exists (n.manner_of_death) \n", - "RETURN n.manner_of_death as manner_of_death, \n", - " count(*) as count\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(manner_of_death_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vBKeyjrFHa4P" - }, - "source": [ - "Nothing of interest. This is obviously not the Game of Thrones series. Let's also inspect the results of the country property." + "text/plain": [ + " manner_of_death count\n", + "0 homicide 3\n", + "1 death in battle 1\n", + "2 accident 1" ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "manner_of_death_query = \"\"\"\n", + "\n", + "MATCH (n:Character) \n", + "WHERE n.manner_of_death IS NOT NULL \n", + "RETURN n.manner_of_death as manner_of_death, \n", + " count(*) as count\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(manner_of_death_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vBKeyjrFHa4P" + }, + "source": [ + "Nothing of interest. This is obviously not the Game of Thrones series. Let's also inspect the results of the country property." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "H-OtGtHLHa4Q", + "outputId": "ca396601-a14c-4cb6-def0-4ba1769eb837" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "id": "H-OtGtHLHa4Q", - "outputId": "ca396601-a14c-4cb6-def0-4ba1769eb837", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " country members\n", - "0 Gondor 70\n", - "1 Shire 48\n", - "2 Rohan 34\n", - "3 Númenor 34\n", - "4 Arthedain 16\n", - "5 Arnor 8\n", - "6 Doriath 5\n", - "7 Reunited Kingdom 3\n", - "8 Lothlórien 3\n", - "9 Gondolin 3" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countrymembers
0Gondor70
1Shire48
2Rohan34
3Númenor34
4Arthedain16
5Arnor8
6Doriath5
7Reunited Kingdom3
8Lothlórien3
9Gondolin3
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 28 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrymembers
0Gondor70
1Shire48
2Rohan34
3Númenor34
4Arthedain16
5Arnor8
6Doriath5
7Reunited Kingdom3
8Lothlórien3
9Gondolin3
\n", + "
" ], - "source": [ - "country_info_query = \"\"\"\n", - "\n", - "MATCH (c:Country)\n", - "RETURN c.name as country, \n", - " size((c)<-[:IN_COUNTRY]-()) as members\n", - "ORDER BY members DESC \n", - "LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(country_info_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bvplf6CEHa4Q" - }, - "source": [ - "We have the country information for 236 characters. We could make some hypotheses and try to populate missing country values. Let's assume that if two characters are siblings, they belong to the same country. This makes a lot of sense. To be able to achieve this, we have to import the familial ties from WikiData. Specifically, we will fetch the father, mother, relative, sibling, and spouse connections." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "BIqDirK8Ha4Q" - }, - "outputs": [], - "source": [ - "import_social_query = \"\"\"\n", - "\n", - "// Iterate over characters \n", - "MATCH (r:Character) \n", - "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . 
' + \n", - " 'OPTIONAL{ ?item wdt:P22 ?father } OPTIONAL{ ?item wdt:P25 ?mother } OPTIONAL{ ?item wdt:P1038 ?relative } ' +\n", - " 'OPTIONAL{ ?item wdt:P3373 ?sibling } OPTIONAL{ ?item wdt:P26 ?spouse }}' AS sparql, r \n", - "// make a request to wikidata \n", - "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", - " sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null) YIELD value \n", - "UNWIND value['results']['bindings'] as row \n", - "FOREACH(ignoreme in case when row['mother'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['mother']['value']}) \n", - " MERGE (r)-[:HAS_MOTHER]->(c)) \n", - "FOREACH(ignoreme in case when row['father'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['father']['value']}) \n", - " MERGE (r)-[:HAS_FATHER]->(c)) \n", - "FOREACH(ignoreme in case when row['relative'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['relative']['value']}) \n", - " MERGE (r)-[:HAS_RELATIVE]-(c)) \n", - "FOREACH(ignoreme in case when row['sibling'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['sibling']['value']}) \n", - " MERGE (r)-[:SIBLING]-(c))\n", - "FOREACH(ignoreme in case when row['spouse'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['spouse']['value']}) \n", - " MERGE (r)-[:SPOUSE]-(c))\n", - "\n", - "\"\"\"\n", - "\n", - "r = run_query(import_social_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I0qIuaSYHa4R" - }, - "source": [ - "Before we begin filling-in missing values, let's check for promiscuity in the Middle-earth. The first query will search for characters with multiple spouses." 
+ "text/plain": [ + " country members\n", + "0 Gondor 70\n", + "1 Shire 48\n", + "2 Rohan 34\n", + "3 Númenor 34\n", + "4 Arthedain 16\n", + "5 Arnor 8\n", + "6 Doriath 5\n", + "7 Reunited Kingdom 3\n", + "8 Lothlórien 3\n", + "9 Gondolin 3" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_info_query = \"\"\"\n", + "\n", + "MATCH (c:Country)\n", + "RETURN c.name as country, \n", + " count{ (c)<-[:IN_COUNTRY]-() } as members\n", + "ORDER BY members DESC \n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(country_info_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bvplf6CEHa4Q" + }, + "source": [ + "We have the country information for 236 characters. We could make some hypotheses and try to populate missing country values. Let's assume that if two characters are siblings, they belong to the same country. This makes a lot of sense. To be able to achieve this, we have to import the familial ties from WikiData. Specifically, we will fetch the father, mother, relative, sibling, and spouse connections." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "BIqDirK8Ha4Q" + }, + "outputs": [], + "source": [ + "import_social_query = \"\"\"\n", + "\n", + "// Iterate over characters \n", + "MATCH (r:Character) \n", + "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . 
' + \n", + " 'OPTIONAL{ ?item wdt:P22 ?father } OPTIONAL{ ?item wdt:P25 ?mother } OPTIONAL{ ?item wdt:P1038 ?relative } ' +\n", + " 'OPTIONAL{ ?item wdt:P3373 ?sibling } OPTIONAL{ ?item wdt:P26 ?spouse }}' AS sparql, r \n", + "// make a request to wikidata \n", + "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", + " apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null) YIELD value \n", + "UNWIND value['results']['bindings'] as row \n", + "FOREACH(ignoreme in case when row['mother'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['mother']['value']}) \n", + " MERGE (r)-[:HAS_MOTHER]->(c)) \n", + "FOREACH(ignoreme in case when row['father'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['father']['value']}) \n", + " MERGE (r)-[:HAS_FATHER]->(c)) \n", + "FOREACH(ignoreme in case when row['relative'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['relative']['value']}) \n", + " MERGE (r)-[:HAS_RELATIVE]-(c)) \n", + "FOREACH(ignoreme in case when row['sibling'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['sibling']['value']}) \n", + " MERGE (r)-[:SIBLING]-(c))\n", + "FOREACH(ignoreme in case when row['spouse'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['spouse']['value']}) \n", + " MERGE (r)-[:SPOUSE]-(c))\n", + "\n", + "\"\"\"\n", + "\n", + "r = run_query(import_social_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I0qIuaSYHa4R" + }, + "source": [ + "Before we begin filling-in missing values, let's check for promiscuity in the Middle-earth. The first query will search for characters with multiple spouses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "imDtdPM7Ha4R", + "outputId": "1d605d05-14e1-483b-b115-8d2040bc7ae6" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "id": "imDtdPM7Ha4R", - "outputId": "1d605d05-14e1-483b-b115-8d2040bc7ae6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 [Indis, Finwë, Míriel]\n", - "1 [Míriel, Finwë, Indis]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0[Indis, Finwë, Míriel]
1[Míriel, Finwë, Indis]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 33 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0[Indis, Finwë, Míriel]
1[Míriel, Finwë, Indis]
\n", + "
" ], - "source": [ - "multiple_spouses_query = \"\"\"\n", - "\n", - "MATCH p=(a)-[:SPOUSE]-(b)-[:SPOUSE]-(c) \n", - "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(multiple_spouses_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NdC3zUP0Ha4R" - }, - "source": [ - "We actually found a single character with two spouses. It is Finwë, the first King of the Noldor. We can also take a look if someone has kids with multiple partners" + "text/plain": [ + " result\n", + "0 [Indis, Finwë, Míriel]\n", + "1 [Míriel, Finwë, Indis]" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multiple_spouses_query = \"\"\"\n", + "\n", + "MATCH p=(a)-[:SPOUSE]-(b)-[:SPOUSE]-(c) \n", + "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(multiple_spouses_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NdC3zUP0Ha4R" + }, + "source": [ + "We actually found a single character with two spouses. It is Finwë, the first King of the Noldor. We can also take a look if someone has kids with multiple partners" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "AFYIBdk8Ha4S", + "outputId": "b20d060a-9dc0-4fad-c808-387ea40f5151" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "id": "AFYIBdk8Ha4S", - "outputId": "b20d060a-9dc0-4fad-c808-387ea40f5151", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 [Finwë, Fingolfin, Indis]\n", - "1 [Finwë, Finarfin, Indis]\n", - "2 [Finwë, Findis, Indis]\n", - "3 [Finwë, Irimë, Indis]\n", - "4 [Finwë, Fëanor, Míriel]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0[Finwë, Fingolfin, Indis]
1[Finwë, Finarfin, Indis]
2[Finwë, Findis, Indis]
3[Finwë, Irimë, Indis]
4[Finwë, Fëanor, Míriel]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 34 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0[Finwë, Fingolfin, Indis]
1[Finwë, Findis, Indis]
2[Finwë, Irimë, Indis]
3[Finwë, Finarfin, Indis]
4[Finwë, Fëanor, Míriel]
\n", + "
" ], - "source": [ - "multiple_kids_query = \"\"\"\n", - "\n", - "MATCH (c:Character)<-[:HAS_FATHER|HAS_MOTHER]-()-[:HAS_FATHER|HAS_MOTHER]->(other) \n", - "WITH c, collect(distinct other) as others \n", - "WHERE size(others) > 1 \n", - "MATCH p=(c)<-[:HAS_FATHER|HAS_MOTHER]-()-[:HAS_FATHER|HAS_MOTHER]->() \n", - "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(multiple_kids_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YcfTme1qHa4S" - }, - "source": [ - "So it seems that Finwë has four children with Indis and a single child with Míriel. On the other hand, it is quite weird that Beren has two fathers. I guess Adanel has some explaining to do. We would probably find more death and promiscuity in the GoT world.\n", - "\n", - "## Populate missing values\n", - "\n", - "Now that we know that the Middle-earth characters abstain from promiscuity, let's populate the missing country values. Remember our hypothesis was:\n", - "\n", - ">If two characters are siblings, they belong to the same country.\n", - "\n", - "Before we populate the missing values for countries, let's populate the missing values for siblings. We will assume that if two characters have the same mother or father, they are siblings. Let's look at some sibling candidates." 
+ "text/plain": [ + " result\n", + "0 [Finwë, Fingolfin, Indis]\n", + "1 [Finwë, Findis, Indis]\n", + "2 [Finwë, Irimë, Indis]\n", + "3 [Finwë, Finarfin, Indis]\n", + "4 [Finwë, Fëanor, Míriel]" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multiple_kids_query = \"\"\"\n", + "\n", + "MATCH (c:Character)<-[:HAS_FATHER|HAS_MOTHER]-()-[:HAS_FATHER|HAS_MOTHER]->(other) \n", + "WITH c, collect(distinct other) as others \n", + "WHERE size(others) > 1 \n", + "MATCH p=(c)<-[:HAS_FATHER|HAS_MOTHER]-()-[:HAS_FATHER|HAS_MOTHER]->() \n", + "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(multiple_kids_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YcfTme1qHa4S" + }, + "source": [ + "So it seems that Finwë has four children with Indis and a single child with Míriel. On the other hand, it is quite weird that Beren has two fathers. I guess Adanel has some explaining to do. We would probably find more death and promiscuity in the GoT world.\n", + "\n", + "## Populate missing values\n", + "\n", + "Now that we know that the Middle-earth characters abstain from promiscuity, let's populate the missing country values. Remember our hypothesis was:\n", + "\n", + ">If two characters are siblings, they belong to the same country.\n", + "\n", + "Before we populate the missing values for countries, let's populate the missing values for siblings. We will assume that if two characters have the same mother or father, they are siblings. Let's look at some sibling candidates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "XWdkChq-Ha4T", + "outputId": "7e511e96-9487-4c73-b69e-9e85dca43890" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "id": "XWdkChq-Ha4T", - "outputId": "7e511e96-9487-4c73-b69e-9e85dca43890", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 [Ferumbras Took II, Isumbras Took III, Bandobr...\n", - "1 [Bingo Baggins, Laura Grubb, Bungo Baggins]\n", - "2 [Belba Baggins, Laura Grubb, Bungo Baggins]\n", - "3 [Linda Proudfoot, Laura Grubb, Bungo Baggins]\n", - "4 [Bingo Baggins, Mungo Baggins, Bungo Baggins]\n", - "5 [Linda Proudfoot, Mungo Baggins, Bungo Baggins]\n", - "6 [Belba Baggins, Mungo Baggins, Bungo Baggins]\n", - "7 [Hildigard Took, Gerontius Took, Isembard Took]\n", - "8 [Isengar Took, Gerontius Took, Isembard Took]\n", - "9 [Isengrim Took III, Gerontius Took, Isembard T..." - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0[Ferumbras Took II, Isumbras Took III, Bandobr...
1[Bingo Baggins, Laura Grubb, Bungo Baggins]
2[Belba Baggins, Laura Grubb, Bungo Baggins]
3[Linda Proudfoot, Laura Grubb, Bungo Baggins]
4[Bingo Baggins, Mungo Baggins, Bungo Baggins]
5[Linda Proudfoot, Mungo Baggins, Bungo Baggins]
6[Belba Baggins, Mungo Baggins, Bungo Baggins]
7[Hildigard Took, Gerontius Took, Isembard Took]
8[Isengar Took, Gerontius Took, Isembard Took]
9[Isengrim Took III, Gerontius Took, Isembard T...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 35 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0[Ferumbras Took II, Isumbras Took III, Bandobr...
1[Linda Proudfoot, Mungo Baggins, Bungo Baggins]
2[Bingo Baggins, Mungo Baggins, Bungo Baggins]
3[Belba Baggins, Mungo Baggins, Bungo Baggins]
4[Belba Baggins, Laura Grubb, Bungo Baggins]
5[Linda Proudfoot, Laura Grubb, Bungo Baggins]
6[Bingo Baggins, Laura Grubb, Bungo Baggins]
7[Isembold Took, Adamanta Chubb, Isembard Took]
8[Isengar Took, Adamanta Chubb, Isembard Took]
9[Donnamira Took, Adamanta Chubb, Isembard Took]
\n", + "
" ], - "source": [ - "sibling_candidate_query = \"\"\"\n", - "\n", - "MATCH p=(a:Character)-[:HAS_FATHER|:HAS_MOTHER]->()<-[:HAS_FATHER|:HAS_MOTHER]-(b:Character) \n", - "WHERE NOT (a)-[:SIBLING]-(b) \n", - "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(sibling_candidate_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bHKPYakKHa4T" - }, - "source": [ - "Adamanta Chubb has at least six children. Only two of them are marked as siblings. Because all of them are siblings by definition, we will fill in the missing connections." + "text/plain": [ + " result\n", + "0 [Ferumbras Took II, Isumbras Took III, Bandobr...\n", + "1 [Linda Proudfoot, Mungo Baggins, Bungo Baggins]\n", + "2 [Bingo Baggins, Mungo Baggins, Bungo Baggins]\n", + "3 [Belba Baggins, Mungo Baggins, Bungo Baggins]\n", + "4 [Belba Baggins, Laura Grubb, Bungo Baggins]\n", + "5 [Linda Proudfoot, Laura Grubb, Bungo Baggins]\n", + "6 [Bingo Baggins, Laura Grubb, Bungo Baggins]\n", + "7 [Isembold Took, Adamanta Chubb, Isembard Took]\n", + "8 [Isengar Took, Adamanta Chubb, Isembard Took]\n", + "9 [Donnamira Took, Adamanta Chubb, Isembard Took]" ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sibling_candidate_query = \"\"\"\n", + "\n", + "MATCH p=(a:Character)-[:HAS_FATHER|:HAS_MOTHER]->()<-[:HAS_FATHER|:HAS_MOTHER]-(b:Character) \n", + "WHERE NOT exists { (a)-[:SIBLING]-(b) } \n", + "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(sibling_candidate_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bHKPYakKHa4T" + }, + "source": [ + "Adamanta Chubb has at least six children. Only two of them are marked as siblings. Because all of them are siblings by definition, we will fill in the missing connections." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "VHdXvvIHHa4T", + "outputId": "f3095cf0-4496-41d5-eb6f-7a9e52d7ffb4" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "id": "VHdXvvIHHa4T", - "outputId": "f3095cf0-4496-41d5-eb6f-7a9e52d7ffb4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 36 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "sibling_populate_query = \"\"\"\n", - "\n", - "MATCH p=(a:Character)-[:HAS_FATHER|:HAS_MOTHER]->()<-[:HAS_FATHER|:HAS_MOTHER]-(b:Character) \n", - "WHERE NOT (a)-[:SIBLING]-(b) \n", - "MERGE (a)-[:SIBLING]-(b)\n", - "\n", - "\"\"\"\n", - "run_query(sibling_populate_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LsxV7gZJHa4T" - }, - "source": [ - "The query added 118 missing relationships. I need to learn how to update the WikiData knowledge graph and add the missing relationships in bulk. Now we can fill in the missing country values for siblings. We will match all characters with the filled in country information and search for their siblings that don't have the country information. I love how easy it is to express this pattern with cypher query language." + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sibling_populate_query = \"\"\"\n", + "\n", + "MATCH p=(a:Character)-[:HAS_FATHER|:HAS_MOTHER]->()<-[:HAS_FATHER|:HAS_MOTHER]-(b:Character) \n", + "WHERE NOT exists { (a)-[:SIBLING]-(b) } \n", + "MERGE (a)-[:SIBLING]-(b)\n", + "\n", + "\"\"\"\n", + "run_query(sibling_populate_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LsxV7gZJHa4T" + }, + "source": [ + "The query added 118 missing relationships. I need to learn how to update the WikiData knowledge graph and add the missing relationships in bulk. Now we can fill in the missing country values for siblings. We will match all characters with the filled in country information and search for their siblings that don't have the country information. I love how easy it is to express this pattern with cypher query language." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "Ipj0d95THa4T", + "outputId": "2d95cf28-d60a-487d-d51e-a8ce2a327ef6" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "id": "Ipj0d95THa4T", - "outputId": "2d95cf28-d60a-487d-d51e-a8ce2a327ef6", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 37 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "country_populate_query = \"\"\"\n", - "\n", - "MATCH (country)<-[:IN_COUNTRY]-(s:Character)-[:SIBLING]-(t:Character) \n", - "WHERE NOT (t)-[:IN_COUNTRY]->() \n", - "MERGE (t)-[:IN_COUNTRY]->(country)\n", - "\n", - "\"\"\"\n", - "run_query(country_populate_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KDh1CjK3Ha4U" - }, - "source": [ - "There were 49 missing countries added. We could easily come up with more hypotheses to fill in the missing values. You can try and maybe add some other missing values yourself.\n", - "\n", - "We still have to add some information to our graph. In this query, we will add the information about the occupation, language, groups, and events of characters. The SPARQL query is identical to before where we iterate over each character and fetch additional properties." + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_populate_query = \"\"\"\n", + "\n", + "MATCH (country)<-[:IN_COUNTRY]-(s:Character)-[:SIBLING]-(t:Character) \n", + "WHERE NOT exists { (t)-[:IN_COUNTRY]->() }\n", + "MERGE (t)-[:IN_COUNTRY]->(country)\n", + "\n", + "\"\"\"\n", + "run_query(country_populate_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KDh1CjK3Ha4U" + }, + "source": [ + "There were 49 missing countries added. We could easily come up with more hypotheses to fill in the missing values. You can try and maybe add some other missing values yourself.\n", + "\n", + "We still have to add some information to our graph. In this query, we will add the information about the occupation, language, groups, and events of characters. The SPARQL query is identical to before where we iterate over each character and fetch additional properties." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "3AVPww1fHa4U", + "outputId": "faae2f85-abe9-4d2e-a795-9921abe21ce2" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "id": "3AVPww1fHa4U", - "outputId": "faae2f85-abe9-4d2e-a795-9921abe21ce2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 38 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "import_groups_query = \"\"\"\n", - "\n", - "MATCH (r:Character) \n", - "WHERE exists (r.id) \n", - "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", - " 'OPTIONAL { ?item wdt:P106 [rdfs:label ?occupation ] . filter (lang(?occupation) = \"en\" ). } ' +\n", - " 'OPTIONAL { ?item wdt:P103 [rdfs:label ?language ] . filter (lang(?language) = \"en\" ) . } ' +\n", - " 'OPTIONAL { ?item wdt:P463 [rdfs:label ?member_of ] . filter (lang(?member_of) = \"en\" ). } ' +\n", - " 'OPTIONAL { ?item wdt:P1344[rdfs:label ?participant ] . filter (lang(?participant) = \"en\") . } ' +\n", - " 'OPTIONAL { ?item wdt:P39[rdfs:label ?position ] . filter (lang(?position) = \"en\") . }}' AS sparql, r \n", - "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", - " sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null) \n", - "YIELD value \n", - "UNWIND value['results']['bindings'] as row \n", - "FOREACH(ignoreme in case when row['language'] is not null then [1] else [] end | \n", - " MERGE (c:Language{name:row['language']['value']}) \n", - " MERGE (r)-[:HAS_LANGUAGE]->(c)) \n", - "FOREACH(ignoreme in case when row['occupation'] is not null then [1] else [] end | \n", - " MERGE (c:Occupation{name:row['occupation']['value']}) \n", - " MERGE (r)-[:HAS_OCCUPATION]->(c)) \n", - "FOREACH(ignoreme in case when row['member_of'] is not null then [1] else [] end | \n", - " MERGE (c:Group{name:row['member_of']['value']}) \n", - " MERGE (r)-[:MEMBER_OF]->(c)) \n", - "FOREACH(ignoreme in case when row['participant'] is not null then [1] else [] end | \n", - " MERGE (c:Event{name:row['participant']['value']}) \n", - " MERGE (r)-[:PARTICIPATED]->(c)) \n", - "SET r.position = row['position']['value']\n", - "\n", - "\"\"\"\n", - "run_query(import_groups_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OVN0slGqHa4U" - }, - "source": [ - "Let's 
investigate the results of the groups and the occupation of the characters." + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import_groups_query = \"\"\"\n", + "\n", + "MATCH (r:Character) \n", + "WHERE r.id IS NOT NULL \n", + "WITH 'SELECT * WHERE{ ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", + " 'OPTIONAL { ?item wdt:P106 [rdfs:label ?occupation ] . filter (lang(?occupation) = \"en\" ). } ' +\n", + " 'OPTIONAL { ?item wdt:P103 [rdfs:label ?language ] . filter (lang(?language) = \"en\" ) . } ' +\n", + " 'OPTIONAL { ?item wdt:P463 [rdfs:label ?member_of ] . filter (lang(?member_of) = \"en\" ). } ' +\n", + " 'OPTIONAL { ?item wdt:P1344[rdfs:label ?participant ] . filter (lang(?participant) = \"en\") . } ' +\n", + " 'OPTIONAL { ?item wdt:P39[rdfs:label ?position ] . filter (lang(?position) = \"en\") . }}' AS sparql, r \n", + "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", + " apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null) \n", + "YIELD value \n", + "UNWIND value['results']['bindings'] as row \n", + "FOREACH(ignoreme in case when row['language'] is not null then [1] else [] end | \n", + " MERGE (c:Language{name:row['language']['value']}) \n", + " MERGE (r)-[:HAS_LANGUAGE]->(c)) \n", + "FOREACH(ignoreme in case when row['occupation'] is not null then [1] else [] end | \n", + " MERGE (c:Occupation{name:row['occupation']['value']}) \n", + " MERGE (r)-[:HAS_OCCUPATION]->(c)) \n", + "FOREACH(ignoreme in case when row['member_of'] is not null then [1] else [] end | \n", + " MERGE (c:Group{name:row['member_of']['value']}) \n", + " MERGE (r)-[:MEMBER_OF]->(c)) \n", + "FOREACH(ignoreme in case when row['participant'] is not null then [1] else [] end | \n", + " MERGE (c:Event{name:row['participant']['value']}) \n", 
+ " MERGE (r)-[:PARTICIPATED]->(c)) \n", + "SET r.position = row['position']['value']\n", + "\n", + "\"\"\"\n", + "run_query(import_groups_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OVN0slGqHa4U" + }, + "source": [ + "Let's investigate the results of the groups and the occupation of the characters." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 }, + "id": "NQsHXDmgHa4V", + "outputId": "fefc55bc-65c1-4a95-c4ce-12afc8c53f12" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "id": "NQsHXDmgHa4V", - "outputId": "fefc55bc-65c1-4a95-c4ce-12afc8c53f12", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 300 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " group size members \\\n", - "0 Thorin and Company 14 [Bofur, Óin, Glóin] \n", - "1 Fellowship of the Ring 8 [Samwise Gamgee, Frodo Baggins, Legolas] \n", - "2 White Council 2 [Elrond, Gandalf] \n", - "3 Union of Maedhros 2 [Haldir, Halmir] \n", - "4 Wise 2 [Adanel, Andreth] \n", - "5 Rangers of Ithilien 2 [Damrod, Madril] \n", - "6 Istari 1 [Gandalf] \n", - "7 White Company 1 [Beregond] \n", - "\n", - " occupations \n", - "0 [diarist, swordfighter] \n", - "1 [swordfighter, archer] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - "5 [] \n", - "6 [] \n", - "7 [] " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
groupsizemembersoccupations
0Thorin and Company14[Bofur, Óin, Glóin][diarist, swordfighter]
1Fellowship of the Ring8[Samwise Gamgee, Frodo Baggins, Legolas][swordfighter, archer]
2White Council2[Elrond, Gandalf][]
3Union of Maedhros2[Haldir, Halmir][]
4Wise2[Adanel, Andreth][]
5Rangers of Ithilien2[Damrod, Madril][]
6Istari1[Gandalf][]
7White Company1[Beregond][]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 39 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
groupsizemembersoccupations
0Thorin and Company14[Glóin, Bofur, Thorin II][swordfighter, diarist]
1Fellowship of the Ring12[Gimli, Peregrin Took, Samwise Gamgee][swordfighter, domestic worker, gardener]
2White Council3[Elrond, Gandalf, Gandalf][magician, swordfighter]
3Rangers of Ithilien2[Damrod, Madril][]
4Union of Maedhros2[Haldir, Halmir][]
5Wise2[Adanel, Andreth][]
6Istari2[Gandalf, Gandalf][magician, swordfighter]
7White Company1[Beregond][guard]
\n", + "
" ], - "source": [ - "investigate_groups_query = \"\"\"\n", - "\n", - "MATCH (n:Group)<-[:MEMBER_OF]-(c)\n", - "OPTIONAL MATCH (c)-[:HAS_OCCUPATION]->(o) \n", - "RETURN n.name as group, \n", - " count(*) as size, \n", - " collect(c.name)[..3] as members, \n", - " collect(distinct o.name)[..3] as occupations \n", - "ORDER BY size DESC\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(investigate_groups_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nZGm5a1VHa4V" - }, - "source": [ - "It was at this moment that I realized the whole Hobbit series are included. Balin was the diarist for the Thorin and Company group. For some reason, I was expecting Bilbo Baggins to be the diarist. Obviously, there can be only one archer in the Fellowship of the Ring group, and that is Legolas. Gandalf seems to be involved in a couple of groups.\n", - "\n", - "We will execute one more WikiData API call. This time we will fetch the enemies and the items the characters own." - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "id": "iZ075XIuHa4V" - }, - "outputs": [], - "source": [ - "import_enemy_query = \"\"\"\n", - "\n", - "MATCH (r:Character) \n", - "WHERE exists (r.id) \n", - "WITH 'SELECT * WHERE { ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", - " 'OPTIONAL{ ?item wdt:P1830 [rdfs:label ?owner ] . filter (lang(?owner) = \"en\" ). 
} ' +\n", - " 'OPTIONAL{ ?item wdt:P7047 ?enemy }}' AS sparql, r \n", - "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", - " sparql, \n", - " { Accept: \"application/sparql-results+json\"}, null) \n", - "YIELD value \n", - "WITH value,r \n", - "WHERE value['results']['bindings'] <> [] \n", - "UNWIND value['results']['bindings'] as row \n", - "FOREACH(ignoreme in case when row['owner'] is not null then [1] else [] end |\n", - " MERGE (c:Item{name:row['owner']['value']}) \n", - " MERGE (r)-[:OWNS_ITEM]->(c)) \n", - "FOREACH(ignoreme in case when row['enemy'] is not null then [1] else [] end | \n", - " MERGE (c:Character{url:row['enemy']['value']}) \n", - " MERGE (r)-[:ENEMY]->(c))\n", - "\n", - "\"\"\"\n", - "\n", - "r = execute_query(import_enemy_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IGRptZDwHa4W" - }, - "source": [ - "Finally, we have finished importing our graph. Let's look at how many enemies are there between direct family members." 
+ "text/plain": [ + " group size members \\\n", + "0 Thorin and Company 14 [Glóin, Bofur, Thorin II] \n", + "1 Fellowship of the Ring 12 [Gimli, Peregrin Took, Samwise Gamgee] \n", + "2 White Council 3 [Elrond, Gandalf, Gandalf] \n", + "3 Rangers of Ithilien 2 [Damrod, Madril] \n", + "4 Union of Maedhros 2 [Haldir, Halmir] \n", + "5 Wise 2 [Adanel, Andreth] \n", + "6 Istari 2 [Gandalf, Gandalf] \n", + "7 White Company 1 [Beregond] \n", + "\n", + " occupations \n", + "0 [swordfighter, diarist] \n", + "1 [swordfighter, domestic worker, gardener] \n", + "2 [magician, swordfighter] \n", + "3 [] \n", + "4 [] \n", + "5 [] \n", + "6 [magician, swordfighter] \n", + "7 [guard] " ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "investigate_groups_query = \"\"\"\n", + "\n", + "MATCH (n:Group)<-[:MEMBER_OF]-(c)\n", + "OPTIONAL MATCH (c)-[:HAS_OCCUPATION]->(o) \n", + "RETURN n.name as group, \n", + " count(*) as size, \n", + " collect(c.name)[..3] as members, \n", + " collect(distinct o.name)[..3] as occupations \n", + "ORDER BY size DESC\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(investigate_groups_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nZGm5a1VHa4V" + }, + "source": [ + "It was at this moment that I realized the whole Hobbit series are included. Balin was the diarist for the Thorin and Company group. For some reason, I was expecting Bilbo Baggins to be the diarist. Obviously, there can be only one archer in the Fellowship of the Ring group, and that is Legolas. Gandalf seems to be involved in a couple of groups.\n", + "\n", + "We will execute one more WikiData API call. This time we will fetch the enemies and the items the characters own." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "iZ075XIuHa4V" + }, + "outputs": [], + "source": [ + "import_enemy_query = \"\"\"\n", + "\n", + "MATCH (r:Character) \n", + "WHERE r.id IS NOT NULL \n", + "WITH 'SELECT * WHERE { ?item rdfs:label ?name . filter (?item = wd:' + r.id + ') filter (lang(?name) = \"en\" ) . ' +\n", + " 'OPTIONAL{ ?item wdt:P1830 [rdfs:label ?owner ] . filter (lang(?owner) = \"en\" ). } ' +\n", + " 'OPTIONAL{ ?item wdt:P7047 ?enemy }}' AS sparql, r \n", + "CALL apoc.load.jsonParams( \"https://query.wikidata.org/sparql?query=\" + \n", + " apoc.text.urlencode(sparql), \n", + " { Accept: \"application/sparql-results+json\"}, null) \n", + "YIELD value \n", + "WITH value,r \n", + "WHERE value['results']['bindings'] <> [] \n", + "UNWIND value['results']['bindings'] as row \n", + "FOREACH(ignoreme in case when row['owner'] is not null then [1] else [] end |\n", + " MERGE (c:Item{name:row['owner']['value']}) \n", + " MERGE (r)-[:OWNS_ITEM]->(c)) \n", + "FOREACH(ignoreme in case when row['enemy'] is not null then [1] else [] end | \n", + " MERGE (c:Character{url:row['enemy']['value']}) \n", + " MERGE (r)-[:ENEMY]->(c))\n", + "\n", + "\"\"\"\n", + "\n", + "r = run_query(import_enemy_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IGRptZDwHa4W" + }, + "source": [ + "Finally, we have finished importing our graph. Let's look at how many enemies there are between direct family members." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 }, + "id": "lMs1wi0fHa4W", + "outputId": "2d76e2b2-6d8f-472e-f92d-671a1539be0a" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "id": "lMs1wi0fHa4W", - "outputId": "2d76e2b2-6d8f-472e-f92d-671a1539be0a", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 [Manwë, Morgoth]\n", - "1 [Morgoth, Manwë]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0[Manwë, Morgoth]
1[Morgoth, Manwë]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 42 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0[Manwë, Morgoth]
1[Morgoth, Manwë]
\n", + "
" ], - "source": [ - "family_enemy_query = \"\"\"\n", - "\n", - "MATCH p=(a)-[:SPOUSE|SIBLING|HAS_FATHER|HAS_MOTHER]-(b) \n", - "WHERE (a)-[:ENEMY]-(b) \n", - "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", - "\n", - "\"\"\"\n", - "run_query(family_enemy_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m-s2T9ZIHa4W" - }, - "source": [ - "It looks like Morgoth and Manwë are brothers and enemies. This is the first time I have heard of the two, but LOTR fandom site claims Morgoth was the first Dark Lord. Let's look at how many enemies are within the second-degree relatives." + "text/plain": [ + " result\n", + "0 [Manwë, Morgoth]\n", + "1 [Morgoth, Manwë]" ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "family_enemy_query = \"\"\"\n", + "\n", + "MATCH p=(a)-[:SPOUSE|SIBLING|HAS_FATHER|HAS_MOTHER]-(b) \n", + "WHERE exists { (a)-[:ENEMY]-(b) } \n", + "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", + "\n", + "\"\"\"\n", + "run_query(family_enemy_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m-s2T9ZIHa4W" + }, + "source": [ + "It looks like Morgoth and Manwë are brothers and enemies. This is the first time I have heard of the two, but LOTR fandom site claims Morgoth was the first Dark Lord. Let's look at how many enemies are within the second-degree relatives." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 }, + "id": "MiRNuFtyHa4W", + "outputId": "5e5f5a11-2caf-4d39-e758-1e1bf4fed74c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "id": "MiRNuFtyHa4W", - "outputId": "5e5f5a11-2caf-4d39-e758-1e1bf4fed74c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 175 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 [Manwë, Morgoth]\n", - "1 [Morgoth, Manwë]\n", - "2 [Morgoth, Manwë, Varda]\n", - "3 [Varda, Manwë, Morgoth]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0[Manwë, Morgoth]
1[Morgoth, Manwë]
2[Morgoth, Manwë, Varda]
3[Varda, Manwë, Morgoth]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 43 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0[Manwë, Morgoth]
1[Morgoth, Manwë]
2[Morgoth, Manwë, Varda]
3[Varda, Manwë, Morgoth]
\n", + "
" ], - "source": [ - "family_enemy_2hops_query = \"\"\"\n", - "\n", - "MATCH p=(a)-[:SPOUSE|SIBLING|HAS_FATHER|HAS_MOTHER*..2]-(b) \n", - "WHERE (a)-[:ENEMY]-(b) \n", - "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", - "\n", - "\"\"\"\n", - "run_query(family_enemy_2hops_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vNIv7qiMHa4X" - }, - "source": [ - "Not a lot of enemies within the second-degree relatives. We can observe that Varda has taken her husband's stance and is also an enemy with Morgoth. This is an example of a stable triangle or triad. The triangle consists of one positive relationship (SPOUSE) and two negatives (ENEMY). In social network analysis, triangles are used to measure the cohesiveness and structural stability of a network.\n", - "\n", - "## Graph data science\n", - "\n", - "If you have read any of my previous blog posts, you know that I just have to include some example use cases of graph algorithms from the Graph Data Science library. If you need a quick refresher on how the GDS library works and what is happening behind the scenes, I suggest you read my previous blog post.\n", - "\n", - "We will start by projecting the family network. We load all the characters and the familial relationships like SPOUSE, SIBLING, HAS_FATHER, and HAS_MOTHER between them." 
+ "text/plain": [ + " result\n", + "0 [Manwë, Morgoth]\n", + "1 [Morgoth, Manwë]\n", + "2 [Morgoth, Manwë, Varda]\n", + "3 [Varda, Manwë, Morgoth]" ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "family_enemy_2hops_query = \"\"\"\n", + "\n", + "MATCH p=(a)-[:SPOUSE|SIBLING|HAS_FATHER|HAS_MOTHER*..2]-(b) \n", + "WHERE exists { (a)-[:ENEMY]-(b) } \n", + "RETURN [n IN nodes(p) | n.name] AS result LIMIT 10\n", + "\n", + "\"\"\"\n", + "run_query(family_enemy_2hops_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vNIv7qiMHa4X" + }, + "source": [ + "Not a lot of enemies within the second-degree relatives. We can observe that Varda has taken her husband's stance and is also an enemy with Morgoth. This is an example of a stable triangle or triad. The triangle consists of one positive relationship (SPOUSE) and two negatives (ENEMY). In social network analysis, triangles are used to measure the cohesiveness and structural stability of a network.\n", + "\n", + "## Graph data science\n", + "\n", + "If you have read any of my previous blog posts, you know that I just have to include some example use cases of graph algorithms from the Graph Data Science library. If you need a quick refresher on how the GDS library works and what is happening behind the scenes, I suggest you read my previous blog post.\n", + "\n", + "We will start by projecting the family network. We load all the characters and the familial relationships like SPOUSE, SIBLING, HAS_FATHER, and HAS_MOTHER between them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "ND_zVoYOHa4X", + "outputId": "88bccf60-ff12-43a1-b663-570dfedfa92e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "id": "ND_zVoYOHa4X", - "outputId": "88bccf60-ff12-43a1-b663-570dfedfa92e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'Character': {'label': 'Character', 'properti... \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'HAS_MOTHER': {'orientation': 'NATURAL', 'agg... family 699 \n", - "\n", - " relationshipCount projectMillis \n", - "0 1054 102 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'HAS_MOTHER': {'orientation': 'NATURAL', 'agg...family6991054102
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 44 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'HAS_MOTHER': {'orientation': 'NATURAL', 'ind...family710106026
\n", + "
" ], - "source": [ - "project_graph = \"\"\"\n", - "CALL gds.graph.project('family','Character', \n", - " ['SPOUSE','SIBLING','HAS_FATHER','HAS_MOTHER'])\n", - "\"\"\"\n", - "run_query(project_graph)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HUBD0xphHa4X" - }, - "source": [ - "### Weakly connected component\n", - "\n", - "The weakly connected component algorithm is used to find islands or disconnected components within our network. The following visualizations contain two connected components. The first component is composed of Michael, Mark, and Doug while the second one consists of Alice, Charles, and Bridget.\n", - "\n", - "In our case, we will use the weakly connected component algorithm to find islands within the family network. All members within the same family component are related to each other somehow. Could be a cousin of the sister-in-law's grandmother or something more direct like a sibling. To get a rough feeling of the results, we will run the stats mode of the algorithm." + "text/plain": [ + " nodeProjection \\\n", + "0 {'Character': {'label': 'Character', 'properti... \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'HAS_MOTHER': {'orientation': 'NATURAL', 'ind... family 710 \n", + "\n", + " relationshipCount projectMillis \n", + "0 1060 26 " ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project_graph = \"\"\"\n", + "CALL gds.graph.project('family','Character', \n", + " ['SPOUSE','SIBLING','HAS_FATHER','HAS_MOTHER'])\n", + "\"\"\"\n", + "run_query(project_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUBD0xphHa4X" + }, + "source": [ + "### Weakly connected component\n", + "\n", + "The weakly connected component algorithm is used to find islands or disconnected components within our network. The following visualizations contain two connected components. 
The first component is composed of Michael, Mark, and Doug while the second one consists of Alice, Charles, and Bridget.\n", + "\n", + "In our case, we will use the weakly connected component algorithm to find islands within the family network. All members within the same family component are related to each other somehow. Could be a cousin of the sister-in-law's grandmother or something more direct like a sibling. To get a rough feeling of the results, we will run the stats mode of the algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "3g8ufqCcHa4X", + "outputId": "f70b07d5-ad38-49ff-8014-72a4461b3932" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "id": "3g8ufqCcHa4X", - "outputId": "f70b07d5-ad38-49ff-8014-72a4461b3932", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " components p75 p90 mean max\n", - "0 147 1 3 4.76 324" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
componentsp75p90meanmax
0147134.76324
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 46 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentsp75p90meanmax
0156134.55321
\n", + "
" ], - "source": [ - "wcc_stats_query = \"\"\"\n", - "\n", - "CALL gds.wcc.stats('family') \n", - "YIELD componentCount, \n", - " componentDistribution \n", - "RETURN componentCount as components, \n", - " componentDistribution.p75 as p75, \n", - " componentDistribution.p90 as p90, \n", - " apoc.math.round(componentDistribution.mean,2) as mean, \n", - " componentDistribution.max as max\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(wcc_stats_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nCgfAjCmHa4X" - }, - "source": [ - "There are 145 connected components in our graph. More than 75% of the components contain only a single character. This means that around 110 (75% * 145) characters don't have a single familial link to any other character. If they had a single link, the size of the component would be at least two.  The biggest component has 328 members, so that must be one happy family. Let's write back the results and further analyze the family components." + "text/plain": [ + " components p75 p90 mean max\n", + "0 156 1 3 4.55 321" ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wcc_stats_query = \"\"\"\n", + "\n", + "CALL gds.wcc.stats('family') \n", + "YIELD componentCount, \n", + " componentDistribution \n", + "RETURN componentCount as components, \n", + " componentDistribution.p75 as p75, \n", + " componentDistribution.p90 as p90, \n", + " round(componentDistribution.mean,2) as mean, \n", + " componentDistribution.max as max\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(wcc_stats_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nCgfAjCmHa4X" + }, + "source": [ + "There are 156 connected components in our graph. More than 75% of the components contain only a single character. This means that around 117 (75% * 156) characters don't have a single familial link to any other character. 
If they had a single link, the size of the component would be at least two.  The biggest component has 321 members, so that must be one happy family. Let's write back the results and further analyze the family components." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "YDdRNi4EHa4Y", + "outputId": "264f1bb5-1078-45ef-b89b-56d660269a50" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "id": "YDdRNi4EHa4Y", - "outputId": "264f1bb5-1078-45ef-b89b-56d660269a50", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " writeMillis nodePropertiesWritten componentCount \\\n", - "0 181 699 147 \n", - "\n", - " componentDistribution postProcessingMillis \\\n", - "0 {'p99': 139, 'min': 1, 'max': 324, 'mean': 4.7... 5 \n", - "\n", - " preProcessingMillis computeMillis \\\n", - "0 0 19 \n", - "\n", - " configuration \n", - "0 {'writeConcurrency': 4, 'seedProperty': None, ... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
writeMillisnodePropertiesWrittencomponentCountcomponentDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
0181699147{'p99': 139, 'min': 1, 'max': 324, 'mean': 4.7...5019{'writeConcurrency': 4, 'seedProperty': None, ...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 47 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
writeMillisnodePropertiesWrittencomponentCountcomponentDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
0123710156{'p99': 139, 'min': 1, 'max': 321, 'mean': 4.5...904{'jobId': '6c933782-d1a8-4f5e-9748-fad56cebab3...
\n", + "
" ], - "source": [ - "wcc_write_query = \"\"\"\n", - "\n", - "CALL gds.wcc.write('family', {writeProperty:'familyComponent'})\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(wcc_write_query)" + "text/plain": [ + " writeMillis nodePropertiesWritten componentCount \\\n", + "0 123 710 156 \n", + "\n", + " componentDistribution postProcessingMillis \\\n", + "0 {'p99': 139, 'min': 1, 'max': 321, 'mean': 4.5... 9 \n", + "\n", + " preProcessingMillis computeMillis \\\n", + "0 0 4 \n", + "\n", + " configuration \n", + "0 {'jobId': '6c933782-d1a8-4f5e-9748-fad56cebab3... " ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wcc_write_query = \"\"\"\n", + "\n", + "CALL gds.wcc.write('family', {writeProperty:'familyComponent'})\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(wcc_write_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "x1EWzmFORYVy", + "outputId": "5d433f56-7847-4b9e-e645-2d05b974c2fb" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "# Also need to mutate in order to be able to use subgraph later on\n", - "\n", - "wcc_mutate_query = \"\"\"\n", - "\n", - "CALL gds.wcc.mutate('family', {mutateProperty:'familyComponent'})\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(wcc_mutate_query)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mutateMillisnodePropertiesWrittencomponentCountcomponentDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
00710156{'p99': 139, 'min': 1, 'max': 321, 'mean': 4.5...505{'jobId': '7273e197-dde4-4aed-bf2f-7aebb547cca...
\n", + "
" ], - "metadata": { - "id": "x1EWzmFORYVy", - "outputId": "5d433f56-7847-4b9e-e645-2d05b974c2fb", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 52, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " mutateMillis nodePropertiesWritten componentCount \\\n", - "0 0 699 147 \n", - "\n", - " componentDistribution postProcessingMillis \\\n", - "0 {'p99': 139, 'min': 1, 'max': 324, 'mean': 4.7... 4 \n", - "\n", - " preProcessingMillis computeMillis \\\n", - "0 0 16 \n", - "\n", - " configuration \n", - "0 {'seedProperty': None, 'consecutiveIds': False... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mutateMillisnodePropertiesWrittencomponentCountcomponentDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
00699147{'p99': 139, 'min': 1, 'max': 324, 'mean': 4.7...4016{'seedProperty': None, 'consecutiveIds': False...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 52 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EmTUYR0AHa4Y" - }, - "source": [ - "We will start by looking at the top five largest family components. The first thing we are interested in is which races are present in the family trees. We'll also add some random members in the results to get a better feeling of the data." + "text/plain": [ + " mutateMillis nodePropertiesWritten componentCount \\\n", + "0 0 710 156 \n", + "\n", + " componentDistribution postProcessingMillis \\\n", + "0 {'p99': 139, 'min': 1, 'max': 321, 'mean': 4.5... 5 \n", + "\n", + " preProcessingMillis computeMillis \\\n", + "0 0 5 \n", + "\n", + " configuration \n", + "0 {'jobId': '7273e197-dde4-4aed-bf2f-7aebb547cca... " ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Also need to mutate in order to be able to use subgraph later on\n", + "\n", + "wcc_mutate_query = \"\"\"\n", + "\n", + "CALL gds.wcc.mutate('family', {mutateProperty:'familyComponent'})\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(wcc_mutate_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EmTUYR0AHa4Y" + }, + "source": [ + "We will start by looking at the top five largest family components. The first thing we are interested in is which races are present in the family trees. We'll also add some random members in the results to get a better feeling of the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "awKFPFy7Ha4Y", + "outputId": "df34a048-a6ac-4bfa-f059-ea2d030feb14" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "id": "awKFPFy7Ha4Y", - "outputId": "df34a048-a6ac-4bfa-f059-ea2d030feb14", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " familyComponent size random_members \\\n", - "0 115 324 [Galadriel, Fingolfin, Amras] \n", - "1 0 139 [Frodo Baggins, Bilbo Baggins, Samwise Gamgee] \n", - "2 198 29 [Thorin II, Gimli, Balin] \n", - "3 273 21 [Túrin I, Dior of Gondor, Hador of Gondor] \n", - "4 99 6 [Aulë, Oromë, Tulkas] \n", - "\n", - " family_race \n", - "0 [Middle-earth elf, Maiar, men in Tolkien's leg... \n", - "1 [Hobbit] \n", - "2 [dwarves in Tolkien's legendarium] \n", - "3 [men in Tolkien's legendarium] \n", - "4 [valar] " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
familyComponentsizerandom_membersfamily_race
0115324[Galadriel, Fingolfin, Amras][Middle-earth elf, Maiar, men in Tolkien's leg...
10139[Frodo Baggins, Bilbo Baggins, Samwise Gamgee][Hobbit]
219829[Thorin II, Gimli, Balin][dwarves in Tolkien's legendarium]
327321[Túrin I, Dior of Gondor, Hador of Gondor][men in Tolkien's legendarium]
4996[Aulë, Oromë, Tulkas][valar]
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 48 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
familyComponentsizerandom_membersfamily_race
00321[Galadriel, Fingolfin, Amras][Middle-earth elf, Maiar, Middle-earth man, ha...
18139[Frodo Baggins, Bilbo Baggins, Samwise Gamgee][Hobbit]
225929[Thorin II, Gimli, Balin][Middle-earth dwarf]
337821[Cirion, Eradan, Belegorn][Middle-earth man]
41576[Aulë, Oromë, Tulkas][Valar]
\n", + "
" ], - "source": [ - "top5_families_query = \"\"\"\n", - "\n", - "MATCH (c:Character) \n", - "OPTIONAL MATCH (c)-[:BELONG_TO]->(race) \n", - "WITH c.familyComponent as familyComponent, \n", - " count(*) as size, \n", - " collect(c.name) as members, \n", - " collect(distinct race.race) as family_race \n", - "ORDER BY size DESC LIMIT 5 \n", - "RETURN familyComponent, \n", - " size, \n", - " members[..3] as random_members, \n", - " family_race\n", - "\"\"\"\n", - "\n", - "run_query(top5_families_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "68C5-8MQHa4Y" - }, - "source": [ - "As mentioned, the largest family has 328 members of various races ranging from elves to humans and even Maiar. It appears that elven and human lifes are quite intertwined in the Middle-earth. Also their legs. There is a reason why the half-elven race even exists. Other races like hobbits and dwarves stick more to their own kind.\n", - "\n", - "Let's examine the interracial marriages in the largest community." + "text/plain": [ + " familyComponent size random_members \\\n", + "0 0 321 [Galadriel, Fingolfin, Amras] \n", + "1 8 139 [Frodo Baggins, Bilbo Baggins, Samwise Gamgee] \n", + "2 259 29 [Thorin II, Gimli, Balin] \n", + "3 378 21 [Cirion, Eradan, Belegorn] \n", + "4 157 6 [Aulë, Oromë, Tulkas] \n", + "\n", + " family_race \n", + "0 [Middle-earth elf, Maiar, Middle-earth man, ha... 
\n", + "1 [Hobbit] \n", + "2 [Middle-earth dwarf] \n", + "3 [Middle-earth man] \n", + "4 [Valar] " ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top5_families_query = \"\"\"\n", + "\n", + "MATCH (c:Character) \n", + "OPTIONAL MATCH (c)-[:BELONG_TO]->(race) \n", + "WITH c.familyComponent as familyComponent, \n", + " count(*) as size, \n", + " collect(c.name) as members, \n", + " collect(distinct race.race) as family_race \n", + "ORDER BY size DESC LIMIT 5 \n", + "RETURN familyComponent, \n", + " size, \n", + " members[..3] as random_members, \n", + " family_race\n", + "\"\"\"\n", + "\n", + "run_query(top5_families_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "68C5-8MQHa4Y" + }, + "source": [ + "As mentioned, the largest family has 328 members of various races ranging from elves to humans and even Maiar. It appears that elven and human lifes are quite intertwined in the Middle-earth. Also their legs. There is a reason why the half-elven race even exists. Other races like hobbits and dwarves stick more to their own kind.\n", + "\n", + "Let's examine the interracial marriages in the largest community." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 }, + "id": "sCbdf4g0Ha4Y", + "outputId": "f7ea21ae-757d-41ac-c951-516678b7504f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "id": "sCbdf4g0Ha4Y", - "outputId": "f7ea21ae-757d-41ac-c951-516678b7504f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " spouse_1 race_1 spouse_2 \\\n", - "0 Beren Erchamion men in Tolkien's legendarium Lúthien \n", - "1 Melian Maiar Thingol \n", - "2 Elrond half-elven Celebrían \n", - "3 Tuor men in Tolkien's legendarium Idril \n", - "4 Dior Eluchíl half-elven Nimloth \n", - "5 Arwen half-elven Aragorn \n", - "\n", - " race_2 \n", - "0 Middle-earth elf \n", - "1 Middle-earth elf \n", - "2 Middle-earth elf \n", - "3 Middle-earth elf \n", - "4 Middle-earth elf \n", - "5 men in Tolkien's legendarium " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
spouse_1race_1spouse_2race_2
0Beren Erchamionmen in Tolkien's legendariumLúthienMiddle-earth elf
1MelianMaiarThingolMiddle-earth elf
2Elrondhalf-elvenCelebríanMiddle-earth elf
3Tuormen in Tolkien's legendariumIdrilMiddle-earth elf
4Dior Eluchílhalf-elvenNimlothMiddle-earth elf
5Arwenhalf-elvenAragornmen in Tolkien's legendarium
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 50 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spouse_1race_1spouse_2race_2
0MelianMaiarThingolMiddle-earth elf
1Dior Eluchílhalf-elvenNimlothMiddle-earth elf
2BerenMiddle-earth manLúthienMiddle-earth elf
3Elrondhalf-elvenCelebríanMiddle-earth elf
4TuorMiddle-earth manIdrilMiddle-earth elf
5Arwenhalf-elvenAragornMiddle-earth man
\n", + "
" ], - "source": [ - "ir_query = \"\"\"\n", - "\n", - "MATCH (c:Character) \n", - "WHERE c.familyComponent = 115 // fix the family component \n", - "MATCH p=(race)<-[:BELONG_TO]-(c)-[:SPOUSE]-(other)-[:BELONG_TO]->(other_race) \n", - "WHERE race <> other_race AND id(c) > id(other) \n", - "RETURN c.name as spouse_1, \n", - " race.race as race_1, \n", - " other.name as spouse_2, \n", - " other_race.race as race_2\n", - "\"\"\"\n", - "\n", - "run_query(ir_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ax5hKMTLHa4Z" - }, - "source": [ - "First of all, I didn't know that Elrond was a half-elf. It seems like the human and elven \"alliance\" is as old as time itself. I was mainly expecting to see Arwen and Aragorn as I remember that from the movies. It would be interesting to learn how far back do half-elves go. Let's look who are the half-elves with the most descendants." + "text/plain": [ + " spouse_1 race_1 spouse_2 race_2\n", + "0 Melian Maiar Thingol Middle-earth elf\n", + "1 Dior Eluchíl half-elven Nimloth Middle-earth elf\n", + "2 Beren Middle-earth man Lúthien Middle-earth elf\n", + "3 Elrond half-elven Celebrían Middle-earth elf\n", + "4 Tuor Middle-earth man Idril Middle-earth elf\n", + "5 Arwen half-elven Aragorn Middle-earth man" ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ir_query = \"\"\"\n", + "\n", + "MATCH (c:Character) \n", + "WHERE c.familyComponent = 0 // fix the family component \n", + "MATCH p=(race)<-[:BELONG_TO]-(c)-[:SPOUSE]-(other)-[:BELONG_TO]->(other_race) \n", + "WHERE race <> other_race AND id(c) > id(other) \n", + "RETURN c.name as spouse_1, \n", + " race.race as race_1, \n", + " other.name as spouse_2, \n", + " other_race.race as race_2\n", + "\"\"\"\n", + "\n", + "run_query(ir_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ax5hKMTLHa4Z" + }, + "source": [ + "First of all, I didn't know that Elrond was a half-elf. 
It seems like the human and elven \"alliance\" is as old as time itself. I was mainly expecting to see Arwen and Aragorn as I remember that from the movies. It would be interesting to learn how far back do half-elves go. Let's look who are the half-elves with the most descendants." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "bXhIScloHa4Z", + "outputId": "eddf3509-03a5-4907-b151-721e0040a67d" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "id": "bXhIScloHa4Z", - "outputId": "eddf3509-03a5-4907-b151-721e0040a67d", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " character descendants\n", - "0 Dior Eluchíl 11\n", - "1 Elwing 10\n", - "2 Eärendil 10\n", - "3 Elros 9\n", - "4 Elrond 2" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
characterdescendants
0Dior Eluchíl11
1Elwing10
2Eärendil10
3Elros9
4Elrond2
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 51 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
characterdescendants
0Dior Eluchíl11
1Eärendil10
2Elwing10
3Elros9
4Elrond2
\n", + "
" ], - "source": [ - "oldest_halfelf_query = \"\"\"\n", - "\n", - "MATCH (c:Character)\n", - "WHERE (c)-[:BELONG_TO]->(:Race{race:'half-elven'})\n", - "MATCH p=(c)<-[:HAS_FATHER|HAS_MOTHER*..20]-(end)\n", - "WHERE NOT (end)<-[:HAS_FATHER|:HAS_MOTHER]-()\n", - "WITH c, max(length(p)) as descendants\n", - "ORDER BY descendants DESC\n", - "LIMIT 5\n", - "RETURN c.name as character,\n", - " descendants\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(oldest_halfelf_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wOIj6SAGHa4Z" - }, - "source": [ - "It seems like Dior Eluchíl is the oldest recorded half-elf. I inspected results on LOTR fandom site, and it seems we are correct. Dior Eluchil was born in the First Age in the year 470. There are a couple of other half-elves who were born within 50 years of Dior.\n", - "\n", - "### Betweenness centrality\n", - "\n", - "We will also take a look at the betweenness centrality algorithm. It is used to find bridge nodes between different communities. If we take a look at the following visualization, we can observe that Captain America has the highest betweenness centrality score. That is because he is the main bridge in the network and connects the left-hand side of the network to the right-hand side. The second bridge in the network is the Beast. We can easily see that all the information exchanged between the main and right-hand side of the network has to go through him to reach the right-hand side.\n", - "\n", - "We will look for the bridge characters in the largest family network. My guess would be that spouses in an interracial marriage will come out on top. This is because all the communication between the races flows through them. We've seen that there are only six interracial marriages, so probably some of them will come out on top." 
+ "text/plain": [ + " character descendants\n", + "0 Dior Eluchíl 11\n", + "1 Eärendil 10\n", + "2 Elwing 10\n", + "3 Elros 9\n", + "4 Elrond 2" ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oldest_halfelf_query = \"\"\"\n", + "\n", + "MATCH (c:Character)\n", + "WHERE exists{ (c)-[:BELONG_TO]->(:Race{race:'half-elven'}) }\n", + "MATCH p=(c)<-[:HAS_FATHER|HAS_MOTHER*..20]-(end)\n", + "WHERE NOT (end)<-[:HAS_FATHER|:HAS_MOTHER]-()\n", + "WITH c, max(length(p)) as descendants\n", + "ORDER BY descendants DESC\n", + "LIMIT 5\n", + "RETURN c.name as character,\n", + " descendants\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(oldest_halfelf_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wOIj6SAGHa4Z" + }, + "source": [ + "It seems like Dior Eluchíl is the oldest recorded half-elf. I inspected results on LOTR fandom site, and it seems we are correct. Dior Eluchil was born in the First Age in the year 470. There are a couple of other half-elves who were born within 50 years of Dior.\n", + "\n", + "### Betweenness centrality\n", + "\n", + "We will also take a look at the betweenness centrality algorithm. It is used to find bridge nodes between different communities. If we take a look at the following visualization, we can observe that Captain America has the highest betweenness centrality score. That is because he is the main bridge in the network and connects the left-hand side of the network to the right-hand side. The second bridge in the network is the Beast. We can easily see that all the information exchanged between the main and right-hand side of the network has to go through him to reach the right-hand side.\n", + "\n", + "We will look for the bridge characters in the largest family network. My guess would be that spouses in an interracial marriage will come out on top. This is because all the communication between the races flows through them. 
We've seen that there are only six interracial marriages, so probably some of them will come out on top." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "VOKMREFFRFMg", + "outputId": "e5381a6d-ad4a-4b30-c1bb-2139eb4e1c48" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "create_largest_wcc_query = \"\"\"\n", - "CALL gds.graph.project.cypher('largest-wcc', \n", - " 'MATCH (n:Character) WHERE n.familyComponent = 115 \n", - " RETURN id(n) as id',\n", - " 'MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SPOUSE|SIBLING]-(t:Character) \n", - " RETURN id(s) as source, id(t) as target',\n", - " {validateRelationships: false})\n", - "\"\"\"\n", - "\n", - "run_query(create_largest_wcc_query)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Character) WHERE n.familyComponent = ...MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SP...largest-wcc321110834
\n", + "
" ], - "metadata": { - "id": "VOKMREFFRFMg", - "outputId": "e5381a6d-ad4a-4b30-c1bb-2139eb4e1c48", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 58, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeQuery \\\n", - "0 MATCH (n:Character) WHERE n.familyComponent = ... \n", - "\n", - " relationshipQuery graphName nodeCount \\\n", - "0 MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SP... largest-wcc 324 \n", - "\n", - " relationshipCount projectMillis \n", - "0 1114 93 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountprojectMillis
0MATCH (n:Character) WHERE n.familyComponent = ...MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SP...largest-wcc324111493
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 58 - } + "text/plain": [ + " nodeQuery \\\n", + "0 MATCH (n:Character) WHERE n.familyComponent = ... \n", + "\n", + " relationshipQuery graphName nodeCount \\\n", + "0 MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SP... largest-wcc 321 \n", + "\n", + " relationshipCount projectMillis \n", + "0 1108 34 " ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "create_largest_wcc_query = \"\"\"\n", + "CALL gds.graph.project.cypher('largest-wcc', \n", + " 'MATCH (n:Character) WHERE n.familyComponent = 0 \n", + " RETURN id(n) as id',\n", + " 'MATCH (s:Character)-[:HAS_FATHER|HAS_MOTHER|SPOUSE|SIBLING]-(t:Character) \n", + " RETURN id(s) as source, id(t) as target',\n", + " {validateRelationships: false})\n", + "\"\"\"\n", + "\n", + "run_query(create_largest_wcc_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "Jw7ebYM0Ha4Z", + "outputId": "45ead7c6-98f3-4392-a2f9-4b7180fedda8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "id": "Jw7ebYM0Ha4Z", - "outputId": "45ead7c6-98f3-4392-a2f9-4b7180fedda8", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " character score\n", - "0 Arwen 44100.000000\n", - "1 Aragorn 43584.000000\n", - "2 Arathorn II 42224.000000\n", - "3 Arador 41940.000000\n", - "4 Argonui 41652.000000\n", - "5 Arathorn I 41360.000000\n", - "6 Arassuil 41064.000000\n", - "7 Arahad II 40764.000000\n", - "8 Elrond 40483.107143\n", - "9 Aravorn 40460.000000" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
characterscore
0Arwen44100.000000
1Aragorn43584.000000
2Arathorn II42224.000000
3Arador41940.000000
4Argonui41652.000000
5Arathorn I41360.000000
6Arassuil41064.000000
7Arahad II40764.000000
8Elrond40483.107143
9Aravorn40460.000000
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 59 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
characterscore
0Arwen42750.000000
1Aragorn42222.000000
2Arathorn II40832.000000
3Arador40542.000000
4Argonui40248.000000
5Arathorn I39950.000000
6Arassuil39648.000000
7Elrond39371.107143
8Arahad II39342.000000
9Aravorn39032.000000
\n", + "
" ], - "source": [ - "betwenness_centrality_query = \"\"\"\n", - "\n", - "CALL gds.betweenness.stream('largest-wcc')\n", - "YIELD nodeId, score\n", - "RETURN gds.util.asNode(nodeId).name as character,\n", - " score\n", - "ORDER BY score DESC \n", - "LIMIT 10\n", - "\n", - "\"\"\"\n", - "\n", - "run_query(betwenness_centrality_query)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DbZGAkRUHa4a" - }, - "source": [ - "Interesting to see that Arwen and Aragorn come out on top. Not exactly sure why, but I keep on thinking that they are the modern Romeo and Juliet that have formed an alliance between men and half-elves with their marriage. I have no idea how the JRR Tolkien system for generating names worked, but it seems a bit biased towards names starting with an A." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zHt9z2yyHa4a" - }, - "outputs": [], - "source": [ - "" + "text/plain": [ + " character score\n", + "0 Arwen 42750.000000\n", + "1 Aragorn 42222.000000\n", + "2 Arathorn II 40832.000000\n", + "3 Arador 40542.000000\n", + "4 Argonui 40248.000000\n", + "5 Arathorn I 39950.000000\n", + "6 Arassuil 39648.000000\n", + "7 Elrond 39371.107143\n", + "8 Arahad II 39342.000000\n", + "9 Aravorn 39032.000000" ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "scispacy", - "language": "python", - "name": "scispacy" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - }, - "colab": { - "name": "Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb", - "provenance": [], - "include_colab_link": true - } + ], + "source": [ + "betwenness_centrality_query = \"\"\"\n", + "\n", + "CALL 
gds.betweenness.stream('largest-wcc')\n", + "YIELD nodeId, score\n", + "RETURN gds.util.asNode(nodeId).name as character,\n", + " score\n", + "ORDER BY score DESC \n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "run_query(betwenness_centrality_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DbZGAkRUHa4a" + }, + "source": [ + "Interesting to see that Arwen and Aragorn come out on top. Not exactly sure why, but I keep on thinking that they are the modern Romeo and Juliet that have formed an alliance between men and half-elves with their marriage. I have no idea how the JRR Tolkien system for generating names worked, but it seems a bit biased towards names starting with an A." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zHt9z2yyHa4a" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "name": "Part1 Importing Wikidata into Neo4j and analyzing family trees.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/debug/node2vec.ipynb b/debug/node2vec.ipynb deleted file mode 100644 index de6f768..0000000 --- a/debug/node2vec.ipynb +++ /dev/null @@ -1,3714 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "source": [ - "!pip install neo4j node2vec networkx" - ], - "metadata": { - "id": "zyRBeIgMB5nI", - "outputId": "a7faee33-b8c6-4667-e18f-7a875a6d86e8", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "zyRBeIgMB5nI", - "execution_count": 1, - "outputs": [ - { - "output_type": 
"stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: neo4j in /usr/local/lib/python3.7/dist-packages (4.4.5)\n", - "Requirement already satisfied: node2vec in /usr/local/lib/python3.7/dist-packages (0.4.3)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (2.6.3)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2022.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from node2vec) (1.21.6)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from node2vec) (4.64.0)\n", - "Requirement already satisfied: gensim in /usr/local/lib/python3.7/dist-packages (from node2vec) (3.6.0)\n", - "Requirement already satisfied: joblib>=0.13.2 in /usr/local/lib/python3.7/dist-packages (from node2vec) (1.1.0)\n", - "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.7/dist-packages (from gensim->node2vec) (5.2.1)\n", - "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim->node2vec) (1.15.0)\n", - "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim->node2vec) (1.7.3)\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "799c8425", - "metadata": { - "id": "799c8425" - }, - "outputs": [], - "source": [ - "from neo4j import GraphDatabase\n", - "\n", - "url = 'bolt://44.193.1.247:7687'\n", - "username = 'neo4j'\n", - "password = 'fund-circulation-morale'\n", - "\n", - "# Connect to Neo4j\n", - "driver = GraphDatabase.driver(url, auth=(username, password))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e09e693c", - "metadata": { - "id": "e09e693c" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "def 
run_query(query):\n", - " with driver.session() as session:\n", - " result = session.run(query)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "69c12633", - "metadata": { - "id": "69c12633", - "outputId": "e69f419e-dbda-417b-8b7c-e7cd28050dac", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " gds.version()\n", - "0 2.0.1" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gds.version()
02.0.1
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 4 - } - ], - "source": [ - "run_query(\"\"\"\n", - "RETURN gds.version() \n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "source": [ - "run_query(\"\"\"\n", - "call dbms.setConfigValue('dbms.transaction.timeout','0')\n", - "\"\"\")" - ], - "metadata": { - "id": "76Hy1YgMDBHA", - "outputId": "c17aeed2-33d2-4b26-8ed4-0f32ed880b0b", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "id": "76Hy1YgMDBHA", - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "50b01ed4", - "metadata": { - "id": "50b01ed4", - "outputId": "e503afb3-982e-4608-dd5a-4333ab2e0e62", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } - ], - "source": [ - "run_query(\"\"\"\n", - "CREATE CONSTRAINT IF NOT EXISTS ON (s:Stream) ASSERT s.id IS UNIQUE;\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3993ac5a", - "metadata": { - "id": "3993ac5a", - "outputId": "8599c2c4-b313-4a00-cf2a-8e026edc4920", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 7 - } - ], - "source": [ - "run_query(\"\"\"\n", - "LOAD CSV WITH HEADERS FROM \"https://bit.ly/3JjgKgZ\" AS row\n", - "MERGE (s:Stream {id: row.id})\n", - "SET s.language = row.language\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d6666d5c", - "metadata": { - "id": "d6666d5c", - "outputId": "b2aa37b6-ee4e-4e49-9f37-83439a37fc7e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 8 - } - ], - "source": [ - "run_query(\"\"\"\n", - "USING PERIODIC COMMIT 10000\n", - "LOAD CSV WITH HEADERS FROM \"https://bit.ly/3S9Uyd8\" AS row\n", - "MATCH (s:Stream {id:row.source})\n", - "MATCH (t:Stream {id:row.target})\n", - "MERGE (s)-[r:SHARED_AUDIENCE]->(t)\n", - "SET r.weight = toInteger(row.weight)\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "source": [ - "run_query(\"\"\"\n", - "MATCH (s:Stream)\n", - "WHERE NOT (s)-[:SHARED_AUDIENCE]-()\n", - "DETACH DELETE s\n", - "\"\"\")" - ], - "metadata": { - "id": "wvznbYI0JQlM", - "outputId": "4201f7cc-ae42-4fa2-ac9f-4ba4e943a79c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "id": "wvznbYI0JQlM", - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "code", - "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.ensemble import RandomForestClassifier" - ], - "metadata": { - "id": "_AwPDGlRCovZ" - }, - "id": "_AwPDGlRCovZ", - "execution_count": 10, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "98c2b5de", - "metadata": { - "id": "98c2b5de", - "outputId": "327ebb64-0280-4a85-ac93-992b06d0f3d2", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'Stream': {'label': 'Stream', 'properties': {}}} \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'SHARED_AUDIENCE': {'orientation': 'UNDIRECTE... twitch 3721 \n", - "\n", - " relationshipCount projectMillis \n", - "0 262854 679 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Stream': {'label': 'Stream', 'properties': {}}}{'SHARED_AUDIENCE': {'orientation': 'UNDIRECTE...twitch3721262854679
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "run_query(\"\"\"\n", - "CALL gds.graph.project(\"twitch\", \"Stream\", \n", - " {SHARED_AUDIENCE: {orientation: \"UNDIRECTED\", properties:[\"weight\"]}})\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "3b0c05dc", - "metadata": { - "id": "3b0c05dc", - "outputId": "cc1e43a9-8169-4ee1-d5cf-9cf817909601", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "embedding dimension 8 has {'precision': 0.9018994255561046, 'recall': 0.9006711409395973, 'f1-score': 0.9005426180923415, 'support': 745} weighted avg\n", - "embedding dimension 16 has {'precision': 0.8888086789098827, 'recall': 0.8859060402684564, 'f1-score': 0.8853744481384653, 'support': 745} weighted avg\n", - "embedding dimension 32 has {'precision': 0.856411034595741, 'recall': 0.8469798657718121, 'f1-score': 0.845814552890966, 'support': 745} weighted avg\n", - "embedding dimension 64 has {'precision': 0.8281362773812437, 'recall': 0.7932885906040269, 'f1-score': 0.7852580524128767, 'support': 745} weighted avg\n", - "embedding dimension 128 has {'precision': 0.7924177877059633, 'recall': 0.7100671140939597, 'f1-score': 0.684787735858872, 'support': 745} weighted avg\n", - "embedding dimension 256 has {'precision': 0.7442489869651336, 'recall': 0.6067114093959731, 'f1-score': 0.529712561742786, 'support': 745} weighted avg\n" - ] - } - ], - "source": [ - "from sklearn.metrics import classification_report\n", - "\n", - "for embeddingDimension in [8,16,32,64,128,256]:\n", - " data = run_query(f\"\"\"\n", - " CALL gds.beta.node2vec.stream('twitch', \n", - " {{embeddingDimension:{embeddingDimension}, relationshipWeightProperty:'weight',\n", - " inOutFactor:2, returnFactor:1}})\n", - " YIELD nodeId, embedding\n", - " WITH gds.util.asNode(nodeId) AS node, embedding\n", - " RETURN node.id AS streamId, 
node.language AS language, embedding\n", - " \"\"\")\n", - " data['output'] = pd.factorize(data['language'])[0]\n", - " X = data['embedding'].to_list()\n", - " y = data['output'].to_list()\n", - "\n", - " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)\n", - "\n", - " rfc = RandomForestClassifier()\n", - " rfc.fit(X_train, y_train)\n", - "\n", - " y_pred = rfc.predict(X_test)\n", - " r = classification_report(y_test,y_pred, output_dict=True)['weighted avg']\n", - " print(f\"embedding dimension {embeddingDimension} has {r} weighted avg\")" - ] - }, - { - "cell_type": "code", - "source": [ - "run_query(\"\"\"\n", - "CALL gds.graph.drop('twitch')\n", - "\"\"\")" - ], - "metadata": { - "id": "Tee7V3iIJYBK", - "outputId": "c13402d2-b2e9-4480-f91b-842f695803a0", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 142 - } - }, - "id": "Tee7V3iIJYBK", - "execution_count": 13, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " graphName database memoryUsage sizeInBytes nodeCount relationshipCount \\\n", - "0 twitch neo4j -1 3721 262854 \n", - "\n", - " configuration density \\\n", - "0 {'relationshipProjection': {'SHARED_AUDIENCE':... 0.018989 \n", - "\n", - " creationTime modificationTime \\\n", - "0 2022-08-08T16:25:31.018810000+00:00 2022-08-08T16:25:31.695509000+00:00 \n", - "\n", - " schema \n", - "0 {'relationships': {'SHARED_AUDIENCE': {'weight... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
graphNamedatabasememoryUsagesizeInBytesnodeCountrelationshipCountconfigurationdensitycreationTimemodificationTimeschema
0twitchneo4j-13721262854{'relationshipProjection': {'SHARED_AUDIENCE':...0.0189892022-08-08T16:25:31.018810000+00:002022-08-08T16:25:31.695509000+00:00{'relationships': {'SHARED_AUDIENCE': {'weight...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 13 - } - ] - }, - { - "cell_type": "code", - "source": [ - "from node2vec import Node2Vec\n", - "import networkx as nx" - ], - "metadata": { - "id": "wbTd9nR2EMD1" - }, - "id": "wbTd9nR2EMD1", - "execution_count": 14, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# networx Graph" - ], - "metadata": { - "id": "TaOUWr_NHuq7" - }, - "id": "TaOUWr_NHuq7", - "execution_count": 15, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Construct a networkX graph\n", - "edge_list = run_query(\"\"\"\n", - "MATCH (s:Stream)-[r:SHARED_AUDIENCE]->(t:Stream)\n", - "WITH toString(s.id) + \" \" + toString(t.id) + \" {'weight':\" + toString(r.weight) + \"}\" as edge\n", - "WITH collect(edge) as result\n", - "RETURN result\n", - "\"\"\")\n", - "\n", - "edge_list = edge_list['result'].to_list()[0]\n", - "# Undirected graph as well\n", - "G = nx.parse_edgelist(edge_list, create_using=nx.Graph(), nodetype=int)" - ], - "metadata": { - "id": "Ns3aWDwSIIJC" - }, - "id": "Ns3aWDwSIIJC", - "execution_count": 16, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "G.number_of_nodes()" - ], - "metadata": { - "id": "a0IBZGBUIS4V", - "outputId": "a4ebf627-2816-480f-a81a-90c971a52ef5", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "id": "a0IBZGBUIS4V", - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "3721" - ] - }, - "metadata": {}, - "execution_count": 17 - } - ] - }, - { - "cell_type": "code", - "source": [ - "labels = run_query(\"\"\"\n", - "MATCH (s:Stream)\n", - "RETURN s.id AS id, s.language AS language\n", - "\"\"\")" - ], - "metadata": { - "id": "cb5xotXzPblA" - }, - "id": "cb5xotXzPblA", - "execution_count": 18, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "for embeddingDimension in [8,16,32,64,128,256]:\n", - " node2vec = Node2Vec(G, dimensions=embeddingDimension, walk_length=80, 
num_walks=10, workers=4, p=2, q= 1, weight_key= 'weight', seed=1)\n", - " model = node2vec.fit(window=10, min_count=1, batch_words=1000, sg=1, negative=5, ns_exponent=0.75, alpha=0.01, min_alpha=0.0001) # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)\n", - " d = []\n", - " for i in model.wv.vocab:\n", - " d.append({'id': i, 'embedding': list(model.wv[i])})\n", - " df = pd.DataFrame.from_dict(d).merge(labels, on='id')\n", - " df['output'] = pd.factorize(df['language'])[0]\n", - " X = df['embedding'].to_list()\n", - " y = df['output'].to_list()\n", - "\n", - " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)\n", - "\n", - " rfc = RandomForestClassifier()\n", - " rfc.fit(X_train, y_train)\n", - "\n", - " y_pred = rfc.predict(X_test)\n", - " r = classification_report(y_test,y_pred, output_dict=True)['weighted avg']\n", - " print(f\"embedding dimension {embeddingDimension} has {r} weighted avg\")" - ], - "metadata": { - "id": "93A-B844PEXt", - "outputId": "a368fd33-a4bc-4bc2-fda7-056c9d8307e4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 313, - "referenced_widgets": [ - "7d48ec744e224173870600fc760f2998", - "1f36d43087694874af3fb7838ab75718", - "fa831425ec364fecb40967b7a7026144", - "713ec0557c9d469ab88d3d79d41c72c2", - "62f7eb2507064130851367060903c095", - "9d77f98d702c46c49dca0fd0adb04833", - "86f0a9f3e153460aba378992bca35461", - "6b9967aa7c5c405ba886b2a051e7c827", - "77e4fc97d5fa4bc693b5cf664f8ddb18", - "e7b50c5b37844b8ea6bb8ed0688796f9", - "ad424d63e66f4bf594faf1e3b9be6c48", - "5da7501823b14e579d8944d4e09b102f", - "c21ecdef751e48b2ac92c58333df5dda", - "263ee46a75aa4695b63a5153755c1594", - "c587a840144a4698bf66b50d81f1eead", - "6f492e7bafca4e459ed7b9978fa1a6d3", - "9cbc3e392bd442e2975bf672e2b142b1", - "e196bf399cef438dae1b6bb077f5c1cb", - "93765946bce14dc888aa7bf206bfa5f0", - "22240b45ad004bbe9e52c35f82040666", 
- "66b1a73744844a878bbc816d11336e19", - "a2b9675c524144caadbd81d5f39baa54", - "73b3b66988754dc18c22d6294e198609", - "08643dfc8bc54acd84d464d7487df50e", - "2af5e6c1efba4cc88139fc05ea0c0006", - "ad5e4261b3064e45b777e3a56d116e38", - "d8012cd7fbe24711b4ec44cbc416f8da", - "ac6ae0fd76e341f4b3523f3ef09473d9", - "02aca957089449df921dac6b1f5829de", - "47e5afbab7614af399a73c3c8c6adcef", - "5e1620568118434abef3c8716f71fd0c", - "325a8dbac4ad43d9817e75617504da99", - "dc839ef080e04c678bb99bf29672d6f4", - "b3161e41568947c7a2fdb3ce50078b23", - "36be16b5566e4a89b1cde78ba29c7e6f", - "620963486e7748afb292bb42d7d2ffa8", - "0816a64432e049da9acb986911594a53", - "c5876824b2d24d42bc7cce7e2cf35b27", - "792a1f635d88464a880cc1045f04b432", - "7ede0a7b4725455a80eedcff98918e6b", - "19d3560ce3da46f5a3c62f846e7c9aa1", - "320d9b85a4f14e5cad3550e59f7e9bb8", - "c8c33153b9e4489c92beb9603f10c555", - "642c00821930488d88d9a0d561ecb8c4", - "4116b586ad15456292b979cbe07fe7b1", - "c1bcb300bd6e4ba895165f64e419e80b", - "3fbaaa6279cd430aac86b5f400662c35", - "e0761bd0fdc94cea8c9cde8f6cd703b8", - "d3aa40f4a4474ab392421c9b366581dc", - "7e3bf1b662b349d2bc0536c9851d15ca", - "80307d5925344ca68ad75fe27c020594", - "84529df93bcb4afcbb3c3ee127224cee", - "00b7c3a83bd84f5e8eee99513a72d866", - "32039480493c4a1fbb6ddf8398857d37", - "b1ccf9a81e7340acbbf404df9d7dd85d", - "a4a30e68cd7746039010701fc162126f", - "fef877ff5b294d1982fdbc3860e5d164", - "3d8ae4ea77eb450983cb0d778acc9b68", - "6b9d461efb5a438e86cf6a1866d4a72b", - "aca99435049044a5a27e0099b4fb559c", - "6487a8dbe76b433b9e438ffec5569d43", - "1d0e2799125142e4851447250e6ea20b", - "a49d2f591e3a438f86748726ef428358", - "01608bf7baea4a6e96e1e2b06958d644", - "6defc8202f744529bb55aa742ca4d3cd", - "c61852112cbc4b1db95c15da7f5f1570" - ] - } - }, - "id": "93A-B844PEXt", - "execution_count": 19, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Computing transition probabilities: 0%| | 0/3721 [00:00()) AS numberOfDevices,\n", - " 
size((u)-[:HAS_CC]->()) AS numberOfCCs,\n", - " size((u)-[:HAS_IP]->()) AS numberOfIps,\n", + " count{ (u)-[:USED]->() } AS numberOfDevices,\n", + " count{ (u)-[:HAS_CC]->() } AS numberOfCCs,\n", + " count{ (u)-[:HAS_IP]->() } AS numberOfIps,\n", " coalesce(totalOutgoingAmount, 0) AS totalOutgoingAmount, \n", " coalesce(avgOutgoingAmount, 0) AS avgOutgoingAmount,\n", " coalesce(maxOutgoingAmount, 0) AS maxOutgoingAmount,\n", @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "3ca0b417", "metadata": {}, "outputs": [ @@ -314,7 +314,7 @@ "4 200.00 2 " ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -333,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "66632cba", "metadata": {}, "outputs": [ @@ -383,7 +383,7 @@ "1 1 211" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "74048cdd", "metadata": {}, "outputs": [ @@ -608,7 +608,7 @@ "max 6750.000000 564.000000 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "cb367423", "metadata": {}, "outputs": [ @@ -639,7 +639,7 @@ "" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, @@ -681,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "d3abee73", "metadata": {}, "outputs": [], @@ -698,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "2b049b2f", "metadata": {}, "outputs": [], @@ -746,7 +746,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "19e86468", "metadata": {}, "outputs": [ @@ -824,10 +824,25 @@ }, 
{ "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "dc970138", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef751e6905144d54a17f4adebc534a68", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading: 0%| | 0/100 [00:00\n", " \n", " 0\n", - " 0\n", + " 765184\n", " 0\n", " \n", " \n", " 1\n", - " 1\n", + " 765185\n", " 1\n", " \n", " \n", " 2\n", - " 2\n", + " 765186\n", " 2\n", " \n", " \n", " 3\n", - " 3\n", + " 765187\n", " 3\n", " \n", " \n", " 4\n", - " 4\n", + " 765188\n", " 4\n", " \n", " \n", @@ -953,14 +968,14 @@ ], "text/plain": [ " nodeId componentId\n", - "0 0 0\n", - "1 1 1\n", - "2 2 2\n", - "3 3 3\n", - "4 4 4" + "0 765184 0\n", + "1 765185 1\n", + "2 765186 2\n", + "3 765187 3\n", + "4 765188 4" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -979,7 +994,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "e97c1869", "metadata": {}, "outputs": [], @@ -1000,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "340f4bf1", "metadata": {}, "outputs": [], @@ -1025,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "9983c200", "metadata": {}, "outputs": [], @@ -1051,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "9d41c3aa", "metadata": {}, "outputs": [], @@ -1073,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "3a86c2e3", "metadata": {}, "outputs": [], @@ -1096,7 +1111,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "0b914683", "metadata": {}, "outputs": [ @@ -1167,28 +1182,29 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "ccd028fd", "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ - "graphName fraud\n", - "database neo4j\n", - "memoryUsage \n", - "sizeInBytes -1\n", - "nodeCount 152550\n", - "relationshipCount 171201\n", - "configuration {'relationshipProjection': {'HAS_CC': {'orient...\n", - "density 0.000007\n", - "creationTime 2022-06-29T11:32:01.574618000+02:00\n", - "modificationTime 2022-06-29T11:32:01.917006000+02:00\n", - "schema {'graphProperties': {}, 'relationships': {'HAS...\n", + "graphName fraud\n", + "database neo4j\n", + "memoryUsage \n", + "sizeInBytes -1\n", + "nodeCount 152550\n", + "relationshipCount 171201\n", + "configuration {'relationshipProjection': {'HAS_CC': {'orient...\n", + "density 0.000007\n", + "creationTime 2023-02-01T13:10:50.719251667+01:00\n", + "modificationTime 2023-02-01T13:10:51.453140320+01:00\n", + "schema {'graphProperties': {}, 'relationships': {'HAS...\n", + "schemaWithOrientation {'graphProperties': {}, 'relationships': {'HAS...\n", "Name: 0, dtype: object" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } diff --git a/gds_python/gds_python_intro.ipynb b/gds_python/gds_python_intro.ipynb index 1811a52..d7e9341 100644 --- a/gds_python/gds_python_intro.ipynb +++ b/gds_python/gds_python_intro.ipynb @@ -1,1707 +1,1307 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kHob0RF8YDrx" + }, + "source": [ + "# How to get started with the Neo4j Graph Data Science Python client\n", + "## Learn the basic syntax of the newly released Python client for Neo4j Graph Data Science library\n", + "\n", + "Data scientists like me love Python. It features a wide variety of machine learning and data science libraries that can help you get started on a data science project in minutes. 
It is not uncommon to use a variety of libraries in a data science workflow. With the release of version 2 of the Neo4j Graph Data Science (GDS) library, a supporting Python client has been introduced. The Python client for the GDS library is designed to help you seamlessly integrate the Neo4j Graph Data Science library into your data science workflow. Instead of having to write Cypher statements to execute graph algorithms, the Python client provides a simple surface that allows you to project and run graph algorithms using pure Python code.\n", + "\n", + "Since the Python client for GDS is relatively new, there are not many examples out there yet. Therefore, I've decided to write this blog post to help you get started with the GDS Python client syntax and show some common usage patterns through a simple network analysis.\n", + "\n", + "The Neo4j Graph Data Science Python client can be installed using the pip package installer." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "gds-python.ipynb", - "provenance": [], - "authorship_tag": "ABX9TyMLODtjsTX2gWhXe5ADDUdP", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "Q1KlTgmR8PAL", + "outputId": "db6489bd-cf34-4f97-8b44-77f08b3e9b73" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: graphdatascience in /usr/local/lib/python3.7/dist-packages (1.0.0)\n", + "Requirement already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", + "Requirement already satisfied: neo4j<5.0,>=4.4.2 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.4.3)\n", + "Requirement already satisfied: pytz in 
/usr/local/lib/python3.7/dist-packages (from neo4j<5.0,>=4.4.2->graphdatascience) (2022.1)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n" + ] } + ], + "source": [ + "!pip install graphdatascience" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] + { + "cell_type": "markdown", + "metadata": { + "id": "tv2vtJOQYLQW" + }, + "source": [ + "An important thing to note is that the Python client is only guaranteed to work with GDS versions 2.0 and later. Therefore, if you have a previous version, I suggest you first upgrade the GDS library to the latest version.\n", + "# Neo4j environment setup\n", + "If you want to follow along with the code examples, you need to set up a Neo4j database. I suggest you use a [blank project on Neo4j Sandbox](https://sandbox.neo4j.com/?usecase=blank-sandbox) for this simple demonstration, but you can also download a [Neo4j Desktop application](https://neo4j.com/download/) and set up a local database.\n", + "\n", + "Neo4j Sandbox has the GDS library already installed. However, if you use Neo4j Desktop, you have to install the GDS library manually.\n", + "\n", + "# Setting up the GDS Python client connection\n", + "We start by defining the client connection to the Neo4j database. If you have seen any of my previous blog posts that use the official Neo4j Python driver, you can see that the syntax is almost identical." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "GKTUobMl8S31" + }, + "outputs": [], + "source": [ + "from graphdatascience import GraphDataScience\n", + "\n", + "host = \"bolt://44.193.28.203:7687\"\n", + "user = \"neo4j\"\n", + "password= \"combatants-coordinates-tugs\"\n", + "\n", + "gds = GraphDataScience(host, auth=(user, password))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FWOhP1TdYehq" + }, + "source": [ + "We have instantiated the connection to the Neo4j instance. If you are using Neo4j Enterprise, you might have multiple databases available in Neo4j. If we want to use any database other than the default one, we can select the required database using the set_database method." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "lk93jlvJ-u6L" + }, + "outputs": [], + "source": [ + "# Optionally set different database\n", + "#gds.set_database(\"databaseName\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rZbDBeZnYhEJ" + }, + "source": [ + "Lastly, we can verify that the connection is valid and the target Neo4j instance has the GDS library installed by using the `gds.version()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "-Ah7nqZD6Zcd", + "outputId": "42ae6efd-561d-45e5-d601-27d5e562c580" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# How to get started with the Neo4j Graph Data Science Python client\n", - "## Learn the basic syntax of the newly released Python client for Neo4j Graph Data Science library\n", - "\n", - "Data scientists like me love Python. It features a wide variety of machine learning and data science libraries that can help you get started on a data science project in minutes. It is not uncommon to use a variety of libraries in a data science workflow. 
With the release of version 2 of the Neo4j Graph Data Science (GDS) library, a supporting Python client has been introduced. The Python client for the GDS library is designed to help you seamlessly integrate the Neo4j Graph Data Science library into your data science workflow. Instead of having to write Cypher statements to execute graph algorithms, the Python client provides a simple surface that allows you to project and run graph algorithms using pure Python code.\n", - "\n", - "Since the Python client for GDS is relatively new, there are not many examples out there yet. Therefore, I've decided to write this blog post to help you get started with the GDS Python client syntax and show some common usage patterns through a simple network analysis.\n", - "\n", - "The Neo4j Graph Data Science Python client can be installed using the pip package installer." - ], - "metadata": { - "id": "kHob0RF8YDrx" - } + "name": "stdout", + "output_type": "stream", + "text": [ + "2.3.0\n" + ] + } + ], + "source": [ + "# Check if connection is valid and the target database has GDS installed\n", + "print(gds.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ad85KnNeYlwa" + }, + "source": [ + "The version() method should return the version of the installed GDS library. If it returns anything else, make sure that you entered the correct credentials and the GDS library is installed.\n", + "# Executing Cypher statements\n", + "The Python client allows you to execute arbitrary Cypher statements using the `run_cypher` method. The method takes two parameters as input. The first and mandatory parameter is the Cypher query you want to execute. The second method parameter is optional and can be used to provide any query parameters.\n", + "\n", + "The `run_cypher` method can be used to import, transform, or fetch any data from the database. 
We will begin by populating the database with the [Harry Potter network](https://medium.com/neo4j/turn-a-harry-potter-book-into-a-knowledge-graph-ffc1c45afcc8) I created in one of my previous blog posts.\n", + "\n", + "The network contains characters in the first book, and their interactions, which are represented as relationships. The CSV with the relationship is available on my GitHub, so we can use the `LOAD CSV` clause to retrieve the data from GitHub and store it into Neo4j." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "GaXfepPu_YZk", + "outputId": "390e4c6d-d5b3-4191-e1ba-0437f8752363" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Q1KlTgmR8PAL", - "outputId": "db6489bd-cf34-4f97-8b44-77f08b3e9b73" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: graphdatascience in /usr/local/lib/python3.7/dist-packages (1.0.0)\n", - "Requirement already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", - "Requirement already satisfied: neo4j<5.0,>=4.4.2 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.4.3)\n", - "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j<5.0,>=4.4.2->graphdatascience) (2022.1)\n", - "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "source": [ - "!pip install graphdatascience" + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "LOAD CSV WITH HEADERS FROM $url AS row\n", + "MERGE (s:Character {name:row.source})\n", + "MERGE (t:Character {name:row.target})\n", + "MERGE (s)-[i:INTERACTS]->(t)\n", + "SET i.weight = toInteger(row.weight)\n", + "\"\"\"\n", + "params = {'url': 'https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/HP/hp_1.csv'}\n", + "gds.run_cypher(query, params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "egAEqSvfY6lz" + }, + "source": [ + "The import script uses the `run_cypher` method to execute the Cypher statement used to import the Harry Potter network. To demonstrate how Cypher parameters work with the `run_cypher` method, I've attached the URL of the file as a Cypher parameter. While the Cypher query is represented as a string, the Cypher parameters are defined as a dictionary.\n", + "If you have done any data analysis in Python, you have probably used the Pandas library in your workflow. Therefore, when fetching data from a database using the `run_cypher` method, the method conveniently returns a populated Pandas DataFrame. Having the data available as a Pandas DataFrame makes it much easier to integrate the data from Neo4j into your analytical workflow and use it in combination with other libraries.\n", + "\n", + "In this example, we will retrieve the degree (count of relationships) for each character in the network using the `run_cypher` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "a50_Aq0GAiHR", + "outputId": "fb6dabb6-edda-4ae4-d924-65618ae9c5f3" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "An important thing to note is that the Python client is only guaranteed to work with GDS versions 2.0 and later. Therefore, if you have a previous version, I suggest you first upgrade the GDS library to the latest version.\n", - "# Neo4j environment setup\n", - "If you want to follow along with the code examples, you need to set up a Neo4j database. I suggest you use a [blank project on Neo4j Sandbox](https://sandbox.neo4j.com/?usecase=blank-sandbox) for this simple demonstration, but you can also download a [Neo4j Desktop application](https://neo4j.com/download/) and set up a local database.\n", - "\n", - "Neo4j Sandbox has the GDS library already installed. However, if you use Neo4j Desktop, you have to install the GDS library manually.\n", - "\n", - "# Setting up the GDS Python client connection\n", - "We start by defining the client connection to the Neo4j database. If you have seen any of my previous blog posts that use the official Neo4j Python driver, you can see that the syntax is almost identical." - ], - "metadata": { - "id": "tv2vtJOQYLQW" - } - }, - { - "cell_type": "code", - "source": [ - "from graphdatascience import GraphDataScience\n", - "\n", - "host = \"bolt://54.172.168.40:7687\"\n", - "user = \"neo4j\"\n", - "password= \"shares-masses-turnarounds\"\n", - "\n", - "gds = GraphDataScience(host, auth=(user, password))" - ], - "metadata": { - "id": "GKTUobMl8S31" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We have instantiated the connection to the Neo4j instance. If you are using Neo4j Enterprise, you might have multiple databases available in Neo4j. 
If we want to use any database other than the default one, we can select the required database using the set_database method." - ], - "metadata": { - "id": "FWOhP1TdYehq" - } - }, - { - "cell_type": "code", - "source": [ - "# Optionally set different database\n", - "#gds.set_database(\"databaseName\")" - ], - "metadata": { - "id": "lk93jlvJ-u6L" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Lastly, we can verify that the connection is valid and the target Neo4j instance has the GDS library installed by using the `gds.version()` method." - ], - "metadata": { - "id": "rZbDBeZnYhEJ" - } - }, - { - "cell_type": "code", - "source": [ - "# Check if connection is valid and the target database has GDS installed\n", - "print(gds.version())" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
characterdegree
0Petunia Dursley8
1Dudley Dursley14
2Lily J. Potter5
3James Potter I5
4Harry Potter83
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-Ah7nqZD6Zcd", - "outputId": "42ae6efd-561d-45e5-d601-27d5e562c580" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2.0.1\n" - ] - } + "text/plain": [ + " character degree\n", + "0 Petunia Dursley 8\n", + "1 Dudley Dursley 14\n", + "2 Lily J. Potter 5\n", + "3 James Potter I 5\n", + "4 Harry Potter 83" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "degree_df = gds.run_cypher(\"\"\"\n", + "MATCH (c:Character)\n", + "RETURN c.name AS character,\n", + " count{ (c)--() } AS degree\n", + "\"\"\")\n", + "\n", + "degree_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mEsbwfP3ZGdj" + }, + "source": [ + "Since the data is available as a Pandas DataFrame, we can easily integrate it into our analytical workflow. For example, we can use the Seaborn library to visualize the node degree distribution." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 530 }, + "id": "DhCt2ueTA4SA", + "outputId": "27ba275a-76c5-4a07-b798-46e0f0e261c5" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "The version() method should return the version of the installed GDS library. If it returns anything else, make sure that you entered the correct credentials and the GDS library is installed.\n", - "# Executing Cypher statements\n", - "The Python client allows you to execute arbitrary Cypher statements using the `run_cypher` method. The method takes two parameters are input. The first and mandatory parameter is the Cypher query you want to execute. The second method parameter is optional and can be used to provide any query parameters.\n", - "\n", - "The `run_cypher` method can be used to import, transform, or fetch any data from the database. 
We will begin by populating the database with the [Harry Potter network](https://medium.com/neo4j/turn-a-harry-potter-book-into-a-knowledge-graph-ffc1c45afcc8) I created in one of my previous blog posts.\n", - "\n", - "The network contains characters in the first book, and their interactions, which are represented as relationships. The CSV with the relationship is available on my GitHub, so we can use the `LOAD CSV` clause to retrieve the data from GitHub and store it into Neo4j." - ], - "metadata": { - "id": "Ad85KnNeYlwa" - } - }, - { - "cell_type": "code", - "source": [ - "query = \"\"\"\n", - "LOAD CSV WITH HEADERS FROM $url AS row\n", - "MERGE (s:Character {name:row.source})\n", - "MERGE (t:Character {name:row.target})\n", - "MERGE (s)-[i:INTERACTS]->(t)\n", - "SET i.weight = toInteger(row.weight)\n", - "\"\"\"\n", - "params = {'url': 'https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/HP/hp_1.csv'}\n", - "gds.run_cypher(query, params)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - }, - "id": "GaXfepPu_YZk", - "outputId": "390e4c6d-d5b3-4191-e1ba-0437f8752363" - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } + "data": { + "text/plain": [ + "" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" }, { - "cell_type": "markdown", - "source": [ - "The import script uses the `run_cypher` method to execute the Cypher statement used to import the Harry Potter network. To demonstrate how Cypher parameters work with the `run_cypher` method, I've attached the URL of the file as a Cypher parameter. While the Cypher query is represented as a string, the Cypher parameters are defined as a dictionary.\n", - "If you have done any data analysis in Python, you have probably used the Pandas library in your workflow. Therefore, when fetching data from a database using the run_cyphermethod, the method conveniently returns a populated Pandas DataFrame. Having the data available as a Pandas DataFrame makes it much easier to integrate the data from Neo4j into your analytical workflow and use it in combination with other libraries.\n", - "\n", - "In this example, we will retrieve the degree (count of relationships) for each character in the network using the `run_cypher` method." - ], - "metadata": { - "id": "egAEqSvfY6lz" - } - }, - { - "cell_type": "code", - "source": [ - "degree_df = gds.run_cypher(\"\"\"\n", - "MATCH (c:Character)\n", - "RETURN c.name AS character,\n", - " size((c)--()) AS degree\n", - "\"\"\")\n", - "\n", - "degree_df.head()" - ], - "metadata": { - "id": "a50_Aq0GAiHR", - "outputId": "fb6dabb6-edda-4ae4-d924-65618ae9c5f3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " character degree\n", - "0 Petunia Dursley 8\n", - "1 Dudley Dursley 14\n", - "2 Lily J. Potter 5\n", - "3 James Potter I 5\n", - "4 Harry Potter 83" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
characterdegree
0Petunia Dursley8
1Dudley Dursley14
2Lily J. Potter5
3James Potter I5
4Harry Potter83
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] - }, - { - "cell_type": "markdown", - "source": [ - "Since the data is available as a Pandas DataFrame, we can easily integrate it into our analytical workflow. For example, we can use the Seaborn library to visualize the node degree distribution." - ], - "metadata": { - "id": "mEsbwfP3ZGdj" - } - }, + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "sns.displot(data=degree_df, x=\"degree\", height=7, aspect=1.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fkER78KrZOxT" + }, + "source": [ + "We can easily observe that most nodes have less than 15 relationships. However, there is one outlier in the dataset with 83 connections, and that is, of course, Harry Potter himself.\n", + "# Projected graph object\n", + "The central concept of the GDS Python client is to allow projecting and executing graph algorithms in Neo4j with pure Python code. Furthermore, the Python client is designed to mimic the GDS Cypher procedures so that we don't have to learn a new syntax to use the Python client.\n", + "\n", + "As you might know, before we can execute any graph algorithms, we first have to project an in-memory graph. For example, let's say we want to project a simple directed network of characters and their interactions with the Python client.\n", + "\n", + "![mapping.png]()\n", + "\n", + "If you are familiar with Cypher procedures of the Graph Data Science library, you will be able to pick up the Python client syntax easily. For the most part, we remove the `CALL` clause before the GDS procedures, and we get the Python client syntax to project graphs or execute algorithms.\n", + "\n", + "In our case, we want to project a network of characters where the interaction relationships are treated as undirected. 
Therefore, we must use the extended map syntax to define undirected relationships.\n", + "\n", + "![Copy of mapping_graph.drawio (1).png]()\n", + "\n", + "When dealing with map objects, or dictionaries as they are called in Python, we have to add quotes around map keys. Otherwise, the keys would be treated as variables in Python, and you would get a NameError as the key variables are not defined. So, apart from adding quotes and removing the `CALL` clause, the syntax to project an in-memory graph is identical. \n", + "\n", + "When projecting a graph with the Python client, a client-side reference to the projected graph is returned. We call these references Graph objects. Along with the Graph object, the metadata from the procedure call is returned as a Pandas Series.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "c78q70AkB1ZQ" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "sns.displot(data=degree_df, x=\"degree\", height=7, aspect=1.5)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 530 - }, - "id": "DhCt2ueTA4SA", - "outputId": "27ba275a-76c5-4a07-b798-46e0f0e261c5" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "36cbe152ce7d42dea0d178c4d05d76ad", + "version_major": 2, + "version_minor": 0 }, - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 7 - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": { - "needs_background": "light" - } - } + "text/plain": [ + "Loading: 0%| | 0/100 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeIdscore
001.851142
113.241780
220.375610
330.375610
4424.197442
\n", + "" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xll1y1m9DsH5", - "outputId": "1351780f-5673-4195-f10c-fd79a2b5338e" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "hp-graph\n", - "2341 KiB\n", - "0.05782652043868395\n" - ] - } + "text/plain": [ + " nodeId score\n", + "0 0 1.851142\n", + "1 1 3.241780\n", + "2 2 0.375610\n", + "3 3 0.375610\n", + "4 4 24.197442" ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# PageRank stream\n", + "pagerank_df = gds.pageRank.stream(G, relationshipWeightProperty=\"weight\")\n", + "pagerank_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EuKnJ7Yzacb-" + }, + "source": [ + "The `stream` mode of any algorithm in the GDS library returns a stream of records. Python client then automatically converts the output into a Pandas DataFrame.\n", + "\n", + "If you have ever executed the `stream` mode of the graph algorithms in Neo4j GDS library, you might be aware that the result contains internal node ids as a reference to nodes instead of actual node objects. The `pagerank_df` DataFrame contains two columns:\n", + "* nodeId: Internal node ids used to reference nodes\n", + "* score: PageRank score\n", + "\n", + "We can retrieve the referenced node objects using the `nodeId` column without constructing a Cypher statement by using the `gds.util.asNodes()` method. The `gds.util.asNodes()` method takes a list of internal node ids as input and outputs a list of node objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "xTl4i5cPWbrh" + }, + "outputs": [], + "source": [ + "# If you need to fetch information about node objects based on their internal node ids, you can use gds.util.asNodes\n", + "pagerank_df['node_object'] = gds.util.asNodes(pagerank_df['nodeId'].to_list())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fa-iw0etaubY" + }, + "source": [ + "The `node_object` column now contains the referenced node objects. Node objects are defined in the underlying Neo4j Python driver. You can reference the [official documentation if you want to examine all the possible methods of the node object](https://neo4j.com/docs/api/python-driver/current/api.html#node).\n", + "\n", + "In this example, we will extract the `name` property from node objects and then visualize a bar chart of the top ten characters with the highest PageRank score." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 652 }, + "id": "6drUBiMBWqpJ", + "outputId": "9e3c906f-7d87-4404-d231-36f6fc0639cf" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Running graph algorithms\n", - "Now that we have the projected graph ready and available as the reference variable `G` , we can go ahead and execute a couple of graph algorithms using the Python client.\n", - "\n", - "We will begin by executing the weighted variant of the PageRank algorithm. The `stream` mode of the algorithm returns the result of the algorithm as a stream of records.\n", - "\n", - "![gds_pagerank.drawio (1).png]()\n", - "\n", - "Similar to before, when we were projecting an in-memory graph, we need to remove the `CALL` clause in the Python client for all algorithm executions. We reference the projected graph by its name with the Cypher procedure statement. 
However, using the Python client, we pass the Graph object as the reference to the projected in-memory graph instead of its name. Lastly, any algorithm configuration parameters can be specified as keyword arguments in the Python client.\n", - "We can use the following Python script to execute the `stream` mode of the weighted PageRank algorithm." - ], - "metadata": { - "id": "so0n--2RaJzn" - } - }, - { - "cell_type": "code", - "source": [ - "# PageRank stream\n", - "pagerank_df = gds.pageRank.stream(G, relationshipWeightProperty=\"weight\")\n", - "pagerank_df.head()" - ], - "metadata": { - "id": "a0pLRI1XV5gi", - "outputId": "942c4d98-a44a-4875-f488-04877744d15a", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeId score\n", - "0 0 1.851142\n", - "1 1 3.241780\n", - "2 2 0.375610\n", - "3 3 0.375610\n", - "4 4 24.197442" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeIdscore
001.851142
113.241780
220.375610
330.375610
4424.197442
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 11 - } + "data": { + "text/plain": [ + "(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", + " [Text(0, 0, 'Harry Potter'),\n", + " Text(1, 0, 'Ronald Weasley'),\n", + " Text(2, 0, 'Hermione Granger'),\n", + " Text(3, 0, 'Rubeus Hagrid'),\n", + " Text(4, 0, 'Severus Snape'),\n", + " Text(5, 0, 'Dudley Dursley'),\n", + " Text(6, 0, 'Draco Malfoy'),\n", + " Text(7, 0, 'Vernon Dursley'),\n", + " Text(8, 0, 'Albus Dumbledore'),\n", + " Text(9, 0, 'Neville Longbottom')])" ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" }, { - "cell_type": "markdown", - "source": [ - "The `stream` mode of any algorithm in the GDS library returns a stream of records. Python client then automatically converts the output into a Pandas DataFrame.\n", - "\n", - "If you have ever executed the `stream` mode of the graph algorithms in Neo4j GDS library, you might be aware that the result contains internal node ids as a reference to nodes instead of actual node objects. The `pagerank_df` DataFrame contains two columns:\n", - "* nodeId: Internal node ids used to reference nodes\n", - "* score: PageRank score\n", - "\n", - "We can retrieve the referenced node objects using the `nodeId` column without constructing a Cypher statement by using the `gds.util.asNodes()` method. The `gds.util.asNodes()` method takes a list of internal node ids as input and outputs a list of node objects." - ], - "metadata": { - "id": "EuKnJ7Yzacb-" - } - }, - { - "cell_type": "code", - "source": [ - "# If you need to fetch information about node objects based on their internal node ids, you can use gds.util.asNodes\n", - "pagerank_df['node_object'] = gds.util.asNodes(pagerank_df['nodeId'].to_list())" - ], - "metadata": { - "id": "xTl4i5cPWbrh" - }, - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "The `node_object` column now contains the referenced node objects. 
Node objects are defined in the underlying Neo4j Python driver. You can reference the [official documentation if you want to examine all the possible methods of the node object](https://neo4j.com/docs/api/python-driver/current/api.html#node).\n", - "\n", - "In this example, we will extract the `name` property from node objects and then visualize a bar chart of the top ten characters with the highest PageRank score." - ], - "metadata": { - "id": "fa-iw0etaubY" - } - }, - { - "cell_type": "code", - "source": [ - "# Extract name properties from the node object\n", - "pagerank_df['name'] = [n['name'] for n in pagerank_df['node_object']]\n", - "# Draw a bar chart\n", - "plt.figure(figsize=(16,9))\n", - "sns.barplot(x='name', y='score', data=pagerank_df.sort_values(by='score', ascending=False).head(10))\n", - "plt.xticks(rotation=45)\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 652 - }, - "id": "6drUBiMBWqpJ", - "outputId": "9e3c906f-7d87-4404-d231-36f6fc0639cf" - }, - "execution_count": 13, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),\n", - " )" - ] - }, - "metadata": {}, - "execution_count": 13 - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": { - "needs_background": "light" - } - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Extract name properties from the node object\n", + "pagerank_df['name'] = [n['name'] for n in pagerank_df['node_object']]\n", + "# Draw a bar chart\n", + "plt.figure(figsize=(16,9))\n", + "sns.barplot(x='name', y='score', data=pagerank_df.sort_values(by='score', ascending=False).head(10))\n", + "plt.xticks(rotation=45)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7e7Jauk_a66I" + }, + "source": [ + "An additional benefit of having the graph algorithm output available in the Pandas Dataframe is that if you are not experienced with Cypher aggregations, you can simply skip them and do your aggregations in Pandas.\n", + "\n", + "As opposed to the `stream` mode of algorithms, the `stats`, `mutate`, and `write` modes do not produce a stream of results. Therefore, the results of Python client methods are not Pandas DataFrame. Instead, those methods output the algorithm metadata in Pandas Series format.\n", + "\n", + "For example, let's say we want to execute the mutate mode of the Louvain algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "mz5ARGwjEOPV", + "outputId": "a01ee5f2-da08-40b8-cdaa-e0f3ffdf8ed8" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "An additional benefit of having the graph algorithm output available in the Pandas Dataframe is that if you are not experienced with Cypher aggregations, you can simply skip them and do your aggregations in Pandas.\n", - "\n", - "As opposed to the `stream` mode of algorithms, the `stats`, `mutate`, and `write` modes do not produce a stream of results. Therefore, the results of Python client methods are not Pandas DataFrame. 
Instead, those methods output the algorithm metadata in Pandas Series format.\n", - "\n", - "For example, let's say we want to execute the mutate mode of the Louvain algorithm." - ], - "metadata": { - "id": "7e7Jauk_a66I" - } - }, - { - "cell_type": "code", - "source": [ - "# Louvain mutate\n", - "louvain_metadata = gds.louvain.mutate(G, mutateProperty='communityId', relationshipWeightProperty='weight')\n", - "print(louvain_metadata)" - ], - "metadata": { - "id": "mz5ARGwjEOPV", - "outputId": "a01ee5f2-da08-40b8-cdaa-e0f3ffdf8ed8", - "colab": { - "base_uri": "https://localhost:8080/" - } + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c91387e23a5e4b8caa670dd9c10373dd", + "version_major": 2, + "version_minor": 0 }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "mutateMillis 0\n", - "nodePropertiesWritten 119\n", - "modularity 0.176974\n", - "modularities [0.15405649883268652, 0.17697414179849225]\n", - "ranLevels 2\n", - "communityCount 10\n", - "communityDistribution {'p99': 42, 'min': 2, 'max': 42, 'mean': 11.9,...\n", - "postProcessingMillis 4\n", - "preProcessingMillis 0\n", - "computeMillis 1257\n", - "configuration {'maxIterations': 10, 'seedProperty': None, 'c...\n", - "Name: 0, dtype: object\n" - ] - } + "text/plain": [ + "Louvain: 0%| | 0/100 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeIdpropertyValue
00103
11103
223
333
4414
\n", + "" ], - "metadata": { - "id": "LMgy4UhzFKeD", - "outputId": "db0ebcfc-8348-48a0-d9d3-190f58e3b8f3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeId propertyValue\n", - "0 0 103\n", - "1 1 103\n", - "2 2 3\n", - "3 3 3\n", - "4 4 14" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeIdpropertyValue
00103
11103
223
333
4414
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 16 - } + "text/plain": [ + " nodeId propertyValue\n", + "0 0 103\n", + "1 1 103\n", + "2 2 3\n", + "3 3 3\n", + "4 4 14" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to retrieve node properties from the in-memory graph in the form of a Pandas DataFrame\n", + "louvain_df = gds.graph.streamNodeProperty(G, 'communityId')\n", + "louvain_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HlABauhObZAd" + }, + "source": [ + "The first parameter of the `gds.graph.streamNodeProperty()` method is the referenced Graph object. As the second parameter, we define which property we want to retrieve from the in-memory graph.\n", + "\n", + "Again, we get the internal node ids in the `nodeId` column. We could use the `gds.util.asNodes()` method to fetch the node objects that the internal node ids reference. Unfortunately, the column with the retrieved node properties has a generic name `propertyValue` . In our case, it would make sense to name the column with the results `communityId` . However, we can do that manually if we need to." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "ETnKF9pX-aED", + "outputId": "2a23dc7c-417a-4852-ce67-ee90028fa888" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "The first parameter of the `gds.graph.streamNodeProperty()` method is the referenced Graph object. As the second parameter, we define which property we want to retrieve from the in-memory graph.\n", - "\n", - "Again, we get the internal node ids in the `nodeId` column. We could use the `gds.util.asNodes()` method to fetch the node objects that the internal node ids reference. Unfortunately, the column with the retrieved node properties has a generic name `propertyValue` . 
In our case, it would make sense to name the column with the results `communityId` . However, we can do that manually if we need to." - ], - "metadata": { - "id": "HlABauhObZAd" - } - }, - { - "cell_type": "code", - "source": [ - "# Rename columns\n", - "louvain_df.columns = ['nodeId', 'communityId']\n", - "# You can do all sorts of pandas operations like aggregations\n", - "louvain_df.groupby(\"communityId\").size().to_frame(\n", - " \"communitySize\"\n", - ").reset_index().sort_values(by=[\"communitySize\"], ascending=False)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityIdcommunitySize
21442
810318
79916
56911
3318
147
4486
6815
91174
032
\n", + "
" ], - "metadata": { - "id": "ETnKF9pX-aED", - "outputId": "2a23dc7c-417a-4852-ce67-ee90028fa888", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - } - }, - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " communityId communitySize\n", - "2 14 42\n", - "8 103 18\n", - "7 99 16\n", - "5 69 11\n", - "3 31 8\n", - "1 4 7\n", - "4 48 6\n", - "6 81 5\n", - "9 117 4\n", - "0 3 2" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
communityIdcommunitySize
21442
810318
79916
56911
3318
147
4486
6815
91174
032
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 17 - } + "text/plain": [ + " communityId communitySize\n", + "2 14 42\n", + "8 103 18\n", + "7 99 16\n", + "5 69 11\n", + "3 31 8\n", + "1 4 7\n", + "4 48 6\n", + "6 81 5\n", + "9 117 4\n", + "0 3 2" ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Rename columns\n", + "louvain_df.columns = ['nodeId', 'communityId']\n", + "# You can do all sorts of pandas operations like aggregations\n", + "louvain_df.groupby(\"communityId\").size().to_frame(\n", + " \"communitySize\"\n", + ").reset_index().sort_values(by=[\"communitySize\"], ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbYFdS47bolR" + }, + "source": [ + "Like mentioned before, the added benefit of dealing with Pandas DataFrames as algorithm output is that you can apply all your Python skills to transform or manipulate the results. In this example, we simply grouped the DataFrame by the `communityId` column and count the members of each community.\n", + "\n", + "# Helpful methods\n", + "In the last part of this post, we will go over some of the helpful methods. The first one that comes to mind is listing all of the already projected in-memory graph with the `gds.graph.list()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 }, + "id": "GO4zQ6DKLCWG", + "outputId": "b94c92fc-7d02-46c0-84bc-f8263499827e" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "Like mentioned before, the added benefit of dealing with Pandas DataFrames as algorithm output is that you can apply all your Python skills to transform or manipulate the results. 
In this example, we simply grouped the DataFrame by the `communityId` column and count the members of each community.\n", - "\n", - "# Helpful methods\n", - "In the last part of this post, we will go over some of the helpful methods. The first one that comes to mind is listing all of the already projected in-memory graph with the `gds.graph.list()` method." - ], - "metadata": { - "id": "SbYFdS47bolR" - } - }, - { - "cell_type": "code", - "source": [ - "gds.graph.list()" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
degreeDistributiongraphNamedatabasememoryUsagesizeInBytesnodeCountrelationshipCountconfigurationdensitycreationTimemodificationTimeschemaschemaWithOrientation
0{'p99': 41, 'min': 1, 'max': 83, 'mean': 6.823...hp-graphneo4j2350 KiB2406760119812{'relationshipProjection': {'INTERACTS': {'ori...0.0578272023-02-01T12:15:46.030702248+00:002023-02-01T12:16:01.522004066+00:00{'graphProperties': {}, 'relationships': {'INT...{'graphProperties': {}, 'relationships': {'INT...
\n", + "
" ], - "metadata": { - "id": "GO4zQ6DKLCWG", - "outputId": "b94c92fc-7d02-46c0-84bc-f8263499827e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - } - }, - "execution_count": 18, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " degreeDistribution graphName database \\\n", - "0 {'p99': 41, 'min': 1, 'max': 83, 'mean': 6.823... hp-graph neo4j \n", - "\n", - " memoryUsage sizeInBytes nodeCount relationshipCount \\\n", - "0 2353 KiB 2409552 119 812 \n", - "\n", - " configuration density \\\n", - "0 {'relationshipProjection': {'INTERACTS': {'ori... 0.057827 \n", - "\n", - " creationTime modificationTime \\\n", - "0 2022-05-31T20:26:22.662999000+00:00 2022-05-31T20:26:24.869696000+00:00 \n", - "\n", - " schema \n", - "0 {'relationships': {'INTERACTS': {'weight': 'Fl... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
degreeDistributiongraphNamedatabasememoryUsagesizeInBytesnodeCountrelationshipCountconfigurationdensitycreationTimemodificationTimeschema
0{'p99': 41, 'min': 1, 'max': 83, 'mean': 6.823...hp-graphneo4j2353 KiB2409552119812{'relationshipProjection': {'INTERACTS': {'ori...0.0578272022-05-31T20:26:22.662999000+00:002022-05-31T20:26:24.869696000+00:00{'relationships': {'INTERACTS': {'weight': 'Fl...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 18 - } + "text/plain": [ + " degreeDistribution graphName database \\\n", + "0 {'p99': 41, 'min': 1, 'max': 83, 'mean': 6.823... hp-graph neo4j \n", + "\n", + " memoryUsage sizeInBytes nodeCount relationshipCount \\\n", + "0 2350 KiB 2406760 119 812 \n", + "\n", + " configuration density \\\n", + "0 {'relationshipProjection': {'INTERACTS': {'ori... 0.057827 \n", + "\n", + " creationTime modificationTime \\\n", + "0 2023-02-01T12:15:46.030702248+00:00 2023-02-01T12:16:01.522004066+00:00 \n", + "\n", + " schema \\\n", + "0 {'graphProperties': {}, 'relationships': {'INT... \n", + "\n", + " schemaWithOrientation \n", + "0 {'graphProperties': {}, 'relationships': {'INT... " ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.graph.list()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d92J_oYCbxxx" + }, + "source": [ + "Sometimes there are already projected in-memory graphs present in the database. If you don't have a reference to the projected graphs in the form of a Graph object, you cannot execute any graph algorithm. To avoid having to drop and recreate projected graphs, you can use the `gds.graph.get()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "UML8KJzWTpjN" + }, + "outputs": [], + "source": [ + "# G = gds.graph.get(\"graph name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2hIU-lQ2b5xz" + }, + "source": [ + "When using the shortest path algorithms, you need to provide source and target nodes ids. You could use Cypher statements or you could use the `gds.find_node_id()` method.\n", + "\n", + "![find_node.drawio (1).png]()\n", + "\n", + "The `gds.find_node_id()` takes in two arguments. The first argument defines the node label we are searching for. In our example, we are searching for the `Character` node label. 
The second parameter specifies the node properties used to identify the particular node. The node properties are defined as a dictionary or map of key-value pairs, similar to the inline `MATCH` clause. The only difference is that we must add quotes around the key values of properties since otherwise, we would get a NameError in Python." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "2nTv7PcrTl7B", + "outputId": "a3dcfc07-b511-4701-fdfe-566f1b41ac31" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "Sometimes there are already projected in-memory graphs present in the database. If you don't have a reference to the projected graphs in the form of a Graph object, you cannot execute any graph algorithm. To avoid having to drop and recreate projected graphs, you can use the `gds.graph.get()` method." - ], - "metadata": { - "id": "d92J_oYCbxxx" - } - }, - { - "cell_type": "code", - "source": [ - "# G = gds.graph.get(\"graph name\")" - ], - "metadata": { - "id": "UML8KJzWTpjN" - }, - "execution_count": 19, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "When using the shortest path algorithms, you need to provide source and target nodes ids. You could use Cypher statements or you could use the `gds.find_node_id()` method.\n", - "\n", - "![find_node.drawio (1).png]()\n", - "\n", - "The `gds.find_node_id()` takes in two arguments. The first argument defines the node label we are searching for. In our example, we are searching for the `Character` node label. The second parameter specifies the node properties used to identify the particular node. The node properties are defined as a dictionary or map of key-value pairs, similar to the inline `MATCH` clause. The only difference is that we must add quotes around the key values of properties since otherwise, we would get a NameError in Python." 
- ], - "metadata": { - "id": "2hIU-lQ2b5xz" - } - }, - { - "cell_type": "code", - "source": [ - "gds.find_node_id([\"Character\"], {\"name\":\"Harry Potter\"})" - ], - "metadata": { - "id": "2nTv7PcrTl7B", - "outputId": "a3dcfc07-b511-4701-fdfe-566f1b41ac31", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 20, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "4" - ] - }, - "metadata": {}, - "execution_count": 20 - } + "data": { + "text/plain": [ + "4" ] - }, - { - "cell_type": "markdown", - "source": [ - "The last useful method I will present here is the `drop()` method of a Graph object. It is used to release the projected graph from memory." - ], - "metadata": { - "id": "zj3XAI8McJdK" - } - }, - { - "cell_type": "code", - "source": [ - "# Drop a projected in-memory graph\n", - "G.drop()" - ], - "metadata": { - "id": "V23wjoyfKtag" - }, - "execution_count": 21, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Conclusion\n", - "The Neo4j Graph Data Science Python client is designed to help you integrate Neo4j and its graph algorithms into your Python analytical workflows. The syntax of the Python client mimics the GDS Cypher procedures. 
Since not all graph algorithms are documented to be used as Python client method, you need to take into account the following guidelines when translating a Cypher procedure to a Python client method:\n", - "* When specifying a map or a dictionary as a parameter to any method, make sure to add quotes around the keys\n", - "* Instead of referencing the projected graph by its name, you need to input the * Graph object as the first parameter of graph algorithms\n", - "* Algorithm specific configuration parameter can be specified using keyword arguments\n", - "* The stream mode of graph algorithms outputs a Pandas DataFrame\n", - "* Other algorithm modes like stats , write , and mutate output the metadata of the algorithm call as a Pandas Series\n", - "\n", - "I am very excited about the new Python client and will be definitely using it in my workflows. Try it out and if you have any feedback please report it to the [official GitHub repository of the Python client](https://github.com/neo4j/graph-data-science-client)." - ], - "metadata": { - "id": "n0u3CxSccPgi" - } - }, + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.find_node_id([\"Character\"], {\"name\":\"Harry Potter\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zj3XAI8McJdK" + }, + "source": [ + "The last useful method I will present here is the `drop()` method of a Graph object. It is used to release the projected graph from memory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "V23wjoyfKtag" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "" - ], - "metadata": { - "id": "8-VFIcGAcYQg" - }, - "execution_count": 21, - "outputs": [] + "data": { + "text/plain": [ + "graphName hp-graph\n", + "database neo4j\n", + "memoryUsage \n", + "sizeInBytes -1\n", + "nodeCount 119\n", + "relationshipCount 812\n", + "configuration {'relationshipProjection': {'INTERACTS': {'ori...\n", + "density 0.057827\n", + "creationTime 2023-02-01T12:15:46.030702248+00:00\n", + "modificationTime 2023-02-01T12:16:01.522004066+00:00\n", + "schema {'graphProperties': {}, 'relationships': {'INT...\n", + "schemaWithOrientation {'graphProperties': {}, 'relationships': {'INT...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "# Drop a projected in-memory graph\n", + "G.drop()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n0u3CxSccPgi" + }, + "source": [ + "# Conclusion\n", + "The Neo4j Graph Data Science Python client is designed to help you integrate Neo4j and its graph algorithms into your Python analytical workflows. The syntax of the Python client mimics the GDS Cypher procedures. 
Since not all graph algorithms are documented to be used as Python client methods, you need to take into account the following guidelines when translating a Cypher procedure to a Python client method:\n", + "* When specifying a map or a dictionary as a parameter to any method, make sure to add quotes around the keys\n", + "* Instead of referencing the projected graph by its name, you need to input the Graph object as the first parameter of graph algorithms\n", + "* Algorithm-specific configuration parameters can be specified using keyword arguments\n", + "* The stream mode of graph algorithms outputs a Pandas DataFrame\n", + "* Other algorithm modes like stats, write, and mutate output the metadata of the algorithm call as a Pandas Series\n", + "\n", + "I am very excited about the new Python client and will definitely be using it in my workflows. Try it out, and if you have any feedback, please report it to the [official GitHub repository of the Python client](https://github.com/neo4j/graph-data-science-client)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "8-VFIcGAcYQg" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyMLODtjsTX2gWhXe5ADDUdP", + "include_colab_link": true, + "name": "gds-python.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/gds_python/p2p-network-analysis.ipynb b/gds_python/p2p-network-analysis.ipynb index 7f3fd73..25c7f72 100644 --- a/gds_python/p2p-network-analysis.ipynb +++ b/gds_python/p2p-network-analysis.ipynb @@ -39,9 +39,9 @@ "source": [ "## Environment setup\n", "We will be using Neo4j as the database to store the peer-to-peer network. Therefore, I suggest you download and install the Neo4j Desktop application if you want to follow along with the code examples.\n", - "The dataset is available as a [database dump](https://drive.google.com/file/d/1_N_QLtCRI-eeLzjEIFZAbj8YQrWfTolI/view). It is a variation of the database dump available on Neo4j's product example GitHub to showcase fraud detection.\n", + "The dataset is available as a [database dump](https://drive.google.com/file/d/1apR3xwWEOdi_WKmIAGk1bPqhHQSgxwT-/view?usp=share_link). It is a variation of the database dump available on Neo4j's product example GitHub to showcase fraud detection.\n", "\n", - "I've written a post about restoring a database dump in Neo4j Desktop sometime ago if you need some help. After you have restored the database dump, you will also need to install the Graph Data Science and APOC libraries. 
Make sure you are using version 2.1.0 of the GDS library or later.\n", + "I've written a post about restoring a database dump in Neo4j Desktop sometime ago if you need some help. After you have restored the database dump, you will also need to install the Graph Data Science and APOC libraries. Make sure you are using version 2.3.0 of the GDS library or later.\n", "\n", "You will need to have the following three Python libraries installed:\n", "* graphdatascience: Neo4j Graph Data Science Python client\n", @@ -85,14 +85,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.1.2\n" + "2.3.0\n" ] } ], "source": [ "host = \"bolt://localhost:7687\"\n", "user = \"neo4j\"\n", - "password = \"letmein\"\n", + "password = \"pleaseletmein\"\n", "\n", "gds = GraphDataScience(host, auth=(user, password))\n", "\n", @@ -119,8 +119,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[{'User': 33732, 'Device': 51451, 'IP': 585855, 'Node': 3, 'Card': 118818}]\n", - "[{'HAS_IP': 1488949, 'REFERRED': 1870, 'REL': 1, 'USED': 55026, 'HAS_CC': 128066, 'P2P': 102832}]\n" + "[{'User': 33732, 'Device': 51451, 'IP': 585855, 'Card': 118818}]\n", + "[{'HAS_IP': 1488949, 'REFERRED': 1870, 'USED': 55026, 'HAS_CC': 128066, 'P2P': 102832}]\n" ] } ], @@ -224,30 +224,16 @@ "id": "3d6fcef5", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c5cf93aa3c5649cdac6470bcfc8c9a9c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading: 0%| | 0/100 [00:00" + "" ] }, "execution_count": 11, @@ -572,7 +558,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -631,7 +617,7 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -645,7 +631,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -655,7 +641,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -665,7 +651,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -685,7 +671,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -697,19 +683,19 @@ "" ], "text/plain": [ - "nodeProperty nodeId accountYears avgTransactionAmount betweenness \\\n", - "0 0 4.0 98.782609 24985.335938 \n", - "1 1 4.0 100.000000 0.000000 \n", - "2 3 4.0 122.000000 0.000000 \n", - "3 4 0.0 0.000000 0.000000 \n", - "4 14 4.0 368.000000 0.000000 \n", + " nodeId accountYears avgTransactionAmount betweenness closeness \\\n", + "0 0 5.0 98.782609 24985.335938 0.163444 \n", + "1 1 5.0 100.000000 0.000000 0.075315 \n", + "2 3 5.0 122.000000 0.000000 0.137684 \n", + "3 4 0.0 0.000000 0.000000 0.171180 \n", + "4 14 5.0 368.000000 0.000000 0.172998 \n", "\n", - "nodeProperty closeness weightedIndegree weightedOutdegree \n", - "0 0.163444 1000.00 2272.0 \n", - "1 0.075315 100.00 100.0 \n", - "2 0.137684 167.22 122.0 \n", - "3 0.171180 210.00 0.0 \n", - "4 0.172998 500.00 2576.0 " + " weightedIndegree weightedOutdegree \n", + "0 1000.00 2272.0 \n", + "1 100.00 100.0 \n", + "2 167.22 122.0 \n", + "3 210.00 0.0 \n", + "4 500.00 2576.0 " ] }, "execution_count": 12, @@ -767,7 +753,7 @@ "
nodePropertynodeIdaccountYearsavgTransactionAmount
004.05.098.78260924985.3359380.163444
114.05.0100.0000000.0000000.075315
234.05.0122.0000000.0000000.137684
4144.05.0368.0000000.0000000.172998
\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -791,7 +777,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -801,7 +787,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -841,7 +827,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -851,7 +837,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -863,25 +849,25 @@ "" ], "text/plain": [ - "nodeProperty nodeId accountYears avgTransactionAmount betweenness \\\n", - "count 11311.000000 11311.000000 11311.000000 1.131100e+04 \n", - "mean 16789.334542 2.216426 99.198514 1.967391e+04 \n", - "std 9749.632411 2.349186 331.473144 2.176740e+05 \n", - "min 0.000000 0.000000 0.000000 0.000000e+00 \n", - "25% 8308.000000 0.000000 0.000000 0.000000e+00 \n", - "50% 16756.000000 0.000000 0.000000 0.000000e+00 \n", - "75% 25289.000000 5.000000 54.687500 0.000000e+00 \n", - "max 33724.000000 5.000000 4000.000000 1.129122e+07 \n", + " nodeId accountYears avgTransactionAmount betweenness \\\n", + "count 11311.000000 11311.000000 11311.000000 1.131100e+04 \n", + "mean 16789.334542 2.693307 99.198514 1.967391e+04 \n", + "std 9749.632411 2.843814 331.473144 2.176740e+05 \n", + "min 0.000000 0.000000 0.000000 0.000000e+00 \n", + "25% 8308.000000 0.000000 0.000000 0.000000e+00 \n", + "50% 16756.000000 0.000000 0.000000 0.000000e+00 \n", + "75% 25289.000000 6.000000 54.687500 0.000000e+00 \n", + "max 33724.000000 6.000000 4000.000000 1.129122e+07 \n", "\n", - "nodeProperty closeness weightedIndegree weightedOutdegree \n", - "count 11311.000000 11311.000000 1.131100e+04 \n", - "mean 0.180162 1527.190995 1.527191e+03 \n", - "std 0.175071 13206.700185 1.637541e+04 \n", - "min 0.000000 0.000000 0.000000e+00 \n", - "25% 0.129951 5.000000 0.000000e+00 \n", - "50% 0.155281 30.000000 0.000000e+00 \n", - "75% 0.170334 300.000000 1.390000e+02 \n", - "max 1.000000 498911.090000 1.065990e+06 " + " closeness weightedIndegree weightedOutdegree \n", + "count 11311.000000 
11311.000000 1.131100e+04 \n", + "mean 0.180162 1527.190995 1.527191e+03 \n", + "std 0.175071 13206.700185 1.637541e+04 \n", + "min 0.000000 0.000000 0.000000e+00 \n", + "25% 0.129951 5.000000 0.000000e+00 \n", + "50% 0.155281 30.000000 0.000000e+00 \n", + "75% 0.170334 300.000000 1.390000e+02 \n", + "max 1.000000 498911.090000 1.065990e+06 " ] }, "execution_count": 13, @@ -910,7 +896,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 14, @@ -919,7 +905,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -972,8 +958,8 @@ "mutateMillis 0\n", "postProcessingMillis 0\n", "preProcessingMillis 0\n", - "computeMillis 23\n", - "configuration {'jobId': '21d64bc4-f207-4331-886d-a8276ec15d0...\n", + "computeMillis 20\n", + "configuration {'jobId': '2cb069c4-39e3-444b-b674-a75a2b96ad5...\n", "Name: 0, dtype: object" ] }, @@ -1037,45 +1023,57 @@ "
\n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", "
nodePropertynodeIdaccountYearsavgTransactionAmount
mean16789.3345422.2164262.69330799.1985141.967391e+040.180162
std9749.6324112.3491862.843814331.4731442.176740e+050.175071
75%25289.0000005.0000006.00000054.6875000.000000e+000.170334
max33724.0000005.0000006.0000004000.0000001.129122e+071.000000nodeIdcommunityIddistanceFromCentroidsilhouette
020000020.005067-1.0
120011201.148309-1.0
220014020.026275-1.0
320017350.087512-1.0
42002610.225664-1.0
\n", "" ], "text/plain": [ - " nodeId communityId\n", - "0 20000 0\n", - "1 20011 2\n", - "2 20014 0\n", - "3 20017 3\n", - "4 20026 1" + " nodeId communityId distanceFromCentroid silhouette\n", + "0 20000 2 0.005067 -1.0\n", + "1 20011 0 1.148309 -1.0\n", + "2 20014 2 0.026275 -1.0\n", + "3 20017 5 0.087512 -1.0\n", + "4 20026 1 0.225664 -1.0" ] }, "execution_count": 16, @@ -1084,7 +1082,7 @@ } ], "source": [ - "kmeans_df = gds.alpha.kmeans.stream(\n", + "kmeans_df = gds.beta.kmeans.stream(\n", " largestComponentGraph, nodeProperty=\"features\", k=6, randomSeed=42\n", ")\n", "kmeans_df.head()" @@ -1134,32 +1132,32 @@ " \n", " 0\n", " 0\n", - " 3225\n", + " 375\n", " \n", " \n", " 1\n", " 1\n", - " 4870\n", + " 4792\n", " \n", " \n", " 2\n", " 2\n", - " 570\n", + " 3637\n", " \n", " \n", " 3\n", " 3\n", - " 1961\n", + " 348\n", " \n", " \n", " 4\n", " 4\n", - " 217\n", + " 236\n", " \n", " \n", " 5\n", " 5\n", - " 468\n", + " 1923\n", " \n", " \n", "\n", @@ -1167,12 +1165,12 @@ ], "text/plain": [ " communityId communitySize\n", - "0 0 3225\n", - "1 1 4870\n", - "2 2 570\n", - "3 3 1961\n", - "4 4 217\n", - "5 5 468" + "0 0 375\n", + "1 1 4792\n", + "2 2 3637\n", + "3 3 348\n", + "4 4 236\n", + "5 5 1923" ] }, "execution_count": 17, @@ -1205,7 +1203,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 18, @@ -1214,7 +1212,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1255,7 +1253,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 19, @@ -1264,9 +1262,9 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ - "
" + "
" ] }, "metadata": { @@ -1314,7 +1312,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1371,17 +1369,18 @@ { "data": { "text/plain": [ - "graphName largestComponent\n", - "database neo4j\n", - "memoryUsage \n", - "sizeInBytes -1\n", - "nodeCount 11311\n", - "relationshipCount 52272\n", - "configuration {'jobId': '3ea5e089-5118-4f2c-9222-f8710cb8770...\n", - "density 0.000409\n", - "creationTime 2022-06-28T11:39:18.691439000+02:00\n", - "modificationTime 2022-06-28T11:40:06.085760000+02:00\n", - "schema {'graphProperties': {}, 'relationships': {'P2P...\n", + "graphName largestComponent\n", + "database neo4j\n", + "memoryUsage \n", + "sizeInBytes -1\n", + "nodeCount 11311\n", + "relationshipCount 52272\n", + "configuration {'jobId': '36a5f9cd-005a-41d5-838f-6cb665d09f8...\n", + "density 0.000409\n", + "creationTime 2023-02-01T13:18:11.499292052+01:00\n", + "modificationTime 2023-02-01T13:19:23.300208879+01:00\n", + "schema {'graphProperties': {}, 'relationships': {'P2P...\n", + "schemaWithOrientation {'graphProperties': {}, 'relationships': {'P2P...\n", "Name: 0, dtype: object" ] }, diff --git a/harry_potter/Harry_Potter_Karate_Club_integration.ipynb b/harry_potter/Harry_Potter_Karate_Club_integration.ipynb index 7f3f704..fd4c0f0 100644 --- a/harry_potter/Harry_Potter_Karate_Club_integration.ipynb +++ b/harry_potter/Harry_Potter_Karate_Club_integration.ipynb @@ -1,2323 +1,1469 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-aWuO6deuj4U" + }, + "source": [ + "* Updated to GDS 2.0 version\n", + "* Link to original blog post: https://towardsdatascience.com/integrate-neo4j-with-karateclub-node-embedding-package-99715d73250a" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "Harry Potter - Karate Club integration.ipynb", - "provenance": [], - "authorship_tag": 
"ABX9TyOelByQliYNdEZtLAG5ICeE", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "Ln3_AG0uiTiW", + "outputId": "f78e1075-d615-4570-ddfb-395337274495" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.3.5)\n", + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[K |████████████████████████████████| 89 kB 3.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (1.0.2)\n", + "Collecting karateclub\n", + " Downloading karateclub-1.2.3.tar.gz (62 kB)\n", + "\u001b[K |████████████████████████████████| 62 kB 548 kB/s \n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.21.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", + "Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.4.1)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.1.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (3.1.0)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (from karateclub) (2.6.3)\n", + "Requirement already satisfied: decorator==4.4.2 in /usr/local/lib/python3.7/dist-packages (from karateclub) (4.4.2)\n", + "Requirement 
already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from karateclub) (4.64.0)\n", + "Requirement already satisfied: python-louvain in /usr/local/lib/python3.7/dist-packages (from karateclub) (0.16)\n", + "Collecting pygsp\n", + " Downloading PyGSP-0.5.1-py2.py3-none-any.whl (1.8 MB)\n", + "\u001b[K |████████████████████████████████| 1.8 MB 7.8 MB/s \n", + "\u001b[?25hCollecting gensim>=4.0.0\n", + " Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", + "\u001b[K |████████████████████████████████| 24.1 MB 2.2 MB/s \n", + "\u001b[?25hCollecting python-Levenshtein\n", + " Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)\n", + "\u001b[K |████████████████████████████████| 50 kB 2.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim>=4.0.0->karateclub) (5.2.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from python-Levenshtein->karateclub) (57.4.0)\n", + "Building wheels for collected packages: neo4j, karateclub, python-Levenshtein\n", + " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=4941018e1ed8fd95da841b0f878d0b0b5fe7830d6ac387f5d42f17d2106a054f\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + " Building wheel for karateclub (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for karateclub: filename=karateclub-1.2.3-py3-none-any.whl size=97754 sha256=54c83ec56a4864ca4fc21b2856d5f7a4bcd86c3e0515fb8abe411b86176f8c1d\n", + " Stored in directory: /root/.cache/pip/wheels/7a/09/80/0d50455fd4e297e88f8f38a711c6f4849e6bd1a330000dde3d\n", + " Building wheel for python-Levenshtein (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149868 sha256=2ecbfbdcde0c31ee08f23a76147cc73a61fe17841cfe478691b3ab8124c62e24\n", + " Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d\n", + "Successfully built neo4j karateclub python-Levenshtein\n", + "Installing collected packages: python-Levenshtein, pygsp, gensim, neo4j, karateclub\n", + " Attempting uninstall: gensim\n", + " Found existing installation: gensim 3.6.0\n", + " Uninstalling gensim-3.6.0:\n", + " Successfully uninstalled gensim-3.6.0\n", + "Successfully installed gensim-4.1.2 karateclub-1.2.3 neo4j-4.4.2 pygsp-0.5.1 python-Levenshtein-0.12.2\n" + ] } + ], + "source": [ + "!pip install pandas neo4j scikit-learn karateclub" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "* Updated to GDS 2.0 version\n", - "* Link to original blog post: https://towardsdatascience.com/integrate-neo4j-with-karateclub-node-embedding-package-99715d73250a" - ], - "metadata": { - "id": "-aWuO6deuj4U" - } - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ln3_AG0uiTiW", - "outputId": "f78e1075-d615-4570-ddfb-395337274495" - }, - "source": [ - "!pip install pandas neo4j scikit-learn karateclub" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.3.5)\n", - "Collecting neo4j\n", - " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", - "\u001b[K |████████████████████████████████| 89 kB 3.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (1.0.2)\n", - "Collecting karateclub\n", 
- " Downloading karateclub-1.2.3.tar.gz (62 kB)\n", - "\u001b[K |████████████████████████████████| 62 kB 548 kB/s \n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", - "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.21.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", - "Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.4.1)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (1.1.0)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn) (3.1.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (from karateclub) (2.6.3)\n", - "Requirement already satisfied: decorator==4.4.2 in /usr/local/lib/python3.7/dist-packages (from karateclub) (4.4.2)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from karateclub) (4.64.0)\n", - "Requirement already satisfied: python-louvain in /usr/local/lib/python3.7/dist-packages (from karateclub) (0.16)\n", - "Collecting pygsp\n", - " Downloading PyGSP-0.5.1-py2.py3-none-any.whl (1.8 MB)\n", - "\u001b[K |████████████████████████████████| 1.8 MB 7.8 MB/s \n", - "\u001b[?25hCollecting gensim>=4.0.0\n", - " Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)\n", - "\u001b[K |████████████████████████████████| 24.1 MB 2.2 MB/s \n", - "\u001b[?25hCollecting python-Levenshtein\n", - " Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)\n", - "\u001b[K |████████████████████████████████| 50 kB 2.8 
MB/s \n", - "\u001b[?25hRequirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim>=4.0.0->karateclub) (5.2.1)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from python-Levenshtein->karateclub) (57.4.0)\n", - "Building wheels for collected packages: neo4j, karateclub, python-Levenshtein\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=4941018e1ed8fd95da841b0f878d0b0b5fe7830d6ac387f5d42f17d2106a054f\n", - " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", - " Building wheel for karateclub (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for karateclub: filename=karateclub-1.2.3-py3-none-any.whl size=97754 sha256=54c83ec56a4864ca4fc21b2856d5f7a4bcd86c3e0515fb8abe411b86176f8c1d\n", - " Stored in directory: /root/.cache/pip/wheels/7a/09/80/0d50455fd4e297e88f8f38a711c6f4849e6bd1a330000dde3d\n", - " Building wheel for python-Levenshtein (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149868 sha256=2ecbfbdcde0c31ee08f23a76147cc73a61fe17841cfe478691b3ab8124c62e24\n", - " Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d\n", - "Successfully built neo4j karateclub python-Levenshtein\n", - "Installing collected packages: python-Levenshtein, pygsp, gensim, neo4j, karateclub\n", - " Attempting uninstall: gensim\n", - " Found existing installation: gensim 3.6.0\n", - " Uninstalling gensim-3.6.0:\n", - " Successfully uninstalled gensim-3.6.0\n", - "Successfully installed gensim-4.1.2 karateclub-1.2.3 neo4j-4.4.2 pygsp-0.5.1 python-Levenshtein-0.12.2\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pD8HtmZxidNG" - }, - "source": [ - "import networkx as nx\n", - "import pandas as pd\n", - "from neo4j import GraphDatabase\n", - "# Change the host and user/password combination to your neo4j\n", - "host = 'bolt://3.235.2.228:7687'\n", - "user = 'neo4j'\n", - "password = 'seats-drunks-carbon'\n", - "driver = GraphDatabase.driver(host,auth=(user, password))\n", - "\n", - "def read_query(query, params=None):\n", - " with driver.session() as session:\n", - " result = session.run(query, params)\n", - " return pd.DataFrame([r.values() for r in result], columns=result.keys())" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KZr8Abs_b1hl" - }, - "source": [ - "Lately, I have been on a quest to learn as much as possible about node embedding techniques. The goal of node embedding is to encode nodes so that the similarity in the embedding space approximates similarity in the original network. 
In layman’s terms, we encode each node to a fixed size vector that preserves the similarity of the original network.\n", - "\n", - "I have come across the [Karate Club](https://github.com/benedekrozemberczki/karateclub) package in my search for the implementation of various node embedding models. I will let the author Benedek Rozemberczki explain what its purpose is:\n", - ">Karate Club consists of state-of-the-art methods to do unsupervised learning on graph structured data. To put it simply it is a Swiss Army knife for small-scale graph mining research. First, it provides network embedding techniques at the node and graph level. Second, it includes a variety of overlapping and non-overlapping community detection methods. Implemented methods cover a wide range of network science (NetSci, Complenet), data mining (ICDM, CIKM, KDD), artificial intelligence (AAAI, IJCAI) and machine learning (NeurIPS, ICML, ICLR) conferences, workshops, and pieces from prominent journals.\n", - "\n", - "The Karate Club project features:\n", - "10+ community detection models\n", - "25+ node embedding models\n", - "10+ graph embedding models\n", - "As you might know, I like to store my network information in Neo4j. In this blog post, I will demonstrate how to extract network information from Neo4j and use it as an input to the Karate Club API. It is a straightforward transformation. We have to transform a Neo4j graph to a NetworkX graph model, as Karate Club uses NetworkX structure, and we are good to go. \n", - "\n", - "#Data model\n", - "We will use a simple toy graph of the Harry Potter universe that I have created in my previous blog post. I have prepared a CSV file with the network structure, so you don’t have to complete the NLP process yourself.\n", - "\n", - "The network is based on the Harry Potter and the Sorcerer’s Stone book. Nodes represent the character in the book, and the INTERACTS relationships represent co-occurrences in the text between characters. 
To import this network, execute the following Cypher query:\n", - "P.s. If you are following along with the Colab notebook, I suggest you open a blank [Neo4j Sandbox project](https://neo4j.com/sandbox/)." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "EDYtEnBPikiu", - "outputId": "7d658614-b8ed-42a7-a709-585a497461d0" - }, - "source": [ - "# import data\n", - "read_query(\"\"\"\n", - "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/HP/hp_1.csv\" as row\n", - "MERGE (s:Character{name:row.source})\n", - "MERGE (t:Character{name:row.target})\n", - "MERGE (s)-[r:INTERACTS]-(t)\n", - "SET r.weight = row.weight\n", - "RETURN distinct 'import successful' as result\n", - "\"\"\")" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 import successful" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0import successful
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 3 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "" - ], - "metadata": { - "id": "ngyYPEGku79t" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "t71ygjxRcuOK" - }, - "source": [ - "Now that we have our network imported, we can examine the community structure and calculate the node embedding with the help of the Karate Club package.\n", - "# Community detection\n", - "For those of you that are completely new to Neo4j, I must let you know that Neo4j Graph Data Science plugin provides a couple of community detection algorithms out of the box. I will quickly demonstrate how to use the Louvain algorithm in the GDS library. First you have to project an in-memory graph" - ] - }, - { - "cell_type": "code", - "source": [ - "read_query(\"\"\"\n", - "CALL gds.graph.project('got', 'Character', {INTERACTS:{orientation:'UNDIRECTED'}})\n", - "\"\"\")" - ], - "metadata": { - "id": "mex15wrau6zV", - "outputId": "22175cae-6ff5-4c5a-8a77-eac28ff2d838", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'Character': {'label': 'Character', 'properti... \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'INTERACTS': {'orientation': 'UNDIRECTED', 'a... got 119 \n", - "\n", - " relationshipCount projectMillis \n", - "0 812 52 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'INTERACTS': {'orientation': 'UNDIRECTED', 'a...got11981252
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 4 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Then you can execute any algorithm on the projected graph" - ], - "metadata": { - "id": "Cp6fcVZRu8rY" - } + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "pD8HtmZxidNG" + }, + "outputs": [], + "source": [ + "import networkx as nx\n", + "import pandas as pd\n", + "from neo4j import GraphDatabase\n", + "# Change the host and user/password combination to your neo4j\n", + "host = 'bolt://3.231.25.240:7687'\n", + "user = 'neo4j'\n", + "password = 'hatchets-visitor-axes'\n", + "driver = GraphDatabase.driver(host,auth=(user, password))\n", + "\n", + "def read_query(query, params=None):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KZr8Abs_b1hl" + }, + "source": [ + "Lately, I have been on a quest to learn as much as possible about node embedding techniques. The goal of node embedding is to encode nodes so that the similarity in the embedding space approximates similarity in the original network. In layman’s terms, we encode each node to a fixed size vector that preserves the similarity of the original network.\n", + "\n", + "I have come across the [Karate Club](https://github.com/benedekrozemberczki/karateclub) package in my search for the implementation of various node embedding models. I will let the author Benedek Rozemberczki explain what its purpose is:\n", + ">Karate Club consists of state-of-the-art methods to do unsupervised learning on graph structured data. To put it simply it is a Swiss Army knife for small-scale graph mining research. First, it provides network embedding techniques at the node and graph level. Second, it includes a variety of overlapping and non-overlapping community detection methods. 
Implemented methods cover a wide range of network science (NetSci, Complenet), data mining (ICDM, CIKM, KDD), artificial intelligence (AAAI, IJCAI) and machine learning (NeurIPS, ICML, ICLR) conferences, workshops, and pieces from prominent journals.\n", + "\n", + "The Karate Club project features:\n", + "- 10+ community detection models\n", + "- 25+ node embedding models\n", + "- 10+ graph embedding models\n\n", + "As you might know, I like to store my network information in Neo4j. In this blog post, I will demonstrate how to extract network information from Neo4j and use it as an input to the Karate Club API. It is a straightforward transformation. We have to transform a Neo4j graph to a NetworkX graph model, as Karate Club uses NetworkX structure, and we are good to go. \n", + "\n", + "# Data model\n", + "We will use a simple toy graph of the Harry Potter universe that I have created in my previous blog post. I have prepared a CSV file with the network structure, so you don’t have to complete the NLP process yourself.\n", + "\n", + "The network is based on the Harry Potter and the Sorcerer’s Stone book. Nodes represent the characters in the book, and the INTERACTS relationships represent co-occurrences in the text between characters. To import this network, execute the following Cypher query:\n", + "P.S. If you are following along with the Colab notebook, I suggest you open a blank [Neo4j Sandbox project](https://neo4j.com/sandbox/)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "EDYtEnBPikiu", + "outputId": "7d658614-b8ed-42a7-a709-585a497461d0" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 142 - }, - "id": "BWPibvfActjL", - "outputId": "a3b165b1-6b2f-49ec-8f84-f88c251dd378" - }, - "source": [ - "\n", - "\n", - "read_query(\"\"\"\n", - "CALL gds.louvain.write('got',{\n", - " writeProperty:'louvain'\n", - "})\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0import successful
\n", + "
" ], - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " writeMillis nodePropertiesWritten modularity \\\n", - "0 185 119 0.345871 \n", - "\n", - " modularities ranLevels communityCount \\\n", - "0 [0.28369409594991385, 0.34587104758669224] 2 7 \n", - "\n", - " communityDistribution postProcessingMillis \\\n", - "0 {'p99': 29, 'min': 8, 'max': 29, 'mean': 17.0,... 3 \n", - "\n", - " preProcessingMillis computeMillis \\\n", - "0 0 1769 \n", - "\n", - " configuration \n", - "0 {'maxIterations': 10, 'writeConcurrency': 4, '... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
writeMillisnodePropertiesWrittenmodularitymodularitiesranLevelscommunityCountcommunityDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
01851190.345871[0.28369409594991385, 0.34587104758669224]27{'p99': 29, 'min': 8, 'max': 29, 'mean': 17.0,...301769{'maxIterations': 10, 'writeConcurrency': 4, '...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i0iCbJxSdC3I" - }, - "source": [ - "A critical detail of the network analysis is that the interaction network between the characters is undirected. I won’t go into the algorithms’ theory or their pros and cons. The goal of this blog post is purely to help you get started with integrating Karate Club and Neo4j. The ideas and differentiation of the algorithms may come in another blog post. Now let’s run some algorithms in KC. KC only works when nodes in the graph have consecutive ids. I don’t know the reason behind this choice; that’s just how it is. We can easily create a mapping to consecutive ids and store it in Neo4j." + "text/plain": [ + " result\n", + "0 import successful" ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import data\n", + "read_query(\"\"\"\n", + "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/HP/hp_1.csv\" as row\n", + "MERGE (s:Character{name:row.source})\n", + "MERGE (t:Character{name:row.target})\n", + "MERGE (s)-[r:INTERACTS]-(t)\n", + "SET r.weight = row.weight\n", + "RETURN distinct 'import successful' as result\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t71ygjxRcuOK" + }, + "source": [ + "Now that we have our network imported, we can examine the community structure and calculate the node embedding with the help of the Karate Club package.\n", + "# Community detection\n", + "For those of you that are completely new to Neo4j, I must let you know that Neo4j Graph Data Science plugin provides a couple of community detection algorithms out of the box. I will quickly demonstrate how to use the Louvain algorithm in the GDS library. 
First you have to project an in-memory graph" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "mex15wrau6zV", + "outputId": "22175cae-6ff5-4c5a-8a77-eac28ff2d838" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "N4mijxU8i7zw", - "outputId": "f7621460-00a1-48ff-ed1c-96db8c64a6da" - }, - "source": [ - "# KarateClub only works on nodes with consecutive ids\n", - "read_query(\"\"\"\n", - "MATCH (c:Character)\n", - "WITH count(*) as number, collect(c) as nodes\n", - "UNWIND range(0, number - 1) as index\n", - "WITH nodes[index] as node, index\n", - "SET node.index = index\n", - "RETURN distinct 'done' as result\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'INTERACTS': {'orientation': 'UNDIRECTED', 'i...got119812284
\n", + "
" ], - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 done" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0done
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } + "text/plain": [ + " nodeProjection \\\n", + "0 {'Character': {'label': 'Character', 'properti... \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'INTERACTS': {'orientation': 'UNDIRECTED', 'i... got 119 \n", + "\n", + " relationshipCount projectMillis \n", + "0 812 284 " ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.graph.project('got', 'Character', {INTERACTS:{orientation:'UNDIRECTED'}})\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Cp6fcVZRu8rY" + }, + "source": [ + "Then you can execute any algorithm on the projected graph" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 }, + "id": "BWPibvfActjL", + "outputId": "a3b165b1-6b2f-49ec-8f84-f88c251dd378" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "uXfnlMoJja-S" - }, - "source": [ - "# Define character mapping\n", - "character_mapping = read_query(\"\"\"\n", - "MATCH (c:Character)\n", - "WHERE ((c)-[:INTERACTS]-())\n", - "RETURN c.name as character, c.index as index\n", - "ORDER BY index\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
writeMillisnodePropertiesWrittenmodularitymodularitiesranLevelscommunityCountcommunityDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
0311190.348028[0.28802264068528716, 0.3480277366594676]27{'p99': 29, 'min': 8, 'max': 29, 'mean': 17.0,...303485{'maxIterations': 10, 'writeConcurrency': 4, '...
\n", + "
" ], - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RExm6VXYdOa9" - }, - "source": [ - "Now, we must export the relevant network data and construct a NetworkX graph model out of it. NetworkX graph can be constructed by only providing the edge list. The syntax of the edge list is:\n", - "```\n", - "[\"1 2 {'weight': 3}\", \"2 3 {'weight': 27}\", \"3 4 {'weight': 3.0}\"]\n", - "```\n", - "Now, we can go ahead and construct a NetworkX graph model of the Harry Potter universe." + "text/plain": [ + " writeMillis nodePropertiesWritten modularity \\\n", + "0 31 119 0.348028 \n", + "\n", + " modularities ranLevels communityCount \\\n", + "0 [0.28802264068528716, 0.3480277366594676] 2 7 \n", + "\n", + " communityDistribution postProcessingMillis \\\n", + "0 {'p99': 29, 'min': 8, 'max': 29, 'mean': 17.0,... 3 \n", + "\n", + " preProcessingMillis computeMillis \\\n", + "0 0 3485 \n", + "\n", + " configuration \n", + "0 {'maxIterations': 10, 'writeConcurrency': 4, '... " ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.louvain.write('got',{\n", + " writeProperty:'louvain'\n", + "})\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i0iCbJxSdC3I" + }, + "source": [ + "A critical detail of the network analysis is that the interaction network between the characters is undirected. I won’t go into the algorithms’ theory or their pros and cons. The goal of this blog post is purely to help you get started with integrating Karate Club and Neo4j. The ideas and differentiation of the algorithms may come in another blog post. Now let’s run some algorithms in KC. KC only works when nodes in the graph have consecutive ids. I don’t know the reason behind this choice; that’s just how it is. We can easily create a mapping to consecutive ids and store it in Neo4j." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "N4mijxU8i7zw", + "outputId": "f7621460-00a1-48ff-ed1c-96db8c64a6da" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 319 - }, - "id": "kDXnmI6cjNrh", - "outputId": "6fd9e026-50ca-414b-b3d4-311c64df4484" - }, - "source": [ - "# Construct a networkX graph\n", - "edge_list = read_query(\"\"\"\n", - "MATCH (s:Character)-[r:INTERACTS]->(t:Character)\n", - "WITH toString(s.index) + \" \" + toString(t.index) + \" {'weight':\" + toString(r.weight) + \"}\" as edge\n", - "WITH collect(edge) as result\n", - "RETURN result\n", - "\"\"\")\n", - "\n", - "edge_list = edge_list['result'].to_list()[0]\n", - "G = nx.parse_edgelist(edge_list, create_using=nx.Graph(), nodetype=int)\n", - "nx.draw(G)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0done
\n", + "
" ], - "execution_count": 8, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAb4AAAEuCAYAAADx63eqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deWAU9fk/8Pfskd1ALgyBhIQ7hIQjnHJUhaACCthv+cqlomhF/AltlaqllW8tHlDP0taCtIhVBCGASj0AESGAQuQKoEKI4SiJEEgCISRkN3vM74+4IYQcuzOzOzO779dfGrKTT7LJPPM5nucRRFEUQUREFCIMag+AiIgokBj4iIgopDDwERFRSGHgIyKikMLAR0REIYWBj4iIQgoDHxERhRQGPiIiCikMfEREFFIY+IiIKKQw8BERUUhh4CMiopDCwEdERCGFgY+IiEIKAx8REYUUBj4iIgopDHxERBRSGPiIiCikMPAREVFIYeAjIqKQwsBHREQhhYGPiIhCikntARAR6UFJhR3r9hcit6gc5TYnoqwmpMZHYeKAJMRGWNQeHvlAEEVRVHsQRERadaigDIuy8rE9rxgAYHe6a//NajJABJDRPQ4zhyejT/sYlUZJvmDgIyJqxIrsU5i/IRc2pwtN3SkFAbCajJg7JhVTh3QK2PhIGi51EhE1oCboHUWVw93s54oiUOVwYf6GowDA4KdxPNxCRFTPoYIyzN+Q61XQq6vK4cb8Dbk4XFjmp5GREhj4iIjqWZSVD5vTJem1NqcLi7PyFR4RKYmBj4iojpIKO7bnFTe5p9cUUQS2HStGaYVd2YGRYhj4iIjqWLe/UPY1BADrDsi/DvkHAx8RUR25ReXXpCxIYXO6kXv2skIjIqUx8BER1VFucyp0HYci1yHlMfAREdURZVUmyyvKalbkOqQ8Bj4iojpS46NgMcm7NVpNBqQmRCo0IlIaAx8RUR0TBiTJvoYIYEJ/+dch/2DgIyKqo3WEBcNT4iAI0l4vCMCI7nEsXK1hDHxERPXMykiG1WSU9FqryYiZGckKj4iUxMBHRFRPn/YxmDsmFeFmH2+RTjt+dXMi0pPYpUHLGPiIiBowdUgnzB2ThnCzsdllT0EAws1GDLUW4d/PPITKysrADJIkYVsiIqImHC4sw+KsfGw7VgwBNcnpHlaTATa7HX3amPHClJvQOzEav/zlL1FaWoqPPvoIRmPNcimb2GoLAx8RkRdKK+xYd6AQuWcvo9zmQJTVjNSESFz5bit2bduMtWvXAgCqq6sxZswYpKWl4eHfPY/FWcfZxFZjGPiIiGQoKytDp06dsPdwLr48WYnconJcuFyFnfsOQmjVASIENHWTZRPbwGPgIyKS4VBBGR56dRUutUyCyWSSXOcz3GzA3DFpDH4BwMBHRCRRTZf2XNgcriZndd4KNxuROWMIT4X6GU91EhFJUBP0jqJKoaAHsIltoDDwERH56FBBGeZvyEWVQ177ovrYxDYwGPiIiHy0KCsfNqfLL9dmE1v/Y+AjIvJBSYUd2/OK4a/TEWxi638MfEREPli33/+zMTax9S8GPiIiH+QWlUtOWfAWm9j6FwMfEZEPym1Ov16fTWz9j4GPiMgHUVaTX6/PJrb+x8BHROSD1PgoWEz+uXWyiW1gMPAREflgwgD/zcbYxDYwGPiIiHzQOsKC4Slxzfbo81VNrc5UlisLAAY+IiIfzcpIhtVkVORania2LFAdOCxSTUQkwdVanRJTG5zVsFitGNE9DjMzkjnTCyAGPiIiiWq7MzhdzVZyMRkE9GoXhdgICyKtJqx68zVkvf0S0rq0D8xgqZZ/z+USEQWxqUM6IT0pBouz8rHtWDEE1JQc87CaDHC6XBDOfo+1z81A3w6tav/t+IpyfH/gGwY+FTDwERHJkJ4UgyVTB6K0wo51BwqRe/Yyym0ORFnNSE2IxN39EjH29
hdw9OsO6NvhntrX3XLLLdixYwcmTJig4uhDE5c6iYj8bNu2bXjkkUdw5MgRhIWFAQCys7Px2GOPIScnR+XRhR6e6iQi8rMRI0aga9euWLZsWe3H+vfvj/z8fJSVlak4stDEwEdEFAALFizACy+8gMrKSgBAWFgYBg0ahF27dqk8stDDwEdEFAADBgzALbfcgr/97W+1Hxs2bBh27Nih4qhCEwMfEVGAvPDCC/jLX/6C0tJSAFcPuFBg8XALEVEAPfroo4iOjsYrr7yCK1euoE2bNjh//jxatGih9tBCBmd8REQB9Kc//QnLli1DYWEhWrRogd69e+Obb75Re1ghhYGPiCiA2rVrh0ceeQTPP/88gJp9vp07d6o8qtDCpU4iogC7ePEiUlJS8NVXXyE/Px8LFy7Eli1b1B5WyGDgIyJSwUsvvYQDBw7gpYX/wOB7Hsc9jz2NimoXoqwmpMZHYeKAJDak9RMGPiIiFXyTX4SJzy6FpXN/OBzVgDGs9t+sJgNEABnd4zBzeDL6tGfnBiUx8BERBZinq0OVwwmg8Y62glDTlX3umFT26lMQAx8RUQBJ6eNX052djWqVwlOdREQBcqig7KeZnm/Na6scbszfkIvDhazrqQQGPiKiAFmUlQ+b0yXptTanC4uz8hUeUWhi4CMiCoCSCju25xU326m9MaIIbDtWjNIKu7IDC0EMfEREAbBuf6HsawgA1h2Qf51Qx8BHRBQAuUXlsDt929urz+Z0I/fsZYVGFLoY+IiIAqDc5lToOg5FrhPKGPiIiAIgympS6DpmRa4Tyhj4iIgCIDU+ChaTvFuu1WRAakKkQiMKXQx8REQBMGFAkuxriAAm9Jd/nVDHwEdEFACtIywYnhIHofEKZU0SBGBE9zgWrlYAAx8RUYDMykiG1WSU9FqryYiZGckKjyg0MfAREQVIn/YxmDsmFeFm3269NbU6U5GexC4NSlDmmBEREXnFU2h6/oZc2Jyupiu5iG6Eh5nZnUFh7M5ARKSCw4VlWJyVj23HiiGgJjndw9OPz35yP+bfl4H77rxZtXEGIwY+IiIVlVbYse5AIXLPXka5zYEoqxmpCZGY0D8JmcuXYdOmTfj444/VHmZQYeAjItIom82GLl26YNOmTUhPT1d7OEGDh1uIiDTKarVi9uzZ+POf/6z2UIIKZ3xERBp2+fJldOnSBbt27UK3bt3UHk5Q4IyPiEjDIiMjMWvWLLz88stqDyVocMZHRKRxpaWl6NatGw4dOoT27durPRzd44yPiEjjYmNj8fDDD+O1115TeyhBgTM+IiIdOHv2LHr27Inc3Fy0adNG7eHoGmd8REQ6kJCQgClTpuCvf/2r2kPRPc74iIh04uTJkxg4cCCOHz+OmBjW7ZSKMz4iIp3o3Lkzxo0bh8WLF6s9FF3jjI+ISEeOHj2KjIwMnDhxAi1btlR7OLrEwEe6VFJhx7r9hcgtKke5zYkoqwmp8VGYOCCJjTop6E2YMAG33HILHn/8cbWHoksMfKQrhwrKsCgrH9vzigEA9gYq2md0j8PM4cno0557IBSc9u/fj1/84hfIz8+HxcIHPV8x8JFurMg+5VUPM0Go6VbNHmYUzO68807cfffdmD59utpD0R0GPtKFmqB3FFUOd/Of/JOartVpDH4UlHbu3ImHHnoIubm5MJnYU9wXPNVJmneooAzzN+T6FPQAoMrhxvwNuThcWOankRGp55ZbbkFCQgLWrl2r9lB0h4GPNG9RVj5sTpek19qcLizOyld4RETaMHfuXCxYsABut28PhaGOgY80raTCju15xU3u6TVFFIFtx4pRWmFXdmBEGjB69GiEhYXh008/VXsousLAR5q2bn+h7GsIANYdkH8dIq0RBAHPPPMM5s+fDx7X8B4DH2lablH5NSkLUticbuSevazQiIi0Zfz48SgvL8fWrVvVHopuMPCRppXbnApdx6HIdYi0xmAw4A9/+APmz5+v9lB0g4GPNC3Kqswx7
SirWZHrEGnRPffcg5MnT2L37t1qD0UXGPhI01Ljo2Axyfs1tZoMSE2IVGhERNpjNpvxu9/9DgsWLFB7KLrABHbStJIKO256eausfT6LyYBdc25lDU8KajabDV26dMHGjRuR2DWVtWybwMBHmjfjvX344ug5SSkNggCM7tEWS6YOVH5gRBrz9Ev/wJYzRlyJ7gSAtWwbw6VO0rxZGcmwmoySXms1GTEzI1nhERFpz4rsU/iksisuhCfB7nRft0pi++ljm4+cw5Sl2ViRfUqdgWoAAx9pXp/2MZg7JhXhZt9+XWtqdaYiPSl0n2wpNHhq2dqcbgiGpv9ORBGocrgwf8PRkA1+DHykC1OHdMLcMWkINxsBNL3mKQhAuNnIAtUUEljL1ncMfKQbU4d0wvJp/eA6dQBmQ82eRV1WkwEWkwGje7RF5owhDHoUEljL1nfsZUG6sm/zh+h/ZT/e/sMcrDtQiNyzl/HBJxvRMSEOU+8cgQn9eWqNQoeStWxD6e+GpzpJN5xOJ1JSUrBy5UoMHTq09uPJyckYNmwY3n77bRVHRxR4S7Yfx8ItebLSfawmA2aPTMGjw7oqODJt44yPdGPt2rVo3779NUEPAMLDw1FRUaHSqPSvpMLOnC+dYi1baRj4JOCNIvBEUcRLL72El156qfZjnvehesA9ONIiCk9k5vB98MGhgjIsysrH9rxiAPVzvoqwcEsec740jrVspWHg8wFvFOrZuHEjAOCOO+64/n1I6AM7gPUHz/B98FLN8fdc2JyuBveHbD/9bm8+cg478kowd0wqDwtpEGvZSmOcN2/ePLUHoQcrsk/h8cyDyDt/GU63CJf72ruF52MnSiqx/uAZxISbmD+moIcffhizZ8/GocpIvg8yeXK+vD3+7nSL2H2iFDHhZv4sNeZU6RXsPXXhur8DX1hNBtzVpx0GdrxBwZFpG9MZvHD1RtHw03FdTA5V3tdff43CwkJUdxjM90Em5nwFlwkDkmRfQwQwob/86+gJlzqbIfdGkZ4UExJPyUrsezZ2jY//+jdM/c1cvPR5Ht8HmZTI+WLdU+1oHWHB8JQ4WbVsR3SPC7k9cQa+ZvBG0TQl9j2bukaY8SzsyfeguDICNke1pDGGwvvgDeZ8BadZGcnY+UMJqhy+36dCtZYtA18TeKNomhIHJJq7RrVLhGAKQ3GFtKAHXH0ffjh3GV/mng/Z07jr9hfKvoYAYN2BwpDK+dI6Ty1bX/ZtgdCuZcvA1wTeKBrnywGJuvttAGqDn6+HLORwuNy44+87YTIIIXsalzlfwcvzN9XUQ6SHINTM9EL5pC4DXxN4o2iYEvueoghJ15DKLQIQrz8FGkrH9pnzFdymDumE9KQYLM7Kx7ZjxRBw9fcbAOCshsVqxYjucZiZkRySMz0PBr4m8EbRMCX2Pd0iJF/DHxqblQYT5nwFv/SkGCyZOhClFfbaWrblNgcirSasW/pXrP3L73Fjepraw1QdA18TeKO4nhL7nltziwGIkq/hT8F8CjQ1PgoWU5Hsuo6pCZEKjor8ITbCct32SsUXZnz15SYGPjCPr0k1Nwp5P6Jgu1Eose/pbmDJUUuCtVULc75C21133YVPPvlE7WFoAmd8TZgwIAkLt+TJuoYebhS+5OApse/p1HDQA4L3NG5syzB0slQit9rSbJfuhoRqzlewuP3223H//fejrKwMMTHBtZrhKwa+JgR7cqiUHDyl9j21LthO4+bn52PGjBm4KETBOnQG7C7ff6FDNecrWLRs2RK33HILNm3ahClTpqg9HFVxqbMZszKSYTUZJb1WcDvx0OBEhUekjBXZpzBlaTa+OHoOdqf7ulmc7aePbT5yDlOWZteW/VJq31PrguU0rtPpxGuvvYYhQ4Zg3Lhx2Lf5A/xxXA+Em3370w/lnK9gwuXOGgx8zfAkh/p6o7CaDOhSdhDT7hqB/fv3+2l00sipParEvqfJIMAoyLpEQOj9NO6hQ4cwZMgQbNy4EXv27
MFvf/tbGI1GTB3SCXPHpCHcbITQzPsgCEC42Yi5Y9KC8qRrqBk7diw2bdoEpzM0Vm4aw8DnBSk3iv8bm4YtS/6EefPm4c4778TLL78Ml0v94/tyc/B6JETJHoNBEGA0aD/y6fU0rs1mw9y5czFy5EjMnDkTW7ZsQZcuXa75nKlDOiFzxhCM7tEWFpMB1noPM1aTARaTAaN7tEXmjCEMekGiffv26NChA3bt2qX2UFQliKIWD5Vr0+HCskaTQ60mA0SgweTQ06dP4/7774fBYMDy5cvRvn37wA/+JzPe2ydrz3J0j7Zwi1D1GoFgNRkwe2SK7vb4vvrqK0yfPh29evXCG2+8gYSEhGZfUz/nK8pqRmpCJCb0D41SbqHm2WefRVVVFV599VW1h6IaBj4JpNwoXC4XXn31VSxcuBBvvPEGJk2aFOBR15zevOnlrbJOZVpMBiy9fyAeXbFfUlHccLMRmTOGQBSBKUuzJV0jECwmA3bNuVU3N/7y8nL84Q9/wPr16/HGG2/gf//3f9UeEmnU3r17cf/99yM3N1ftoaiGgS/A9u3bh3vvvRc/+9nP8Pe//x1RUfKXDr21ZPtxLNySJzuBefbIFLQMM0osipumSq1OX3hmpXrp5vDZZ5/hsccew6hRo/Dqq6+iVatWag+JNMztdiMxMRE7duxAt27d1B6OKrjHF2ADBw5ETk4OwsLC0K9fP+zevTtgX1vJ2qNKHJDw5RqBpJdj+8XFxbjvvvvwm9/8Bv/+97/x1ltvMehRswwGA8aOHYtPP/1U7aGohoFPBS1btsS//vUvvPbaaxg/fjyee+65gJyyUrr2aHMHJIxwwSC6mjwg4e0hixHd42AOwFFQPRzbF0URK1euRO/evZGQkIBvv/0Wt912m9rDIh0J9bQGLnWq7MyZM3jwwQdRUVGBFStWXHf6TklPZOZg/cEzsq8zvm8iFk7ue83HGtr3jDXasOip+1GQfxQmU/P5f55rrPx0GwzWCPTrmXrN3qmcgzk1NXQaD5x6adVy+vRpPPbYYygoKMCyZctw4403qj0k0qHKykokJCTg9OnTIVnFhTM+lbVr1w6bNm3CxIkTMXjwYCxfvhz+ehbxZ+1RT1HchZP7Ytm0G7Fwcl/834Qh6Bgfi82bN3t1bc81BjqP4LbIInSPj8TRs+V4+oPDeCIzBwnRVliM0sZvEASYjYJuj+273W4sWrQIAwYMwNChQ7Fv3z4GPZKsbhWXUMQZn4YcOnQI9913H3r16oU333xT8f0apU51+nLaccmSJdi6dSvWrFnj1ecfKijD/3vjI5w3tobJZKpXRs0Ah8sNET/11/OSCW7M+5/euLNXgqrH9n2piVpXbm4upk+fDgBYunQp0tJYXZ/kW7JkCXbu3ImVK1eqPZSAY+DTmKqqKsyZMwf/+c9/8O677yIjI0PR6yuRx+fLaceLFy+ic+fOOHnyZLOBvOaUZy6qqh2A0PTMThRFCM2ciBEAuJ12PHtXLzw8LMXrMSut6ZqoNfmfDXWAdzgceOWVV7Bw4UI899xzeOyxx2CQUFyaqCEFBQXo27cvzp0759VWRDAxzps3b57ag6CrzGYz7rzzTnTv3h3Tpk1DSUkJhg0bBqNRWr3Q+jrc0ALrD56R1CEh3GzEK3eno22U1fvXhIfjwIEDuHLlCgYObDxgXpPa4MURT0EQIECEIIowG4VrZoBWkwFGg4DwCz9gcicnnpwyyuvxKm1F9ik8nnkQeecvw+m+vh2T52MnSiqx/uAZxISbkJ4Ug3379mHcuHEoKyvDJ598gpEjRzYb6Il8ER0djdWrV6NXr17o2LGj2sMJKM74NOz8+fN4+OGHcfbsWaxcuRLdu3dX5LpS8ufq5+D54rPPPsOLL77YaOrGoYIyycnsZgG48QY78k4Vos+NQ2uXLxPtBfj1jIdw7NgxhIWFAZC+1CiVlJ+z1WxAWtVR7H7vFbz++uu49957GfDIb0K1igsDn8aJooglS5bg2
WefxYIFCzB9+vRGb4S+3Ng9y4o2Z9OFqgUAbocd0we0wh/vGSHpe3A6nUhKSsL27dsbDN5yl1+7Wa+g849b8M9//hNAzc9s2LBheOSRR/DAAw9IXmqUQ04wN7id+PfUdAzv3VmRsRA1JlSruDDw6cTRo0dx7733omPHjnjrrbfQunXr2n+TemP31B7dmlsMtyhes/xpMggwCAJuTY1DD/yIhX98Ajk5OZKPPj/11FMICwvDggULrvm4EgdujHBjrCsbf39lPgBgw4YNeOqpp/Dtt99i1d4C7wK8wukMgd5LJZIiVKu4cKdcJ9LS0pCdnY1u3bqhT58+tSkCUvvqATUth2pinXhdCkXN/4sQAQwfPhzjxo3D9OnTJadaTJs2De+99x7OXbqCJduP44nMHPzy3b144O09cLrkliwTURTeCUDNH/LcuXPx4osv/hT0pLVfkqOkwo7tecWSC3DX7QBP5E+hWsWFh1t0xGQyYdSoUejbty9++ctfYvsZNz46JcDm5R6S0y1i94lSxISbcbiwrPbQhctdk95dlwjAJaL20MWE/xmDTSv/CVEUJeWPFVVb8OFpM5YdrsQ3Jy/g+zPlOFlSieIK+3Vf21ciDIi2CLhveC+sWbMG2dnZmPbbZ/F45iGf64DW/IwuYFi31j4d4qlr+e7/IvtE6XUHWXxhNgiIbmHGwI43SL4GkTdEUcTy5csxbdo0tYcSMFzq1Kkd3/0XD753EG6D78eQzUYBBgB2l/dvfbjZgOkDY/HSw2Px5ZdfIj093evX1qYpOJxoqnqKHN0jHPhszjj06NEDixYtwtqiVqotNfqzQg6R0kKxiguXOgOgpMJ+zfLeE5k5WLL9uKylrBU5xRCN0nJvHC7Rp6AH1DSifWvfBTw5/2+YPHkyKisrvXpd3W7v/gp6ABAdbsa7776LxMRE9B1yi6pLjUrXRCXyp1Cs4hJaWYsB1vShkyIs3JIn6TSh3D0kqWxOF061SMHgwYPx61//Gm+//XaTny+127vPXA50jAnHc/OeQ2ZmJj448KPsSwoA1h0olNSINsqqzJ+VXjvAk/54ilZPmTJF7aEEBGd8fiLn0Elz1u0vVHi03vHMhJ5/+S/YtWtXs6WOFmXlw+YMTKNZR95X6Nu3L4YOHapo+yUpUuOjECazk0RjNVGJ/GHs2LHYtGlTQLrEaAFnfH7gS+Jy3dOEALw6Sq/EjV0qAcDGYxexevVqjBw5EoMGDWrwGHSgZqWCADgLDuP9L/+JLVu2ANDCUqOIah+Xkq+/AjChf5KsaxB5q3379khKTsPcFdtgD28dkAIPamLgU5jU5b0qhxvzN+QiPSmm2V5wSt3YpfDMhB4d1hfz5s3DlClTsGvXLlgs1/5hBGpWKgAoP/oV7hgxovbAjZpLjct3ncRrnx+FnMUUQQBGdI8LupsNaZNnS+byrXOwNrcKbuHqwSw5WzJaxqVOhclZ3rM5XVicld/s5yl1Y5fKMxOaOXMmOnbsiDlz5lz3OYGalbpFIPr2RzH4vqdqP+bP9ktNWfvlHjy7/iCcotyvrY8O8KR/dbdk3DDALVxbE1jOloyWMfApKFCJy0rc2OXwzIQEQcCyZcuwfv16fPzxx9d8TiBnpQazFUv3ldT+UU4YIH+J0JelxoqKCjz55JOY/dYmwCjvQIoeOsBTcKh74jqQBR60gIFPQUos73lOEzZFiRu7VPVnQq1atcL777+PRx55BAUFBbUfD3Rg9iwVHy4sQ+sIC4anxHnT5KFBviw1/uc//0HPnj3xY2k5WnQdCLkpG4/f1k2zzXApeMjdkjlcWOankQUGA5+CAnWaUO6NXY6GZkI/+9nPMHv2bNx77704cKoUM97bh83fFwV8bHWXimdlJMNqktbKyZulxoKCAowfPx5PP/00/v3vfyPjod9DlFGpBQDCjAI7MVBABGJLRst4uEVBhRerFLmON6cJZ2UkY+cPJZKq/3vTxLUhTc2Efve73+HDw+cx8Z9fw
y0YA55jCFy7VNynfQzmjknFn9YfhkvwPgDWXWpsqNtFSpsIlOVswl8WPIdf/epXWLVqFaxWK/760odwiPIOo1S7RMkpFETeUnJLRq8HsBj4FLIi+xRyTl9U5FrenCb03Nh97fcGQPKsIswoNDgTKqmw45mPvkVJp9tqil6rXATPk3geUXQQrn3rYB16H+wut9fdGXonxmDGe/saLDwAlwOC0AGjXliH8eP6wmKx4OWXX8bh70UgqbfssbNaC/mbklsyUgo8aAEDnwI8m8QyU7cA1PRiM185D7fbDYOh6ZVoz16QN2135BIddlTuXgvjlI7ATwcvPMegtx07D4cS37wC7E433v76JI7+twjrPt6LQeMeQGR0NM5esuFUyRUYhJrlZA9P26YR3eMwMyMZhwtr+ug1+vM0miEC2F1QiclLdyOl4juc2rISo2YvweY8+Q8+rNZC/qZ2gQctYJFqmeQ0HG2IEW5YN8/HlQvnMG3aNEybNg2dOzfdkNTTV2/bsWIIuPbGHmYUZCdTh5uNGBFzEZnzfwUA+PDDD1Fg6RiQgCuZKKLuJqjVZIBbFNGpdUu0iw6H0SDUdmuf0L8mQVdKx3TB5cAf7uwOF0x4/YtjcIrS9+isJgNmj0zR7VM06cMv392LrbnnZV/nttQ2WDbN904tWsAZn0xKluUSBGBkjwS8uWA3cnJy8M4772DQoEHo1asXHnzwQdx9992IiIi47nXpSTFYMnUgSivsWHegELlnL6Pc5kCU1YzvzlzCD+crJI+pbaQFSx8YiPSkGPSPqsQzzzyDyXPfQOSwB+GQcZP3u3rLuZ6HgR/OV6DgQtV1DWelnnITjWa88kU+3NuXwH3zDEBCtwyPapcbafFRkl9P5A3WkmU/PllKKux49uPvr+lcLke42YhX7k5HfHQ4EhIScOedd+Lxxx9HTEwM3n//fTzxxBM4duwYWrVqhQ4dOly3V9cizISBHW/AHb3i8T99EzGwUyss3JInqy+c0y1i9u0paBFmQv/+/VFpbY1vo26U1A5JK+r2JfTky8358DDyi6U9ILhFEQMG/wy9OrXFiRLvulY0RASw6fsixISbmMdHfnOq9Ar2nrog675gNRlwV592uu0XyXQGGZQsy9VY4rLFYu+yPzgAAB14SURBVMHdd9+NTz75BLm5uejVqxdmzZqFbt264fnnn8epU6cavF5JhR1PrT0Eh8zu5na7HW98trf2/8/E9IZgCpN1TS3w5CN9sL8AD76zp/YgiySCAd+WujF1cAfJKRRXxxU8ScKkTYEu8KBFDHwyKFWWyygAc8ekNZu4HB8fjyeffBLffvst1qxZg+LiYgwcOBC33norli9fjsrKShwqKMOM9/bhppe3YscPxZA9GTWasXTtBsybNw9nL1T8FCA0vMTpgyqHC09/cBhZx2QEvZ8IAI4WXcbcMakIN8v7swqWJGHSpkAWeNAqBj4ZlCrL1b9DK5+qdQiCgAEDBuCNN97Ajz/+iFmzZmHt2rXoPOoBjF+0HV8cqWmFpNAKLIbdNhoHDhxAxi/nwO1WpyuEvyj1M/Kccps6pBPmjklDuFnezC8YkoRJu/xd4EHr9LtRowFKbRJfPvdffP21iL59+6Jly5a1H28ogbp+mxDPUmhV4gD88NkR2JzKH7HM+eZrfP7GG3h89QEc5CSkUZ4cvKlDOqHDDS3x4Dt7JAfWYEgSJu2SmgccLLVkGfhkqCkWXSRrudMkuIGLhfjtb/+K7777Dp07d0bK0JEoTxqC/1ZHwGAwNNu53XMi0R9Bz2oyoF+Xthg8eDAGPfU2VM9O17C6p9yOnC2H2WiQ9buh9yRh0jZf8oDrFngIhlqyzOOToaTCjpte3irr5mYxGbBrzq2IjbCguroar/9nD5blXITTDUBofCW67i/ijh9K8MXRc37JpwszCtj9+9tweO8uPPDmlzB2Har8FwkCJgGYOSIZLcJMyC0qx+4TpThX3nSXDW+M75uIhZP7KjBCo
oY1lwcsCEJtgQe9z/Q8GPhkmvHePslBRxCA0T3aYsnUgQB869zuYYQbEAyKVI25jtuNKz9k49aw43jnnXfwt81H8OZXBYCRCwXXEyEAMECES8Gtcz0nCZO+1M8D/uarLIwa1BNz77kt6JbbebhFJqU2iaUmULtgkJWP05Rwixlz7uqLzz//HDExMThwYD+DXqMEiBAUDXqAvpOESV9iIyx4dFhXLJzcF8um3YgR1tNoX3Es6IIewMAnm2eT2Ncj7PU3iWVVgPFDKxvP+J58aCIuXLiAsU+8hN0V+kxW1SspXeCJlNKjRw8cPXpU7WH4BQOfAuoeYW8uBglCTYWWunl7ctuEKEsEnNWwf7Ma//1yJU6fPo3DhZdwJCwVBrNV7cGFFL0nCZO+paWl4ciRI2oPwy+4bqWQqUM6IT0pptFN4vpdAOpuEitZAUYytxsGQURKpBMzbuqA+PGPYsWKFejXrx/iJz4LW6uuCJbEdT0IhiRh0re0tDQcPXpUcv9OLePhFj9oqFh03S4A9T2RmYP1B8+oMNKfuF2wnTqIyO8/QrjBhYsXL+LixYvo06cP0voNwtaIEXD70MyV5As3G5E5Y0jQnKIj/RFFEbGxscjNzUWbNm3UHo6iOOPzA88msbeUqgAjlSXMjC9ffxRHcvph586d2LlzJ0pLS3Hx4kXsKzXB3cINGBn4AiVYkoRJ3wRBqJ31MfCR4pSqACOFgJolta5JbdE16S7cddddAIArV65gz549eGHLaZS5eLIwEIItSZj0z3PAZfjw4WoPRVE83KIBNRVgVHorRDfiSg5i06ZNyMvLQ3V1NQCgRYsWyMjIQMduaeqMy89EUYQoaqPuqNVkgMVkwOgebZE5YwiDHmlGsB5w4YxPAyYMSMLCLXmqfO04YxXOH92L1z/LxIkTJ1BYWIj4+Hh06dIFXbp0wemEDADBl8YgCELNKdp6ndoDKT7KiqFdYpvc/yVSU1paGjZu3Kj2MBTHwKcBnjYh/io71pTC47m4uH8r4uPj0a9fP4waNQpWa03agt1uh+tCARAV02T5NL1S86Sa1WTAQzd1Yh1O0jTPHl+wCb67mU7JqQAjR5jgQvfu3dG7d2/07NkTXbp0QcuWLVFRUYHCwkJUHP6Cdan9gDl6pAcdOnTAxYsXUV5ervZQFMXApxFSK8DIYTUZ8NAvRmHixIkwGo344osvMG/ePKxduxaVlZXIyMjAa/PnIS6SS3BKYo4e6YXBYEBqamrQzfq41KkhvrQJUYII4FdjByI24iZMmzYNbrcbVVVV+O6777B//37s378fK1euxJme98DSLlW1vbBgEwyNPCl0eJY7Bw8erPZQFMPApzHNVYAxCMp0DRdFEZePZaND2wlwOBxwOp0QRRFmsxlmsxkmkwkmkwlmsxlhKZUMegphjh7pTTDu8zHwaVB6UgyWTB3YYAWYwz+W4XhxpfwvIrpRtisTkRERiI2NRVxcHFq3bo3Y2FjExsZe899v54ch57y6SfbBwCAA04Z2YroC6UpaWhreeecdtYehKAY+DWuoAswTmTmKBL7eiTH44PghlJaWorS0FCUlJbX/XVpaih9//BGHDx9GaWkpcjveBUTxIIZcogi8u/u/SGoVzuBHuhGMMz7W6tSZJduPY+GWPFld3wHAaBBwe1obzByejD7tG192O1RQhv9982v/NLoNUTXLnWkMfqQLDocDkZGRKCsrq0110jue6tSZCQOUmXm53CI2HzmHKUuzsSL71HX/brPZsHr1akxa8D6cfmp0G6qqHG7M35CLw4Vlag+FqFlmsxldunRBXp46RTb8gYFPZzzJ7koQRaDK4cL8DUexIvsURFHEvn37MGvWLLRt2xZPvrURtugOQdeSRBplg7/N6cLirHxFr0nkL8G23Mk9Pp05VFCGS1UORa9Z5XBj3n++xZ+ffgwleQdgsVgQ1X8szDdOhJs9+GqIAEQX3C4nDGb5+XeiCGw7VozSCjvz+Ujzgi3wc
canIyuyT2HK0mzsOXVB8Ws73UB50lB07NgRo++dAdONkxj06hIEWMNMGN07CRmdWqC12QnILHItAFh3QANNiImaEWzFqhn4dGJF9inM33AUVQ4/JbYbDDAk9caB7/OwPq+KjWcbYHOK+Or4Bfx2bD/c3LOD7PqlNqcbuWcvKzQ6Iv/xtCcKFlzq1IFDBWWYvyEXVQ7/ttERRRGRA8ahRdeBEAx8JmqIZ2+uWqFjruU2ZZetifyhdVJnnInuicdXHcDlaheirCakxkdh4gB9dhVhOoMOzHhvX8A6N7Q2VqEcLRS7sQcji8mAW1PbYON3RbKvNb5vIhZO7qvAqIiUd6igDIuy8rE9rxh2mw0whdX+m9VkgAggo3tcs2lRWsPAp3ElFXbc9PJW2Xl73mobZcG5cntAvpZehRkF/Cy5NXYfL5X1vliMAoYmt0ZMuBnlNqfun6IpuNRsrzRfN1gQaurPzh2TqpvcVAY+jVMqYd1b8VFWFJXbAvK19Mx6/ggcbdLgknkAyGIyXPPe6vkpmoLH1TMF3t939FSYgRs5GpdbVB6woGcxGVBdXR2Qr6V3yd3T0KLqHOQ+N9Z/b21ON+xOd5PFBYj8SeqZAj0VZmDg07hyW+CKQ9sdLpRUMvB5w9gyBtUR7fyW3F+/uABRoCzKyofN6ZL0Wr0UZmDg07goa2AO3oqiCAgCDEYe9G2OSQC+P1MOu8v/M3E9PUWT/pVU2LE9r1jyQbq6hRm0jIFP41Ljo2Ax+f9tYlky7zlFBLR+qV6eokn/1u2XX1BBD4UZGPg0yuFwYOPGjdj29p9hs/GwiWaIIgKSV1LvS+rhKZr0T4kzBXoozMDApyFutxs7duzAY489hnbt2uGFF17AzQP7IKN7G780QNflgV5RrBm3WmMXBFW60evhKZr0T6kzBVovzMANHZWJooicnBysWrUKq1evRqtWrXDPPfdgz5496Ny5M4CaU1Z7lmajyiFtw7kxml7e/GnPsfZ/XQ4IRjMgCKpVEDUaAAGCKm2a9PAUTfqn1JmCKKtZkev4CwNfA0oq7Fi3vxC5ReV+SyzOy8vDqlWrsGrVKlRXV+Pee+/Fpk2b0LNnz+s+t0/7GMwdk+pzXo1uud1o6b6MztEmtG3fGSUVdhwuuKhwYyDftYm04Owl9ZYbtf4UTfpXc6agSNZyp9VkQGpCpIKjUh4DXx11y/MAqJdYXISFW/JkJRYXFhZi9erVWLVqFc6cOYPJkyfj3XffxaBBg5qdfXmSQr2ppKAZ9WZt3hJEFwbjB6RZWmDUraMwZWk2RJkFoZUgiurOkLX+FE36N2FAEhZukddwVgQwob8yDbP9hYHvJ82V57H9FAQ3HzmHHXklXpfnKSkpwbp167Bq1Sp89913GD9+PF555RVkZGTAaPStA8LUIZ2QnhSDxVn52HasGA6XG5puji4IEEXRpyXVMAMQeSILnXregEuXLsnKKVKaqOKcUw9P0aR/nkbXUmsDCwIwonuc5kvuqf8YrQG+tPzxJrH48uXLWLFiBcaOHYvk5GRkZWXhySefxJkzZ/DWW2/htttu8znoeaQnxWDJ1IHYNedWpCVESbpG4Lmb7V0nut0wwoW7uwqIu5SL6OhoFJfbZOUUKa11S0tAUksaooenaAoOszKSYTVJuz9ZTUbMzEhWeETKC/nAp1R5HrvdjvXr12Py5MlISkpCZmYm7rvvvtrlzZ///OewWJR7CoqNsOCu9HYIM2r4gApqDtCIotBo7zqDAJiNAqp+yMas7g4Mbe2E2WxGVFQUfnDdEODRNk502pHcokqVr62Xp2gKDp4zBeFm38JDTa3OVKQnab++bMgvdcotzzNvzW60+n4dPvroI/Tp0wf33HMPFi9ejNjYWJRU2LFifyFyi36QdEimsUM2Y1JbYevGj/Hee6th7/8ohDqtQrSoqaVOtwiYBQFVp3IwcsAU5OXlISwsDNHR0bgoXgpYndLmmMMsyHxhJno++CJOCJEBnYXq5
SmagocvZwr02J0hpAOfEuV5DhTZMT0tHd8+/zwSExMB1Mwi/7Bhn+RDMk0dsjG4T+PPn7oRfaUC0x96DN+6ErDlmHaWA6WwO91oNeJhfHMhDC2rq2tnfHa3NhYkBAG4Pa0tfr99C/7noV9DGDQdosH3Px2TQYDRIPgUzPX0FE3Bpf6ZAgFXzzoAVzuJjOgeh5kZybr6HQ3pwKdEeR6rxYK4QeNqg57cQzLNvd5tMEEwAJejO+PNPCOmDY3EV8cvKJ7jF2iGMCte//IkHkh01874XFUVQCu1R3Z1xtUpKQbZG9bg50++hh9a9rqmKWdzDALQo10U4qOs2JFXDLvLHXRP0RR8PGcKSivsWHegELlnL6PwfClyvvkas2fchwn99dk7MqQDn9LleXzpYVX3kAxQ83Ql5fXv7j6F29Pa4vPvzqJaG6uCktmcLmQVW5D004zPUfJfWDr0U3W5s/6MKzw8HJsX/R8efW0FPj/XEgZzGEQvUurdInC48BLyTJfhEkXERVhw8Uo1jIIQNE/RFLxiIyx4dFhXAEBVVRVu+N04PPzOMzCZ9BlC9DlqhShZnkfuIZlwsxEvfHoEdpdva5ZVDjc+zTkNMW8bzD1GwglBt8ueogjkV1qRaIlAdHQ0Kr/7EjH9f6HKWJqacQmCgH89fT9WbNyJZ1ZkwdyhL8wm0zUBrDGezymusMNiNGB4ShxahJlQbnMgympGakKkbp+iKTSEh4cjPj4eJ0+eRLdu3dQejiTa2ERRiZLleeQckqlyOPH0iq9gk7pcaTRh1N1T8cHMm5GREiftGo0I/KlREUXhHREVFYXy8z9iuMLfT3OsJgMsJgNG92iLzBlDmlxmnHrnLdjz+iNI2LMI0ae/Qq/4CBi8/HGJYk0Q3PFDMQZ0jMGyaTdi4eS+eHRYVwY90rzU1FQcO3ZM7WFIFtKBT4mWP1aTAe1vCJeZbybAbY2CYJA2FhECtueVIDEmHIM7x8r+ngwCkBzXEuP7JuLJUd3x+zu6I1DxzwUDKoxRCA8Ph9PpxIybO3odTKQwGQRkpMThttQ2GN83EbNHpmDXnFuxZOpAr5YZ4+PjsW3TJ0hrbcJ3hRd8LijAfnukR6mpqcjNzVV7GJKF9FKnUuV5tMBTvV+JfUu3CPROjMHCyX1rP3bg9EVsPnJe5ii94zJaIAgCoqKi0ClSQK/EaBwuvOSXr/XILZ0x5440Wdcwm80I63sXhCNFkn4fPP32lkwdKGscRIGSmpqKvXv3qj0MyUJ6xucpzyO1SYEnsfj0hSuq55t5Dtn4q63IgvHpMAdo2mcx1Cz5RkdH49KlSxjTK8FvM86TJZWyr1GbFiOxbwT77ZHe6H3GF9KBD1CmPI9SwUaumgMS/mkr0jrCguHdWgegD56Ii2IEDhWU1ezzlZdjwoAkmIz++VVVIuCEStdqIg8GPp1TojyPUsFGriirWbF9y4YKIlcf/AT+X9wVcFG0YsrSbBhShuHSpUuyZ+ZNfzX5ASdUulYTebRp0wYulwslJSVqD0WSkA98QE0O3dwxaQg3G5u9uQoCEG42Yu6YNEzq3w6fffYZDm3fCNGp7jKVJ1hNGCC/kHFDBZE//vhjbP9wOUaktJZ9/eYJqHK4cLHzrfgsryYYyJmZN0WJgBMqXauJPARBQPfu3XU762Pg+8nUIZ2QOWMIRvdoC4vJAGu9WZPnmPvI1DZ4oreIrf96HgkJCfjzn/+MO1KiYbFYVRp5DU+wUmrfsu6R+sLCQjzyyCN4//338eqk/gE74SkazFh3UsC0f3+D3SdK8cRtyT7PzL0hN+CEStdqorr0vNypjTU6jWioPE+5zYFIixGmyvMo/uYTfPhmJr7t1g2TJk3C888/j6SkmpnR8ff2yephFRdhQXGFXZEeWLMykrHzhxJJZczqF0R2uVy477778Jvf/AY33XQTAOC2tLbYfOSc7wOVwJOq8c2JCxABdIlriRPnK1Htbrrkly/kBpxQ6VpNVJeec/k44
2tAbIQF02/qhJ+3uYiw7Lfx/m/uwPYlf0TvlM7Yv38/du/ejdmzZ9cGPUD+IZk5o7sr1gNLybYiL774IoxGI37/+9/XfmxWRjLCzcovOzbF5nTD7nQjt+gyIABp8ZGKzDyVCDj+Wl4m0jLO+AKgsRY93rb48Ybb7cbXX3+NzMxMfPDBB0hISMCkSZOQnZ2NLl26NPlaT7DxttamhyfY3D2gfW3tTimvr59srURbkR07dmDJkiXYv3//NY1zpX6vShDFmm4OJ0sqMWtEMhZlHYdLRht6JQJOqHStJqorvmMyvncn4InMHL/dk/1FEEVtV3ZsqkWPp6BvUy1+muN2u/HNN98gMzMTa9euRWxsLCZPnoyJEyciJSXF5+s1113Bo7FgI/f19R0uLJPUVqS0tBT9+vXDkiVLMGbMGFnfq7+Em41IT4rGnpOlknLoBAEY3aOtIonjhwrKMGVptqTl5XCzEZkzhrAgNemC556clVcMW1UVDOarQU6Je3IgaDrwKR0EPERRxN69e7FmzRqsWbMGERERmDx5MiZNmoS0NHlVPADpwUap1zek7r5lSYUdF65UA6KIVi0taB0Rds2TmiiK+MUvfoHk5GS8/vrrsr7X5trvyCEIwODONyDnvxdgl1DmVOmA40t3jatjMGDumDS2HiJd8Nc9OdA0G/iUvomIooicnBxkZmZizZo1CAsLqw12vXr1UnDkV9U/JONr9X25r6/P29lz6/MH8Pn7/8KuXbsQFuZdz7nGxnqwoAwbvyvyeazespgM+H83tcdft+RBMHv/M/FXwAmWGwNRfcH0YKfJwKfUspEoijh8+HDtzM7tdmPy5MmYPHky0tPTIfgjI1qjvL4hA3A77HhieHvMvutG2V93yfbjWLglz28l3awmA2aPTMEff/873HDbdE00ePXHjJ1ITcG2lK/Jwy1yWvzYnC689HEOkou2Yc2aNbDb7Zg0aRJWr16N/v37h1Sw8/CpwS0AwWzBv/aWIC7ulOzgoEQh8KZ4EtATq05gzrAYZJ0zqx5wGkuLYb890iu592StFWHXXOCrLfgrcR4qisDXpy7hBlc1li9fjhtvvDEkg52H3Aa56UkxsoKE3BOP3ii3OdC5c2cYywqxZOpEzQScul2rifRKiXuypyauVh74NBf4lCj4G26xoPfIRzBoEG86WnhSk5NQ740oqxkxnTvj5MmTABhwiJSkZBF2rfxdai6BnQV/laPkk5ocUhPqveFJQO/SpQtOnDih+PWJQl0w3pM1F/hY8Fc5WmqX40shcF94EtA715nxEZFygvGerLnAx4K/ytHak5qnEPjgzjcocr26FU8Y+Ij8IxjvyZoLfP7sJxdqtPikVtO/UJk/gLo1Sjt16oTTp0/D7Q5sCTWiYBeM92TNBT4W/FWOFp/UPPuOclnr1SgNDw9Hq1atcObMGdnXJqKrgvGerLnA549+cqFKi09qSuw7AsDwbnHX5RjygAuR8oLxnqy5wAfIb/FTt0VPKNPik5oS+44A0CLs+tks9/mI/CPY7smaDHxK9pMLZVp8UvPnviMDH5F/BNs9WZOBD/Dt+Lsg1NSD02IxVLVp7UnNn/uODHxE/hNM92TNBj7g6vH30T3awmIywFpvv8pqMsBiMmB0j7bInDFEkz9gtWntSc2f+44MfET+FSz3ZE12Z2iIVuov6pVW2uWUVNhx08tbZe3zWUwG7Jpz63Xv+6lTp3DzzTejsFCZAzRE1Dg935N1E/hIPq20y5nx3j7JRaub6prudDrRsmVLlJeXw2LR9h8eEamHgS8Eqf2k5s/eXl27dsXGjRuRkpIid5hEFKQ0152B/E/t7gWefUdp3Zyb3nf07PMx8BFRYxj4SBWe/UOl9x15wIWImsPAR6qZOqQT0pNiGt13FNwOGE1m3J7W1ut9RwY+ImoO9/hIE67uO5Zj9YcfY8LPx2LHJ6sx/+FxGDcyw6trlFTY8ce3P8WeYz+iz6ChiLKakBofhYkDtH/Kj
IgCh4GPNCc5ORkbNmzA1KlT8Y9//AODBg1q8vMPFZRhUVY+tucVw+12o+62oee0akb3OMwcnow+7bVVQYKIAo9LnaQ5iYmJOHPmDGw2G6xWa5Of21x+omfpdPORc9iRV+K3/EQi0g8GPtKcdu3a4ccff4TNZmsyH68m6Hl3MlQUgSqHC/M3HAUABj+iEKbpkmUUmhITE2sDX2MzvkMFZZi/IdendAgAqHK4MX9DLg4XlikxVCLSIQY+0px27do1u9S5KCsfNqfvCfAAYHO6sDgrX84QiUjHGPhIczwzPrvd3mDg83Rxl3osSxSBbceKUVphlzlSItIjBj7SnOaWOpXo4i4AWHeAxayJQhEPt5CmlFTYsaM4DD92uB3Ribdjzvoj1+XiKdHF3eZ0I/fsZSWGTEQ6w8BHmlA3Fw8AhM6D0QLA+oNnYDUVYeGWvNpcPH92cSei4MfAR6rzNRcvpW2EIl+3oS7uRBT8uMdHqrqai9d0oWrgai7e92cuwWQQZH3dxrq4E1Hw44yPVCM1F8/pBiC6a9o2SCQCmNA/SfLriUi/OOMj1cjJxYOMCZ8g1HSaZ+FqotDEwEeqkJuLJyfyWU1GzMxIlvx6ItI3Bj5ShRK5eCYBPu/1edPFnYiCGwMfqUKJXDynCPRsF4Vws7HZ7T5BAMLNRswdk8YC1UQhjoGPVKFULl7rCAsyZwzB6B5tYTEZYDVd+yttNRlgMRkwukdbZM4YwqBHRDzVSeqIsirzqxdlNSM9KQZLpg6s08X9MsptDkRZzUhNiMSE/uzATkRXMfCRKlLjo2AxFcla7qyfixcbYcGjw7oqMTwiCmJc6iRVTBggP4eOuXhEJAUDH6midYQFw1PiJOegMxePiKRi4CPVzMpIhtVklPRa5uIRkVQMfKSaPu1jMHdMKsLNvv0aMhePiOTg4RZSlSe9oKnuDB6CUDPTmzsmlWkJRCSZIIrSi0YRKeVwYRkWZ+Vj27FiCLjaigioOb0pomZPb2ZGMmd6RCQLAx9pCnPxiMjfGPiIiCik8HALERGFFAY+IiIKKQx8REQUUhj4iIgopDDwERFRSGHgIyKikMLAR0REIYWBj4iIQgoDHxERhRQGPiIiCikMfEREFFIY+IiIKKQw8BERUUhh4CMiopDCwEdERCGFgY+IiEIKAx8REYUUBj4iIgopDHxERBRSGPiIiCikMPAREVFI+f905e8cd9Xj+QAAAABJRU5ErkJggg==\n" - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DkOH7iG1deLW" - }, - "source": [ - "Take special care to the create_using parameter. In this case, I wanted to define an undirected graph, so I have used the nx.Graph option. If you are dealing with a directed graph or even a multigraph, choose the according create_using parameter.\n", - "\n", - "Now that we have constructed the NetworkX graph, we can go ahead and test KC algorithms. We will begin with a community detection algorithm BigClam. We will calculate the community structure and write the results back to Neo4j." 
+ "text/plain": [ + " result\n", + "0 done" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# KarateClub only works on nodes with consecutive ids\n", + "read_query(\"\"\"\n", + "MATCH (c:Character)\n", + "WITH count(*) as number, collect(c) as nodes\n", + "UNWIND range(0, number - 1) as index\n", + "WITH nodes[index] as node, index\n", + "SET node.index = index\n", + "RETURN distinct 'done' as result\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "uXfnlMoJja-S" + }, + "outputs": [], + "source": [ + "# Define character mapping\n", + "character_mapping = read_query(\"\"\"\n", + "MATCH (c:Character)\n", + "WHERE exists { (c)-[:INTERACTS]-() }\n", + "RETURN c.name as character, c.index as index\n", + "ORDER BY index\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RExm6VXYdOa9" + }, + "source": [ + "Now, we must export the relevant network data and construct a NetworkX graph model out of it. NetworkX graph can be constructed by only providing the edge list. The syntax of the edge list is:\n", + "```\n", + "[\"1 2 {'weight': 3}\", \"2 3 {'weight': 27}\", \"3 4 {'weight': 3.0}\"]\n", + "```\n", + "Now, we can go ahead and construct a NetworkX graph model of the Harry Potter universe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 319 }, + "id": "kDXnmI6cjNrh", + "outputId": "6fd9e026-50ca-414b-b3d4-311c64df4484" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "K-q8oWR6jgPC" - }, - "source": [ - "from karateclub.community_detection.overlapping import BigClam\n", - "\n", - "model = BigClam()\n", - "model.fit(G)\n", - "results = model.get_memberships()" - ], - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "N5adb__ijyqB", - "outputId": "32fa2a9b-68e9-474c-e598-45d321fe5826" - }, - "source": [ - "data = [{'index': int(el), 'value': int(results[el])} for el in results]\n", - "read_query(\"\"\"\n", - "UNWIND $data as row\n", - "MATCH (c:Character{index:row.index})\n", - "SET c.bigClam = row.value\n", - "RETURN distinct 'done' as result\n", - "\"\"\", {'data':data})" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 done" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0done
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 10 - } + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] - }, + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Construct a networkX graph\n", + "edge_list = read_query(\"\"\"\n", + "MATCH (s:Character)-[r:INTERACTS]->(t:Character)\n", + "WITH toString(s.index) + \" \" + toString(t.index) + \" {'weight':\" + toString(r.weight) + \"}\" as edge\n", + "WITH collect(edge) as result\n", + "RETURN result\n", + "\"\"\")\n", + "\n", + "edge_list = edge_list['result'].to_list()[0]\n", + "G = nx.parse_edgelist(edge_list, create_using=nx.Graph(), nodetype=int)\n", + "nx.draw(G)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DkOH7iG1deLW" + }, + "source": [ + "Take special care to the create_using parameter. In this case, I wanted to define an undirected graph, so I have used the nx.Graph option. If you are dealing with a directed graph or even a multigraph, choose the according create_using parameter.\n", + "\n", + "Now that we have constructed the NetworkX graph, we can go ahead and test KC algorithms. We will begin with a community detection algorithm BigClam. We will calculate the community structure and write the results back to Neo4j." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "K-q8oWR6jgPC" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "2OyJcLrydoer" - }, - "source": [ - "KC API is very simple to use. You just define the desired graph algorithm and input the NetworkX graph model in the fit method and that’s it. Couldn’t be simpler than that.\n", - "\n", - "# Node embeddings\n", - "Again, the Neo4j GDS library provides node embedding algorithms like FastRP, node2vec, and GraphSAGE. I will show the syntax for FastRP algorithm, but again, won’t delve much into hyper-parameter optimization." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomaz/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n" + ] + } + ], + "source": [ + "from karateclub.community_detection.overlapping import BigClam\n", + "\n", + "model = BigClam()\n", + "model.fit(G)\n", + "results = model.get_memberships()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "N5adb__ijyqB", + "outputId": "32fa2a9b-68e9-474c-e598-45d321fe5826" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "G6WJX3-kduSv", - "outputId": "08e9873f-5d07-467e-e96d-3973d779c745" - }, - "source": [ - "read_query(\"\"\"\n", - "CALL gds.fastRP.write('got',{\n", - " embeddingDimension: 64,\n", - " writeProperty: 'fastrp'})\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0done
\n", + "
" ], - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeCount nodePropertiesWritten preProcessingMillis computeMillis \\\n", - "0 119 119 2 90 \n", - "\n", - " writeMillis configuration \n", - "0 97 {'writeConcurrency': 4, 'nodeSelfInfluence': 0... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeCountnodePropertiesWrittenpreProcessingMilliscomputeMilliswriteMillisconfiguration
011911929097{'writeConcurrency': 4, 'nodeSelfInfluence': 0...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 11 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MfyRm-_3dyCi" - }, - "source": [ - "The embeddingDimension parameter is mandatory and defines the size of the embedding vector for each node. Other than that, we have again defined the interaction network to be treated as undirected.\n", - "\n", - "Now let’s try out some of the node embedding algorithms in the KC package. First, we will define a function that will draw a t-SNE scatter plot of embedding results." + "text/plain": [ + " result\n", + "0 done" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [{'index': int(el), 'value': int(results[el])} for el in results]\n", + "read_query(\"\"\"\n", + "UNWIND $data as row\n", + "MATCH (c:Character{index:row.index})\n", + "SET c.bigClam = row.value\n", + "RETURN distinct 'done' as result\n", + "\"\"\", {'data':data})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2OyJcLrydoer" + }, + "source": [ + "KC API is very simple to use. You just define the desired graph algorithm and input the NetworkX graph model in the fit method and that’s it. Couldn’t be simpler than that.\n", + "\n", + "# Node embeddings\n", + "Again, the Neo4j GDS library provides node embedding algorithms like FastRP, node2vec, and GraphSAGE. I will show the syntax for FastRP algorithm, but again, won’t delve much into hyper-parameter optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "G6WJX3-kduSv", + "outputId": "08e9873f-5d07-467e-e96d-3973d779c745" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "mw9VEBndmVE8" - }, - "source": [ - "from sklearn.manifold import TSNE\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "\n", - "def tsne(embeddings, hue=None):\n", - " tsne = TSNE(n_components=2, n_iter=300)\n", - " tsne_results = tsne.fit_transform(embeddings['embedding'].to_list())\n", - "\n", - " embeddings['tsne_x'] = [x[0] for x in list(tsne_results)]\n", - " embeddings['tsne_y'] = [x[1] for x in list(tsne_results)]\n", - "\n", - " plt.figure(figsize=(18,10))\n", - " sns.scatterplot(\n", - " x=\"tsne_x\", y=\"tsne_y\",\n", - " hue=hue,\n", - " palette=sns.color_palette(\"hls\", 10),\n", - " data=embeddings,\n", - " legend=\"full\",\n", - " alpha=0.9\n", - " )\n", - " \n", - " for i in range(df.shape[0]):\n", - " plt.text(x=df['tsne_x'][i]+0.3,y=df['tsne_y'][i]+0.3,s=df.character[i], \n", - " fontdict=dict(color='black',size=10),)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeCountnodePropertiesWrittenpreProcessingMilliscomputeMilliswriteMillisconfiguration
01191190100106{'writeConcurrency': 4, 'nodeSelfInfluence': 0...
\n", + "
" ], - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CVj3tIdwd4cd" - }, - "source": [ - "We will begin with the NetMF algorithm. NetMF algorithm fall into the community-based node embedding category. If you want to learn more about the technical details, read the original paper or examine the code." + "text/plain": [ + " nodeCount nodePropertiesWritten preProcessingMillis computeMillis \\\n", + "0 119 119 0 100 \n", + "\n", + " writeMillis configuration \n", + "0 106 {'writeConcurrency': 4, 'nodeSelfInfluence': 0... " ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.fastRP.write('got',{\n", + " embeddingDimension: 64,\n", + " writeProperty: 'fastrp'})\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MfyRm-_3dyCi" + }, + "source": [ + "The embeddingDimension parameter is mandatory and defines the size of the embedding vector for each node. Other than that, we have again defined the interaction network to be treated as undirected.\n", + "\n", + "Now let’s try out some of the node embedding algorithms in the KC package. First, we will define a function that will draw a t-SNE scatter plot of embedding results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "mw9VEBndmVE8" + }, + "outputs": [], + "source": [ + "from sklearn.manifold import TSNE\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "\n", + "def tsne(embeddings, hue=None):\n", + " tsne = TSNE(n_components=2, n_iter=300)\n", + " tsne_results = tsne.fit_transform(embeddings['embedding'].to_list())\n", + "\n", + " embeddings['tsne_x'] = [x[0] for x in list(tsne_results)]\n", + " embeddings['tsne_y'] = [x[1] for x in list(tsne_results)]\n", + "\n", + " plt.figure(figsize=(18,10))\n", + " sns.scatterplot(\n", + " x=\"tsne_x\", y=\"tsne_y\",\n", + " hue=hue,\n", + " palette=sns.color_palette(\"hls\", 10),\n", + " data=embeddings,\n", + " legend=\"full\",\n", + " alpha=0.9\n", + " )\n", + " \n", + " for i in range(df.shape[0]):\n", + " plt.text(x=df['tsne_x'][i]+0.3,y=df['tsne_y'][i]+0.3,s=df.character[i], \n", + " fontdict=dict(color='black',size=10),)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CVj3tIdwd4cd" + }, + "source": [ + "We will begin with the NetMF algorithm. NetMF algorithm fall into the community-based node embedding category. If you want to learn more about the technical details, read the original paper or examine the code." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 675 }, + "id": "-igmAtnql7OZ", + "outputId": "30c30679-c69a-4a25-bfd6-882aa9de1afa" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 675 - }, - "id": "-igmAtnql7OZ", - "outputId": "30c30679-c69a-4a25-bfd6-882aa9de1afa" - }, - "source": [ - "from karateclub.node_embedding.neighbourhood import NetMF\n", - "\n", - "\"\"\"\n", - "dimensions (int): Number of embedding dimension. Default is 32.\n", - "iteration (int): Number of SVD iterations. 
Default is 10.\n", - "order (int): Number of PMI matrix powers. Default is 2.\n", - "negative_samples (in): Number of negative samples. Default is 1.\n", - "seed (int): SVD random seed. Default is 42.\n", - "\"\"\"\n", - "\n", - "model = NetMF(dimensions=64)\n", - "model.fit(G)\n", - "embedding = model.get_embedding()\n", - "\n", - "results = []\n", - "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", - " results.append({'character': name, 'embedding': embedding}) \n", - "df = pd.DataFrame.from_dict(results)\n", - "tsne(df)" - ], - "execution_count": 13, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/manifold/_t_sne.py:783: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\n", - " FutureWarning,\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/manifold/_t_sne.py:793: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n", - " FutureWarning,\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": { - "needs_background": "light" - } - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:795: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\n", + " warnings.warn(\n", + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "AvPxkrdGd8hG" - }, - "source": [ - "The KC library also features the NEU algorithm. The procedure uses an arbitrary embedding and augments it by higher order proximities with a recursive meta learning algorithm." + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from karateclub.node_embedding.neighbourhood import NetMF\n", + "\n", + "\"\"\"\n", + "dimensions (int): Number of embedding dimension. Default is 32.\n", + "iteration (int): Number of SVD iterations. Default is 10.\n", + "order (int): Number of PMI matrix powers. Default is 2.\n", + "negative_samples (in): Number of negative samples. Default is 1.\n", + "seed (int): SVD random seed. Default is 42.\n", + "\"\"\"\n", + "\n", + "model = NetMF(dimensions=64)\n", + "model.fit(G)\n", + "embedding = model.get_embedding()\n", + "\n", + "results = []\n", + "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", + " results.append({'character': name, 'embedding': embedding}) \n", + "df = pd.DataFrame.from_dict(results)\n", + "tsne(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AvPxkrdGd8hG" + }, + "source": [ + "The KC library also features the NEU algorithm. The procedure uses an arbitrary embedding and augments it by higher order proximities with a recursive meta learning algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 606 }, + "id": "m-bOrtHMmhVd", + "outputId": "56128b10-1d2b-447e-d395-c968c7257776" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 606 - }, - "id": "m-bOrtHMmhVd", - "outputId": "56128b10-1d2b-447e-d395-c968c7257776" - }, - "source": [ - "from karateclub.node_embedding.meta import NEU\n", - "\n", - "\"\"\"\n", - "L1 (float): Weight of lower order proximities. Defauls is 0.5\n", - "L2 (float): Weight of higer order proximities. Default is 0.25.\n", - "T (int): Number of iterations. Default is 1.\n", - "seed (int): Random seed value. 
Default is 42.\n", - "\"\"\"\n", - "\n", - "model = NetMF()\n", - "meta_model = NEU(T=3)\n", - "meta_model.fit(G, model)\n", - "\n", - "embedding = meta_model.get_embedding()\n", - "results = []\n", - "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", - " results.append({'character': name, 'embedding': embedding}) \n", - "df = pd.DataFrame.from_dict(results)\n", - "tsne(df)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [], - "needs_background": "light" - } - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:795: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\n", + " warnings.warn(\n", + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "sdzt_FjjeBDB" - }, - "source": [ - "Another node embedding category of algorithms is the structural role embedding category. Instead of capturing the similarity between nodes close in the network (neighbors), we want to capture the similarity between nodes with similar structural roles. One such algorithm is the Role2Vec algorithm.\n", - "\n", - "The default walk_length is 80. Given that our example graph has only 100+ nodes, I have decided to use a smaller walk_length value. Other than that, there is room for more hyper-parameter tweaking." + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from karateclub.node_embedding.meta import NEU\n", + "\n", + "\"\"\"\n", + "L1 (float): Weight of lower order proximities. Defauls is 0.5\n", + "L2 (float): Weight of higer order proximities. Default is 0.25.\n", + "T (int): Number of iterations. Default is 1.\n", + "seed (int): Random seed value. Default is 42.\n", + "\"\"\"\n", + "\n", + "model = NetMF()\n", + "meta_model = NEU(T=3)\n", + "meta_model.fit(G, model)\n", + "\n", + "embedding = meta_model.get_embedding()\n", + "results = []\n", + "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", + " results.append({'character': name, 'embedding': embedding}) \n", + "df = pd.DataFrame.from_dict(results)\n", + "tsne(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sdzt_FjjeBDB" + }, + "source": [ + "Another node embedding category of algorithms is the structural role embedding category. Instead of capturing the similarity between nodes close in the network (neighbors), we want to capture the similarity between nodes with similar structural roles. One such algorithm is the Role2Vec algorithm.\n", + "\n", + "The default walk_length is 80. Given that our example graph has only 100+ nodes, I have decided to use a smaller walk_length value. Other than that, there is room for more hyper-parameter tweaking." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 675 }, + "id": "RKFabIOmnL6O", + "outputId": "8f94df1c-7ed4-47c3-aeb3-df8fc3cc0323" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 675 - }, - "id": "RKFabIOmnL6O", - "outputId": "8f94df1c-7ed4-47c3-aeb3-df8fc3cc0323" - }, - "source": [ - "from karateclub.node_embedding.structural import Role2Vec\n", - "\n", - "\"\"\"\n", - "walk_number (int): Number of random walks. Default is 10.\n", - "walk_length (int): Length of random walks. Default is 80.\n", - "dimensions (int): Dimensionality of embedding. Default is 128.\n", - "workers (int): Number of cores. Default is 4.\n", - "window_size (int): Matrix power order. Default is 2.\n", - "epochs (int): Number of epochs. Default is 1.\n", - "learning_rate (float): HogWild! learning rate. Default is 0.05.\n", - "down_sampling (float): Down sampling frequency. Default is 0.0001.\n", - "min_count (int): Minimal count of feature occurrences. Default is 10.\n", - "wl_iterations (int): Number of Weisfeiler-Lehman hashing iterations. Default is 2.\n", - "seed (int): Random seed value. Default is 42.\n", - "erase_base_features (bool): Removing the base features. 
Default is False.\n", - "\"\"\"\n", - "\n", - "model = Role2Vec(walk_length=20)\n", - "model.fit(G)\n", - "embedding = model.get_embedding()\n", - "\n", - "results = []\n", - "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", - " results.append({'character': name, 'embedding': embedding}) \n", - "df = pd.DataFrame.from_dict(results)\n", - "tsne(df)" - ], - "execution_count": 15, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/sklearn/manifold/_t_sne.py:783: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\n", - " FutureWarning,\n", - "/usr/local/lib/python3.7/dist-packages/sklearn/manifold/_t_sne.py:793: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n", - " FutureWarning,\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": { - "needs_background": "light" - } - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:795: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\n", + " warnings.warn(\n", + "/home/tomaz/.local/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "H_ij2S-ze_NJ" - }, - "source": [ - "Calculating node embedding based on node role similarity is an exciting field. Instead of comparing the closeness of nodes in the network, we want to capture the structural role similarity between nodes. Then, we can use the structural role embedding to infer a kNN network and run a community detection algorithm to try and segment the nodes based on their network roles. First, we have to store the Role2vec results back to Neo4j." + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from karateclub.node_embedding.structural import Role2Vec\n", + "\n", + "\"\"\"\n", + "walk_number (int): Number of random walks. Default is 10.\n", + "walk_length (int): Length of random walks. Default is 80.\n", + "dimensions (int): Dimensionality of embedding. Default is 128.\n", + "workers (int): Number of cores. Default is 4.\n", + "window_size (int): Matrix power order. Default is 2.\n", + "epochs (int): Number of epochs. Default is 1.\n", + "learning_rate (float): HogWild! learning rate. Default is 0.05.\n", + "down_sampling (float): Down sampling frequency. Default is 0.0001.\n", + "min_count (int): Minimal count of feature occurrences. Default is 10.\n", + "wl_iterations (int): Number of Weisfeiler-Lehman hashing iterations. Default is 2.\n", + "seed (int): Random seed value. Default is 42.\n", + "erase_base_features (bool): Removing the base features. Default is False.\n", + "\"\"\"\n", + "\n", + "model = Role2Vec(walk_length=20)\n", + "model.fit(G)\n", + "embedding = model.get_embedding()\n", + "\n", + "results = []\n", + "for name,embedding in zip(character_mapping['character'].to_list(), embedding):\n", + " results.append({'character': name, 'embedding': embedding}) \n", + "df = pd.DataFrame.from_dict(results)\n", + "tsne(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H_ij2S-ze_NJ" + }, + "source": [ + "Calculating node embedding based on node role similarity is an exciting field. Instead of comparing the closeness of nodes in the network, we want to capture the structural role similarity between nodes. Then, we can use the structural role embedding to infer a kNN network and run a community detection algorithm to try and segment the nodes based on their network roles. First, we have to store the Role2vec results back to Neo4j." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "F6C3WsDuokMr", + "outputId": "279c4b5f-2bb1-47d0-e019-1c26a925561d" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "F6C3WsDuokMr", - "outputId": "279c4b5f-2bb1-47d0-e019-1c26a925561d" - }, - "source": [ - "df['embedding'] = [el.tolist() for el in df['embedding']]\n", - "data = list(df[['character','embedding']].T.to_dict().values())\n", - "\n", - "read_query(\"\"\"\n", - "UNWIND $data as row\n", - "MATCH (c:Character{name:row.character})\n", - "SET c.role2vec = row.embedding\n", - "RETURN distinct 'done'\n", - "\"\"\", {'data':data})" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
'done'
0done
\n", + "
" ], - "execution_count": 16, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " 'done'\n", - "0 done" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
'done'
0done
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 16 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yo6ivJkPfB6Q" - }, - "source": [ - "The kNN algorithm is featured in the GDS library. The K-Nearest Neighbors algorithm computes a distance value for all node pairs in the graph and creates new relationships between each node and its k nearest neighbors. The distance is calculated based on node properties.\n", - "We will take advantage of the Graph Catalog feature as we will run two graph algorithms in sequence. First, we store a projection of a network as a named graph using the following syntax:" + "text/plain": [ + " 'done'\n", + "0 done" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['embedding'] = [el.tolist() for el in df['embedding']]\n", + "data = list(df[['character','embedding']].T.to_dict().values())\n", + "\n", + "read_query(\"\"\"\n", + "UNWIND $data as row\n", + "MATCH (c:Character{name:row.character})\n", + "SET c.role2vec = row.embedding\n", + "RETURN distinct 'done'\n", + "\"\"\", {'data':data})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yo6ivJkPfB6Q" + }, + "source": [ + "The kNN algorithm is featured in the GDS library. The K-Nearest Neighbors algorithm computes a distance value for all node pairs in the graph and creates new relationships between each node and its k nearest neighbors. The distance is calculated based on node properties.\n", + "We will take advantage of the Graph Catalog feature as we will run two graph algorithms in sequence. 
First, we store a projection of a network as a named graph using the following syntax:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "3wv1OW_yoqRf", + "outputId": "9e9297bb-9686-4956-a804-8e206bcad3a9" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "3wv1OW_yoqRf", - "outputId": "9e9297bb-9686-4956-a804-8e206bcad3a9" - }, - "source": [ - "#KNN\n", - "\n", - "read_query(\"\"\"\n", - "CALL gds.graph.project('role2vec', 'Character', 'INTERACTS', {nodeProperties:['role2vec']})\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'INTERACTS': {'orientation': 'NATURAL', 'inde...role2vec11940696
\n", + "
" ], - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeProjection \\\n", - "0 {'Character': {'label': 'Character', 'properti... \n", - "\n", - " relationshipProjection graphName nodeCount \\\n", - "0 {'INTERACTS': {'orientation': 'NATURAL', 'aggr... role2vec 119 \n", - "\n", - " relationshipCount projectMillis \n", - "0 406 23 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeProjectionrelationshipProjectiongraphNamenodeCountrelationshipCountprojectMillis
0{'Character': {'label': 'Character', 'properti...{'INTERACTS': {'orientation': 'NATURAL', 'aggr...role2vec11940623
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 17 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TvOobvm5fDzh" - }, - "source": [ - "We don’t care about undirected INTERACTS relationships as we will not be using them. The important thing is that we have included the role2vec node embedding in our projection. Now, we can go ahead and mutate the kNN algorithm. Using the mutate method, we store the algorithm results back to the projected named graph instead of the Neo4j stored graph. This way, we can use the results of the kNN algorithm as an input to a community detection algorithm." + "text/plain": [ + " nodeProjection \\\n", + "0 {'Character': {'label': 'Character', 'properti... \n", + "\n", + " relationshipProjection graphName nodeCount \\\n", + "0 {'INTERACTS': {'orientation': 'NATURAL', 'inde... role2vec 119 \n", + "\n", + " relationshipCount projectMillis \n", + "0 406 96 " ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#KNN\n", + "\n", + "read_query(\"\"\"\n", + "CALL gds.graph.project('role2vec', 'Character', 'INTERACTS', {nodeProperties:['role2vec']})\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TvOobvm5fDzh" + }, + "source": [ + "We don’t care about undirected INTERACTS relationships as we will not be using them. The important thing is that we have included the role2vec node embedding in our projection. Now, we can go ahead and mutate the kNN algorithm. Using the mutate method, we store the algorithm results back to the projected named graph instead of the Neo4j stored graph. This way, we can use the results of the kNN algorithm as an input to a community detection algorithm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 159 }, + "id": "H9OH_uTXqevH", + "outputId": "a3ab7a29-7d59-441a-b44c-89ade2e53ab0" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "H9OH_uTXqevH", - "outputId": "a3ab7a29-7d59-441a-b44c-89ade2e53ab0" - }, - "source": [ - "read_query(\"\"\"\n", - "CALL gds.knn.mutate('role2vec', {topK: 5, nodeProperties:'role2vec', mutateProperty:'weight', mutateRelationshipType:'SIMILAR_ROLE'})\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ranIterationsnodePairsConsidereddidConvergepreProcessingMilliscomputeMillismutateMillispostProcessingMillisnodesComparedrelationshipsWrittensimilarityDistributionconfiguration
0618279True01127-1119595{'p1': 0.8695907592773438, 'max': 0.9993972778...{'topK': 5, 'maxIterations': 100, 'randomJoins...
\n", + "
" ], - "execution_count": 18, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " ranIterations nodePairsConsidered didConverge preProcessingMillis \\\n", - "0 7 23576 True 0 \n", - "\n", - " computeMillis mutateMillis postProcessingMillis nodesCompared \\\n", - "0 228 30 -1 119 \n", - "\n", - " relationshipsWritten similarityDistribution \\\n", - "0 595 {'p1': 0.9004364013671875, 'max': 0.9998779296... \n", - "\n", - " configuration \n", - "0 {'topK': 5, 'maxIterations': 100, 'randomJoins... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ranIterationsnodePairsConsidereddidConvergepreProcessingMilliscomputeMillismutateMillispostProcessingMillisnodesComparedrelationshipsWrittensimilarityDistributionconfiguration
0723576True022830-1119595{'p1': 0.9004364013671875, 'max': 0.9998779296...{'topK': 5, 'maxIterations': 100, 'randomJoins...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 18 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m-Ugp35xfHdA" - }, - "source": [ - "We will store the resulting relationships under the SIMILAR_ROLE type. One of the parameters is topK, which defines the number of neighbors to find for each node. The K-nearest neighbors are returned. Finally, we can examine the community structure of the resulted similarity network by using the Louvain algorithm." + "text/plain": [ + " ranIterations nodePairsConsidered didConverge preProcessingMillis \\\n", + "0 6 18279 True 0 \n", + "\n", + " computeMillis mutateMillis postProcessingMillis nodesCompared \\\n", + "0 112 7 -1 119 \n", + "\n", + " relationshipsWritten similarityDistribution \\\n", + "0 595 {'p1': 0.8695907592773438, 'max': 0.9993972778... \n", + "\n", + " configuration \n", + "0 {'topK': 5, 'maxIterations': 100, 'randomJoins... " ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.knn.mutate('role2vec', {topK: 5, nodeProperties:'role2vec', mutateProperty:'weight', mutateRelationshipType:'SIMILAR_ROLE'})\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m-Ugp35xfHdA" + }, + "source": [ + "We will store the resulting relationships under the SIMILAR_ROLE type. One of the parameters is topK, which defines the number of neighbors to find for each node. The K-nearest neighbors are returned. Finally, we can examine the community structure of the resulting similarity network by using the Louvain algorithm." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 }, + "id": "MByYnQH2rKEr", + "outputId": "dc458e7d-37bc-4468-8d89-500bc64848c8" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 142 - }, - "id": "MByYnQH2rKEr", - "outputId": "dc458e7d-37bc-4468-8d89-500bc64848c8" - }, - "source": [ - "read_query(\"\"\"\n", - "CALL gds.louvain.write('role2vec', {relationshipTypes:['SIMILAR_ROLE'], writeProperty:'louvain_role'})\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
writeMillisnodePropertiesWrittenmodularitymodularitiesranLevelscommunityCountcommunityDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
0131190.612429[0.6124285008120895]18{'p99': 33, 'min': 3, 'max': 33, 'mean': 14.87...211202{'maxIterations': 10, 'writeConcurrency': 4, '...
\n", + "
" ], - "execution_count": 19, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " writeMillis nodePropertiesWritten modularity modularities \\\n", - "0 192 119 0.712916 [0.712915754537109] \n", - "\n", - " ranLevels communityCount \\\n", - "0 1 10 \n", - "\n", - " communityDistribution postProcessingMillis \\\n", - "0 {'p99': 29, 'min': 3, 'max': 29, 'mean': 11.9,... 4 \n", - "\n", - " preProcessingMillis computeMillis \\\n", - "0 0 701 \n", - "\n", - " configuration \n", - "0 {'maxIterations': 10, 'writeConcurrency': 4, '... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
writeMillisnodePropertiesWrittenmodularitymodularitiesranLevelscommunityCountcommunityDistributionpostProcessingMillispreProcessingMilliscomputeMillisconfiguration
01921190.712916[0.712915754537109]110{'p99': 29, 'min': 3, 'max': 29, 'mean': 11.9,...40701{'maxIterations': 10, 'writeConcurrency': 4, '...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 19 - } + "text/plain": [ + " writeMillis nodePropertiesWritten modularity modularities \\\n", + "0 13 119 0.612429 [0.6124285008120895] \n", + "\n", + " ranLevels communityCount \\\n", + "0 1 8 \n", + "\n", + " communityDistribution postProcessingMillis \\\n", + "0 {'p99': 33, 'min': 3, 'max': 33, 'mean': 14.87... 2 \n", + "\n", + " preProcessingMillis computeMillis \\\n", + "0 1 1202 \n", + "\n", + " configuration \n", + "0 {'maxIterations': 10, 'writeConcurrency': 4, '... " ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.louvain.write('role2vec', {relationshipTypes:['SIMILAR_ROLE'], writeProperty:'louvain_role'})\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "IbSMP_fcrZ7r", + "outputId": "17d299f7-e7e0-4e90-b493-a493d16f57b7" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "IbSMP_fcrZ7r", - "outputId": "17d299f7-e7e0-4e90-b493-a493d16f57b7" - }, - "source": [ - "read_query(\"\"\"\n", - "CALL gds.graph.writeRelationship('role2vec', 'SIMILAR_ROLE', 'weight')\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
writeMillisgraphNamerelationshipTyperelationshipPropertyrelationshipsWrittenpropertiesWritten
0183role2vecSIMILAR_ROLEweight595595
\n", + "
" ], - "execution_count": 20, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " writeMillis graphName relationshipType relationshipProperty \\\n", - "0 21 role2vec SIMILAR_ROLE weight \n", - "\n", - " relationshipsWritten propertiesWritten \n", - "0 595 595 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
writeMillisgraphNamerelationshipTyperelationshipPropertyrelationshipsWrittenpropertiesWritten
021role2vecSIMILAR_ROLEweight595595
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 20 - } + "text/plain": [ + " writeMillis graphName relationshipType relationshipProperty \\\n", + "0 183 role2vec SIMILAR_ROLE weight \n", + "\n", + " relationshipsWritten propertiesWritten \n", + "0 595 595 " ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gwtNKPoDfPOI" - }, - "source": [ - "# Conclusion\n", - "The Karate Club package includes node embedding models that take into consideration also node properties. Unfortunately, we don’t have any node properties in our simple Harry Potter network, so I skipped them. Nevertheless, the node embedding research field is fascinating, and there are many approaches to what type of information you want to extract from the network. Hopefully, this simple integration of the Neo4j and Karate Club project will help you use the node embedding models that will work best for you." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qIE2xSAgfQai" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "read_query(\"\"\"\n", + "CALL gds.graph.writeRelationship('role2vec', 'SIMILAR_ROLE', 'weight')\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gwtNKPoDfPOI" + }, + "source": [ + "# Conclusion\n", + "The Karate Club package includes node embedding models that also take node properties into consideration. Unfortunately, we don’t have any node properties in our simple Harry Potter network, so I skipped them. Nevertheless, the node embedding research field is fascinating, and there are many approaches to what type of information you want to extract from the network. Hopefully, this simple integration of the Neo4j and Karate Club project will help you use the node embedding models that will work best for you." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qIE2xSAgfQai" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyOelByQliYNdEZtLAG5ICeE", + "include_colab_link": true, + "name": "Harry Potter - Karate Club integration.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/ice&fire/Ice&Fire_analysis.ipynb b/ice&fire/Ice&Fire_analysis.ipynb index 1944a79..39dc641 100644 --- a/ice&fire/Ice&Fire_analysis.ipynb +++ b/ice&fire/Ice&Fire_analysis.ipynb @@ -1,930 +1,625 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "pK9aJPSTWznM", + "outputId": "cf054629-bb16-4b56-95b0-2ea596de7eab" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting graphdatascience\n", + " Downloading graphdatascience-1.5-py3-none-any.whl (183 kB)\n", + "\u001b[K |████████████████████████████████| 183 kB 5.1 MB/s \n", + "\u001b[?25hCollecting multimethod<2.0,>=1.0\n", + " Downloading multimethod-1.9-py3-none-any.whl (10 kB)\n", + "Requirement 
already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", + "Requirement already satisfied: tqdm<5.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.64.1)\n", + "Collecting neo4j<6.0,>=4.4.2\n", + " Downloading neo4j-5.2.0.tar.gz (173 kB)\n", + "\u001b[K |████████████████████████████████| 173 kB 38.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow<11.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (6.0.1)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j<6.0,>=4.4.2->graphdatascience) (2022.6)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-5.2.0-py3-none-any.whl size=248021 sha256=7eca51544ff2688bb0223e2c4bad8b9d4f6ad129b0fa70670353498914a04c33\n", + " Stored in directory: /root/.cache/pip/wheels/5a/07/16/4d845d69ef310660c14b7148848c95da3ef3950c7b58daec42\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j, multimethod, graphdatascience\n", + "Successfully installed graphdatascience-1.5 multimethod-1.9 neo4j-5.2.0\n" + ] } + ], + "source": [ + "!pip install graphdatascience" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pK9aJPSTWznM", - "outputId": "cf054629-bb16-4b56-95b0-2ea596de7eab" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting graphdatascience\n", - " Downloading graphdatascience-1.5-py3-none-any.whl (183 kB)\n", - "\u001b[K |████████████████████████████████| 183 kB 5.1 MB/s \n", - "\u001b[?25hCollecting multimethod<2.0,>=1.0\n", - " Downloading multimethod-1.9-py3-none-any.whl (10 kB)\n", - "Requirement already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", - "Requirement already satisfied: tqdm<5.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.64.1)\n", - "Collecting neo4j<6.0,>=4.4.2\n", - " Downloading neo4j-5.2.0.tar.gz (173 kB)\n", - "\u001b[K |████████████████████████████████| 173 kB 38.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: pyarrow<11.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (6.0.1)\n", - "Requirement already satisfied: pytz in 
/usr/local/lib/python3.7/dist-packages (from neo4j<6.0,>=4.4.2->graphdatascience) (2022.6)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", - "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-5.2.0-py3-none-any.whl size=248021 sha256=7eca51544ff2688bb0223e2c4bad8b9d4f6ad129b0fa70670353498914a04c33\n", - " Stored in directory: /root/.cache/pip/wheels/5a/07/16/4d845d69ef310660c14b7148848c95da3ef3950c7b58daec42\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j, multimethod, graphdatascience\n", - "Successfully installed graphdatascience-1.5 multimethod-1.9 neo4j-5.2.0\n" - ] - } - ], - "source": [ - "!pip install graphdatascience" - ] - }, - { - "cell_type": "code", - "source": [ - "from graphdatascience import GraphDataScience\n", - "\n", - "host = \"bolt://44.202.221.209:7687\"\n", - "user = \"neo4j\"\n", - "password = \"map-striker-injuries\"\n", - "\n", - "gds = GraphDataScience(host, auth=(user, password))" - ], - "metadata": { - "id": "KdsdF8zSW1yN" - }, - "execution_count": 2, - "outputs": [] + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "KdsdF8zSW1yN" + }, + "outputs": [], + "source": [ + "from graphdatascience import GraphDataScience\n", + "\n", + "host = \"bolt://3.231.25.240:7687\"\n", + "user = \"neo4j\"\n", + "password = \"hatchets-visitor-axes\"\n", + "\n", + "gds = GraphDataScience(host, auth=(user, password))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "5Un_5pO3XE_S", + "outputId": "15388f90-713a-409c-a03d-5c819caa2fc0" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "# Results of the query changed for notebook environment\n", - "gds.run_cypher(\n", - " \"\"\"MATCH (s:Character {name:$person1}), (t:Character {name:$person2})\n", - "MATCH p=shortestPath((s)-[:FATHER|MOTHER|SPOUSE*]-(t))\n", - "RETURN [n in nodes(p) | n.name] AS result\"\"\",\n", - " {\"person1\": \"Tyrion Lannister\", \"person2\": \"Viserys I\"},\n", - ")[\"result\"][0]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Un_5pO3XE_S", - "outputId": "15388f90-713a-409c-a03d-5c819caa2fc0" - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['Tyrion Lannister',\n", - " 'LadyJoanna Lannister',\n", - " 'Cersei Lannister',\n", - " 'Robert I',\n", - " 'Steffon Baratheon',\n", - " 'Rhaelle Targaryen',\n", - " 'Aegon V',\n", - " 'Maekar I',\n", - " 'Daeron II',\n", - " 'Aegon IV',\n", - " 'Viserys II',\n", - " 'Rhaenyra Targaryen',\n", - " 'Viserys I']" - ] - }, - "metadata": {}, - "execution_count": 3 - } + "data": { + "text/plain": [ + "['Tyrion Lannister',\n", + " 'Sansa Stark',\n", + " 'Eddard Stark',\n", + " 'Lyarra Stark',\n", + " 'Rodrik Stark',\n", + " 'Beron Stark',\n", + " 'Brandon Stark',\n", + " 'Cregan Stark',\n", + " 'Rickon Stark',\n", + " 'Sara Snow',\n", + " 'Jacaerys Velaryon',\n", + " 'Rhaenyra Targaryen',\n", + " 'Viserys I']" ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Results of the query changed for notebook environment\n", + "gds.run_cypher(\n", + " \"\"\"MATCH (s:Character {name:$person1}), (t:Character {name:$person2})\n", + "MATCH p=shortestPath((s)-[:FATHER|MOTHER|SPOUSE*]-(t))\n", + "RETURN [n in nodes(p) | n.name] AS result\"\"\",\n", + " {\"person1\": \"Tyrion 
Lannister\", \"person2\": \"Viserys I\"},\n", + ")[\"result\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "2YqgteubXto8", + "outputId": "c55f2dfa-95e9-4b03-ea9c-8ca17c6f8d78" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "# Results of the query changed for notebook environment\n", - "gds.run_cypher(\n", - " \"\"\"MATCH p=(c:Character {name:$person})-[:FATHER|MOTHER*]->()\n", - "RETURN [r in relationships(p) | endNode(r).name + \" \" + type(r) + \" to \" + startNode(r).name] AS result\"\"\",\n", - " {\"person\": \"Margaery Tyrell\"},\n", - ")[\"result\"]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2YqgteubXto8", - "outputId": "c55f2dfa-95e9-4b03-ea9c-8ca17c6f8d78" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0 [Mace Tyrell FATHER to Margaery Tyrell]\n", - "1 [Mace Tyrell FATHER to Margaery Tyrell, Olenna...\n", - "2 [Mace Tyrell FATHER to Margaery Tyrell, Olenna...\n", - "3 [Mace Tyrell FATHER to Margaery Tyrell, Luthor...\n", - "4 [Alerie Hightower MOTHER to Margaery Tyrell]\n", - "5 [Alerie Hightower MOTHER to Margaery Tyrell, L...\n", - "Name: result, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 4 - } + "data": { + "text/plain": [ + "0 [Alerie Hightower MOTHER to Margaery Tyrell]\n", + "1 [Alerie Hightower MOTHER to Margaery Tyrell, L...\n", + "2 [Mace Tyrell FATHER to Margaery Tyrell]\n", + "3 [Mace Tyrell FATHER to Margaery Tyrell, Olenna...\n", + "4 [Mace Tyrell FATHER to Margaery Tyrell, Olenna...\n", + "5 [Mace Tyrell FATHER to Margaery Tyrell, Luthor...\n", + "Name: result, dtype: object" ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Results of the query changed for notebook environment\n", + "gds.run_cypher(\n", + " \"\"\"MATCH p=(c:Character 
{name:$person})-[:FATHER|MOTHER*]->()\n", + "RETURN [r in relationships(p) | endNode(r).name + \" \" + type(r) + \" to \" + startNode(r).name] AS result\"\"\",\n", + " {\"person\": \"Margaery Tyrell\"},\n", + ")[\"result\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "t9JdS7ibW4Q0" + }, + "outputs": [], + "source": [ + "G, res = gds.graph.project(\"family\", \"Character\", [\"MOTHER\", \"FATHER\", \"SPOUSE\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "MzJFNjK-W_tm" + }, + "outputs": [], + "source": [ + "wcc_df = gds.wcc.stream(G)\n", + "wcc_df[\"name\"] = [el[\"name\"] for el in gds.util.asNodes(wcc_df[\"nodeId\"].to_list())]\n", + "wcc_df[\"last_name\"] = [\n", + " el.split(\" \")[-1] if len(el.split(\" \")) > 1 and len(el.split(\" \")[-1]) > 3 else None\n", + " for el in wcc_df[\"name\"]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "7IciN9YNZMmQ", + "outputId": "6b0bb5a4-c962-417c-c7d1-4fc050f92873" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "G, res = gds.graph.project(\"family\", \"Character\", [\"MOTHER\", \"FATHER\", \"SPOUSE\"])" - ], - "metadata": { - "id": "t9JdS7ibW4Q0" - }, - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "wcc_df = gds.wcc.stream(G)\n", - "wcc_df[\"name\"] = [el[\"name\"] for el in gds.util.asNodes(wcc_df[\"nodeId\"].to_list())]\n", - "wcc_df[\"last_name\"] = [\n", - " el.split(\" \")[-1] if len(el.split(\" \")) > 1 and len(el.split(\" \")[-1]) > 3 else None\n", - " for el in wcc_df[\"name\"]\n", - "]" - ], - "metadata": { - "id": "MzJFNjK-W_tm" - }, - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "wcc_df.head()" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeIdcomponentIdnamelast_name
01190A certain manNone
11201Abelar HightowerHightower
21212AbelonNone
31223AddamNone
41234Addam FreyFrey
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "7IciN9YNZMmQ", - "outputId": "6b0bb5a4-c962-417c-c7d1-4fc050f92873" - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " nodeId componentId name last_name\n", - "0 4512 0 Arianne Martell Martell\n", - "1 4513 1 Arianne Tarth Tarth\n", - "2 4514 2 Arlan of Pennytree Pennytree\n", - "3 4516 3 Arlan III Durrandon Durrandon\n", - "4 4517 4 Arlan V Durrandon Durrandon" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeIdcomponentIdnamelast_name
045120Arianne MartellMartell
145131Arianne TarthTarth
245142Arlan of PennytreePennytree
345163Arlan III DurrandonDurrandon
445174Arlan V DurrandonDurrandon
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 7 - } + "text/plain": [ + " nodeId componentId name last_name\n", + "0 119 0 A certain man None\n", + "1 120 1 Abelar Hightower Hightower\n", + "2 121 2 Abelon None\n", + "3 122 3 Addam None\n", + "4 123 4 Addam Frey Frey" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wcc_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "R9fThQtGZPOT", + "outputId": "c426bf7c-a396-4009-99b7-283e6b433a4d" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "wcc_df.groupby(\"componentId\").size().sort_values(ascending=False).to_frame(\n", - " \"componentSize\"\n", - ").reset_index().head()" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentIdcomponentSize
05785
145719
211112
393811
419310
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "R9fThQtGZPOT", - "outputId": "c426bf7c-a396-4009-99b7-283e6b433a4d" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " componentId componentSize\n", - "0 4 785\n", - "1 235 19\n", - "2 295 12\n", - "3 726 11\n", - "4 448 10" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
componentIdcomponentSize
04785
123519
229512
372611
444810
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 8 - } + "text/plain": [ + " componentId componentSize\n", + "0 5 785\n", + "1 457 19\n", + "2 111 12\n", + "3 938 11\n", + "4 193 10" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wcc_df.groupby(\"componentId\").size().sort_values(ascending=False).to_frame(\n", + " \"componentSize\"\n", + ").reset_index().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 }, + "id": "HMJgmlRHZpOQ", + "outputId": "403ca797-78e5-4af8-8f0a-cd7f04d65f36" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "largest_component = wcc_df.groupby('componentId').size().sort_values(\n", - " ascending=False\n", - ").reset_index()['componentId'][0]\n", - "wcc_df[wcc_df[\"componentId\"] == largest_component].groupby(\"last_name\").size().sort_values(\n", - " ascending=False\n", - ").to_frame(\"count\").reset_index().head(10)" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namecount
0Frey91
1Targaryen66
2Stark50
3Lannister30
4Hightower28
5Velaryon21
6Baratheon21
7Greyjoy19
8Rivers15
9Arryn15
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 363 - }, - "id": "HMJgmlRHZpOQ", - "outputId": "403ca797-78e5-4af8-8f0a-cd7f04d65f36" - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " last_name count\n", - "0 Frey 91\n", - "1 Targaryen 66\n", - "2 Stark 50\n", - "3 Lannister 30\n", - "4 Hightower 28\n", - "5 Velaryon 21\n", - "6 Baratheon 21\n", - "7 Greyjoy 19\n", - "8 Rivers 15\n", - "9 Arryn 15" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
last_namecount
0Frey91
1Targaryen66
2Stark50
3Lannister30
4Hightower28
5Velaryon21
6Baratheon21
7Greyjoy19
8Rivers15
9Arryn15
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 11 - } + "text/plain": [ + " last_name count\n", + "0 Frey 91\n", + "1 Targaryen 66\n", + "2 Stark 50\n", + "3 Lannister 30\n", + "4 Hightower 28\n", + "5 Velaryon 21\n", + "6 Baratheon 21\n", + "7 Greyjoy 19\n", + "8 Rivers 15\n", + "9 Arryn 15" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "largest_component = wcc_df.groupby('componentId').size().sort_values(\n", + " ascending=False\n", + ").reset_index()['componentId'][0]\n", + "wcc_df[wcc_df[\"componentId\"] == largest_component].groupby(\"last_name\").size().sort_values(\n", + " ascending=False\n", + ").to_frame(\"count\").reset_index().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "Mll03EJPj6ew", + "outputId": "cb78ae3a-ea0c-42da-941a-3a9bc4ef90c5" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\"\"\"\n", - "MATCH (c1:Character)<-[:FATHER|MOTHER]-(s1)-[:SPOUSE]-(s2)-[:FATHER|MOTHER]->(c2:Character)\n", - "WHERE c1.name CONTAINS \"Targaryen\" AND c2.name CONTAINS \"Stark\"\n", - "RETURN s1.name AS spouse1, s2.name AS spouse2\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spouse1spouse2
0Jacaerys VelaryonSara Snow
\n", + "
" ], - "metadata": { - "id": "Mll03EJPj6ew", - "outputId": "cb78ae3a-ea0c-42da-941a-3a9bc4ef90c5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 12, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " spouse1 spouse2\n", - "0 Jacaerys Velaryon Sara Snow" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
spouse1spouse2
0Jacaerys VelaryonSara Snow
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 12 - } + "text/plain": [ + " spouse1 spouse2\n", + "0 Jacaerys Velaryon Sara Snow" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\"\"\"\n", + "MATCH (c1:Character)<-[:FATHER|MOTHER]-(s1)-[:SPOUSE]-(s2)-[:FATHER|MOTHER]->(c2:Character)\n", + "WHERE c1.name CONTAINS \"Targaryen\" AND c2.name CONTAINS \"Stark\"\n", + "RETURN s1.name AS spouse1, s2.name AS spouse2\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "PT-pSDn8an6s", + "outputId": "bc5a9e48-bb95-47ea-a3ab-cebf40ddba5f" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "G.drop()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PT-pSDn8an6s", - "outputId": "bc5a9e48-bb95-47ea-a3ab-cebf40ddba5f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "graphName family\n", - "database neo4j\n", - "memoryUsage \n", - "sizeInBytes -1\n", - "nodeCount 3654\n", - "relationshipCount 1794\n", - "configuration {'relationshipProjection': {'FATHER': {'orient...\n", - "density 0.000134\n", - "creationTime 2022-11-08T17:57:36.072746000+00:00\n", - "modificationTime 2022-11-08T17:57:36.181971000+00:00\n", - "schema {'graphProperties': {}, 'relationships': {'FAT...\n", - "Name: 0, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 35 - } + "data": { + "text/plain": [ + "graphName family\n", + "database neo4j\n", + "memoryUsage \n", + "sizeInBytes -1\n", + "nodeCount 3653\n", + "relationshipCount 1794\n", + "configuration {'relationshipProjection': {'FATHER': {'orient...\n", + "density 0.000134\n", + "creationTime 2023-02-01T12:28:33.121376800+00:00\n", + "modificationTime 2023-02-01T12:28:33.233263361+00:00\n", + "schema {'graphProperties': {}, 'relationships': 
{'FAT...\n", + "schemaWithOrientation {'graphProperties': {}, 'relationships': {'FAT...\n", + "Name: 0, dtype: object" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "XmmwDND9a7qL" - }, - "execution_count": null, - "outputs": [] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "G.drop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XmmwDND9a7qL" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/ice&fire/Ice&Fire_import.ipynb b/ice&fire/Ice&Fire_import.ipynb index eeaa56d..d42293a 100644 --- a/ice&fire/Ice&Fire_import.ipynb +++ b/ice&fire/Ice&Fire_import.ipynb @@ -1,1763 +1,983 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "R_hqa9eZDO2M", + "outputId": "2a1256f4-e99b-40bd-c770-1c8a8806bc1b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + 
"Collecting graphdatascience\n", + " Downloading graphdatascience-1.5-py3-none-any.whl (183 kB)\n", + "\u001b[K |████████████████████████████████| 183 kB 13.2 MB/s \n", + "\u001b[?25hCollecting multimethod<2.0,>=1.0\n", + " Downloading multimethod-1.9-py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: tqdm<5.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.64.1)\n", + "Requirement already satisfied: pyarrow<11.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (6.0.1)\n", + "Collecting neo4j<6.0,>=4.4.2\n", + " Downloading neo4j-5.2.0.tar.gz (173 kB)\n", + "\u001b[K |████████████████████████████████| 173 kB 62.5 MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j<6.0,>=4.4.2->graphdatascience) (2022.6)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-5.2.0-py3-none-any.whl size=248021 sha256=b1213438cc3b276397b7ec7db728e7913e3d1e99d4b729b71520a854edd87042\n", + " Stored in directory: /root/.cache/pip/wheels/5a/07/16/4d845d69ef310660c14b7148848c95da3ef3950c7b58daec42\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j, multimethod, graphdatascience\n", + "Successfully installed graphdatascience-1.5 multimethod-1.9 neo4j-5.2.0\n" + ] } + ], + "source": [ + "!pip install graphdatascience" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "R_hqa9eZDO2M", - "outputId": "2a1256f4-e99b-40bd-c770-1c8a8806bc1b" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Collecting graphdatascience\n", - " Downloading graphdatascience-1.5-py3-none-any.whl (183 kB)\n", - "\u001b[K |████████████████████████████████| 183 kB 13.2 MB/s \n", - "\u001b[?25hCollecting multimethod<2.0,>=1.0\n", - " Downloading multimethod-1.9-py3-none-any.whl (10 kB)\n", - "Requirement already satisfied: tqdm<5.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (4.64.1)\n", - "Requirement already satisfied: pyarrow<11.0,>=4.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (6.0.1)\n", - "Collecting neo4j<6.0,>=4.4.2\n", - " Downloading neo4j-5.2.0.tar.gz (173 kB)\n", - "\u001b[K |████████████████████████████████| 173 kB 62.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: pandas<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from graphdatascience) (1.3.5)\n", - "Requirement already satisfied: pytz in 
/usr/local/lib/python3.7/dist-packages (from neo4j<6.0,>=4.4.2->graphdatascience) (2022.6)\n", - "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (1.21.6)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas<2.0,>=1.0->graphdatascience) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas<2.0,>=1.0->graphdatascience) (1.15.0)\n", - "Building wheels for collected packages: neo4j\n", - " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for neo4j: filename=neo4j-5.2.0-py3-none-any.whl size=248021 sha256=b1213438cc3b276397b7ec7db728e7913e3d1e99d4b729b71520a854edd87042\n", - " Stored in directory: /root/.cache/pip/wheels/5a/07/16/4d845d69ef310660c14b7148848c95da3ef3950c7b58daec42\n", - "Successfully built neo4j\n", - "Installing collected packages: neo4j, multimethod, graphdatascience\n", - "Successfully installed graphdatascience-1.5 multimethod-1.9 neo4j-5.2.0\n" - ] - } - ], - "source": [ - "!pip install graphdatascience" - ] - }, - { - "cell_type": "code", - "source": [ - "from graphdatascience import GraphDataScience\n", - "\n", - "host = \"bolt://44.202.221.209:7687\"\n", - "user = \"neo4j\"\n", - "password = \"map-striker-injuries\"\n", - "\n", - "gds = GraphDataScience(host, auth=(user, password))" - ], - "metadata": { - "id": "Eu_aAgmYiaGj" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Define constraints" - ], - "metadata": { - "id": "lZsg0OoBM_8L" - } + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "Eu_aAgmYiaGj" + }, + "outputs": [], + "source": [ + "from graphdatascience import GraphDataScience\n", + "\n", + "host = \"bolt://3.231.25.240:7687\"\n", + "user = \"neo4j\"\n", + "password = \"hatchets-visitor-axes\"\n", + 
"\n", + "gds = GraphDataScience(host, auth=(user, password))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lZsg0OoBM_8L" + }, + "source": [ + "# Define constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "rN310oX6lH_4", + "outputId": "8e51e2be-d2eb-4948-cb4c-569163abc533" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "CREATE CONSTRAINT IF NOT EXISTS ON (h:Faction) ASSERT (h.url) IS UNIQUE; \n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - }, - "id": "rN310oX6lH_4", - "outputId": "8e51e2be-d2eb-4948-cb4c-569163abc533" - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 3 - } + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "CREATE CONSTRAINT IF NOT EXISTS FOR (h:Faction) REQUIRE (h.url) IS UNIQUE; \n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "9tgp3ReklRX4", + "outputId": "18fb0af1-7e87-4759-d30e-e8414b43289d" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "CREATE CONSTRAINT IF NOT EXISTS ON (c:Character) ASSERT (c.url) IS UNIQUE; \n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - }, - "id": "9tgp3ReklRX4", - "outputId": "18fb0af1-7e87-4759-d30e-e8414b43289d" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 4 - } + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "CREATE CONSTRAINT IF NOT EXISTS FOR (c:Character) REQUIRE (c.url) IS UNIQUE; \n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TtEaQGp2NB9Q" + }, + "source": [ + "# Import data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "JxexrXQsNYUv", + "outputId": "b13efbd6-8817-4750-9f57-35684249b3ec" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Import data" - ], - "metadata": { - "id": "TtEaQGp2NB9Q" - } - }, - { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "call dbms.setConfigValue('dbms.transaction.timeout','0')\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ], - "metadata": { - "id": "JxexrXQsNYUv", - "outputId": "b13efbd6-8817-4750-9f57-35684249b3ec", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - } - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 5 - } + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "call dbms.setConfigValue('dbms.transaction.timeout','0')\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "RYXOh0A4lU9R", + "outputId": "65c98346-9253-443f-82df-70f933039182" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/ice_fire/characters.tsv\" AS row FIELDTERMINATOR \"\\t\"\n", - "MERGE (c:Character {url: toLower(row.url)})\n", - "SET c.name = row.name,\n", - " c.born = replace(replace(replace(row.born, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\"),\n", - " c.died = replace(replace(replace(row.died, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\"),\n", - " c.title = replace(replace(replace(row.title, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\")\n", - "FOREACH (a IN apoc.convert.fromJsonList(row.allegiance) | MERGE (f:Faction {url: toLower(split(a, \"//\")[1])}) MERGE (c)-[:ALLEGIANCE]->(f))\n", - "FOREACH (cu IN apoc.convert.fromJsonList(row.culture) | MERGE (culture:Culture {name: split(toLower(cu), \"[\")[0]}) MERGE (c)-[:CULTURE]->(culture))\n", - "FOREACH (s IN apoc.convert.fromJsonList(row.spouse) | MERGE (c1:Character {url: toLower(s)}) MERGE (c)-[:SPOUSE]-(c1))\n", - "FOREACH (s IN apoc.convert.fromJsonList(row.father) | MERGE (c1:Character {url: toLower(s)}) MERGE (c)-[:FATHER]->(c1))\n", - "FOREACH (m IN apoc.convert.fromJsonList(row.mother) | MERGE (c1:Character {url: toLower(m)}) MERGE (c)-[:MOTHER]->(c1))\n", - "FOREACH (b IN apoc.convert.fromJsonList(row.books) | MERGE (b1:Book {url: toLower(split(b, \"//\")[1])}) 
MERGE (c)-[:APPEARED_IN_BOOK]->(b1))\n", - "FOREACH (s IN apoc.convert.fromJsonList(row.show) | MERGE (s1:Show {url: toLower(split(s, \"//\")[1])}) MERGE (c)-[:APPEARED_IN_BOOK]->(s1))\n", - "FOREACH (pr IN apoc.convert.fromJsonList(row.predecessor) | MERGE (c1:Character {url: toLower(pr)}) MERGE (c)-[:PREDECESSOR]->(c1))\n", - "FOREACH (pr IN apoc.convert.fromJsonList(row.successor) | MERGE (c1:Character {url: toLower(pr)}) MERGE (c)<-[:PREDECESSOR]-(c1))\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - }, - "id": "RYXOh0A4lU9R", - "outputId": "65c98346-9253-443f-82df-70f933039182" - }, - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 6 - } + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/ice_fire/characters.tsv\" AS row FIELDTERMINATOR \"\\t\"\n", + "MERGE (c:Character {url: toLower(row.url)})\n", + "SET c.name = row.name,\n", + " c.born = replace(replace(replace(row.born, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\"),\n", + " c.died = replace(replace(replace(row.died, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\"),\n", + " c.title = replace(replace(replace(row.title, \"[\", \"\"), \"]\", \"\"),\"b'\",\"\")\n", + "FOREACH (a IN apoc.convert.fromJsonList(row.allegiance) | MERGE (f:Faction {url: toLower(split(a, \"//\")[1])}) MERGE (c)-[:ALLEGIANCE]->(f))\n", + "FOREACH (cu IN apoc.convert.fromJsonList(row.culture) | MERGE (culture:Culture {name: split(toLower(cu), \"[\")[0]}) MERGE (c)-[:CULTURE]->(culture))\n", + "FOREACH (s IN apoc.convert.fromJsonList(row.spouse) | MERGE (c1:Character {url: toLower(s)}) MERGE (c)-[:SPOUSE]-(c1))\n", + "FOREACH (s IN apoc.convert.fromJsonList(row.father) | MERGE (c1:Character {url: toLower(s)}) MERGE (c)-[:FATHER]->(c1))\n", + "FOREACH (m IN apoc.convert.fromJsonList(row.mother) | MERGE (c1:Character {url: toLower(m)}) MERGE (c)-[:MOTHER]->(c1))\n", + "FOREACH (b IN apoc.convert.fromJsonList(row.books) | MERGE (b1:Book {url: toLower(split(b, \"//\")[1])}) MERGE (c)-[:APPEARED_IN_BOOK]->(b1))\n", + "FOREACH (s IN apoc.convert.fromJsonList(row.show) | MERGE (s1:Show {url: toLower(split(s, \"//\")[1])}) MERGE (c)-[:APPEARED_IN_BOOK]->(s1))\n", + "FOREACH (pr IN apoc.convert.fromJsonList(row.predecessor) | MERGE (c1:Character {url: toLower(pr)}) MERGE (c)-[:PREDECESSOR]->(c1))\n", + "FOREACH (pr IN apoc.convert.fromJsonList(row.successor) | MERGE (c1:Character 
{url: toLower(pr)}) MERGE (c)<-[:PREDECESSOR]-(c1))\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6UZeKal7NDm8" + }, + "source": [ + "# Data cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 }, + "id": "TSttWqMYFlP0", + "outputId": "8f2b5cfe-eb3c-4dc0-e04c-cc0385b14944" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Data cleanup" - ], - "metadata": { - "id": "6UZeKal7NDm8" - } - }, - { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "MATCH (n)\n", - "WHERE NOT EXISTS (n.name)\n", - "WITH n, replace(split(apoc.text.urldecode(n.url), \"/\")[-1], \"_\", \" \") AS clean_name\n", - "SET n.name = clean_name\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 49 - }, - "id": "TSttWqMYFlP0", - "outputId": "8f2b5cfe-eb3c-4dc0-e04c-cc0385b14944" - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 7 - } + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "MATCH (n)\n", + "WHERE NOT n.name IS NOT NULL\n", + "WITH n, replace(split(apoc.text.urldecode(n.url), \"/\")[-1], \"_\", \" \") AS clean_name\n", + "SET n.name = clean_name\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "cCONUTgssmqY", + "outputId": "7a08235b-0d32-4b0a-d95a-3d7aa7d64d6f" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "MATCH (t)-[r]->(m)\n", - "WHERE toLower(t.name) = toLower(m.name)\n", - "DELETE r\n", - "RETURN count(*) AS selfloops\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
selfloops
0329
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "cCONUTgssmqY", - "outputId": "7a08235b-0d32-4b0a-d95a-3d7aa7d64d6f" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " selfloops\n", - "0 329" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
selfloops
0329
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 8 - } + "text/plain": [ + " selfloops\n", + "0 329" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "MATCH (t)-[r]->(m)\n", + "WHERE toLower(t.name) = toLower(m.name)\n", + "DELETE r\n", + "RETURN count(*) AS selfloops\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "f0VCfRMNJYw-", + "outputId": "ae6513b1-8357-41fe-ac30-738dee59e9ed" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "MATCH (n) \n", - "WHERE NOT (n)--()\n", - "DELETE n\n", - "RETURN count(*) AS isolated\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isolated
0221
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "f0VCfRMNJYw-", - "outputId": "ae6513b1-8357-41fe-ac30-738dee59e9ed" - }, - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " isolated\n", - "0 221" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
isolated
0221
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 9 - } + "text/plain": [ + " isolated\n", + "0 221" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "MATCH (n) \n", + "WHERE NOT EXISTS { (n)--() }\n", + "DELETE n\n", + "RETURN count(*) AS isolated\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "o59beDs6MXhg", + "outputId": "c6b050d6-319c-4abc-e896-48c60104da69" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "MATCH (s:Show)\n", - "WHERE NOT s.url CONTAINS \"house\"\n", - "WITH s, split(s.url, \"_\")[-1] AS seasons\n", - "WITH seasons, collect(s) AS duplicates\n", - "WHERE size(duplicates) > 1\n", - "CALL apoc.refactor.mergeNodes(duplicates) YIELD node\n", - "RETURN distinct 'done' AS result\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
result
0done
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - }, - "id": "o59beDs6MXhg", - "outputId": "c6b050d6-319c-4abc-e896-48c60104da69" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " result\n", - "0 done" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
result
0done
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 10 - } + "text/plain": [ + " result\n", + "0 done" ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "MATCH (s:Show)\n", + "WHERE NOT s.url CONTAINS \"house\"\n", + "WITH s, split(s.url, \"_\")[-1] AS seasons\n", + "WITH seasons, collect(s) AS duplicates\n", + "WHERE size(duplicates) > 1\n", + "CALL apoc.refactor.mergeNodes(duplicates) YIELD node\n", + "RETURN distinct 'done' AS result\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 }, + "id": "-8o0rpISh_YN", + "outputId": "76737527-a69a-4b19-b656-ab879c604bf1" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\"\"\"\n", - "MATCH (c1:Character), (c2:Character)\n", - "WHERE c1.name CONTAINS \"catelyn tully\" AND c2.name CONTAINS \"Catelyn Stark\"\n", - "CALL apoc.refactor.mergeNodes([c2,c1]) YIELD node\n", - "RETURN distinct 'done'\n", - "\"\"\")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
'done'
0done
\n", + "
" ], - "metadata": { - "id": "-8o0rpISh_YN", - "outputId": "76737527-a69a-4b19-b656-ab879c604bf1", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81 - } - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " 'done'\n", - "0 done" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
'done'
0done
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 11 - } + "text/plain": [ + " 'done'\n", + "0 done" ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\"\"\"\n", + "MATCH (c1:Character), (c2:Character)\n", + "WHERE c1.name CONTAINS \"catelyn tully\" AND c2.name CONTAINS \"Catelyn Stark\"\n", + "CALL apoc.refactor.mergeNodes([c2,c1]) YIELD node\n", + "RETURN distinct 'done'\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5P1_Yq17NF9s" + }, + "source": [ + "# Verify the data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 142 }, + "id": "VBziu9xioGFJ", + "outputId": "1d1200a7-39d1-47b5-eb25-87d93d3af5c6" + }, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "# Verify the data" - ], - "metadata": { - "id": "5P1_Yq17NF9s" - } - }, - { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "CALL apoc.meta.stats()\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelCountrelTypeCountpropertyKeyCountnodeCountrelCountlabelsrelTypesrelTypesCountstats
07786431916941{'Character': 3653, 'Book': 20, 'Show': 11, 'C...{'(:Character)-[:PREDECESSOR]->()': 307, '()-[...{'APPEARED_IN_BOOK': 7936, 'FATHER': 960, 'MOT...{'relTypeCount': 7, 'propertyKeyCount': 86, 'l...
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 142 - }, - "id": "VBziu9xioGFJ", - "outputId": "1d1200a7-39d1-47b5-eb25-87d93d3af5c6" - }, - "execution_count": 12, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " labelCount relTypeCount propertyKeyCount nodeCount relCount \\\n", - "0 6 7 21 4319 16941 \n", - "\n", - " labels \\\n", - "0 {'Character': 3653, 'Book': 20, 'Show': 11, 'C... \n", - "\n", - " relTypes \\\n", - "0 {'(:Character)-[:PREDECESSOR]->()': 307, '()-[... \n", - "\n", - " relTypesCount \\\n", - "0 {'APPEARED_IN_BOOK': 7936, 'FATHER': 960, 'MOT... \n", - "\n", - " stats \n", - "0 {'relTypeCount': 7, 'propertyKeyCount': 21, 'l... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labelCountrelTypeCountpropertyKeyCountnodeCountrelCountlabelsrelTypesrelTypesCountstats
06721431916941{'Character': 3653, 'Book': 20, 'Show': 11, 'C...{'(:Character)-[:PREDECESSOR]->()': 307, '()-[...{'APPEARED_IN_BOOK': 7936, 'FATHER': 960, 'MOT...{'relTypeCount': 7, 'propertyKeyCount': 21, 'l...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 12 - } + "text/plain": [ + " labelCount relTypeCount propertyKeyCount nodeCount relCount \\\n", + "0 7 7 86 4319 16941 \n", + "\n", + " labels \\\n", + "0 {'Character': 3653, 'Book': 20, 'Show': 11, 'C... \n", + "\n", + " relTypes \\\n", + "0 {'(:Character)-[:PREDECESSOR]->()': 307, '()-[... \n", + "\n", + " relTypesCount \\\n", + "0 {'APPEARED_IN_BOOK': 7936, 'FATHER': 960, 'MOT... \n", + "\n", + " stats \n", + "0 {'relTypeCount': 7, 'propertyKeyCount': 86, 'l... " ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "CALL apoc.meta.stats()\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "YFaql70MHzTh", + "outputId": "528e46d4-cfc3-4ef0-bf0c-0c33a397640d" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "gds.run_cypher(\n", - " \"\"\"\n", - "MATCH (n)\n", - "RETURN labels(n)[0] AS label, count(*) AS count\n", - "\"\"\"\n", - ")" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelcount
0Character3653
1Faction563
2Culture72
3Book20
4Show11
\n", + "
" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "YFaql70MHzTh", - "outputId": "528e46d4-cfc3-4ef0-bf0c-0c33a397640d" - }, - "execution_count": 13, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " label count\n", - "0 Character 3653\n", - "1 Faction 563\n", - "2 Culture 72\n", - "3 Book 20\n", - "4 Show 11" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labelcount
0Character3653
1Faction563
2Culture72
3Book20
4Show11
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 13 - } + "text/plain": [ + " label count\n", + "0 Character 3653\n", + "1 Faction 563\n", + "2 Culture 72\n", + "3 Book 20\n", + "4 Show 11" ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "QLkzlxtLH7b_" - }, - "execution_count": null, - "outputs": [] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "gds.run_cypher(\n", + " \"\"\"\n", + "MATCH (n)\n", + "RETURN labels(n)[0] AS label, count(*) AS count\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QLkzlxtLH7b_" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}