diff --git a/notebooks/communityDetectionStructures.ipynb b/notebooks/communityDetectionStructures.ipynb new file mode 100644 index 00000000..08673ae9 --- /dev/null +++ b/notebooks/communityDetectionStructures.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import requests\n", + "\n", + "import mercury\n", + "from dotenv import load_dotenv\n", + "\n", + "import networkx as nx\n", + "from netgraph import Graph, InteractiveGraph\n", + "from ipysigma import Sigma\n", + "from pyvis.network import Network\n", + "\n", + "import pandas as pd\n", + "from pandas import json_normalize\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import itertools\n", + "from itertools import combinations" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/mercury+json": { + "allow_download": true, + "code_uid": "App.0.40.25.2-rande2e6d846", + "continuous_update": true, + "description": "", + "full_screen": true, + "model_id": "mercury-app", + "notify": "{}", + "output": "app", + "schedule": "", + "show_code": false, + "show_prompt": false, + "show_sidebar": true, + "static_notebook": true, + "stop_on_error": false, + "title": "Display JSON", + "widget": "App" + }, + "text/html": [ + "

Mercury Application

This output won't appear in the web app." + ], + "text/plain": [ + "mercury.App" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load app for json display\n", + "mercury.App(title=\"Display JSON\", static_notebook=True)\n", + "\n", + "# Load server environment\n", + "load_dotenv(os.path.dirname(sys.path[1]) + '/server/.env')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Elastic search info\n", + "SCANR_API_URL = os.environ.get('SCANR_API_URL')\n", + "SCANR_API_TOKEN = os.environ.get('SCANR_API_TOKEN')\n", + "header = {'Authorization': SCANR_API_TOKEN}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Query json\n", + "json_query = {\n", + " \"size\": 5000,\n", + " \"_source\": [\n", + " \"id\",\n", + " \"authors\",\n", + " \"domains\",\n", + " \"title\",\n", + " \"year\",\n", + " \"isOa\",\n", + " \"type\",\n", + " \"affiliations\",\n", + " \"keywords\",\n", + " \"summary\",\n", + " \"alternativeSummary\"\n", + " ],\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"filter\": [\n", + " {\"terms\": {\"authors.role.keyword\": [\"author\", \"directeurthese\"]}},\n", + " {\"range\": {\"year\": {\"gte\": \"2018\", \"lte\": \"2023\"}}},\n", + " {\"terms\": {\"affiliations.id.keyword\": [\"196012231\"]}},\n", + " ],\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Request answer\n", + "json_answer = requests.post(SCANR_API_URL, json=json_query, headers=header).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display json\n", + "mercury.JSON(json_answer)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of publications : 3510\n" + ] + } + ], + "source": [ + "# Get publications data\n", + "works = list(map(lambda x: x.get(\"_source\"), json_answer.get(\"hits\").get(\"hits\")))\n", + "print(f\"Number of publications : {len(works)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of publications filtered : 3405/3510\n" + ] + } + ], + "source": [ + "# Filter publications\n", + "max_affiliations = 20\n", + "works_filter = list(filter(lambda x: len(x.get(\"affiliations\")) < max_affiliations, works))\n", + "print(f\"Number of publications filtered : {len(works_filter)}/{len(works)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of nodes (structures) found : 978\n" + ] + } + ], + "source": [ + "# Compute nodes (structures)\n", + "\n", + "nodes_dict = {}\n", + "for work in works_filter:\n", + " work_id = work.get(\"id\")\n", + " for affiliation in work.get(\"affiliations\") or {}:\n", + " affiliation_id = affiliation.get(\"id\")\n", + " country = affiliation.get(\"address\")[0].get(\"country\") if (\"address\" in affiliation) else None\n", + " gps = affiliation.get(\"address\")[0].get(\"gps\") if (\"address\" in affiliation) else None\n", + " if affiliation_id and gps and country == \"France\":\n", + " if affiliation_id in nodes_dict:\n", + " nodes_dict[affiliation_id][\"publications\"].append(work_id)\n", + " else:\n", + " nodes_dict[affiliation_id] = {\"id\": affiliation_id, \n", + " \"name\": affiliation.get(\"label\").get(\"en\") or affiliation.get(\"label\").get(\"default\"),\n", + " \"publications\": [work_id],\n", + " \"x\": gps.get(\"lon\"),\n", + " \"y\": gps.get(\"lat\")}\n", + "\n", + "nodes = list(nodes_dict.values())\n", + "print(f\"Number of nodes (structures) found : {len(nodes)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute edges (publications)\n", + "edges = []\n", + "for source, target in combinations(nodes, 2):\n", + " similar_publications = set(source.get(\"publications\")) & set(target.get(\"publications\"))\n", + " if similar_publications:\n", + " edges.append({\"source\":source.get(\"id\"),\n", + " \"target\":target.get(\"id\"),\n", + " \"weight\": len(similar_publications)})" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "# Create graph\n", + "G = nx.Graph()\n", + "\n", + "# Add nodes\n", + "for node in nodes:\n", + " G.add_node(node.get(\"id\"), label=node.get(\"name\"), weight=len(node.get(\"publications\")))\n", + "\n", + "# Add edges\n", + "for edge in edges:\n", + " G.add_edge(edge.get(\"source\"), edge.get(\"target\"), weight=edge.get(\"weight\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph filtered : 96 \n", + "Minimum number of works required: 24\n" + ] + } + ], + "source": [ + "# Filter graph\n", + "max_order = 100\n", + "min_weight = 1\n", + "\n", + "while G.order() > max_order:\n", + " min_weight += 1\n", + " G = G.subgraph([node for node, attrdict in G.nodes.items() if attrdict.get(\"weight\") >= min_weight]) \n", + " # print(f\"Minimum number of works auto computed : {min_weight} (order={G.order()})\")\n", + "\n", + "print(f\"Graph filtered : {len(G.nodes) or 0} \\nMinimum number of works required: {min_weight}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9f6f216486254a0c8e307d1efdbb7143", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sigma(nx.Graph with 96 nodes and 2,019 edges)" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Use sigma widget\n", + "Sigma(G, node_size=G.degree, \n", + " node_metrics={\"community\": \"louvain\"}, \n", + " node_color=\"community\",\n", + " node_border_color_from=\"node\",\n", + " layout=nodes_dict,\n", + " default_edge_type=\"curve\",\n", + " hide_edges_on_move=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "import folium\n", + "\n", + "m = folium.Map()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}