-
Notifications
You must be signed in to change notification settings - Fork 9
/
odsc_west_2021_network_analysis.Rmd
1831 lines (1394 loc) · 70.3 KB
/
odsc_west_2021_network_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "ODSC West 2021: Network Analysis"
author: "Clinton Brownley"
date: "11/17/2021"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Install packages
Install [igraph](https://igraph.org/r/) and some additional packages to process and plot the network data.
```{r install_packages, message=FALSE}
# ,echo=FALSE, include=FALSE
# install.packages("statnet")
if (!require("pacman")) install.packages("pacman")
pacman::p_load("ape",
"d3r",
"ergm",
"ggplot2",
"jsonlite",
"tidyverse",
"igraph",
"lubridate",
"purrr",
"RColorBrewer",
"sand",
"sqldf",
"wrapr")
```
# Load packages
Load [igraph](https://igraph.org/r/) and some additional packages to process and plot the network data.
```{r load_packages, message=FALSE}
# ,echo=FALSE, include=FALSE
library("ape")
library("d3r")
library("ergm")
library("ggplot2")
library("jsonlite")
library("dplyr")
library("igraph")
library("lubridate")
library("purrr")
library("RColorBrewer")
library("readr")
library("sand")
library("sqldf")
library("wrapr")
```
# Network Analysis
![Lord of the Rings](images/lord_of_the_rings.jpeg)
## Read Data
### Ontology
Read CSV data from the `morethanbooks` [Lord of the Rings Networks](https://github.com/morethanbooks/projects/tree/master/LotR) repository into a [`tibble`](https://tibble.tidyverse.org/).
`ontology.csv` contains the basic metadata about each entity (i.e. proper names used to reference characters, places, or groups) together with its identifier (e.g. the identifier for Aragorn is "arag").
```{r ontology}
ontology = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/ontologies/ontology.csv"), sep = "\t"))
names(ontology) <- c("id", "type", "label", "freqsum", "subtype", "gender")
head(ontology)
```
### Books 1, 2, 3 Combined
Read CSV data from the `morethanbooks` [Lord of the Rings Networks](https://github.com/morethanbooks/projects/tree/master/LotR) repository into a [`tibble`](https://tibble.tidyverse.org/).
`networks-id-3books.csv` contains an edges table with the number of times two entities are mentioned in the same paragraph across all three books of the series.
In this project, the nodes represent entities (i.e. proper names used to reference characters, places, or groups), and two of them are connected by an edge if in any paragraph there are references to these two entities.
Across the three books, Frodo and Sam are referenced in the same paragraph most frequently (533 paragraphs), and Frodo and Gandalf are referenced in the second most number of paragraphs(181 paragraphs).
```{r books123}
books123 = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/tables/networks-id-3books.csv"), sep = ","))
books123 <- books123 %>%
dplyr::select("IdSource", "IdTarget", "Weight", "Type") %>%
dplyr::mutate("Type" = "Books 123",
"Weight" = as.double(Weight))
names(books123) <- c("source", "target", "weight", "volume")
head(books123)
```
### Book 1
Read CSV data from the `morethanbooks` [Lord of the Rings Networks](https://github.com/morethanbooks/projects/tree/master/LotR) repository into a [`tibble`](https://tibble.tidyverse.org/).
`networks-id-volume1.csv` contains an edges table with the number of times two entities are mentioned in the same paragraph in [**The Fellowship of the Ring**](https://en.wikipedia.org/wiki/The_Lord_of_the_Rings), published on July 29, 1954.
```{r book1}
book1 = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/tables/networks-id-volume1.csv"), sep = ","))
book1 <- book1 %>%
dplyr::select("IdSource", "IdTarget", "Weight", "Type") %>%
dplyr::mutate("Type" = "Book 1",
"Weight" = as.double(Weight))
names(book1) <- c("source", "target", "weight", "volume")
book1 <- book1 %>% mutate("title" = "The Fellowship of the Ring",
"publication_date" = lubridate::ymd("1954-07-29"))
head(book1)
```
### Book 2
Read CSV data from the `morethanbooks` [Lord of the Rings Networks](https://github.com/morethanbooks/projects/tree/master/LotR) repository into a [`tibble`](https://tibble.tidyverse.org/).
`networks-id-volume2.csv` contains an edges table with the number of times two entities are mentioned in the same paragraph in [**The Two Towers**](https://en.wikipedia.org/wiki/The_Lord_of_the_Rings), published on November 11, 1954.
```{r book2}
book2 = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/tables/networks-id-volume2.csv"), sep = ","))
book2 <- book2 %>%
dplyr::select("IdSource", "IdTarget", "Weight", "Type") %>%
dplyr::mutate("Type" = "Book 2",
"Weight" = as.double(Weight))
names(book2) <- c("source", "target", "weight", "volume")
book2 <- book2 %>% mutate("title" = "The Two Towers",
"publication_date" = lubridate::ymd("1954-11-11"))
head(book2)
```
### Book 3
Read CSV data from the `morethanbooks` [Lord of the Rings Networks](https://github.com/morethanbooks/projects/tree/master/LotR) repository into a [`tibble`](https://tibble.tidyverse.org/).
`networks-id-volume3.csv` contains an edges table with the number of times two entities are mentioned in the same paragraph in [**The Return of the King**](https://en.wikipedia.org/wiki/The_Lord_of_the_Rings), published on October 20, 1955.
```{r book3}
book3 = tibble(read.csv(url("https://raw.githubusercontent.com/morethanbooks/projects/master/LotR/tables/networks-id-volume3.csv"), sep = ","))
book3 <- book3 %>%
dplyr::select("Source", "Target", "Weight", "Type") %>%
dplyr::mutate("Type" = "Book 3",
"Weight" = as.double(Weight))
names(book3) <- c("source", "target", "weight", "volume")
book3 <- book3 %>% mutate("title" = "The Return of the King",
"publication_date" = lubridate::ymd("1955-10-20"))
head(book3)
```
## Create a DataFrame from the `books123` edgelist for an undirected graph
We can use `sqldf` to create a `R` data frame that combines the edges data from `books123` and the metadata about the entities from `ontology`. The result is a data frame with all of the information we have about the paragraph references to pairs of entities across all three books.
We have undirected data, i.e. a particular person can appear as the source or the target and we're not interested in the distinction (direction), so we can make it easier to get the ego network (the network that is "visible" to a certain person) by duplicating each row with its inverse. This way, we can group by a single column to get the ego network.
For simplicity in this early cell, and because this processing isn't actually needed for igraph, we don't perform this processing in this cell; however, we do perform it in the next cell to illustrate the difference between these methods.
Note that in the data frame row filtering shown in this cell we have to search for `Frodo` both as a `source` or as a `target` to select all of the rows (edges) associated with Frodo's ego network. In the next cell, we will see that conducting the processing in the SQL query will make it easier for us to identify character ego networks.
```{r g_df}
g_df <- sqldf::sqldf("
SELECT
sour.id AS source_id, sour.label as source_name, sour.type AS source_type, sour.subtype AS source_subtype, sour.gender AS source_gender,
dest.id AS target_id, dest.label AS target_name, dest.type AS target_type, dest.subtype AS target_subtype, dest.gender AS target_gender,
conn.weight, conn.volume
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id"
)
g_df %>%
dplyr::filter(source_name == "Frodo" | target_name == "Frodo", source_type == "per", target_type == "per") %>%
dplyr::arrange(source_id, desc(weight)) %>%
head(10)
```
```{r g_df_shape}
# Number of references (edges) between Frodo and other people across the three books: 38
g_df %>%
dplyr::filter(source_name == "Frodo" | target_name == "Frodo", source_type == "per", target_type == "per") %>%
dim()
```
## Create a DataFrame with bidirectional edgelists for an undirected graph
We have undirected data, so to make it easier to work with we're going to duplicate each row with its inverse. This way, we can group by a single column to get the ego network (the network that is "visible" to a certain person).
In contrast to the previous cell, we perform this processing step in the SQL query here to illustrate the difference between these methods.
Note that in the data frame row filtering shown in this cell we have to search for `Frodo` only in the `source` column to select all of the rows (edges) associated with Frodo's ego network.
Again, this processing isn't actually needed for igraph, we're doing it for manual manipulation of the graph.
```{r network_df}
network_df <- sqldf::sqldf("
SELECT
sour.id AS source_id, sour.label as source_name, sour.type AS source_type, sour.subtype AS source_subtype, sour.gender AS source_gender,
dest.id AS target_id, dest.label AS target_name, dest.type AS target_type, dest.subtype AS target_subtype, dest.gender AS target_gender,
conn.weight, conn.volume
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id
UNION
SELECT
dest.id AS source_id, dest.label as source_name, dest.type AS source_type, dest.subtype AS source_subtype, dest.gender AS source_gender,
sour.id AS target_id, sour.label AS target_name, sour.type AS target_type, sour.subtype AS target_subtype, sour.gender AS target_gender,
conn.weight, conn.volume
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id"
)
network_df %>%
dplyr::filter(source_name == "Frodo", source_type == "per", target_type == "per") %>%
dplyr::arrange(source_id, desc(weight)) %>%
head(10)
```
```{r network_df_shape}
# Number of references (edges) between Frodo and other people across the three books: 38
network_df %>%
dplyr::filter(source_name == "Frodo", source_type == "per", target_type == "per") %>%
dim()
```
Filter for pairs of entities that were mentioned in the same paragraph at least twice and then sort by weight and source_id
``` {r weight_gte2}
# Filter for edges with weights >= 2
network_df %>%
dplyr::filter(weight >= 2) %>%
dplyr::arrange(weight, source_id) %>%
head(10)
```
## Check that the same number of source and target nodes were created
``` {r sources_targets_count}
c( length(unique(network_df$source_id)), length(unique(network_df$target_id)) )
```
## Count the number of nodes that are: group, person, place, thing
Here, we are interested in the people, so when we create our network graph we will only include the `per` nodes and edges, i.e. the ones associated with people.
``` {r source_type_count}
network_df %>% group_by(source_type) %>% count()
```
# Create a network graph of people with edge weights greater than 20
[igraph](https://igraph.org/r/) has many functions for [reading and writing graphs](https://igraph.org/r/html/latest/) and [converting to and from other data formats](https://igraph.org/r/html/latest/). We can create a network graph `G` from a `R` data frame using igraph's [`graph_from_data_frame`](https://igraph.org/r/html/latest/graph_from_data_frame.html) function.
A graph `G = (V, E)` is mathematical structure with a set `V` of elements, called vertices or nodes, and a set `E` of pairs of nodes, called edges or links. A network can be `undirected` or `directed`. In addition, a network can be `unweighted` or `weighted`.
To convert the edgelist data in the data frame into a graph `G` using `graph_from_data_frame`, the data frame must contain an edge list representation of a graph, i.e. two columns of node names and zero or more columns of edge attributes. Each row will be processed as one edge instance.
`d` is a data frame containing a symbolic edge list in the first two columns. Additional columns are considered as edge attributes. Since version 0.7 this argument is coerced to a data frame with as.data.frame..
`directed` is a logical scalar, whether or not to create a directed graph.
`vertices` is a data frame with vertex metadata, or NULL. Since version 0.7 this argument is coerced to a data frame with as.data.frame, if not NULL.
Given our interest in characters, we restrict the graph to references about people. In addition, we further restrict our attention to pairs of people who are referenced together more than 20 times across all the books to reduce the number of edges such that, for readability, the edge count is not more than four times the node count [[Melancon 06](https://hal-lirmm.ccsd.cnrs.fr/lirmm-00091354/document)].
``` {r create_graph}
my_edges <- sqldf::sqldf("
SELECT
sour.label as source_name, sour.type as source_type,
dest.label AS target_name, dest.type AS target_type,
conn.weight
FROM
books123 conn
JOIN ontology sour
ON
conn.source = sour.id
JOIN ontology dest
ON
conn.target = dest.id"
) %>%
dplyr::filter(source_type == "per", target_type == "per", weight > 20) %>%
dplyr::select(source_name, target_name, weight) %>%
dplyr::rename(from = source_name, to = target_name)
my_nodes <- network_df %>%
dplyr::filter(source_type == "per", target_type == "per", weight > 20) %>%
dplyr::select(source_name, source_type, source_subtype, source_gender, volume) %>%
dplyr::rename(name = source_name, type = source_type, subtype = source_subtype, gender = source_gender) %>%
dplyr::distinct()
G <- graph_from_data_frame(my_edges, directed = FALSE, vertices = my_nodes)
#print_all(G)
G
```
# Plot the nodes of the LotR network
```{r G_plot_nodes}
set.seed(10)
igraph_options(vertex.size=6,
vertex.color="blue",
vertex.frame.color="blue",
vertex.label=NA, # V(G)$name,
edge.color=NA)
par(mfrow=c(1,1))
plot(G, layout=layout_randomly) # layout_nicely
title("LotR network (no edges!)")
```
# Plot the nodes and edges of the LotR network
```{r G_plot_nodes_and_edges}
e.size <- 4 * E(G)$weight / max( E(G)$weight, na.rm = TRUE )
set.seed(10)
igraph_options(vertex.size=6,
vertex.color="blue",
vertex.frame.color="blue",
vertex.label=NA, # V(G)$name,
edge.width=e.size,
edge.color="blue")
par(mfrow=c(1,1))
plot(G, layout=layout_randomly) # layout_nicely
title("LotR network")
```
## Number of nodes
Sometimes the number of vertices (also called nodes) in a graph is called the `order` of the graph. We can get the number of nodes in a graph with `gorder(G)`.
``` {r G_number_of_nodes}
# Count the number of nodes
gorder(G)
vcount(G)
```
## List of nodes
```{r G_list_of_nodes}
# List all of the nodes
V(G)
```
## List neighbors of a node
```{r G_neighbors_of_Aragorn}
# List neighbors of node "Aragorn"
neighbors(G, "Aragorn")
```
## Determine the degree of each node in the graph
```{r G_degree}
G_degree <- degree(G)
# Arrange nodes alphabetically
# G_degree[sort(names(G_degree))]
# Arrange nodes in descending order of degree
G_degree[order(G_degree, decreasing = TRUE)]
```
```{r G_degree_distribution}
G_degree <- degree(G)
# Arrange nodes alphabetically
# G_degree[sort(names(G_degree))]
# Arrange nodes in descending order of degree
G_degree_distribution <- G_degree[order(G_degree, decreasing = TRUE)]
# Alternatively: tibble::enframe(G_degree_distribution) # returns columns `name` and `value`
G_degree_distribution <- data.frame(Character=factor(names(G_degree_distribution)),
Degree=G_degree_distribution,
row.names=NULL) %>% arrange(desc(Degree))
ggplot(data = G_degree_distribution, aes(x = reorder(Character, -Degree), y = Degree)) +
geom_col(aes(fill = Degree)) +
scale_color_distiller(palette = "Blues", direction = 1, aesthetics = c("colour", "fill")) +
scale_y_continuous(breaks=seq(0, 20, 2.5), limits = c(0, 20)) +
# ylim(0, 20) +
labs(title = "Degree Distribution of LotR Characters") +
xlab("Character") +
theme_light() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90))
```
## Determine the average degree of the graph
The average degree of a network is
\begin{align}
\langle k \rangle = \frac{\sum_{i=1} k_i}{N}
\end{align}
```{r G_average_degree}
round(mean(degree(G), na.rm = TRUE), 2)
```
```{r G_average_degree_calc}
# The average degree of an undirected network equals 2L / N
round( 2 * length(E(G)) / length(V(G)), 2)
```
## Number of edges
Sometimes the number of edges (also called links) in a graph is called the `size` of the graph. We can get the number of edges in a graph with `ecount(G)`.
``` {r G_number_of_edges}
# Count the number of edges
ecount(G)
```
## List of edges
``` {r G_list_of_edges}
# List all of the edges
E(G)
```
## Determine the density of the graph
The density of a network is the fraction of possible links that actually exist, which is the same as the fraction of pairs of nodes that are actually connected. The average degree of a network is related (directly proportional) to its density. The density is the ratio between the average and maximum degree.
The higher the density, the more likely it is that the network is *connected* (i.e. that you can reach any node from any other node by following a path along links and intermediate nodes).
The density of a network with *N* nodes and *L* links is
\begin{align}
d = \frac{L}{L_{max}}
\end{align}
In an *undirected* network this equals
\begin{align}
d = \frac{L}{L_{max}} = \frac{2L}{N(N-1)}
\end{align}
In a *directed* network this equals
\begin{align}
d = \frac{L}{L_{max}} = \frac{L}{N(N-1)}
\end{align}
```{r G_density}
round(edge_density(G), 2)
```
```{r G_density_calc}
# The density of an undirected network equals 2L / ( N * (N-1) )
# Alternatively: 2 * len(G.edges) / len(G.nodes) / (len(G.nodes)-1)
round( 2 * length(E(G)) / ( length(V(G)) * (length(V(G))-1) ), 2)
```
## Determine the *length* of the shortest path between two specific nodes
```{r G_shortest_path_distance_two_nodes}
length(shortest_paths(G, "Frodo", "Arwen", weights = E(G, directed = FALSE)$weight)$vpath[[1]])-1
```
## Specify the *shortest path* between two specific nodes
```{r G_shortest_path_two_nodes}
shortest_paths(G, "Frodo", "Arwen", weights = E(G, directed = FALSE)$weight)$vpath[[1]]
```
## Determine whether the network is connected
A node `j` is *reachable* from another node `i` if there is a walk from from `i` to `j`. A graph `G` is *connected* if every node is reachable from every other.
```{r G_is_connected}
is_connected(G)
```
## Determine the number and size of the connected components
A *connected component* of a graph `G` is a maximally *connected* (i.e. that you can reach any node from any other node by following a path along links and intermediate nodes) subgraph of `G`.
```{r G_connected_components}
clusters(G)
```
## Determine the average shortest path length of connected components in the graph
A common measure of the distance between nodes in a network is the minimum number of edges needed to traverse from one node to the other. This minimum path is called the *shortest path*, and its length is called the *shortest path length*.
We can then define the average shortest path length for the entire network by averaging the shortest path lengths across all pairs of nodes.
```{r G_average_shortest_path_length}
#round(mean_distance(G), 2)
c( round(mean_distance(induced_subgraph(G, clusters(G)$membership == 1)), 2),
round(mean_distance(induced_subgraph(G, clusters(G)$membership == 2)), 2) )
```
## Determine the diameter of the connected components in the graph
The *diameter* of a network is the maximum shortest path length across all pairs of nodes. That is, the length of the longest shortest path in the network.
```{r G_diameter}
#diameter(G, weights = NA)
c( diameter(induced_subgraph(G, clusters(G)$membership == 1), weights = NA),
diameter(induced_subgraph(G, clusters(G)$membership == 2), weights = NA) )
```
## Determine the average clustering coefficient of the graph
The local clustering of each node in a graph G is the fraction of triangles that actually exist over all possible triangles in its neighborhood. The average clustering coefficient of a graph G is the mean of local clusterings.
```{r G_average_clustering}
# Global average clustering
round(transitivity(G, type="average",
vids = NULL, # V(conn_comp),
weights = NULL,
isolates = "zero"), 3)
```
```{r G_transitivity}
# Global transitivity (weights nodes with large degree higher than average clustering)
round(transitivity(G, type="global",
vids = NULL, # V(conn_comp),
weights = NULL,
isolates = "zero"), 3)
```
## Determine the assortativity of the network based on the *correlation* between degrees of neighbor nodes, as well as a categorical attribute
Nodes (People) in a social network may have a variety of attributes, e.g. age, gender, location, interests, etc. For example, in our LotR network, we have the attributes `gender` and `subtype` for each person.
In social networks, it's often the case that people who are connected to one another tend to share similar attributes. For example, in our LotR network, it may be the case that people who are connected to one another tend to be of the same gender or subtype. This property is called *assortativity*.
*Assortativity* based on `degree` is called degree assortativity, which occurs when high-degree nodes tend to be connected to other high-degree nodes and low-degree nodes tend to be connected to other low-degree nodes. In addition to `degree`, assortativity can also be based on *categorical* or *numeric* attributes.
```{r G_assortativity_degree}
# Assortativity of the network based on the correlation between degrees of neighbor nodes
round(assortativity_degree(G, directed = FALSE), 2)
```
```{r G_assortativity_attribute}
# Assortativity of the network based on subtype, a categorical attribute of the nodes
v_types <- as.numeric(factor(V(G)$subtype))
round(assortativity_nominal(G, types = v_types, directed = FALSE), 2)
```
# Add gender and subtype attributes and colors to the nodes
Determine the unique genders in the dataset
``` {r genders}
genders <- network_df %>%
dplyr::filter(source_type == "per") %>%
dplyr::distinct(source_gender) %>%
dplyr::select(source_gender) %>%
dplyr::arrange(source_gender)
genders <- genders$source_gender
genders
```
``` {r color_palette}
# tableau
color_palette <- brewer.pal(n = 8, name = "Set1")
color_palette
```
Associate the unique genders with colors
``` {r gender_colormap}
# qc source: https://www.r-bloggers.com/2018/03/r-tip-use-named-vectors-to-re-map-values/
# tableau: orange, blue
gender_colormap <- qc( "female" = "#e49444" , "male" = "#5778a4" )
gender_colormap
```
Determine the unique subtypes in the dataset
``` {r subtypes}
subtypes <- network_df %>%
dplyr::filter(source_type == "per") %>%
dplyr::distinct(source_subtype) %>%
dplyr::select(source_subtype) %>%
dplyr::arrange(source_subtype)
subtypes <- subtypes$source_subtype
subtypes
```
Associate the unique subtypes with colors
``` {r subtype_colormap}
# tableau: blue, orange, green, red, purple, brown, pink, cyan
subtype_colormap <- qc( "ainur" = "#5778a4",
"animal" = "#e49444",
"dwarf" = "#6a9f58",
"elves" = "#d1615d",
"ents" = "#a87c9f",
"hobbit" = "#967662",
"men" = "#f1a2a9",
"orcs" = "#17becf" )
subtype_colormap
```
Create a data frame that associates each node (person) with four new attributes for that person
``` {r map_node_attributes}
attrs <- network_df %>%
dplyr::filter(source_type == "per", target_type == "per", weight > 20) %>%
dplyr::select(source_name, source_subtype, source_gender) %>%
dplyr::mutate(subtype_color = subtype_colormap[source_subtype],
gender_color = gender_colormap[source_gender]) %>%
dplyr::distinct()
attrs
```
Assign the attributes to the nodes in the network
``` {r add_node_attributes}
V(G)$subtype_color <- attrs$subtype_color
V(G)$gender_color <- attrs$gender_color
```
## Check that the attributes were added to the nodes
```{r G_check_node_attributes}
# V(G)
vertex.attributes(G)
```
## Check that the undirected edges have weights
```{r G_check_edge_attributes}
# E(G)
edge.attributes(G)
```
# Create a new network graph that only consists of the largest connected component
## Determine the largest connected component and
## Create a new network graph that only consists of the largest connected component
```{r largest_connected_component}
largest_cc <- (clusters(G)$membership == 1)
conn_comp <- induced_subgraph(G, largest_cc)
conn_comp
```
## Confirm that the new network graph (conn_comp) is indeed connected
```{r connected_component}
is_connected(conn_comp)
```
# Analyze characteristics of the largest connected component of the LotR network
## Number of nodes
Sometimes the number of vertices (also called nodes) in a graph is called the `order` of the graph. We can get the number of nodes in a graph with `gorder(G)`.
``` {r cc_number_of_nodes}
# Count the number of nodes
gorder(conn_comp)
vcount(conn_comp)
```
## List of nodes
```{r cc_list_of_nodes}
# List all of the nodes
V(conn_comp)
```
## List neighbors of a node
```{r cc_neighbors_of_Aragorn}
# List neighbors of node "Aragorn"
neighbors(conn_comp, "Aragorn")
```
## Determine the degree of each node in the graph
```{r cc_degree}
cc_degree <- degree(conn_comp)
# Arrange nodes alphabetically
# cc_degree[sort(names(cc_degree))]
# Arrange nodes in descending order of degree
cc_degree[order(cc_degree, decreasing = TRUE)]
```
```{r cc_degree_distribution}
cc_degree <- degree(conn_comp)
# Arrange nodes alphabetically
# cc_degree[sort(names(cc_degree))]
# Arrange nodes in descending order of degree
cc_degree_distribution <- cc_degree[order(cc_degree, decreasing = TRUE)]
# Alternatively: tibble::enframe(cc_degree_distribution) # returns columns `name` and `value`
cc_degree_distribution <- data.frame(Character=factor(names(cc_degree_distribution)),
Degree=cc_degree_distribution,
row.names=NULL) %>% arrange(desc(Degree))
ggplot(data = cc_degree_distribution, aes(x = reorder(Character, -Degree), y = Degree)) +
geom_col(aes(fill = Degree)) +
scale_color_distiller(palette = "Blues", direction = 1, aesthetics = c("colour", "fill")) +
scale_y_continuous(breaks=seq(0, 20, 2.5), limits = c(0, 20)) +
# ylim(0, 20) +
labs(title = "Degree Distribution of LotR Characters") +
xlab("Character") +
theme_light() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90))
```
## Determine the average degree of the graph
The average degree of a network is
\begin{align}
\langle k \rangle = \frac{\sum_{i=1} k_i}{N}
\end{align}
```{r cc_average_degree}
round(mean(degree(conn_comp), na.rm = TRUE), 2)
```
```{r cc_average_degree_calc}
# The average degree of an undirected network equals 2L / N
round( 2 * length(E(conn_comp)) / length(V(conn_comp)), 2)
```
## Number of edges
Sometimes the number of edges (also called links) in a graph is called the `size` of the graph. We can get the number of edges in a graph with `ecount(G)`.
``` {r cc_number_of_edges}
# Count the number of edges
ecount(conn_comp)
```
## List of edges
``` {r cc_list_of_edges}
# List all of the edges
E(conn_comp)
```
## Determine the density of the largest connected component
The density of a network is the fraction of possible links that actually exist, which is the same as the fraction of pairs of nodes that are actually connected. The average degree of a network is related (directly proportional) to its density. The density is the ratio between the average and maximum degree.
The higher the density, the more likely it is that the network is *connected* (i.e. that you can reach any node from any other node by following a path along links and intermediate nodes).
The density of a network with *N* nodes and *L* links is
\begin{align}
d = \frac{L}{L_{max}}
\end{align}
In an *undirected* network this equals
\begin{align}
d = \frac{L}{L_{max}} = \frac{2L}{N(N-1)}
\end{align}
In a *directed* network this equals
\begin{align}
d = \frac{L}{L_{max}} = \frac{L}{N(N-1)}
\end{align}
``` {r cc_density}
round(edge_density(conn_comp), 2)
```
```{r cc_density_calc}
# The density of an undirected network equals 2L / ( N * (N-1) )
# Alternatively: 2 * len(conn_comp.edges) / len(conn_comp.nodes) / (len(conn_comp.nodes)-1)
round( 2 * length(E(conn_comp)) / ( length(V(conn_comp)) * (length(V(conn_comp))-1) ), 2)
```
## Determine the *length* of the shortest path between two specific nodes
```{r cc_shortest_path_distance_two_nodes}
length(shortest_paths(conn_comp, "Frodo", "Arwen", weights = E(conn_comp, directed = FALSE)$weight)$vpath[[1]])-1
```
## Specify the *shortest path* between two specific nodes
```{r cc_shortest_path_two_nodes}
shortest_paths(conn_comp, "Frodo", "Arwen", weights = E(conn_comp, directed = FALSE)$weight)$vpath[[1]]
```
## Determine whether the network is connected
A node `j` is *reachable* from another node `i` if there is a walk from from `i` to `j`. A graph `G` is *connected* if every node is reachable from every other.
```{r cc_is_connected}
is_connected(conn_comp)
```
## Determine the number and size of the connected components
A *connected component* of a graph `G` is a maximally *connected* (i.e. that you can reach any node from any other node by following a path along links and intermediate nodes) subgraph of `G`.
```{r cc_connected_components}
# Count the number of connected components
clusters(conn_comp)
```
## Determine the average shortest path length of the largest connected component
A common measure of the distance between nodes in a network is the minimum number of edges needed to traverse from one node to the other. This minimum path is called the *shortest path*, and its length is called the *shortest path length*.
We can then define the average shortest path length for the entire network by averaging the shortest path lengths across all pairs of nodes.
```{r cc_shortest_path_distance}
round(mean_distance(conn_comp), 2)
```
## Determine the diameter of the largest connected component
The *diameter* of a network is the maximum shortest path length across all pairs of nodes. That is, the length of the longest shortest path in the network.
```{r cc_diameter}
diameter(conn_comp, weights = NA)
```
## Determine the eccentricity of the nodes in the largest connected component
The *eccentricity* of a node v is the maximum distance from v to all other nodes in G.
```{r cc_eccentricity}
eccentricity(conn_comp)
```
## Determine the radius of the largest connected component
The *radius* is the minimum eccentricity.
```{r cc_radius}
radius(conn_comp)
```
## Determine the nodes in the center of the largest connected component
The *center* is the set of nodes with eccentricity equal to radius.
```{r cc_center}
in_center <- (eccentricity(conn_comp) == radius(conn_comp))
eccentricity(conn_comp)[in_center]
```
## Determine the nodes in the periphery of the largest connected component
The *periphery* is the set of nodes with eccentricity equal to the diameter.
```{r cc_periphery}
in_periphery <- (eccentricity(conn_comp) == diameter(conn_comp, weights = NA))
eccentricity(conn_comp)[in_periphery]
```
## Determine the average clustering coefficient of the graph
The local clustering of each node in a graph G is the fraction of triangles that actually exist over all possible triangles in its neighborhood. The average clustering coefficient of a graph G is the mean of local clusterings.
```{r cc_average_clustering}
# Global average clustering
round(transitivity(conn_comp, type="average",
vids = NULL, # V(conn_comp),
weights = NULL,
isolates = "zero"), 3)
```
```{r cc_transitivity}
# Global transitivity (weights nodes with large degree higher than average clustering)
round(transitivity(conn_comp, type="global",
vids = NULL, # V(conn_comp),
weights = NULL,
isolates = "zero"), 3)
```
## Determine the assortativity of the connected component based on the *correlation* between degrees of neighbor nodes, as well as a categorical attribute
Nodes (People) in a social network may have a variety of attributes, e.g. age, gender, location, interests, etc. For example, in our LotR network, we have the attributes `gender` and `subtype` for each person.
In social networks, it's often the case that people who are connected to one another tend to share similar attributes. For example, in our LotR network, it may be the case that people who are connected to one another tend to be of the same gender or subtype. This property is called *assortativity*.
*Assortativity* based on `degree` is called degree assortativity, which occurs when high-degree nodes tend to be connected to other high-degree nodes and low-degree nodes tend to be connected to other low-degree nodes. In addition to `degree`, assortativity can also be based on *categorical* or *numeric* attributes.
```{r cc_assortativity_degree}
# Assortativity of the connected component based on the correlation between degrees of neighbor nodes
round(assortativity_degree(conn_comp, directed = FALSE), 2)
```
```{r cc_assortativity_attribute}
# Assortativity of the connected component based on subtype, a categorical attribute of the nodes
v_types <- as.numeric(factor(V(conn_comp)$subtype))
round(assortativity_nominal(conn_comp, types = v_types, directed = FALSE), 2)
```
# Analyze the nodes of the largest connected component of the LotR network
With social networks, people are often interested in how "*important*" a particular person is in the network. There are many measures of "importance".
For instance, in basketball, we might use the number of NBA championships, MVP awards, triple-doubles, points per game, three-pointers, rebounds, assists, etc. to assess "*importance*".
Similar to these sports statistics, there are many measures of "*importance*" in social networks:
- Who has the most connections?
- Who is the 'closest' to others in the network?
- Who is most often 'between' other pairs of people?
- Who is connected to other 'important' people? That is, who has the highest prestige, or rank?
The point isn't that one of these measures is more important than the others, but rather that each one:
- Captures useful information in a succinct manner
- Encapsulates different information
As Matthew Jackson notes in [The Human Network](https://web.stanford.edu/~jacksonm/books.html), "Our lives would be simpler if measuring something could always be boiled down to a single statistic. But part of what makes our lives so interesting is that such unidimensional rankings are generally impossible for many of the things that are most important to understand: lists of rankings end up being both controversial and intriguing".
## Degree centrality
In social networks, the number of connections (relationships) a person has is called the person's "degree". The associated measure of how central the person is within the network is called the person's "degree centrality". In [A First Course in Network Science](https://cambridgeuniversitypress.github.io/FirstCourseNetworkScience/), Menczer, Fortunato, and Davis refer to such high-degree nodes as *hubs*.
People with higher "degree centrality" have a disproportionate presence and influence in the network (e.g. through the phenomenon known as the "[friendship paradox](https://en.wikipedia.org/wiki/Friendship_paradox)" -- that most people have fewer friends than their friends have, on average).
In `igraph`, you can set the `normalized` argument equal to TRUE to normalize the degree. If TRUE then the result is divided by n-1, where n is the number of vertices in the graph.
```{r cc_degree_centrality}
V(conn_comp)$degree_centrality <- degree(conn_comp,
normalized = TRUE)
as_tibble(vertex.attributes(conn_comp)[c("name", "degree_centrality")]) %>%
mutate_if(is.numeric, round, 4) %>%
arrange(desc(degree_centrality))
```
```{r cc_degree_centrality_plot}
cc_degree_centrality <- degree(conn_comp,
normalized = TRUE)
# Arrange nodes alphabetically
# cc_degree_centrality[sort(names(cc_degree_centrality))]
# Arrange nodes in descending order of degree centrality
cc_degree_centralities <- cc_degree_centrality[order(cc_degree_centrality, decreasing = TRUE)]
# Alternatively: tibble::enframe(cc_degree_centrality) # returns columns `name` and `value`
cc_degree_centralities <- data.frame(Character=factor(names(cc_degree_centrality)),
Centrality=cc_degree_centrality,
row.names=NULL) %>% arrange(desc(Centrality))
ggplot(data = cc_degree_centralities, aes(x = reorder(Character, -Centrality),
y = Centrality)) +
geom_col(aes(fill = Centrality)) +
scale_color_distiller(palette = "Greens",
direction = 1,
aesthetics = c("colour", "fill")) +
scale_y_continuous(breaks=seq(0, 0.7, 0.1)) +
# ylim(0, 1) +
labs(title = "Degree Centralities of LotR Characters") +
xlab("Character") +
theme_light() +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),