Meetup Analytics with R and Neo4j

Exploring London NoSQL meetups
using R
Mark Needham
@markhneedham

Lots of bits of data
● Events
● Members
● Groups
● RSVPs
● Venues
● Topics

Interesting questions to ask...

Interesting questions to ask...
● What day of the week do people go to meetups?
● Whereabouts in London are NoSQL meetups held?
● Do people sign up for multiple meetups on the same
day?
● Are there common members between groups?
● What topics are people most interested in?
● In which order do people join the NoSQL groups?
● Who are the most connected people on the NoSQL
scene?

The tool set
RNeo4j
Results as a data frame
Query
dplyr
ggplot2
igraph ggmap
cluster
geosphere

When do people go to meetups?
(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-()

When do people go to meetups?
// Count 'yes' RSVPs for each event, returned alongside the hosting group's name.
MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-()
// Keep only events whose offset-adjusted time is earlier than the current timestamp.
WHERE (event.time + event.utc_offset) < timestamp()
RETURN g.name,
event.time + event.utc_offset AS eventTime,
event.announced_at AS announcedAt,
event.name,
// The non-aggregated columns above form the grouping key for COUNT(*).
COUNT(*) AS rsvps

R Neo4j
# One-time setup: install devtools, then install RNeo4j from GitHub.
install.packages("devtools")
devtools::install_github("nicolewhite/Rneo4j")
library(RNeo4j)
# Handle for the Neo4j server reachable at this local URL.
graph = startGraph("http://localhost:7474/db/data/")
# Placeholder: substitute the full text of the Cypher query to run.
query = "MATCH … RETURN …"
# Run the query against the graph; the deck's tool-set slide describes the
# result as a data frame.
cypher(graph, query)

Grouping events by month
library(dplyr)
# Per-month summary: number of events, total RSVPs, largest single event and
# average RSVPs per event, listed from highest to lowest average.
events %>%
  group_by(month) %>%
  summarise(events = n(), count = sum(rsvps), max = max(rsvps)) %>%
  mutate(ave = count / events) %>%
  arrange(desc(ave))

Grouping events by month
## month events count ave
## 1 November 55 3018 54.87273
## 2 May 52 2676 51.46154
## 3 April 58 2964 51.10345
## 4 June 47 2384 50.72340
## 5 October 71 3566 50.22535
## 6 September 59 2860 48.47458
## 7 February 43 2047 47.60465
## 8 January 34 1592 46.82353
## 9 December 24 1056 44.00000
## 10 March 39 1667 42.74359
## 11 July 48 1866 38.87500
## 12 August 34 1023 30.08824

Grouping events by day
# Per-day summary: number of events, total RSVPs, largest single event and
# average RSVPs per event, with rows ordered by day.
# NOTE(review): the bar-chart code reads a `byDay` data frame; presumably it is
# this result saved under that name — confirm.
events %>%
  group_by(day) %>%
  summarise(events = n(), count = sum(rsvps), max = max(rsvps)) %>%
  mutate(ave = count / events) %>%
  arrange(day)

Grouping events by day
## day events count ave
## 1 Monday 63 4034 64.03175
## 2 Tuesday 151 6696 44.34437
## 3 Wednesday 225 9481 42.13778
## 4 Thursday 104 5394 51.86538
## 5 Friday 11 378 34.36364
## 6 Saturday 10 736 73.60000

Some simple bar charts
library(ggplot2)
# grid.arrange() comes from gridExtra, which must be attached for the last
# line to run.
library(gridExtra)
# Average RSVPs per event for each day.
# NOTE(review): byDay is assumed to hold one row per day with `ave` and
# `count` columns — confirm against where it is built.
g1 = ggplot(aes(x = day, y = ave), data = byDay) +
  geom_bar(stat = "identity", fill = "dark blue") +
  ggtitle("Average attendees by day")
# Total RSVPs for each day. The geom layer is required: a ggplot object with
# no geom draws only empty axes and the title.
g2 = ggplot(aes(x = day, y = count), data = byDay) +
  geom_bar(stat = "identity", fill = "dark blue") +
  ggtitle("Total attendees by day")
# Stack the two charts vertically in a single figure.
grid.arrange(g1, g2, ncol = 1)

Where do people go to meetups?
(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-(),
(event)-[:HELD_AT]->(venue)

// Count 'yes' RSVPs for each event, this time also returning the venue the
// event was held at (name and coordinates).
MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-(), (event)-[:HELD_AT]->(venue)
// Keep only events whose offset-adjusted time is earlier than the current timestamp.
WHERE (event.time + event.utc_offset) < timestamp()
RETURN g.name,
event.time + event.utc_offset AS eventTime,
event.announced_at AS announcedAt,
event.name,
venue.name AS venue,
venue.lat AS lat,
venue.lon AS lon,
// The non-aggregated columns above form the grouping key for COUNT(*).
COUNT(*) AS rsvps

# Number of rows in `events` for each distinct (lat, lon, venue) combination,
# largest first; the tally column `n` is renamed to `count`.
byVenue = events %>%
  count(lat, lon, venue) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  rename(count = n)

## lat lon venue count
## 1 51.50256 -0.019379 Skyline Bar at CCT Venues Plus 1
## 2 51.53373 -0.122340 The Guardian 1
## 3 51.51289 -0.067163 Erlang Solutions 3
## 4 51.49146 -0.219424 Novotel - W6 8DR 1
## 5 51.49311 -0.146531 Google HQ 1
## 6 51.52655 -0.084219 Look Mum No Hands! 22
## 7 51.51976 -0.097270 Vibrant Media, 3rd Floor 1
## 8 51.52303 -0.085178 Mind Candy HQ 2
## 9 51.51786 -0.109260 ThoughtWorks UK Office 2
## 10 51.51575 -0.097978 BT Centre 1

library(ggmap)
# Fetch a base map for London.
map = get_map(location = 'London', zoom = 12)
# Overlay one semi-transparent red point per venue, sized by its `count`.
ggmap(map) +
  geom_point(aes(x = lon, y = lat, size = count),
             data = byVenue, col = "red", alpha = 0.8)

library(geosphere)
library(cluster)
# Number of clusters to cut the hierarchy into.
clusteramounts = 40
# Pairwise distance matrix between venues; columns are passed as (lon, lat).
distance.matrix = byVenue %>% select(lon, lat) %>% distm
# diss = TRUE marks the input as an already-computed dissimilarity matrix.
# Spelled out as TRUE because T is an ordinary variable that can be reassigned.
clustersx <- as.hclust(agnes(distance.matrix, diss = TRUE))
# Label every venue with the id of the cluster it falls into.
byVenue$group <- cutree(clustersx, k = clusteramounts)
# One row per cluster: centroid coordinates, total event count and the
# comma-joined names of its venues.
byVenueClustered = byVenue %>%
  group_by(group) %>%
  summarise(meanLat = mean(lat),
            meanLon = mean(lon),
            total = sum(count),
            venues = paste(venue, collapse = ","))
Spatial clustering

## group meanLat meanLon total
## 1 3 51.52349 -0.08506461 123
## 2 1 51.52443 -0.09919280 89
## 3 2 51.50547 -0.10325925 62
## 4 4 51.50794 -0.12714600 55
## 5 8 51.51671 -0.10028908 19
## 6 6 51.53655 -0.13798514 18
## 7 7 51.52159 -0.10934720 18
## 8 5 51.51155 -0.07004417 13
## 9 12 51.51459 -0.12314650 13
## 10 14 51.52129 -0.07588867 10
Spatial clustering

# Plot one semi-transparent red point per cluster at its mean coordinates,
# sized by the cluster's `total`.
ggmap(map) +
  geom_point(aes(x = meanLon, y = meanLat, size = total),
             data = byVenueClustered, col = "red", alpha = 0.8)
Spatial clustering

# Venues whose cluster id equals that of the first row of byVenueClustered.
byVenue %>% filter(group == byVenueClustered$group[1])
What’s going on in Shoreditch?

Meetup Group Member Overlap
● Why would we want to know this?
○ Perhaps for joint meetups
○ Topics for future meetups

Extracting the data
// For every ordered pair of distinct groups, count the members they share.
MATCH (group1:Group), (group2:Group)
WHERE group1 <> group2
// OPTIONAL so that pairs with no shared members are still returned.
OPTIONAL MATCH p =
(group1)<-[:MEMBER_OF]-()-[:MEMBER_OF]->(group2)
// One collected path per shared member.
WITH group1, group2, COLLECT(p) AS paths
RETURN group1.name, group2.name,
LENGTH(paths) as commonMembers
ORDER BY group1.name, group2.name

// For every ordered pair of distinct groups, report the percentage of
// group1's members who also belong to group2.
MATCH (group1:Group), (group2:Group)
WHERE group1 <> group2
OPTIONAL MATCH (group1)<-[:MEMBER_OF]-(member)
WITH group1, group2, COLLECT(member) AS group1Members
// Remember the size of group1 before expanding its members back into rows.
WITH group1, group2, group1Members,
     LENGTH(group1Members) AS numberOfGroup1Members
UNWIND group1Members AS member
// OPTIONAL keeps group1 members who are not in group2; their path is null.
OPTIONAL MATCH path = (member)-[:MEMBER_OF]->(group2)
WITH group1, group2, COLLECT(path) AS paths, numberOfGroup1Members
WITH group1, group2, LENGTH(paths) as commonMembers, numberOfGroup1Members
RETURN group1.name, group2.name,
       toInt(round(100.0 * commonMembers / numberOfGroup1Members)) AS percentage
// Secondary sort key is group2.name; repeating group1.name would leave the
// rows within each group1 in no defined order.
ORDER BY group1.name, group2.name
Finding overlap as a percentage

How many groups are people part of?
// Count outgoing MEMBER_OF relationships per profile, highest count first.
MATCH (p:MeetupProfile)-[:MEMBER_OF]->()
// ID(p) is the grouping key for COUNT(*).
RETURN ID(p), COUNT(*) AS groups
ORDER BY groups DESC

How many groups are people part of?
# Bar chart of how many people belong to 1, 2, 3, ... groups.
# count(groups) tallies the rows of group_count for each distinct `groups`
# value into column `n`.
ggplot(aes(x = groups, y = n),
       data = group_count %>% count(groups)) +
  # A geom layer is required: without one only empty axes are drawn.
  geom_bar(stat = "identity", fill = "dark blue") +
  # Square-root y scale applied to the bar heights.
  scale_y_sqrt() +
  # One axis break for every value between the smallest and largest count.
  scale_x_continuous(
    breaks = round(seq(min(group_count$groups),
                       max(group_count$groups), by = 1), 1)) +
  ggtitle("Number of groups people are members of")

Who’s the most connected?
● i.e. the person who had the chance to meet
the most people in the community
● Betweenness Centrality
● Page Rank

Betweenness Centrality
Calculates the number of shortest paths that go
through a particular node

library(igraph)
# Vertices: every profile with at least one 'yes' RSVP to an event.
nodes_query = "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event)
RETURN DISTINCT ID(p) AS id, p.id AS name, p.name AS fullName"
nodes = cypher(graph, nodes_query)
# Edges: pairs of profiles that said 'yes' to the same event, weighted by the
# number of matches found for the pair.
edges_query = "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event),
(event)<-[:TO]-({response:'yes'})<-[:RSVPD]-(other)
RETURN ID(p) AS source, ID(other) AS target, COUNT(*) AS weight"
edges = cypher(graph, edges_query)
# Build a directed igraph from the edge list, with `nodes` as vertex metadata.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
g = graph.data.frame(edges, directed = TRUE, nodes)
# Betweenness score per vertex, reshaped into an (id, score) data frame.
bwGraph = betweenness(g)
bwDf = data.frame(id = names(bwGraph), score = bwGraph)

# Five highest betweenness scores, shown with ids only.
bwDf %>%
  arrange(desc(score)) %>%
  head(5)
# The same ranking after joining scores to `nodes` (nodes$name matched against
# bwDf$id), so the other node columns are shown too.
merge(nodes, bwDf, by.x = "name", by.y = "id") %>%
  arrange(desc(score)) %>%
  head(5)

Page Rank
PageRank works by counting the number and quality of
links to a page to determine a rough estimate of how
important the website is.
The underlying assumption is that more important websites
are likely to receive more links from other websites.

Page Rank
PageRank works by counting the number and quality of
links to a person to determine a rough estimate of how
important the person is.
The underlying assumption is that more important people
are likely to receive more links from other people.

Page Rank
# PageRank score per vertex, taken from the `vector` element of the result.
pr = page.rank(g)$vector
prDf = data.frame(name = names(pr), rank = pr)
# Join scores to the node details on the shared `name` column and show the
# ten highest-ranked rows.
data.frame(merge(nodes, prDf, by = "name")) %>%
  arrange(desc(rank)) %>%
  head(10)

Blending back into the graph
# Store each profile's betweenness score on its node, committing in batches.
query = "MATCH (p:MeetupProfile {id: {id}}) SET p.betweenness = {score}"
tx = newTransaction(graph)
# seq_len() gives an empty loop when bwDf has no rows; 1:nrow() would
# iterate over c(1, 0).
for (i in seq_len(nrow(bwDf))) {
  if (i %% 1000 == 0) {
    commit(tx)
    print(paste("Batch", i / 1000, "committed."))
    # A committed transaction is finished; open a new one for the next batch.
    tx = newTransaction(graph)
  }
  id = bwDf[i, "id"]
  score = bwDf[i, "score"]
  appendCypher(tx, query, id = id, score = as.double(score))
}
# Commit whatever is left in the final, partially filled batch.
commit(tx)

Blending back into the graph
# Store each profile's PageRank score on its node, committing in batches.
query = "MATCH (p:MeetupProfile {id: {id}}) SET p.pageRank = {score}"
# Start from a new transaction rather than reusing one that may already have
# been committed.
tx = newTransaction(graph)
# seq_len() gives an empty loop when prDf has no rows; 1:nrow() would
# iterate over c(1, 0).
for (i in seq_len(nrow(prDf))) {
  if (i %% 1000 == 0) {
    commit(tx)
    print(paste("Batch", i / 1000, "committed."))
    # A committed transaction is finished; open a new one for the next batch.
    tx = newTransaction(graph)
  }
  name = prDf[i, "name"]
  rank = prDf[i, "rank"]
  appendCypher(tx, query, id = name, score = as.double(rank))
}
# Commit whatever is left in the final, partially filled batch.
commit(tx)

Are they in the Neo4j group?
// Take the 20 profiles with the highest pageRank and flag which of them are
// members of the Neo4j London user group.
MATCH (p:MeetupProfile)
WITH p
ORDER BY p.pageRank DESC
LIMIT 20
// OPTIONAL so that non-members are still returned, with m left null.
OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group)
// The group node is bound as g; `group` is not a bound identifier here.
WHERE g.name = "Neo4j - London User Group"
RETURN p.name, p.id, p.pageRank, NOT m is null AS isMember

Are they in the Neo4j group?
blended_data = cypher(graph, query)

Have they been to any events?
// For the 20 profiles with the highest pageRank, report membership of the
// Neo4j London user group and how many of its events they said 'yes' to.
MATCH (p:MeetupProfile)
WITH p
// Rank before limiting; without this, LIMIT 20 picks 20 arbitrary profiles.
ORDER BY p.pageRank DESC
LIMIT 20
// OPTIONAL so that non-members are still returned, with m and g left null.
OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group)
WHERE g.name = "Neo4j - London User Group"
WITH p, NOT m is null AS isMember, g
// RSVPD is given its profile-to-RSVP direction, as in the deck's other queries.
OPTIONAL MATCH event= (p)-[:RSVPD]->({response:'yes'})-[:TO]->()<-[:HOSTED_EVENT]-(g)
WITH p, isMember, COLLECT(event) as events
RETURN p.name, p.id, p.pageRank, isMember, LENGTH(events) AS events

Have they been to any events?
blended_data = cypher(graph, query)

Take Aways
● ggplot => visualisations with minimal code
● dplyr => easy data manipulation for
people from other languages
● igraph => find the influencers in a network
● graphs => flexible way of modelling data
that allows querying across multiple
dimensions

http://github.com/mneedham/neo4j-meetup
Get the code

Meetup Analytics with R and Neo4j

Recommended

Recommended

More Related Content

What's hot

What's hot (13)

Viewers also liked

Viewers also liked (20)

Similar to Meetup Analytics with R and Neo4j

Similar to Meetup Analytics with R and Neo4j (20)

More from Neo4j

More from Neo4j (20)

Recently uploaded

Recently uploaded (20)

Meetup Analytics with R and Neo4j