meetup analytics with r and neo4j
TRANSCRIPT
Exploring London NoSQL meetups using R
Mark Needham (@markhneedham)
Scraper at the ready...
Not needed :(
Lots of bits of data
● Events
● Members
● Groups
● RSVPs
● Venues
● Topics
The data model
Interesting questions to ask...
Interesting questions to ask...
● What day of the week do people go to meetups?
● Whereabouts in London are NoSQL meetups held?
● Do people sign up for multiple meetups on the same day?
● Are there common members between groups?
● What topics are people most interested in?
● In which order do people join the NoSQL groups?
● Who are the most connected people on the NoSQL scene?
The tool set
RNeo4j
Results as a data frame
Query
dplyr
ggplot2
igraph
ggmap
cluster
geosphere
When do people go to meetups?
When do people go to meetups?
(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-()
// When do people go to meetups?
// Counts the 'yes' RSVPs for each event hosted by a group.
MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
      ({response: 'yes'})<-[:RSVPD]-()
// Keep only events whose offset-adjusted time is earlier than now.
WHERE (event.time + event.utc_offset) < timestamp()
RETURN g.name,
       event.time + event.utc_offset AS eventTime,
       event.announced_at AS announcedAt,
       event.name,
       COUNT(*) AS rsvps
# R Neo4j
# One-off setup: RNeo4j is installed from GitHub through devtools.
install.packages("devtools")
devtools::install_github("nicolewhite/Rneo4j")

library(RNeo4j)

# Connect to the REST endpoint of a Neo4j server on localhost.
graph <- startGraph("http://localhost:7474/db/data/")

# Placeholder query text; cypher() sends it to the graph.
query <- "MATCH … RETURN …"
cypher(graph, query)
# Grouping events by month
library(dplyr)

# Per month: number of events, total and largest RSVP count, and the mean
# RSVPs per event (ave); months with the highest mean come first.
events %>%
  group_by(month) %>%
  summarise(
    events = n(),
    count = sum(rsvps),
    max = max(rsvps)
  ) %>%
  mutate(ave = count / events) %>%
  arrange(desc(ave))
Grouping events by month
## month events count ave
## 1 November 55 3018 54.87273
## 2 May 52 2676 51.46154
## 3 April 58 2964 51.10345
## 4 June 47 2384 50.72340
## 5 October 71 3566 50.22535
## 6 September 59 2860 48.47458
## 7 February 43 2047 47.60465
## 8 January 34 1592 46.82353
## 9 December 24 1056 44.00000
## 10 March 39 1667 42.74359
## 11 July 48 1866 38.87500
## 12 August 34 1023 30.08824
# Grouping events by day
# Per day of the week: number of events, total and largest RSVP count, and
# the mean RSVPs per event (ave), ordered by day.
# The result is bound to byDay because the bar charts further down read
# their data from that name.
byDay <- events %>%
  group_by(day) %>%
  summarise(
    events = n(),
    count = sum(rsvps),
    max = max(rsvps)
  ) %>%
  mutate(ave = count / events) %>%
  arrange(day)

# Print the summary, as the bare pipeline did.
byDay
Grouping events by day
## day events count ave
## 1 Monday 63 4034 64.03175
## 2 Tuesday 151 6696 44.34437
## 3 Wednesday 225 9481 42.13778
## 4 Thursday 104 5394 51.86538
## 5 Friday 11 378 34.36364
## 6 Saturday 10 736 73.60000
# Some simple bar charts
library(ggplot2)
library(gridExtra) # provides grid.arrange(), used below

# Mean RSVPs per event for each day.
g1 <- ggplot(byDay, aes(x = day, y = ave)) +
  geom_bar(stat = "identity", fill = "dark blue") +
  ggtitle("Average attendees by day")

# Total RSVPs for each day.
g2 <- ggplot(byDay, aes(x = day, y = count)) +
  geom_bar(stat = "identity", fill = "dark blue") +
  ggtitle("Total attendees by day")

# Stack the two charts in a single column.
grid.arrange(g1, g2, ncol = 1)
London hits the pub
Where do people go to meetups?
(g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
({response: 'yes'})<-[:RSVPD]-(),
(event)-[:HELD_AT]->(venue)
// Where do people go to meetups?
// Counts the 'yes' RSVPs for each event and adds the venue it was held at.
MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
      ({response: 'yes'})<-[:RSVPD]-(), (event)-[:HELD_AT]->(venue)
// Keep only events whose offset-adjusted time is earlier than now.
WHERE (event.time + event.utc_offset) < timestamp()
RETURN g.name,
       event.time + event.utc_offset AS eventTime,
       event.announced_at AS announcedAt,
       event.name,
       venue.name AS venue,
       venue.lat AS lat,
       venue.lon AS lon,
       COUNT(*) AS rsvps
// Where do people go to meetups?
// Same query as on the previous slide: 'yes' RSVPs per event, with venue.
MATCH (g:Group)-[:HOSTED_EVENT]->(event)<-[:TO]-
      ({response: 'yes'})<-[:RSVPD]-(), (event)-[:HELD_AT]->(venue)
// Keep only events whose offset-adjusted time is earlier than now.
WHERE (event.time + event.utc_offset) < timestamp()
RETURN g.name,
       event.time + event.utc_offset AS eventTime,
       event.announced_at AS announcedAt,
       event.name,
       venue.name AS venue,
       venue.lat AS lat,
       venue.lon AS lon,
       COUNT(*) AS rsvps
# Where do people go to meetups?
# Number of events per venue (keyed by lat, lon and name), most-used venues
# first; the tally column n is renamed to count.
byVenue <- events %>%
  count(lat, lon, venue) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  rename(count = n)
Where do people go to meetups?
## lat lon venue count
## 1 51.50256 -0.019379 Skyline Bar at CCT Venues Plus 1
## 2 51.53373 -0.122340 The Guardian 1
## 3 51.51289 -0.067163 Erlang Solutions 3
## 4 51.49146 -0.219424 Novotel - W6 8DR 1
## 5 51.49311 -0.146531 Google HQ 1
## 6 51.52655 -0.084219 Look Mum No Hands! 22
## 7 51.51976 -0.097270 Vibrant Media, 3rd Floor 1
## 8 51.52303 -0.085178 Mind Candy HQ 2
## 9 51.51786 -0.109260 ThoughtWorks UK Office 2
## 10 51.51575 -0.097978 BT Centre 1
# Where do people go to meetups?
library(ggmap)

# Base map for London at zoom level 12.
map <- get_map(location = "London", zoom = 12)

# One red point per venue, sized by the number of events held there.
ggmap(map) +
  geom_point(
    aes(x = lon, y = lat, size = count),
    data = byVenue,
    col = "red",
    alpha = 0.8
  )
library(geosphere)
library(cluster)

# Number of clusters the hierarchy is cut into.
clusteramounts <- 40

# Pairwise distances between venues, from their (lon, lat) columns.
distance.matrix <- byVenue %>%
  select(lon, lat) %>%
  distm()

# Agglomerative clustering on the precomputed distances. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
clustersx <- as.hclust(agnes(distance.matrix, diss = TRUE))

# Label every venue with the cluster it falls into.
byVenue$group <- cutree(clustersx, k = clusteramounts)

# Per cluster: mean position, total event count and the venue names joined
# with commas.
byVenueClustered <- byVenue %>%
  group_by(group) %>%
  summarise(
    meanLat = mean(lat),
    meanLon = mean(lon),
    total = sum(count),
    venues = paste(venue, collapse = ",")
  )
Spatial clustering
## group meanLat meanLon total
## 1 3 51.52349 -0.08506461 123
## 2 1 51.52443 -0.09919280 89
## 3 2 51.50547 -0.10325925 62
## 4 4 51.50794 -0.12714600 55
## 5 8 51.51671 -0.10028908 19
## 6 6 51.53655 -0.13798514 18
## 7 7 51.52159 -0.10934720 18
## 8 5 51.51155 -0.07004417 13
## 9 12 51.51459 -0.12314650 13
## 10 14 51.52129 -0.07588867 10
Spatial clustering
# One red point per cluster at its mean position, sized by the cluster total.
ggmap(map) +
  geom_point(
    data = byVenueClustered,
    mapping = aes(x = meanLon, y = meanLat, size = total),
    colour = "red",
    alpha = 0.8
  )
Spatial clustering
# Venues belonging to the cluster in the first row of byVenueClustered.
# NOTE(review): presumably byVenueClustered is sorted by total (descending)
# before this runs, so row 1 is the busiest cluster — confirm.
filter(byVenue, group == byVenueClustered$group[1])
What’s going on in Shoreditch?
Meetup Group Member Overlap
● Why would we want to know this?
  ○ Perhaps for joint meetups
  ○ Topics for future meetups
// Extracting the data
// For every ordered pair of distinct groups, count the members they share.
MATCH (group1:Group), (group2:Group)
WHERE group1 <> group2
// OPTIONAL so that pairs with no shared member still produce a row.
OPTIONAL MATCH p = (group1)<-[:MEMBER_OF]-()-[:MEMBER_OF]->(group2)
WITH group1, group2, COLLECT(p) AS paths
RETURN group1.name, group2.name,
       LENGTH(paths) as commonMembers
ORDER BY group1.name, group2.name
// For every ordered pair of distinct groups, the share of group1's members
// who are also members of group2, as a whole-number percentage.
MATCH (group1:Group), (group2:Group)
WHERE group1 <> group2
OPTIONAL MATCH (group1)<-[:MEMBER_OF]-(member)
WITH group1, group2, COLLECT(member) AS group1Members
WITH group1, group2, group1Members, LENGTH(group1Members) AS numberOfGroup1Members
// Go back to one row per group1 member and test membership of group2.
UNWIND group1Members AS member
OPTIONAL MATCH path = (member)-[:MEMBER_OF]->(group2)
WITH group1, group2, COLLECT(path) AS paths, numberOfGroup1Members
WITH group1, group2, LENGTH(paths) as commonMembers, numberOfGroup1Members
RETURN group1.name, group2.name,
       toInt(round(100.0 * commonMembers / numberOfGroup1Members)) AS percentage
// Secondary sort key is group2.name; the original repeated group1.name.
ORDER BY group1.name, group2.name
Finding overlap as a percentage
// How many groups are people part of?
// One row per profile with the number of MEMBER_OF relationships it has.
MATCH (p:MeetupProfile)-[:MEMBER_OF]->()
RETURN ID(p), COUNT(*) AS groups
ORDER BY groups DESC
# How many groups are people part of?
# Bar chart of how many profiles share each value of groups, drawn on a
# square-root y axis with an x-axis break at every whole number.
ggplot(
  aes(x = groups, y = n),
  data = group_count %>% count(groups)
) +
  geom_bar(stat = "identity", fill = "dark blue") +
  scale_y_sqrt() +
  scale_x_continuous(
    breaks = round(
      seq(min(group_count$groups), max(group_count$groups), by = 1),
      1
    )
  ) +
  ggtitle("Number of groups people are members of")
Who’s the most connected?
● i.e. the person who had the chance to meet the most people in the community
● Betweenness Centrality
● Page Rank
Who’s the most connected?
Betweenness Centrality
Calculates the number of shortest paths that go through a particular node
# Betweenness Centrality
library(igraph)

# Vertices: every profile with at least one 'yes' RSVP to an event.
nodes_query <- "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event)
RETURN DISTINCT ID(p) AS id, p.id AS name, p.name AS fullName"
nodes <- cypher(graph, nodes_query)

# Edges: pairs of profiles with 'yes' RSVPs to the same event; weight is the
# number of matching rows for that pair.
edges_query <- "MATCH (p:MeetupProfile)-[:RSVPD]->({response: 'yes'})-[:TO]->(event),
(event)<-[:TO]-({response:'yes'})<-[:RSVPD]-(other)
RETURN ID(p) AS source, ID(other) AS target, COUNT(*) AS weight"
edges <- cypher(graph, edges_query)

# Build a directed graph; vertices is named explicitly and TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
g <- graph.data.frame(edges, directed = TRUE, vertices = nodes)

# Betweenness score per vertex, as a data frame keyed by vertex name.
bwGraph <- betweenness(g)
bwDf <- data.frame(id = names(bwGraph), score = bwGraph)
# Betweenness Centrality
# The five highest betweenness scores.
bwDf %>%
  arrange(desc(score)) %>%
  head(5)

# The same ranking joined to the node details (nodes$name matches bwDf$id).
merge(nodes, bwDf, by.x = "name", by.y = "id") %>%
  arrange(desc(score)) %>%
  head(5)
Page Rank
PageRank works by counting the number and quality of links to a page to determine a rough estimate of how important the website is. The underlying assumption is that more important websites are likely to receive more links from other websites.
Page Rank
PageRank works by counting the number and quality of links to a person to determine a rough estimate of how important the person is. The underlying assumption is that more important people are likely to receive more links from other people.
# Page Rank
# PageRank score per vertex, as a data frame keyed by vertex name.
pr <- page.rank(g)$vector
prDf <- data.frame(name = names(pr), rank = pr)

# Ten highest-ranked profiles, joined to the node details.
data.frame(merge(nodes, prDf, by.x = "name", by.y = "name")) %>%
  arrange(desc(rank)) %>%
  head(10)
# Blending back into the graph
# Store each betweenness score on the MeetupProfile node with the same id.
query <- "MATCH (p:MeetupProfile {id: {id}}) SET p.betweenness = {score}"

tx <- newTransaction(graph)
# seq_len() runs zero iterations for an empty bwDf; 1:nrow(bwDf) would run
# for i = 1 and i = 0.
for (i in seq_len(nrow(bwDf))) {
  # Commit and open a new transaction each time i reaches a multiple of 1000.
  if (i %% 1000 == 0) {
    commit(tx)
    print(paste("Batch", i / 1000, "committed."))
    tx <- newTransaction(graph)
  }
  id <- bwDf[i, "id"]
  score <- bwDf[i, "score"]
  appendCypher(tx, query, id = id, score = as.double(score))
}
# Commit the statements appended since the last batch boundary.
commit(tx)
# Blending back into the graph
# Store each PageRank score on the MeetupProfile node with the same id.
query <- "MATCH (p:MeetupProfile {id: {id}}) SET p.pageRank = {score}"

tx <- newTransaction(graph)
# seq_len() runs zero iterations for an empty prDf; 1:nrow(prDf) would run
# for i = 1 and i = 0.
for (i in seq_len(nrow(prDf))) {
  # Commit and open a new transaction each time i reaches a multiple of 1000.
  if (i %% 1000 == 0) {
    commit(tx)
    print(paste("Batch", i / 1000, "committed."))
    tx <- newTransaction(graph)
  }
  name <- prDf[i, "name"]
  rank <- prDf[i, "rank"]
  appendCypher(tx, query, id = name, score = as.double(rank))
}
# Commit the statements appended since the last batch boundary.
commit(tx)
Are they in the Neo4j group?
// The 20 profiles with the highest pageRank, each flagged with whether it
// has a MEMBER_OF relationship to the Neo4j London group.
MATCH (p:MeetupProfile)
WITH p
ORDER BY p.pageRank DESC
LIMIT 20
OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group)
// The group node is bound as g; no variable called `group` exists here.
WHERE g.name = "Neo4j - London User Group"
RETURN p.name, p.id, p.pageRank, NOT m is null AS isMember
ORDER BY p.pageRank DESC
# Are they in the Neo4j group?
# NOTE(review): query is presumably the Cypher text shown on the preceding
# slide — confirm.
blended_data <- cypher(graph, query)
Have they been to any events?
// Have they been to any events?
// The 20 profiles with the highest pageRank, with Neo4j London group
// membership and the number of that group's events they RSVP'd 'yes' to.
MATCH (p:MeetupProfile)
WITH p
ORDER BY p.pageRank DESC
LIMIT 20
OPTIONAL MATCH member = (p)-[m:MEMBER_OF]->(g:Group) WHERE g.name = "Neo4j - London User Group"
WITH p, NOT m is null AS isMember, g
// RSVPD is written with its direction, as in the other queries in this deck.
OPTIONAL MATCH event= (p)-[:RSVPD]->({response:'yes'})-[:TO]->()<-[:HOSTED_EVENT]-(g)
WITH p, isMember, COLLECT(event) as events
RETURN p.name, p.id, p.pageRank, isMember, LENGTH(events) AS events
ORDER BY p.pageRank DESC
# Have they been to any events?
# NOTE(review): query is presumably the Cypher text shown on the preceding
# slide — confirm.
blended_data <- cypher(graph, query)
Take Aways
● ggplot => visualisations with minimal code
● dplyr => easy data manipulation for people from other languages
● igraph => find the influencers in a network
● graphs => flexible way of modelling data that allows querying across multiple dimensions
And one final take away...
http://github.com/mneedham/neo4j-meetup
Get the code