-
Notifications
You must be signed in to change notification settings - Fork 4
/
makeHTnetwork.py
159 lines (113 loc) · 6.63 KB
/
makeHTnetwork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Author: Alexandre Bovet <alexandre.bovet@gmail.com>
# License: BSD 3 clause
import time
import graph_tool.all as gt
import sqlite3
import numpy as np
import pandas as pd
from itertools import combinations
from collections import Counter
from baseModule import baseModule
class makeHTNetwork(baseModule):
    """ Builds the network of hashtag co-occurrences.

        Must be initialized with a dictionary `job` containing keys
        `sqlite_db_filename` and `graph_file`.

        Reads all the co-occurrences from the SQLite database and builds the
        network where nodes are hashtags and edges are co-occurrences.

        The graph is a graph-tool object and is saved in graphml format to
        `graph_file`.

        Nodes of the graph have two properties: `counts` is the number of
        single occurrences of the hashtag and `names` is the name of the
        hashtag.

        Edges have a property `weights` equal to the number of co-occurrences
        they represent.

        The graph has the following properties saved with it:
            - `Ntweets`: number of tweets with at least one hashtag used to
              build the graph.
            - `start_date` : lower bound (inclusive) of the selected time
              range, or `None`.
            - `stop_date` : upper bound (exclusive) of the selected time
              range, or `None`.
            - `weight_threshold` : co-occurrence threshold. Edges with less
              than `weight_threshold` co-occurrences are discarded.

        *Optional parameters that can be added to `job`:*

        :start_date: and
        :stop_date: to specify a time range for the tweets. Either bound may
                    be given on its own to select an open-ended range.
                    (Default is `None`, i.e. select all the tweets in the
                    database).
        :weight_threshold: is the minimum number of co-occurrences between
                           two hashtags for the edge to be included in the
                           graph. (Default is 3).
    """

    @staticmethod
    def _read_hashtag_tweets(sqlite_file, start_date=None, stop_date=None):
        """ Return a DataFrame with columns `hashtag` and `tweet_id`.

            Rows come from the `hashtag_tweet_user` table.  When `start_date`
            and/or `stop_date` is given, only tweets whose `datetime_EST` in
            the `tweet` table satisfies `start_date <= datetime_EST < stop_date`
            are kept (a missing bound leaves that side open).
        """
        query = "SELECT hashtag, tweet_id FROM hashtag_tweet_user"

        # Only fixed SQL fragments are concatenated; the dates themselves are
        # always passed as bound parameters.
        conditions = []
        params = []
        if start_date is not None:
            conditions.append("datetime_EST >= ?")
            params.append(start_date)
        if stop_date is not None:
            conditions.append("datetime_EST < ?")
            params.append(stop_date)

        if conditions:
            query += (" WHERE tweet_id IN ("
                      "SELECT tweet_id FROM tweet WHERE "
                      + " AND ".join(conditions) + ")")

        conn = sqlite3.connect(
            sqlite_file,
            detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
        try:
            return pd.read_sql_query(query, conn,
                                     params=tuple(params) if params else None)
        finally:
            # `with sqlite3.connect(...)` only manages the transaction; the
            # connection has to be closed explicitly.
            conn.close()

    @staticmethod
    def _count_hashtag_pairs(df):
        """ Return a Counter mapping `(hashtag_a, hashtag_b)` to the number of
            tweets of `df` in which both hashtags appear.

            Hashtags of a tweet are sorted before pairing so that a given pair
            is always counted under the same key.
        """
        # NOTE(review): if a tweet_id can list the same hashtag more than once
        # this produces (ht, ht) pairs, i.e. self-loops -- confirm against the
        # table schema.
        pair_counts = Counter()
        for _, tags in df.groupby('tweet_id')['hashtag']:
            if len(tags) > 1:
                # stream the pairs into the counter instead of first storing
                # every single pair in one big list
                pair_counts.update(combinations(sorted(tags), 2))
        return pair_counts

    def run(self):
        """ Build the hashtag co-occurrence graph and save it to disk.

            Reads its parameters from `self.job` (see the class docstring),
            stores the resulting graph in `self.G` and writes it to
            `job['graph_file']` in graphml format.  If no pair of hashtags
            reaches `weight_threshold`, an empty graph carrying the same
            properties is saved.
        """

        #==============================================================================
        # PARAMETERS
        #==============================================================================
        # filename of the existing sqlite file
        sqlite_file = self.job['sqlite_db_filename']
        # filename of the graph to be saved
        graph_file = self.job['graph_file']

        #==============================================================================
        # OPTIONAL PARAMETERS
        #==============================================================================
        # optionally provide start and stop dates
        start_date = self.job.get('start_date', None)
        stop_date = self.job.get('stop_date', None)
        # remove edges with less than weight_threshold counts
        weight_threshold = self.job.get('weight_threshold', 3)

        df = self._read_hashtag_tweets(sqlite_file, start_date, stop_date)

        print('creating edge list')
        # print_elapsed_time is not defined in this class; presumably it is
        # inherited from baseModule.
        t0 = time.time()

        ht_pair_count = self._count_hashtag_pairs(df)

        edge_rows = [(ht1, ht2, w) for (ht1, ht2), w in ht_pair_count.items()
                     if w >= weight_threshold]
        # mixing names and counts makes this a 2D array of strings
        edges_list_weights = np.array(edge_rows)

        self.print_elapsed_time(t0)

        print('creating graph')
        t0 = time.time()

        self.G = gt.Graph(directed=False)
        e_weights = self.G.new_edge_property('int')

        if edge_rows:
            self.G.vp['names'] = self.G.add_edge_list(edges_list_weights,
                                                      hashed=True,
                                                      string_vals=True,
                                                      eprops=e_weights)
            # the weights sit in the third column as text; assigning them to
            # the int-valued property array relies on numpy converting them
            e_weights.a = edges_list_weights[:, 2]
        else:
            # No pair reached the threshold: np.array([]) is 1D, so the column
            # indexing above would raise.  Keep an empty graph that still
            # carries every expected property.
            self.G.vp['names'] = self.G.new_vertex_property('string')

        self.G.ep['weights'] = e_weights

        self.G.graph_properties['Ntweets'] = self.G.new_graph_property('int')
        self.G.graph_properties['Ntweets'] = df.tweet_id.unique().size

        self.G.graph_properties['start_date'] = self.G.new_graph_property('object')
        self.G.graph_properties['start_date'] = start_date

        self.G.graph_properties['stop_date'] = self.G.new_graph_property('object')
        self.G.graph_properties['stop_date'] = stop_date

        self.G.graph_properties['weight_threshold'] = self.G.new_graph_property('int')
        self.G.graph_properties['weight_threshold'] = weight_threshold

        self.print_elapsed_time(t0)

        #% ht counts
        # number of rows (occurrences) per hashtag, most frequent first
        count_group = df.groupby('hashtag')
        df_ht_counts = count_group.aggregate('count')
        df_ht_counts.sort_values('tweet_id', ascending=False, inplace=True)
        df_ht_counts.rename(columns={'tweet_id': 'count'}, inplace=True)
        df_ht_counts['id'] = np.arange(0, df_ht_counts.index.size)
        df_ht_counts['hashtag'] = df_ht_counts.index
        df_ht_counts = df_ht_counts[['id', 'hashtag', 'count']]

        print(df_ht_counts.columns)
        print(df_ht_counts)

        # add counts to Graph vertex
        v_counts = self.G.new_vertex_property('int', val=0)

        if self.G.num_vertices() > 0:
            ht_names = self.G.vp.names.get_2d_array([0])
            ht_counts_names = np.array(df_ht_counts.hashtag.tolist())
            # look up, for every vertex name, the row of df_ht_counts holding
            # the same hashtag (binary search through the sorted order)
            sorter = np.argsort(ht_counts_names)
            rows = sorter[np.searchsorted(ht_counts_names, ht_names,
                                          sorter=sorter)].flatten()
            v_counts.a = df_ht_counts.iloc[rows]['count']

        self.G.vp['counts'] = v_counts

        # save graph file
        self.G.save(graph_file, fmt='graphml')

        print('\nNumber of nodes: ' + str(self.G.num_vertices()))
        print('Number of edges: ' + str(self.G.num_edges()))