dataset_handler.py
#!/usr/bin/env python
import json
import random
import numpy as np
import re # For preprocessing
from pickle import dump
from pickle import load as pkload
import pandas as pd # For data handling
import spacy
import constants as c
import word_embedding_model as emb
URL_REGEX = r"@\w*|https?:?\/?\/?[\w.\/]*|https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
not_lemmatize = []
REMOVE_WORDS = ['rt', 'ht', 'htt', 'https', 'http', 'https t']
INPUT_PATH = c.INPUT_PATH
MINCOUNT = c.MINCOUNT
TRAIN_TEST_INPUT = c.TRAIN_TEST_INPUT
GUSE_PATH = c.GUSE_PATH
def cleaning(words_doc):
    """
    Remove stop words and single-character tokens from a spaCy Doc, lemmatise the
    remaining tokens and re-attach the '#' marker to hashtag tokens.
    :param words_doc: spaCy Doc.
    :return: cleaned text as a single string.
    """
    txt = []
is_hashtag = False
for token in words_doc:
if token.text == "#":
is_hashtag = True
elif not (token.is_stop or (len(token.text) < 2 and token.text not in not_lemmatize)):
if is_hashtag:
txt.append('#' + token.text)
is_hashtag = False
else:
txt.append(token.text if token.text in not_lemmatize else token.lemma_)
return ' '.join(txt)
def get_tweet_corpus(state=None, no_retweet=True):
    """
    Read the tweet files under INPUT_PATH into a pandas DataFrame of tokenised tweets.
    :param state: optional single input file name; when None, all files in the input folder are read.
    :param no_retweet: when True, retweets are skipped.
    :return: DataFrame with a 'tweets' column holding one list of words per tweet.
    """
    import os
    corpus_tweets = []
print("Read files")
input_path = INPUT_PATH
files = os.listdir(input_path)
if state is not None:
files = [state]
for file_path in files:
print(" - file: " + file_path)
with open(input_path + file_path, 'r', encoding="utf8") as input_file:
i = 0
for line in input_file:
if i % 5000 == 0:
print(str(i) + " processed")
i = i + 1
line = line.strip()
tweet = json.loads(line)
isRetweet = tweet["isRetweet"]
text = tweet["text"]
                # skip tweets without text and, optionally, retweets (they add nothing to the embedding phase)
if len(text) == 0 or (isRetweet and no_retweet):
continue
words = text.split()
corpus_tweets.append(words)
return pd.DataFrame({'tweets': corpus_tweets})
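
# A minimal usage sketch for get_tweet_corpus. The exact JSON schema is an
# assumption inferred from the fields read above: one JSON object per line,
# containing at least "text" and "isRetweet" (the file name is illustrative).
#
#   {"text": "Great rally tonight #election", "isRetweet": false}
#
#   df = get_tweet_corpus(state="some_state.json", no_retweet=True)
#   df['tweets'].iloc[0]   # -> ['Great', 'rally', 'tonight', '#election']
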
def clean_and_phrase(corpus_tweets):
    """
    Clean tweets (URL removal, lowercasing, stop-word removal, lemmatisation) for the Word2Vec model.
    :param corpus_tweets: iterable of tweets, each represented as a list of words.
    :return: list of cleaned tweet strings.
    """
    # Remove URLs and non-word characters (hashtag '#', digits and underscores are kept):
corpus_cleaned = []
for tweet in corpus_tweets:
tweet_cleaned = []
tweet = re.sub(URL_REGEX, '', ' '.join(tweet)).lower().strip()
tweet = re.sub('[^#\\d\\w_]+', ' ', tweet).strip()
tweet = tweet.split()
for word in tweet:
word_cleaned = word
for r in REMOVE_WORDS:
if r == word_cleaned:
word_cleaned = ''
if word_cleaned != '':
tweet_cleaned.append(word_cleaned)
corpus_cleaned.append(' '.join(tweet_cleaned))
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
    # Taking advantage of spaCy's .pipe() method to speed up the cleaning process
    # (note: n_threads is deprecated in recent spaCy releases; use n_process instead):
    txt = [cleaning(words_doc) for words_doc in nlp.pipe(corpus_cleaned, batch_size=5000, n_threads=-1)]
return txt
def store(corpus, file):
    """
    Store a corpus (list of lists of words) as a text file, one sentence per line.
    :param corpus: corpus represented as a list of lists of words.
    :param file: output file path.
    """
with open(file, 'w', encoding="utf8") as stream_out:
for line in corpus:
for word in line:
stream_out.write(str(word) + ' ')
stream_out.write('\n')
def load(file):
    """
    Load a text file into a corpus (list of lists of words).
    :param file: input file path.
    :return: corpus represented as a list of lists of words.
    """
sentences = []
with open(file, "r", encoding="utf8") as stream_in:
for line in stream_in:
sentences.append(line.split())
return sentences
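
# Round-trip sketch for store()/load(): storing a corpus and loading it back
# yields the same list of lists (the file name below is illustrative only).
#
#   corpus = [['hello', '#world'], ['another', 'tweet']]
#   store(corpus, 'roundtrip_example.txt')
#   assert load('roundtrip_example.txt') == corpus
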
def preprocess_data(save_file="sentences.txt"):
    """
    Read the raw tweets, clean them and store the cleaned sentences in save_file
    for the Word2Vec model.
    :param save_file: output file path.
    """
corpus_dataframe = get_tweet_corpus(no_retweet=False)
tweet_corpus = corpus_dataframe['tweets']
sentences = clean_and_phrase(tweet_corpus)
with open(save_file, 'w', encoding="utf8") as stream_out:
for line in sentences:
stream_out.write(line+'\n')
def preprocess_data_for_sentence_embedding(file=None):
    """
    Clean sentences and store them in TRAIN_TEST_INPUT for the Google Universal Sentence Encoder.
    :param file: optional single input file name; when None, the whole input folder is used.
    """
corpus_dataframe = get_tweet_corpus(file)
tweet_corpus = corpus_dataframe['tweets']
corpus_cleaned = []
for tweet in tweet_corpus:
tweet_cleaned = []
tweet = re.sub(URL_REGEX, '', ' '.join(tweet)).strip()
tweet = re.sub(r'[^\x00-\x7f]', r' ', tweet).lower().strip()
tweet = tweet.split()
for word in tweet:
word_cleaned = word
# word_cleaned = re.sub("[^#*\d*\w+_*]+", ' ', url_removal).strip()
for r in REMOVE_WORDS:
if r == word_cleaned:
word_cleaned = ''
if word_cleaned != '':
tweet_cleaned.append(word_cleaned)
# print(' '.join(tweet_cleaned))
corpus_cleaned.append(tweet_cleaned)
store(corpus_cleaned, TRAIN_TEST_INPUT)
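
# Usage sketch for the sentence-embedding pre-processing step: the cleaned
# corpus always ends up in TRAIN_TEST_INPUT (the file name passed in is an
# illustrative example, not a file from the repository).
#
#   preprocess_data_for_sentence_embedding()                 # whole INPUT_PATH folder
#   preprocess_data_for_sentence_embedding("one_file.json")  # a single input file
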
# ---------------------------------------------------------------------------------------------------------
def generate_sample(perc):
    """
    Store a random sample of the train/test corpus, keeping each sentence with probability perc.
    :param perc: sampling probability (float in [0, 1]).
    """
    sents = load(c.TRAIN_TEST_INPUT)
    with open("sample_" + str(perc), 'w', encoding="utf8") as out:
        for s in sents:
            if random.random() < perc:
                for word in s:
                    out.write(word + ' ')
                out.write('\n')
def load_without_not_relevant_hts(input):
    """
    Load the corpus from a file, dropping hashtags that do not occur more than MINCOUNT times.
    :param input: input file path.
    :return: corpus represented as a list of lists of words.
    """
    # first pass: count hashtag occurrences
    counter = dict()
with open(input, 'r', encoding="utf8") as in_stream:
for line in in_stream:
l = line.split()
for w in l:
if '#' in w:
counter[w] = counter.get(w, 0) + 1
    # second pass: rebuild the corpus, keeping only the frequent hashtags
    result = []
with open(input, 'r', encoding="utf8") as in_stream:
for line in in_stream:
l = line.split()
res_line = []
for w in l:
if '#' in w:
if counter[w] > MINCOUNT:
res_line.append(w)
else:
res_line.append(w)
if len(res_line) > 0:
result.append(res_line)
return result
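
# Filtering sketch for load_without_not_relevant_hts, assuming MINCOUNT = 2
# (an illustrative value): rare hashtags are dropped, while ordinary words and
# frequent hashtags are kept.
#
#   "nice day #rareTag"   (hashtag seen once)      -> ['nice', 'day']
#   "vote now #election"  (hashtag seen 50 times)  -> ['vote', 'now', '#election']
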
def prepare_train_test(perc_test):
    """
    Split the corpus into train and test sets and store them.
    :param perc_test: fraction of the corpus reserved for the test set (float in [0, 1]).
    """
corpus_with_bigrams = load_without_not_relevant_hts(TRAIN_TEST_INPUT) # emb.load(TRAIN_TEST_INPUT) #
random.shuffle(corpus_with_bigrams)
cleaned_corpus_with_bigrams = []
# keeping tweets with at least one hashtag and one word
for tweet in corpus_with_bigrams:
bool_h = False
bool_w = False
for w in tweet:
if w == 'rt':
break # each retweet starts with 'rt'
if '#' in w:
bool_h = True
else:
bool_w = True
if bool_w and bool_h:
cleaned_corpus_with_bigrams.append(tweet)
store(cleaned_corpus_with_bigrams[int(len(cleaned_corpus_with_bigrams) * perc_test):], c.TRAIN_CORPUS)
store(cleaned_corpus_with_bigrams[:int(len(cleaned_corpus_with_bigrams) * perc_test)], c.TEST_CORPUS)
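
# Typical train/test preparation; 0.2 is an illustrative split, not a value
# taken from the repository configuration.
#
#   prepare_train_test(perc_test=0.2)
#   train = load(c.TRAIN_CORPUS)   # 80% of the filtered corpus
#   test = load(c.TEST_CORPUS)     # 20% of the filtered corpus
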
def hashtags_list(tweet, model):
    """
    Extract from a tweet the hashtags that are present in the Word2Vec vocabulary.
    :param tweet: tweet represented as a list of words.
    :param model: Word2Vec model.
    :return: list of cleaned hashtags.
    """
    ht_list = []
for w in tweet:
word_cleaned = re.sub('[^#\\d\\w_]+', ' ', w).lower().strip()
for word in word_cleaned.split():
if word[0] == '#':
word_cleaned = word
break
for r in emb.REMOVE_WORDS:
if r == word_cleaned:
word_cleaned = ''
if len(word_cleaned) > 1 and '#' in word_cleaned and word_cleaned in model.wv.vocab:
ht_list.append(word_cleaned)
return ht_list
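
# Extraction sketch for hashtags_list: only hashtags that survive cleaning and
# exist in the Word2Vec vocabulary are returned (the vocabulary content below
# is an assumption).
#
#   hashtags_list(['Vote', 'now!', '#Election2020'], w2v)
#   # -> ['#election2020']   if '#election2020' is in w2v.wv.vocab, else []
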
def count_words(tweets):
    """
    Count word occurrences over a corpus of tweets (strings or lists of words).
    """
    counter = dict()
    for line in tweets:
        l = line.split() if isinstance(line, str) else line
        for w in l:
            counter[w] = counter.get(w, 0) + 1
    return counter
def remove_hashtags_from_sentences(tweets, hts, populate_dictionary=True):
    """
    Turn hashtags into plain input words: when c.SKIP_HASHTAG_REMOVING is True every hashtag
    simply loses its '#' marker; otherwise a hashtag is kept (without '#') only if the plain
    word is frequent enough in the corpus, and dropped otherwise.
    :param tweets: corpus of tweets.
    :param hts: per-tweet targets, kept aligned with the returned tweets.
    :param populate_dictionary: when True, rebuild and persist the word-frequency dictionary.
    :return: (tweets as strings, aligned targets).
    """
    if c.SKIP_HASHTAG_REMOVING:
result_tweets = []
for tweet in tweets:
tweet_string = ""
for w in tweet:
if w[0] == '#':
tweet_string = tweet_string + " " + w[1:]
else:
tweet_string = tweet_string + " " + w
result_tweets.append(tweet_string.strip())
return result_tweets, hts
if populate_dictionary:
counter = count_words(tweets)
dump(counter, open(c.H_REMOVING_DICT, 'wb'))
else:
counter = pkload(open(c.H_REMOVING_DICT, 'rb'))
result_tweets = []
result_hts = []
    for tweet, ht_list in zip(tweets, hts):
        norm_tweet = []
        # tweets may arrive either as strings or as lists of words
        if isinstance(tweet, str):
            tweet = tweet.split()
        for word in tweet:
            if word[0] == '#':
                no_ht_word = word[1:]
                if counter.get(no_ht_word, 0) > 2:  # hard-coded mincount
norm_tweet.append(no_ht_word)
else:
norm_tweet.append(word)
if len(norm_tweet) > 0:
result_tweets.append(" ".join(norm_tweet))
result_hts.append(ht_list)
return result_tweets, result_hts
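
# Behaviour sketch, assuming c.SKIP_HASHTAG_REMOVING is True: hashtags simply
# lose their '#' marker and the targets are passed through unchanged.
#
#   cleaned, targets = remove_hashtags_from_sentences([['great', 'rally', '#election']],
#                                                     hts=[['#election']])
#   # cleaned -> ['great rally election'], targets -> [['#election']]
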
def prepare_model_inputs_and_targets(w_emb):
    """
    Prepare the train and test <X, Y> pairs for the neural network.
    :param w_emb: Word2Vec model.
    :return: (sentences_train, sentences_test, targets_train, targets_test, ht_lists)
    """
train = load(c.TRAIN_CORPUS)
test = load(c.TEST_CORPUS)
targets_train = []
sentences_train = []
targets_test = []
sentences_test = []
ht_lists = []
for tweet in train:
ht_list = hashtags_list(tweet, w_emb)
h_embedding = emb.tweet_arith_embedding(w_emb, " ".join(ht_list))
if h_embedding is not None:
targets_train.append(emb.np.array(h_embedding))
sentences_train.append(tweet)
for tweet in test:
ht_list = hashtags_list(tweet, w_emb)
h_embedding = emb.tweet_arith_embedding(w_emb, " ".join(ht_list))
if h_embedding is not None:
targets_test.append(h_embedding)
sentences_test.append(tweet)
ht_lists.append(ht_list)
sentences_train_len = len(sentences_train)
targets_train_len = len(targets_train)
sentences = sentences_train
sentences.extend(sentences_test)
targets = targets_train
targets.extend(targets_test)
sentences, targets = remove_hashtags_from_sentences(sentences, targets)
targets_train = np.array(targets[:targets_train_len])
targets_test = np.array(targets[targets_train_len:])
sentences_train = np.array(sentences[:sentences_train_len])
sentences_test = np.array(sentences[sentences_train_len:])
return sentences_train, sentences_test, targets_train, targets_test, ht_lists
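
# End-to-end driver sketch, not part of the original pipeline: it assumes the
# word embedding is a gensim Word2Vec model and uses a hypothetical model file
# name; the real paths and loading code live in word_embedding_model/constants.
if __name__ == "__main__":
    from gensim.models import Word2Vec

    preprocess_data_for_sentence_embedding()     # clean tweets into TRAIN_TEST_INPUT
    prepare_train_test(perc_test=0.2)            # split into c.TRAIN_CORPUS / c.TEST_CORPUS
    w2v_model = Word2Vec.load("word2vec.model")  # hypothetical file name
    x_train, x_test, y_train, y_test, ht_lists = prepare_model_inputs_and_targets(w2v_model)
    print(len(x_train), "train samples,", len(x_test), "test samples")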