-
Notifications
You must be signed in to change notification settings - Fork 1
/
script.py
144 lines (118 loc) · 4.48 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
<<<<<<< HEAD
import tweepy
import csv
import pandas as pd
import re
import numpy as np
import os
# Twitter API credentials are read from environment variables of the same
# names; os.getenv yields None for any variable that is not set.
consumer_key = os.getenv('consumer_key')
consumer_secret = os.getenv('consumer_secret')
access_token = os.getenv('access_token')
access_token_secret = os.getenv('access_token_secret')
# Build the OAuth handler from the consumer key pair, attach the access token
# pair, and create the API client used for the search below.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)
# The search term you want to find
query = "Andela Nigeria"
# Language code (follows ISO 639-1 standards)
language = "en"
# Number of tweets to pull
tweetCount = 20
# Run the search with the parameters above. This executes at module level,
# and it calls api.search (not user_timeline).
results = api.search(q=query, lang=language, count=tweetCount)
# util functions:
def clean_tweet(tweet_text):
    """Normalise the raw text of a tweet into plain space-separated words.

    Steps, in order: drop a leading "RT " marker, blank out links and
    @mentions, keep hashtag words without the '#', delete the punctuation
    characters . , - ! ?, collapse whitespace runs to a single space, and trim
    surrounding spaces and quote characters.

    Args:
        tweet_text: the tweet text as a string.

    Returns:
        The cleaned string. It never contains two consecutive spaces, so
        splitting it on ' ' yields no empty tokens (except for an empty
        result).
    """
    # Drop a retweet marker ("RT" plus one whitespace character) at the start.
    tweet_text = re.sub(r'^RT\s', '', tweet_text)
    # Replace links (www.* or http(s)://*) with a space.
    tweet_text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', tweet_text)
    # Replace @mentions with a space.
    tweet_text = re.sub(r'@\S+', ' ', tweet_text)
    # Keep the hashtag word but drop the leading '#'.
    tweet_text = re.sub(r'#(\S+)', r'\1', tweet_text)
    # Delete punctuation.
    tweet_text = re.sub(r'[.,\-!?]', '', tweet_text)
    # Collapse whitespace only after every removal above has run; deleting a
    # free-standing '-' or '!' would otherwise leave a double space behind.
    tweet_text = re.sub(r'\s+', ' ', tweet_text)
    # Trim spaces and quotes together so a quote that sits next to a removed
    # mention or link (and is therefore preceded by a space) is still removed.
    return tweet_text.strip(' \'"')
def get_vocabulary(tweet_list):
    """Return the distinct words used across all tweets, in sorted order.

    Args:
        tweet_list: iterable of tweet strings.

    Returns:
        A sorted list of the unique tokens obtained by splitting the tweets on
        single spaces. Sorting makes the word-to-column mapping reproducible
        from run to run; plain set iteration order is arbitrary.
    """
    # Tokenise with split(' ') (not split()) on purpose: tweet_to_matrix_row
    # splits tweets the same way, and every token it sees must be present here.
    joined_tweets = " ".join(tweet_list)
    return sorted(set(joined_tweets.split(' ')))
# start processing tweets
def process_tweets(results):
    """Clean every pulled tweet and wrap it in a single-key dict.

    Args:
        results: iterable of objects exposing the tweet body as ``.text``.

    Returns:
        A list with one ``{'text': <cleaned text>}`` dict per item of
        ``results``, in the same order.
    """
    return [{'text': clean_tweet(status.text)} for status in results]
# ---------------------------------------------------
# Load the 'text' column of data/tweets.csv as a plain Python list.
tweet_list = pd.read_csv('data/tweets.csv')['text'].tolist()
# Distinct space-separated tokens across all loaded tweets; read as a
# module-level global by tweet_to_matrix_row.
vocabulary = get_vocabulary(tweet_list)
# Debug output (Python 2 print statements).
print "tweet_list[0]: ", tweet_list[0]
print "vocabulary: ", vocabulary
def tweet_to_matrix_row(tweet):
    """Return the word-count row for one tweet.

    The row holds one integer per entry of the module-level ``vocabulary``
    list, in the same order; each integer is how many times that word occurs
    among the single-space-separated tokens of ``tweet``.

    Args:
        tweet: the tweet text as a string.

    Returns:
        A list of ints of length ``len(vocabulary)``.

    Raises:
        ValueError: if ``tweet`` contains a token that is not in
            ``vocabulary``.
    """
    # Tally the tokens first so the vocabulary is walked once per tweet
    # instead of once per token (list.index is a linear scan).
    pending = {}
    for word in tweet.split(' '):
        pending[word] = pending.get(word, 0) + 1
    # pop() credits a word's count to its first vocabulary position only,
    # which is the position list.index would have returned.
    matrix_row = [pending.pop(word, 0) for word in vocabulary]
    if pending:
        # Whatever is left was never matched, i.e. it is out of vocabulary.
        raise ValueError('words not in vocabulary: %r' % sorted(pending))
    return matrix_row
# Convert every loaded tweet into its word-count row.
# NOTE(review): the indexing on the print line assumes map() returns a list
# (Python 2 semantics) - confirm the target interpreter.
sentiment_matrix = map(tweet_to_matrix_row, tweet_list)
print "sentiment_matrix[0]: ", sentiment_matrix[0]
# Debugger hook, left disabled:
# import pdb; pdb.set_trace()
=======
import re, math
from collections import Counter
import numpy as np
# Sample sentences compared by the similarity demo at the bottom of the file.
text1 = 'How can I be a geologist?'
text2 = 'What should I do to be a geologist?'
class Similarity(object):
    """Word-level similarity measures for short pieces of text.

    Two independent measures are offered:

    * cosine similarity over word-count vectors (see ``text_to_vector``), and
    * Jaccard similarity over token collections (see ``tokenize``).
    """

    # Compiled once when the class is defined instead of on every call.
    _WORD = re.compile(r'\w+')

    def compute_cosine_similarity(self, string1, string2):
        """Return the cosine similarity of two word-count mappings.

        Despite their names, ``string1`` and ``string2`` are mappings from
        word to count, such as the ``Counter`` objects returned by
        ``text_to_vector``.

        Returns:
            The similarity rounded to 4 decimal places, or 0.0 when either
            mapping has zero magnitude (for example, when it is empty).
        """
        # Only words present in both mappings contribute to the dot product.
        shared = set(string1) & set(string2)
        numerator = sum(string1[word] * string2[word] for word in shared)
        # Squared magnitude of each vector.
        sum1 = sum(count ** 2 for count in string1.values())
        sum2 = sum(count ** 2 for count in string2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        return round(numerator / float(denominator), 4)

    def text_to_vector(self, text):
        """Return a ``Counter`` of the words in ``text``.

        A word is a maximal run of regex word characters; case is preserved
        and punctuation is dropped.
        """
        return Counter(self._WORD.findall(text))

    # Jaccard Similarity
    def tokenize(self, string):
        """Lower-case ``string`` and split it into whitespace-separated tokens.

        Runs of whitespace count as a single separator, so the result never
        contains empty tokens and an empty string yields an empty list.
        """
        return string.lower().split()

    def jaccard_similarity(self, string1, string2):
        """Return the Jaccard similarity of two token collections.

        ``string1`` and ``string2`` are iterables of hashable tokens, such as
        the lists returned by ``tokenize``. The result is
        ``len(intersection) / len(union)`` of their distinct tokens.

        Returns:
            A float in [0.0, 1.0]. Two empty collections are treated as
            identical and give 1.0 rather than dividing by zero.
        """
        first, second = set(string1), set(string2)
        union = first | second
        if not union:
            return 1.0
        return len(first & second) / float(len(union))
# Demo: compare the two sample sentences with both similarity measures.
similarity = Similarity()
# Word-count vectors (input to the cosine measure).
vector1 = similarity.text_to_vector(text1)
vector2 = similarity.text_to_vector(text2)
# Lower-cased token lists (input to the Jaccard measure).
token1 = similarity.tokenize(text1)
token2 = similarity.tokenize(text2)
cosine = similarity.compute_cosine_similarity(vector1, vector2)
# Output uses Python 2 print statements.
print 'Cosine Similarity:', cosine
jaccard = similarity.jaccard_similarity(token1,token2)
print 'Jaccard Similarity:', jaccard
>>>>>>> similarity-index/master