import pandas as pd
import csv
import matplotlib.pyplot as plt
import cPickle
import numpy as np
import time
from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
# the data for the code below is available at
# https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
# here we use naive bayes and svm for sms classification
# the following is adapted from the tutorial at http://radimrehurek.com/data_science_python/
# and contains some additional thoughts and analysis beyond the tutorial
# function to split the messages, convert them to a bag-of-words representation and
# 1) remove stopwords like 'is', 'an' etc., since these words carry very little information about the target class
# 2) reduce each word to its morphological root, since 'going', for most purposes, carries the same information as 'go'
# 3) convert all messages to lowercase, since case does not add to the information a word carries
# all of the above reduce dimensionality: as dimensionality grows, the amount of data needed
# to fit a model that generalises well grows with it, and that much data is generally not
# available. Techniques like these therefore matter for text classification, where the
# bag-of-words representation is inherently high-dimensional.
# use the nltk corpus to get the stopwords
def convert(data):
    data = unicode(data, "utf-8").lower()
    data = TextBlob(data).words
    stop = stopwords.words('english')
    data = list(set(data) - set(stop))
    return [word.lemma for word in data]

def convert_direct(data):
    data = unicode(data, "utf-8").lower()
    return TextBlob(data).words
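# quick illustrative check (hypothetical sentence): the same text through both
# pre-processors. Note that convert() builds a set, so duplicate tokens and
# word order are discarded along with the stopwords.
sample = "The tickets are going going to the university"
print convert(sample)         # lemmas, stopwords removed, duplicates collapsed
print convert_direct(sample)  # plain lowercased tokens, stopwords kept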
# getting a feel for the data
data = [line.strip() for line in open('address of the sms file')]
print len(data)
for msg_no, msg in enumerate(data[:5]):
    print msg_no, ":", msg
#using pandas to get statistics of the data
df=pd.read_csv("/home/rishab/ML/cs229/post_collegeate_earning/preprocess_experiments/smsspamcollection/SMSSpamCollection", sep="\t", names=["labels","mssg"], quoting=csv.QUOTE_NONE)
#check data
print df.head()
#statistics
print df.groupby("labels").describe()
# getting the length of each message, which could be considered an important feature for filtering
df['len_of_mssg']=df['mssg'].map(lambda text : len(text))
print df.head()
print df.mssg.shape,df.labels.shape
print df.len_of_mssg.describe()
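# a quick visual check (a sketch; uses the matplotlib import above): histograms of
# message length split by label, to see whether length helps separate ham from spam
df.hist(column='len_of_mssg', by='labels', bins=50)
plt.show()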
#bag of words representation of data
bow=df.mssg.apply(convert)
print bow.head()
print TextBlob("this is a test sentence").tags
# difference between direct tokenisation and conversion with stopword removal
print df.mssg.head().apply(convert)
print df.mssg.head().apply(convert_direct)
# converting to a form suitable for building an ML model
bow_transformer = CountVectorizer(analyzer=convert).fit(df.mssg)
# print bow_transformer.vocabulary_
# print df.mssg[2]
# print convert(df.mssg[2])
# print bow_transformer.transform([df.mssg[2]])
mssg_to_bow = bow_transformer.transform(df.mssg)
test_msg=bow_transformer.transform(["festive offer"])
# print mssg_to_bow.nnz
# print mssg_to_bow
print "sparsity : ",(mssg_to_bow.nnz/float(mssg_to_bow.shape[0]*mssg_to_bow.shape[1]))*100.0
# using tf-idf for term weighting and document normalisation
tfidf_transformer = TfidfTransformer().fit(mssg_to_bow)
print tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]
data_tfidf = tfidf_transformer.transform(mssg_to_bow)
test_tfidf=tfidf_transformer.transform(test_msg)
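# after tf-idf each message is one row of a (num_messages x vocab_size) sparse
# matrix; checking the shape confirms no rows were dropped by the transform
print data_tfidf.shape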
# recording the time before training the model
millis = int(round(time.time() * 1000))
#building model
#naive bayes
print df.labels.head()
model = MultinomialNB().fit(data_tfidf,np.asarray(df['labels'],dtype="|S6"))
print model.predict(test_tfidf)
test_msgs=["Diwali bumper sale","Festive offer"]
# when we run prediction on "festive offer" we get a misclassification. This shows that a classifier
# is only as good as its training data, especially in text classification. The same holds for every
# example in test_msgs: the classifier cannot predict well on spam words it does not see in
# the training data, and as we check below, 'festive', 'offer' etc. are not in the vocabulary
try:
    print tfidf_transformer.idf_[bow_transformer.vocabulary_['festive']]
except Exception, e:
    print "not in the vocabulary of the CountVectorizer"
# experimenting with the data
# train/test split, holding out 30% of the data for validation/testing
msg_train,msg_test,msg_label_train,msg_label_test = train_test_split(df.mssg,df.labels,test_size=0.3)
print msg_label_train.shape,msg_train.shape
# creating the model using a pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])
scores=cross_val_score(pipeline, msg_train, np.asarray(msg_label_train,dtype="|S6"), cv=5, scoring="accuracy")
print scores
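# ham heavily outnumbers spam, so plain accuracy can flatter the model; a
# label-averaged f1 is a stricter check (a sketch; 'f1_macro' assumes a
# sklearn version that supports that scoring string)
scores_f1 = cross_val_score(pipeline, msg_train, np.asarray(msg_label_train, dtype="|S6"), cv=5, scoring="f1_macro")
print scores_f1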
# using gridsearch to get the optimised parameters
params = {
    'tfidf__use_idf': (True, False),
    'bow__analyzer': (convert, convert_direct)
}
grid = GridSearchCV(
    pipeline,
    params,
    refit=True,
    scoring='accuracy',
    cv=StratifiedKFold(msg_label_train, n_folds=5)
)
spam_gs = grid.fit(msg_train,np.asarray(msg_label_train,dtype="|S6"))
print spam_gs.grid_scores_
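# the single best configuration found by the search (illustrative inspection)
print spam_gs.best_params_
print spam_gs.best_score_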
# performing gridsearch for the svm model
# GridSearchCV is a convenient way to run an algorithm many times in order to
# tune the model parameters efficiently
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=convert)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC())
])
params_svm = [
    {
        'classifier__C': [1, 10, 100, 1000],
        'classifier__kernel': ['linear']
    },
    {
        'classifier__C': [1, 10, 100, 1000],
        'classifier__kernel': ['rbf'],
        'classifier__gamma': [0.0001, 0.001, 0.01]
    }
]
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=params_svm,
    refit=True,
    scoring='accuracy',
    cv=StratifiedKFold(msg_label_train, n_folds=5)
)
svm_results=grid_svm.fit(msg_train,np.asarray(msg_label_train,dtype="|S6"))
print svm_results.grid_scores_
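# since refit=True, grid_svm refits the best configuration on the full training
# split, so it can be scored on the 30% held-out test set (a sketch, using the
# metrics imported above)
test_preds = svm_results.predict(msg_test)
print classification_report(np.asarray(msg_label_test, dtype="|S6"), test_preds)
print confusion_matrix(np.asarray(msg_label_test, dtype="|S6"), test_preds)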
# storing the fitted model
with open("sms_predictor_model.pkl", 'wb') as fout:
    cPickle.dump(svm_results, fout)
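# a sketch of how the saved model would be reused later: unpickle and predict on
# raw text (the convert function must be importable wherever the pickle is
# loaded, since the pipeline's CountVectorizer references it)
with open("sms_predictor_model.pkl", 'rb') as fin:
    loaded_model = cPickle.load(fin)
print loaded_model.predict(["congratulations! you have won a free prize"])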
# when we run the grid search for spam detection, we observe that naive bayes runs faster but
# its accuracy is noticeably lower, around 0.95, compared to >0.98 in the case of SVC. This is primarily
# because naive bayes is a fairly simple model while SVC can fit a considerably more complex one.