-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
196 lines (133 loc) · 5.78 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# used to change filepaths
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd
import numpy as np
from PIL import Image
from skimage.feature import hog
from skimage.color import rgb2grey
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, accuracy_score
# DISPLAY IMAGE OF EACH TYPE OF BEE
# Read the label table; the first CSV column is used as the dataframe index.
labels = pd.read_csv("datasets/labels.csv", index_col=0)
# Preview the first five rows of the label table.
display(labels.iloc[:5])
# Change the default of `root` to the local directory that holds the images.
def get_image(row_id, root="datasets/"):
    """
    Open the image that belongs to an image number and return it as an array.

    The file is looked up at ``<root>/<row_id>.jpg``.

    Parameters
    ----------
    row_id
        Image number; formatted into the file name as ``"{}.jpg"``.
    root
        Directory that contains the image files.

    Returns
    -------
    numpy.ndarray
        The opened image converted with ``np.array``.
    """
    filename = "{}.jpg".format(row_id)
    file_path = os.path.join(root, filename)
    # Use the image as a context manager so the underlying file is closed
    # deterministically, even if converting it to an array raises.
    with Image.open(file_path) as img:
        return np.array(img)
# Apis rows carry genus 0.0: pick the sixth image id among them and show it.
apis_row = labels.loc[labels["genus"] == 0.0].index[5]
plt.imshow(get_image(apis_row))
plt.show()
# Bombus rows carry genus 1.0: pick the sixth image id among them and show it.
bombus_row = labels.loc[labels["genus"] == 1.0].index[5]
plt.imshow(get_image(bombus_row))
plt.show()
# MANIPULATE IMAGE WITH rgb2grey
# Load the bombus image selected above (bombus_row) with get_image.
bombus = get_image(bombus_row)
# Report the shape of the colour image.
print('Color bombus image has shape: ', bombus.shape)
# Convert the bombus image to greyscale.
# NOTE(review): rgb2grey is the older spelling of skimage's rgb2gray and
# appears to have been removed from recent scikit-image releases - confirm
# against the installed version.
grey_bombus = rgb2grey(bombus)
# Draw the greyscale image.
plt.imshow(grey_bombus, cmap=mpl.cm.gray)
# greyscale bombus image only has one channel
print('Greyscale bombus image has shape: ', grey_bombus.shape)
# Without an explicit show() the figure is never displayed when this file is
# run as a script, and later plotting calls would draw onto the same axes.
plt.show()
# HISTOGRAM OF ORIENTED GRADIENTS (read more about oriented gradients here "https://scikit-image.org/docs/dev/auto_examples/features_detection/plot_hog.html")
# Run HOG on the greyscale bombus image; visualize=True makes hog also return
# an image rendering of the descriptor next to the feature values.
hog_features, hog_image = hog(
    grey_bombus,
    visualize=True,
    block_norm='L2-Hys',
    pixels_per_cell=(16, 16),
)
plt.imshow(hog_image, cmap=mpl.cm.gray)
# Display the HOG rendering now; without show() a script run never presents
# it and subsequent plots would be drawn on top of it.
plt.show()
# CREATE FEATURES FROM IMAGES
def create_features(img):
    """
    Build one flat feature vector for a single image.

    The vector is the flattened pixel values of ``img`` followed by the HOG
    descriptor of its greyscale version (L2-Hys block normalisation,
    16x16 pixels per cell).
    """
    # Raw pixel values of every channel, flattened to one dimension.
    pixel_values = img.flatten()
    # HOG descriptor computed on the greyscale version of the image.
    # NOTE(review): rgb2grey is the older spelling of skimage's rgb2gray and
    # appears to have been removed from recent scikit-image releases - confirm
    # against the installed version.
    gradient_values = hog(
        rgb2grey(img),
        block_norm='L2-Hys',
        pixels_per_cell=(16, 16),
    )
    # Pixels first, HOG values second, joined into a single array.
    return np.concatenate([pixel_values, gradient_values])
# Run the feature extractor once on the bombus image loaded earlier.
bombus_features = create_features(bombus)
# PREPROCESSING
def create_feature_matrix(label_dataframe):
    """
    Build the feature matrix for every image listed in ``label_dataframe``.

    Each value of ``label_dataframe.index`` is passed to ``get_image`` and the
    loaded image is turned into a feature vector by ``create_features``.
    The vectors are returned as one numpy array, one entry per index value,
    in index order.
    """
    per_image_features = [
        create_features(get_image(img_id))
        for img_id in label_dataframe.index
    ]
    # Convert the list of per-image arrays into a single array.
    return np.array(per_image_features)
# Build the feature matrix for every image listed in the labels dataframe.
feature_matrix = create_feature_matrix(labels)
# SCALING FEATURE MATRIX USING PRINCIPAL COMPONENT ANALYSIS
# Report the size of the raw feature matrix.
print('Feature matrix shape is: ', feature_matrix.shape)
# Standardise the features: fit the scaler, then apply it to the same matrix.
# NOTE(review): the scaler and the PCA below are fitted on all rows, before
# the train/test split later in this script, so held-out rows influence the
# transform - confirm this is acceptable for the reported scores.
ss = StandardScaler()
ss.fit(feature_matrix)
bees_stand = ss.transform(feature_matrix)
# Project the standardised matrix onto 500 principal components.
pca = PCA(n_components=500)
bees_pca = pca.fit_transform(bees_stand)
# Report the size of the reduced matrix.
print('PCA matrix shape is: ', bees_pca.shape)
# SPLIT YOUR DATASET INTO TRAIN AND TEST SETS
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    bees_pca,
    labels.genus.values,
    test_size=.3,
    random_state=1234123,
)
# Look at the distribution of labels in the train set. As a bare expression
# the counts were computed and then discarded when run as a script, so they
# are passed to display() to actually be shown.
display(pd.Series(y_train).value_counts())
# TRAIN THE MODEL
# Linear support vector classifier; probability=True is required for the
# predict_proba call used later in the script. fit() returns the fitted
# estimator, so construction and training are chained.
svm = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)
# PREDICT ON THE TEST SET AND SCORE PERFORMANCE
# Predicted labels for the held-out rows.
y_pred = svm.predict(X_test)
# Accuracy of the predicted labels against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Model accuracy is: ', accuracy)
# ANALYSE PERFORMANCE WITH ROC CURVE AND AUC
# Class probabilities for the test rows.
probabilities = svm.predict_proba(X_test)
# Keep the second column, used here as the probability of label 1.0.
y_proba = probabilities[:, 1]
# False positive rate and true positive rate at the different thresholds.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_proba, pos_label=1)
# Area under the ROC curve.
roc_auc = auc(false_positive_rate, true_positive_rate)
# Start a fresh figure so the curve is not drawn over an earlier image plot
# that may still be the current figure.
plt.figure()
plt.title('Receiver Operating Characteristic')
# False positive rate on the x axis, true positive rate on the y axis.
roc_plot = plt.plot(false_positive_rate,
                    true_positive_rate,
                    label='AUC = {:0.2f}'.format(roc_auc))
plt.legend(loc=0)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], ls='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# The plot was never displayed when run as a script; show it explicitly,
# matching the plt.show() calls used for the sample images.
plt.show()
# TODO: hyper-parameter tuning (improve the model)