## PYTHON SCRIPT DESIGNED TO BE EXECUTED FROM A NODE.JS SERVER
## AVAILABLE COMMANDS
# > python generate_caption.py generate
#     Generates captions for all dev-set images and stores them in random_captions.txt.
# > python generate_caption.py image <path_to_image>
#     Predicts a caption for the provided image.

# IMPORTS
import sys
from pickle import load

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Paths to the project root folder on the mounted drive
project_root_path = "./Python/"
models_root_path = project_root_path + "Models/"
variables_root_path = project_root_path + "Variables/"

# Function to load data from a .pkl file at filepath
def loadData(filepath):
    with open(filepath, "rb") as encoded_pickle:
        return load(encoded_pickle)

# Loading variables
max_caption_length = loadData(variables_root_path + "max_caption_length.pickle")
dev_dataset = loadData(variables_root_path + "dev_dataset.pickle")
# resnet_model = load_model(models_root_path + "resnet_model.h5")  # full Keras encoder, superseded by the TFLite model below
encoded_dev_images = loadData(variables_root_path + "encoded_dev_images_resnet.pickle")
index_to_word = loadData(variables_root_path + "index_to_word.pickle")
word_to_index = loadData(variables_root_path + "word_to_index.pickle")
loaded_language_model = load_model(models_root_path + "language_model_2")
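
# NOTE (assumption, inferred from how these objects are used below):
# encoded_dev_images maps image IDs to precomputed 2048-d ResNet feature
# vectors, and index_to_word / word_to_index are the vocabulary lookup
# tables used during decoding.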

# Load the TFLite ResNet encoder and allocate tensors
interpreter = tf.lite.Interpreter(
    model_path=models_root_path + "resnet_model.tflite"
)
interpreter.allocate_tensors()

# Get input and output tensor indices
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]
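
# NOTE (assumption): the encoder is expected to take a (1, 224, 224, 3)
# preprocessed image batch and return a (1, 2048) pooled feature vector;
# verify the exact shapes with interpreter.get_input_details() and
# interpreter.get_output_details() if the model changes.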

# FUNCTION TO PREDICT A CAPTION FROM A FEATURE VECTOR (greedy decoding)
def predict(feature_vec):
    partial_caption = "startseq"
    for _ in range(max_caption_length):
        # integer-encode the input sequence, skipping out-of-vocabulary words
        seq = [
            word_to_index[word]
            for word in partial_caption.split()
            if word in word_to_index
        ]
        # pad the input to the fixed caption length
        seq = pad_sequences([seq], maxlen=max_caption_length)
        # predict the next word's probability distribution
        model_softmax_output = loaded_language_model.predict(
            [feature_vec, seq], verbose=0
        )
        # take the most probable word index
        word_index = np.argmax(model_softmax_output)
        # map the integer back to a word
        word = index_to_word[word_index]
        partial_caption += " " + word
        if word == "endseq":
            break
    # strip the startseq token and a trailing endseq token, if present
    words = partial_caption.split()[1:]
    if words and words[-1] == "endseq":
        words = words[:-1]
    return " ".join(words)
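
# Example usage (hypothetical image ID), captioning a precomputed dev-set encoding:
#   vec = encoded_dev_images["example_image_id"].reshape((1, 2048))
#   print(predict(vec))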

# FUNCTION TO CONVERT AN IMAGE TO A FEATURE VECTOR
def image_to_feature_vec(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    # convert the image pixels to a numpy array
    x = image.img_to_array(img)
    # prepare the image batch for the ResNet encoder
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    # run the TFLite encoder to get the image features
    interpreter.set_tensor(input_index, x)
    interpreter.invoke()
    feature_vec = interpreter.get_tensor(output_index)
    # reshape to the (1, 2048) vector the language model expects
    return feature_vec.reshape((1, 2048))

# FUNCTION TO GENERATE CAPTIONS FOR ALL DEV DATASET IMAGES
def generate_captions():
    with open("./random_captions.txt", "w") as file:
        # caption the first 1000 dev-set images from their precomputed encodings
        for key in list(dev_dataset.keys())[:1000]:
            feature_vec = encoded_dev_images[key].reshape((1, 2048))
            file.write('"{}${}",\n'.format(key, predict(feature_vec)))
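
# NOTE (assumption): each line is written as a quoted, comma-terminated
# "key$caption" entry, so the file's contents can be pasted directly into a
# JavaScript array literal on the Node.js side.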

# FUNCTION TO GENERATE A CAPTION FROM AN IMAGE AT THE PROVIDED PATH
def generate_caption_from_image(image_path):
    feature_vec = image_to_feature_vec(image_path)
    print(predict(feature_vec))
    sys.stdout.flush()  # flush so the Node.js server receives the output immediately

if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "generate":
        generate_captions()
    elif len(sys.argv) > 2 and sys.argv[1] == "image":
        generate_caption_from_image(sys.argv[2])
    else:
        print("Usage: python generate_caption.py generate | image <path_to_image>")