-
Notifications
You must be signed in to change notification settings - Fork 0
/
datasetprep.py
128 lines (114 loc) · 6.13 KB
/
datasetprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import json
import sys
import os
import random
# matplotlib and PIL imports
import numpy as np
import matplotlib.pyplot as plt # (load and display images)
# import matplotlib.gridspec as gridspec
# from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image
from collections import OrderedDict #storing state_dict - all the weights and parameter
# General pytorch imports:
import torch # root package
from torch.utils.data import Dataset # dataset representation and loading
# Neural Network API imports
#import torch.autograd as autograd # computation graph
from torch.autograd import Variable # variable node in computation graph
from torch import nn # neural networks
import torch.nn.functional as F # layers, activations and more, to use nn functions to # define your layers
from torch import optim # optimizers e.g. gradient descent, ADAM, etc.
# from torch.optim import lr_scheduler
# pytorch vision imports
import torchvision #enables use of CNN neural nets, ResNet VGG and other pretrained models
from torchvision import datasets, models, transforms # vision datasets, architectures & transforms
from torchvision.utils import make_grid
#cli project imports
#import utility
# Create a mapping from the label number (a number between 1 and 102) to the actual flower name.
def map_catalog_to_name(path='cat_to_name.json'):
    """
    Load the category-label-to-name mapping from a JSON file.

    Parameters:
     path - Path to the JSON mapping file. Defaults to 'cat_to_name.json',
            resolved against the current working directory.
    Returns:
     The object decoded from the JSON file (expected to be a dict mapping
     label strings to names).
    Raises:
     FileNotFoundError - if the file does not exist
     json.JSONDecodeError - if the file is not valid JSON
    """
    # JSON text is UTF-8 by specification; be explicit so that non-ASCII
    # names decode the same way regardless of the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        cat_to_name = json.load(f)
    return cat_to_name
def load_and_transform(data_dir, batch_size=32):
    """
    Build the training, validation and testing dataloaders for an image dataset.

    Each split is read with torchvision's ImageFolder from a sub-directory of
    data_dir ('train', 'valid' and 'test') and wrapped in a shuffling DataLoader.
    Training images get a random rotation, a random resized crop and a random
    horizontal flip; validation and testing images get a fixed resize followed
    by a center crop. Every split is converted to a tensor and normalized with
    the same per-channel mean and standard deviation.

    Parameters:
     data_dir - Path to the dataset root containing 'train', 'valid' and 'test' folders
     batch_size - Number of images per batch, used for all three loaders (default 32)
    Returns:
     A 6-tuple (dataloaders, trainloader, vloader, testloader, class_to_idx, dataset_sizes):
     dataloaders - dict with keys 'training', 'validation', 'testing' holding the loaders
     trainloader - the 'training' loader (random augmentation applied)
     vloader - the 'validation' loader (fixed resize and center crop)
     testloader - the 'testing' loader (fixed resize and center crop)
     class_to_idx - class_to_idx attribute of the training ImageFolder dataset
     dataset_sizes - dict with keys 'training_dataset', 'validation_dataset',
                     'testing_dataset' giving the number of samples in each dataset
    """
    # os.path.join avoids producing the absolute path '/train' when data_dir
    # is an empty string, and copes with a trailing separator on data_dir.
    train_dir = os.path.join(data_dir, 'train')
    valid_dir = os.path.join(data_dir, 'valid')
    test_dir = os.path.join(data_dir, 'test')

    # Per-channel normalization statistics.
    # NOTE(review): presumably the ImageNet mean/std expected by torchvision's
    # pretrained models - confirm against the model being trained.
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]

    def _evaluation_transforms():
        # Validation and testing share one deterministic pipeline: no random
        # augmentation, only resize + center crop to the 224x224 input size.
        return transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(norm_mean, norm_std),
        ])

    data_transforms = {
        # Training: random rotation, scaling/cropping and flipping as augmentation.
        'training_transforms': transforms.Compose([
            transforms.RandomRotation(25),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(norm_mean, norm_std),
        ]),
        'validation_transforms': _evaluation_transforms(),
        'testing_transforms': _evaluation_transforms(),
    }
    print("transformations completed")

    print("Initializing Datasets and Dataloaders...")
    # ImageFolder takes the class of each image from the sub-folder it lives in.
    image_datasets = {
        'training_dataset': datasets.ImageFolder(
            train_dir, transform=data_transforms['training_transforms']),
        'validation_dataset': datasets.ImageFolder(
            valid_dir, transform=data_transforms['validation_transforms']),
        'testing_dataset': datasets.ImageFolder(
            test_dir, transform=data_transforms['testing_transforms']),
    }

    # Each loader yields (images, labels) batches.
    # NOTE(review): the validation and testing loaders are shuffled as well;
    # this is kept to preserve existing behavior, although evaluation does
    # not normally need shuffling.
    dataloaders = {
        'training': torch.utils.data.DataLoader(
            image_datasets['training_dataset'], batch_size=batch_size, shuffle=True),
        'validation': torch.utils.data.DataLoader(
            image_datasets['validation_dataset'], batch_size=batch_size, shuffle=True),
        'testing': torch.utils.data.DataLoader(
            image_datasets['testing_dataset'], batch_size=batch_size, shuffle=True),
    }
    trainloader = dataloaders['training']
    vloader = dataloaders['validation']
    testloader = dataloaders['testing']

    # Number of samples per dataset, keyed like image_datasets.
    dataset_sizes = {name: len(dataset) for name, dataset in image_datasets.items()}
    print ("dataloading complete:/ sizes:", dataset_sizes, 'batch_size:', batch_size)

    # Class ids from the training set, returned so the caller can store them with the model.
    class_to_idx = image_datasets['training_dataset'].class_to_idx
    return dataloaders, trainloader, vloader, testloader, class_to_idx, dataset_sizes