-
Notifications
You must be signed in to change notification settings - Fork 1
/
stl10_input.py
173 lines (136 loc) · 5.22 KB
/
stl10_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from __future__ import print_function
import sys
import os, sys, tarfile, errno
import numpy as np
import matplotlib.pyplot as plt
if sys.version_info >= (3, 0, 0):
import urllib.request as urllib # ugly but works
else:
import urllib
# Prefer imageio's writer; fall back to the legacy scipy helper only when
# imageio itself cannot be imported. Catching ImportError (rather than a bare
# except) keeps KeyboardInterrupt/SystemExit and unrelated errors visible.
try:
    from imageio import imsave
except ImportError:
    # NOTE(review): scipy.misc.imsave is absent from recent SciPy releases,
    # so this fallback presumably only works with an old SciPy - confirm.
    from scipy.misc import imsave
print(sys.version_info)
# image shape
HEIGHT = 96
WIDTH = 96
DEPTH = 3

# size of a single image in bytes (one byte per pixel value)
SIZE = HEIGHT * WIDTH * DEPTH

# path to the directory with the data
DATA_DIR = './data'

# url of the binary data
DATA_URL = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'

# directory under DATA_DIR in which the binary files are expected;
# every path below is derived from it so they follow DATA_DIR
_BINARY_DIR = DATA_DIR + '/stl10_binary'

# path to the binary train file with image data
TRAIN_DATA_PATH = _BINARY_DIR + '/train_X.bin'

# path to the binary train file with labels
TRAIN_LABEL_PATH = _BINARY_DIR + '/train_y.bin'

# path to the binary test file with image data
TEST_DATA_PATH = _BINARY_DIR + '/test_X.bin'

# path to the binary test file with labels
TEST_LABEL_PATH = _BINARY_DIR + '/test_y.bin'

# path to the binary file with the unlabeled image data
UNLABELED_DATA_PATH = _BINARY_DIR + '/unlabeled_X.bin'
def read_labels(path_to_labels):
    """
    Load the label file of an STL-10 split.

    The file is read as raw bytes, one unsigned 8-bit value per label.
    :param path_to_labels: path to the binary file containing labels from the STL-10 dataset
    :return: a 1-D uint8 array containing the labels
    """
    with open(path_to_labels, 'rb') as label_file:
        return np.fromfile(label_file, dtype=np.uint8)
def read_all_images(path_to_data):
    """
    Load every image stored in an STL-10 binary image file.

    :param path_to_data: the file containing the binary images from the STL-10 dataset
    :return: a uint8 array of shape (num_images, 96, 96, 3) containing all the images
    """
    # Pull the whole file into memory as a flat run of uint8 values.
    with open(path_to_data, 'rb') as image_file:
        raw = np.fromfile(image_file, dtype=np.uint8)

    # Each image is stored channel by channel (96*96 red values, then
    # green, then blue), so the flat data is first viewed as 3x96x96
    # chunks. The leading -1 lets numpy work out the number of images
    # from the file size.
    channel_first = raw.reshape(-1, 3, 96, 96)

    # Reverse the three image axes to get a channel-last layout that
    # e.g. matplotlib.imshow can display. Skip or undo this step if the
    # consumer (such as a CNN) wants the channels kept separate.
    return channel_first.transpose(0, 3, 2, 1)
def read_single_image(image_file):
    """
    Read the next image from an already opened STL-10 binary file.

    CAREFUL! - the argument is an open file object, not a path, so the read
    position advances and stays advanced after this function returns.
    :param image_file: the open file containing the images
    :return: a single image as a 96x96x3 uint8 array
    """
    # count limits the read to the SIZE uint8 values of exactly one image
    flat = np.fromfile(image_file, dtype=np.uint8, count=SIZE)
    # View the values as three 96x96 channel planes, then reverse the axes
    # into the channel-last display format. Drop or undo the transpose if
    # the consumer (such as a CNN) wants the channels kept separate.
    return flat.reshape(3, 96, 96).transpose(2, 1, 0)
def plot_image(image):
    """
    Draw an image with pyplot's imshow and then call pyplot's show.

    :param image: the image to be plotted in a 3-D matrix format
    :return: None
    """
    plt.imshow(image)
    plt.show()
def save_image(image, name):
    """
    Write an image to disk as a PNG file.

    :param image: the image data handed to imsave
    :param name: target path without extension; ".png" is appended
    :return: None
    """
    target = "%s.png" % name
    imsave(target, image, format="png")
def download_and_extract():
    """
    Download and extract the STL-10 dataset.

    The archive named by DATA_URL is stored inside DATA_DIR, which is
    created when missing. When the archive file is already present nothing
    is downloaded or unpacked.
    :return: None
    """
    dest_directory = DATA_DIR
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if os.path.exists(filepath):
        # Archive already on disk: nothing to fetch.
        return

    def _progress(count, block_size, total_size):
        # total_size may be reported as -1 (unknown) or 0; only compute a
        # percentage when it is positive, so the hook can never divide by
        # zero or print a negative value.
        if total_size > 0:
            # The last block usually overshoots the file size; cap at 100.
            percent = min(
                float(count * block_size) / float(total_size) * 100.0,
                100.0)
            sys.stdout.write('\rDownloading %s %.2f%%' % (filename, percent))
        else:
            sys.stdout.write('\rDownloading %s' % filename)
        sys.stdout.flush()

    filepath, _ = urllib.urlretrieve(DATA_URL, filepath, reporthook=_progress)
    print('Downloaded', filename)
    # NOTE(review): the archive is fetched over plain HTTP and unpacked with
    # extractall, which trusts the member paths inside the archive. Consider
    # an extraction filter or member validation if the source is not trusted.
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(filepath, 'r:gz') as archive:
        archive.extractall(dest_directory)
def save_images(images, labels, split):
    """
    Save every image below ./img/stl10/<split>/<label>/, named by its index.

    :param images: sequence of images, processed in order
    :param labels: labels indexed in parallel with images; labels[i] is the
        label of the i-th image
    :param split: name of the sub-directory for this subset, e.g. 'train'
    :return: None
    :raises OSError: if a target directory cannot be created
    """
    print("Saving images to disk")
    # Directories already created during this call, so makedirs runs once
    # per label instead of once per image.
    created = set()
    for i, image in enumerate(images):
        label = labels[i]
        directory = './img/stl10/' + split + '/' + str(label) + '/'
        if directory not in created:
            # exist_ok=True already tolerates an existing directory; any
            # other OSError (e.g. permissions) is a real failure and is
            # allowed to propagate instead of being silently dropped.
            os.makedirs(directory, exist_ok=True)
            created.add(directory)
        filename = directory + str(i)
        print(filename)
        save_image(image, filename)
if __name__ == "__main__":
    # download data if needed
    download_and_extract()

    # The labelled splits each have an image file and a label file.
    for split, data_path, label_path in (
            ('train', TRAIN_DATA_PATH, TRAIN_LABEL_PATH),
            ('test', TEST_DATA_PATH, TEST_LABEL_PATH)):
        images = read_all_images(data_path)
        labels = read_labels(label_path)
        save_images(images, labels, split)

    # The unlabeled split has no label file, so every image gets the
    # placeholder label 0. np.int64 is used because the np.long alias is
    # not available in every NumPy release.
    images = read_all_images(UNLABELED_DATA_PATH)
    labels = np.zeros(len(images), dtype=np.int64)
    save_images(images, labels, 'unlabeled')