# dataset.py (203 lines, 155 loc, 6.66 KB)
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.metrics


def get_dataset(data_name, n, dim, clusters=None, noshuffle=False):
    if dim % 8 != 0:
        print("Choose a dimensionality that is divisible by 8 (restriction)")
        exit(1)
    if data_name == 'gaussian':
        # roughly n points in total: n // dim drawn from each of dim spherical
        # Gaussians, each centered on a canonical basis vector of dimension dim
        n = 1000 if n is None else n
        return GaussianDataset(dimension=dim, variance=2.0, n=n)
    elif data_name == 'clustered':
        # n points in total, split equally over `clusters` Gaussian clusters
        # with well-separated random means
        n = 1000 if n is None else n
        return ClusteredDataset(dimension=dim, n=n, clusters=clusters, noshuffle=noshuffle)
    elif data_name == 'singlegaussian':
        n = 1000 if n is None else n
        return SingleGaussianDataset(dimension=dim, variance=2.0, n=n)
    elif data_name == 'audio':
        # Audio dataset as described in the NN-Descent publication:
        # 54,387 points, 192-dimensional
        my_file = Path("audio.data")
        if not my_file.is_file():
            print("audio.data not here, downloading...")
            urllib.request.urlretrieve("http://kluser.ch/audio.data", "audio.data")
        return AudioDataset(n)
    elif data_name == 'mnist' or data_name == 'digits':
        # MNIST dataset of 70k handwritten digits (784-dimensional)
        mnist_filenames = ["mnist_train.csv", "mnist_test.csv"]
        for csv_file in mnist_filenames:
            if not Path(csv_file).is_file():
                print("downloading " + csv_file)
                urllib.request.urlretrieve("https://pjreddie.com/media/files/" + csv_file, csv_file)
        return MnistDataset(n)
    elif data_name == 'pca_mnist':
        return MnistSortedDataset(umap=False)
    elif data_name == 'umap_mnist':
        return MnistSortedDataset(umap=True)
    else:
        print("dataset not supported")
        exit(1)
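
# Illustrative usage (not part of the original script): the synthetic datasets
# need no downloads, e.g.
#   ds = get_dataset('gaussian', n=1000, dim=16)               # ~1000 points, 16-dim
#   ds = get_dataset('clustered', n=2000, dim=8, clusters=4)   # 4 well-separated clusters
# 'audio' and 'mnist'/'digits' download their source files on first use; for
# those, dim only has to pass the divisibility-by-8 check above.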


class Dataset:
    # Thin wrapper around a 2D point matrix X of shape (N, D)
    def __init__(self, X):
        self.X = X
        self.N = X.shape[0]
        self.D = X.shape[1]

    def save(self, filename):
        # save dataset to a space-separated values file
        np.savetxt(filename, self.X)


class SingleGaussianDataset(Dataset):
    # n points drawn from a single spherical Gaussian centered at the origin
    def __init__(self, dimension, variance, n):
        cov = variance * np.identity(dimension)
        mean = np.zeros(dimension)
        X = np.random.multivariate_normal(mean, cov, n)
        np.random.shuffle(X)
        Dataset.__init__(self, X)


class ClusteredDataset(Dataset):
    # n points in total, split equally over `clusters` spherical Gaussians
    # with well-separated random means
    def __init__(self, dimension, clusters, n, noshuffle):
        print("Dimension:", dimension, ", clusters:", clusters, ", n:", n, ", shuffle:", not noshuffle)

        def min_dist(mean_matrix):
            # smallest pairwise distance between any two cluster means
            c = sklearn.metrics.pairwise_distances(mean_matrix, mean_matrix)
            np.fill_diagonal(c, np.inf)
            print(c.min())
            return c.min()

        # resample cluster means until every pair is more than 1000 apart
        separate_cluster_means = False
        means = []
        while not separate_cluster_means:
            print("generating means")
            means = np.array([np.random.randint(0, 1000000) * (np.random.rand(dimension) - np.repeat(0.5, dimension))
                              for i in range(clusters)])
            if min_dist(means) > 1000:
                separate_cluster_means = True
        self.means = means

        cov = 1 * np.identity(dimension)
        X = []
        for i in range(clusters):
            X.append(np.random.multivariate_normal(means[i], cov, n // clusters))
        X = np.vstack(X)
        if not noshuffle:
            np.random.shuffle(X)
            print("shuffled")
        print("done with dataset")
        Dataset.__init__(self, X)


class GaussianDataset(Dataset):
    # n // dimension points drawn from each of `dimension` spherical Gaussians,
    # each centered on a canonical basis vector, so n // dimension * dimension
    # points in total
    def __init__(self, dimension, variance, n):
        cov = variance * np.identity(dimension)
        X = []
        for i in range(dimension):
            mean = np.zeros(dimension)
            mean[i] = 1.0
            X.append(np.random.multivariate_normal(mean, cov, n // dimension))
        X = np.vstack(X)
        np.random.shuffle(X)
        Dataset.__init__(self, X)


class AudioDataset(Dataset):
    # Audio dataset as described in the NN-Descent publication: 54,387 points, 192-dimensional
    def __init__(self, n=0):
        path = 'audio.data'
        # binary layout as specified here:
        # http://lshkit.sourceforge.net/dc/d46/matrix_8h.html
        # header of three little-endian uint32 values (element size, number of rows,
        # row dimension), followed by the row-major float32 data
        with open(path, "rb") as f:
            def read_uint(uint_bytes):
                return int.from_bytes(uint_bytes, signed=False, byteorder='little')
            elem_size = read_uint(f.read(4))
            size = read_uint(f.read(4))
            dim_elem_size = read_uint(f.read(4))
            # read all 4-byte floats into a flat array of length size * dim_elem_size;
            # copy so the array is writable (np.frombuffer returns a read-only view)
            X = np.frombuffer(f.read(4 * size * dim_elem_size), dtype=np.float32).copy()
            X = X.reshape((size, dim_elem_size))
            np.random.shuffle(X)
            # optionally keep only the first n points
            if n != 0:
                X = X[:n, :]
            Dataset.__init__(self, X)


class MnistDataset(Dataset):
    # Default MNIST dataset: 70k handwritten digits, 784-dimensional
    def __init__(self, n):
        train = 'mnist_train.csv'  # 60k
        test = 'mnist_test.csv'    # 10k
        train_df = pd.read_csv(train, header=None)
        test_df = pd.read_csv(test, header=None)
        complete_df = pd.concat([train_df, test_df], axis=0)
        # drop the label column (column 0); keep the 784 pixel values per digit
        X = complete_df.iloc[:, 1:].to_numpy(dtype='float32')
        np.random.shuffle(X)
        # optionally keep only the first n points
        if n != 0:
            X = X[:n, :]
        Dataset.__init__(self, X)


class MnistSortedDataset(Dataset):
    # MNIST dataset (70,000 points, 784-dimensional) sorted along a 1D UMAP or PCA projection
    def __init__(self, umap=True):
        if umap:
            df = pd.read_csv('mnist_sort_umap.csv', header=None)
        else:
            df = pd.read_csv('mnist_sort_pca.csv', header=None)
        Dataset.__init__(self, df.to_numpy(dtype='float32'))
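

# Illustrative smoke test (not part of the original module): builds a small
# synthetic dataset, prints its shape, and writes it to disk when the file is
# run directly. The output file name 'example_gaussian.txt' is arbitrary.
if __name__ == "__main__":
    ds = get_dataset('gaussian', n=800, dim=8)
    print("generated", ds.N, "points of dimension", ds.D)
    ds.save("example_gaussian.txt")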