-
Notifications
You must be signed in to change notification settings - Fork 5
/
class_balance.py
117 lines (80 loc) · 3.18 KB
/
class_balance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
"""
Check the class balance for each dataset from the raw dataset (before .tfrecord)
Note: sets CUDA_VISIBLE_DEVICES= so that it doesn't use the GPU.
Run something like the following to save the result:
./class_balance.py | tee class_balance.txt
(Based on samples_per_target.py and datasets/class_balance.py [see git tag v2])
"""
import os
import numpy as np
from absl import app
from absl import flags
from datasets import datasets
from load_datasets import load_da
FLAGS = flags.FLAGS
def get_labels(dataset):
    """ Gather the labels of every (x, y, ...) example into one 1-D array """
    return np.hstack([y.numpy() for _, y, _ in dataset])
def calc_class_balance(labels, num_classes):
    """ Compute P(y): the proportion of each class in the dataset
    (Copied from methods.py)

    labels -- 1-D array-like of non-negative integer class indices
    num_classes -- number of classes; labels >= num_classes are ignored,
        matching the original per-class equality-count behavior
    Returns a length-num_classes numpy array that sums to one.
    Raises ValueError if no label falls in [0, num_classes), where the
    old code silently produced a division-by-zero NaN array.
    """
    labels = np.asarray(labels)
    # One C-level pass instead of one O(n) scan per class; the slice drops
    # counts for any labels beyond num_classes-1.
    counts = np.bincount(labels, minlength=num_classes)[:num_classes]
    total = counts.sum()
    if total == 0:
        raise ValueError("no labels in range [0, num_classes) to normalize")
    # Normalize to make P(y) sum to one like a proper probability
    # distribution
    return counts / total
def class_balance(dataset, num_classes):
    """ Extract the labels from the dataset, then compute label proportions """
    labels = get_labels(dataset)
    return calc_class_balance(labels, num_classes)
def print_table(title, classes, total_width=5):
    """ Print a title line, then one row per dataset with its class percentages """
    print(title)
    for row_name, proportions in classes.items():
        # Render each proportion as a right-justified percentage column
        columns = ["{:.1f}".format(p*100).rjust(total_width) for p in proportions]
        print(row_name, *columns, sep=" ")
    print()
def print_class_balances(dataset_name, user_source_pairs):
    """ Print the train and test class-balance tables, one row per user """
    train_rows = {}
    test_rows = {}
    for user, source in user_source_pairs:
        row_name = dataset_name + "_" + str(user)
        # A source may lack either evaluation split; only include the ones
        # that exist.
        if source.train_evaluation is not None:
            train_rows[row_name] = class_balance(
                source.train_evaluation, source.num_classes)
        if source.test_evaluation is not None:
            test_rows[row_name] = class_balance(
                source.test_evaluation, source.num_classes)
    print_table(dataset_name + " (train)", train_rows)
    print_table(dataset_name + " (test)", test_rows)
def main(argv):
    """ Print the per-user class balance tables for every dataset """
    # Don't bother using the GPU for this
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    for dataset_name in datasets.list_datasets():
        # We run out of memory...
        if "casas" in dataset_name:
            continue
        pairs = []
        for user in datasets.get_dataset_users(dataset_name):
            # Note: test=False so we only look at the training samples, where
            # train=80% of training set, test=20% of training set, i.e. the
            # validation set
            sources, _ = load_da(dataset_name, str(user), "", test=False)
            # We load them one at a time
            assert len(sources) == 1
            pairs.append((user, sources[0]))
        print_class_balances(dataset_name, pairs)
# Entry point: absl parses command-line flags, then invokes main(argv)
if __name__ == "__main__":
    app.run(main)