%reset -f
#----------------------------------------------------
# Chapter Three
#----------------------------------------------------
# 3. Data Understanding
# Feature Distribution Over Classes
# Feature Correlation
# Feature Correlation by Regression
# Feature Importance
# SHAP
# LIME
# Class Distribution Ratio
# PCA and t-SNE Feature Plots
#----------------------------------------------------
import numpy as np
import pandas as pd
import os
import warnings
import time
from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import lime
import lime.lime_tabular
import shap
#----------------------------------------------------
# Record the start time
start_time = time.time()
# Suppress warnings
warnings.filterwarnings("ignore")
# Parse BVH files
def parse_bvh(file_path):
    """Parse a BVH file into its header lines and motion-data array."""
    print(f"Parsing file: {file_path}")
    with open(file_path, 'r') as file:
        lines = file.readlines()
    header, motion_data = [], []
    capture_data = False
    for line in lines:
        if "MOTION" in line:
            capture_data = True
        elif capture_data:
            # Skip the "Frames:" / "Frame Time:" metadata lines and any blank lines
            if line.strip().startswith("Frames") or line.strip().startswith("Frame Time") or not line.strip():
                continue
            motion_data.append(np.fromstring(line, sep=' '))
        else:
            header.append(line)
    print(f"Finished parsing: {file_path}")
    return header, np.array(motion_data)
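# For reference, the MOTION block that parse_bvh expects follows the standard
# BVH layout (the metadata lines are the ones skipped above):
#   MOTION
#   Frames: 120
#   Frame Time: 0.033333
#   12.34 0.56 -7.89 ...   <- one line of space-separated channel values per frame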
def read_bvh(filename):
    """Read a BVH file and return only its motion data."""
    header, motion_data = parse_bvh(filename)
    return motion_data
def interpolate_frames(motion_data, target_frame_count):
    """Linearly interpolate BVH frames to match a target frame count."""
    print(f"Interpolating frames to match target frame count: {target_frame_count}")
    original_frame_count = len(motion_data)
    original_time = np.linspace(0, 1, original_frame_count)
    target_time = np.linspace(0, 1, target_frame_count)
    interpolated_frames = []
    # Interpolate each channel (column) independently over normalized time
    for channel in np.array(motion_data).T:
        interpolator = interp1d(original_time, channel.astype(float), kind='linear')
        interpolated_frames.append(interpolator(target_time))
    print(f"Finished interpolation: original frame count = {original_frame_count}, target frame count = {target_frame_count}")
    return np.array(interpolated_frames).T
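# Toy example (illustrative values, not from the dataset): stretching a
# 3-frame, single-channel clip to 5 frames with linear interpolation:
#   interpolate_frames(np.array([[0.0], [1.0], [2.0]]), 5)
#   -> [[0.0], [0.5], [1.0], [1.5], [2.0]]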
def find_max_frames(folder_path):
    """Find the maximum number of frames among all BVH files in a folder."""
    print(f"Finding maximum number of frames in folder: {folder_path}")
    max_frames = 0
    for filename in os.listdir(folder_path):
        if filename.endswith('.bvh'):
            motion_data = read_bvh(os.path.join(folder_path, filename))
            max_frames = max(max_frames, len(motion_data))
    print(f"Maximum frames found: {max_frames}")
    return max_frames
def process_bvh_files(folder_path, max_frames):
    """Process each BVH file in the folder, interpolating every clip to the same number of frames."""
    print(f"Processing BVH files in folder: {folder_path}")
    all_features = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.bvh'):
            print(f"Processing file: {filename}")
            full_path = os.path.join(folder_path, filename)
            motion_data = read_bvh(full_path)
            interpolated_frames = interpolate_frames(motion_data, max_frames)
            all_features.append(interpolated_frames)  # Keep the 2D (frames x channels) array for feature extraction
            print(f"Finished processing file: {filename}")
    print(f"Finished processing all BVH files in folder: {folder_path}")
    return np.array(all_features)
#------------------------------------------------------------------
# Feature Extraction
#------------------------------------------------------------------
def extract_motion_features(motion_data):
    """Extract key motion features: angular velocity, acceleration, jerk, and range of motion."""
    print("Extracting motion features...")
    channels_per_joint = 3  # 3 channels per joint (e.g., X, Y, Z rotations)
    num_joints = motion_data.shape[1] // channels_per_joint
    features = []
    for i in range(num_joints):
        joint_rotations = motion_data[:, i * channels_per_joint:(i + 1) * channels_per_joint]
        # Angular velocity: first derivative of joint rotation
        angular_velocity = np.diff(joint_rotations, axis=0)
        # Acceleration: derivative of angular velocity
        acceleration = np.diff(angular_velocity, axis=0)
        # Jerk: derivative of acceleration
        jerk = np.diff(acceleration, axis=0)
        # Range of motion: max rotation minus min rotation per channel
        range_of_motion = np.max(joint_rotations, axis=0) - np.min(joint_rotations, axis=0)
        # Flatten all features for this joint and append
        joint_features = np.concatenate([angular_velocity.flatten(), acceleration.flatten(), jerk.flatten(), range_of_motion])
        features.append(joint_features)
    # Flatten the features of all joints into one vector
    flattened_features = np.concatenate(features)
    print("Feature extraction complete.")
    return flattened_features
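# Sanity check on the per-joint feature length: with T frames and 3 channels,
# each joint contributes (T-1)*3 velocity + (T-2)*3 acceleration + (T-3)*3 jerk
# + 3 range-of-motion values (np.diff shortens the sequence by one each time).
# For T = 100 that is 297 + 294 + 291 + 3 = 885 values per joint.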
# -------------------------------
# Train Folder
train_folder_path = 'Small Dataset/'
# -------------------------------
# Find maximum frame size in the training data
print("Finding maximum frames in the training dataset...")
max_frames_train = find_max_frames(train_folder_path)
# Process BVH files using the maximum frame size
print("Processing training data...")
all_features_train = process_bvh_files(train_folder_path, max_frames_train)
# Extract motion features for each processed BVH file
# (all_features_train is already interpolated to max_frames_train, so the inner
# interpolate_frames call is effectively a pass-through here)
print("Extracting features from processed data...")
X_features = np.array([extract_motion_features(interpolate_frames(motion_data, max_frames_train)) for motion_data in all_features_train])
# Train Labels
print("Assigning labels to the dataset...")
C_Angry = [0] * 42
C_Depressed = [1] * 42
C_Neutral = [2] * 42
C_Proud = [3] * 32
Labels = np.array(C_Angry + C_Depressed + C_Neutral + C_Proud, dtype=np.int32)
print("Labels assigned.")
# Standardize the data
print("Standardizing the data...")
scaler = StandardScaler()
X_standardized = scaler.fit_transform(all_features_train.reshape(len(all_features_train), -1)) # Flatten for standardization
print("Data standardized.")
# Normalize the data (min-max scaling)
print("Normalizing the data...")
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)
print("Data normalized.")
# Handle NaN values
print("Handling NaN values...")
imputer = SimpleImputer(strategy='mean')
X_cleaned = imputer.fit_transform(X_normalized)
print("NaN values handled.")
# Final feature set and labels
X = X_cleaned
Y = Labels
# X_features holds the flattened extracted motion features; note that the steps
# below operate on X (the standardized raw frame data), not on X_features
print(f"Feature extraction completed. X_features shape: {X_features.shape}")
Xtemp = X  # Keep a copy of the pre-NCA feature matrix (used later for sample-count reporting)
#------------------------------------------------------------------
# Set the number of features to select (this can be adjusted as needed)
num_features_to_select = 10 # Example number, adjust based on dataset
#------------------------------------------------------------------
# Standardize the data before applying NCA
scaler = StandardScaler()
X_features_scaled = scaler.fit_transform(X)
# Perform Neighborhood Components Analysis (NCA)
print("Performing NCA feature selection...")
nca = NeighborhoodComponentsAnalysis(n_components=num_features_to_select, random_state=42)
X = nca.fit_transform(X_features_scaled, Y)
print(f"Selected {num_features_to_select} features using NCA.")
#-----------------------------------------------------------------------------------
# Data Understanding and Features
#-----------------------------------------------------------------------------------
# Feature distribution over classes
# Define the feature (NCA component) indices to visualize
feature_indices_to_plot = [5, 7, 2]
# Select the desired features for visualization using their indices
X_selected_for_plot = X[:, feature_indices_to_plot]
# Number of features to plot
num_features_to_plot = len(feature_indices_to_plot)
# Create subplots for each feature
fig, axes = plt.subplots(num_features_to_plot, 1, figsize=(10, 20), sharex=True)
# Loop through the selected features and plot distribution for each class
for idx, feature_index in enumerate(feature_indices_to_plot):
    feature_values = X_selected_for_plot[:, idx]
    for class_label in np.unique(Y):
        sns.kdeplot(feature_values[Y == class_label], ax=axes[idx], label=f'Class {class_label}', fill=True)
    axes[idx].set_title(f'Feature {feature_index} Distribution Over Classes')
    axes[idx].set_xlabel('Feature Value')
    axes[idx].set_ylabel('Density')
    axes[idx].legend()
plt.tight_layout()
plt.show()
#--------------------------------
# Feature Correlation
# Define the number of features to plot (select a subset of features)
desired_features = 8 # Change this to the number of features to visualize
num_features = min(desired_features, X.shape[1])
# Select the subset of the correlation matrix for the desired number of features
correlation_matrix_subset = np.corrcoef(X[:, :num_features], rowvar=False)
# Plot the correlation matrix for the selected number of features
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_subset, annot=True, cmap='coolwarm', fmt='.2f', cbar=True, square=True)
plt.title(f"Feature Correlation Matrix for {num_features} Features")
plt.show()
# Feature Correlation by Regression
# Convert X to a DataFrame for easier manipulation
Xtemp_df = pd.DataFrame(X)
# Define the pair of features to regress against each other (features 7 and 8 here)
feature_1_index = 7
feature_2_index = 8
# Extract the data for the two features
x = Xtemp_df.iloc[:, feature_1_index].values.reshape(-1, 1) # Reshape for the model
y = Xtemp_df.iloc[:, feature_2_index].values
# Compute correlation coefficient
correlation_value = np.corrcoef(x[:, 0], y)[0, 1]
# Set up the plot
plt.figure(figsize=(15, 5))
# Linear regression plot
plt.subplot(1, 2, 1)
sns.regplot(x=x[:, 0], y=y, scatter_kws={"s": 10}, line_kws={"color": "red"})
plt.title(f"Linear Regression (Corr: {correlation_value:.2f})")
plt.xlabel(f"Feature {feature_1_index + 1}")
plt.ylabel(f"Feature {feature_2_index + 1}")
# Polynomial (non-linear) regression plot
plt.subplot(1, 2, 2)
# Fit a 3rd-degree polynomial
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(x)
poly_reg_model = LinearRegression()
poly_reg_model.fit(x_poly, y)
# Predict the polynomial regression line
y_poly_pred = poly_reg_model.predict(x_poly)
# Plot the non-linear regression with a smooth curve
plt.scatter(x, y, s=10, label="Data Points")
plt.plot(np.sort(x[:, 0]), y_poly_pred[np.argsort(x[:, 0])], color='red', label="Polynomial Fit (Degree 3)")
plt.title(f"Polynomial Regression (Corr: {correlation_value:.2f})")
plt.xlabel(f"Feature {feature_1_index + 1}")
plt.ylabel(f"Feature {feature_2_index + 1}")
plt.legend()
plt.tight_layout()
plt.show()
#-----------------------------------------------
# Feature Importance
# Random forest feature importance. A RandomForestRegressor is fitted on the
# integer class labels here; for a strictly categorical treatment, use
# RandomForestClassifier instead
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit the model on the data
model.fit(X, Y)
# Get feature importances
feature_importances = model.feature_importances_
# Create a DataFrame for easier handling and sorting
feature_importance_df = pd.DataFrame({
    'Feature': np.arange(X.shape[1]),
    'Importance': feature_importances
})
# Sort the features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Print feature importances
print("Feature Importance:\n", feature_importance_df)
# Plot feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='b', align='center')
plt.gca().invert_yaxis() # To display the most important at the top
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()
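# A model-agnostic cross-check on the impurity-based importances above:
# scikit-learn's permutation importance. Sketch only (not in the original
# script); uncomment to run.
# from sklearn.inspection import permutation_importance
# perm = permutation_importance(model, X, Y, n_repeats=10, random_state=42)
# print("Permutation importances (mean over repeats):\n", perm.importances_mean)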
#-----------------------------------------------
# SHAP
# Same random forest regressor as above (TreeExplainer also supports
# RandomForestClassifier for classification tasks)
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit the model on the data
model.fit(X, Y)
# SHAP explainer
explainer = shap.TreeExplainer(model)
# Calculate SHAP values (Shapley values) for all data points
shap_values = explainer.shap_values(X)
# Print SHAP values for the first instance (for interpretation)
print("SHAP values for the first instance:\n", shap_values[0])
# Plot SHAP summary plot for feature importance
shap.summary_plot(shap_values, X, plot_type="bar")
# Detailed SHAP summary plot (beeswarm showing per-feature value distributions)
shap.summary_plot(shap_values, X)
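# Optional follow-up (sketch, not in the original script): a SHAP dependence
# plot shows how one feature's value drives its SHAP contribution; feature
# index 0 is an arbitrary illustrative choice. Uncomment to run.
# shap.dependence_plot(0, shap_values, X)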
#-------------------------------------------
# LIME
# Same random forest regressor as above; with a classifier, set
# mode='classification' in the explainer below and pass model.predict_proba
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, Y)
# LIME explainer for tabular data
explainer = lime.lime_tabular.LimeTabularExplainer(
    X,
    feature_names=[f"Feature {i}" for i in range(X.shape[1])],
    verbose=True,
    mode='regression')  # Use mode='classification' when explaining a classifier
# Select a specific instance to explain (index 1 here; change as needed)
instance_to_explain = X[1]
# Generate LIME explanation for the selected instance
exp = explainer.explain_instance(instance_to_explain, model.predict)
# Print the feature importance for this instance
print("LIME explanation for the selected instance:\n")
exp.show_in_notebook(show_table=True)
# Plot the LIME explanation (bar chart)
exp.as_pyplot_figure()
plt.show()
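# The same local explanation is also available programmatically (useful
# outside notebooks): exp.as_list() returns (feature, weight) pairs from the
# fitted local model.
print("LIME feature weights:", exp.as_list())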
#--------------------------------------
# Class Distribution Ratio
# Count the number of samples per class
class_counts = Counter(Y)
# Print the class distribution
print("Class Distribution (Number of samples per class):")
for class_label, count in class_counts.items():
    print(f"Class {class_label}: {count} samples")
# Plot the class distribution
plt.figure(figsize=(8, 6))
sns.barplot(x=list(class_counts.keys()), y=list(class_counts.values()), palette='viridis')
plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Number of Samples")
plt.show()
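# The distribution ratio itself: majority count over minority count. With the
# label counts assigned above (42/42/42/32) this is 42/32 ≈ 1.31, a mild imbalance.
imbalance_ratio = max(class_counts.values()) / min(class_counts.values())
print(f"Class imbalance ratio (majority/minority): {imbalance_ratio:.2f}")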
# PCA Plot ---------------------------------------------
# Step 1: Augment the data to exactly 2000 samples
target_samples = 2000 # Total desired samples
classes, counts = np.unique(Y, return_counts=True)
# Adjust the target for each class depending on the current distribution
target_per_class = target_samples // len(classes) # Divide 2000 equally across all classes
sampling_strategy = {cls: target_per_class for cls in classes} # Adjust this if classes are not balanced
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_augmented, Y_augmented = ros.fit_resample(X, Y)
# Step 2: Standardize the features before PCA to ensure proper scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_augmented)
# Step 3: Apply PCA to reduce the dimensionality to 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Check the number of samples after augmentation
print(f"Original number of samples: {Xtemp.shape[0]}")
print(f"Augmented number of samples: {X_augmented.shape[0]}")
# Step 4: Plot the augmented data with PCA in 2D, and make sure the points are visibly scattered
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=Y_augmented, palette='viridis', s=100, alpha=0.7)
# Add labels and title
plt.title('PCA of Augmented Features (2D Projection)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Class")
plt.show()
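# Optional sanity check (not in the original script): how much of the variance
# the two principal components actually capture.
print(f"PCA explained variance ratios: {pca.explained_variance_ratio_}")
print(f"Total variance captured by 2 components: {pca.explained_variance_ratio_.sum():.2%}")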
# # t-SNE Plot (alternative 2D embedding; uncomment this block to run) ----------------------------------
# from imblearn.over_sampling import RandomOverSampler
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.preprocessing import StandardScaler
# # Step 1: Augment the data to exactly 2000 samples
# target_samples = 2000 # Total desired samples
# classes, counts = np.unique(Y, return_counts=True)
# # Adjust the target for each class depending on the current distribution
# target_per_class = target_samples // len(classes) # Divide 2000 equally across all classes
# sampling_strategy = {cls: target_per_class for cls in classes} # Adjust this if classes are not balanced
# ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
# X_augmented, Y_augmented = ros.fit_resample(X, Y)
# # Step 2: Standardize the features before t-SNE to ensure proper scaling
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X_augmented)
# # Step 3: Apply t-SNE to reduce the dimensionality to 2 components
# tsne = TSNE(n_components=2, random_state=42, perplexity=100, n_iter=1000) # Adjust parameters as needed
# X_tsne = tsne.fit_transform(X_scaled)
# # Check the number of samples after augmentation
# print(f"Original number of samples: {Xtemp.shape[0]}")
# print(f"Augmented number of samples: {X_augmented.shape[0]}")
# # Step 4: Plot the augmented data with t-SNE in 2D, and make sure the points are visibly scattered
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=Y_augmented, palette='viridis', s=50, alpha=0.7)
# # Add labels and title
# plt.title('t-SNE of Augmented Features (2D Projection)')
# plt.xlabel('t-SNE Component 1')
# plt.ylabel('t-SNE Component 2')
# plt.legend(title="Class")
# plt.show()
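#----------------------------------------------------
# Report total runtime from the start_time recorded at the top of the script
print(f"Total elapsed time: {time.time() - start_time:.2f} seconds")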