gaitsetpy.classification.utils.preprocess
Data Preprocessing for Classification
Maintainer: @aharshit123456
'''
gaitsetpy.classification.utils.preprocess
Data Preprocessing for Classification

Maintainer: @aharshit123456
'''
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


def preprocess_features(features):
    """
    Convert per-sensor feature dictionaries into a feature matrix X and label vector y.

    Each element of ``features`` is a dict with keys:
        - "name": sensor identifier (used only in diagnostic messages)
        - "features": mapping of feature name -> per-window values
          (scalars, or vectors that may be ragged across windows)
        - "annotations": per-window labels; its length defines the
          expected number of windows for that sensor

    Ragged vector features are zero-padded to their maximum per-window
    length; features whose window count disagrees with the annotations
    are skipped. Feature matrices from different sensors are padded or
    truncated to a common column count before stacking.

    Args:
        features: list of sensor feature dictionaries (see above).

    Returns:
        tuple: (X, y) where X is a float32 ndarray of shape
        (total_windows, n_features) and y is an ndarray of zero-based
        contiguous integer labels.

    Raises:
        ValueError: if no valid features or labels are found.
    """
    X = []
    y = []

    for sensor_dict in features:
        sensor_name = sensor_dict["name"]
        sensor_features = sensor_dict["features"]
        sensor_annotations = sensor_dict["annotations"]

        num_windows = len(sensor_annotations)  # Expected number of windows
        feature_arrays = []

        for key, values in sensor_features.items():
            # Object dtype tolerates ragged (mixed-length) per-window entries.
            feature_array = np.array(values, dtype=object)

            # Guard: an empty feature list would make feature_array[0] raise.
            if feature_array.size == 0:
                print(f"Skipping empty feature '{key}' in sensor '{sensor_name}'.")
                continue

            # Vector-valued feature: zero-pad every window to the max length.
            if isinstance(feature_array[0], (list, np.ndarray)):
                print(f"Fixing inconsistent feature '{key}' in sensor '{sensor_name}'.")

                # Max length of this feature across all windows (scalars count as 1).
                max_length = max(
                    len(f) if isinstance(f, (list, np.ndarray)) else 1
                    for f in feature_array
                )

                feature_array = np.array([
                    np.pad(np.ravel(f), (0, max_length - len(f)), 'constant', constant_values=0)
                    if isinstance(f, (list, np.ndarray))
                    else np.array([f] + [0] * (max_length - 1))
                    for f in feature_array
                ])

            # Drop features whose window count disagrees with the annotations.
            if len(feature_array) != num_windows:
                print(f"Skipping feature '{key}' due to mismatched length: {len(feature_array)} instead of {num_windows}.")
                continue

            feature_arrays.append(feature_array)

        if not feature_arrays:
            continue

        # Concatenate this sensor's features column-wise (one row per window).
        try:
            feature_matrix = np.column_stack(feature_arrays)
        except ValueError:
            print(f"Error: Features in sensor '{sensor_name}' have inconsistent shapes. Skipping sensor.")
            continue

        X.append(feature_matrix)
        y.append(np.array(sensor_annotations))

    if not X or not y:
        raise ValueError("No valid features or labels found.")

    # Standardize feature-matrix width across sensors.
    max_feature_dim = max(x.shape[1] for x in X)
    print(f"Standardizing all feature vectors to {max_feature_dim} dimensions.")

    # Pad (or truncate) every sensor's matrix to max_feature_dim columns.
    X = [
        np.pad(x, ((0, 0), (0, max_feature_dim - x.shape[1])), 'constant', constant_values=0)
        if x.shape[1] < max_feature_dim
        else x[:, :max_feature_dim]
        for x in X
    ]

    # Stack all sensors into a single (total_windows, n_features) matrix.
    X = np.vstack(X).astype(np.float32)
    y = np.concatenate(y)

    # Remap labels to zero-based contiguous integers.
    unique_labels = np.unique(y)
    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    y_remapped = np.array([label_map[label] for label in y])

    return X, y_remapped
def preprocess_features(features):
    """
    Convert per-sensor feature dictionaries into a feature matrix X and label vector y.

    Each element of ``features`` is a dict with keys:
        - "name": sensor identifier (used only in diagnostic messages)
        - "features": mapping of feature name -> per-window values
          (scalars, or vectors that may be ragged across windows)
        - "annotations": per-window labels; its length defines the
          expected number of windows for that sensor

    Ragged vector features are zero-padded to their maximum per-window
    length; features whose window count disagrees with the annotations
    are skipped. Feature matrices from different sensors are padded or
    truncated to a common column count before stacking.

    Args:
        features: list of sensor feature dictionaries (see above).

    Returns:
        tuple: (X, y) where X is a float32 ndarray of shape
        (total_windows, n_features) and y is an ndarray of zero-based
        contiguous integer labels.

    Raises:
        ValueError: if no valid features or labels are found.
    """
    X = []
    y = []

    for sensor_dict in features:
        sensor_name = sensor_dict["name"]
        sensor_features = sensor_dict["features"]
        sensor_annotations = sensor_dict["annotations"]

        num_windows = len(sensor_annotations)  # Expected number of windows
        feature_arrays = []

        for key, values in sensor_features.items():
            # Object dtype tolerates ragged (mixed-length) per-window entries.
            feature_array = np.array(values, dtype=object)

            # Guard: an empty feature list would make feature_array[0] raise.
            if feature_array.size == 0:
                print(f"Skipping empty feature '{key}' in sensor '{sensor_name}'.")
                continue

            # Vector-valued feature: zero-pad every window to the max length.
            if isinstance(feature_array[0], (list, np.ndarray)):
                print(f"Fixing inconsistent feature '{key}' in sensor '{sensor_name}'.")

                # Max length of this feature across all windows (scalars count as 1).
                max_length = max(
                    len(f) if isinstance(f, (list, np.ndarray)) else 1
                    for f in feature_array
                )

                feature_array = np.array([
                    np.pad(np.ravel(f), (0, max_length - len(f)), 'constant', constant_values=0)
                    if isinstance(f, (list, np.ndarray))
                    else np.array([f] + [0] * (max_length - 1))
                    for f in feature_array
                ])

            # Drop features whose window count disagrees with the annotations.
            if len(feature_array) != num_windows:
                print(f"Skipping feature '{key}' due to mismatched length: {len(feature_array)} instead of {num_windows}.")
                continue

            feature_arrays.append(feature_array)

        if not feature_arrays:
            continue

        # Concatenate this sensor's features column-wise (one row per window).
        try:
            feature_matrix = np.column_stack(feature_arrays)
        except ValueError:
            print(f"Error: Features in sensor '{sensor_name}' have inconsistent shapes. Skipping sensor.")
            continue

        X.append(feature_matrix)
        y.append(np.array(sensor_annotations))

    if not X or not y:
        raise ValueError("No valid features or labels found.")

    # Standardize feature-matrix width across sensors.
    max_feature_dim = max(x.shape[1] for x in X)
    print(f"Standardizing all feature vectors to {max_feature_dim} dimensions.")

    # Pad (or truncate) every sensor's matrix to max_feature_dim columns.
    X = [
        np.pad(x, ((0, 0), (0, max_feature_dim - x.shape[1])), 'constant', constant_values=0)
        if x.shape[1] < max_feature_dim
        else x[:, :max_feature_dim]
        for x in X
    ]

    # Stack all sensors into a single (total_windows, n_features) matrix.
    X = np.vstack(X).astype(np.float32)
    y = np.concatenate(y)

    # Remap labels to zero-based contiguous integers.
    unique_labels = np.unique(y)
    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    y_remapped = np.array([label_map[label] for label in y])

    return X, y_remapped