# gaitsetpy.classification.utils.preprocess
#
# Data Preprocessing for Classification
#
# Maintainer: @aharshit123456

 1'''
 2Data Preprocessing for Classification
 3
 4Maintainer: @aharshit123456
 5'''
 6import numpy as np
 7import pandas as pd
 8from sklearn.ensemble import RandomForestClassifier
 9from sklearn.model_selection import train_test_split
10from sklearn.metrics import accuracy_score, confusion_matrix
11
12
def preprocess_features(features):
    """
    Convert a list of per-sensor feature dicts into a feature matrix X and
    label vector y suitable for sklearn classifiers.

    Each element of ``features`` is expected to be a dict with keys:
      - "name": sensor identifier (used only in diagnostic messages)
      - "features": mapping of feature name -> per-window values
      - "annotations": per-window labels

    Vector-valued features are zero-padded to a common length within each
    feature, per-sensor feature matrices are padded/truncated to a common
    width across sensors, and labels are remapped to zero-based contiguous
    integers.

    Args:
        features: list of sensor feature dicts as described above.

    Returns:
        Tuple (X, y): X is a float32 array of shape (total_windows,
        n_features); y is an integer array of remapped labels.

    Raises:
        ValueError: if no sensor yields a valid feature matrix.
    """
    X = []
    y = []

    for sensor_dict in features:
        sensor_name = sensor_dict["name"]
        sensor_features = sensor_dict["features"]
        sensor_annotations = sensor_dict["annotations"]

        num_windows = len(sensor_annotations)  # Expected number of windows
        feature_arrays = []

        for key in sensor_features:
            feature_array = np.array(sensor_features[key], dtype=object)

            # Guard: an empty feature list would make feature_array[0] raise.
            if len(feature_array) == 0:
                print(f"Skipping empty feature '{key}' in sensor '{sensor_name}'.")
                continue

            # If windows hold vectors, pad/truncate them to a common length.
            if isinstance(feature_array[0], (list, np.ndarray)):
                print(f"Fixing inconsistent feature '{key}' in sensor '{sensor_name}'.")

                # Longest per-window vector for this feature (scalars count as 1).
                max_length = max(
                    len(f) if isinstance(f, (list, np.ndarray)) else 1
                    for f in feature_array
                )

                padded = []
                for f in feature_array:
                    if isinstance(f, (list, np.ndarray)):
                        # Ravel first: nested inputs can flatten to more
                        # elements than len(f), which would otherwise make
                        # the pad width negative. Truncate, then pad.
                        flat = np.ravel(f)[:max_length]
                        padded.append(
                            np.pad(flat, (0, max_length - len(flat)),
                                   'constant', constant_values=0)
                        )
                    else:
                        # Scalar mixed in with vectors: promote and zero-pad.
                        padded.append(np.array([f] + [0] * (max_length - 1)))
                feature_array = np.array(padded)

            # Drop features whose window count disagrees with the annotations.
            if len(feature_array) != num_windows:
                print(f"Skipping feature '{key}' due to mismatched length: {len(feature_array)} instead of {num_windows}.")
                continue

            feature_arrays.append(feature_array)

        if not feature_arrays:
            continue

        # Concatenate features per window (columns = features).
        try:
            feature_matrix = np.column_stack(feature_arrays)
        except ValueError:
            print(f"Error: Features in sensor '{sensor_name}' have inconsistent shapes. Skipping sensor.")
            continue

        X.append(feature_matrix)
        y.append(np.array(sensor_annotations))

    if not X or not y:
        raise ValueError("No valid features or labels found.")

    # Standardize feature-matrix widths across sensors.
    max_feature_dim = max(x.shape[1] for x in X)
    print(f"Standardizing all feature vectors to {max_feature_dim} dimensions.")

    # Pad narrower matrices with zeros; truncate wider ones.
    X = [
        np.pad(x, ((0, 0), (0, max_feature_dim - x.shape[1])),
               'constant', constant_values=0)
        if x.shape[1] < max_feature_dim else x[:, :max_feature_dim]
        for x in X
    ]

    # Stack all sensors into one matrix and flatten labels.
    X = np.vstack(X).astype(np.float32)
    y = np.concatenate(y)

    # Remap labels to zero-based contiguous integers.
    unique_labels = np.unique(y)
    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    y_remapped = np.array([label_map[label] for label in y])

    return X, y_remapped
def preprocess_features(features):
14def preprocess_features(features):
15    """
16    Convert the features dictionary into X (feature matrix) and y (labels),
17    ensuring all feature vectors have a consistent length.
18    """
19    X = []
20    y = []
21    feature_lengths = []  # Track feature lengths to standardize across sensors
22
23    for sensor_dict in features:
24        sensor_name = sensor_dict["name"]
25        sensor_features = sensor_dict["features"]
26        sensor_annotations = sensor_dict["annotations"]
27
28        num_windows = len(sensor_annotations)  # Expected number of windows
29        feature_arrays = []
30
31        for key in sensor_features:
32            feature_array = sensor_features[key]  # Extract the feature list
33            feature_array = np.array(feature_array, dtype=object)  # Convert to NumPy object array
34
35            # Ensure it's a list of equal-length vectors
36            if isinstance(feature_array[0], (list, np.ndarray)):
37                print(f"Fixing inconsistent feature '{key}' in sensor '{sensor_name}'.")
38
39                # Find max length for this feature across all windows
40                max_length = max(len(f) if isinstance(f, (list, np.ndarray)) else 1 for f in feature_array)
41                feature_lengths.append(max_length)  # Store max feature length for later
42
43                # Pad/truncate each feature to be the same length
44                feature_array = np.array([
45                    np.pad(np.ravel(f), (0, max_length - len(f)), 'constant', constant_values=0)
46                    if isinstance(f, (list, np.ndarray)) else np.array([f] + [0] * (max_length - 1))
47                    for f in feature_array
48                ])
49
50            # Ensure consistency in number of windows
51            if len(feature_array) != num_windows:
52                print(f"Skipping feature '{key}' due to mismatched length: {len(feature_array)} instead of {num_windows}.")
53                continue
54
55            feature_arrays.append(feature_array)
56
57        if not feature_arrays:
58            continue
59
60        # Concatenate features per window
61        try:
62            feature_matrix = np.column_stack(feature_arrays)
63        except ValueError:
64            print(f"Error: Features in sensor '{sensor_name}' have inconsistent shapes. Skipping sensor.")
65            continue
66
67        X.append(feature_matrix)
68        y.append(np.array(sensor_annotations))
69
70    if not X or not y:
71        raise ValueError("No valid features or labels found.")
72
73    # **Fix: Standardize feature matrix sizes across sensors**
74    max_feature_dim = max(map(lambda x: x.shape[1], X))  # Get the max feature size
75    print(f"Standardizing all feature vectors to {max_feature_dim} dimensions.")
76
77    # Pad/truncate all feature matrices to match max_feature_dim
78    X = [np.pad(x, ((0, 0), (0, max_feature_dim - x.shape[1])), 'constant', constant_values=0) if x.shape[1] < max_feature_dim else x[:, :max_feature_dim] for x in X]
79
80    # Stack all feature matrices
81    X = np.vstack(X).astype(np.float32)
82    y = np.concatenate(y)
83
84    # Remap labels to zero-based contiguous integers
85    unique_labels = np.unique(y)
86    label_map = {label: idx for idx, label in enumerate(unique_labels)}
87    y_remapped = np.array([label_map[label] for label in y])
88
89    # Also update annotations in feature_dicts
90    # This part of the code was not provided in the original file,
91    # so I'm not adding it as per instruction 1.
92
93    return X, y_remapped

# Converts the features dictionary into X (feature matrix) and y (labels),
# ensuring all feature vectors have a consistent length.