gaitsetpy.dataset.physionet
PhysioNet VGRF Dataset Loader. Maintainer: @aharshit123456
This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader. The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.
Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
'''
PhysioNet VGRF Dataset Loader.
Maintainer: @aharshit123456

This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader.
The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with
Parkinson's disease and healthy controls.

Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
'''

import os
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from glob import glob
import requests
from tqdm import tqdm
import zipfile
from ..core.base_classes import BaseDatasetLoader
from .utils import sliding_window


class PhysioNetLoader(BaseDatasetLoader):
    """
    PhysioNet VGRF dataset loader class.

    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's
    disease and healthy controls.

    Features concurrent downloading for efficient data retrieval.
    """

    def __init__(self, max_workers: int = 8):
        """
        Initialize PhysioNet loader with concurrent download support.

        Args:
            max_workers: Maximum number of concurrent download threads (default: 8)
        """
        super().__init__(
            name="physionet",
            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls",
            max_workers=max_workers
        )
        # Static dataset description consumed by get_sensor_info()/get_subject_info().
        self.metadata = {
            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
                        'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
            'sampling_frequency': 100,  # 100 Hz sampling frequency
            'subjects': {
                'Co': 'Control subjects',
                'Pt': 'Parkinson\'s disease patients'
            },
            'window_size': 600,  # 6 seconds at 100 Hz
            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
        }
        self.labels = []         # per-file label ('Co'/'Pt'), populated by load_data()
        self.subject_types = []  # per-file type ('Control'/'Patient'), populated by load_data()

    def _download_physionet_data(self, data_dir: str) -> str:
        """
        Download PhysioNet dataset if not already present using concurrent downloads.

        This method uses multi-threaded downloading to significantly speed up the
        download process for the 100+ files in the PhysioNet dataset.

        Args:
            data_dir: Directory to store the dataset

        Returns:
            Path to the downloaded/existing dataset directory
        """
        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")

        # NOTE(review): any non-empty directory is treated as a completed download;
        # a partially downloaded dataset will not be resumed here.
        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
            print(f"PhysioNet dataset already exists at: {dataset_path}")
            return dataset_path

        os.makedirs(dataset_path, exist_ok=True)

        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"

        # Expected file names for the Ga/Ju/Si studies (Co = control, Pt = patient).
        file_patterns = [
            # Control subjects - Ga prefix
            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
            "GaCo22_01.txt", "GaCo22_10.txt",

            # Parkinson's patients - Ga prefix
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],

            # Control subjects - Ju prefix
            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],

            # Parkinson's patients - Ju prefix
            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],

            # Control subjects - Si prefix
            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],

            # Parkinson's patients - Si prefix
            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
        ]

        # Prepare download tasks for concurrent execution.
        download_tasks = [
            {
                'url': base_url + filename,
                'dest_path': os.path.join(dataset_path, filename)
            }
            for filename in file_patterns
        ]

        print(f"Downloading PhysioNet dataset to {dataset_path} using {self.max_workers} threads")

        # Concurrent download helper is provided by BaseDatasetLoader.
        results = self.download_files_concurrent(
            download_tasks,
            show_progress=True,
            desc="Downloading PhysioNet files"
        )

        # Print summary of the download run.
        print(f"\nDownload Summary:")
        print(f"  Total files: {results['total']}")
        print(f"  Successfully downloaded: {results['success']}")
        print(f"  Already existed (skipped): {results['skipped']}")
        print(f"  Failed: {results['failed']}")

        if results['failed'] > 0 and len(results['failed_downloads']) > 0:
            print(f"\nFailed downloads (showing first 10):")
            for failed in results['failed_downloads'][:10]:
                print(f"  - {os.path.basename(failed['dest_path'])}: {failed['error']}")
            if len(results['failed_downloads']) > 10:
                print(f"  ... and {len(results['failed_downloads']) - 10} more failures")

        return dataset_path

    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Load PhysioNet VGRF dataset from the specified directory.

        Downloads the dataset first if needed, then parses each matching
        tab-delimited VGRF file into a DataFrame indexed by time.

        Args:
            data_dir: Directory to store/find the dataset
            **kwargs: Additional arguments (unused for PhysioNet)

        Returns:
            Tuple of (data_list, names_list)
        """
        # Download dataset if needed
        dataset_path = self._download_physionet_data(data_dir)

        physionet_data = []
        physionet_names = []
        self.labels = []
        self.subject_types = []

        # NOTE(review): only the "Ga" study files are loaded here even though the
        # downloader also fetches "Ju" and "Si" files -- confirm this is intended.
        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
            filename = os.path.basename(filepath)

            # Infer the subject group from the file name.
            if 'Co' in filename:
                subject_type = 'Control'
                label = 'Co'
            elif 'Pt' in filename:
                subject_type = 'Patient'
                label = 'Pt'
            else:
                continue  # Skip files that don't match expected pattern

            try:
                # PhysioNet files are tab-delimited with a variable column count:
                # column 0 is time, columns 1-16 are VGRF sensors, extras may follow.
                df = pd.read_csv(filepath, delimiter='\t', header=None)

                # Keep at most 19 columns (time + 16 sensors + any extras).
                n_cols = min(df.shape[1], 19)
                df = df.iloc[:, :n_cols]

                # Name the columns: left sensors 1-8, right sensors 1-8, extras generic.
                col_names = ['time']
                for i in range(1, n_cols):
                    if i <= 8:
                        col_names.append(f'VGRF_L{i}')
                    elif i <= 16:
                        col_names.append(f'VGRF_R{i-8}')
                    else:
                        col_names.append(f'sensor_{i}')

                df.columns = col_names

                # Set time as index
                df = df.set_index('time')

                # Attach subject metadata so downstream steps can recover labels.
                df['subject_type'] = subject_type
                df['label'] = label

                physionet_data.append(df)
                physionet_names.append(filename)
                self.labels.append(label)
                self.subject_types.append(subject_type)

            except Exception as e:
                # Bug fix: report which file failed instead of "(unknown)".
                print(f"Error loading {filename}: {e}")
                continue

        # Store loaded data on the loader instance.
        self.data = physionet_data
        self.names = physionet_names

        print(f"Loaded {len(physionet_data)} PhysioNet files")
        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")

        return physionet_data, physionet_names

    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str],
                               window_size: int = 600, step_size: int = 100) -> List[Dict]:
        """
        Create sliding windows from the PhysioNet dataset.

        Args:
            data: List of DataFrames containing PhysioNet data
            names: List of names corresponding to the data
            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
            step_size: Step size for the sliding window (default: 100)

        Returns:
            List of dictionaries containing sliding windows for each DataFrame
        """
        windows_data = []

        for idx, df in enumerate(data):
            # Only window the sensor channels; metadata columns are excluded.
            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
            df_sensors = df[sensor_columns]

            # Recordings shorter than one window cannot be windowed.
            if df_sensors.empty or len(df_sensors) < window_size:
                continue

            windows = []

            # Create windows for each sensor channel independently.
            for col in sensor_columns:
                try:
                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
                    windows.append({"name": col, "data": window_data})
                except Exception as e:
                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
                    continue

            if windows:
                windows_data.append({
                    "name": names[idx],
                    "windows": windows,
                    "metadata": {
                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
                        "window_size": window_size,
                        "step_size": step_size,
                        "num_windows": len(windows[0]["data"]) if windows else 0
                    }
                })

        return windows_data

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported file formats for PhysioNet dataset.

        Returns:
            List of supported file extensions
        """
        return ['.txt']

    def get_sensor_info(self) -> Dict[str, List[str]]:
        """
        Get information about sensors in the dataset.

        Returns:
            Dictionary containing sensor information
        """
        return {
            'sensors': self.metadata['sensors'],
            'sampling_frequency': self.metadata['sampling_frequency'],
            'window_size': self.metadata['window_size']
        }

    def get_subject_info(self) -> Dict[str, str]:
        """
        Get information about subjects in the dataset.

        Returns:
            Dictionary containing subject information
        """
        return self.metadata['subjects']

    def get_labels(self) -> List[str]:
        """
        Get labels for loaded data.

        Returns:
            List of labels corresponding to loaded data
        """
        return self.labels

    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Filter loaded data by subject type.

        Args:
            subject_type: 'Control' or 'Patient'

        Returns:
            Tuple of (filtered_data, filtered_names)

        Raises:
            ValueError: If load_data() has not been called yet.
        """
        if not self.data:
            raise ValueError("No data loaded. Call load_data() first.")

        filtered_data = []
        filtered_names = []

        for i, df in enumerate(self.data):
            if df['subject_type'].iloc[0] == subject_type:
                filtered_data.append(df)
                filtered_names.append(self.names[i])

        return filtered_data, filtered_names


# Legacy functions for backward compatibility
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Legacy function to load PhysioNet data.

    Args:
        data_dir: Directory containing the dataset

    Returns:
        Tuple of (data_list, names_list)
    """
    loader = PhysioNetLoader()
    return loader.load_data(data_dir)


def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Legacy function to create sliding windows from PhysioNet data.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries
    """
    loader = PhysioNetLoader()
    return loader.create_sliding_windows(data, names, window_size, step_size)
class PhysioNetLoader(BaseDatasetLoader):
    """
    PhysioNet VGRF dataset loader class.

    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's
    disease and healthy controls.

    Features concurrent downloading for efficient data retrieval.
    """

    def __init__(self, max_workers: int = 8):
        """
        Initialize PhysioNet loader with concurrent download support.

        Args:
            max_workers: Maximum number of concurrent download threads (default: 8)
        """
        super().__init__(
            name="physionet",
            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls",
            max_workers=max_workers
        )
        # Static dataset description consumed by get_sensor_info()/get_subject_info().
        self.metadata = {
            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
                        'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
            'sampling_frequency': 100,  # 100 Hz sampling frequency
            'subjects': {
                'Co': 'Control subjects',
                'Pt': 'Parkinson\'s disease patients'
            },
            'window_size': 600,  # 6 seconds at 100 Hz
            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
        }
        self.labels = []         # per-file label ('Co'/'Pt'), populated by load_data()
        self.subject_types = []  # per-file type ('Control'/'Patient'), populated by load_data()

    def _download_physionet_data(self, data_dir: str) -> str:
        """
        Download PhysioNet dataset if not already present using concurrent downloads.

        Args:
            data_dir: Directory to store the dataset

        Returns:
            Path to the downloaded/existing dataset directory
        """
        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")

        # NOTE(review): any non-empty directory is treated as a completed download;
        # a partially downloaded dataset will not be resumed here.
        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
            print(f"PhysioNet dataset already exists at: {dataset_path}")
            return dataset_path

        os.makedirs(dataset_path, exist_ok=True)

        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"

        # Expected file names for the Ga/Ju/Si studies (Co = control, Pt = patient).
        file_patterns = [
            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
            "GaCo22_01.txt", "GaCo22_10.txt",
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],
            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],
            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],
            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],
            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
        ]

        # Prepare download tasks for concurrent execution.
        download_tasks = [
            {
                'url': base_url + filename,
                'dest_path': os.path.join(dataset_path, filename)
            }
            for filename in file_patterns
        ]

        print(f"Downloading PhysioNet dataset to {dataset_path} using {self.max_workers} threads")

        # Concurrent download helper is provided by BaseDatasetLoader.
        results = self.download_files_concurrent(
            download_tasks,
            show_progress=True,
            desc="Downloading PhysioNet files"
        )

        # Print summary of the download run.
        print(f"\nDownload Summary:")
        print(f"  Total files: {results['total']}")
        print(f"  Successfully downloaded: {results['success']}")
        print(f"  Already existed (skipped): {results['skipped']}")
        print(f"  Failed: {results['failed']}")

        if results['failed'] > 0 and len(results['failed_downloads']) > 0:
            print(f"\nFailed downloads (showing first 10):")
            for failed in results['failed_downloads'][:10]:
                print(f"  - {os.path.basename(failed['dest_path'])}: {failed['error']}")
            if len(results['failed_downloads']) > 10:
                print(f"  ... and {len(results['failed_downloads']) - 10} more failures")

        return dataset_path

    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Load PhysioNet VGRF dataset from the specified directory.

        Args:
            data_dir: Directory to store/find the dataset
            **kwargs: Additional arguments (unused for PhysioNet)

        Returns:
            Tuple of (data_list, names_list)
        """
        # Download dataset if needed
        dataset_path = self._download_physionet_data(data_dir)

        physionet_data = []
        physionet_names = []
        self.labels = []
        self.subject_types = []

        # NOTE(review): only the "Ga" study files are loaded even though the
        # downloader also fetches "Ju" and "Si" files -- confirm this is intended.
        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
            filename = os.path.basename(filepath)

            # Infer the subject group from the file name.
            if 'Co' in filename:
                subject_type = 'Control'
                label = 'Co'
            elif 'Pt' in filename:
                subject_type = 'Patient'
                label = 'Pt'
            else:
                continue  # Skip files that don't match expected pattern

            try:
                # PhysioNet files are tab-delimited with a variable column count:
                # column 0 is time, columns 1-16 are VGRF sensors, extras may follow.
                df = pd.read_csv(filepath, delimiter='\t', header=None)

                # Keep at most 19 columns (time + 16 sensors + any extras).
                n_cols = min(df.shape[1], 19)
                df = df.iloc[:, :n_cols]

                # Name the columns: left sensors 1-8, right sensors 1-8, extras generic.
                col_names = ['time']
                for i in range(1, n_cols):
                    if i <= 8:
                        col_names.append(f'VGRF_L{i}')
                    elif i <= 16:
                        col_names.append(f'VGRF_R{i-8}')
                    else:
                        col_names.append(f'sensor_{i}')

                df.columns = col_names
                df = df.set_index('time')

                # Attach subject metadata so downstream steps can recover labels.
                df['subject_type'] = subject_type
                df['label'] = label

                physionet_data.append(df)
                physionet_names.append(filename)
                self.labels.append(label)
                self.subject_types.append(subject_type)

            except Exception as e:
                # Bug fix: report which file failed instead of "(unknown)".
                print(f"Error loading {filename}: {e}")
                continue

        # Store loaded data on the loader instance.
        self.data = physionet_data
        self.names = physionet_names

        print(f"Loaded {len(physionet_data)} PhysioNet files")
        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")

        return physionet_data, physionet_names

    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str],
                               window_size: int = 600, step_size: int = 100) -> List[Dict]:
        """
        Create sliding windows from the PhysioNet dataset.

        Args:
            data: List of DataFrames containing PhysioNet data
            names: List of names corresponding to the data
            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
            step_size: Step size for the sliding window (default: 100)

        Returns:
            List of dictionaries containing sliding windows for each DataFrame
        """
        windows_data = []

        for idx, df in enumerate(data):
            # Only window the sensor channels; metadata columns are excluded.
            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
            df_sensors = df[sensor_columns]

            # Recordings shorter than one window cannot be windowed.
            if df_sensors.empty or len(df_sensors) < window_size:
                continue

            windows = []

            # Create windows for each sensor channel independently.
            for col in sensor_columns:
                try:
                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
                    windows.append({"name": col, "data": window_data})
                except Exception as e:
                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
                    continue

            if windows:
                windows_data.append({
                    "name": names[idx],
                    "windows": windows,
                    "metadata": {
                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
                        "window_size": window_size,
                        "step_size": step_size,
                        "num_windows": len(windows[0]["data"]) if windows else 0
                    }
                })

        return windows_data

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported file formats for PhysioNet dataset.

        Returns:
            List of supported file extensions
        """
        return ['.txt']

    def get_sensor_info(self) -> Dict[str, List[str]]:
        """
        Get information about sensors in the dataset.

        Returns:
            Dictionary containing sensor information
        """
        return {
            'sensors': self.metadata['sensors'],
            'sampling_frequency': self.metadata['sampling_frequency'],
            'window_size': self.metadata['window_size']
        }

    def get_subject_info(self) -> Dict[str, str]:
        """
        Get information about subjects in the dataset.

        Returns:
            Dictionary containing subject information
        """
        return self.metadata['subjects']

    def get_labels(self) -> List[str]:
        """
        Get labels for loaded data.

        Returns:
            List of labels corresponding to loaded data
        """
        return self.labels

    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
        """
        Filter loaded data by subject type.

        Args:
            subject_type: 'Control' or 'Patient'

        Returns:
            Tuple of (filtered_data, filtered_names)

        Raises:
            ValueError: If load_data() has not been called yet.
        """
        if not self.data:
            raise ValueError("No data loaded. Call load_data() first.")

        filtered_data = []
        filtered_names = []

        for i, df in enumerate(self.data):
            if df['subject_type'].iloc[0] == subject_type:
                filtered_data.append(df)
                filtered_names.append(self.names[i])

        return filtered_data, filtered_names
PhysioNet VGRF dataset loader class.
This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset. The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.
Features concurrent downloading for efficient data retrieval.
36 def __init__(self, max_workers: int = 8): 37 """ 38 Initialize PhysioNet loader with concurrent download support. 39 40 Args: 41 max_workers: Maximum number of concurrent download threads (default: 8) 42 """ 43 super().__init__( 44 name="physionet", 45 description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls", 46 max_workers=max_workers 47 ) 48 self.metadata = { 49 'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8', 50 'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'], 51 'sampling_frequency': 100, # 100 Hz sampling frequency 52 'subjects': { 53 'Co': 'Control subjects', 54 'Pt': 'Parkinson\'s disease patients' 55 }, 56 'window_size': 600, # 6 seconds at 100 Hz 57 'url': 'https://physionet.org/files/gaitpdb/1.0.0/' 58 } 59 self.labels = [] 60 self.subject_types = []
Initialize PhysioNet loader with concurrent download support.
Args: max_workers: Maximum number of concurrent download threads (default: 8)
def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Load PhysioNet VGRF dataset from the specified directory.

    Downloads the dataset first if needed, then parses each matching
    tab-delimited VGRF file into a DataFrame indexed by time.

    Args:
        data_dir: Directory to store/find the dataset
        **kwargs: Additional arguments (unused for PhysioNet)

    Returns:
        Tuple of (data_list, names_list)
    """
    # Download dataset if needed
    dataset_path = self._download_physionet_data(data_dir)

    physionet_data = []
    physionet_names = []
    self.labels = []
    self.subject_types = []

    # NOTE(review): only the "Ga" study files are loaded even though the
    # downloader also fetches "Ju" and "Si" files -- confirm this is intended.
    for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
        filename = os.path.basename(filepath)

        # Infer the subject group from the file name.
        if 'Co' in filename:
            subject_type = 'Control'
            label = 'Co'
        elif 'Pt' in filename:
            subject_type = 'Patient'
            label = 'Pt'
        else:
            continue  # Skip files that don't match expected pattern

        try:
            # PhysioNet files are tab-delimited with a variable column count:
            # column 0 is time, columns 1-16 are VGRF sensors, extras may follow.
            df = pd.read_csv(filepath, delimiter='\t', header=None)

            # Keep at most 19 columns (time + 16 sensors + any extras).
            n_cols = min(df.shape[1], 19)
            df = df.iloc[:, :n_cols]

            # Name the columns: left sensors 1-8, right sensors 1-8, extras generic.
            col_names = ['time']
            for i in range(1, n_cols):
                if i <= 8:
                    col_names.append(f'VGRF_L{i}')
                elif i <= 16:
                    col_names.append(f'VGRF_R{i-8}')
                else:
                    col_names.append(f'sensor_{i}')

            df.columns = col_names
            df = df.set_index('time')

            # Attach subject metadata so downstream steps can recover labels.
            df['subject_type'] = subject_type
            df['label'] = label

            physionet_data.append(df)
            physionet_names.append(filename)
            self.labels.append(label)
            self.subject_types.append(subject_type)

        except Exception as e:
            # Bug fix: report which file failed instead of "(unknown)".
            print(f"Error loading {filename}: {e}")
            continue

    # Store loaded data on the loader instance.
    self.data = physionet_data
    self.names = physionet_names

    print(f"Loaded {len(physionet_data)} PhysioNet files")
    print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")

    return physionet_data, physionet_names
Load PhysioNet VGRF dataset from the specified directory.
Args: data_dir: Directory to store/find the dataset **kwargs: Additional arguments (unused for PhysioNet)
Returns: Tuple of (data_list, names_list)
223 def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 224 window_size: int = 600, step_size: int = 100) -> List[Dict]: 225 """ 226 Create sliding windows from the PhysioNet dataset. 227 228 Args: 229 data: List of DataFrames containing PhysioNet data 230 names: List of names corresponding to the data 231 window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz) 232 step_size: Step size for the sliding window (default: 100) 233 234 Returns: 235 List of dictionaries containing sliding windows for each DataFrame 236 """ 237 windows_data = [] 238 239 for idx, df in enumerate(data): 240 # Remove metadata columns for windowing 241 sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')] 242 df_sensors = df[sensor_columns] 243 244 if df_sensors.empty or len(df_sensors) < window_size: 245 continue 246 247 windows = [] 248 249 # Create windows for each sensor 250 for col in sensor_columns: 251 try: 252 window_data = sliding_window(df_sensors[col].values, window_size, step_size) 253 windows.append({"name": col, "data": window_data}) 254 except Exception as e: 255 print(f"Error creating windows for {col} in {names[idx]}: {e}") 256 continue 257 258 if windows: 259 windows_data.append({ 260 "name": names[idx], 261 "windows": windows, 262 "metadata": { 263 "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown', 264 "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown', 265 "window_size": window_size, 266 "step_size": step_size, 267 "num_windows": len(windows[0]["data"]) if windows else 0 268 } 269 }) 270 271 return windows_data
Create sliding windows from the PhysioNet dataset.
Args: data: List of DataFrames containing PhysioNet data names: List of names corresponding to the data window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz) step_size: Step size for the sliding window (default: 100)
Returns: List of dictionaries containing sliding windows for each DataFrame
273 def get_supported_formats(self) -> List[str]: 274 """ 275 Get list of supported file formats for PhysioNet dataset. 276 277 Returns: 278 List of supported file extensions 279 """ 280 return ['.txt']
Get list of supported file formats for PhysioNet dataset.
Returns: List of supported file extensions
282 def get_sensor_info(self) -> Dict[str, List[str]]: 283 """ 284 Get information about sensors in the dataset. 285 286 Returns: 287 Dictionary containing sensor information 288 """ 289 return { 290 'sensors': self.metadata['sensors'], 291 'sampling_frequency': self.metadata['sampling_frequency'], 292 'window_size': self.metadata['window_size'] 293 }
Get information about sensors in the dataset.
Returns: Dictionary containing sensor information
295 def get_subject_info(self) -> Dict[str, str]: 296 """ 297 Get information about subjects in the dataset. 298 299 Returns: 300 Dictionary containing subject information 301 """ 302 return self.metadata['subjects']
Get information about subjects in the dataset.
Returns: Dictionary containing subject information
304 def get_labels(self) -> List[str]: 305 """ 306 Get labels for loaded data. 307 308 Returns: 309 List of labels corresponding to loaded data 310 """ 311 return self.labels
Get labels for loaded data.
Returns: List of labels corresponding to loaded data
313 def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]: 314 """ 315 Filter loaded data by subject type. 316 317 Args: 318 subject_type: 'Control' or 'Patient' 319 320 Returns: 321 Tuple of (filtered_data, filtered_names) 322 """ 323 if not self.data: 324 raise ValueError("No data loaded. Call load_data() first.") 325 326 filtered_data = [] 327 filtered_names = [] 328 329 for i, df in enumerate(self.data): 330 if df['subject_type'].iloc[0] == subject_type: 331 filtered_data.append(df) 332 filtered_names.append(self.names[i]) 333 334 return filtered_data, filtered_names
Filter loaded data by subject type.
Args: subject_type: 'Control' or 'Patient'
Returns: Tuple of (filtered_data, filtered_names)
338def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]: 339 """ 340 Legacy function to load PhysioNet data. 341 342 Args: 343 data_dir: Directory containing the dataset 344 345 Returns: 346 Tuple of (data_list, names_list) 347 """ 348 loader = PhysioNetLoader() 349 return loader.load_data(data_dir)
Legacy function to load PhysioNet data.
Args: data_dir: Directory containing the dataset
Returns: Tuple of (data_list, names_list)
352def create_physionet_windows(data: List[pd.DataFrame], names: List[str], 353 window_size: int = 600, step_size: int = 100) -> List[Dict]: 354 """ 355 Legacy function to create sliding windows from PhysioNet data. 356 357 Args: 358 data: List of DataFrames 359 names: List of names 360 window_size: Size of sliding window 361 step_size: Step size for sliding window 362 363 Returns: 364 List of sliding window dictionaries 365 """ 366 loader = PhysioNetLoader() 367 return loader.create_sliding_windows(data, names, window_size, step_size)
Legacy function to create sliding windows from PhysioNet data.
Args: data: List of DataFrames names: List of names window_size: Size of sliding window step_size: Step size for sliding window
Returns: List of sliding window dictionaries