gaitsetpy.dataset.physionet

PhysioNet VGRF Dataset Loader. Maintainer: @aharshit123456

This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader. The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.

Dataset source: https://physionet.org/content/gaitpdb/1.0.0/

  1'''
  2PhysioNet VGRF Dataset Loader.
  3Maintainer: @aharshit123456
  4
  5This file contains the PhysioNet VGRF dataset loader class that inherits from BaseDatasetLoader.
  6The PhysioNet dataset contains vertical ground reaction force (VGRF) data from subjects with 
  7Parkinson's disease and healthy controls.
  8
  9Dataset source: https://physionet.org/content/gaitpdb/1.0.0/
 10'''
 11
 12import os
 13import pandas as pd
 14import numpy as np
 15from typing import List, Dict, Tuple, Optional
 16from glob import glob
 17import requests
 18from tqdm import tqdm
 19import zipfile
 20from ..core.base_classes import BaseDatasetLoader
 21from .utils import sliding_window
 22
 23
 24class PhysioNetLoader(BaseDatasetLoader):
 25    """
 26    PhysioNet VGRF dataset loader class.
 27    
 28    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
 29    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's 
 30    disease and healthy controls.
 31    
 32    Features concurrent downloading for efficient data retrieval.
 33    """
 34    
 35    def __init__(self, max_workers: int = 8):
 36        """
 37        Initialize PhysioNet loader with concurrent download support.
 38        
 39        Args:
 40            max_workers: Maximum number of concurrent download threads (default: 8)
 41        """
 42        super().__init__(
 43            name="physionet",
 44            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls",
 45            max_workers=max_workers
 46        )
 47        self.metadata = {
 48            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
 49                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
 50            'sampling_frequency': 100,  # 100 Hz sampling frequency
 51            'subjects': {
 52                'Co': 'Control subjects',
 53                'Pt': 'Parkinson\'s disease patients'
 54            },
 55            'window_size': 600,  # 6 seconds at 100 Hz
 56            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
 57        }
 58        self.labels = []
 59        self.subject_types = []
 60    
 61    def _download_physionet_data(self, data_dir: str) -> str:
 62        """
 63        Download PhysioNet dataset if not already present using concurrent downloads.
 64        
 65        This method uses multi-threaded downloading to significantly speed up the
 66        download process for the 100+ files in the PhysioNet dataset.
 67        
 68        Args:
 69            data_dir: Directory to store the dataset
 70            
 71        Returns:
 72            Path to the downloaded/existing dataset directory
 73        """
 74        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")
 75        
 76        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
 77            print(f"PhysioNet dataset already exists at: {dataset_path}")
 78            return dataset_path
 79        
 80        os.makedirs(dataset_path, exist_ok=True)
 81        
 82        # Download the dataset files
 83        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"
 84        
 85        # Get list of files (basic file names based on the reference)
 86        file_patterns = [
 87            # Control subjects - Ga prefix
 88            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
 89            "GaCo22_01.txt", "GaCo22_10.txt",
 90            
 91            # Parkinson's patients - Ga prefix
 92            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
 93            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
 94            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],
 95            
 96            # Control subjects - Ju prefix
 97            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],
 98            
 99            # Parkinson's patients - Ju prefix
100            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],
101            
102            # Control subjects - Si prefix
103            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],
104            
105            # Parkinson's patients - Si prefix
106            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
107        ]
108        
109        # Prepare download tasks for concurrent execution
110        download_tasks = [
111            {
112                'url': base_url + filename,
113                'dest_path': os.path.join(dataset_path, filename)
114            }
115            for filename in file_patterns
116        ]
117        
118        print(f"Downloading PhysioNet dataset to {dataset_path} using {self.max_workers} threads")
119        
120        # Use concurrent downloading from base class
121        results = self.download_files_concurrent(
122            download_tasks, 
123            show_progress=True, 
124            desc="Downloading PhysioNet files"
125        )
126        
127        # Print summary
128        print(f"\nDownload Summary:")
129        print(f"  Total files: {results['total']}")
130        print(f"  Successfully downloaded: {results['success']}")
131        print(f"  Already existed (skipped): {results['skipped']}")
132        print(f"  Failed: {results['failed']}")
133        
134        if results['failed'] > 0 and len(results['failed_downloads']) > 0:
135            print(f"\nFailed downloads (showing first 10):")
136            for failed in results['failed_downloads'][:10]:
137                print(f"  - {os.path.basename(failed['dest_path'])}: {failed['error']}")
138            if len(results['failed_downloads']) > 10:
139                print(f"  ... and {len(results['failed_downloads']) - 10} more failures")
140        
141        return dataset_path
142    
143    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
144        """
145        Load PhysioNet VGRF dataset from the specified directory.
146        
147        Args:
148            data_dir: Directory to store/find the dataset
149            **kwargs: Additional arguments (unused for PhysioNet)
150            
151        Returns:
152            Tuple of (data_list, names_list)
153        """
154        # Download dataset if needed
155        dataset_path = self._download_physionet_data(data_dir)
156        
157        physionet_data = []
158        physionet_names = []
159        self.labels = []
160        self.subject_types = []
161        
162        # Load all available files
163        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
164            filename = os.path.basename(filepath)
165            
166            # Extract subject type from filename
167            if 'Co' in filename:
168                subject_type = 'Control'
169                label = 'Co'
170            elif 'Pt' in filename:
171                subject_type = 'Patient'
172                label = 'Pt'
173            else:
174                continue  # Skip files that don't match expected pattern
175            
176            try:
177                # Read the file - PhysioNet files are tab-delimited with variable columns
178                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
179                df = pd.read_csv(filepath, delimiter='\t', header=None)
180                
181                # Handle variable number of columns
182                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
183                df = df.iloc[:, :n_cols]
184                
185                # Create column names
186                col_names = ['time']
187                for i in range(1, n_cols):
188                    if i <= 8:
189                        col_names.append(f'VGRF_L{i}')
190                    elif i <= 16:
191                        col_names.append(f'VGRF_R{i-8}')
192                    else:
193                        col_names.append(f'sensor_{i}')
194                
195                df.columns = col_names
196                
197                # Set time as index
198                df = df.set_index('time')
199                
200                # Add subject metadata
201                df['subject_type'] = subject_type
202                df['label'] = label
203                
204                physionet_data.append(df)
205                physionet_names.append(filename)
206                self.labels.append(label)
207                self.subject_types.append(subject_type)
208                
209            except Exception as e:
210                print(f"Error loading {filename}: {e}")
211                continue
212        
213        # Store loaded data
214        self.data = physionet_data
215        self.names = physionet_names
216        
217        print(f"Loaded {len(physionet_data)} PhysioNet files")
218        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
219        
220        return physionet_data, physionet_names
221    
222    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
223                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
224        """
225        Create sliding windows from the PhysioNet dataset.
226        
227        Args:
228            data: List of DataFrames containing PhysioNet data
229            names: List of names corresponding to the data
230            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
231            step_size: Step size for the sliding window (default: 100)
232            
233        Returns:
234            List of dictionaries containing sliding windows for each DataFrame
235        """
236        windows_data = []
237        
238        for idx, df in enumerate(data):
239            # Remove metadata columns for windowing
240            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
241            df_sensors = df[sensor_columns]
242            
243            if df_sensors.empty or len(df_sensors) < window_size:
244                continue
245                
246            windows = []
247            
248            # Create windows for each sensor
249            for col in sensor_columns:
250                try:
251                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
252                    windows.append({"name": col, "data": window_data})
253                except Exception as e:
254                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
255                    continue
256            
257            if windows:
258                windows_data.append({
259                    "name": names[idx],
260                    "windows": windows,
261                    "metadata": {
262                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
263                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
264                        "window_size": window_size,
265                        "step_size": step_size,
266                        "num_windows": len(windows[0]["data"]) if windows else 0
267                    }
268                })
269        
270        return windows_data
271    
272    def get_supported_formats(self) -> List[str]:
273        """
274        Get list of supported file formats for PhysioNet dataset.
275        
276        Returns:
277            List of supported file extensions
278        """
279        return ['.txt']
280    
281    def get_sensor_info(self) -> Dict[str, List[str]]:
282        """
283        Get information about sensors in the dataset.
284        
285        Returns:
286            Dictionary containing sensor information
287        """
288        return {
289            'sensors': self.metadata['sensors'],
290            'sampling_frequency': self.metadata['sampling_frequency'],
291            'window_size': self.metadata['window_size']
292        }
293    
294    def get_subject_info(self) -> Dict[str, str]:
295        """
296        Get information about subjects in the dataset.
297        
298        Returns:
299            Dictionary containing subject information
300        """
301        return self.metadata['subjects']
302    
303    def get_labels(self) -> List[str]:
304        """
305        Get labels for loaded data.
306        
307        Returns:
308            List of labels corresponding to loaded data
309        """
310        return self.labels
311    
312    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
313        """
314        Filter loaded data by subject type.
315        
316        Args:
317            subject_type: 'Control' or 'Patient'
318            
319        Returns:
320            Tuple of (filtered_data, filtered_names)
321        """
322        if not self.data:
323            raise ValueError("No data loaded. Call load_data() first.")
324        
325        filtered_data = []
326        filtered_names = []
327        
328        for i, df in enumerate(self.data):
329            if df['subject_type'].iloc[0] == subject_type:
330                filtered_data.append(df)
331                filtered_names.append(self.names[i])
332        
333        return filtered_data, filtered_names
334
335
336# Legacy function for backward compatibility
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Backward-compatible wrapper around ``PhysioNetLoader.load_data``.

    Args:
        data_dir: Directory containing the dataset

    Returns:
        Tuple of (data_list, names_list)
    """
    # A fresh loader with default settings is created for every call.
    return PhysioNetLoader().load_data(data_dir)
349
350
def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Backward-compatible wrapper around ``PhysioNetLoader.create_sliding_windows``.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries
    """
    # A fresh loader with default settings is created for every call.
    windowing_loader = PhysioNetLoader()
    return windowing_loader.create_sliding_windows(
        data,
        names,
        window_size,
        step_size,
    )
class PhysioNetLoader(gaitsetpy.core.base_classes.BaseDatasetLoader):
 25class PhysioNetLoader(BaseDatasetLoader):
 26    """
 27    PhysioNet VGRF dataset loader class.
 28    
 29    This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset.
 30    The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's 
 31    disease and healthy controls.
 32    
 33    Features concurrent downloading for efficient data retrieval.
 34    """
 35    
 36    def __init__(self, max_workers: int = 8):
 37        """
 38        Initialize PhysioNet loader with concurrent download support.
 39        
 40        Args:
 41            max_workers: Maximum number of concurrent download threads (default: 8)
 42        """
 43        super().__init__(
 44            name="physionet",
 45            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls",
 46            max_workers=max_workers
 47        )
 48        self.metadata = {
 49            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
 50                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
 51            'sampling_frequency': 100,  # 100 Hz sampling frequency
 52            'subjects': {
 53                'Co': 'Control subjects',
 54                'Pt': 'Parkinson\'s disease patients'
 55            },
 56            'window_size': 600,  # 6 seconds at 100 Hz
 57            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
 58        }
 59        self.labels = []
 60        self.subject_types = []
 61    
 62    def _download_physionet_data(self, data_dir: str) -> str:
 63        """
 64        Download PhysioNet dataset if not already present using concurrent downloads.
 65        
 66        This method uses multi-threaded downloading to significantly speed up the
 67        download process for the 100+ files in the PhysioNet dataset.
 68        
 69        Args:
 70            data_dir: Directory to store the dataset
 71            
 72        Returns:
 73            Path to the downloaded/existing dataset directory
 74        """
 75        dataset_path = os.path.join(data_dir, "physionet_gaitpdb")
 76        
 77        if os.path.exists(dataset_path) and len(os.listdir(dataset_path)) > 0:
 78            print(f"PhysioNet dataset already exists at: {dataset_path}")
 79            return dataset_path
 80        
 81        os.makedirs(dataset_path, exist_ok=True)
 82        
 83        # Download the dataset files
 84        base_url = "https://physionet.org/files/gaitpdb/1.0.0/"
 85        
 86        # Get list of files (basic file names based on the reference)
 87        file_patterns = [
 88            # Control subjects - Ga prefix
 89            *[f"GaCo{i:02d}_{j:02d}.txt" for i in range(1, 18) for j in range(1, 3)],
 90            "GaCo22_01.txt", "GaCo22_10.txt",
 91            
 92            # Parkinson's patients - Ga prefix
 93            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(3, 10) for j in range(1, 3)],
 94            *[f"GaPt{i:02d}_{j:02d}.txt" for i in range(12, 34) for j in range(1, 3)],
 95            *[f"GaPt{i:02d}_10.txt" for i in range(13, 34)],
 96            
 97            # Control subjects - Ju prefix
 98            *[f"JuCo{i:02d}_01.txt" for i in range(1, 27)],
 99            
100            # Parkinson's patients - Ju prefix
101            *[f"JuPt{i:02d}_{j:02d}.txt" for i in range(1, 30) for j in range(1, 8)],
102            
103            # Control subjects - Si prefix
104            *[f"SiCo{i:02d}_01.txt" for i in range(1, 31)],
105            
106            # Parkinson's patients - Si prefix
107            *[f"SiPt{i:02d}_01.txt" for i in range(2, 41)]
108        ]
109        
110        # Prepare download tasks for concurrent execution
111        download_tasks = [
112            {
113                'url': base_url + filename,
114                'dest_path': os.path.join(dataset_path, filename)
115            }
116            for filename in file_patterns
117        ]
118        
119        print(f"Downloading PhysioNet dataset to {dataset_path} using {self.max_workers} threads")
120        
121        # Use concurrent downloading from base class
122        results = self.download_files_concurrent(
123            download_tasks, 
124            show_progress=True, 
125            desc="Downloading PhysioNet files"
126        )
127        
128        # Print summary
129        print(f"\nDownload Summary:")
130        print(f"  Total files: {results['total']}")
131        print(f"  Successfully downloaded: {results['success']}")
132        print(f"  Already existed (skipped): {results['skipped']}")
133        print(f"  Failed: {results['failed']}")
134        
135        if results['failed'] > 0 and len(results['failed_downloads']) > 0:
136            print(f"\nFailed downloads (showing first 10):")
137            for failed in results['failed_downloads'][:10]:
138                print(f"  - {os.path.basename(failed['dest_path'])}: {failed['error']}")
139            if len(results['failed_downloads']) > 10:
140                print(f"  ... and {len(results['failed_downloads']) - 10} more failures")
141        
142        return dataset_path
143    
144    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
145        """
146        Load PhysioNet VGRF dataset from the specified directory.
147        
148        Args:
149            data_dir: Directory to store/find the dataset
150            **kwargs: Additional arguments (unused for PhysioNet)
151            
152        Returns:
153            Tuple of (data_list, names_list)
154        """
155        # Download dataset if needed
156        dataset_path = self._download_physionet_data(data_dir)
157        
158        physionet_data = []
159        physionet_names = []
160        self.labels = []
161        self.subject_types = []
162        
163        # Load all available files
164        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
165            filename = os.path.basename(filepath)
166            
167            # Extract subject type from filename
168            if 'Co' in filename:
169                subject_type = 'Control'
170                label = 'Co'
171            elif 'Pt' in filename:
172                subject_type = 'Patient'
173                label = 'Pt'
174            else:
175                continue  # Skip files that don't match expected pattern
176            
177            try:
178                # Read the file - PhysioNet files are tab-delimited with variable columns
179                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
180                df = pd.read_csv(filepath, delimiter='\t', header=None)
181                
182                # Handle variable number of columns
183                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
184                df = df.iloc[:, :n_cols]
185                
186                # Create column names
187                col_names = ['time']
188                for i in range(1, n_cols):
189                    if i <= 8:
190                        col_names.append(f'VGRF_L{i}')
191                    elif i <= 16:
192                        col_names.append(f'VGRF_R{i-8}')
193                    else:
194                        col_names.append(f'sensor_{i}')
195                
196                df.columns = col_names
197                
198                # Set time as index
199                df = df.set_index('time')
200                
201                # Add subject metadata
202                df['subject_type'] = subject_type
203                df['label'] = label
204                
205                physionet_data.append(df)
206                physionet_names.append(filename)
207                self.labels.append(label)
208                self.subject_types.append(subject_type)
209                
210            except Exception as e:
211                print(f"Error loading {filename}: {e}")
212                continue
213        
214        # Store loaded data
215        self.data = physionet_data
216        self.names = physionet_names
217        
218        print(f"Loaded {len(physionet_data)} PhysioNet files")
219        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
220        
221        return physionet_data, physionet_names
222    
223    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
224                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
225        """
226        Create sliding windows from the PhysioNet dataset.
227        
228        Args:
229            data: List of DataFrames containing PhysioNet data
230            names: List of names corresponding to the data
231            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
232            step_size: Step size for the sliding window (default: 100)
233            
234        Returns:
235            List of dictionaries containing sliding windows for each DataFrame
236        """
237        windows_data = []
238        
239        for idx, df in enumerate(data):
240            # Remove metadata columns for windowing
241            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
242            df_sensors = df[sensor_columns]
243            
244            if df_sensors.empty or len(df_sensors) < window_size:
245                continue
246                
247            windows = []
248            
249            # Create windows for each sensor
250            for col in sensor_columns:
251                try:
252                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
253                    windows.append({"name": col, "data": window_data})
254                except Exception as e:
255                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
256                    continue
257            
258            if windows:
259                windows_data.append({
260                    "name": names[idx],
261                    "windows": windows,
262                    "metadata": {
263                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
264                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
265                        "window_size": window_size,
266                        "step_size": step_size,
267                        "num_windows": len(windows[0]["data"]) if windows else 0
268                    }
269                })
270        
271        return windows_data
272    
273    def get_supported_formats(self) -> List[str]:
274        """
275        Get list of supported file formats for PhysioNet dataset.
276        
277        Returns:
278            List of supported file extensions
279        """
280        return ['.txt']
281    
282    def get_sensor_info(self) -> Dict[str, List[str]]:
283        """
284        Get information about sensors in the dataset.
285        
286        Returns:
287            Dictionary containing sensor information
288        """
289        return {
290            'sensors': self.metadata['sensors'],
291            'sampling_frequency': self.metadata['sampling_frequency'],
292            'window_size': self.metadata['window_size']
293        }
294    
295    def get_subject_info(self) -> Dict[str, str]:
296        """
297        Get information about subjects in the dataset.
298        
299        Returns:
300            Dictionary containing subject information
301        """
302        return self.metadata['subjects']
303    
304    def get_labels(self) -> List[str]:
305        """
306        Get labels for loaded data.
307        
308        Returns:
309            List of labels corresponding to loaded data
310        """
311        return self.labels
312    
313    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
314        """
315        Filter loaded data by subject type.
316        
317        Args:
318            subject_type: 'Control' or 'Patient'
319            
320        Returns:
321            Tuple of (filtered_data, filtered_names)
322        """
323        if not self.data:
324            raise ValueError("No data loaded. Call load_data() first.")
325        
326        filtered_data = []
327        filtered_names = []
328        
329        for i, df in enumerate(self.data):
330            if df['subject_type'].iloc[0] == subject_type:
331                filtered_data.append(df)
332                filtered_names.append(self.names[i])
333        
334        return filtered_data, filtered_names

PhysioNet VGRF dataset loader class.

This class handles loading and processing of the PhysioNet Gait in Parkinson's Disease dataset. The dataset contains vertical ground reaction force (VGRF) data from subjects with Parkinson's disease and healthy controls.

Features concurrent downloading for efficient data retrieval.

PhysioNetLoader(max_workers: int = 8)
36    def __init__(self, max_workers: int = 8):
37        """
38        Initialize PhysioNet loader with concurrent download support.
39        
40        Args:
41            max_workers: Maximum number of concurrent download threads (default: 8)
42        """
43        super().__init__(
44            name="physionet",
45            description="PhysioNet Gait in Parkinson's Disease Dataset - Contains VGRF data from subjects with Parkinson's disease and healthy controls",
46            max_workers=max_workers
47        )
48        self.metadata = {
49            'sensors': ['VGRF_L1', 'VGRF_L2', 'VGRF_L3', 'VGRF_L4', 'VGRF_L5', 'VGRF_L6', 'VGRF_L7', 'VGRF_L8',
50                       'VGRF_R1', 'VGRF_R2', 'VGRF_R3', 'VGRF_R4', 'VGRF_R5', 'VGRF_R6', 'VGRF_R7', 'VGRF_R8'],
51            'sampling_frequency': 100,  # 100 Hz sampling frequency
52            'subjects': {
53                'Co': 'Control subjects',
54                'Pt': 'Parkinson\'s disease patients'
55            },
56            'window_size': 600,  # 6 seconds at 100 Hz
57            'url': 'https://physionet.org/files/gaitpdb/1.0.0/'
58        }
59        self.labels = []
60        self.subject_types = []

Initialize PhysioNet loader with concurrent download support.

Args:

- `max_workers`: Maximum number of concurrent download threads (default: 8)

metadata
labels
subject_types
def load_data( self, data_dir: str, **kwargs) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
144    def load_data(self, data_dir: str, **kwargs) -> Tuple[List[pd.DataFrame], List[str]]:
145        """
146        Load PhysioNet VGRF dataset from the specified directory.
147        
148        Args:
149            data_dir: Directory to store/find the dataset
150            **kwargs: Additional arguments (unused for PhysioNet)
151            
152        Returns:
153            Tuple of (data_list, names_list)
154        """
155        # Download dataset if needed
156        dataset_path = self._download_physionet_data(data_dir)
157        
158        physionet_data = []
159        physionet_names = []
160        self.labels = []
161        self.subject_types = []
162        
163        # Load all available files
164        for filepath in sorted(glob(os.path.join(dataset_path, "Ga*.txt"))):
165            filename = os.path.basename(filepath)
166            
167            # Extract subject type from filename
168            if 'Co' in filename:
169                subject_type = 'Control'
170                label = 'Co'
171            elif 'Pt' in filename:
172                subject_type = 'Patient'
173                label = 'Pt'
174            else:
175                continue  # Skip files that don't match expected pattern
176            
177            try:
178                # Read the file - PhysioNet files are tab-delimited with variable columns
179                # Column 0: time, Columns 1-16: VGRF sensors, additional columns may exist
180                df = pd.read_csv(filepath, delimiter='\t', header=None)
181                
182                # Handle variable number of columns
183                n_cols = min(df.shape[1], 19)  # Limit to 19 columns max
184                df = df.iloc[:, :n_cols]
185                
186                # Create column names
187                col_names = ['time']
188                for i in range(1, n_cols):
189                    if i <= 8:
190                        col_names.append(f'VGRF_L{i}')
191                    elif i <= 16:
192                        col_names.append(f'VGRF_R{i-8}')
193                    else:
194                        col_names.append(f'sensor_{i}')
195                
196                df.columns = col_names
197                
198                # Set time as index
199                df = df.set_index('time')
200                
201                # Add subject metadata
202                df['subject_type'] = subject_type
203                df['label'] = label
204                
205                physionet_data.append(df)
206                physionet_names.append(filename)
207                self.labels.append(label)
208                self.subject_types.append(subject_type)
209                
210            except Exception as e:
211                print(f"Error loading {filename}: {e}")
212                continue
213        
214        # Store loaded data
215        self.data = physionet_data
216        self.names = physionet_names
217        
218        print(f"Loaded {len(physionet_data)} PhysioNet files")
219        print(f"Subject distribution: {dict(zip(*np.unique(self.subject_types, return_counts=True)))}")
220        
221        return physionet_data, physionet_names

Load PhysioNet VGRF dataset from the specified directory.

Args:

- `data_dir`: Directory to store/find the dataset
- `**kwargs`: Additional arguments (unused for PhysioNet)

Returns: Tuple of (data_list, names_list)

def create_sliding_windows( self, data: List[pandas.core.frame.DataFrame], names: List[str], window_size: int = 600, step_size: int = 100) -> List[Dict]:
223    def create_sliding_windows(self, data: List[pd.DataFrame], names: List[str], 
224                             window_size: int = 600, step_size: int = 100) -> List[Dict]:
225        """
226        Create sliding windows from the PhysioNet dataset.
227        
228        Args:
229            data: List of DataFrames containing PhysioNet data
230            names: List of names corresponding to the data
231            window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz)
232            step_size: Step size for the sliding window (default: 100)
233            
234        Returns:
235            List of dictionaries containing sliding windows for each DataFrame
236        """
237        windows_data = []
238        
239        for idx, df in enumerate(data):
240            # Remove metadata columns for windowing
241            sensor_columns = [col for col in df.columns if col.startswith('VGRF_') or col.startswith('sensor_')]
242            df_sensors = df[sensor_columns]
243            
244            if df_sensors.empty or len(df_sensors) < window_size:
245                continue
246                
247            windows = []
248            
249            # Create windows for each sensor
250            for col in sensor_columns:
251                try:
252                    window_data = sliding_window(df_sensors[col].values, window_size, step_size)
253                    windows.append({"name": col, "data": window_data})
254                except Exception as e:
255                    print(f"Error creating windows for {col} in {names[idx]}: {e}")
256                    continue
257            
258            if windows:
259                windows_data.append({
260                    "name": names[idx],
261                    "windows": windows,
262                    "metadata": {
263                        "subject_type": df['subject_type'].iloc[0] if 'subject_type' in df.columns else 'Unknown',
264                        "label": df['label'].iloc[0] if 'label' in df.columns else 'Unknown',
265                        "window_size": window_size,
266                        "step_size": step_size,
267                        "num_windows": len(windows[0]["data"]) if windows else 0
268                    }
269                })
270        
271        return windows_data

Create sliding windows from the PhysioNet dataset.

Args: data: List of DataFrames containing PhysioNet data; names: List of names corresponding to the data; window_size: Size of the sliding window (default: 600 for 6 seconds at 100Hz); step_size: Step size for the sliding window (default: 100)

Returns: List of dictionaries containing sliding windows for each DataFrame

def get_supported_formats(self) -> List[str]:
273    def get_supported_formats(self) -> List[str]:
274        """
275        Get list of supported file formats for PhysioNet dataset.
276        
277        Returns:
278            List of supported file extensions
279        """
280        return ['.txt']

Get list of supported file formats for PhysioNet dataset.

Returns: List of supported file extensions

def get_sensor_info(self) -> Dict[str, List[str]]:
282    def get_sensor_info(self) -> Dict[str, List[str]]:
283        """
284        Get information about sensors in the dataset.
285        
286        Returns:
287            Dictionary containing sensor information
288        """
289        return {
290            'sensors': self.metadata['sensors'],
291            'sampling_frequency': self.metadata['sampling_frequency'],
292            'window_size': self.metadata['window_size']
293        }

Get information about sensors in the dataset.

Returns: Dictionary containing sensor information

def get_subject_info(self) -> Dict[str, str]:
295    def get_subject_info(self) -> Dict[str, str]:
296        """
297        Get information about subjects in the dataset.
298        
299        Returns:
300            Dictionary containing subject information
301        """
302        return self.metadata['subjects']

Get information about subjects in the dataset.

Returns: Dictionary containing subject information

def get_labels(self) -> List[str]:
304    def get_labels(self) -> List[str]:
305        """
306        Get labels for loaded data.
307        
308        Returns:
309            List of labels corresponding to loaded data
310        """
311        return self.labels

Get labels for loaded data.

Returns: List of labels corresponding to loaded data

def filter_by_subject_type( self, subject_type: str) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
313    def filter_by_subject_type(self, subject_type: str) -> Tuple[List[pd.DataFrame], List[str]]:
314        """
315        Filter loaded data by subject type.
316        
317        Args:
318            subject_type: 'Control' or 'Patient'
319            
320        Returns:
321            Tuple of (filtered_data, filtered_names)
322        """
323        if not self.data:
324            raise ValueError("No data loaded. Call load_data() first.")
325        
326        filtered_data = []
327        filtered_names = []
328        
329        for i, df in enumerate(self.data):
330            if df['subject_type'].iloc[0] == subject_type:
331                filtered_data.append(df)
332                filtered_names.append(self.names[i])
333        
334        return filtered_data, filtered_names

Filter loaded data by subject type.

Args: subject_type: 'Control' or 'Patient'

Returns: Tuple of (filtered_data, filtered_names)

def load_physionet_data(data_dir: str) -> Tuple[List[pandas.core.frame.DataFrame], List[str]]:
def load_physionet_data(data_dir: str) -> Tuple[List[pd.DataFrame], List[str]]:
    """
    Legacy function-style entry point for loading PhysioNet data.

    Builds a PhysioNetLoader with default settings and delegates to its
    ``load_data`` method.

    Args:
        data_dir: Directory containing the dataset

    Returns:
        Tuple of (data_list, names_list)
    """
    return PhysioNetLoader().load_data(data_dir)

Legacy function to load PhysioNet data.

Args: data_dir: Directory containing the dataset

Returns: Tuple of (data_list, names_list)

def create_physionet_windows( data: List[pandas.core.frame.DataFrame], names: List[str], window_size: int = 600, step_size: int = 100) -> List[Dict]:
def create_physionet_windows(data: List[pd.DataFrame], names: List[str],
                           window_size: int = 600, step_size: int = 100) -> List[Dict]:
    """
    Legacy function-style entry point for windowing PhysioNet data.

    Builds a PhysioNetLoader with default settings and delegates to its
    ``create_sliding_windows`` method.

    Args:
        data: List of DataFrames
        names: List of names
        window_size: Size of sliding window
        step_size: Step size for sliding window

    Returns:
        List of sliding window dictionaries
    """
    windowing_loader = PhysioNetLoader()
    return windowing_loader.create_sliding_windows(
        data,
        names,
        window_size=window_size,
        step_size=step_size,
    )

Legacy function to create sliding windows from PhysioNet data.

Args: data: List of DataFrames; names: List of names; window_size: Size of sliding window; step_size: Step size for sliding window

Returns: List of sliding window dictionaries