Coverage for src/cc_liquid/data_loader.py: 79%

58 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-10-13 20:16 +0000

1"""Data loading and abstraction for cc-liquid.""" 

2 

3from abc import ABC, abstractmethod 

4 

5import polars as pl 

6 

7 

class DataSource(ABC):
    """Common interface implemented by every prediction data source.

    Concrete subclasses provide ``load``, which hands back the data as a
    Polars DataFrame.
    """

    @abstractmethod
    def load(self) -> pl.DataFrame:
        """Return the source's data as a Polars DataFrame.

        Subclasses must override this method; the base implementation has
        no behaviour of its own and returns ``None`` if invoked via
        ``super()``.
        """

15 

16 

class FileDataSource(DataSource):
    """Loads prediction data from a Parquet or CSV file.

    The column-name arguments are stored on the instance for callers to
    read; ``load`` does not use them and returns the file contents as-is.
    """

    def __init__(
        self,
        path: str,
        date_column: str,
        asset_id_column: str,
        prediction_column: str,
    ):
        """Remember the file location and the relevant column names.

        Args:
            path: Location of the ``.parquet`` or ``.csv`` file to read.
            date_column: Name of the date column in the file.
            asset_id_column: Name of the asset identifier column.
            prediction_column: Name of the prediction column.
        """
        self.path = path
        self.date_column = date_column
        self.asset_id_column = asset_id_column
        self.prediction_column = prediction_column

    def load(self) -> pl.DataFrame:
        """Read the file into a Polars DataFrame.

        The reader is chosen from the file extension, compared
        case-insensitively so that names such as ``DATA.PARQUET`` or
        ``data.Csv`` are accepted.

        Returns:
            The DataFrame produced by ``pl.read_parquet`` or ``pl.read_csv``.

        Raises:
            ValueError: If the path ends in neither ``.parquet`` nor ``.csv``.
        """
        # Normalise only the string used for the extension test; the
        # original ``self.path`` value is what gets handed to Polars.
        # ``str()`` also lets path-like objects through the check.
        normalized = str(self.path).lower()

        if normalized.endswith(".parquet"):
            return pl.read_parquet(self.path)
        if normalized.endswith(".csv"):
            return pl.read_csv(self.path)

        raise ValueError("Unsupported file type. Use .parquet or .csv.")

42 

43 

class DataFrameDataSource(DataSource):
    """Data source backed by a Polars DataFrame that already exists in memory."""

    def __init__(
        self,
        df: pl.DataFrame,
        date_column: str,
        asset_id_column: str,
        prediction_column: str,
    ):
        """Keep a reference to the frame together with its column names.

        Args:
            df: The DataFrame to serve from ``load``.
            date_column: Name of the date column in ``df``.
            asset_id_column: Name of the asset identifier column in ``df``.
            prediction_column: Name of the prediction column in ``df``.
        """
        self.df = df
        self.date_column = date_column
        self.asset_id_column = asset_id_column
        self.prediction_column = prediction_column

    def load(self) -> pl.DataFrame:
        """Hand back the stored DataFrame (the same object, not a copy)."""
        frame = self.df
        return frame

62 

63 

class DataLoader:
    """Convenience entry points that build a data source and load it.

    Every method is a ``staticmethod`` that returns the loaded Polars
    DataFrame rather than the data-source object itself.
    """

    @staticmethod
    def from_file(path: str, date_col: str, id_col: str, pred_col: str) -> pl.DataFrame:
        """Create a file data source and load data.

        Args:
            path: Location of the ``.parquet`` or ``.csv`` file.
            date_col: Date column name.
            id_col: Asset ID column name.
            pred_col: Prediction column name.

        Returns:
            The DataFrame returned by ``FileDataSource.load``.
        """
        source = FileDataSource(
            path,
            date_column=date_col,
            asset_id_column=id_col,
            prediction_column=pred_col,
        )
        return source.load()

    @staticmethod
    def from_dataframe(
        df: pl.DataFrame, date_col: str, id_col: str, pred_col: str
    ) -> pl.DataFrame:
        """Create a DataFrame data source and load data.

        Args:
            df: An existing Polars DataFrame.
            date_col: Date column name.
            id_col: Asset ID column name.
            pred_col: Prediction column name.

        Returns:
            The DataFrame returned by ``DataFrameDataSource.load``.
        """
        source = DataFrameDataSource(
            df,
            date_column=date_col,
            asset_id_column=id_col,
            prediction_column=pred_col,
        )
        return source.load()

    @staticmethod
    def from_crowdcent_api(
        api_key: str | None = None,
        challenge_slug: str = "hyperliquid-ranking",
        download_path: str | None = None,
        date_col: str = "release_date",
        id_col: str = "id",
        pred_col: str = "pred_10d",
    ) -> pl.DataFrame:
        """
        Download and load the CrowdCent meta model.

        Args:
            api_key: CrowdCent API key (if None, will try to load from env)
            challenge_slug: The challenge to download data for
            download_path: Optional path to save the downloaded file
                (defaults to ``predictions.parquet``, the same default used
                by ``from_numerai_api``, so the two overwrite each other
                unless a path is given)
            date_col: Date column name in the meta model
            id_col: Asset ID column name in the meta model
            pred_col: Prediction column name to use from the meta model

        Returns:
            Polars DataFrame with original column names

        Raises:
            ValueError: If ``api_key`` is None and ``CROWDCENT_API_KEY`` is
                unset or empty in the environment.
        """
        # Imported lazily so the module can be used without this package.
        from crowdcent_challenge import ChallengeClient

        if api_key is None:
            import os

            api_key = os.getenv("CROWDCENT_API_KEY")
            if not api_key:
                raise ValueError("CROWDCENT_API_KEY not found in environment variables")

        client = ChallengeClient(challenge_slug=challenge_slug, api_key=api_key)

        if download_path is None:
            download_path = "predictions.parquet"

        client.download_meta_model(download_path)

        return DataLoader.from_file(
            path=download_path, date_col=date_col, id_col=id_col, pred_col=pred_col
        )

    @staticmethod
    def from_numerai_api(
        download_path: str | None = None,
        date_col: str = "date",
        id_col: str = "symbol",
        pred_col: str = "meta_model",
    ) -> pl.DataFrame:
        """
        Download and load the Numerai crypto meta model.

        Args:
            download_path: Optional path to save the downloaded file
                (defaults to ``predictions.parquet``, the same default used
                by ``from_crowdcent_api``, so the two overwrite each other
                unless a path is given)
            date_col: Date column name in the meta model
            id_col: Asset ID/symbol column name in the meta model
            pred_col: Prediction column name to use from the meta model

        Returns:
            Polars DataFrame with original column names

        Raises:
            ImportError: If ``numerapi`` cannot be imported; the original
                import failure is attached as ``__cause__``.
        """
        try:
            from numerapi import CryptoAPI
        except ImportError as exc:
            # Chain explicitly so the underlying import failure is kept as
            # the cause instead of appearing as an incidental second error.
            raise ImportError(
                "numerapi is required. Install with: uv add cc-liquid[numerai]"
            ) from exc

        api = CryptoAPI()

        if download_path is None:
            download_path = "predictions.parquet"

        api.download_dataset("v1.0/historical_meta_models.parquet", download_path)

        return DataLoader.from_file(
            path=download_path, date_col=date_col, id_col=id_col, pred_col=pred_col
        )