Coverage for src/cc_liquid/data_loader.py: 79%
58 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-10-13 20:16 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-10-13 20:16 +0000
1"""Data loading and abstraction for cc-liquid."""
3from abc import ABC, abstractmethod
5import polars as pl
class DataSource(ABC):
    """Common interface that every prediction data source implements."""

    @abstractmethod
    def load(self) -> pl.DataFrame:
        """Return this source's data as a Polars DataFrame.

        Concrete subclasses must override this method. The base
        implementation has no body of its own and returns ``None`` when
        reached through ``super().load()``.
        """
class FileDataSource(DataSource):
    """Loads prediction data from a Parquet or CSV file.

    The column names passed to the constructor are stored on the instance
    unchanged; ``load`` itself does not rename, select, or validate columns.
    """

    def __init__(
        self,
        path: str,
        date_column: str,
        asset_id_column: str,
        prediction_column: str,
    ):
        """Remember where the file lives and which columns matter.

        Args:
            path: Location of the ``.parquet`` or ``.csv`` file. A
                ``pathlib.Path`` (any ``os.PathLike``) is accepted as well.
            date_column: Name of the date column.
            asset_id_column: Name of the asset identifier column.
            prediction_column: Name of the prediction column.
        """
        self.path = path
        self.date_column = date_column
        self.asset_id_column = asset_id_column
        self.prediction_column = prediction_column

    def load(self) -> pl.DataFrame:
        """Read the file into a Polars DataFrame.

        The reader is chosen from the file extension, compared
        case-insensitively so that ``DATA.PARQUET`` or ``x.CSV`` are handled
        like their lowercase spellings.

        Returns:
            The file contents exactly as the Polars reader produced them.

        Raises:
            ValueError: If the path ends in neither ``.parquet`` nor ``.csv``.
        """
        import os

        # os.fspath() accepts both str and os.PathLike, so a pathlib.Path no
        # longer fails on the missing ``endswith`` attribute. Only the
        # normalised copy is used for the extension check; the readers still
        # receive the path exactly as the caller supplied it.
        normalized = os.fspath(self.path).lower()

        if normalized.endswith(".parquet"):
            return pl.read_parquet(self.path)
        if normalized.endswith(".csv"):
            return pl.read_csv(self.path)
        raise ValueError("Unsupported file type. Use .parquet or .csv.")
class DataFrameDataSource(DataSource):
    """Data source backed by a Polars DataFrame that is already in memory.

    The column names are kept on the instance as given; ``load`` hands back
    the stored frame without touching it.
    """

    def __init__(
        self,
        df: pl.DataFrame,
        date_column: str,
        asset_id_column: str,
        prediction_column: str,
    ):
        """Store the frame together with its column-name metadata.

        Args:
            df: The DataFrame to serve from ``load``.
            date_column: Name of the date column.
            asset_id_column: Name of the asset identifier column.
            prediction_column: Name of the prediction column.
        """
        # Column metadata first, then the frame itself; the assignments are
        # independent of one another.
        self.prediction_column = prediction_column
        self.asset_id_column = asset_id_column
        self.date_column = date_column
        self.df = df

    def load(self) -> pl.DataFrame:
        """Hand back the very DataFrame object given to the constructor (no copy)."""
        frame = self.df
        return frame
class DataLoader:
    """Factory for creating data sources.

    Every method is a static convenience wrapper that builds the matching
    data source, calls its ``load`` method, and returns the resulting
    Polars DataFrame.
    """

    @staticmethod
    def from_file(path: str, date_col: str, id_col: str, pred_col: str) -> pl.DataFrame:
        """Create a file data source and load data.

        Args:
            path: Location of the ``.parquet`` or ``.csv`` file.
            date_col: Date column name, forwarded to the source.
            id_col: Asset ID column name, forwarded to the source.
            pred_col: Prediction column name, forwarded to the source.

        Returns:
            The DataFrame produced by ``FileDataSource.load``.

        Raises:
            ValueError: If the file type is not supported by the source.
        """
        return FileDataSource(
            path,
            date_column=date_col,
            asset_id_column=id_col,
            prediction_column=pred_col,
        ).load()

    @staticmethod
    def from_dataframe(
        df: pl.DataFrame, date_col: str, id_col: str, pred_col: str
    ) -> pl.DataFrame:
        """Create a DataFrame data source and load data.

        Args:
            df: An existing Polars DataFrame.
            date_col: Date column name, forwarded to the source.
            id_col: Asset ID column name, forwarded to the source.
            pred_col: Prediction column name, forwarded to the source.

        Returns:
            The DataFrame produced by ``DataFrameDataSource.load``.
        """
        return DataFrameDataSource(
            df,
            date_column=date_col,
            asset_id_column=id_col,
            prediction_column=pred_col,
        ).load()

    @staticmethod
    def from_crowdcent_api(
        api_key: str | None = None,
        challenge_slug: str = "hyperliquid-ranking",
        download_path: str | None = None,
        date_col: str = "release_date",
        id_col: str = "id",
        pred_col: str = "pred_10d",
    ) -> pl.DataFrame:
        """
        Download and load the CrowdCent meta model.

        Args:
            api_key: CrowdCent API key (if None, will try to load from env)
            challenge_slug: The challenge to download data for
            download_path: Optional path to save the downloaded file;
                defaults to "predictions.parquet" in the working directory
            date_col: Date column name in the meta model
            id_col: Asset ID column name in the meta model
            pred_col: Prediction column name to use from the meta model

        Returns:
            Polars DataFrame with original column names

        Raises:
            ValueError: If no API key is passed and CROWDCENT_API_KEY is
                unset or empty, or if download_path has an extension that
                from_file does not support.
        """
        from crowdcent_challenge import ChallengeClient

        if api_key is None:
            import os

            api_key = os.getenv("CROWDCENT_API_KEY")
            if not api_key:
                raise ValueError("CROWDCENT_API_KEY not found in environment variables")

        client = ChallengeClient(challenge_slug=challenge_slug, api_key=api_key)

        if download_path is None:
            download_path = "predictions.parquet"

        client.download_meta_model(download_path)

        # The downloaded file is read back through from_file, so the reader
        # is picked from download_path's extension.
        return DataLoader.from_file(
            path=download_path, date_col=date_col, id_col=id_col, pred_col=pred_col
        )

    @staticmethod
    def from_numerai_api(
        download_path: str | None = None,
        date_col: str = "date",
        id_col: str = "symbol",
        pred_col: str = "meta_model",
    ) -> pl.DataFrame:
        """
        Download and load the Numerai crypto meta model.

        Args:
            download_path: Optional path to save the downloaded file;
                defaults to "predictions.parquet" in the working directory
            date_col: Date column name in the meta model
            id_col: Asset ID/symbol column name in the meta model
            pred_col: Prediction column name to use from the meta model

        Returns:
            Polars DataFrame with original column names

        Raises:
            ImportError: If the optional numerapi package is not installed.
                The original import failure is attached as the cause.
            ValueError: If download_path has an extension that from_file
                does not support.
        """
        try:
            from numerapi import CryptoAPI
        except ImportError as err:
            # Chain explicitly so the traceback shows the underlying import
            # failure as the direct cause of this friendlier message.
            raise ImportError(
                "numerapi is required. Install with: uv add cc-liquid[numerai]"
            ) from err

        api = CryptoAPI()

        if download_path is None:
            download_path = "predictions.parquet"

        api.download_dataset("v1.0/historical_meta_models.parquet", download_path)

        # The downloaded file is read back through from_file, so the reader
        # is picked from download_path's extension.
        return DataLoader.from_file(
            path=download_path, date_col=date_col, id_col=id_col, pred_col=pred_col
        )