In [ ]:
# Clear all names from the interactive namespace without asking for confirmation.
%reset -f
# Auto-reload modules before executing code so edits under src/ take effect
# immediately (mode 2 reloads all modules, per the IPython autoreload docs).
%load_ext autoreload
%autoreload 2
filoma.ml — minimal examples¶
Tiny examples showing filename feature discovery and dataset splitting.
In [ ]:
# Run these cells with: PYTHONPATH=./src
import polars as pl
from filoma import ml
Discover tokens from filenames (separator='_')¶
In [ ]:
# Wrap a tiny two-row frame in filoma's DataFrame and discover the
# underscore-separated filename tokens as new columns (token1, token2, ...).
from filoma.dataframe import DataFrame as FDataFrame

df = pl.DataFrame(
    {"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]}
)
df2 = FDataFrame(df).add_filename_features(
    sep="_", prefix=None, include_parent=False, path_col="path"
)
print(df2.columns)
print(df2)
Split by single token (token1)¶
In [ ]:
# Split rows 60/20/20 into train/val/test, grouping by a single discovered
# token; seed makes the assignment deterministic.
train, val, test = ml.split_data(
    df2,
    train_val_test=(60, 20, 20),
    feature=("token1",),
    path_col="path",
    seed=0,
)
print(len(train), len(val), len(test))
print(train)
Split by combined features (parent + token2)¶
In [ ]:
# Re-run discovery with the parent directory included, then split on the
# combination of the parent and the second filename token.
df3 = FDataFrame(df).add_filename_features(
    sep="_", prefix=None, include_parent=True, path_col="path"
)
train, val, test = ml.split_data(
    df3,
    train_val_test=(60, 20, 20),
    feature=("parent", "token2"),
    path_col="path",
    seed=0,
)
print(len(train), len(val), len(test))
Custom token names and auto names¶
In [ ]:
# Explicit token_names map positionally onto the underscore-separated parts;
# token_names="auto" generates names, here placed under the "fn" prefix.
df4 = FDataFrame(df).add_filename_features(
    sep="_", prefix=None, token_names=["site", "kind", "idx"], path_col="path"
)
print(df4.columns)
df5 = FDataFrame(df).add_filename_features(
    sep="_", prefix="fn", token_names="auto", path_col="path"
)
print(df5.columns)
Include all path parts as features¶
In [ ]:
# include_all_parts=True turns every path component into its own feature column.
df6 = FDataFrame(df).add_filename_features(
    sep="_", prefix=None, include_all_parts=True, path_col="path"
)
print(df6.columns)
Use a custom path column¶
If your paths live in a column with a different name (for example my_path), pass path_col to discovery and splitting functions.
In [ ]:
# The path column need not be called "path" — pass path_col through both
# feature discovery and splitting.
df_custom = pl.DataFrame(
    {"my_path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]}
)
df_custom2 = FDataFrame(df_custom).add_filename_features(
    sep="_", prefix=None, include_parent=True, include_all_parts=True, path_col="my_path"
)
print(df_custom2.columns)
print(df_custom2)
# discover=False: group directly on the last path part rather than discovered tokens.
train, val, test = ml.split_data(
    df_custom2,
    discover=False,
    feature="path_parts",
    path_parts=(-1,),
    path_col="my_path",
    seed=0,
)
print(len(train), len(val), len(test))
Return types: filoma wrapper and pandas¶
Below are two short examples showing how to request the filoma.DataFrame wrapper and a pandas.DataFrame from ml.split_data. The pandas example will fall back with a message if pandas or pyarrow is not installed. Run with PYTHONPATH=./src.
In [ ]:
# Example: return the filoma.DataFrame wrapper
# return_type="filoma" makes split_data hand back filoma.DataFrame objects,
# which expose helpers such as .to_polars() and .head().
df7 = pl.DataFrame(
    {"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]}
)
df7 = FDataFrame(df7).add_filename_features(
    sep="_", prefix=None, include_parent=True, path_col="path"
)
train_f, val_f, test_f = ml.split_data(
    df7,
    train_val_test=(60, 20, 20),
    feature="path_parts",
    path_parts=(-1,),
    path_col="path",
    seed=0,
    return_type="filoma",
)
print("train_f type:", type(train_f))
print("train_f is filoma.DataFrame -> to_polars columns:", getattr(train_f, "to_polars")().columns)
print("split sizes:", len(train_f), len(val_f), len(test_f))
print("train_f head:")
print(train_f.head())
In [ ]:
# Example: return pandas.DataFrame (if pandas + pyarrow are installed)
# Polars -> pandas conversion goes through pyarrow, so both packages must be
# present; show an actionable message if either is missing.
try:
    import pandas as pd  # noqa: F401
    import pyarrow  # noqa: F401
except ImportError:
    print("pandas or pyarrow not available, skipping pandas example")
    print("Install with: pip install pandas pyarrow")
else:
    df8 = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
    df8 = FDataFrame(df8).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
    try:
        # BUG FIX: this example demonstrates pandas output, so request
        # return_type="pandas" (the original mistakenly asked for "filoma" again,
        # which made the printed type identical to the previous cell's).
        train_p, val_p, test_p = ml.split_data(
            df8, train_val_test=(60, 20, 20), feature="path_parts", path_parts=(-1,), path_col="path", seed=0, return_type="pandas"
        )
        print("train_p type:", type(train_p))
        print("train_p head:")
        print(train_p.head())
    except Exception as e:
        # Best-effort demo: surface conversion problems without aborting the notebook.
        print("conversion failed:", e)
In [ ]:
# Temporary test: create 10 underscored .txt files, run discovery + split_data, then clean up
import shutil
from pathlib import Path

import polars as pl
from filoma import ml
from filoma.dataframe import DataFrame as FDataFrame

tmp = Path("tests/tmp_ml_files")
if tmp.exists():
    shutil.rmtree(tmp)
try:
    files = [
        tmp / "A" / "LCFM_20200312_LAPAZ_image_01.txt",
        tmp / "A" / "LCFM_20200312_LAPAZ_image_02.txt",
        tmp / "B" / "OTHER_20210101_SITE_image_01.txt",
        tmp / "B" / "OTHER_20210101_SITE_image_02.txt",
        tmp / "B" / "C" / "MISC_20211111_TEST_doc_001.txt",
        tmp / "B" / "C" / "MISC_20211111_TEST_doc_002.txt",
        tmp / "D" / "EXTRA_FILE_01.txt",
        tmp / "D" / "EXTRA_FILE_02.txt",
        tmp / "E" / "sub" / "DEEP_202001_sample_01.txt",
        tmp / "E" / "sub" / "DEEP_202001_sample_02.txt",
    ]
    # The per-file mkdir below creates every needed directory, so no separate
    # folder-creation pass is required.
    for p in files:
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text("test\n")
    paths = [str(p) for p in files]
    print("created files:", len(paths))
    for p in paths:
        print(" -", p)
    df = pl.DataFrame({"path": paths})
    df2 = FDataFrame(df).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
    print("Discovered columns:", df2.columns)
    train, val, test = ml.split_data(
        df2, train_val_test=(60, 20, 20), feature="path_parts", path_parts=(-1,), path_col="path", seed=42, return_type="polars"
    )
    print("Split sizes:", len(train), len(val), len(test))
    print("Train sample:")
    print(train)
finally:
    # cleanup — guaranteed even if discovery or splitting raises, so no stale
    # tests/tmp_ml_files directory is left behind
    if tmp.exists():
        shutil.rmtree(tmp)
    print("cleaned up", tmp)
In [ ]: