In [ ]:
# Start from a clean kernel namespace and auto-reload edited modules,
# so changes in ./src are picked up without restarting the kernel.
%reset -f
%load_ext autoreload
%autoreload 2

filoma.ml — minimal examples¶

Tiny examples showing filename feature discovery and dataset splitting.

In [ ]:
# Run these cells with: PYTHONPATH=./src
# polars backs the DataFrames; filoma.ml provides the split helpers used below.
import polars as pl

from filoma import ml

Discover tokens from filenames (separator='_')¶

In [ ]:
# Wrap a polars frame in filoma's DataFrame to get filename-feature helpers.
from filoma.dataframe import DataFrame as FDataFrame

df = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})

dfw = FDataFrame(df)
# Split each filename on "_" into per-token feature columns (presumably named
# token1..tokenN — the next cell keys on "token1"; verify via the printed
# columns). prefix=None keeps bare names; include_parent=False omits the
# parent-directory column.
df2 = dfw.add_filename_features(sep="_", prefix=None, include_parent=False, path_col="path")
print(df2.columns)
print(df2)

Split by single token (token1)¶

In [ ]:
# 60/20/20 split keyed on the discovered token1 column; seed=0 makes the
# assignment reproducible. NOTE(review): df2 has only two distinct token1
# values, so at least one of the three splits will be empty here — this cell
# just demonstrates the API shape.
train, val, test = ml.split_data(df2, train_val_test=(60, 20, 20), feature=("token1",), path_col="path", seed=0)
print(len(train), len(val), len(test))
print(train)

Split by combined features (parent + token2)¶

In [ ]:
# Re-discover features, this time also emitting the parent-directory column,
# then split on the (parent, token2) combination — presumably rows sharing
# both values land in the same split; confirm against ml.split_data docs.
df3 = FDataFrame(df).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
train, val, test = ml.split_data(df3, train_val_test=(60, 20, 20), feature=("parent", "token2"), path_col="path", seed=0)
print(len(train), len(val), len(test))

Custom token names and auto names¶

In [ ]:
# Explicit token names: the three filename tokens become site/kind/idx
# columns instead of generic token1..token3.
df4 = FDataFrame(df).add_filename_features(sep="_", prefix=None, token_names=["site", "kind", "idx"], path_col="path")
print(df4.columns)
# token_names="auto" with prefix="fn" — presumably auto-generates prefixed
# column names; inspect the printed columns to see the exact scheme.
df5 = FDataFrame(df).add_filename_features(sep="_", prefix="fn", token_names="auto", path_col="path")
print(df5.columns)

Include all path parts as features¶

In [ ]:
# include_all_parts=True — presumably adds every path component as its own
# feature column; the printed column list shows what was generated.
df6 = FDataFrame(df).add_filename_features(sep="_", prefix=None, include_all_parts=True, path_col="path")
print(df6.columns)

Use a custom path column¶

If your paths live in a column with a different name (for example my_path), pass path_col to discovery and splitting functions.

In [ ]:
# Paths live in a non-default column ("my_path"); pass path_col everywhere.
df_custom = pl.DataFrame({"my_path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
df_custom2 = FDataFrame(df_custom).add_filename_features(sep="_", prefix=None, include_parent=True, include_all_parts=True, path_col="my_path")
print(df_custom2.columns)
print(df_custom2)
# discover=False skips feature discovery; feature="path_parts" with
# path_parts=(-1,) keys the split on the last path component (the filename)
# — TODO confirm the indexing convention against the ml.split_data docstring.
train, val, test = ml.split_data(df_custom2, discover=False, feature="path_parts", path_parts=(-1,), path_col="my_path", seed=0)
print(len(train), len(val), len(test))

Return types: filoma wrapper and pandas¶

Below are two short examples showing how to request the filoma.DataFrame wrapper and a pandas.DataFrame from ml.split_data. The pandas example will fall back with a message if pandas or pyarrow is not installed. Run with PYTHONPATH=./src.

In [ ]:
# Example: return the filoma.DataFrame wrapper
df7 = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
df7 = FDataFrame(df7).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
# return_type="filoma" asks split_data for filoma.DataFrame wrappers instead
# of plain polars frames.
train_f, val_f, test_f = ml.split_data(
    df7, train_val_test=(60, 20, 20), feature="path_parts", path_parts=(-1,), path_col="path", seed=0, return_type="filoma"
)
# filoma.DataFrame implements .to_polars() and other helpers
print("train_f type:", type(train_f))
# Call .to_polars() directly — the getattr(train_f, "to_polars")() indirection
# used before added nothing over a plain attribute access.
print("train_f is filoma.DataFrame -> to_polars columns:", train_f.to_polars().columns)
print("split sizes:", len(train_f), len(val_f), len(test_f))
print("train_f head:")
print(train_f.head())
In [ ]:
# Example: return pandas.DataFrame (if pandas + pyarrow are installed)
# We check for both pandas and pyarrow and show an actionable message if missing.
try:
    import pandas as pd  # noqa: F401
    import pyarrow  # noqa: F401
except ImportError:
    print("pandas or pyarrow not available, skipping pandas example")
    print("Install with: pip install pandas pyarrow")
else:
    df8 = pl.DataFrame({"path": ["LCFM/20200312/LAPAZ_image_01.tif", "OTHER/20210101/SITE_image_01.tif"]})
    df8 = FDataFrame(df8).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
    try:
        # BUG FIX: this cell demonstrates pandas output, but it previously
        # passed return_type="filoma" (copy-paste from the cell above), so
        # train_p was never a pandas.DataFrame. Request "pandas" — the
        # polars->pandas conversion needs pyarrow, hence the guard above.
        train_p, val_p, test_p = ml.split_data(
            df8, train_val_test=(60, 20, 20), feature="path_parts", path_parts=(-1,), path_col="path", seed=0, return_type="pandas"
        )
        print("train_p type:", type(train_p))
        print("train_p head:")
        print(train_p.head())
    except Exception as e:
        print("conversion failed:", e)
In [ ]:
# Temporary test: create 10 underscored .txt files, run discovery + split_data, then clean up
import shutil
from pathlib import Path

import polars as pl

from filoma import ml
from filoma.dataframe import DataFrame as FDataFrame

tmp = Path("tests/tmp_ml_files")
# Start from a clean scratch tree in case an earlier run left files behind.
if tmp.exists():
    shutil.rmtree(tmp)
files = [
    tmp / "A" / "LCFM_20200312_LAPAZ_image_01.txt",
    tmp / "A" / "LCFM_20200312_LAPAZ_image_02.txt",
    tmp / "B" / "OTHER_20210101_SITE_image_01.txt",
    tmp / "B" / "OTHER_20210101_SITE_image_02.txt",
    tmp / "B" / "C" / "MISC_20211111_TEST_doc_001.txt",
    tmp / "B" / "C" / "MISC_20211111_TEST_doc_002.txt",
    tmp / "D" / "EXTRA_FILE_01.txt",
    tmp / "D" / "EXTRA_FILE_02.txt",
    tmp / "E" / "sub" / "DEEP_202001_sample_01.txt",
    tmp / "E" / "sub" / "DEEP_202001_sample_02.txt",
]
try:
    # p.parent.mkdir(parents=True) creates every needed directory, so the
    # separate folder-creation pass the cell used to have was redundant.
    for p in files:
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text("test\n")
    paths = [str(p) for p in files]
    print("created files:", len(paths))
    for p in paths:
        print(" -", p)
    df = pl.DataFrame({"path": paths})
    df2 = FDataFrame(df).add_filename_features(sep="_", prefix=None, include_parent=True, path_col="path")
    print("Discovered columns:", df2.columns)
    # Split keyed on the last path component (the filename); seed fixed for
    # reproducibility, plain polars frames requested back.
    train, val, test = ml.split_data(
        df2, train_val_test=(60, 20, 20), feature="path_parts", path_parts=(-1,), path_col="path", seed=42, return_type="polars"
    )
    print("Split sizes:", len(train), len(val), len(test))
    print("Train sample:")
    print(train)
finally:
    # Always remove the scratch tree — previously a failure in discovery or
    # splitting skipped cleanup and left tests/tmp_ml_files behind.
    # ignore_errors guards the case where file creation itself failed early.
    shutil.rmtree(tmp, ignore_errors=True)
    print("cleaned up", tmp)
In [ ]: