Machine Learning with Dataframes
2025-07-09
skrub simplifies many tedious data preparation operations.
import pandas as pd
import matplotlib.pyplot as plt
import skrub
from skrub.datasets import fetch_employee_salaries

# Fetch an example dataset bundled with skrub
dataset = fetch_employee_salaries()
employees, salaries = dataset.X, dataset.y
df = pd.DataFrame(employees)
# Plot the distribution of the numerical values using a histogram
fig, axs = plt.subplots(2,1, figsize=(10, 6))
ax1, ax2 = axs
ax1.hist(df['year_first_hired'], bins=30, edgecolor='black', alpha=0.7)
ax1.set_xlabel('Year first hired')
ax1.set_ylabel('Frequency')
ax1.grid(True, linestyle='--', alpha=0.5)
# Count the frequency of each category
category_counts = df['department'].value_counts()
# Create a bar plot
category_counts.plot(kind='bar', edgecolor='black', ax=ax2)
# Add labels and title
ax2.set_xlabel('Department')
ax2.set_ylabel('Frequency')
ax2.grid(True, linestyle='--', axis='y', alpha=0.5) # Add grid lines for y-axis
fig.suptitle("Distribution of values")
# Show the plot
plt.show()
skrub.TableReport

Main features: an interactive overview of a dataframe (per-column summaries, distributions, associations) that can be exported as a static HTML file.
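A minimal sketch of typical usage (the file name is an assumption; inline rendering requires a notebook environment):

from skrub import TableReport

# Interactive report of the dataframe, rendered inline in a notebook
TableReport(df)

# Export the same report as a standalone HTML file
TableReport(df).write_html("employee_salaries_report.html")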
import pandas as pd
import numpy as np
data = {
    'Constant int': [1, 1, 1],            # Single unique value
    'B': [2, 3, 2],                       # Multiple unique values
    'Constant str': ['x', 'x', 'x'],      # Single unique value
    'D': [4, 5, 6],                       # Multiple unique values
    'All nan': [np.nan, np.nan, np.nan],  # All missing values
    'All empty': ['', '', ''],            # All empty strings
    'Date': ['01/01/2023', '02/01/2023', '03/01/2023'],
}
df = pd.DataFrame(data)
display(df)
| | Constant int | B | Constant str | D | All nan | All empty | Date |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | x | 4 | NaN | | 01/01/2023 |
| 1 | 1 | 3 | x | 5 | NaN | | 02/01/2023 |
| 2 | 1 | 2 | x | 6 | NaN | | 03/01/2023 |
# Parse the datetime strings with a specific format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
# Drop columns with only a single unique value
df_cleaned = df.loc[:, df.nunique(dropna=True) > 1]
# Function to drop columns with only missing values or empty strings
def drop_empty_columns(df):
    # Drop columns with only missing values
    df_cleaned = df.dropna(axis=1, how='all')
    # Drop columns with only empty strings
    empty_string_cols = df_cleaned.columns[df_cleaned.eq('').all()]
    df_cleaned = df_cleaned.drop(columns=empty_string_cols)
    return df_cleaned

# Apply the function to the DataFrame
df_cleaned = drop_empty_columns(df_cleaned)
display(df_cleaned)
| | B | D | Date |
|---|---|---|---|
| 0 | 2 | 4 | 2023-01-01 |
| 1 | 3 | 5 | 2023-01-02 |
| 2 | 2 | 6 | 2023-01-03 |
skrub's Cleaner performs the same sanitization in a single transformer.
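A minimal sketch, assuming a recent skrub release where Cleaner exposes a drop_if_constant parameter:

from skrub import Cleaner

# One transformer: parses datetimes and drops all-null / all-empty columns;
# drop_if_constant=True also removes constant columns (assumed available)
df_cleaned = Cleaner(drop_if_constant=True).fit_transform(df)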
import pandas as pd
import numpy as np

data = {
    'date': ['2023-01-01 12:34:56', '2023-02-15 08:45:23', '2023-03-20 18:12:45'],
    'value': [10, 20, 30]
}
df = pd.DataFrame(data)
df_expanded = df.copy()
datetime_column = "date"
df_expanded[datetime_column] = pd.to_datetime(df_expanded[datetime_column], errors='coerce')
df_expanded['year'] = df_expanded[datetime_column].dt.year
df_expanded['month'] = df_expanded[datetime_column].dt.month
df_expanded['day'] = df_expanded[datetime_column].dt.day
df_expanded['hour'] = df_expanded[datetime_column].dt.hour
df_expanded['minute'] = df_expanded[datetime_column].dt.minute
df_expanded['second'] = df_expanded[datetime_column].dt.second
# Circular encoding: sin/cos map cyclical values onto a circle,
# so that hour 23 ends up close to hour 0
df_expanded['hour_sin'] = np.sin(2 * np.pi * df_expanded['hour'] / 24)
df_expanded['hour_cos'] = np.cos(2 * np.pi * df_expanded['hour'] / 24)
df_expanded['month_sin'] = np.sin(2 * np.pi * df_expanded['month'] / 12)
df_expanded['month_cos'] = np.cos(2 * np.pi * df_expanded['month'] / 12)
print("Original DataFrame:")
print(df)
print("\nDataFrame with expanded datetime columns:")
print(df_expanded)
Original DataFrame:
date value
0 2023-01-01 12:34:56 10
1 2023-02-15 08:45:23 20
2 2023-03-20 18:12:45 30
DataFrame with expanded datetime columns:
date value year month day hour minute second \
0 2023-01-01 12:34:56 10 2023 1 1 12 34 56
1 2023-02-15 08:45:23 20 2023 2 15 8 45 23
2 2023-03-20 18:12:45 30 2023 3 20 18 12 45
hour_sin hour_cos month_sin month_cos
0 1.224647e-16 -1.000000e+00 0.500000 8.660254e-01
1 8.660254e-01 -5.000000e-01 0.866025 5.000000e-01
2 -1.000000e+00 -1.836970e-16 1.000000 6.123234e-17
skrub.DatetimeEncoder performs the same expansion automatically:
from skrub import DatetimeEncoder, ToDatetime
de = DatetimeEncoder(periodic_encoding="circular")
X_date = ToDatetime().fit_transform(df["date"])
X_enc = de.fit_transform(X_date)
print(X_enc)
date_year date_total_seconds date_month_circular_0 \
0 2023.0 1.672577e+09 0.500000
1 2023.0 1.676451e+09 0.866025
2 2023.0 1.679336e+09 1.000000
date_month_circular_1 date_day_circular_0 date_day_circular_1 \
0 8.660254e-01 2.079117e-01 0.978148
1 5.000000e-01 1.224647e-16 -1.000000
2 6.123234e-17 -8.660254e-01 -0.500000
date_hour_circular_0 date_hour_circular_1
0 1.224647e-16 -1.000000e+00
1 8.660254e-01 -5.000000e-01
2 -1.000000e+00 -1.836970e-16
TableVectorizer

Encoding a whole dataframe with plain scikit-learn means wiring up column selectors and transformers by hand:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
categorical_columns = selector(dtype_include=object)(employees)
numerical_columns = selector(dtype_exclude=object)(employees)
ct = make_column_transformer(
    (StandardScaler(), numerical_columns),
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
)
model = make_pipeline(ct, SimpleImputer(), Ridge())
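For comparison, a sketch of the same model built with skrub's TableVectorizer, which routes each column to a suitable encoder automatically:

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from skrub import TableVectorizer

# One transformer replaces the hand-written column routing above
model = make_pipeline(TableVectorizer(), Ridge())
model.fit(employees, salaries)  # employees, salaries from the earlier fetch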
tabular_learner goes further and assembles a complete baseline pipeline:
Pipeline(steps=[('tablevectorizer',
                 TableVectorizer(datetime=DatetimeEncoder(periodic_encoding='spline'))),
                ('simpleimputer', SimpleImputer(add_indicator=True)),
                ('standardscaler', StandardScaler()),
                ('ridge', Ridge())])
Expanding the TableVectorizer shows the default encoder applied to each column type:

- numeric columns: PassThrough()
- datetime columns: DatetimeEncoder(periodic_encoding='spline')
- low-cardinality strings: OneHotEncoder(drop='if_binary', dtype='float32', handle_unknown='ignore', sparse_output=False)
- high-cardinality strings: StringEncoder()

followed by SimpleImputer(add_indicator=True), StandardScaler(), and Ridge().
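A sketch of how such a pipeline is obtained; the "regressor" shorthand is the documented entry point, and recent versions also accept an estimator instance:

import skrub

# Build a sensible baseline pipeline for a regression task
model = skrub.tabular_learner("regressor")
model.fit(employees, salaries)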
- Fetch data: skrub.datasets, or user data
- Explore: skrub.TableReport
- Clean: skrub.TableVectorizer, Cleaner, DatetimeEncoder, …
- Encode: skrub.TableVectorizer, TextEncoder, StringEncoder, …
- Build pipelines: tabular_learner, sklearn.pipeline.make_pipeline, …

A data scientist needs to train a ML model, but features are spread across multiple tables.
Warning: there are many issues with this! scikit-learn pipelines support only a single feature matrix X.
skrub DataOps

When a normal pipeline is not enough… the skrub DataOps come to the rescue 🚒

DataOps extend the scikit-learn machinery to complex multi-table operations.

DataOps, Data Plans, learners: oh my!

A DataOp (singular) wraps a single operation, and can be combined and concatenated with other DataOps.
The Data Plan is a collective name for a sequence and combination of DataOps.

The Data Plan can be exported as a standalone object called a learner. The learner takes a dictionary of values rather than just X and y.
DataOps wrap around user operations.
Important: DataOps record user operations, so that they can later be replayed in the same order and with the same arguments on unseen data.
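A toy sketch of this recording behaviour; skrub.var and .skb.eval are the DataOps primitives, and the variable names here are illustrative:

import skrub

a = skrub.var("a", 2)  # input variable with a preview value
b = skrub.var("b", 3)
c = a + b              # the addition is recorded, not only computed

# Replay the recorded operation on new inputs
print(c.skb.eval({"a": 10, "b": 5}))  # 15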
DataOps

X, y, and products represent inputs to the pipeline. skrub splits X and y when training.

import skrub
from skrub import selectors as s
from sklearn.ensemble import ExtraTreesClassifier

# Declare the inputs of the Data Plan; X_df, y_ser and products_df are
# illustrative placeholders for the baskets / products dataframes
X = skrub.X(X_df)
y = skrub.y(y_ser)
products = skrub.var("products", products_df)
# Vectorize every product column except the join key
vectorizer = skrub.TableVectorizer(high_cardinality=skrub.StringEncoder(), n_jobs=-1)
vectorized_products = products.skb.apply(vectorizer, cols=s.all() - "basket_ID")

# Aggregate product features per basket, then join them onto X
aggregated_products = vectorized_products.groupby("basket_ID").agg("mean").reset_index()
features = X.merge(aggregated_products, left_on="ID", right_on="basket_ID")
features = features.drop(columns=["ID", "basket_ID"])

# Fit the final estimator; y is passed explicitly
predictions = features.skb.apply(ExtraTreesClassifier(n_jobs=-1), y=y)
learner

The Data Plan can be exported as a learner:

# anywhere in the plan
learner = predictions.skb.make_learner()

# search is a HPO object (see the randomized search below)
best_learner = search.best_learner_
Then, the learner can be pickled …
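For instance (a sketch; new_X and new_products are placeholder dataframes of unseen data):

import pickle

# Persist the fitted learner
with open("learner.pkl", "wb") as f:
    pickle.dump(learner, f)

# Elsewhere: the learner consumes a dictionary with one entry
# per declared variable, rather than just X and y
with open("learner.pkl", "rb") as f:
    loaded = pickle.load(f)
predictions = loaded.predict({"X": new_X, "products": new_products})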
skrub implements four choose_* functions:

- choose_from: select from the given list of options
- choose_int: select an integer within a range
- choose_float: select a float within a range
- choose_bool: select a bool

In addition, optional chooses between a value (or DataOp) and a no-op.

It's possible to nest these functions to create complex grids, as sketched below:
Hyperparameter tuning in scikit-learn can be complex:

from scipy.stats import loguniform, randint
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([("dim_reduction", PCA()), ("regressor", Ridge())])
grid = [
    {
        "dim_reduction": [PCA()],
        "dim_reduction__n_components": [10, 20, 30],
        "regressor": [Ridge()],
        "regressor__alpha": loguniform(0.1, 10.0),
    },
    {
        "dim_reduction": [SelectKBest()],
        "dim_reduction__k": [10, 20, 30],
        "regressor": [Ridge()],
        "regressor__alpha": loguniform(0.1, 10.0),
    },
    {
        "dim_reduction": [PCA()],
        "dim_reduction__n_components": [10, 20, 30],
        "regressor": [RandomForestClassifier()],
        # randint, not loguniform: n_estimators must be an integer
        "regressor__n_estimators": randint(20, 200),
    },
    {
        "dim_reduction": [SelectKBest()],
        "dim_reduction__k": [10, 20, 30],
        "regressor": [RandomForestClassifier()],
        "regressor__n_estimators": randint(20, 200),
    },
]
model = RandomizedSearchCV(pipe, grid)
With DataOps, it is simple!

dim_reduction = X.skb.apply(
    skrub.choose_from(
        {
            "PCA": PCA(n_components=skrub.choose_int(10, 30)),
            "SelectKBest": SelectKBest(k=skrub.choose_int(10, 30)),
        },
        name="dim_reduction",
    )
)
regressor = dim_reduction.skb.apply(
    skrub.choose_from(
        {
            "Ridge": Ridge(alpha=skrub.choose_float(0.1, 10.0, log=True)),
            "RandomForest": RandomForestClassifier(
                n_estimators=skrub.choose_int(20, 200, log=True)
            ),
        },
        name="regressor",
    )
)
search = regressor.skb.get_randomized_search(scoring="roc_auc", fitted=True)
search.plot_parallel_coord()
skrub provides tools for the whole journey from raw dataframes to fitted models: exploration with TableReport, cleaning with Cleaner, encoding with TableVectorizer, and multi-table, tunable Data Plans with DataOps.