Pipeline Enhancements¶

Let's go through one more development iteration...

Model Ensembling¶

Instead of relying on just the plain LogisticRegression used in our base model pipeline, we can combine multiple different classifiers using the stacked-ensembling technique to further improve the performance. ForML already comes with an operator implementing this concept, so let's try it out.
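
In stacking, each base model produces out-of-fold predictions on the training data, and those predictions become the input features of a final (meta) model. Here is a minimal sketch of the idea in plain scikit-learn, using made-up data and independent of the ForML operator we are about to apply:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = make_classification(n_samples=500, random_state=42)
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Out-of-fold probability predictions of each base model...
meta_features = np.column_stack([
    cross_val_predict(model, X, y, cv=cv, method="predict_proba")[:, 1]
    for model in (
        GradientBoostingClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
    )
])
# ...become the training features of the final meta-model.
meta_model = LogisticRegression().fit(meta_features, y)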

Adding the Ensemble¶

Add a basic model ensemble of two classifiers with just two-fold cross-validation to avazuctr/pipeline.py:

  1. Open the avazuctr/pipeline.py component.
  2. Add all the required imports:
from sklearn import model_selection

from forml import project
from forml.pipeline import ensemble, wrap

with wrap.importer():
    from category_encoders import TargetEncoder
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler
  3. Update it with the code below using the ensemble of GradientBoostingClassifier and RandomForestClassifier:
CATEGORICAL_COLUMNS = ...  # Keep original

STACK = ensemble.FullStack(
    GradientBoostingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    crossvalidator=model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=42),
)

PIPELINE = (
    TargetEncoder(cols=CATEGORICAL_COLUMNS)
    >> MinMaxScaler()
    >> STACK
    >> LogisticRegression(max_iter=1000, random_state=42)
)

# Registering the pipeline
project.setup(PIPELINE)
  4. SAVE THE avazuctr/pipeline.py FILE!

Evaluating the Change¶

Let's now run the project evaluation to see whether this change was worth it:

In [2]:
! forml project eval
running eval
0.3911373233919557

Excellent, this is an improvement!

In [3]:
! git add avazuctr/pipeline.py

Reviewing the Ensembling Task Graph¶

Visualizing the ensembling task graph can help to understand the principle: the crossvalidator splits the training data into folds, each base model gets trained and applied per fold, and the out-of-fold predictions are merged into the training inputs of the downstream LogisticRegression:

In [4]:
! forml project train -R graphviz
running train

forml-solution-avazuctr/forml.dot.svg: Ensembling Task Graph

Balancing the Target Classes¶

As noticed during the exploration, the target variable is highly imbalanced (417,963 samples in the negative class vs. only 82,037 in the positive). This might be biasing the model towards the majority class.

Let's plug in the Balancer we implemented previously and see whether it brings any improvement.
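
For a sense of scale, those counts amount to roughly a 5:1 skew, as a quick back-of-the-envelope check confirms:

negative, positive = 417_963, 82_037
print(f"negative share: {negative / (negative + positive):.1%}")  # ~83.6%
print(f"imbalance ratio: {negative / positive:.1f}:1")            # ~5.1:1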

Adding the Balancer¶

Edit the pyproject.toml and add the new imbalanced-learn==0.10.1 dependency:

  1. Open the pyproject.toml.
  2. Update it with the config below, adding the new imbalanced-learn==0.10.1 dependency:
[project]
name = "forml-solution-avazuctr"
version = "0.1"
dependencies = [
    "category-encoders==2.6.0",
    "forml==0.93",
    "imbalanced-learn==0.10.1 ",
    "openschema==0.7",
    "pandas==2.0.1",
    "scikit-learn==1.2.2"
]

[tool.forml]
package = "avazuctr"
  3. SAVE THE pyproject.toml FILE!
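
If your environment doesn't pick up the project dependencies automatically, the new package can also be installed directly (assuming pip manages this environment):

! pip install imbalanced-learn==0.10.1
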
In [5]:
! git add pyproject.toml

Now, add the Balancer implementation to the avazuctr/pipeline.py:

  1. Open the avazuctr/pipeline.py component.
  2. Add all the required imports:
import typing

from imblearn import over_sampling
from sklearn import model_selection

from forml import flow, project
from forml.pipeline import ensemble, wrap

with wrap.importer():
    from category_encoders import TargetEncoder
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler
  3. Add the OverSampler actor (a quick standalone check of the underlying sampler follows after this list):
@wrap.Actor.apply
def OverSampler(
    features, labels, *, random_state: typing.Optional[int] = None
):
    """Stateless actor with two input and two output ports oversampling the features/labels of the minority class."""
    return over_sampling.RandomOverSampler(
        random_state=random_state
    ).fit_resample(features, labels)
  4. Add the Balancer operator implementation:
class Balancer(flow.Operator):
    """Balancer operator inserting the provided sampler into the ``train`` & ``label`` paths."""

    def __init__(
        self,
        sampler: flow.Builder = OverSampler.builder(random_state=42),
    ):
        self._sampler = sampler

    def compose(self, scope: flow.Composable) -> flow.Trunk:
        left = scope.expand()
        # Worker node around the sampler actor with two input and two output ports.
        sampler = flow.Worker(self._sampler, 2, 2)
        # Route the upstream train features into the sampler's first input port.
        sampler[0].subscribe(left.train.publisher)
        new_features = flow.Future()
        new_features[0].subscribe(sampler[0])
        # Route the upstream labels into the sampler's second input port.
        sampler[1].subscribe(left.label.publisher)
        new_labels = flow.Future()
        new_labels[0].subscribe(sampler[1])
        # Extend the train and label segments with the resampled outputs
        # while leaving the apply path untouched.
        return left.use(
            train=left.train.extend(tail=new_features),
            label=left.label.extend(tail=new_labels),
        )
  5. Insert the Balancer into the pipeline right after the TargetEncoder:
CATEGORICAL_COLUMNS = ... # Keep original
STACK = ... # Keep original

PIPELINE = (
    TargetEncoder(cols=CATEGORICAL_COLUMNS)
    >> Balancer()    # Inserting the Balancer
    >> MinMaxScaler()
    >> STACK
    >> LogisticRegression(warm_start=True, max_iter=1000, random_state=42)
)

# Registering the pipeline
project.setup(PIPELINE)
  6. SAVE THE avazuctr/pipeline.py FILE!
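
Before evaluating the full pipeline, here is the promised standalone look at what the underlying imbalanced-learn sampler does, using a tiny made-up dataset:

import pandas as pd
from imblearn import over_sampling

features = pd.DataFrame({"feature": [1, 1, 1, 0]})
labels = pd.Series([1, 1, 1, 0], name="click")

# The minority class (0) gets randomly duplicated up to the majority count.
resampled, rebalanced = over_sampling.RandomOverSampler(
    random_state=42
).fit_resample(features, labels)
print(rebalanced.value_counts())  # 1: 3, 0: 3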

Evaluating the Change¶

Let's quickly confirm the data is now balanced:

In [6]:
from forml import project
from avazuctr import pipeline
PROJECT = project.open(path=".", package="avazuctr")
PROJECT.components.source.bind(
    pipeline.Balancer()
).launcher.train().labels.value_counts()
Out[6]:
click
0    417919
1    417919
Name: count, dtype: int64

Good, let's kick off the evaluation:

In [7]:
! forml project eval
running eval
0.38634029551291765

That's another improvement!

In [8]:
! git add avazuctr/pipeline.py

Reviewing the Final Task Graph¶

To visualize the final task graph:

In [9]:
! forml project train -R graphviz
running train

forml-solution-avazuctr/forml.dot.svg: Final Task Graph

Adding the Balancer Unit Test¶

Let's also add the previously implemented Balancer unit test to the project tests:

In [10]:
! touch tests/test_pipeline.py

Edit the created test_pipeline.py and implement the unit test:

  1. Open the test_pipeline.py.
  2. Update it with the code below providing the TestBalancer unit test implementation (the case trains the operator on an imbalanced 2:1 sample and expects the minority row to be duplicated):
from forml import testing
from avazuctr import pipeline

class TestBalancer(testing.operator(pipeline.Balancer)):
    """Balancer unit tests."""

    default_oversample = (
        testing.Case()
        .train([[1], [1], [0]], [1, 1, 0])
        .returns([[1], [1], [0], [0]], labels=[1, 1, 0, 0])
    )
  3. SAVE THE test_pipeline.py FILE!
In [11]:
! git add tests/test_pipeline.py

Let's trigger the project tests:

In [12]:
! forml project test 2>&1 | tail -n 20
    result = self.execute(*args)
  File "/usr/local/lib/python3.10/site-packages/forml/flow/_code/target/user.py", line 196, in execute
    return self.action(self.builder(), *args)
  File "/usr/local/lib/python3.10/site-packages/forml/flow/_code/target/user.py", line 150, in __call__
    result = actor.apply(*args)
  File "/usr/local/lib/python3.10/site-packages/forml/pipeline/wrap/_actor.py", line 166, in apply
    return self.Apply(*features, **self._kwargs)
  File "/opt/forml/workspace/3-solution/forml-solution-avazuctr/avazuctr/source.py", line 11, in TimeExtractor
    assert "hour" in features.columns, "Missing column: hour"
AssertionError: Missing column: hour
ok
test_valid_extraction (tests.test_source.TestTimeExtractor)
Test of Valid Extraction ... ok
test_default_oversample (tests.test_pipeline.TestBalancer)
Test of Default Oversample ... ok

----------------------------------------------------------------------
Ran 3 tests in 5.571s

OK