import pandas
from forml import project
from forml.pipeline import payload, wrap
from openschema import kaggle as schema


@wrap.Operator.mapper
@wrap.Actor.apply
def TimeExtractor(features: pandas.DataFrame) -> pandas.DataFrame:
    """Transformer extracting temporal features from the original ``hour`` column.

    Args:
        features: Frame containing an ``hour`` column that supports the pandas ``.dt``
            accessor (i.e. datetime-like values).

    Returns:
        A new frame in which ``hour`` is replaced by the hour-of-day and the
        ``dayofweek``, ``day`` and ``month`` columns are appended. The input frame
        itself is left unmodified.

    Raises:
        AssertionError: If the ``hour`` column is missing.
    """
    # Raised explicitly (instead of using the ``assert`` statement) so the check is not
    # stripped under ``python -O``; the exception type and message stay the same.
    if "hour" not in features.columns:
        raise AssertionError("Missing column: hour")
    time = features["hour"]
    # ``assign`` returns a new frame rather than writing into the caller's one. The
    # existing ``hour`` column is replaced in its original position and the new columns
    # are appended in keyword order (dayofweek, day, month).
    return features.assign(
        dayofweek=time.dt.dayofweek,
        day=time.dt.day,
        hour=time.dt.hour,  # replacing the original column
        month=time.dt.month,
    )

# Placeholders in this excerpt: the three definitions below are elided here and are
# meant to stay as they already are in the project file.
OUTCOMES = ...   # Keep original
ORDINAL = ...    # Keep original
STATEMENT = ...  # Keep original

# Setting up the source descriptor: the query built from STATEMENT, OUTCOMES and
# ORDINAL is chained (using ``>>``) with two follow-up steps.
SOURCE = (
    project.Source.query(STATEMENT, OUTCOMES, ordinal=ORDINAL)
    # NOTE(review): presumably converts the query result into a pandas DataFrame, which
    # is what TimeExtractor is annotated to accept - confirm against the ForML docs.
    >> payload.ToPandas()
    >> TimeExtractor()  # Applying the temporal feature extraction
)

# Registering the descriptor
project.setup(SOURCE)


! git add avazuctr/source.py

from forml import project
from forml.pipeline import wrap

with wrap.importer():
    from category_encoders import TargetEncoder
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import MinMaxScaler

# Names of the feature columns handed to the TargetEncoder through its ``cols`` argument.
CATEGORICAL_COLUMNS = [
    "C1", "banner_pos", "site_id", "site_domain",
    "site_category", "app_id", "app_domain", "app_category",
    "device_model", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"
]

# Base model workflow composed with ``>>`` in this order: target encoding of the listed
# columns, min-max scaling, and a logistic regression estimator.
PIPELINE = (
    TargetEncoder(cols=CATEGORICAL_COLUMNS)
    >> MinMaxScaler()
    # Fixed random_state and an explicit max_iter of 1000 iterations.
    >> LogisticRegression(max_iter=1000, random_state=42)
)

# Registering the pipeline
project.setup(PIPELINE)


! git add avazuctr/pipeline.py


! forml project eval

running eval
0.39313604609251457


! touch tests/test_source.py

import pandas
from forml import testing

from avazuctr import source

class TestTimeExtractor(testing.operator(source.TimeExtractor)):
    """Unit testing the stateless TimeExtractor transformer.

    The scenarios are declared as class attributes built from ``testing.Case``; the
    base class produced by ``testing.operator`` is what runs them.
    """

    # Dataset fixtures
    # Frame without any columns, used to trigger the missing-column check.
    EMPTY = pandas.DataFrame()
    # Three timestamps in different months, on different weekdays and hours.
    INPUT = pandas.DataFrame({"hour": [
        pandas.Timestamp("2023-02-01 14:12:10"),
        pandas.Timestamp("2023-03-04 06:13:27"),
        pandas.Timestamp("2023-04-10 12:00:00")
    ]})
    # Expected components of the INPUT timestamps; ``dayofweek`` follows the pandas
    # convention of Monday=0 (Wed=2, Sat=5, Mon=0).
    # NOTE(review): the int32 cast presumably matches the dtype returned by the ``.dt``
    # accessors in the pandas version in use - confirm when upgrading pandas.
    EXPECTED = pandas.DataFrame({
        "hour": [14, 6, 12], "dayofweek": [2, 5, 0],
        "day": [1, 4, 10], "month": [2, 3, 4]
    }).astype("int32")

    # Test scenarios
    # Applying the operator to a frame lacking ``hour`` must raise the AssertionError
    # carrying the "Missing column: hour" message.
    missing_column = (
        testing.Case().apply(EMPTY).raises(AssertionError, "Missing column: hour")
    )
    # Applying the operator to INPUT must return EXPECTED, compared using
    # ``testing.pandas_equals``.
    valid_extraction = (
        testing.Case().apply(INPUT).returns(EXPECTED, testing.pandas_equals)
    )


! git add tests/test_source.py


! forml project test 2>&1 | tail -n 20

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/forml/flow/_code/target/__init__.py", line 56, in __call__
    result = self.execute(*args)
  File "/usr/local/lib/python3.10/site-packages/forml/flow/_code/target/user.py", line 196, in execute
    return self.action(self.builder(), *args)
  File "/usr/local/lib/python3.10/site-packages/forml/flow/_code/target/user.py", line 150, in __call__
    result = actor.apply(*args)
  File "/usr/local/lib/python3.10/site-packages/forml/pipeline/wrap/_actor.py", line 166, in apply
    return self.Apply(*features, **self._kwargs)
  File "/opt/forml/workspace/3-solution/forml-solution-avazuctr/avazuctr/source.py", line 11, in TimeExtractor
    assert "hour" in features.columns, "Missing column: hour"
AssertionError: Missing column: hour
ok
test_valid_extraction (tests.test_source.TestTimeExtractor)
Test of Valid Extraction ... ok

----------------------------------------------------------------------
Ran 2 tests in 3.565s

OK

Formal Base Model

Updating the Project Code Base

Adding TimeExtractor to source.py

Adding the Base Model to pipeline.py

Perform the Development Evaluation

Adding Unit Test for TimeExtractor