Source code for eemeter.caltrack.hourly

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""

   Copyright 2014-2023 OpenEEmeter contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

"""
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

from ..features import (
    compute_time_features,
    compute_temperature_bin_features,
    compute_occupancy_feature,
    merge_features,
)
from ..metrics import ModelMetrics
from ..segmentation import CalTRACKSegmentModel, SegmentedModel, fit_model_segments
from ..warnings import EEMeterWarning


__all__ = (
    "CalTRACKHourlyModelResults",
    "CalTRACKHourlyModel",
    "caltrack_hourly_fit_feature_processor",
    "caltrack_hourly_prediction_feature_processor",
    "fit_caltrack_hourly_model_segment",
    "fit_caltrack_hourly_model",
)


[docs]class CalTRACKHourlyModelResults(object):
    """Contains information about the chosen model.

    Attributes
    ----------
    status : :any:`str`
        A string indicating the status of this result. Possible statuses:

        - ``'NO DATA'``: No baseline data was available.
        - ``'NO MODEL'``: A complete model could not be constructed.
        - ``'SUCCESS'``: A model was constructed.
    method_name : :any:`str`
        The name of the method used to fit the baseline model.
    model : :any:`eemeter.CalTRACKHourlyModel` or :any:`None`
        The selected model, if any.
    warnings : :any:`list` of :any:`eemeter.EEMeterWarning`
        A list of any warnings reported during the model selection and fitting
        process.
    metadata : :any:`dict`
        An arbitrary dictionary of metadata to be associated with this result.
        This can be used, for example, to tag the results with attributes like
        an ID::

            {
                'id': 'METER_12345678',
            }

    settings : :any:`dict`
        A dictionary of settings used by the method.
    totals_metrics : :any:`ModelMetrics`
        A ModelMetrics object, if one is calculated and associated with this
        model. (This initializes to None.) The ModelMetrics object contains
        model fit information and descriptive statistics about the underlying data,
        with that data expressed as period totals.
    avgs_metrics : :any:`ModelMetrics`
        A ModelMetrics object, if one is calculated and associated with this
        model. (This initializes to None.) The ModelMetrics object contains
        model fit information and descriptive statistics about the underlying data,
        with that data expressed as daily averages.
    """

    def __init__(
        self, status, method_name, model=None, warnings=[], metadata=None, settings=None
    ):
        self.status = status
        self.method_name = method_name

        self.model = model

        self.warnings = warnings

        if metadata is None:
            metadata = {}
        self.metadata = metadata

        if settings is None:
            settings = {}
        self.settings = settings

        self.totals_metrics = None
        self.avgs_metrics = None

    def __repr__(self):
        return "CalTRACKHourlyModelResults(status='{}', method_name='{}')".format(
            self.status, self.method_name
        )

[docs]    def json(self, with_candidates=False):
        """Return a JSON-serializable representation of this result.

        The output of this function can be converted to a serialized string
        with :any:`json.dumps`.
        """

        def _json_or_none(obj):
            return None if obj is None else obj.json()

        def _json_or_none_in_dict(obj):
            return (
                None
                if obj is None
                else {key: _json_or_none(val) for key, val in obj.items()}
            )

        data = {
            "status": self.status,
            "method_name": self.method_name,
            "model": _json_or_none(self.model),
            "warnings": [w.json() for w in self.warnings],
            "metadata": self.metadata,
            "settings": self.settings,
            "totals_metrics": _json_or_none_in_dict(self.totals_metrics),
            "avgs_metrics": _json_or_none_in_dict(self.avgs_metrics),
        }
        return data

[docs]    @classmethod
    def from_json(cls, data):
        """Loads a JSON-serializable representation into the model state.

        The input of this function is a dict which can be the result
        of :any:`json.loads`.
        """

        # "model" is a CalTRACKHourlyModel that was serialized
        model = None
        d = data.get("model")
        if d:
            model = CalTRACKHourlyModel.from_json(d)

        c = cls(
            data.get("status"),
            data.get("method_name"),
            model=model,
            warnings=data.get("warnings"),
            metadata=data.get("metadata"),
            settings=data.get("settings"),
        )

        # Note the metrics do not contain all the data needed
        # for reconstruction (like the input pandas) ...
        d = data.get("avgs_metrics")
        if d:
            c.avgs_metrics = ModelMetrics.from_json(d)  # pragma: no cover
        d = data.get("totals_metrics")
        if d:
            c.totals_metrics = ModelMetrics.from_json(d)
        return c

[docs]    def predict(self, prediction_index, temperature_data, **kwargs):
        """Predict over a particular index using temperature data.

        Parameters
        ----------
        prediction_index : :any:`pandas.DatetimeIndex`
            Time period over which to predict.
        temperature_data : :any:`pandas.DataFrame`
            Hourly temperature data to use for prediction. Time period should match
            the ``prediction_index`` argument.
        **kwargs
            Extra keyword arguments to send to self.model.predict

        Returns
        -------
        prediction : :any:`pandas.DataFrame`
            The predicted usage values.
        """
        return self.model.predict(prediction_index, temperature_data, **kwargs)


[docs]class CalTRACKHourlyModel(SegmentedModel):
    """An object which holds CalTRACK Hourly model data and metadata, and
    which can be used for prediction.

    Attributes
    ----------
    segment_models : :any:`dict` of `eemeter.CalTRACKSegmentModel`
        Dictionary of models for each segment, keys are segment names.
    occupancy_lookup : :any:`pandas.DataFrame`
        A dataframe with occupancy flags for each hour of the week and each segment.
        Segment names are columns, occupancy flags are 0 or 1.
    occupied_temperature_bins : :any:`pandas.DataFrame`
        A dataframe of bin endpoint flags for each segment. Segment names are columns.
    unoccupied_temperature_bins : :any:`pandas.DataFrame`
        Ditto for the unoccupied mode.
    """

    def __init__(
        self,
        segment_models,
        occupancy_lookup,
        occupied_temperature_bins,
        unoccupied_temperature_bins,
    ):
        self.occupancy_lookup = occupancy_lookup
        self.occupied_temperature_bins = occupied_temperature_bins
        self.unoccupied_temperature_bins = unoccupied_temperature_bins
        super(CalTRACKHourlyModel, self).__init__(
            segment_models=segment_models,
            prediction_segment_type="one_month",
            prediction_segment_name_mapping={
                "jan": "dec-jan-feb-weighted",
                "feb": "jan-feb-mar-weighted",
                "mar": "feb-mar-apr-weighted",
                "apr": "mar-apr-may-weighted",
                "may": "apr-may-jun-weighted",
                "jun": "may-jun-jul-weighted",
                "jul": "jun-jul-aug-weighted",
                "aug": "jul-aug-sep-weighted",
                "sep": "aug-sep-oct-weighted",
                "oct": "sep-oct-nov-weighted",
                "nov": "oct-nov-dec-weighted",
                "dec": "nov-dec-jan-weighted",
            },
            prediction_feature_processor=caltrack_hourly_prediction_feature_processor,
            prediction_feature_processor_kwargs={
                "occupancy_lookup": self.occupancy_lookup,
                "occupied_temperature_bins": self.occupied_temperature_bins,
                "unoccupied_temperature_bins": self.unoccupied_temperature_bins,
            },
        )

[docs]    def json(self):
        """Return a JSON-serializable representation of this result.

        The output of this function can be converted to a serialized string
        with :any:`json.dumps`.
        """
        data = super(CalTRACKHourlyModel, self).json()
        data.update(
            {
                "occupancy_lookup": self.occupancy_lookup.to_json(orient="split"),
                "occupied_temperature_bins": self.occupied_temperature_bins.to_json(
                    orient="split"
                ),
                "unoccupied_temperature_bins": self.unoccupied_temperature_bins.to_json(
                    orient="split"
                ),
            }
        )
        return data

[docs]    @classmethod
    def from_json(cls, data):
        """Loads a JSON-serializable representation into the model state.

        The input of this function is a dict which can be the result
        of :any:`json.loads`.
        """

        segment_models = [
            CalTRACKSegmentModel.from_json(s) for s in data.get("segment_models")
        ]

        occupancy_lookup = pd.read_json(data.get("occupancy_lookup"), orient="split")
        occupancy_lookup.index = occupancy_lookup.index.astype("category")

        c = cls(
            segment_models,
            occupancy_lookup,
            pd.read_json(data.get("occupied_temperature_bins"), orient="split"),
            pd.read_json(data.get("unoccupied_temperature_bins"), orient="split"),
        )

        return c


[docs]def caltrack_hourly_fit_feature_processor(
    segment_name,
    segmented_data,
    occupancy_lookup,
    occupied_temperature_bins,
    unoccupied_temperature_bins,
):
    """A function that takes in temperature data and returns a dataframe of
    features suitable for use with :any:`eemeter.fit_caltrack_hourly_model_segment`.
    Designed for use with :any:`eemeter.iterate_segmented_dataset`.

    Parameters
    ----------
    segment_name : :any:`str`
        The name of the segment.
    segmented_data : :any:`pandas.DataFrame`
        Hourly temperature data for the segment.
    occupancy_lookup : :any:`pandas.DataFrame`
        A dataframe with occupancy flags for each hour of the week and each segment.
        Segment names are columns, occupancy flags are 0 or 1.
    occupied_temperature_bins : :any:`pandas.DataFrame`
        A dataframe of bin endpoint flags for each segment. Segment names are columns.
    unoccupied_temperature_bins : :any:`pandas.DataFrame`
        Ditto for the unoccupied mode.

    Returns
    -------
    features : :any:`pandas.DataFrame`
        A dataframe of features with the following columns:

        - 'meter_value': the observed meter value
        - 'hour_of_week': 0-167
        - 'bin_<0-6>_occupied': temp bin feature, or 0 if unoccupied
        - 'bin_<0-6>_unoccupied': temp bin feature or 0 in occupied
        - 'weight': 0.0 or 0.5 or 1.0
    """
    # get occupied feature
    hour_of_week = segmented_data.hour_of_week
    occupancy = occupancy_lookup[segment_name]
    occupancy_feature = compute_occupancy_feature(hour_of_week, occupancy)

    # get temperature bin features
    temperatures = segmented_data.temperature_mean
    occupied_bin_endpoints_list = (
        occupied_temperature_bins[segment_name]
        .index[occupied_temperature_bins[segment_name]]
        .tolist()
    )
    unoccupied_bin_endpoints_list = (
        unoccupied_temperature_bins[segment_name]
        .index[unoccupied_temperature_bins[segment_name]]
        .tolist()
    )
    occupied_temperature_bin_features = compute_temperature_bin_features(
        segmented_data.temperature_mean, occupied_bin_endpoints_list
    )
    occupied_temperature_bin_features[occupancy_feature == 0] = 0
    occupied_temperature_bin_features.rename(
        columns={
            c: "{}_occupied".format(c)
            for c in occupied_temperature_bin_features.columns
        },
        inplace=True,
    )
    unoccupied_temperature_bin_features = compute_temperature_bin_features(
        segmented_data.temperature_mean, unoccupied_bin_endpoints_list
    )
    unoccupied_temperature_bin_features[occupancy_feature == 1] = 0
    unoccupied_temperature_bin_features.rename(
        columns={
            c: "{}_unoccupied".format(c)
            for c in unoccupied_temperature_bin_features.columns
        },
        inplace=True,
    )

    # combine features
    return merge_features(
        [
            segmented_data[["meter_value", "hour_of_week"]],
            occupied_temperature_bin_features,
            unoccupied_temperature_bin_features,
            segmented_data.weight,
        ]
    )


[docs]def caltrack_hourly_prediction_feature_processor(
    segment_name,
    segmented_data,
    occupancy_lookup,
    occupied_temperature_bins,
    unoccupied_temperature_bins,
):
    """A function that takes in temperature data and returns a dataframe of
    features suitable for use inside :any:`eemeter.CalTRACKHourlyModel`.
    Designed for use with :any:`eemeter.iterate_segmented_dataset`.

    Parameters
    ----------
    segment_name : :any:`str`
        The name of the segment.
    segmented_data : :any:`pandas.DataFrame`
        Hourly temperature data for the segment.
    occupancy_lookup : :any:`pandas.DataFrame`
        A dataframe with occupancy flags for each hour of the week and each segment.
        Segment names are columns, occupancy flags are 0 or 1.
    occupied_temperature_bins : :any:`pandas.DataFrame`
        A dataframe of bin endpoint flags for each segment. Segment names are columns.
    unoccupied_temperature_bins : :any:`pandas.DataFrame`
        Ditto for the unoccupied mode.

    Returns
    -------
    features : :any:`pandas.DataFrame`
        A dataframe of features with the following columns:

        - 'hour_of_week': 0-167
        - 'bin_<0-6>_occupied': temp bin feature, or 0 if unoccupied
        - 'bin_<0-6>_unoccupied': temp bin feature or 0 in occupied
        - 'weight': 1
    """
    # hour of week feature
    hour_of_week_feature = compute_time_features(
        segmented_data.index, hour_of_week=True, day_of_week=False, hour_of_day=False
    )

    # occupancy feature
    occupancy = occupancy_lookup[segment_name]
    occupancy_feature = compute_occupancy_feature(
        hour_of_week_feature.hour_of_week, occupancy
    )

    # get temperature bin features
    temperatures = segmented_data
    occupied_bin_endpoints_list = (
        occupied_temperature_bins[segment_name]
        .index[occupied_temperature_bins[segment_name]]
        .tolist()
    )
    unoccupied_bin_endpoints_list = (
        unoccupied_temperature_bins[segment_name]
        .index[unoccupied_temperature_bins[segment_name]]
        .tolist()
    )
    occupied_temperature_bin_features = compute_temperature_bin_features(
        segmented_data.temperature_mean, occupied_bin_endpoints_list
    )
    occupied_temperature_bin_features[occupancy_feature == 0] = 0
    occupied_temperature_bin_features.rename(
        columns={
            c: "{}_occupied".format(c)
            for c in occupied_temperature_bin_features.columns
        },
        inplace=True,
    )
    unoccupied_temperature_bin_features = compute_temperature_bin_features(
        segmented_data.temperature_mean, unoccupied_bin_endpoints_list
    )
    unoccupied_temperature_bin_features[occupancy_feature == 1] = 0
    unoccupied_temperature_bin_features.rename(
        columns={
            c: "{}_unoccupied".format(c)
            for c in unoccupied_temperature_bin_features.columns
        },
        inplace=True,
    )

    # combine features
    return merge_features(
        [
            hour_of_week_feature,
            occupied_temperature_bin_features,
            unoccupied_temperature_bin_features,
            segmented_data.weight,
        ]
    )


[docs]def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    """Fit a model for a single segment.

    Parameters
    ----------
    segment_name : :any:`str`
        The name of the segment.
    segment_data : :any:`pandas.DataFrame`
        A design matrix for caltrack hourly, of the form returned by
        :any:`eemeter.caltrack_hourly_prediction_feature_processor`.

    Returns
    -------
    segment_model : :any:`CalTRACKSegmentModel`
        A model that represents the fitted model.
    """

    warnings = []
    if segment_data.dropna().empty:
        model = None
        formula = None
        model_params = None
        warnings.append(
            EEMeterWarning(
                qualified_name="eemeter.fit_caltrack_hourly_model_segment.no_nonnull_data",
                description="The segment contains either an empty dataset or all NaNs.",
                data={
                    "n_rows": segment_data.shape[0],
                    "n_rows_after_dropna": segment_data.dropna().shape[0],
                },
            )
        )

    else:

        def _get_hourly_model_formula(data):
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                "".join(
                    [" + {}".format(c) for c in data.columns if c.startswith("bin")]
                )
            )

        formula = _get_hourly_model_formula(segment_data)

        # remove categories that only have null or missing entries
        # this ensures that predictions will predict null
        segment_data["hour_of_week"] = pd.Categorical(
            segment_data["hour_of_week"],
            categories=segment_data["hour_of_week"].dropna().unique(),
            ordered=False,
        )
        model = smf.wls(formula=formula, data=segment_data, weights=segment_data.weight)
        model_params = {coeff: value for coeff, value in model.fit().params.items()}

    segment_model = CalTRACKSegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
    if model:
        this_segment_data = segment_data[segment_data.weight == 1]
        predicted_value = pd.Series(model.fit().predict(this_segment_data))
        segment_model.totals_metrics = ModelMetrics(
            this_segment_data.meter_value, predicted_value, len(model_params)
        )
    else:
        segment_model.totals_metrics = None

    return segment_model


[docs]def fit_caltrack_hourly_model(
    segmented_design_matrices,
    occupancy_lookup,
    occupied_temperature_bins,
    unoccupied_temperature_bins,
):
    """Fit a CalTRACK hourly model

    Parameters
    ----------
    segmented_design_matrices : :any:`dict` of :any:`pandas.DataFrame`
        A dictionary of dataframes of the form returned by
        :any:`eemeter.create_caltrack_hourly_segmented_design_matrices`
    occupancy_lookup : :any:`pandas.DataFrame`
        A dataframe with occupancy flags for each hour of the week and each segment.
        Segment names are columns, occupancy flags are 0 or 1.
    occupied_temperature_bins : :any:`pandas.DataFrame`
        A dataframe of bin endpoint flags for each segment. Segment names are columns.
    unoccupied_temperature_bins : :any:`pandas.DataFrame`
        Ditto for the unoccupied mode.

    Returns
    -------
    model : :any:`CalTRACKHourlyModelResults`
        Has a `model.predict` method which take input data and makes a prediction
        using this model.
    """
    segment_models = fit_model_segments(
        segmented_design_matrices, fit_caltrack_hourly_model_segment
    )
    all_warnings = [
        warning
        for segment_model in segment_models
        for warning in segment_model.warnings
    ]

    model = CalTRACKHourlyModel(
        segment_models,
        occupancy_lookup,
        occupied_temperature_bins,
        unoccupied_temperature_bins,
    )

    model_results = CalTRACKHourlyModelResults(
        status="SUCCEEDED",
        method_name="caltrack_hourly",
        warnings=all_warnings,
        model=model,
    )
    model_results.totals_metrics = {
        seg_model.segment_name: seg_model.totals_metrics for seg_model in segment_models
    }
    return model_results
Source code for eemeter.caltrack.hourly

Useful Links

Table of Contents