# Source code for ebm.validators

"""
Pandera validators for ebm input files.
"""
import itertools

import numpy as np
import pandas as pd
import pandera as pa

from ebm.model.building_category import BuildingCategory, RESIDENTIAL, NON_RESIDENTIAL
from ebm.model.building_condition import BuildingCondition
from ebm.model.column_operations import explode_unique_columns, explode_column_alias
from ebm.model.data_classes import YearRange
from ebm.model.energy_purpose import EnergyPurpose
from ebm.model.heating_systems import HeatingSystems



[docs]
def check_building_category(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are members of BuildingCategory.

    Parameters
    ----------
    value: pd.Series
        Series of values to test for membership in BuildingCategory

    Returns
    -------
    pd.Series of bool values, True where the entry matches a BuildingCategory member

    """
    valid_categories = list(BuildingCategory)
    return value.isin(valid_categories)



[docs]
def check_default_building_category(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are either a BuildingCategory member or the literal 'default'.

    Parameters
    ----------
    value: pd.Series
        Series of values to test against BuildingCategory and 'default'

    Returns
    -------
    pd.Series of bool values, True where the entry is accepted

    """
    accepted = [*BuildingCategory, 'default']
    return value.isin(accepted)



[docs]
def check_default_building_category_with_group(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are a BuildingCategory member, one of the group
    constants RESIDENTIAL / NON_RESIDENTIAL, or the literal 'default'.

    Parameters
    ----------
    value: pd.Series
        Series of values to test against BuildingCategory, RESIDENTIAL, NON_RESIDENTIAL and 'default'

    Returns
    -------
    pd.Series of bool values, True where the entry is accepted
    """
    accepted = [*BuildingCategory, 'default', RESIDENTIAL, NON_RESIDENTIAL]
    return value.isin(accepted)




[docs]
def check_building_condition(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are members of BuildingCondition.

    Parameters
    ----------
    value: pd.Series
        Series of values to test for membership in BuildingCondition

    Returns
    -------
    pd.Series of bool values, True where the entry matches a BuildingCondition member

    """
    valid_conditions = list(BuildingCondition)
    return value.isin(valid_conditions)




[docs]
def check_existing_building_conditions(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are among the conditions returned by
    BuildingCondition.existing_conditions().

    Parameters
    ----------
    value: pd.Series
        Series of values to test against BuildingCondition.existing_conditions()

    Returns
    -------
    pd.Series of bool values, True where the entry is an 'existing' condition

    """
    existing = list(BuildingCondition.existing_conditions())
    return value.isin(existing)




[docs]
def check_all_existing_building_conditions_present(df: pd.DataFrame):
    """
    Verify that every group of rows sharing 'building_category', 'building_code' and 'purpose'
    lists exactly the set of conditions returned by BuildingCondition.existing_conditions()
    in its 'building_condition' column.

    Parameters
    ----------
    df: pd.Dataframe
        Frame with the columns 'building_category', 'building_code', 'purpose' and 'building_condition'

    Returns
    -------
    bool
        True when the set of conditions in every group equals the set of existing conditions
        (also True when there are no groups), otherwise False
    """
    conditions_per_group = df.groupby(['building_category', 'building_code', 'purpose'])['building_condition']
    required = set(BuildingCondition.existing_conditions())
    return all(set(found) == required for _, found in conditions_per_group)




[docs]
def check_energy_purpose(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are members of EnergyPurpose.

    Args:
        value: Series of values to test for membership in EnergyPurpose

    Returns:
        pd.Series: Series of bool values, True where the entry matches an EnergyPurpose member
    """
    valid_purposes = list(EnergyPurpose)
    return value.isin(valid_purposes)




[docs]
def check_default_energy_purpose(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are either an EnergyPurpose member or the literal 'default'.

    Args:
        value: Series of values to test against EnergyPurpose and 'default'

    Returns:
        pd.Series: Series of bool values, True where the entry is accepted
    """
    accepted = [*EnergyPurpose, 'default']
    return value.isin(accepted)




[docs]
def check_building_code(value: str) -> bool:
    """
    A crude check to determine if value is a 'building_code'

    The value is accepted when it is a string containing the substring 'TEK'.
    Non-string values (for example None or NaN from an empty cell) are rejected
    instead of raising TypeError, so that the check reports a validation failure.

    Args:
        value (str): A string to check if it's a building_code

    Returns:
        bool: True when the function thinks that value might be a building_code
    """
    return isinstance(value, str) and 'TEK' in value




[docs]
def check_default_building_code(value: str) -> bool:
    """
    A crude check to determine if value is a 'building_code' or the literal 'default'

    Args:
        value (str): A string to check

    Returns:
        bool: True when check_building_code accepts the value or the value equals 'default'
    """
    if check_building_code(value):
        return True
    return value == 'default'


#TODO: edge cases?

[docs]
def check_overlapping_building_code_periods(df: pd.DataFrame) -> pd.Series:
    """
    Check that the building code periods follow each other without gaps or overlaps.

    Rows are ordered by 'period_end_year'. A row passes when its 'period_end_year' + 1
    equals the 'period_start_year' of the next row in that order. The row with the
    highest 'period_end_year' has no successor and always passes.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns 'period_start_year' and 'period_end_year'

    Returns
    -------
    pd.Series of bool values
        One value per row of ``df``, carrying the index of ``df`` so that a failure is
        reported on the row whose end year does not connect to the following period.
        An empty frame gives an empty series.
    """
    end_years = df["period_end_year"].to_numpy()
    start_years = df["period_start_year"].to_numpy()

    # Positions of the rows in ascending period_end_year order.
    order = np.argsort(end_years, kind="stable")

    # Default to True so the last period (and a single-row frame) passes.
    is_contiguous = np.ones(len(df), dtype=bool)
    if len(df) > 1:
        current, following = order[:-1], order[1:]
        # Write each result back to the row's original position.
        is_contiguous[current] = end_years[current] + 1 == start_years[following]

    return pd.Series(is_contiguous, index=df.index)




[docs]
def check_building_category_share(values: pd.DataFrame) -> pd.Series:
    """
    Makes sure that the sum of values.new_house_share + values.new_apartment_block_share is 1.0

    The comparison uses a small absolute tolerance, because exact float equality rejects
    valid inputs such as 0.7 + 0.3 (which evaluates to 0.9999999999999999).

    Args:
        values (pd.DataFrame): A dataframe with new_house_share and new_apartment_block_share

    Returns:
        pd.Series: A series of bool, indexed like ``values``, that is True where
            new_house_share + new_apartment_block_share equals 1.0 within 1e-9
    """
    total_share = values.new_house_share + values.new_apartment_block_share
    # rtol=0 keeps the tolerance purely absolute; NaN sums compare as False.
    return pd.Series(np.isclose(total_share, 1.0, rtol=0.0, atol=1e-9), index=values.index)




[docs]
def create_residential_area_checks():
    """
    Build the list of column checks shared by the house and apartment_block columns.

    The checks are, in order:
        - the first two rows must not be null
        - every row after the first two must be null
        - values must be greater than or equal to 0.0

    Returns
    -------
    List[pa.Check]
    """
    first_two_rows_filled = pa.Check(
        lambda column: column.iloc[:2].notnull().all(),
        element_wise=False,
        error='Expects number in first two rows for house',
    )
    remaining_rows_empty = pa.Check(
        lambda column: column.iloc[2:].isnull().all(),
        element_wise=False,
        error='Expects empty in the last four years for house',
    )
    non_negative = pa.Check.greater_than_or_equal_to(0.0)
    return [first_two_rows_filled, remaining_rows_empty, non_negative]




[docs]
def check_heating_systems(value: pd.Series) -> pd.Series:
    """
    Flag which entries of ``value`` are members of HeatingSystems.

    Parameters
    ----------
    value: pd.Series
        Series of values to test for membership in HeatingSystems

    Returns
    -------
    pd.Series of bool values, True where the entry matches a HeatingSystems member

    """
    valid_systems = list(HeatingSystems)
    return value.isin(valid_systems)




[docs]
def check_sum_of_heating_system_shares_equal_1(df: pd.DataFrame):
    """
    Check that 'heating_system_share' sums to 1 for every combination of
    'building_category' and 'building_code'.

    The sum per group is converted to percent and rounded to 4 decimals before it is
    compared with 100.0.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns 'building_category', 'building_code' and 'heating_system_share'

    Returns
    -------
    pd.Series of bool values
        One value per (building_category, building_code) group, indexed by that pair
    """
    decimals = 4
    group_totals = df.groupby(by=['building_category', 'building_code'])[['heating_system_share']].sum()
    percent = (group_totals['heating_system_share'] * 100).round(decimals)
    return percent == 100.0




[docs]
def make_building_purpose(years: YearRange | None = None) -> pd.DataFrame:
    """
    Build a dataframe holding the cartesian product of every BuildingCategory, a fixed list of
    building codes and every EnergyPurpose, optionally multiplied by the given years.

    Every row gets the building_condition 'original_condition'.

    Parameters
    ----------
    years : YearRange, optional
        When given (and truthy), a 'year' column is added and the combinations are
        repeated for each year

    Returns
    -------
    pd.DataFrame
        Columns 'building_category', 'building_code', 'building_condition', 'purpose'
        and, when years are supplied, 'year'
    """
    building_codes = ['PRE_TEK49', 'TEK49', 'TEK69', 'TEK87', 'TEK97', 'TEK07', 'TEK10', 'TEK17']
    dimensions = [list(BuildingCategory), building_codes, EnergyPurpose]
    headers = ['building_category', 'building_code', 'building_condition', 'purpose']

    if years:
        dimensions.append(years)
        headers.append('year')

    # ``trailing`` is empty without years and holds the single year value otherwise.
    rows = [
        [category, code, 'original_condition', purpose, *trailing]
        for category, code, purpose, *trailing in itertools.product(*dimensions)
    ]

    return pd.DataFrame(data=rows, columns=headers)




[docs]
def behaviour_factor_parser(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand a behaviour factor input frame into one row per building_category,
    building_code, purpose and year for the model years 2020 to 2050.

    Steps:
        - missing 'start_year' / 'end_year' columns or values are filled with the first
          and last model year, a missing 'function' with 'noop' and a missing
          'parameter' column with 0.0
        - the rows are passed through explode_unique_columns and explode_column_alias
          (the latter with purpose alias 'default' and all EnergyPurpose members)
        - every row is repeated for each year from start_year to end_year inclusive
        - rows with function 'improvement_at_end_year' get a behaviour_factor linearly
          interpolated from behaviour_factor to parameter over those years
        - rows with function 'yearly_reduction' are handled by calculate_yearly_reduction
        - the result is left-joined onto all combinations from make_building_purpose and
          combinations without a value get behaviour_factor 1.0

    The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw behaviour factor input. Must contain 'building_category', 'building_code',
        'purpose' and 'behaviour_factor'; 'start_year', 'end_year', 'function' and
        'parameter' are optional

    Returns
    -------
    pd.DataFrame
        The joined frame with the index reset to columns
    """
    model_years = YearRange(2020, 2050)
    all_combinations = make_building_purpose(years=model_years)

    # Work on a copy: the column assignments below would otherwise write into the caller's frame.
    df = df.copy()

    if 'start_year' not in df.columns:
        df = df.assign(**{'start_year': model_years.start})
    if 'end_year' not in df.columns:
        df = df.assign(**{'end_year': model_years.end})
    if 'function' not in df.columns:
        df = df.assign(function='noop')
    else:
        df['function'] = df.function.fillna('noop')
    if 'parameter' not in df.columns:
        df = df.assign(parameter=0.0)

    df['start_year'] = df.start_year.fillna(model_years.start).astype(int)
    df['end_year'] = df.end_year.fillna(model_years.end).astype(int)

    unique_columns = ['building_category', 'building_code', 'purpose', 'start_year', 'end_year']
    behaviour_factor = explode_unique_columns(df,
                                              unique_columns=unique_columns)

    behaviour_factor = explode_column_alias(behaviour_factor,
                       column='purpose',
                       values=[p for p in EnergyPurpose],
                       alias='default',
                       de_dup_by=unique_columns)

    # One list entry per year in [start_year, end_year]; both lists have the same length
    # so they can be exploded together.
    behaviour_factor['year'] = behaviour_factor.apply(
        lambda row: range(row.start_year, row.end_year+1), axis=1)
    behaviour_factor['interpolation'] = behaviour_factor.apply(
        lambda row: np.linspace(row.behaviour_factor, row.parameter, num=row.end_year+1-row.start_year), axis=1)

    behaviour_factor = behaviour_factor.explode(['year', 'interpolation'])

    behaviour_factor['year'] = behaviour_factor['year'].astype(int)

    interpolation_slice = (behaviour_factor.function == 'improvement_at_end_year') & (~behaviour_factor.interpolation.isna())
    behaviour_factor.loc[interpolation_slice, 'behaviour_factor'] = behaviour_factor.loc[
        interpolation_slice, 'interpolation'].astype(float)

    # The original called sort_values() here and discarded the result, so the frame was
    # never sorted; the dead statement has been removed and the row order is unchanged.

    behaviour_factor = calculate_yearly_reduction(behaviour_factor)

    behaviour_factor=behaviour_factor.set_index(['building_category', 'building_code', 'purpose', 'year'], drop=True)
    all_combinations=all_combinations.set_index(['building_category', 'building_code', 'purpose', 'year'], drop=True)

    joined = all_combinations.join(behaviour_factor, how='left')
    joined.behaviour_factor = joined.behaviour_factor.fillna(1.0)
    return joined.reset_index()




[docs]
def calculate_yearly_reduction(df):
    """
    Apply a compounding yearly reduction to rows whose 'function' is 'yearly_reduction'.

    For those rows behaviour_factor becomes
    ``behaviour_factor * (1.0 - parameter) ** (year - start_year)``. All other rows are
    left untouched.

    Rows are selected with a boolean mask rather than by index label, so the result is
    also correct when the index contains duplicate labels (as it does after
    ``DataFrame.explode``): a row that shares a label with a 'yearly_reduction' row but
    has another function is no longer changed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with the columns 'function', 'behaviour_factor', 'parameter', 'year' and
        'start_year'. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same frame object that was passed in
    """
    is_reduction = df['function'] == 'yearly_reduction'
    selected = df.loc[is_reduction]
    elapsed_years = selected.year - selected.start_year
    reduced = selected.behaviour_factor * ((1.0 - selected.parameter) ** elapsed_years)
    # Assign raw values: the rows are already in mask order, no index alignment is needed.
    df.loc[is_reduction, 'behaviour_factor'] = reduced.to_numpy()
    return df



# Schema for the behaviour factor input; behaviour_factor_parser is registered as the
# dataframe-level parser.
energy_need_behaviour_factor = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str),
        'building_code': pa.Column(str),
        'purpose': pa.Column(str),
        'year': pa.Column(int, required=False),
        'behaviour_factor': pa.Column(float),
    },
    parsers=pa.Parser(behaviour_factor_parser),
)

# Schema 'area_parameters': area per building_category and building_code; area is
# coerced to float and must be greater than 0.
area = pa.DataFrameSchema(
    name='area_parameters',
    columns={
        'building_category': pa.Column(str, checks=[pa.Check(check_building_category)]),
        'building_code': pa.Column(str, checks=[pa.Check(check_building_code, element_wise=True)]),
        'area': pa.Column(float, coerce=True, checks=[pa.Check.greater_than(0)]),
    },
)

# Schema 'building_code_parameters': one row per building_code (unique) with its
# building year and the period it covers. Two frame-level checks compare the period
# columns within a row and across consecutive periods.
building_code_parameters = pa.DataFrameSchema(
    name='building_code_parameters',
    columns={
        'building_code': pa.Column(
            str,
            unique=True,
            checks=[pa.Check(check_building_code, element_wise=True)],
        ),
        'building_year': pa.Column(
            int,
            checks=[
                pa.Check.greater_than_or_equal_to(1940),
                pa.Check.less_than_or_equal_to(2070),
            ],
        ),
        'period_start_year': pa.Column(
            int,
            checks=[
                pa.Check.greater_than_or_equal_to(0),
                pa.Check.less_than_or_equal_to(2070),
            ],
        ),
        'period_end_year': pa.Column(
            int,
            checks=[
                pa.Check.greater_than_or_equal_to(1940),
                pa.Check.less_than_or_equal_to(2070, error='period_end_year should be 2070 or lower'),
                pa.Check.between(1940, 2070, error='period_end_year should be between 1940 and 2070'),
            ],
        ),
    },
    checks=[
        pa.Check(
            lambda frame: frame['period_end_year'] > frame['period_start_year'],
            error='period_end_year should be greater than period_start_year',
        ),
        pa.Check(
            check_overlapping_building_code_periods,
            error='building_code periods do not overlap',
        ),
    ],
)


# Schema 'construction_building_category_yearly': yearly values for house and
# apartment_block; both columns are nullable and share the checks from
# create_residential_area_checks().
area_new_residential_buildings = pa.DataFrameSchema(
    name='construction_building_category_yearly',
    columns={
        'year': pa.Column(int),
        'house': pa.Column(
            pa.Float64,
            nullable=True,
            checks=create_residential_area_checks(),
        ),
        'apartment_block': pa.Column(
            pa.Float64,
            nullable=True,
            checks=create_residential_area_checks(),
        ),
    },
)


# Schema 'new_buildings_house_share': yearly shares and floor areas for new houses and
# apartment blocks. The frame-level check requires the two shares to sum to 1.0.
# NOTE(review): the column name 'flood_area_new_apartment_block' looks like a typo for
# 'floor_area_new_apartment_block' - confirm against the input file before renaming.
new_buildings_residential = pa.DataFrameSchema(
    name='new_buildings_house_share',
    columns={
        'year': pa.Column(int, checks=[pa.Check.between(2010, 2070)]),
        'new_house_share': pa.Column(float, checks=[pa.Check.between(0.0, 1.0)]),
        'new_apartment_block_share': pa.Column(float, checks=[pa.Check.between(0.0, 1.0)]),
        'floor_area_new_house': pa.Column(int, checks=[pa.Check.between(1, 1000)]),
        'flood_area_new_apartment_block': pa.Column(int, checks=[pa.Check.between(1, 1000)]),
    },
    checks=[
        pa.Check(
            check_building_category_share,
            error='The sum of new_house_share and new_apartment_block_share should be 1.0 (100%)',
        ),
    ],
)


# Schema 'new_buildings_population': population and household size per year. All
# columns are coerced; household_size may be null.
population_forecast = pa.DataFrameSchema(
    name='new_buildings_population',
    columns={
        'year': pa.Column(int, coerce=True, checks=[pa.Check.between(1900, 2070)]),
        'population': pa.Column(int, coerce=True, checks=[pa.Check.greater_than_or_equal_to(0)]),
        'household_size': pa.Column(
            float,
            coerce=True,
            nullable=True,
            checks=[pa.Check.greater_than_or_equal_to(0)],
        ),
    },
)


#TODO: evaluate if the restrictions on rush_share and never_share make sense (i.e. if the program crashes unless they are there)
# Schema 'scurve_parameters': s-curve parameters per building_category and condition.
# The age and period columns must be greater than 0; rush_share and never_share must be
# above 0.0 (lower bound excluded) and at most 1.0.
s_curve = pa.DataFrameSchema(
    name='scurve_parameters',
    columns={
        'building_category': pa.Column(str, checks=[pa.Check(check_building_category)]),
        'condition': pa.Column(str, checks=[pa.Check(check_building_condition)]),
        'earliest_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'average_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'rush_period_years': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'last_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'rush_share': pa.Column(
            float,
            checks=[pa.Check.between(min_value=0.0, max_value=1.0, include_min=False)],
        ),
        'never_share': pa.Column(
            float,
            checks=[pa.Check.between(min_value=0.0, max_value=1.0, include_min=False)],
        ),
    },
)


### TODO: remove strong restrictions on float values and add warnings (negative values should be allowed)
# Schema for energy need in original condition: kwh_m2 (coerced to float, >= 0) per
# building_category, building_code and purpose, each of which may be 'default'.
# The combination of the three key columns must be unique.
energy_need_original_condition = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=[pa.Check(check_default_building_category)]),
        'building_code': pa.Column(str, checks=[pa.Check(check_default_building_code, element_wise=True)]),
        'purpose': pa.Column(str, checks=[pa.Check(check_default_energy_purpose)]),
        'kwh_m2': pa.Column(float, coerce=True, checks=[pa.Check.greater_than_or_equal_to(0)]),
    },
    unique=['building_category', 'building_code', 'purpose'],
    report_duplicates='all',
)


# Schema for building upgrade improvements: reduction_share (coerced to float, within
# 0.0 to 1.0 inclusive) per building_category, building_code, purpose and
# building_condition. The combination of the four key columns must be unique.
improvement_building_upgrade = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'purpose': pa.Column(str, checks=pa.Check(check_default_energy_purpose)),
        'building_condition': pa.Column(str, checks=[pa.Check(check_existing_building_conditions)]),
        'reduction_share': pa.Column(
            float,
            coerce=True,
            checks=[
                pa.Check.between(min_value=0.0, include_min=True, max_value=1.0, include_max=True),
            ],
        ),
    },
    unique=['building_category', 'building_code', 'purpose', 'building_condition'],
    report_duplicates='all',
)


# Schema for energy need improvements: value (coerced to float, within 0.0 to 1.0
# inclusive) per building_category (category, group or 'default'), building_code and
# purpose.
# NOTE(review): the uniqueness constraint names 'start_year', 'function' and 'end_year',
# which are not declared as columns here - confirm they are always present in the input.
energy_need_improvements = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category_with_group)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'purpose': pa.Column(str, checks=pa.Check(check_default_energy_purpose)),
        'value': pa.Column(
            float,
            coerce=True,
            checks=[
                pa.Check.between(min_value=0.0, include_min=True, max_value=1.0, include_max=True),
            ],
        ),
    },
    unique=['building_category', 'building_code', 'purpose', 'start_year', 'function', 'end_year'],
    report_duplicates='all',
)


# Schema for the holiday home stock: integer counts per year for the two holiday home
# columns named below.
holiday_home_stock = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int),
        'Existing buildings Chalet, summerhouses and other holiday houses': pa.Column(int),
        'Existing buildings Detached houses and farmhouses used as holiday houses': pa.Column(int),
    },
)


# Schema for holiday home energy consumption per year: electricity as int and fuelwood
# as a nullable float.
holiday_home_energy_consumption = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int),
        'electricity': pa.Column(int),
        'fuelwood': pa.Column(float, nullable=True),
    },
)

# Schema for area per person: a nullable float per building_category.
area_per_person = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_building_category)),
        'area_per_person': pa.Column(float, nullable=True),
    },
)


# Schema 'heating_systems_shares_start_year': share of each heating system per
# building_category and building_code. The 'year' column must hold a single distinct
# value. The frame-level sum check is registered with raise_warning=True.
heating_system_initial_shares = pa.DataFrameSchema(
    name='heating_systems_shares_start_year',
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_building_category)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        'year': pa.Column(
            int,
            checks=pa.Check(
                lambda years: len(years.unique()) == 1,
                error="All values in the 'year' column must be identical.",
            ),
        ),
        'heating_system_share': pa.Column(
            float,
            coerce=True,
            checks=[
                pa.Check.between(min_value=0.0, include_min=True, max_value=1.0, include_max=True),
            ],
        ),
    },
    #TODO: better warning messages to see where the issues are
    checks=[
        pa.Check(
            check_sum_of_heating_system_shares_equal_1,
            raise_warning=True,
            error="Sum of 'heating_system_share' do not equal 1 for one or more combination of 'building_category' and 'building_code'",
        ),
    ],
)


#TODO: 
# - add check on years. Parse to make long format and check years and values? years must be in order, max limit (2070) etc.
# Schema for the heating system forecast: a heating system and the new heating system
# replacing it, per building_category (category, group or 'default') and building_code.
# Only the four key columns are validated; their combination must be unique.
heating_system_forecast = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category_with_group)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        'new_heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
    },
    unique=['building_category', 'building_code', 'heating_systems', 'new_heating_systems'],
    report_duplicates='all',
)


"""
TODO: how to check columns that are heating systems (but not in enum) and 'energivare'. Columns:
        'Grunnlast': pa.Column(str), 
        'Spisslast': pa.Column(str),
        'Ekstralast': pa.Column(str),
        'base_load_energy_product': pa.Column(str),
        'peak_load_energy_product': pa.Column(str),
        'tertiary_load_energy_product': pa.Column(str),  
"""
# Schema for heating system efficiencies: per heating system, the energy product,
# coverage and efficiency of the base, peak and tertiary load, plus domestic hot water,
# 'Spesifikt elforbruk' and cooling efficiency. All numeric columns are coerced to float.
heating_system_efficiencies = pa.DataFrameSchema(
    columns={
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        # Energy products are only checked for being strings.
        'base_load_energy_product': pa.Column(str),
        'peak_load_energy_product': pa.Column(str),
        'tertiary_load_energy_product': pa.Column(str),
        'tertiary_load_coverage': pa.Column(float, coerce=True),
        'base_load_coverage': pa.Column(float, coerce=True),
        'peak_load_coverage': pa.Column(float, coerce=True),
        'base_load_efficiency': pa.Column(float, coerce=True),
        'peak_load_efficiency': pa.Column(float, coerce=True),
        'tertiary_load_efficiency': pa.Column(float, coerce=True),
        'domestic_hot_water_energy_product': pa.Column(str),
        'domestic_hot_water_efficiency': pa.Column(float, coerce=True),
        'Spesifikt elforbruk': pa.Column(float, coerce=True),
        'cooling_efficiency': pa.Column(float, coerce=True),
    },
)


# ``__all__`` must list public names as strings. The original listed the schema objects
# themselves, which makes ``from ebm.validators import *`` raise TypeError, and it named
# new_buildings_residential twice.
__all__ = [
    'area',
    'building_code_parameters',
    'area_new_residential_buildings',
    'new_buildings_residential',
    'population_forecast',
    's_curve',
    'improvement_building_upgrade',
]