"""
Pandera validators for ebm input files.
"""
import itertools
import numpy as np
import pandas as pd
import pandera as pa
from ebm.model.building_category import BuildingCategory, RESIDENTIAL, NON_RESIDENTIAL
from ebm.model.building_condition import BuildingCondition
from ebm.model.column_operations import explode_unique_columns, explode_column_alias
from ebm.model.data_classes import YearRange
from ebm.model.energy_purpose import EnergyPurpose
from ebm.model.heating_systems import HeatingSystems
[docs]
def check_building_category(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are members of BuildingCategory.

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against the members of BuildingCategory

    Returns
    -------
    pd.Series of bool values, True where the entry is a BuildingCategory member
    """
    valid_categories = list(BuildingCategory)
    return value.isin(valid_categories)
[docs]
def check_default_building_category(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are either a BuildingCategory member or the literal 'default'.

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against BuildingCategory and 'default'

    Returns
    -------
    pd.Series of bool values, True where the entry is accepted
    """
    accepted = [*BuildingCategory, 'default']
    return value.isin(accepted)
[docs]
def check_default_building_category_with_group(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are a BuildingCategory member, one of the category groups
    (RESIDENTIAL or NON_RESIDENTIAL) or the literal 'default'.

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against BuildingCategory, RESIDENTIAL, NON_RESIDENTIAL and 'default'

    Returns
    -------
    pd.Series of bool values, True where the entry is accepted
    """
    accepted = [*BuildingCategory, 'default', RESIDENTIAL, NON_RESIDENTIAL]
    return value.isin(accepted)
[docs]
def check_building_condition(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are members of BuildingCondition.

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against the members of BuildingCondition

    Returns
    -------
    pd.Series of bool values, True where the entry is a BuildingCondition member
    """
    valid_conditions = list(BuildingCondition)
    return value.isin(valid_conditions)
[docs]
def check_existing_building_conditions(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are among the conditions returned by
    BuildingCondition.existing_conditions().

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against the 'existing' BuildingCondition members

    Returns
    -------
    pd.Series of bool values, True where the entry is an existing condition
    """
    existing = list(BuildingCondition.existing_conditions())
    return value.isin(existing)
[docs]
def check_all_existing_building_conditions_present(df: pd.DataFrame):
    """
    Verify that every combination of 'building_category', 'building_code' and 'purpose' lists
    exactly the set of conditions returned by BuildingCondition.existing_conditions() in its
    'building_condition' column.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns building_category, building_code, purpose and building_condition

    Returns
    -------
    bool
        False as soon as one group has a different set of conditions, True otherwise
    """
    per_group = df.groupby(['building_category', 'building_code', 'purpose'])['building_condition']
    required = set(BuildingCondition.existing_conditions())
    return all(set(found) == required for _, found in per_group)
[docs]
def check_energy_purpose(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are members of EnergyPurpose.

    Args:
        value: Series of purposes to compare against the members of EnergyPurpose

    Returns:
        pd.Series: Series of bool values, True where the entry is an EnergyPurpose member
    """
    valid_purposes = list(EnergyPurpose)
    return value.isin(valid_purposes)
[docs]
def check_default_energy_purpose(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are either an EnergyPurpose member or the literal 'default'.

    Args:
        value: Series of purposes to compare against EnergyPurpose and 'default'

    Returns:
        pd.Series: Series of bool values, True where the entry is accepted
    """
    accepted = [*EnergyPurpose, 'default']
    return value.isin(accepted)
[docs]
def check_building_code(value: str) -> bool:
    """
    A crude check to determine if value is a 'building_code'.

    The value is accepted when it is a string containing the substring 'TEK'.
    Anything that is not a string (None, NaN, numbers) is rejected instead of
    raising TypeError, so a malformed cell is reported as a validation failure.

    Args:
        value (str): A string to check if it's a building_code

    Returns:
        bool: True when the function thinks that value might be a building_code
    """
    return isinstance(value, str) and 'TEK' in value
[docs]
def check_default_building_code(value: str) -> bool:
    """
    A crude check to determine if value is a 'building_code' or the literal 'default'.

    Args:
        value (str): A string to check if it's a building_code or 'default'

    Returns:
        bool: True when check_building_code accepts the value or the value equals 'default'
    """
    looks_like_building_code = check_building_code(value)
    return looks_like_building_code or value == 'default'
#TODO: edge cases?
[docs]
def check_overlapping_building_code_periods(df: pd.DataFrame) -> pd.Series:
    """
    Check that the building code periods follow each other without overlap or gap.

    Rows are ordered by 'period_end_year'. A row passes when its
    period_end_year + 1 equals the period_start_year of the next row in that
    order. The row with the highest period_end_year has no successor and always
    passes.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns period_start_year and period_end_year

    Returns
    -------
    pd.Series of bool values
        One value per row of df, carrying df's own index and row order, so that a
        failure is attributed to the row whose period does not connect to the next one.
    """
    result = pd.Series(True, index=df.index, dtype=bool)
    if len(df) < 2:
        # Nothing to compare: zero or one period can neither overlap nor leave a gap.
        return result

    # Positional sort order, so duplicate or non-default index labels cannot confuse the mapping.
    order = np.argsort(df["period_end_year"].to_numpy(), kind="stable")
    end_years = df["period_end_year"].to_numpy()[order]
    start_years = df["period_start_year"].to_numpy()[order]

    follows_directly = (end_years[:-1] + 1) == start_years[1:]
    # Write each comparison back to the original position of the earlier period.
    result.iloc[order[:-1]] = follows_directly
    return result
[docs]
def check_building_category_share(values: pd.DataFrame) -> pd.Series:
    """
    Makes sure that the sum of values.new_house_share + values.new_apartment_block_share is 1.0

    The comparison uses a small absolute tolerance, because shares that were
    computed or parsed as floats can sum to a value that differs from 1.0 only
    by rounding error. Rows with a missing share fail the check.

    Args:
        values (pd.DataFrame): A dataframe with new_house_share and new_apartment_block_share

    Returns:
        pd.Series: A series of bool, True where new_house_share + new_apartment_block_share
            equals 1.0 within the tolerance
    """
    tolerance = 1e-9
    total_share = values.new_house_share + values.new_apartment_block_share
    return (total_share - 1.0).abs() <= tolerance
[docs]
def create_residential_area_checks():
    """
    Build the list of checks applied to the house and apartment_block columns.

    - The first two rows must not be empty
    - Every row after the first two must be empty
    - Values must be greater than or equal to 0.0

    Returns
    -------
    List[pa.Check]
    """
    first_rows_filled = pa.Check(
        lambda column: column.iloc[:2].notnull().all(),
        element_wise=False,
        error='Expects number in first two rows for house')
    remaining_rows_empty = pa.Check(
        lambda column: column.iloc[2:].isnull().all(),
        element_wise=False,
        error='Expects empty in the last four years for house')
    not_negative = pa.Check.greater_than_or_equal_to(0.0)
    return [first_rows_filled, remaining_rows_empty, not_negative]
[docs]
def check_heating_systems(value: pd.Series) -> pd.Series:
    """
    Flag which entries of a series are members of HeatingSystems.

    Parameters
    ----------
    value: pd.Series
        A series of str to compare against the members of HeatingSystems

    Returns
    -------
    pd.Series of bool values, True where the entry is a HeatingSystems member
    """
    valid_systems = list(HeatingSystems)
    return value.isin(valid_systems)
[docs]
def check_sum_of_heating_system_shares_equal_1(df: pd.DataFrame):
    """
    Check that 'heating_system_share' sums to 1 for each combination of
    'building_category' and 'building_code'.

    The sum is expressed as a percentage and rounded to four decimals before it
    is compared with 100.0.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns building_category, building_code and heating_system_share

    Returns
    -------
    pd.Series of bool values
        One value per (building_category, building_code) group, indexed by that group
    """
    decimals = 4
    share_per_group = df.groupby(by=['building_category', 'building_code'])['heating_system_share'].sum()
    percentage = (share_per_group * 100).round(decimals)
    return percentage == 100.0
[docs]
def make_building_purpose(years: YearRange | None = None) -> pd.DataFrame:
    """
    Build a dataframe with every combination of building_category, building_code and purpose,
    and optionally year. The building_condition column is always 'original_condition'.

    Parameters
    ----------
    years : YearRange, optional
        When given (and truthy), each combination is repeated for every year in the range
        and a 'year' column is added.

    Returns
    -------
    pd.DataFrame
    """
    dimensions = [list(BuildingCategory),
                  ['PRE_TEK49', 'TEK49', 'TEK69', 'TEK87', 'TEK97', 'TEK07', 'TEK10', 'TEK17'],
                  EnergyPurpose]
    headers = ['building_category', 'building_code', 'building_condition', 'purpose']
    if years:
        dimensions.append(years)
        headers.append('year')

    # `trailing` is empty without years and holds exactly the year otherwise.
    records = [[category, code, 'original_condition', purpose, *trailing]
               for category, code, purpose, *trailing in itertools.product(*dimensions)]
    return pd.DataFrame(data=records, columns=headers)
[docs]
def behaviour_factor_parser(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand behaviour factor rules into one row per building_category, building_code, purpose
    and year for the model years 2020 to 2050.

    Missing optional columns are filled in first: start_year and end_year default to the first
    and last model year, function defaults to 'noop' and parameter to 0.0. Each rule is then
    exploded over its years. The 'function' column selects how behaviour_factor develops:

    - 'improvement_at_end_year': linear interpolation from behaviour_factor at start_year to
      parameter at end_year
    - 'yearly_reduction': handled by calculate_yearly_reduction
    - anything else: behaviour_factor is kept as given

    The result is left-joined onto all combinations from make_building_purpose; combinations
    without a rule get behaviour_factor 1.0.

    Parameters
    ----------
    df: pd.DataFrame
        Behaviour factor rules. The input frame is not modified.

    Returns
    -------
    pd.DataFrame
    """
    model_years = YearRange(2020, 2050)
    all_combinations = make_building_purpose(years=model_years)

    # Copy first: the column assignments below would otherwise write into the caller's frame.
    df = df.copy()
    if 'start_year' not in df.columns:
        df = df.assign(**{'start_year': model_years.start})
    if 'end_year' not in df.columns:
        df = df.assign(**{'end_year': model_years.end})
    if 'function' not in df.columns:
        df = df.assign(function='noop')
    else:
        df['function'] = df.function.fillna('noop')
    if 'parameter' not in df.columns:
        df = df.assign(parameter=0.0)

    df['start_year'] = df.start_year.fillna(model_years.start).astype(int)
    df['end_year'] = df.end_year.fillna(model_years.end).astype(int)

    unique_columns = ['building_category', 'building_code', 'purpose', 'start_year', 'end_year']
    behaviour_factor = explode_unique_columns(df,
                                              unique_columns=unique_columns)
    behaviour_factor = explode_column_alias(behaviour_factor,
                                            column='purpose',
                                            values=list(EnergyPurpose),
                                            alias='default',
                                            de_dup_by=unique_columns)

    # One list entry per year in [start_year, end_year], paired with an equally long interpolation.
    behaviour_factor['year'] = behaviour_factor.apply(
        lambda row: range(row.start_year, row.end_year + 1), axis=1)
    behaviour_factor['interpolation'] = behaviour_factor.apply(
        lambda row: np.linspace(row.behaviour_factor, row.parameter, num=row.end_year + 1 - row.start_year), axis=1)
    behaviour_factor = behaviour_factor.explode(['year', 'interpolation'])
    behaviour_factor['year'] = behaviour_factor['year'].astype(int)

    interpolation_slice = (behaviour_factor.function == 'improvement_at_end_year') & (~behaviour_factor.interpolation.isna())
    behaviour_factor.loc[interpolation_slice, 'behaviour_factor'] = behaviour_factor.loc[
        interpolation_slice, 'interpolation'].astype(float)

    # The former sort_values call here discarded its result and had no effect, so it is removed.
    # Row order does not matter: the left join below takes its order from all_combinations.
    behaviour_factor = calculate_yearly_reduction(behaviour_factor)

    index_columns = ['building_category', 'building_code', 'purpose', 'year']
    behaviour_factor = behaviour_factor.set_index(index_columns, drop=True)
    all_combinations = all_combinations.set_index(index_columns, drop=True)

    joined = all_combinations.join(behaviour_factor, how='left')
    joined.behaviour_factor = joined.behaviour_factor.fillna(1.0)
    return joined.reset_index()
[docs]
def calculate_yearly_reduction(df):
    """
    Apply a compounding yearly reduction to the rows whose 'function' is 'yearly_reduction'.

    For those rows behaviour_factor becomes
    behaviour_factor * (1 - parameter) ** (year - start_year). Other rows are untouched.

    Rows are selected with a boolean mask rather than by index label, so the result is
    row-exact even when index labels repeat.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with the columns function, behaviour_factor, parameter, year and start_year.
        It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same frame object that was passed in
    """
    is_yearly_reduction = df['function'] == 'yearly_reduction'
    if not is_yearly_reduction.any():
        return df

    selected = df.loc[is_yearly_reduction]
    elapsed_years = selected['year'] - selected['start_year']
    reduced = selected['behaviour_factor'] * (1.0 - selected['parameter']) ** elapsed_years
    # Assign positionally; the mask and `reduced` cover the same rows in the same order.
    df.loc[is_yearly_reduction, 'behaviour_factor'] = reduced.to_numpy()
    return df
# Schema for behaviour factor input. behaviour_factor_parser is registered as a parser,
# and the columns below describe the frame it returns.
energy_need_behaviour_factor = pa.DataFrameSchema(
    parsers=pa.Parser(behaviour_factor_parser),
    columns={
        "building_category": pa.Column(str),
        'building_code': pa.Column(str),
        "purpose": pa.Column(str),
        # 'year' is optional (required=False).
        'year': pa.Column(int, required=False),
        'behaviour_factor': pa.Column(float)
    }
)
# Schema 'area_parameters': area per building_category and building_code.
area = pa.DataFrameSchema(
    columns={
        "building_category": pa.Column(str, checks=[pa.Check(check_building_category)]),
        # check_building_code takes a single value, hence element_wise=True.
        'building_code': pa.Column(str, checks=[pa.Check(check_building_code, element_wise=True)]),
        # area is coerced to float and must be strictly positive.
        "area": pa.Column(float, checks=[pa.Check.greater_than(0)], coerce=True)},
    name='area_parameters'
)
# Schema 'building_code_parameters': one row per building_code with its building year and
# the period (start and end year) it covers.
building_code_parameters = pa.DataFrameSchema(columns={
    'building_code': pa.Column(str, unique=True, checks=[pa.Check(check_building_code, element_wise=True)]),
    'building_year': pa.Column(int, checks=[
        pa.Check.greater_than_or_equal_to(1940),
        pa.Check.less_than_or_equal_to(2070)]),
    # period_start_year may be as low as 0.
    'period_start_year': pa.Column(int, checks=[
        pa.Check.greater_than_or_equal_to(0),
        pa.Check.less_than_or_equal_to(2070),
    ]),
    # NOTE(review): the between(1940, 2070) check repeats the two range checks before it,
    # so an out-of-range value is reported more than once.
    'period_end_year': pa.Column(int, checks=[
        pa.Check.greater_than_or_equal_to(1940),
        pa.Check.less_than_or_equal_to(2070, error='period_end_year should be 2070 or lower'),
        pa.Check.between(1940, 2070, error='period_end_year should be between 1940 and 2070')])},
    # Frame-level checks: each period must end after it starts, and consecutive periods
    # must connect (see check_overlapping_building_code_periods).
    checks=[pa.Check(lambda df: df["period_end_year"] > df["period_start_year"],
                     error="period_end_year should be greater than period_start_year"),
            pa.Check(check_overlapping_building_code_periods,
                     error="building_code periods do not overlap")],
    name='building_code_parameters'
)
# Schema 'construction_building_category_yearly': yearly values for house and apartment_block.
# Both value columns are nullable and share the checks from create_residential_area_checks().
area_new_residential_buildings = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int),
        'house': pa.Column(pa.Float64, nullable=True, checks=create_residential_area_checks()),
        'apartment_block': pa.Column(pa.Float64, nullable=True, checks=create_residential_area_checks())
    },
    name='construction_building_category_yearly'
)
# Schema 'new_buildings_house_share': yearly split of new residential buildings between
# house and apartment_block, plus a floor area per building type.
new_buildings_residential = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int, checks=[pa.Check.between(2010, 2070)]),
        'new_house_share': pa.Column(float, checks=[pa.Check.between(0.0, 1.0)]),
        'new_apartment_block_share': pa.Column(float, checks=[pa.Check.between(0.0, 1.0)]),
        'floor_area_new_house': pa.Column(int, checks=[pa.Check.between(1, 1000)]),
        # NOTE(review): 'flood_area_...' looks like a typo for 'floor_area_...'; confirm against
        # the column name used in the input file before changing it.
        'flood_area_new_apartment_block': pa.Column(int, checks=[pa.Check.between(1, 1000)])
    },
    # Frame-level check: the two shares must add up to 1.0 in every row.
    checks=[pa.Check(check_building_category_share,
                     error='The sum of new_house_share and new_apartment_block_share should be 1.0 (100%)')],
    name='new_buildings_house_share'
)
# Schema 'new_buildings_population': population and household size per year.
# All columns are coerced to their dtype; household_size may be empty.
population_forecast = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int, coerce=True, checks=[pa.Check.between(1900, 2070)]),
        'population': pa.Column(int, coerce=True, checks=[pa.Check.greater_than_or_equal_to(0)]),
        'household_size': pa.Column(float, coerce=True, nullable=True, checks=[pa.Check.greater_than_or_equal_to(0)])},
    name='new_buildings_population')
# TODO: evaluate whether the restrictions on rush_share and never_share make sense (i.e. whether the program crashes without them)
# Schema 'scurve_parameters': s-curve parameters per building_category and condition.
s_curve = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=[pa.Check(check_building_category)]),
        'condition': pa.Column(str, checks=[pa.Check(check_building_condition)]),
        # All ages and the rush period must be strictly positive integers.
        'earliest_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'average_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'rush_period_years': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        'last_age_for_measure': pa.Column(int, checks=[pa.Check.greater_than(0)]),
        # Shares must lie between 0.0 and 1.0; the lower bound 0.0 itself is excluded.
        'rush_share': pa.Column(float, checks=[pa.Check.between(min_value=0.0, max_value=1.0, include_min=False)]),
        'never_share': pa.Column(float, checks=[pa.Check.between(min_value=0.0, max_value=1.0, include_min=False)])
    },
    name='scurve_parameters')
### TODO: remove strong restrictions on float values and add warnings (should be able to be neg values)
# Schema for energy need in original condition (kwh_m2) per building_category,
# building_code and purpose. Each of the three key columns also accepts 'default'.
energy_need_original_condition = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=[pa.Check(check_default_building_category)]),
        'building_code': pa.Column(str, checks=[pa.Check(check_default_building_code, element_wise=True)]),
        'purpose': pa.Column(str, checks=[pa.Check(check_default_energy_purpose)]),
        'kwh_m2': pa.Column(float, coerce=True, checks=[pa.Check.greater_than_or_equal_to(0)])
    },
    # The key columns must be jointly unique.
    unique=['building_category', 'building_code', 'purpose'],
    report_duplicates='all'
)
# Schema for the reduction_share per building_category, building_code, purpose and
# building_condition. building_condition is limited to the 'existing' conditions.
improvement_building_upgrade = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'purpose': pa.Column(str, checks=pa.Check(check_default_energy_purpose)),
        'building_condition': pa.Column(str, checks=[pa.Check(check_existing_building_conditions)]),
        # reduction_share is coerced to float and must lie in [0.0, 1.0].
        'reduction_share': pa.Column(float, coerce=True, checks=[pa.Check.between(min_value=0.0, include_min=True,
                                                                                  max_value=1.0, include_max=True)])
    },
    # The key columns must be jointly unique.
    unique=['building_category', 'building_code', 'purpose', 'building_condition'],
    report_duplicates='all'
)
# Schema for energy need improvements: a value in [0.0, 1.0] per building_category,
# building_code and purpose.
energy_need_improvements = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'purpose':pa.Column(str, checks=pa.Check(check_default_energy_purpose)),
        'value': pa.Column(float, coerce=True,
                           checks=[pa.Check.between(min_value=0.0, include_min=True,
                                                    max_value=1.0, include_max=True)])
    },
    # NOTE(review): 'start_year', 'function' and 'end_year' are used for uniqueness but are
    # not declared under columns; confirm the input always provides them.
    unique=['building_category', 'building_code', 'purpose', 'start_year', 'function', 'end_year'],
    report_duplicates='all'
)
# Schema for the holiday home stock per year; both count columns must be integers.
holiday_home_stock = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int),
        'Existing buildings Chalet, summerhouses and other holiday houses': pa.Column(int),
        'Existing buildings Detached houses and farmhouses used as holiday houses': pa.Column(int)
    }
)
# Schema for holiday home energy consumption per year; fuelwood may be empty.
holiday_home_energy_consumption = pa.DataFrameSchema(
    columns={
        'year': pa.Column(int),
        'electricity': pa.Column(int),
        'fuelwood': pa.Column(float, nullable=True)
    }
)
# Schema for area_per_person per building_category; the value may be empty.
area_per_person = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_building_category)),
        'area_per_person': pa.Column(float, nullable=True)
    }
)
# Schema 'heating_systems_shares_start_year': share of each heating system per
# building_category and building_code for a single year.
heating_system_initial_shares = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_building_category)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        # The whole column must hold one and the same year.
        'year': pa.Column(int, pa.Check(
            lambda year: len(year.unique()) == 1,
            error="All values in the 'year' column must be identical."
        )),
        'heating_system_share': pa.Column(float, coerce=True,
                                          checks=[pa.Check.between(min_value=0.0, include_min=True,
                                                                   max_value=1.0, include_max=True)])
    },
    # TODO: better warning messages to see where the issues are
    # raise_warning=True: a failing sum is reported as a warning rather than an error.
    checks=[pa.Check(check_sum_of_heating_system_shares_equal_1, raise_warning=True,
                     error="Sum of 'heating_system_share' do not equal 1 for one or more combination of 'building_category' and 'building_code'")],
    name='heating_systems_shares_start_year'
)
#TODO:
# - add check on years. Parse to make long format and check years and values? years must be in order, max limit (2070) etc.
# Schema for the heating system forecast: pairs of heating_systems and new_heating_systems
# per building_category and building_code. building_category also accepts the category
# groups and 'default' (see check_default_building_category_with_group).
heating_system_forecast = pa.DataFrameSchema(
    columns={
        'building_category': pa.Column(str, checks=pa.Check(check_default_building_category_with_group)),
        'building_code': pa.Column(str, checks=pa.Check(check_default_building_code, element_wise=True)),
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        'new_heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems))
    },
    # The four key columns must be jointly unique.
    unique=['building_category', 'building_code', 'heating_systems', 'new_heating_systems'],
    report_duplicates='all'
)
"""
TODO: how to check columns that are heating systems (but not in enum) and 'energivare'. Columns:
'Grunnlast': pa.Column(str),
'Spisslast': pa.Column(str),
'Ekstralast': pa.Column(str),
'base_load_energy_product': pa.Column(str),
'peak_load_energy_product': pa.Column(str),
'tertiary_load_energy_product': pa.Column(str),
"""
# Schema for heating system efficiencies: per heating system, the energy product, coverage
# and efficiency of each load. Only heating_systems is checked against an enum; the float
# columns are coerced but carry no range checks.
heating_system_efficiencies = pa.DataFrameSchema(
    columns={
        'heating_systems': pa.Column(str, checks=pa.Check(check_heating_systems)),
        'base_load_energy_product': pa.Column(str),
        'peak_load_energy_product': pa.Column(str),
        'tertiary_load_energy_product': pa.Column(str),
        'tertiary_load_coverage': pa.Column(float, coerce=True),
        'base_load_coverage': pa.Column(float, coerce=True),
        'peak_load_coverage': pa.Column(float, coerce=True),
        'base_load_efficiency': pa.Column(float, coerce=True),
        'peak_load_efficiency': pa.Column(float, coerce=True),
        'tertiary_load_efficiency': pa.Column(float, coerce=True),
        'domestic_hot_water_energy_product': pa.Column(str),
        'domestic_hot_water_efficiency': pa.Column(float, coerce=True),
        'Spesifikt elforbruk': pa.Column(float, coerce=True),
        'cooling_efficiency': pa.Column(float, coerce=True)
    }
)
# Names exported by a star-import of this module. __all__ must contain the names as
# strings; listing the schema objects themselves makes a star-import raise TypeError.
# The exported set is unchanged, apart from removing the duplicated
# 'new_buildings_residential' entry.
__all__ = ['area',
           'building_code_parameters',
           'area_new_residential_buildings',
           'new_buildings_residential',
           'population_forecast',
           's_curve',
           'improvement_building_upgrade']