Source code for ebm.model.column_operations

import pathlib
from typing import List, Optional

import pandas as pd
from pandera.typing.common import DataFrameBase

from ebm.model.building_category import BuildingCategory


[docs] def explode_building_category_column(df: pd.DataFrame, unique_columns: List[str]) -> pd.DataFrame: """ Explodes the 'building_category' column in the DataFrame into multiple columns based on residential and non-residential categories. Parameters ---------- df : pd.DataFrame The input DataFrame containing the 'building_category' column. unique_columns : List[str] List of columns to use for de-duplication. Returns ------- pd.DataFrame The DataFrame with exploded 'building_category' columns. """ df = explode_column_alias(df=df, column='building_category', values=[bc for bc in BuildingCategory if bc.is_residential()], alias='residential', de_dup_by=unique_columns) df = explode_column_alias(df=df, column='building_category', values=[bc for bc in BuildingCategory if not bc.is_residential()], alias='non_residential', de_dup_by=unique_columns) df = explode_column_alias(df=df, column='building_category', values=[bc for bc in BuildingCategory], alias='default', de_dup_by=unique_columns) return df
[docs] def explode_building_code_column(df: pd.DataFrame, unique_columns: List[str], default_building_code: None | pd.DataFrame = None) -> pd.DataFrame: """ Explodes the 'building_code' column in the DataFrame into multiple columns based on the provided building_codelist. Parameters ---------- df : pd.DataFrame The input DataFrame containing the 'building_code' column. unique_columns : List[str] List of columns to use for de-duplication. default_building_code : Optional[pd.DataFrame], optional DataFrame containing default building_codevalues. If not provided, building_codevalues are read from 'input/building_codes.csv'. Returns ------- pd.DataFrame The DataFrame with exploded 'building_code' columns. """ # Hvor skal building_code_list hentes fra? building_code_list = pd.read_csv(pathlib.Path(__file__).parent.parent / 'data' / 'original' /'building_code_parameters.csv')['building_code'].unique() if default_building_code is None else default_building_code df = explode_column_alias(df=df, column='building_code', values=building_code_list, de_dup_by=unique_columns) return df
[docs] def explode_unique_columns(df: pd.DataFrame| DataFrameBase, unique_columns: List[str], default_building_code: List[str]|None = None) -> pd.DataFrame: """ Explodes 'building_code' and 'building_category' columns in df. Parameters ---------- df : pd.DataFrame The input DataFrame containing the columns to be exploded. unique_columns : List[str] List of columns to use for de-duplication. default_building_code : List[str], optional List of TEKs to replace default Returns ------- pd.DataFrame The DataFrame with exploded columns. """ df = explode_building_code_column(df, unique_columns, default_building_code=default_building_code) df = explode_building_category_column(df, unique_columns) return df
[docs] def explode_column_alias(df, column, values: list|dict=None, alias='default', de_dup_by: list[str]=None): """ Explodes a specified column in the DataFrame into multiple rows based on provided values and alias. Parameters ---------- df : pd.DataFrame The input DataFrame containing the column to be exploded. column : str The name of the column to be exploded. values : Optional[List[str], dict[str, list[str]], optional List or dict of values to explode the column by. If not provided, unique values from the column excluding the alias are used. alias : str, optional The alias to be used for default values. Default is 'default'. When values is a dict the parameter alias is ignored de_dup_by : Optional[List[str]], optional List of columns to use for de-duplication. If not provided, no de-duplication is performed. Returns ------- pd.DataFrame The DataFrame with the exploded column. Examples -------- >>> d_f = pd.DataFrame({'category': ['A', 'B', 'default']}) >>> explode_column_alias(d_f, column='category', values=['A', 'B'], alias='default') category 0 A 1 B 2 A 2 B """ if column not in df.columns: raise ValueError(f"The DataFrame (df) must contain the column: {column}") df = replace_column_alias(df, column=column, values=values, alias=alias, de_dup_by=None) df = df.assign(**{column: df[column].str.split('+')}).explode(column) if de_dup_by: df = df.sort_values(by='_explode_column_alias_default', ascending=True) df = df.drop_duplicates(de_dup_by) return df.drop(columns=['_explode_column_alias_default'], errors='ignore')
[docs] def replace_column_alias(df: pd.DataFrame, column: str, values: Optional[list|dict]=None, alias: Optional[str]='default', de_dup_by=None) -> pd.DataFrame: values = values if values is not None else [c for c in df[column].unique().tolist() if c != alias] aliases = {alias: values} if not isinstance(values, dict) else values df = df.copy() for k, v in aliases.items(): df['_explode_column_alias_default'] = df[column] == k df.loc[df[df[column] == k].index, column] = '+'.join(v) if not de_dup_by: return df return df.drop(columns=['_explode_column_alias_default'], errors='ignore')
[docs] def explode_column(df: pd.DataFrame, column: str) -> pd.DataFrame: df = df.assign(**{column: df[column].str.split('+')}).explode(column) return df