# Source code for ebm.model.file_handler

import os
import pathlib
import shutil
import typing

import pandas as pd
from loguru import logger
from pandera.errors import SchemaErrors, SchemaError

import ebm.validators as validators
from ebm.model.defaults import default_calibrate_heating_rv, default_calibrate_energy_consumption


[docs] class FileHandler: """ Handles file operations. """ # Filenames BUILDING_CONDITIONS = 'building_conditions.csv' BUILDING_CODE_PARAMS = 'building_code_parameters.csv' S_CURVE = 's_curve.csv' POPULATION_FORECAST = 'population_forecast.csv' NEW_BUILDINGS_RESIDENTIAL = 'new_buildings_residential.csv' AREA_NEW_RESIDENTIAL_BUILDINGS = 'area_new_residential_buildings.csv' AREA = 'area.csv' BEHAVIOUR_FACTOR = 'energy_need_behaviour_factor.csv' ENERGY_NEED_ORIGINAL_CONDITION = 'energy_need_original_condition.csv' IMPROVEMENT_BUILDING_UPGRADE = 'improvement_building_upgrade.csv' ENERGY_NEED_YEARLY_IMPROVEMENTS = 'energy_need_improvements.csv' HOLIDAY_HOME_STOCK = 'holiday_home_stock.csv' HOLIDAY_HOME_ENERGY_CONSUMPTION = 'holiday_home_energy_consumption.csv' AREA_PER_PERSON = 'area_per_person.csv' HEATING_SYSTEM_INITIAL_SHARES = 'heating_system_initial_shares.csv' HEATING_SYSTEM_EFFICIENCIES = 'heating_system_efficiencies.csv' HEATING_SYSTEM_FORECAST = 'heating_system_forecast.csv' CALIBRATE_ENERGY_REQUIREMENT = 'calibrate_heating_rv.xlsx' CALIBRATE_ENERGY_CONSUMPTION = 'calibrate_energy_consumption.xlsx' input_directory: pathlib.Path
[docs] def __init__(self, directory: typing.Union[str, pathlib.Path, None] = None): """ Constructor for FileHandler Object. Sets FileHandler.input_directory. Parameters ---------- directory : pathlib.Path | None | (str) When directory is None the constructor will attempt to read directory location from environment variable EBM_INPUT_DIRECTORY """ if directory is None: # Use 'input' as fall back when EBM_INPUT_DIRECTORY is not set in environment. directory = os.environ.get('EBM_INPUT_DIRECTORY', 'input') self.input_directory = directory if isinstance(directory, pathlib.Path) else pathlib.Path(directory) self.files_to_check = [self.BUILDING_CODE_PARAMS, self.S_CURVE, self.POPULATION_FORECAST, self.NEW_BUILDINGS_RESIDENTIAL, self.AREA_NEW_RESIDENTIAL_BUILDINGS, self.AREA, self.BEHAVIOUR_FACTOR, self.ENERGY_NEED_ORIGINAL_CONDITION, self.IMPROVEMENT_BUILDING_UPGRADE, self.ENERGY_NEED_YEARLY_IMPROVEMENTS, self.HOLIDAY_HOME_ENERGY_CONSUMPTION, self.HOLIDAY_HOME_STOCK, self.AREA_PER_PERSON, self.HEATING_SYSTEM_INITIAL_SHARES, self.HEATING_SYSTEM_EFFICIENCIES, self.HEATING_SYSTEM_FORECAST]
def __repr__(self): return f'FileHandler(input_directory="{self.input_directory}")' def __str__(self): return repr(self)
[docs] @staticmethod def default_data_directory() -> pathlib.Path: """ Returns the path for ebm default data. The function is used when content is needed for a new input directory Not to be confused with FileHandler.input_directory. Returns ------- pathlib.Path See Also -------- create_missing_input_files """ data_directory = pathlib.Path(__file__).parent.parent / 'data' default_data_directory = data_directory / 'calibrated' if not default_data_directory.is_dir(): msg = f'Could not find default data directory {default_data_directory}' raise FileNotFoundError(msg) if not default_data_directory.is_dir(): msg = f'{default_data_directory} is not a directory' raise NotADirectoryError(msg) return default_data_directory
[docs] def get_file(self, file_name: str) -> pd.DataFrame: """ Finds and returns a file by searching in the folder defined by self.input_folder Parameters: - file_name (str): Name of the file to retrieve. Returns: - file_df (pd.DataFrame): DataFrame containing file data. """ logger.debug(f'get_file {file_name}') file_path: pathlib.Path = pathlib.Path(self.input_directory) / file_name logger.debug(f'{file_path=}') try: if file_path.suffix == '.xlsx': file_df = pd.read_excel(file_path) elif file_path.suffix == '.csv': file_df = pd.read_csv(file_path) else: msg = f'{file_name} is not of type xlsx or csv' logger.error(msg) raise ValueError(msg) return file_df except FileNotFoundError as ex: logger.exception(ex) logger.debug(f'Current directory is {os.getcwd()}') logger.error(f'Unable to open {file_path}. File not found.') raise except PermissionError as ex: logger.exception(ex) logger.error(f'Unable to open {file_path}. Permission denied.') raise except IOError as ex: logger.exception(ex) logger.error(f'Unable to open {file_path}. Unable to read file.') raise
[docs] def get_building_code(self) -> pd.DataFrame: """ Get TEK parameters DataFrame. Returns: - building_code_params (pd.DataFrame): DataFrame containing TEK parameters. """ building_code_params = self.get_file(self.BUILDING_CODE_PARAMS) return building_code_params
[docs] def get_s_curve(self) -> pd.DataFrame: """ Get S-curve parameters DataFrame. Returns: - scurve_params (pd.DataFrame): DataFrame containing S-curve parameters. """ scurve_params = self.get_file(self.S_CURVE) return scurve_params
[docs] def get_construction_population(self) -> pd.DataFrame: """ Get population and household size DataFrame from a file. Returns: - construction_population (pd.DataFrame): Dataframe containing population numbers year population household_size """ return self.get_file(self.POPULATION_FORECAST)
[docs] def get_population(self) -> pd.DataFrame: """ Loads population data from population.csv as float64 Should probably be merged with get_construction_population Returns population : pd.DataFrame dataframe with population ------- """ file_path = self.input_directory / self.POPULATION_FORECAST logger.debug(f'{file_path=}') return pd.read_csv(file_path, dtype={"household_size": "float64"})
[docs] def get_construction_building_category_share(self) -> pd.DataFrame: """ Get building category share by year DataFrame from a file. The number can be used in conjunction with number of households to calculate total number of buildings of category house and apartment block Returns: - construction_population (pd.DataFrame): Dataframe containing population numbers "year", "Andel nye småhus", "Andel nye leiligheter", "Areal nye småhus", "Areal nye leiligheter" """ return self.get_file(self.NEW_BUILDINGS_RESIDENTIAL)
[docs] def get_building_category_area(self) -> pd.DataFrame: """ Get population and household size DataFrame from a file. Returns: - construction_population (pd.DataFrame): Dataframe containing population numbers "area","type of building","2010","2011" """ file_path = self.input_directory / self.AREA_NEW_RESIDENTIAL_BUILDINGS logger.debug(f'{file_path=}') return pd.read_csv(file_path, index_col=0, header=0)
[docs] def get_area_parameters(self) -> pd.DataFrame: """ Get dataframe with area parameters. Returns: - area_parameters (pd.DataFrame): Dataframe containing total area (m^2) per building category and TEK. """ return self.get_file(self.AREA)
[docs] def get_energy_req_original_condition(self) -> pd.DataFrame: """ Get dataframe with energy requirement (kWh/m^2) for floor area in original condition. Returns ------- pd.DataFrame Dataframe containing energy requirement (kWh/m^2) for floor area in original condition, per building category and purpose. """ return self.get_file(self.ENERGY_NEED_ORIGINAL_CONDITION)
[docs] def get_energy_req_reduction_per_condition(self) -> pd.DataFrame: """ Get dataframe with shares for reducing the energy requirement of the different building conditions. Returns ------- pd.DataFrame Dataframe containing energy requirement reduction shares for the different building conditions, per building category, TEK and purpose. """ return self.get_file(self.IMPROVEMENT_BUILDING_UPGRADE)
[docs] def get_energy_need_yearly_improvements(self) -> pd.DataFrame: """ Get dataframe with yearly efficiency rates for energy requirement improvements. Returns ------- pd.DataFrame Dataframe containing yearly efficiency rates (%) for energy requirement improvements, per building category, tek and purpose. """ return self.get_file(self.ENERGY_NEED_YEARLY_IMPROVEMENTS)
[docs] def get_holiday_home_energy_consumption(self) -> pd.DataFrame: return self.get_file(self.HOLIDAY_HOME_ENERGY_CONSUMPTION)
[docs] def get_holiday_home_by_year(self) -> pd.DataFrame: return self.get_file(self.HOLIDAY_HOME_STOCK)
[docs] def get_area_per_person(self): return self.get_file(self.AREA_PER_PERSON)
[docs] def get_calibrate_heating_rv(self) -> pd.DataFrame: """ Retrieve the calibrated heating requirement values This method attempts to load the energy requirement calibration file from the input directory. It first checks for a file without extension, then for a `.csv` version. If neither is found, it returns a default DataFrame. Returns ------- pd.DataFrame A DataFrame containing the calibrated heating requirement values. If no file is found, a default DataFrame is returned. """ calibrate_heating_rv = self.input_directory / self.CALIBRATE_ENERGY_REQUIREMENT if calibrate_heating_rv.is_file(): return self.get_file(calibrate_heating_rv.name) if calibrate_heating_rv.with_suffix('.csv').is_file(): return self.get_file(calibrate_heating_rv.with_suffix('.csv').name) return default_calibrate_heating_rv()
[docs] def get_calibrate_heating_systems(self) -> pd.DataFrame: """ Retrieve the calibrated energy consumption values for heating systems This method attempts to load the energy consumption calibration file from the input directory. It first checks for a file without extension, then for a `.csv` version. If neither is found, it returns a default DataFrame. Returns ------- pd.DataFrame A DataFrame containing the calibrated energy consumption values. If no file is found, a default DataFrame is returned. """ calibrate_energy_consumption = self.input_directory / self.CALIBRATE_ENERGY_CONSUMPTION if calibrate_energy_consumption.is_file(): return self.get_file(calibrate_energy_consumption.name) if calibrate_energy_consumption.with_suffix('.csv').is_file(): return self.get_file(calibrate_energy_consumption.with_suffix('.csv').name) return default_calibrate_energy_consumption()
[docs] def get_heating_systems_shares_start_year(self) -> pd.DataFrame: """ """ return self.get_file(self.HEATING_SYSTEM_INITIAL_SHARES)
[docs] def get_heating_system_efficiencies(self) -> pd.DataFrame: """Load heating_system_efficiencies.csv from file into a dataframe Returns ------- heating_system_efficiencies : pd.DataFrame pandas DataFrame with heating system efficiencies """ return self.get_file(self.HEATING_SYSTEM_EFFICIENCIES)
[docs] def get_heating_system_forecast(self) -> pd.DataFrame: """ """ return self.get_file(self.HEATING_SYSTEM_FORECAST)
def _check_is_file(self, filename: str) -> bool: """ Check if the filename is a file in self.input_folder Parameters ---------- filename : str Returns ------- file_exists : bool """ return (pathlib.Path(self.input_directory) / filename).is_file()
[docs] def check_for_missing_files(self) -> typing.List[str]: """ Returns a list of required files that are not present in self.input_folder Returns ------- missing_files : List[str] Raises ------ FileNotFoundError If FileHandler::input_directory not found NotADirectoryError If FileHandler::input_directory is not a directory """ if not self.input_directory.exists(): msg=f'{self.input_directory.absolute()} not found' logger.error(msg) raise FileNotFoundError(f'Input Directory Not Found') if not self.input_directory.is_dir(): raise NotADirectoryError(f'{self.input_directory} is not a directory') missing_files = [file for file in self.files_to_check if not self._check_is_file(file)] if missing_files: plural = 's' if len(missing_files) != 1 else '' msg = f'{len(missing_files)} required file{plural} missing from {self.input_directory}' logger.error(msg) for f in missing_files: logger.error(f'Could not find {f}') return missing_files
[docs] def create_missing_input_files(self, source_directory: (pathlib.Path | None)=None) -> None: """ Creates any input files missing in self.input_directory. When source is omitted FileHandler Parameters ---------- source_directory : pathlib.Path, optional Optional directory for sourcing files to copy. Returns ------- None See Also -------- default_data_directory : default source for data files """ source = FileHandler.default_data_directory() if not source_directory else source_directory if not source.is_dir(): raise NotADirectoryError(f'{self.input_directory} is not a directory') if not self.input_directory.is_dir(): logger.info(f'Creating directory {self.input_directory}') self.input_directory.mkdir() for file in self.files_to_check: logger.debug(f'Create input file {file}') self.create_input_file(file, source_directory=source)
[docs] def create_input_file(self, file, source_directory=None): source_directory = FileHandler.default_data_directory() if not source_directory else source_directory source_file = source_directory / file target_file = self.input_directory / file if target_file.is_file(): logger.debug(f'Skipping existing file {target_file}') elif not source_file.is_file(): logger.error(f'Source file {source_file} does not exist!') else: shutil.copy(source_file, target_file) logger.info( f'Creating missing file {target_file}')
[docs] def validate_input_files(self): """ Validates the input files for correct formatting and content using the validators module Raises ------ pa.errors.SchemaErrors If any invalid data for formatting is found when validating files. The validation is lazy, meaning multiple errors may be listed in the exception. """ for file_to_validate in self.files_to_check: df = self.get_file(file_to_validate) validator = getattr(validators, file_to_validate[:-4].lower()) try: validator.validate(df, lazy=True) except (SchemaErrors, SchemaError): logger.error(f'Got error while validating {file_to_validate}') raise
[docs] def is_calibrated(self) -> bool: """ Check if calibration files exist in the input directory. This method verifies the presence of both energy consumption and energy requirement files in either `.xlsx` or `.csv` format within the specified input directory. Returns ------- bool `True` if both required files exist with the same extension (`.xlsx` or `.csv`), otherwise `False`. """ energy_consumption = (self.input_directory / self.CALIBRATE_ENERGY_CONSUMPTION) energy_requirement = (self.input_directory / self.CALIBRATE_ENERGY_REQUIREMENT) if energy_consumption.with_suffix('.xlsx').is_file() and energy_requirement.with_suffix('.xlsx').is_file(): return True if energy_consumption.with_suffix('.csv').is_file() and energy_requirement.with_suffix('.csv').is_file(): return True return False