Source code for ebm.services.spreadsheet

import dataclasses
import itertools
import math
import pathlib
import string
import typing
from dataclasses import dataclass

import numpy as np
from loguru import logger
from openpyxl import load_workbook
from openpyxl.cell import Cell
from openpyxl.formatting.rule import FormulaRule
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.cell import cols_from_range, coordinate_to_tuple, get_column_letter
from openpyxl.workbook import Workbook
from openpyxl.worksheet.errors import IgnoredError



[docs]
@dataclass
class SpreadsheetCell:
    """
    A class to represent a cell in a spreadsheet.

    Attributes
    ----------
    column : int
        The column number of the cell.
    row : int
        The row number of the cell.
    value : object
        The value contained in the cell.

    Methods
    -------
    spreadsheet_cell() -> str
        Returns the cell's address in A1 notation.
    replace(**kwargs) -> 'SpreadsheetCell'
        Returns a new SpreadsheetCell with updated attributes.
    first_row(cell_range) -> tuple
        Returns the first row of cells in the given range.
    first_column(cell_range) -> tuple
        Returns the first column of cells in the given range.
    submatrix(cell_range) -> tuple
        Returns the submatrix excluding the first row and column.
    """

    column: int
    row: int
    value: object


[docs]
    def spreadsheet_cell(self) -> str:
        """
        Returns the cell's address in A1 notation.

        Returns
        -------
        str
            The cell's address in A1 notation.
        """
        return f'{get_column_letter(self.column)}{self.row}'



[docs]
    def replace(self, **kwargs) -> 'SpreadsheetCell':
        """
        Returns a new SpreadsheetCell with updated attributes.

        Parameters
        ----------
        **kwargs : dict
            The attributes to update.

        Returns
        -------
        SpreadsheetCell
            A new SpreadsheetCell with updated attributes.
        """
        return dataclasses.replace(self, **kwargs)



[docs]
    @classmethod
    def first_row(cls, cell_range):
        """
        Returns the first row of cells in the given range.

        Parameters
        ----------
        cell_range : str
            The range of cells in A1 notation.

        Returns
        -------
        tuple
            A tuple of SpreadsheetCell objects representing the first row.
        """
        table = list(cols_from_range(cell_range))
        first_row = [coordinate_to_tuple(rows[0]) for rows in table]
        return tuple(SpreadsheetCell(column=cell[1], row=cell[0], value=None) for cell in first_row)



[docs]
    @classmethod
    def first_column(cls, cell_range):
        """
        Returns the first column of cells in the given range.

        Parameters
        ----------
        cell_range : str
            The range of cells in A1 notation.

        Returns
        -------
        tuple
            A tuple of SpreadsheetCell objects representing the first column.
        """
        table = list(cols_from_range(cell_range))
        first_column = [coordinate_to_tuple(cell) for cell in table[0]]
        return tuple(SpreadsheetCell(column=cell[1], row=cell[0], value=None) for cell in first_column)



[docs]
    @classmethod
    def submatrix(cls, cell_range):
        """
        Returns the submatrix excluding the first row and column.

        Parameters
        ----------
        cell_range : str
            The range of cells in A1 notation.

        Returns
        -------
        tuple
            A tuple of SpreadsheetCell objects representing the submatrix.

        Examples
        --------
        the_range = "A1:C3"
        submatrix = SpreadsheetCell.submatrix(the_range)
        for cell in submatrix:
            print(cell.spreadsheet_cell(), cell.column, cell.row)

        B2 2 2
        C2 3 2
        B3 2 3
        C3 3 3
        """
        table = list(cols_from_range(cell_range))
        first_column = [coordinate_to_tuple(cell) for cell in table[0]]
        first_row = [coordinate_to_tuple(cols[0]) for cols in table]
        box = []
        for row in table:
            for column in row:
                row_idx, column_idx = coordinate_to_tuple(column)
                if (row_idx, column_idx) not in first_column and (row_idx, column_idx) not in first_row:
                    box.append(SpreadsheetCell(column=column_idx, row=row_idx, value=None))

        return tuple(sorted(box, key=lambda k: (k.row, k.column)))





[docs]
def iter_cells(first_column: str = 'E', left_padding: str = '') -> typing.Generator[str, None, None]:
    """ Returns spreadsheet column names from A up to ZZ
        Parameters:
        - first_column Letter of the first column to return. Default (E)
        - left_padding Padding added in front of single letter columns. Default empty
        Returns:
        - Generator supplying a column name
    """
    if not first_column:
        first_index = 0
    elif first_column.upper() not in string.ascii_uppercase:
        raise ValueError(f'Expected first_column {first_column} in {string.ascii_uppercase}')
    elif len(first_column) != 1:
        raise ValueError(f'Expected first_column of length 1 was: {len(first_column)}')
    else:
        first_index = string.ascii_uppercase.index(first_column.upper())
    for cell in string.ascii_uppercase[first_index:]:
        yield f'{left_padding}{cell}'
    for a, b in itertools.product(string.ascii_uppercase, repeat=2):
        yield a+b




[docs]
def detect_format_from_values(col_name, col_values, model):
    cell_format = ''
    if np.issubdtype(model[col_name].dtype, np.floating):
        cell_format = '#,##0.00'
        if col_values.max() > 1000.0:
            cell_format = '# ##0'
        elif 1.0 >= col_values.mean() >= -1.0:
            cell_format = '0.00%'
    elif np.issubdtype(model[col_name].dtype, np.integer):
        cell_format = '#,##0'

    return cell_format




[docs]
def find_max_column_width(col: typing.Tuple[Cell]):
    max_length = 5
    for cell in col:
        try:
            if cell.value is not None:
                cell_length = len(str(cell.value)) + 1
                if cell.data_type == 'n':
                    thousands = math.floor(math.log(1_000_000_0000, 1000))
                    cell_length = max(len(str(cell.value).split('.')[0]) + thousands, 2)
                if cell_length > max_length:
                    max_length = cell_length
        except (AttributeError, KeyError, IgnoredError, ValueError) as ex:
            logger.debug(f'Got error f{cell.column_letter}')
            logger.error(ex)
            pass
    return max_length




[docs]
def add_top_row_filter(workbook_file: pathlib.Path|str|None=None, workbook: Workbook| None=None, sheet_names: list[str] | None=None):
    if not workbook_file and not workbook_file:
        raise ValueError('add_top_row_filter require either workbook_file or workbook')

    wb = workbook if workbook else load_workbook(workbook_file)

    sheet_names = wb.sheetnames if not sheet_names else sheet_names
    for worksheet in sheet_names:
        ws = wb[worksheet]
        top_row = f'A1:{get_column_letter(ws.max_column)}{1}'
        ws.auto_filter.ref = top_row

    if not workbook and workbook_file:
        wb.save(workbook_file)




[docs]
def make_pretty(workbook_name: pathlib.Path|str):
    wb = load_workbook(workbook_name)

    header_font = Font(name='Source Sans Pro', size=11, bold=True, color="ffffff")
    body_font = Font(name='Source Sans Pro', size=11, bold=False, color="000000")

    for s in wb.sheetnames:
        ws = wb[s]
        # Freeze the top row
        ws.freeze_panes = ws['A2']

        # Define the fill color
        header_fill = PatternFill(start_color='c8102e', end_color='c8102e', fill_type='solid')
        odd_fill = PatternFill(start_color='ffd8de', end_color='ffd8de', fill_type='solid')
        even_fill = PatternFill(start_color='ffebee', end_color='ffebee', fill_type='solid')

        # Apply the fill color to the header row
        for cell in ws[1]:
            cell.fill = header_fill
            cell.font = header_font
        for row_number, row in enumerate(ws.rows):
            if row_number == 0:
                continue
            for column_number, cell in enumerate(row):
                cell.font = body_font
        even_rule = FormulaRule(formula=['MOD(ROW(),2)=0'], fill=even_fill)
        odd_rule = FormulaRule(formula=['MOD(ROW(),2)=1'], fill=odd_fill)
        worksheet_range = f'A2:{get_column_letter(ws.max_column)}{ws.max_row}'
        # logger.error(worksheet_range)
        ws.conditional_formatting.add(worksheet_range, odd_rule)
        ws.conditional_formatting.add(worksheet_range, even_rule)

        for col in ws.iter_cols(min_col=0):
            adjusted_width = find_max_column_width(col)
            ws.column_dimensions[col[0].column_letter].width = adjusted_width + 1.5
            # Skipping first row, assuming it is a header for now.
            first_column_value = col[0].value
            values = [int(r.value) for r in col[1:] if r.value and r.data_type == 'n']
            if values:
                max_min = max(values), min(values)
                number_format = ''
                if max_min[0] > 1000:
                    #cell.number_format = '#,##0.00;[Red]-#,##0.00;0'
                    number_format = r'_-* #,##0_-;[Red]\-* #,##0_-;_-* "-"??_-;_-@_-'
                elif max_min[0] <=1.0 and max_min[1] >= 0:
                    # number_format = '0%'
                    number_format = '0.000'
                elif max_min[0] <=1.0 and max_min[1] >=-1.0:
                    number_format = '0.000'

                if number_format:
                    for row_number, cell in enumerate(col):
                        if row_number < 1:
                            if cell.value == 'year':
                                break
                            continue
                        if True or cell.value:
                            if True or cell.data_type == 'n':
                                cell.number_format = number_format
                        else:
                            cell.number_format = "@"


    wb.save(workbook_name)