european_electricity/scripts/prep.py

import pandas as pd

# normalisation by the maximum value in the column
# allows a normalisation factor other than 1
# to give more weight to a more important feature / column in the subsequent clustering
def max_normalise(series, factor=1):
    """Scale *series* by its own maximum, then multiply by *factor*.

    Args:
        series: Object providing ``.max()`` and supporting division and
            multiplication (e.g. a pandas Series / dataframe column).
        factor: Value the maximum entry is mapped to. Defaults to 1; a
            different value changes the scale of this column relative to
            others.

    Returns:
        ``series / series.max() * factor``.
    """
    peak = series.max()
    scaled_to_peak = series / peak
    return scaled_to_peak * factor

# add country name in front of feature prior to merging to master table
def add_country_name(df, country_name):
    """Return a deep copy of *df* with every column prefixed by the country.

    Each column ``col`` becomes ``country_name + '_' + col``. The input
    dataframe is left unmodified.

    Args:
        df: Dataframe whose column labels are strings.
        country_name: String placed in front of each column label.

    Returns:
        A new dataframe with the renamed columns.
    """
    prefix = country_name + '_'
    duplicate = df.copy(deep=True)
    renaming = {label: prefix + label for label in duplicate.columns}
    return duplicate.rename(renaming, axis=1)

# merge to generate master df for clustering
def merge(list_dfs, year=None):
    """Concatenate dataframes column-wise into one master table for clustering.

    All dataframes must have the same number of rows, and that number must
    correspond to a full year of hourly data (8760 rows, or 8784 for a leap
    year). Columns whose entries are all identical are dropped from the
    merged result.

    Args:
        list_dfs: Sequence of dataframes to merge. It is iterated once for
            the length checks and then passed to ``pd.concat``.
        year: Optional integer year. When given, the merged dataframe is
            re-indexed with hourly timestamps from ``<year>-01-01 00:00:00``
            to ``<year>-12-31 23:00:00``. The generated index is
            timezone-naive (NOTE(review): the data is presumably in UTC --
            confirm against the data source).

    Returns:
        The merged dataframe.

    Raises:
        ValueError: If ``list_dfs`` is empty, if the dataframes differ in
            length, or if the common length is not 8760 or 8784. pandas
            also raises ValueError if the number of hours in ``year`` does
            not match the number of rows.
    """
    frames = iter(list_dfs)
    first = next(frames, None)
    if first is None:
        # An empty input previously leaked a bare StopIteration.
        raise ValueError('No dataframes to merge!')

    # check if all dfs have the same length
    n_rows = len(first)
    if any(len(frame) != n_rows for frame in frames):
        raise ValueError('Not all dataframes have same length!')

    # check if there is a whole year of hourly data (normal or leap year)
    if n_rows not in (8760, 8784):
        raise ValueError('Not a full year of data!')

    df = pd.concat(list_dfs, axis=1)

    # drop columns with identical entries (they carry no information)
    constant_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df = df.drop(constant_cols, axis=1)

    # reindex using an hourly datetime index if required
    if year is not None:
        hourly_index = pd.date_range(
            start='%d-01-01 00:00:00' % year,
            end='%d-12-31 23:00:00' % year,
            freq='60min',
        )
        # Re-index the merged frame itself (the original referenced an
        # undefined name here, raising NameError).
        df = df.set_index(hourly_index)

    return df