import pandas as pd


def max_normalise(series, factor=1):
    """Scale *series* by its maximum value, then multiply by *factor*.

    A factor other than 1 lets a more important feature/column carry more
    weight in the subsequent clustering.

    Args:
        series: Numeric pandas Series (or anything supporting ``.max()``
            and element-wise division).
        factor: Multiplier applied after dividing by the maximum.

    Returns:
        ``series / series.max() * factor``.
    """
    return series / series.max() * factor


def add_country_name(df, country_name):
    """Return a copy of *df* whose column names are prefixed with the country.

    Used before merging per-country frames into the master table so that
    identically named features stay distinguishable.

    Args:
        df: Source DataFrame; it is not modified.
        country_name: String prepended to every column name, followed by
            an underscore.

    Returns:
        A new DataFrame with columns renamed to ``<country_name>_<column>``.
    """
    prefix = country_name + '_'
    renamed = {col: prefix + col for col in df.columns}
    # ``rename`` returns a new frame, so the caller's ``df`` is left untouched.
    return df.rename(columns=renamed)


def merge(list_dfs, year=None):
    """Concatenate equally long, full-year frames into one master DataFrame.

    The frames are joined column-wise, columns holding a single repeated
    value are dropped, and the result is optionally re-indexed with an
    hourly datetime index for *year*.

    Args:
        list_dfs: Iterable of DataFrames, each with 8760 or 8784 rows
            (one value per hour of a normal or leap year).
        year: If given, the result is indexed by an hourly, timezone-naive
            ``DatetimeIndex`` from ``<year>-01-01 00:00`` through
            ``<year>-12-31 23:00``.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If *list_dfs* is empty, the frames differ in length, or
            the length is not a full year of hourly data. pandas also raises
            ``ValueError`` when the row count does not match the number of
            hours in *year*.
    """
    # Materialise once so a one-shot iterable is not exhausted by the length
    # check before it reaches ``pd.concat``.
    frames = list(list_dfs)
    if not frames:
        raise ValueError('No dataframes to merge!')

    the_len = len(frames[0])
    if any(len(frame) != the_len for frame in frames[1:]):
        raise ValueError('Not all dataframes have same length!')
    # 8760 h = normal year, 8784 h = leap year.
    if the_len not in (8760, 8784):
        raise ValueError('Not a full year of data!')

    df = pd.concat(frames, axis=1)

    # Drop columns with identical entries; they carry no information for
    # clustering.
    del_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df = df.drop(del_cols, axis=1)

    # Re-index using an hourly datetime index if required.
    if year is not None:
        hours = pd.date_range(
            start=f'{int(year)}-01-01 00:00:00',
            end=f'{int(year)}-12-31 23:00:00',
            freq='60min',
        )
        df = df.set_index(hours)
    return df