import pandas as pd
|
|
|
|
# normalisation with the maximum value in the column
|
|
# allows normalisation factor other than 1
|
|
# to account for more important feature / column in the subsequent clustering
|
|
def max_normalise(series, factor=1):
    """Scale *series* so that its maximum value equals *factor*.

    Dividing by the column maximum maps the values onto a common scale;
    the optional *factor* lets a more important feature / column carry
    extra weight in the subsequent clustering (default weight 1).
    """
    peak = series.max()
    return series / peak * factor
|
|
|
|
# add country name in front of feature prior to merging to master table
|
|
def add_country_name(df, country_name):
    """Return a copy of *df* with every column label prefixed by the country.

    Columns are renamed to ``'<country_name>_<column>'`` so that features
    from different countries remain distinguishable after merging into the
    master table. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table for a single country.
    country_name : str
        Prefix identifying the country.

    Returns
    -------
    pandas.DataFrame
        New frame with prefixed column labels.
    """
    # add_prefix already returns a new frame, so the explicit
    # copy(deep=True) of the original implementation is unnecessary;
    # it also handles non-string column labels by stringifying them.
    return df.add_prefix(country_name + '_')
|
|
|
|
# merge to generate master df for clustering
|
|
def merge(list_dfs, year=None):
    """Concatenate per-country feature frames into one master DataFrame.

    All frames must have the same number of rows, and that number must
    correspond to one full year of hourly data (8760 h, or 8784 h in a
    leap year). Columns whose entries are all identical carry no
    information for the clustering and are dropped.

    Parameters
    ----------
    list_dfs : list of pandas.DataFrame
        Frames to concatenate column-wise (axis=1).
    year : int, optional
        If given, the result is reindexed with an hourly datetime index
        covering that year. Per the original comment the timestamps are
        understood as UTC, though the index itself is timezone-naive.

    Returns
    -------
    pandas.DataFrame
        The merged master frame.

    Raises
    ------
    ValueError
        If *list_dfs* is empty, the frames differ in length, or the
        common length is not a full year of hourly data.
    """
    # guard: an empty list previously leaked a StopIteration from next()
    if not list_dfs:
        raise ValueError('Not all dataframes have same length!')

    # check if all dfs have the same length
    it = iter(list_dfs)
    the_len = len(next(it))
    if not all(len(frame) == the_len for frame in it):
        raise ValueError('Not all dataframes have same length!')

    # check if there is a whole year of data (8784 h in a leap year)
    if the_len not in (8760, 8784):
        raise ValueError('Not a full year of data!')

    df = pd.concat(list_dfs, axis=1)

    # drop columns with identical entries
    del_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df = df.drop(del_cols, axis=1)

    # reindex using UTC datetime if required
    if year is not None:
        # fixed: the original referenced an undefined name `df_AT` here,
        # raising NameError whenever `year` was supplied
        index = pd.date_range(start='%d-01-01 00:00:00' % year,
                              end='%d-12-31 23:00:00' % year,
                              freq='60min')
        df = df.set_index(index)

    return df
|
|
|
|
|