Example k-means reduction for AT and BE input data.
This commit is contained in:
parent
b1224b13bc
commit
d5318bab56
|
@ -138,3 +138,5 @@ dmypy.json
|
||||||
# Cython debug symbols
|
# Cython debug symbols
|
||||||
cython_debug/
|
cython_debug/
|
||||||
|
|
||||||
|
# backup files
|
||||||
|
.~lock.*#
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
# european_electricity
|
# european_electricity
|
||||||
|
|
||||||
Python notebooks and scripts for the European Electricity project
|
Python notebooks and scripts for the European Electricity project.
|
||||||
|
In the notebook main_kmeans_clustering, the dataset (consisting of AT and BE only at the moment) is reduced using k-means clustering and the cluster centres are used as the typical-day representatives.
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,31 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from sklearn.cluster import KMeans
|
||||||
|
|
||||||
|
def clustering(df, K=20):  # default 20 day types
    """Reduce an hourly dataframe to K typical day types via k-means.

    Each calendar day (24 hourly rows across all columns) is flattened
    into a single sample vector, and the samples are clustered with
    k-means; the cluster centres serve as typical-day representatives.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly data whose length is a multiple of 24.
    K : int, optional
        Number of clusters / day types (default 20).

    Returns
    -------
    tuple
        (labels, cluster_centres) of the fitted KMeans model: one label
        per day, and one flattened-day centre per cluster.
    """
    n_days = int(len(df) / 24)
    day_width = int(len(df.columns) * 24)  # flatten each day: 24 h x n features
    samples = df.values.ravel().reshape((n_days, day_width))
    # fixed random_state keeps the reduction reproducible between runs
    model = KMeans(n_clusters=K, random_state=2468).fit(samples)
    return model.labels_, model.cluster_centers_
|
||||||
|
|
||||||
|
def df_daily_label(v_labels, year):  # create daily day type label dataframe
    """Map each day of *year* to its day-type label.

    Parameters
    ----------
    v_labels : sequence
        One cluster label per day of the year, in calendar order.
    year : int
        Calendar year used to build the daily index.

    Returns
    -------
    pandas.DataFrame
        One row per day of the year, single column 'daytype'.
    """
    daily_index = pd.date_range(
        start='%d-01-01 00:00:00' % year,
        end='%d-12-31 23:00:00' % year,
        freq='1D',
    )
    return pd.DataFrame(index=daily_index, columns=['daytype'], data=v_labels)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def df_centres(df_original, v_labels, m_centres):  # concatenate each day according to day type label
    """Rebuild an hourly dataframe with every day replaced by its cluster centre.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Original hourly data; supplies the index, columns and sizes.
    v_labels : sequence of int
        Day-type label for each day of df_original, in order.
    m_centres : numpy.ndarray
        Cluster centres; each row is one flattened day (24 h x n columns).

    Returns
    -------
    pandas.DataFrame
        Same index/columns as df_original, data taken from the centres.
    """
    n_days = int(len(df_original) / 24)
    n_features = int(len(df_original.columns))
    # unflatten the assigned centre of each day back to a (24, n_features) block
    daily_blocks = [
        m_centres[v_labels[day]].reshape((24, n_features))
        for day in range(n_days)
    ]
    stacked = np.vstack(daily_blocks)
    return pd.DataFrame(index=df_original.index,
                        columns=df_original.columns,
                        data=stacked)
|
|
@ -0,0 +1,46 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# normalisation with the the maximum value in the column
|
||||||
|
# allows normalisation factor other than 1
|
||||||
|
# to account for more important feature / column in the subsequent clustering
|
||||||
|
def max_normalise(series, factor=1):
    """Scale *series* so its maximum equals *factor*.

    Normalisation with the maximum value in the column; a factor other
    than 1 lets a more important feature/column carry more weight in
    the subsequent clustering.
    """
    peak = series.max()
    return series / peak * factor
|
||||||
|
|
||||||
|
# add country name in front of feature prior to merging to master table
|
||||||
|
def add_country_name(df, country_name):
    """Return a copy of *df* with every column prefixed by the country name.

    Used prior to merging a zonal dataframe into the master table so
    each column's origin stays identifiable. The input dataframe is
    left untouched.
    """
    renamed = df.copy(deep=True)
    mapping = {col: country_name + '_' + col for col in renamed.columns}
    return renamed.rename(mapping, axis=1)
|
||||||
|
|
||||||
|
# merge to generate master df for clustering
|
||||||
|
def merge(list_dfs, year=None):  # merge to generate master df for clustering
    """Merge per-zone dataframes column-wise into one master dataframe.

    All dataframes must have the same length, and that length must be a
    whole year of hourly data (8760 h, or 8784 h in a leap year).
    Columns whose entries are all identical carry no information for
    clustering and are dropped.

    Parameters
    ----------
    list_dfs : list of pandas.DataFrame
        The zonal dataframes to concatenate side by side.
    year : int, optional
        If given, the result is reindexed with hourly UTC timestamps
        spanning that year.

    Returns
    -------
    pandas.DataFrame
        The merged master dataframe.

    Raises
    ------
    ValueError
        If the dataframes differ in length, or the common length is not
        a full year of hourly data.
    """
    # check if all dfs have the same length
    it = iter(list_dfs)
    the_len = len(next(it))
    if not all(len(l) == the_len for l in it):
        raise ValueError('Not all dataframes have same length!')

    # check if there is a whole year of data
    if the_len != 8760 and the_len != 8784:
        raise ValueError('Not a full year of data!')

    df = pd.concat(list_dfs, axis=1)

    # drop columns with identical entries (constant => useless for clustering)
    del_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df = df.drop(del_cols, axis=1)

    # reindex using UTC datetime if required
    # BUG FIX: the original referenced the undefined name `df_AT` here,
    # raising NameError whenever a year was supplied.
    if year is not None:
        df = df.set_index(pd.date_range(start='%d-01-01 00:00:00' % year,
                                        end='%d-12-31 23:00:00' % year,
                                        freq='60min'))

    return df
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# read data file, allows for different indices
|
||||||
|
# for now there are two options: timestep (1-8760) or UTC
|
||||||
|
# can add local time option in the script if needed
|
||||||
|
def zonal_data(file_path, datetime_index=False):
    """Read one zone's Excel data file into a dataframe.

    Allows for different indices; for now there are two options:
    timestep (1-8760, the default, using the file's first column as
    index) or 'UTC' (first column parsed as datetimes). A local-time
    option can be added in the script if needed. Identically-zero
    columns are reported and removed.

    Parameters
    ----------
    file_path : str or path-like
        Path of the Excel file to read.
    datetime_index : bool or str, optional
        False -> index by the first column as-is;
        'UTC' -> parse the first column as datetimes and index by it.

    Returns
    -------
    pandas.DataFrame
        The file contents with identically-zero columns dropped.
    """
    if datetime_index is False:
        df = pd.read_excel(file_path, index_col=0)
    elif datetime_index == 'UTC':
        df = pd.read_excel(file_path)
        first_col = df.columns[0]
        df[first_col] = pd.to_datetime(df[first_col])
        df = df.set_index(first_col)

    # drop identically 0 columns
    zero_cols = []
    for col in df.columns:
        if (df[col] == 0).all():
            print(col + ' is identically zero.')
            zero_cols.append(col)
    df = df.drop(zero_cols, axis=1)

    return df
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue