Example k-means reduction for AT and BE input data.

This commit is contained in:
Carmen 2021-01-11 22:12:55 +00:00
parent b1224b13bc
commit d5318bab56
8 changed files with 1252 additions and 1 deletion

2
.gitignore vendored

@@ -138,3 +138,5 @@ dmypy.json
# Cython debug symbols
cython_debug/
# backup files
.~lock.*#

README.md

@@ -1,3 +1,4 @@
# european_electricity
Python notebooks and scripts for the European Electricity project
Python notebooks and scripts for the European Electricity project.
In the notebook main_kmeans_clustering, the dataset (currently consisting of AT and BE only) is reduced with k-means clustering, and the cluster centres are used as the typical-day representatives.

1149
main_kmeans_clustering.ipynb Normal file

File diff suppressed because one or more lines are too long

31
scripts/kmeans.py Normal file

@@ -0,0 +1,31 @@
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def clustering(df, K=20):  # default 20 day types
    # flatten each day into one row: 24 hours x all columns
    N_cols = int(len(df.columns) * 24)
    N_days = int(len(df) / 24)
    M = df.values.ravel().reshape((N_days, N_cols))
    kmeans = KMeans(n_clusters=K, random_state=2468).fit(M)
    return kmeans.labels_, kmeans.cluster_centers_

def df_daily_label(v_labels, year):
    # create daily day-type label dataframe, one row per day of the year
    return pd.DataFrame(index=pd.date_range(start='%d-01-01 00:00:00' % year, end='%d-12-31 23:00:00' % year, freq='1D'),
                        columns=['daytype'], data=v_labels)

def df_centres(df_original, v_labels, m_centres):
    # rebuild the hourly profile by concatenating each day's cluster centre according to its day-type label
    N_days = int(len(df_original) / 24)
    N_cols = int(len(df_original.columns))
    for i in range(0, N_days):
        if i == 0:
            M_reduced = m_centres[v_labels[i]].reshape((24, N_cols))
        else:
            M_reduced = np.vstack((M_reduced, m_centres[v_labels[i]].reshape((24, N_cols))))
    return pd.DataFrame(index=df_original.index, columns=df_original.columns, data=M_reduced)
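
A minimal usage sketch of the helpers above, not part of this commit: the random data, the 2019 year, and the AT_load / BE_load column names are placeholders standing in for the real master dataframe, and the import assumes the repository root is on the Python path.

import numpy as np
import pandas as pd
from scripts.kmeans import clustering, df_daily_label, df_centres  # assumes repo root on sys.path

# placeholder hourly dataframe covering one non-leap year (8760 rows, 2 columns)
rng = pd.date_range('2019-01-01 00:00:00', '2019-12-31 23:00:00', freq='60min')
df = pd.DataFrame(np.random.rand(len(rng), 2), index=rng, columns=['AT_load', 'BE_load'])

labels, centres = clustering(df, K=20)        # one label per day; centres has shape (20, 48)
daytypes = df_daily_label(labels, 2019)       # daily 'daytype' dataframe, one row per day
df_reduced = df_centres(df, labels, centres)  # hourly profile rebuilt from the cluster centres
print(df_reduced.shape)                       # (8760, 2), same shape as the input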

46
scripts/prep.py Normal file

@@ -0,0 +1,46 @@
import pandas as pd

# normalisation with the maximum value in the column
# allows a normalisation factor other than 1
# to give extra weight to a more important feature / column in the subsequent clustering
def max_normalise(series, factor=1):
    return series / series.max() * factor

# add the country name in front of each feature prior to merging into the master table
def add_country_name(df, country_name):
    _df = df.copy(deep=True)
    old_columns = _df.columns
    new_columns = [country_name + '_' + col for col in old_columns]
    return _df.rename(dict(zip(old_columns, new_columns)), axis=1)

# merge to generate the master df for clustering
def merge(list_dfs, year=None):
    # check if all dfs have the same length
    it = iter(list_dfs)
    the_len = len(next(it))
    if all(len(l) == the_len for l in it):
        # check if there is a whole year of data
        if the_len == 8760 or the_len == 8784:
            df = pd.concat(list_dfs, axis=1)
            # drop columns with identical entries
            del_cols = list()
            for col in df.columns:
                if len(df[col].unique()) == 1:
                    del_cols.append(col)
            df = df.drop(del_cols, axis=1)
            # reindex using UTC datetime if required
            if year is not None:
                df = df.set_index(pd.date_range(start='%d-01-01 00:00:00' % year, end='%d-12-31 23:00:00' % year, freq='60min'))
            return df
        else:
            raise ValueError('Not a full year of data!')
    else:
        raise ValueError('Not all dataframes have same length!')
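
A hedged sketch of how these helpers could be chained to build the master table; the frames, column names, and year below are illustrative placeholders, not data from this commit, and the import assumes the repository root is on the Python path.

import numpy as np
import pandas as pd
from scripts.prep import max_normalise, add_country_name, merge  # assumes repo root on sys.path

hours = 8760  # one non-leap year of hourly data
df_AT = pd.DataFrame({'load': np.random.rand(hours), 'solar': np.random.rand(hours)})
df_BE = pd.DataFrame({'load': np.random.rand(hours), 'solar': np.random.rand(hours)})

# normalise every column to at most 1 so no single feature dominates the clustering
df_AT = df_AT.apply(max_normalise)
df_BE = df_BE.apply(max_normalise)

# prefix the country code and merge into one master dataframe with a UTC index
master = merge([add_country_name(df_AT, 'AT'), add_country_name(df_BE, 'BE')], year=2019)
print(master.columns.tolist())  # ['AT_load', 'AT_solar', 'BE_load', 'BE_solar']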

22
scripts/read.py Normal file

@@ -0,0 +1,22 @@
import pandas as pd

# read a zonal data file, allows for different indices
# for now there are two options: timestep (1-8760) or UTC
# a local-time option can be added in the script if needed
def zonal_data(file_path, datetime_index=False):
    if datetime_index is False:
        df = pd.read_excel(file_path, index_col=0)
    elif datetime_index == 'UTC':
        df = pd.read_excel(file_path)
        df[df.columns[0]] = pd.to_datetime(df[df.columns[0]])
        df = df.set_index(df.columns[0])
    else:
        raise ValueError('datetime_index must be False or "UTC"')
    # drop columns that are identically zero
    del_cols = list()
    for col in df.columns:
        if (df[col] == 0).all():
            print(col + ' is identically zero.')
            del_cols.append(col)
    df = df.drop(del_cols, axis=1)
    return df
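
A small usage sketch, assuming an Excel export with the UTC timestamp in the first column; the file path is a placeholder and not a file included in this commit, and the import assumes the repository root is on the Python path.

from scripts.read import zonal_data  # assumes repo root on sys.path

df_at = zonal_data('data/AT_zonal.xlsx')                            # timestep (1-8760) index
df_at_utc = zonal_data('data/AT_zonal.xlsx', datetime_index='UTC')  # UTC datetime index
print(df_at_utc.index[0], df_at_utc.shape)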

Binary file not shown.

Binary file not shown.