added selection.py to select the best sample from the random samples according to user-defined selection rules

rearranged main.py
This commit is contained in:
Carmen 2021-06-02 11:28:19 +01:00
parent 7a41174593
commit 9082e92f90
11 changed files with 35205 additions and 35062 deletions

80
baseline.py Normal file
View File

@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
import pickle
import os
import scripts.read as read
from scripts.misc import *
from scripts.selection import *
# Build the baseline year: for every data column, replace the original time
# series with the best random sample according to the user-defined selection
# rule for that column (from baseline_selections.csv), then rescale.
# load clustering daylabels
with open('intermediate/kmeans30_daylabels.pickle', 'rb') as handle:
    daylabels = pickle.load(handle)
# load valid draws
with open('intermediate/kmeans30_valid_draw_samples.pickle', 'rb') as handle:
    valid_draws = pickle.load(handle)
# load target df
df_raw = read.load_csv('input/data_mod.csv', index_col='UTC')
# expand one label per day into one label per hour (24 hours/day)
df_raw['daylabel'] = np.array([np.repeat(i, 24) for i in daylabels]).ravel()
# load selection rules: maps a column-name keyword -> selection function name
df_rule = pd.read_csv('baseline_selections.csv', usecols=[0, 1], index_col='keyword')
keywords = list(df_rule.index)
dict_rule = dict(zip(keywords, df_rule['selection_rule'].values))
# array of daytypes
daytypes = np.sort(df_raw['daylabel'].unique())
# max VRE CFs
with open('input/VRE_max_CF.pickle', 'rb') as handle:
    VRE_max = pickle.load(handle)
# initialise output dataframe
df_baseline = pd.DataFrame(index=df_raw.index, columns=df_raw.columns)
df_baseline.index.name = 'UTC'
df_baseline['daylabel'] = df_raw['daylabel']
for col in df_raw.columns:
    if col != 'daylabel':
        for keyword in keywords:
            if keyword in col:
                selection_rule = dict_rule[keyword]
                # NOTE(review): rule names from the CSV are resolved to the
                # selection functions imported from scripts.selection via a
                # globals() lookup — replaces eval() so arbitrary expressions
                # in the CSV cannot execute.
                if 'yearly' in selection_rule:
                    # yearly rules compare whole-year reconstructed series
                    s_original = df_raw[col]
                    df_samples = read.load_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' % col, index_col='UTC')
                    s = globals()[selection_rule](s_original, df_samples)
                    df_baseline[col] = s
                    print(col)
                elif 'cluster' in selection_rule:
                    # cluster rules pick one 24h profile per day type and
                    # tile it over all days of that type
                    for dtype in daytypes:
                        dtype_idx = df_raw.loc[df_raw['daylabel'] == dtype].index
                        N_dtype_days = int(len(dtype_idx) / 24)
                        M_original = col_to_M(df_raw.loc[df_raw['daylabel'] == dtype, col])
                        if selection_rule == 'cluster_mean':
                            best_sample = cluster_mean(M_original)
                        else:
                            M_draw = valid_draws[col][dtype]
                            # bug fix: result was assigned to the misspelled
                            # name 'beset_sample', which made the
                            # cluster_mean branch crash on np.tile below
                            best_sample = globals()[selection_rule](M_original, M_draw)
                        df_baseline.loc[dtype_idx, col] = np.tile(best_sample, N_dtype_days)
                    print(col)
                else:
                    raise ValueError(col, " Please check the selection rule")
                if 'VRE' in col:
                    # scale the normalised profile by the max capacity factor;
                    # bug fix: use df_baseline[col] instead of the local 's',
                    # which is only defined in the 'yearly' branch
                    df_baseline[col] = df_baseline[col] * VRE_max[col]
                    df_baseline[col] = df_baseline[col].clip(0., 1.)
                else:
                    # normalise demand columns to an annual total of 1,000,000
                    col_total = df_baseline[col].sum()
                    df_baseline[col] = df_baseline[col] / col_total * 1000000
# check if output directory exists
if not os.path.exists('output'):
    os.makedirs('output')
# bug fix: output path was 'output/.csv' — a hidden, extension-only file
df_baseline.to_csv('output/baseline.csv')

14
baseline_selections.csv Normal file
View File

@ -0,0 +1,14 @@
keyword,selection_rule,note
shore,yearly_std,both on- and offshore wind
pv,yearly_rmse,both rooftop (rtpv) and utility (upv)
RES_heating,yearly_std,
COM_heating,cluster_rmse,
cooling,yearly_rmse,both COM and RES
RES_SE,cluster_mean,
COM_SE,cluster_rmse,
RES_shw,cluster_rmse,
COM_shw,cluster_mean,
RES_cooking,cluster_mean,all profiles are identical
COM_catering,cluster_mean,
IND,yearly_rmse,
TRN,cluster_rmse,all three types of transport
1 keyword selection_rule note
2 shore yearly_std both on- and offshore wind
3 pv yearly_rmse both rooftop (rtpv) and utility (upv)
4 RES_heating yearly_std
5 COM_heating cluster_rmse
6 cooling yearly_rmse both COM and RES
7 RES_SE cluster_mean
8 COM_SE cluster_rmse
9 RES_shw cluster_rmse
10 COM_shw cluster_mean
11 RES_cooking cluster_mean all profiles are identical
12 COM_catering cluster_mean
13 IND yearly_rmse
14 TRN cluster_rmse all three types of transport

BIN
input/VRE_max_CF.pickle Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

8761
input/data_clustering_UK.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

8761
input/data_mod_UK.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

40
main.py
View File

@ -161,6 +161,28 @@ for col in [col for col in df_draw.columns if 'label' not in col]:
print(col)
#######
# OUTPUT
#######
# create output sub directory if not already exists
if not os.path.exists('intermediate'):
os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
############
# RANDOM YEARLY TIME SERIES
@ -215,23 +237,5 @@ for col in df_draw.columns:
#######
# OUTPUT
#######
# create output sub directory if not already exists
if not os.path.exists('intermediate'):
os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)

45
scripts/selection.py Normal file
View File

@@ -0,0 +1,45 @@
import numpy as np
from sklearn.metrics import mean_squared_error
# cluster mean
def cluster_mean(M_original):
    """Return the representative daily profile for a day type: the
    element-wise mean over all days (rows) of *M_original*."""
    return M_original.mean(axis=0)
# smallest root mean square error with respect to the original series
def cluster_rmse(M_original, M_draw):
    """Return the candidate daily profile from *M_draw* whose repetition
    over all days of *M_original* has the smallest mean squared error.

    Parameters
    ----------
    M_original : ndarray, shape (N_days, 24)
        Observed daily profiles for one day type.
    M_draw : ndarray, shape (N_candidates, 24) or (24,)
        Candidate profiles; a 1-D array means a single unique candidate.

    Returns
    -------
    ndarray, shape (24,)
        The best candidate profile.
    """
    if M_draw.shape == (24,):  # single unique row: nothing to choose
        return M_draw
    # MSE of each candidate against every original day, averaged over days
    # and hours — equivalent to the old per-candidate sklearn loop, but one
    # vectorised expression instead of O(n^2) np.append calls, and with no
    # sklearn dependency.
    diff = M_original[np.newaxis, :, :] - M_draw[:, np.newaxis, :]
    v_mse = np.mean(diff ** 2, axis=(1, 2))
    return M_draw[np.argmin(v_mse)]
# randomly reconstructed yearly time series with the least rmse with respect to the original series
def yearly_rmse(s_original, df_samples):
    """Return the sample column of *df_samples* with the smallest mean
    squared error relative to *s_original*.

    Comparison is positional (on values), matching the original
    sklearn-based implementation; indices are not aligned.
    """
    target = np.asarray(s_original, dtype=float)
    # per-column MSE as a comprehension instead of np.append in a loop,
    # and plain numpy instead of the sklearn dependency
    v_mse = np.array([np.mean((target - np.asarray(df_samples.iloc[:, n], dtype=float)) ** 2)
                      for n in range(df_samples.shape[1])])
    return df_samples.iloc[:, np.argmin(v_mse)]
# randomly reconstructed yearly time series with the closest std with respect to the original series
def yearly_std(s_original, df_samples):
    """Return the sample column of *df_samples* whose (sample) standard
    deviation is closest to that of *s_original*."""
    original_std = s_original.std()
    # column-wise |std difference| in one vectorised pass instead of
    # np.append in a loop; DataFrame.std uses ddof=1, same as Series.std
    v_diff_std = (df_samples.std() - original_std).abs().values
    return df_samples.iloc[:, np.argmin(v_diff_std)]