added selection.py to select the best sample from the random samples according to user-defined selection rules

rearranged main.py
This commit is contained in:
Carmen 2021-06-02 11:28:19 +01:00
parent 7a41174593
commit 9082e92f90
11 changed files with 35205 additions and 35062 deletions

80
baseline.py Normal file
View File

@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
import pickle
import os
import scripts.read as read
from scripts.misc import *
from scripts.selection import *
# Build the baseline year: for every data column, replace the original time
# series with the best random sample according to the user-defined selection
# rule for that column (from baseline_selections.csv), then rescale.
# load clustering daylabels
with open('intermediate/kmeans30_daylabels.pickle', 'rb') as handle:
    daylabels = pickle.load(handle)
# load valid draws
with open('intermediate/kmeans30_valid_draw_samples.pickle', 'rb') as handle:
    valid_draws = pickle.load(handle)
# load target df
df_raw = read.load_csv('input/data_mod.csv', index_col='UTC')
# expand one label per day into one label per hour (24 hours/day)
df_raw['daylabel'] = np.array([np.repeat(i, 24) for i in daylabels]).ravel()
# load selection rules: maps a column-name keyword -> selection function name
df_rule = pd.read_csv('baseline_selections.csv', usecols=[0, 1], index_col='keyword')
keywords = list(df_rule.index)
dict_rule = dict(zip(keywords, df_rule['selection_rule'].values))
# array of daytypes
daytypes = np.sort(df_raw['daylabel'].unique())
# max VRE CFs
with open('input/VRE_max_CF.pickle', 'rb') as handle:
    VRE_max = pickle.load(handle)
# initialise output dataframe
df_baseline = pd.DataFrame(index=df_raw.index, columns=df_raw.columns)
df_baseline.index.name = 'UTC'
df_baseline['daylabel'] = df_raw['daylabel']
for col in df_raw.columns:
    if col != 'daylabel':
        for keyword in keywords:
            if keyword in col:
                selection_rule = dict_rule[keyword]
                # NOTE(review): rule names from the CSV are resolved to the
                # selection functions imported from scripts.selection via a
                # globals() lookup — replaces eval() so arbitrary expressions
                # in the CSV cannot execute.
                if 'yearly' in selection_rule:
                    # yearly rules compare whole-year reconstructed series
                    s_original = df_raw[col]
                    df_samples = read.load_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' % col, index_col='UTC')
                    s = globals()[selection_rule](s_original, df_samples)
                    df_baseline[col] = s
                    print(col)
                elif 'cluster' in selection_rule:
                    # cluster rules pick one 24h profile per day type and
                    # tile it over all days of that type
                    for dtype in daytypes:
                        dtype_idx = df_raw.loc[df_raw['daylabel'] == dtype].index
                        N_dtype_days = int(len(dtype_idx) / 24)
                        M_original = col_to_M(df_raw.loc[df_raw['daylabel'] == dtype, col])
                        if selection_rule == 'cluster_mean':
                            best_sample = cluster_mean(M_original)
                        else:
                            M_draw = valid_draws[col][dtype]
                            # bug fix: result was assigned to the misspelled
                            # name 'beset_sample', which made the
                            # cluster_mean branch crash on np.tile below
                            best_sample = globals()[selection_rule](M_original, M_draw)
                        df_baseline.loc[dtype_idx, col] = np.tile(best_sample, N_dtype_days)
                    print(col)
                else:
                    raise ValueError(col, " Please check the selection rule")
                if 'VRE' in col:
                    # scale the normalised profile by the max capacity factor;
                    # bug fix: use df_baseline[col] instead of the local 's',
                    # which is only defined in the 'yearly' branch
                    df_baseline[col] = df_baseline[col] * VRE_max[col]
                    df_baseline[col] = df_baseline[col].clip(0., 1.)
                else:
                    # normalise demand columns to an annual total of 1,000,000
                    col_total = df_baseline[col].sum()
                    df_baseline[col] = df_baseline[col] / col_total * 1000000
# check if output directory exists
if not os.path.exists('output'):
    os.makedirs('output')
# bug fix: output path was 'output/.csv' — a hidden, extension-only file
df_baseline.to_csv('output/baseline.csv')

14
baseline_selections.csv Normal file
View File

@ -0,0 +1,14 @@
keyword,selection_rule,note
shore,yearly_std,both on- and offshore wind
pv,yearly_rmse,both rooftop (rtpv) and utility (upv)
RES_heating,yearly_std,
COM_heating,cluster_rmse,
cooling,yearly_rmse,both COM and RES
RES_SE,cluster_mean,
COM_SE,cluster_rmse,
RES_shw,cluster_rmse,
COM_shw,cluster_mean,
RES_cooking,cluster_mean,all profiles are identical
COM_catering,cluster_mean,
IND,yearly_rmse,
TRN,cluster_rmse,all three types of transport
1 keyword selection_rule note
2 shore yearly_std both on- and offshore wind
3 pv yearly_rmse both rooftop (rtpv) and utility (upv)
4 RES_heating yearly_std
5 COM_heating cluster_rmse
6 cooling yearly_rmse both COM and RES
7 RES_SE cluster_mean
8 COM_SE cluster_rmse
9 RES_shw cluster_rmse
10 COM_shw cluster_mean
11 RES_cooking cluster_mean all profiles are identical
12 COM_catering cluster_mean
13 IND yearly_rmse
14 TRN cluster_rmse all three types of transport

BIN
input/VRE_max_CF.pickle Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

8761
input/data_clustering_UK.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

8761
input/data_mod_UK.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

40
main.py
View File

@ -161,6 +161,28 @@ for col in [col for col in df_draw.columns if 'label' not in col]:
print(col)
#######
# OUTPUT
#######
# create output sub directory if not already exists
if not os.path.exists('intermediate'):
os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
############
# RANDOM YEARLY TIME SERIES
@ -215,23 +237,5 @@ for col in df_draw.columns:
#######
# OUTPUT
#######
# create output sub directory if not already exists
if not os.path.exists('intermediate'):
os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)

45
scripts/selection.py Normal file
View File

@@ -0,0 +1,45 @@
import numpy as np
from sklearn.metrics import mean_squared_error
# cluster mean
def cluster_mean(M_original):
    """Return the representative daily profile for a day type: the
    element-wise mean over all days (rows) of *M_original*."""
    return M_original.mean(axis=0)
# smallest root mean square error with respect to the original series
def cluster_rmse(M_original, M_draw):
    """Return the candidate daily profile from *M_draw* whose repetition
    over all days of *M_original* has the smallest mean squared error.

    Parameters
    ----------
    M_original : ndarray, shape (N_days, 24)
        Observed daily profiles for one day type.
    M_draw : ndarray, shape (N_candidates, 24) or (24,)
        Candidate profiles; a 1-D array means a single unique candidate.

    Returns
    -------
    ndarray, shape (24,)
        The best candidate profile.
    """
    if M_draw.shape == (24,):  # single unique row: nothing to choose
        return M_draw
    # MSE of each candidate against every original day, averaged over days
    # and hours — equivalent to the old per-candidate sklearn loop, but one
    # vectorised expression instead of O(n^2) np.append calls, and with no
    # sklearn dependency.
    diff = M_original[np.newaxis, :, :] - M_draw[:, np.newaxis, :]
    v_mse = np.mean(diff ** 2, axis=(1, 2))
    return M_draw[np.argmin(v_mse)]
# randomly reconstructed yearly time series with the least rmse with respect to the original series
def yearly_rmse(s_original, df_samples):
    """Return the sample column of *df_samples* with the smallest mean
    squared error relative to *s_original*.

    Comparison is positional (on values), matching the original
    sklearn-based implementation; indices are not aligned.
    """
    target = np.asarray(s_original, dtype=float)
    # per-column MSE as a comprehension instead of np.append in a loop,
    # and plain numpy instead of the sklearn dependency
    v_mse = np.array([np.mean((target - np.asarray(df_samples.iloc[:, n], dtype=float)) ** 2)
                      for n in range(df_samples.shape[1])])
    return df_samples.iloc[:, np.argmin(v_mse)]
# randomly reconstructed yearly time series with the closest std with respect to the original series
def yearly_std(s_original, df_samples):
    """Return the sample column of *df_samples* whose (sample) standard
    deviation is closest to that of *s_original*."""
    original_std = s_original.std()
    # column-wise |std difference| in one vectorised pass instead of
    # np.append in a loop; DataFrame.std uses ddof=1, same as Series.std
    v_diff_std = (df_samples.std() - original_std).abs().values
    return df_samples.iloc[:, np.argmin(v_diff_std)]