# european_data/baseline.py
#
# 81 lines
# 2.9 KiB
# Python

import pandas as pd
import numpy as np
import pickle
import os
import scripts.read as read
from scripts.misc import *
from scripts.selection import *
# ---------------------------------------------------------------------------
# Load all inputs and prepare the (still empty) baseline dataframe.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# produced by this project's own pipeline.
# ---------------------------------------------------------------------------

# load clustering daylabels
with open('intermediate/kmeans30_daylabels.pickle', 'rb') as handle:
    daylabels = pickle.load(handle)

# load valid draws (indexed below as valid_draws[column][daytype])
with open('intermediate/kmeans30_valid_draw_samples.pickle', 'rb') as handle:
    valid_draws = pickle.load(handle)

# load target df
df_raw = read.load_csv('input/data_mod.csv', index_col='UTC')
# Each entry of `daylabels` is repeated 24 times to label every row of
# df_raw (presumably one label per day of hourly data -- TODO confirm).
df_raw['daylabel'] = np.repeat(daylabels, 24)

# load selection rules: column-name keyword -> name of the selection rule
df_rule = pd.read_csv('baseline_selections.csv', usecols=[0, 1], index_col='keyword')
keywords = list(df_rule.index)
dict_rule = dict(zip(keywords, df_rule['selection_rule'].values))

# array of daytypes (sorted unique day labels)
daytypes = np.sort(df_raw['daylabel'].unique())

# max VRE CFs (looked up per column name when rescaling 'VRE' columns)
with open('input/VRE_max_CF.pickle', 'rb') as handle:
    VRE_max = pickle.load(handle)

# initialise output dataframe with the same index/columns as the raw data;
# the day labels are copied over unchanged
df_baseline = pd.DataFrame(index=df_raw.index, columns=df_raw.columns)
df_baseline.index.name = 'UTC'
df_baseline['daylabel'] = df_raw['daylabel']
# ---------------------------------------------------------------------------
# Fill df_baseline column by column.
#
# For every data column the selection rule of the matching keyword decides how
# the baseline series is chosen:
#   * rules containing 'yearly'  -> pick from pre-generated yearly samples
#   * rules containing 'cluster' -> pick one sample per day type and tile it
#                                   over all rows of that day type
# Afterwards 'VRE' columns are rescaled by their maximum capacity factor and
# clipped to [0, 1]; all other columns are normalised to sum to 1,000,000.
#
# NOTE(review): the original indentation of this script was lost; the nesting
# below (rule dispatch and post-processing run once per column, after the
# keyword scan) is reconstructed -- please confirm.
# NOTE(review): eval() turns the rule name from baseline_selections.csv into
# a callable (presumably provided by the star imports); that CSV must
# therefore be trusted.
# ---------------------------------------------------------------------------
for col in df_raw.columns:
    # 'daylabel' is bookkeeping, not a time series to be selected
    if col == 'daylabel':
        continue

    # Find the rule for this column; if several keywords match, the last one
    # wins. Reset per column so a rule from a previous column cannot be
    # silently reused when nothing matches.
    selection_rule = None
    for keyword in keywords:
        if keyword in col:
            selection_rule = dict_rule[keyword]
    if not isinstance(selection_rule, str):
        # no keyword matched, or the rule cell in the CSV is empty
        raise ValueError(col, " Please check the selection rule")

    if 'yearly' in selection_rule:
        s_original = df_raw[col]
        df_samples = read.load_csv(
            'intermediate/random_yearly_dfs/random_yearly_%s.csv' % col,
            index_col='UTC')
        df_baseline[col] = eval(selection_rule)(s_original, df_samples)
        print(col)
    elif 'cluster' in selection_rule:
        for dtype in daytypes:
            is_dtype = df_raw['daylabel'] == dtype
            dtype_idx = df_raw.loc[is_dtype].index
            # number of days of this type (24 rows per day)
            N_dtype_days = len(dtype_idx) // 24
            M_original = col_to_M(df_raw.loc[is_dtype, col])
            if selection_rule == 'cluster_mean':
                best_sample = cluster_mean(M_original)
            else:
                M_draw = valid_draws[col][dtype]
                best_sample = eval(selection_rule)(M_original, M_draw)
            # Both branches must feed the same variable here; the original
            # read a differently spelled name than 'cluster_mean' assigned.
            df_baseline.loc[dtype_idx, col] = np.tile(best_sample, N_dtype_days)
        print(col)
    else:
        raise ValueError(col, " Please check the selection rule")

    # Post-processing of the freshly written column. It works on
    # df_baseline[col] rather than the yearly-only local series so that
    # cluster-selected columns are rescaled with their own values.
    if 'VRE' in col:
        df_baseline[col] = df_baseline[col] * VRE_max[col]
        df_baseline[col] = df_baseline[col].clip(0., 1.)
    else:
        col_total = df_baseline[col].sum()
        df_baseline[col] = df_baseline[col] / col_total * 1000000
# Make sure the output directory exists; exist_ok avoids a separate
# existence check and does not fail when the directory is already there.
os.makedirs('output', exist_ok=True)

# Write the baseline dataframe.
# NOTE(review): the file name has no stem ('output/.csv'), which looks like a
# missing name; kept unchanged so downstream readers keep working -- confirm
# the intended file name.
df_baseline.to_csv('output/.csv')