242 lines
6.8 KiB
Python
242 lines
6.8 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import os
|
|
import pickle
|
|
import random
|
|
|
|
|
|
import scripts.read as read
|
|
from scripts.sampling import *
|
|
from scripts.misc import *
|
|
|
|
from sklearn.cluster import KMeans, DBSCAN
|
|
|
|
|
|
|
|
#######
|
|
#Clustering
|
|
#######
|
|
|
|
|
|
|
|
# load clustering data
|
|
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
|
|
|
|
|
|
# TRAINING MATRIX
|
|
|
|
## select heating columns to be repeated 3 times at the endpoints
|
|
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
|
|
## select on- and offshore wind columns to be repeated 8 times at the endpoints
|
|
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
|
|
## rest of the columns
|
|
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
|
|
|
|
## construct training matrix
|
|
|
|
M_train=np.array([])
|
|
for col in cols_others:
|
|
#print(col)
|
|
try:
|
|
M_train=np.concatenate([M_train,col_to_M(df_clust[col])], axis=1)
|
|
except: # initialisation for first entry (nothing to concat)
|
|
M_train=col_to_M(df_clust[col])
|
|
|
|
for col in cols_repeat3:
|
|
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1)
|
|
|
|
for col in cols_repeat12:
|
|
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1)
|
|
|
|
|
|
|
|
|
|
# K-MEANS CLUSTERING
|
|
|
|
K=30 # number of clusters
|
|
|
|
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
|
|
kmeans_centres=kmeans.cluster_centers_
|
|
kmeans_daylabels=kmeans.labels_
|
|
|
|
|
|
#######
|
|
# RANDOM DRAWS
|
|
#######
|
|
|
|
# load data based on which samples are drawn
|
|
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
|
|
df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel()
|
|
|
|
|
|
# main sampling loop
|
|
draws=dict()
|
|
valid_draws=dict()
|
|
|
|
daytypes=np.sort(df_draw['daylabel'].unique())
|
|
|
|
for col in [col for col in df_draw.columns if 'label' not in col]:
|
|
|
|
|
|
col_draws=dict()
|
|
col_valid_draws=dict()
|
|
|
|
for dtype in daytypes:
|
|
|
|
df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
|
|
M_sample=col_to_M(df_sample)
|
|
|
|
# VRE columns
|
|
|
|
if '_VRE_' in col:
|
|
|
|
# PV - return original samples mod the outliers
|
|
if '_rtpv' in col or '_upv' in col:
|
|
|
|
# exclude outliers
|
|
outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)
|
|
outliers = outlier_detection.fit_predict(M_sample)
|
|
M_draws=M_sample[np.where(outliers!=-1)]
|
|
|
|
# wind - Gaussian process
|
|
else:
|
|
|
|
M_draws=GP_samples(df_sample)
|
|
|
|
else:
|
|
# demand columns
|
|
|
|
## cooling columns
|
|
if 'cooling' in col:
|
|
M_mean=M_sample.mean(axis=1)
|
|
# if more than half of the entries are identically 0 in a cluster, return 0
|
|
if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
|
|
M_draws=np.repeat(0,24)
|
|
|
|
else:
|
|
|
|
# find outliers
|
|
outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)
|
|
outliers = outlier_detection.fit_predict(M_sample)
|
|
M_valid=M_sample[np.where(outliers!=-1)]
|
|
|
|
# return all samples if all valid samples start at 0
|
|
if all(M_valid[:,0]==0):
|
|
M_draws=M_valid
|
|
|
|
else:
|
|
|
|
try:
|
|
M_draws=mcmc_samples(M_valid)
|
|
except:
|
|
M_draws=M_valid
|
|
|
|
else:
|
|
# if all days in the cluster are sufficiently similar, take the mean
|
|
if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
|
|
M_draws=M_sample.mean(axis=0)
|
|
|
|
# if fewer than 5 unique rows, take all rows
|
|
elif np.unique(M_sample, axis=0).shape[0]<=5:
|
|
M_draws=M_sample
|
|
|
|
# return all samples if all samples start at 0
|
|
elif all(M_sample[:,0]==0):
|
|
M_draws=M_sample
|
|
|
|
# else try mcmc sampling
|
|
else:
|
|
try:
|
|
M_draws=mcmc_samples(M_sample)
|
|
except:
|
|
M_draws=M_sample
|
|
|
|
|
|
col_draws[dtype]=M_draws
|
|
col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
|
|
|
|
|
|
draws[col]=col_draws
|
|
valid_draws[col]=col_valid_draws
|
|
|
|
print(col)
|
|
|
|
#######
|
|
# OUTPUT
|
|
#######
|
|
|
|
# create output sub directory if not already exists
|
|
if not os.path.exists('intermediate'):
|
|
os.makedirs('intermediate')
|
|
|
|
# output kmeans daylabels
|
|
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
|
|
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
# output random sampling results
|
|
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
|
|
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
# output valid draw samples
|
|
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
|
|
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
|
############
|
|
# RANDOM YEARLY TIME SERIES
|
|
############
|
|
|
|
random.seed(2468)
|
|
dict_yearly_draws=dict()
|
|
|
|
N=100 # number of random reconstructions
|
|
|
|
for col in df_draw.columns:
|
|
dict_col=dict()
|
|
if col != 'daylabel':
|
|
for dtype in daytypes:
|
|
M_valid=valid_draws[col][dtype]
|
|
if M_valid.ndim==1: # a single valid draw
|
|
dict_col[dtype]=M_valid
|
|
else:
|
|
N_valid_draws=M_valid.shape[0]
|
|
dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
|
|
dict_yearly_draws[col]=dict_col
|
|
|
|
|
|
|
|
|
|
# check if output directory exists
|
|
if not os.path.exists('intermediate/random_yearly_dfs'):
|
|
os.makedirs('intermediate/random_yearly_dfs')
|
|
|
|
|
|
|
|
|
|
# reconstruct yearly dataframes column by column
|
|
|
|
for col in df_draw.columns:
|
|
_df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column
|
|
if col != 'daylabel':
|
|
for n in range(N):
|
|
v_col=np.array([]) # create new vector for each of the N random time series
|
|
for dtype in kmeans_daylabels:
|
|
M_col_dtype=dict_yearly_draws[col][dtype]
|
|
if M_col_dtype.shape==(24,):
|
|
v_append=M_col_dtype
|
|
else:
|
|
v_append=M_col_dtype[n]
|
|
v_col=np.append(v_col, v_append)
|
|
_df[n]=v_col
|
|
# output to individual file for each column in the original dataframe
|
|
_df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
|
|
print(col)
|
|
|
|
|
|
|
|
|
|
|
|
|