import pandas as pd import numpy as np import os import pickle import random import scripts.read as read from scripts.sampling import * from scripts.misc import * from sklearn.cluster import KMeans, DBSCAN ####### #Clustering ####### # load clustering data df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC') # TRAINING MATRIX ## select heating columns to be repeated 3 times at the endpoints cols_repeat3=[col for col in df_clust.columns if 'heating' in col] ## select on- and offshore wind columns to be repeated 8 times at the endpoints cols_repeat12=[col for col in df_clust.columns if 'shore' in col] ## rest of the columns cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12] ## construct training matrix M_train=np.array([]) for col in cols_others: #print(col) try: M_train=np.concatenate([M_train,col_to_M(df_clust[col])], axis=1) except: # initialisation for first entry (nothing to concat) M_train=col_to_M(df_clust[col]) for col in cols_repeat3: M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1) for col in cols_repeat12: M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1) # K-MEANS CLUSTERING K=30 # number of clusters kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train) kmeans_centres=kmeans.cluster_centers_ kmeans_daylabels=kmeans.labels_ ####### # RANDOM DRAWS ####### # load data based on which samples are drawn df_draw = read.load_csv('input/data_mod.csv', index_col='UTC') df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel() # main sampling loop draws=dict() valid_draws=dict() daytypes=np.sort(df_draw['daylabel'].unique()) for col in [col for col in df_draw.columns if 'label' not in col]: col_draws=dict() col_valid_draws=dict() for dtype in daytypes: df_sample=df_draw.loc[df_draw['daylabel']==dtype, col] M_sample=col_to_M(df_sample) # VRE columns if '_VRE_' in col: # PV - return original samples mod the outliers if '_rtpv' in col or '_upv' in col: # exclude outliers outlier_detection = DBSCAN(eps = 0.75, min_samples = 3) outliers = outlier_detection.fit_predict(M_sample) M_draws=M_sample[np.where(outliers!=-1)] # wind - Gaussian process else: M_draws=GP_samples(df_sample) else: # demand columns ## cooling columns if 'cooling' in col: M_mean=M_sample.mean(axis=1) # if more than half of the entries are identically 0 in a cluster, return 0 if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2): M_draws=np.repeat(0,24) else: # find outliers outlier_detection = DBSCAN(eps = 0.5, min_samples = 3) outliers = outlier_detection.fit_predict(M_sample) M_valid=M_sample[np.where(outliers!=-1)] # return all samples if all valid samples start at 0 if all(M_valid[:,0]==0): M_draws=M_valid else: try: M_draws=mcmc_samples(M_valid) except: M_draws=M_valid else: # if all days in the cluster are sufficiently similar, take the mean if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01): M_draws=M_sample.mean(axis=0) # if fewer than 5 unique rows, take all rows elif np.unique(M_sample, axis=0).shape[0]<=5: M_draws=M_sample # return all samples if all samples start at 0 elif all(M_sample[:,0]==0): M_draws=M_sample # else try mcmc sampling else: try: M_draws=mcmc_samples(M_sample) except: M_draws=M_sample col_draws[dtype]=M_draws col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws) draws[col]=col_draws valid_draws[col]=col_valid_draws print(col) ####### # OUTPUT ####### # create output sub directory if not already exists if not os.path.exists('intermediate'): os.makedirs('intermediate') # output kmeans daylabels with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle: pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL) # output random sampling results with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle: pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL) # output valid draw samples with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle: pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL) ############ # RANDOM YEARLY TIME SERIES ############ random.seed(2468) dict_yearly_draws=dict() N=100 # number of random reconstructions for col in df_draw.columns: dict_col=dict() if col != 'daylabel': for dtype in daytypes: M_valid=valid_draws[col][dtype] if M_valid.ndim==1: # a single valid draw dict_col[dtype]=M_valid else: N_valid_draws=M_valid.shape[0] dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]] dict_yearly_draws[col]=dict_col # check if output directory exists if not os.path.exists('intermediate/random_yearly_dfs'): os.makedirs('intermediate/random_yearly_dfs') # reconstruct yearly dataframes column by column for col in df_draw.columns: _df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column if col != 'daylabel': for n in range(N): v_col=np.array([]) # create new vector for each of the N random time series for dtype in kmeans_daylabels: M_col_dtype=dict_yearly_draws[col][dtype] if M_col_dtype.shape==(24,): v_append=M_col_dtype else: v_append=M_col_dtype[n] v_col=np.append(v_col, v_append) _df[n]=v_col # output to individual file for each column in the original dataframe _df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col) print(col)