# european_data/main.py

import pandas as pd
import numpy as np
import os
import pickle
import random


import scripts.read as read 
from scripts.sampling import *
from scripts.misc import *

from sklearn.cluster import KMeans, DBSCAN


#######
#Clustering
#######


# load clustering data
df_clust = read.load_csv('input/data_clustering.csv', index_col='UTC')


# TRAINING MATRIX
#
# Each column of the clustering data is converted to a matrix by col_to_M and
# the resulting matrices are joined side by side (axis=1), in the order:
# other columns, heating columns, wind columns.

## select heating columns to be repeated 3 times at the endpoints
cols_repeat3 = [col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12 = [col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others = [col for col in df_clust.columns
               if col not in cols_repeat3 and col not in cols_repeat12]

## construct training matrix
# The blocks are collected first and concatenated once. The previous
# try/bare-except initialisation silently replaced the whole matrix with a
# single column whenever a later concatenation failed; a shape mismatch now
# raises instead of discarding data.
train_blocks = [col_to_M(df_clust[col]) for col in cols_others]
train_blocks += [col_to_M(df_clust[col], repeat=3) for col in cols_repeat3]
train_blocks += [col_to_M(df_clust[col], repeat=12) for col in cols_repeat12]

M_train = np.concatenate(train_blocks, axis=1)


# K-MEANS CLUSTERING

K = 30  # number of clusters

# fixed random_state so that repeated runs use the same seed
kmeans = KMeans(n_clusters=K, random_state=2468)
kmeans.fit(M_train)

# cluster centres and the cluster label assigned to each row of M_train
kmeans_centres, kmeans_daylabels = kmeans.cluster_centers_, kmeans.labels_


#######
# RANDOM DRAWS
#######

# load the data from which the samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')

# expand the labels to one entry per row of df_draw:
# each k-means label is repeated 24 times in a row
df_draw['daylabel'] = np.repeat(kmeans_daylabels, 24)


# main sampling loop
#
# For every data column and every day type (k-means label) the rows of that
# day type are turned into a matrix with col_to_M and a set of draws is
# produced by a rule that depends on the column name. The results are stored in
#   draws[col][dtype]        - the draws themselves
#   valid_draws[col][dtype]  - the output of get_valid_draws(M_sample, M_draws)


def _dbscan_inliers(M, eps):
    """Return the rows of M that DBSCAN (min_samples=3) does not label as noise (-1)."""
    labels = DBSCAN(eps=eps, min_samples=3).fit_predict(M)
    return M[labels != -1]


def _mcmc_or_fallback(M, col, dtype):
    """Return mcmc_samples(M); if sampling fails, report it and return M unchanged."""
    try:
        return mcmc_samples(M)
    # best effort, but do not swallow KeyboardInterrupt / SystemExit
    except Exception as err:
        print('mcmc sampling failed for %s, day type %s (%s); using the original samples'
              % (col, dtype, err))
        return M


draws = dict()
valid_draws = dict()

daytypes = np.sort(df_draw['daylabel'].unique())

for col in [col for col in df_draw.columns if 'label' not in col]:

    col_draws = dict()
    col_valid_draws = dict()

    for dtype in daytypes:

        df_sample = df_draw.loc[df_draw['daylabel'] == dtype, col]
        M_sample = col_to_M(df_sample)

        # VRE columns
        if '_VRE_' in col:

            # PV - return original samples minus the outliers
            if '_rtpv' in col or '_upv' in col:
                # NOTE(review): if DBSCAN marks every row as noise this is an
                # empty matrix - confirm get_valid_draws copes with that.
                M_draws = _dbscan_inliers(M_sample, eps=0.75)

            # wind - Gaussian process
            else:
                M_draws = GP_samples(df_sample)

        # demand columns: cooling
        elif 'cooling' in col:
            M_mean = M_sample.mean(axis=1)

            # if at least floor(n/2) of the n row means are identically 0,
            # return 24 zeros
            # NOTE(review): for a single-row cluster floor(1/2) == 0, so this
            # branch is always taken, even for a non-zero row - confirm intended.
            if np.count_nonzero(M_mean == 0) >= np.floor(M_mean.size / 2):
                M_draws = np.repeat(0, 24)

            else:
                # drop outliers
                M_valid = _dbscan_inliers(M_sample, eps=0.5)

                # return all valid samples if all of them start at 0
                if np.all(M_valid[:, 0] == 0):
                    M_draws = M_valid
                else:
                    M_draws = _mcmc_or_fallback(M_valid, col, dtype)

        # demand columns: everything else
        else:
            # if all days in the cluster are sufficiently similar, take the mean
            if np.all((M_sample.max(axis=0) - M_sample.min(axis=0)) < 0.01):
                M_draws = M_sample.mean(axis=0)

            # if at most 5 unique rows, take all rows
            elif np.unique(M_sample, axis=0).shape[0] <= 5:
                M_draws = M_sample

            # return all samples if all samples start at 0
            elif np.all(M_sample[:, 0] == 0):
                M_draws = M_sample

            # else try mcmc sampling
            else:
                M_draws = _mcmc_or_fallback(M_sample, col, dtype)

        col_draws[dtype] = M_draws
        col_valid_draws[dtype] = get_valid_draws(M_sample, M_draws)

    draws[col] = col_draws
    valid_draws[col] = col_valid_draws

    print(col)  # progress indicator
    
#######
# OUTPUT
#######

# create output sub directory if it does not exist yet
# (exist_ok avoids the race between a separate existence check and the creation)
os.makedirs('intermediate', exist_ok=True)

# pickle the k-means day labels, the random sampling results and the valid
# draw samples, each to its own file named after the number of clusters K
_pickle_outputs = (
    ('intermediate/kmeans%d_daylabels.pickle' % K, kmeans_daylabels),
    ('intermediate/kmeans%d_draw_samples.pickle' % K, draws),
    ('intermediate/kmeans%d_valid_draw_samples.pickle' % K, valid_draws),
)
for _path, _payload in _pickle_outputs:
    with open(_path, 'wb') as handle:
        pickle.dump(_payload, handle, protocol=pickle.HIGHEST_PROTOCOL)


############
# RANDOM YEARLY TIME SERIES
############

N = 100  # number of random reconstructions

random.seed(2468)
dict_yearly_draws = dict()

# For every data column and day type pick N entries (with replacement) from
# the valid draws; a one-dimensional entry is a single draw and is kept as is.
for col in df_draw.columns:
    dict_col = dict()
    if col == 'daylabel':
        continue

    for dtype in daytypes:
        M_valid = valid_draws[col][dtype]
        if M_valid.ndim != 1:
            N_valid_draws = M_valid.shape[0]
            picked_rows = [random.randrange(N_valid_draws) for _ in range(N)]
            dict_col[dtype] = M_valid[picked_rows]
        else:
            # a single valid draw
            dict_col[dtype] = M_valid

    dict_yearly_draws[col] = dict_col
        
        
# make sure the output directory exists
# (exist_ok avoids the race between a separate existence check and the creation)
os.makedirs('intermediate/random_yearly_dfs', exist_ok=True)


# reconstruct yearly dataframes column by column
#
# For each data column, N time series are assembled by walking through the
# k-means day labels in order and taking, for reconstruction n, entry n of the
# draws stored for that day type (or the single 24-value draw if there is only
# one). Each column is written to its own CSV with one column per
# reconstruction.
for col in df_draw.columns:
    if col == 'daylabel':
        continue  # the label column itself is not reconstructed

    draws_by_type = dict_yearly_draws[col]
    yearly_series = []

    for n in range(N):
        # Collect the daily pieces and concatenate once; appending to a growing
        # array re-copies it for every day. The leading empty float array keeps
        # the result dtype that the append-based accumulation produced.
        pieces = [np.array([])]
        for dtype in kmeans_daylabels:
            M_col_dtype = draws_by_type[dtype]
            if M_col_dtype.shape == (24,):
                pieces.append(M_col_dtype)
            else:
                pieces.append(np.ravel(M_col_dtype[n]))
        yearly_series.append(np.concatenate(pieces))

    # one column per reconstruction, indexed like the original data
    _df = pd.DataFrame(np.column_stack(yearly_series),
                       index=df_draw.index, columns=range(N))

    # output to individual file for each column in the original dataframe
    _df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' % col)
    print(col)  # progress indicator
First version of the data reduction procedure. 2021-05-26 15:30:49 +01:00			`import pandas as pd`
			`import numpy as np`
			`import os`
			`import pickle`
			`import random`


			`import scripts.read as read`
			`from scripts.sampling import *`
			`from scripts.misc import *`

			`from sklearn.cluster import KMeans, DBSCAN`



			`#######`
			`#Clustering`
			`#######`



			`# load clustering data`
			`df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')`


			`# TRAINING MATRIX`

			`## select heating columns to be repeated 3 times at the endpoints`
			`cols_repeat3=[col for col in df_clust.columns if 'heating' in col]`
			`## select on- and offshore wind columns to be repeated 12 times at the endpoints`
			`cols_repeat12=[col for col in df_clust.columns if 'shore' in col]`
			`## rest of the columns`
			`cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]`

			`## construct training matrix`

			`M_train=np.array([])`
			`for col in cols_others:`
			`#print(col)`
			`try:`
			`M_train=np.concatenate([M_train,col_to_M(df_clust[col])], axis=1)`
			`except: # initialisation for first entry (nothing to concat)`
			`M_train=col_to_M(df_clust[col])`

			`for col in cols_repeat3:`
			`M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1)`

			`for col in cols_repeat12:`
			`M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1)`




			`# K-MEANS CLUSTERING`

			`K=30 # number of clusters`

			`kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)`
			`kmeans_centres=kmeans.cluster_centers_`
			`kmeans_daylabels=kmeans.labels_`


			`#######`
			`# RANDOM DRAWS`
			`#######`

			`# load data based on which samples are drawn`
			`df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')`
			`df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel()`


			`# main sampling loop`
			`draws=dict()`
			`valid_draws=dict()`

			`daytypes=np.sort(df_draw['daylabel'].unique())`

			`for col in [col for col in df_draw.columns if 'label' not in col]:`


			`col_draws=dict()`
			`col_valid_draws=dict()`

			`for dtype in daytypes:`

			`df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]`
			`M_sample=col_to_M(df_sample)`

			`# VRE columns`

			`if '_VRE_' in col:`

			`# PV - return original samples mod the outliers`
			`if '_rtpv' in col or '_upv' in col:`

			`# exclude outliers`
			`outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)`
			`outliers = outlier_detection.fit_predict(M_sample)`
			`M_draws=M_sample[np.where(outliers!=-1)]`

			`# wind - Gaussian process`
			`else:`

			`M_draws=GP_samples(df_sample)`

			`else:`
			`# demand columns`

			`## cooling columns`
			`if 'cooling' in col:`
			`M_mean=M_sample.mean(axis=1)`
			`# if more than half of the entries are identically 0 in a cluster, return 0`
			`if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):`
			`M_draws=np.repeat(0,24)`

			`else:`

			`# find outliers`
			`outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)`
			`outliers = outlier_detection.fit_predict(M_sample)`
			`M_valid=M_sample[np.where(outliers!=-1)]`

			`# return all samples if all valid samples start at 0`
			`if all(M_valid[:,0]==0):`
			`M_draws=M_valid`

			`else:`

			`try:`
			`M_draws=mcmc_samples(M_valid)`
			`except:`
			`M_draws=M_valid`

			`else:`
			`# if all days in the cluster are sufficiently similar, take the mean`
			`if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):`
			`M_draws=M_sample.mean(axis=0)`

			`# if at most 5 unique rows, take all rows`
			`elif np.unique(M_sample, axis=0).shape[0]<=5:`
			`M_draws=M_sample`

			`# return all samples if all samples start at 0`
			`elif all(M_sample[:,0]==0):`
			`M_draws=M_sample`

			`# else try mcmc sampling`
			`else:`
			`try:`
			`M_draws=mcmc_samples(M_sample)`
			`except:`
			`M_draws=M_sample`


			`col_draws[dtype]=M_draws`
			`col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)`


			`draws[col]=col_draws`
			`valid_draws[col]=col_valid_draws`

			`print(col)`

added selection.py to select the best sample from the random samples according to user defined selection rules rearranged main.py 2021-06-02 11:28:19 +01:00			`#######`
			`# OUTPUT`
			`#######`

			`# create output sub directory if not already exists`
			`if not os.path.exists('intermediate'):`
			`os.makedirs('intermediate')`

			`# output kmeans daylabels`
			`with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:`
			`pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)`

			`# output random sampling results`
			`with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:`
			`pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)`

			`# output valid draw samples`
			`with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:`
			`pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)`



First version of the data reduction procedure. 2021-05-26 15:30:49 +01:00
			`############`
			`# RANDOM YEARLY TIME SERIES`
			`############`

			`random.seed(2468)`
			`dict_yearly_draws=dict()`

			`N=100 # number of random reconstructions`

			`for col in df_draw.columns:`
			`dict_col=dict()`
			`if col != 'daylabel':`
			`for dtype in daytypes:`
			`M_valid=valid_draws[col][dtype]`
			`if M_valid.ndim==1: # a single valid draw`
			`dict_col[dtype]=M_valid`
			`else:`
			`N_valid_draws=M_valid.shape[0]`
			`dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]`
			`dict_yearly_draws[col]=dict_col`




			`# check if output directory exists`
			`if not os.path.exists('intermediate/random_yearly_dfs'):`
			`os.makedirs('intermediate/random_yearly_dfs')`




			`# reconstruct yearly dataframes column by column`

			`for col in df_draw.columns:`
			`_df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column`
			`if col != 'daylabel':`
			`for n in range(N):`
			`v_col=np.array([]) # create new vector for each of the N random time series`
			`for dtype in kmeans_daylabels:`
			`M_col_dtype=dict_yearly_draws[col][dtype]`
			`if M_col_dtype.shape==(24,):`
			`v_append=M_col_dtype`
			`else:`
			`v_append=M_col_dtype[n]`
			`v_col=np.append(v_col, v_append)`
			`_df[n]=v_col`
			`# output to individual file for each column in the original dataframe`
			`_df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)`
			`print(col)`