# european_data/main.py

import pandas as pd
import numpy as np
import os
import pickle
import random
import scripts.read as read
from scripts.sampling import *
from scripts.misc import *
from sklearn.cluster import KMeans, DBSCAN
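# NOTE (added comment): the helpers used below -- col_to_M, GP_samples,
# mcmc_samples and get_valid_draws -- come from the star imports of
# scripts.sampling and scripts.misc. col_to_M is assumed to reshape an hourly
# series into a (days x 24) matrix; see those modules for the exact semantics
# of its `repeat` argument.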
#######
# CLUSTERING
#######
# load clustering data
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
# TRAINING MATRIX
## select heating columns to be repeated 3 times at the endpoints
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
## construct training matrix: one block of day profiles per column, concatenated side by side
blocks=[col_to_M(df_clust[col]) for col in cols_others]
blocks+=[col_to_M(df_clust[col], repeat=3) for col in cols_repeat3]
blocks+=[col_to_M(df_clust[col], repeat=12) for col in cols_repeat12]
M_train=np.concatenate(blocks, axis=1)
# K-MEANS CLUSTERING
K=30 # number of clusters
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres=kmeans.cluster_centers_
kmeans_daylabels=kmeans.labels_
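# Optional sanity check (added, not part of the original pipeline): sklearn's
# KMeans labels are integers in 0..K-1, so bincount gives the days per cluster.
print('days per cluster:', np.bincount(kmeans_daylabels))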
#######
# RANDOM DRAWS
#######
# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
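# Added sanity check (not in the original script): the hourly index must cover
# exactly 24 rows per clustered day for the per-hour label assignment below.
assert len(df_draw) == 24*len(kmeans_daylabels)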
df_draw['daylabel']=np.repeat(kmeans_daylabels, 24)
# main sampling loop
draws=dict()
valid_draws=dict()
daytypes=np.sort(df_draw['daylabel'].unique())
for col in [col for col in df_draw.columns if 'label' not in col]:
    col_draws=dict()
    col_valid_draws=dict()
    for dtype in daytypes:
        df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
        M_sample=col_to_M(df_sample)
        # VRE columns
        if '_VRE_' in col:
            # PV - return the original samples minus the outliers
            if '_rtpv' in col or '_upv' in col:
                # exclude outliers (DBSCAN labels noise points as -1)
                outlier_detection = DBSCAN(eps=0.75, min_samples=3)
                outliers = outlier_detection.fit_predict(M_sample)
                M_draws=M_sample[outliers!=-1]
            # wind - Gaussian process
            else:
                M_draws=GP_samples(df_sample)
        else:
            # demand columns
            ## cooling columns
            if 'cooling' in col:
                M_mean=M_sample.mean(axis=1)
                # if at least half of the day-means in the cluster are identically 0, return a zero profile
                if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
                    M_draws=np.repeat(0,24)
                else:
                    # find outliers
                    outlier_detection = DBSCAN(eps=0.5, min_samples=3)
                    outliers = outlier_detection.fit_predict(M_sample)
                    M_valid=M_sample[outliers!=-1]
                    # return all valid samples if they all start at 0
                    if all(M_valid[:,0]==0):
                        M_draws=M_valid
                    else:
                        try:
                            M_draws=mcmc_samples(M_valid)
                        except Exception: # fall back to the valid samples if MCMC sampling fails
                            M_draws=M_valid
            ## other demand columns
            else:
                # if all days in the cluster are sufficiently similar, take the mean
                if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
                    M_draws=M_sample.mean(axis=0)
                # if there are 5 or fewer unique rows, take all rows
                elif np.unique(M_sample, axis=0).shape[0]<=5:
                    M_draws=M_sample
                # return all samples if they all start at 0
                elif all(M_sample[:,0]==0):
                    M_draws=M_sample
                # otherwise try MCMC sampling
                else:
                    try:
                        M_draws=mcmc_samples(M_sample)
                    except Exception: # fall back to the original samples if MCMC sampling fails
                        M_draws=M_sample
        col_draws[dtype]=M_draws
        col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
    draws[col]=col_draws
    valid_draws[col]=col_valid_draws
    print(col) # progress indicator
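# Structure of the results built above: draws[col][dtype] holds either a
# (n_draws, 24) matrix of candidate day profiles or a single length-24 profile;
# valid_draws[col][dtype] mirrors this after filtering via get_valid_draws.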
#######
# OUTPUT
#######
# create the output sub-directory if it does not already exist
if not os.path.exists('intermediate'):
    os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
    pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
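# Example for downstream scripts (a sketch, assuming the same K is used there):
# with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'rb') as handle:
#     draws = pickle.load(handle)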
############
# RANDOM YEARLY TIME SERIES
############
random.seed(2468)
dict_yearly_draws=dict()
N=100 # number of random reconstructions
for col in df_draw.columns:
    if col == 'daylabel':
        continue
    dict_col=dict()
    for dtype in daytypes:
        M_valid=valid_draws[col][dtype]
        if M_valid.ndim==1: # a single valid draw
            dict_col[dtype]=M_valid
        else:
            N_valid_draws=M_valid.shape[0]
            # draw N day profiles (with replacement) from the valid draws
            dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
    dict_yearly_draws[col]=dict_col
# create the output directory if it does not already exist
if not os.path.exists('intermediate/random_yearly_dfs'):
    os.makedirs('intermediate/random_yearly_dfs')
# reconstruct yearly dataframes column by column
for col in df_draw.columns:
    if col == 'daylabel':
        continue
    _df=pd.DataFrame(index=df_draw.index, columns=range(N)) # new dataframe for each column
    for n in range(N):
        v_col=np.array([]) # new vector for each of the N random time series
        for dtype in kmeans_daylabels:
            M_col_dtype=dict_yearly_draws[col][dtype]
            if M_col_dtype.shape==(24,): # a single profile for this day type
                v_append=M_col_dtype
            else:
                v_append=M_col_dtype[n]
            v_col=np.append(v_col, v_append)
        _df[n]=v_col
    # output an individual file for each column of the original dataframe
    _df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
    print(col) # progress indicator
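# Example downstream read (a sketch; '<col>' stands for any column name of the
# original dataframe): each of the N columns 0..N-1 of the saved CSV is one
# random yearly reconstruction of that series.
# df_rand = pd.read_csv('intermediate/random_yearly_dfs/random_yearly_<col>.csv',
#                       index_col='UTC')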