european_data/main.py

242 lines
6.8 KiB
Python
Raw Permalink Normal View History

import pandas as pd
import numpy as np
import os
import pickle
import random
import scripts.read as read
from scripts.sampling import *
from scripts.misc import *
from sklearn.cluster import KMeans, DBSCAN
#######
#Clustering
#######
# load clustering data
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
# TRAINING MATRIX
## select heating columns to be repeated 3 times at the endpoints
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 8 times at the endpoints
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
## construct training matrix
M_train=np.array([])
for col in cols_others:
#print(col)
try:
M_train=np.concatenate([M_train,col_to_M(df_clust[col])], axis=1)
except: # initialisation for first entry (nothing to concat)
M_train=col_to_M(df_clust[col])
for col in cols_repeat3:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1)
for col in cols_repeat12:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1)
# K-MEANS CLUSTERING
K=30 # number of clusters
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres=kmeans.cluster_centers_
kmeans_daylabels=kmeans.labels_
#######
# RANDOM DRAWS
#######
# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel()
# main sampling loop
draws=dict()
valid_draws=dict()
daytypes=np.sort(df_draw['daylabel'].unique())
for col in [col for col in df_draw.columns if 'label' not in col]:
col_draws=dict()
col_valid_draws=dict()
for dtype in daytypes:
df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
M_sample=col_to_M(df_sample)
# VRE columns
if '_VRE_' in col:
# PV - return original samples mod the outliers
if '_rtpv' in col or '_upv' in col:
# exclude outliers
outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_draws=M_sample[np.where(outliers!=-1)]
# wind - Gaussian process
else:
M_draws=GP_samples(df_sample)
else:
# demand columns
## cooling columns
if 'cooling' in col:
M_mean=M_sample.mean(axis=1)
# if more than half of the entries are identically 0 in a cluster, return 0
if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
M_draws=np.repeat(0,24)
else:
# find outliers
outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_valid=M_sample[np.where(outliers!=-1)]
# return all samples if all valid samples start at 0
if all(M_valid[:,0]==0):
M_draws=M_valid
else:
try:
M_draws=mcmc_samples(M_valid)
except:
M_draws=M_valid
else:
# if all days in the cluster are sufficiently similar, take the mean
if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
M_draws=M_sample.mean(axis=0)
# if fewer than 5 unique rows, take all rows
elif np.unique(M_sample, axis=0).shape[0]<=5:
M_draws=M_sample
# return all samples if all samples start at 0
elif all(M_sample[:,0]==0):
M_draws=M_sample
# else try mcmc sampling
else:
try:
M_draws=mcmc_samples(M_sample)
except:
M_draws=M_sample
col_draws[dtype]=M_draws
col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
draws[col]=col_draws
valid_draws[col]=col_valid_draws
print(col)
#######
# OUTPUT
#######
# create output sub directory if not already exists
if not os.path.exists('intermediate'):
os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
############
# RANDOM YEARLY TIME SERIES
############
random.seed(2468)
dict_yearly_draws=dict()
N=100 # number of random reconstructions
for col in df_draw.columns:
dict_col=dict()
if col != 'daylabel':
for dtype in daytypes:
M_valid=valid_draws[col][dtype]
if M_valid.ndim==1: # a single valid draw
dict_col[dtype]=M_valid
else:
N_valid_draws=M_valid.shape[0]
dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
dict_yearly_draws[col]=dict_col
# check if output directory exists
if not os.path.exists('intermediate/random_yearly_dfs'):
os.makedirs('intermediate/random_yearly_dfs')
# reconstruct yearly dataframes column by column
for col in df_draw.columns:
_df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column
if col != 'daylabel':
for n in range(N):
v_col=np.array([]) # create new vector for each of the N random time series
for dtype in kmeans_daylabels:
M_col_dtype=dict_yearly_draws[col][dtype]
if M_col_dtype.shape==(24,):
v_append=M_col_dtype
else:
v_append=M_col_dtype[n]
v_col=np.append(v_col, v_append)
_df[n]=v_col
# output to individual file for each column in the original dataframe
_df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
print(col)