# european_data/main.py

import pandas as pd
import numpy as np
import os
import pickle
import random
import scripts.read as read
from scripts.sampling import *
from scripts.misc import *
from sklearn.cluster import KMeans, DBSCAN
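# NOTE (added comment): the helpers used below -- col_to_M, GP_samples,
# mcmc_samples and get_valid_draws -- come from the star imports of
# scripts.sampling and scripts.misc. col_to_M is assumed to reshape an hourly
# series into a (days x 24) matrix; see those modules for the exact semantics
# of its `repeat` argument.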
#######
# CLUSTERING
#######
# load clustering data
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
# TRAINING MATRIX
## select heating columns to be repeated 3 times at the endpoints
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
## construct training matrix: one block of day profiles per column, concatenated side by side
blocks=[col_to_M(df_clust[col]) for col in cols_others]
blocks+=[col_to_M(df_clust[col], repeat=3) for col in cols_repeat3]
blocks+=[col_to_M(df_clust[col], repeat=12) for col in cols_repeat12]
M_train=np.concatenate(blocks, axis=1)
# K-MEANS CLUSTERING
K=30 # number of clusters
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres=kmeans.cluster_centers_
kmeans_daylabels=kmeans.labels_
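# Optional sanity check (added, not part of the original pipeline): sklearn's
# KMeans labels are integers in 0..K-1, so bincount gives the days per cluster.
print('days per cluster:', np.bincount(kmeans_daylabels))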
#######
# RANDOM DRAWS
#######
# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
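# Added sanity check (not in the original script): the hourly index must cover
# exactly 24 rows per clustered day for the per-hour label assignment below.
assert len(df_draw) == 24*len(kmeans_daylabels)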
df_draw['daylabel']=np.repeat(kmeans_daylabels, 24)
# main sampling loop
draws=dict()
valid_draws=dict()
daytypes=np.sort(df_draw['daylabel'].unique())
for col in [col for col in df_draw.columns if 'label' not in col]:
    col_draws=dict()
    col_valid_draws=dict()
    for dtype in daytypes:
        df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
        M_sample=col_to_M(df_sample)
        # VRE columns
        if '_VRE_' in col:
            # PV - return the original samples minus the outliers
            if '_rtpv' in col or '_upv' in col:
                # exclude outliers (DBSCAN labels noise points as -1)
                outlier_detection = DBSCAN(eps=0.75, min_samples=3)
                outliers = outlier_detection.fit_predict(M_sample)
                M_draws=M_sample[outliers!=-1]
            # wind - Gaussian process
            else:
                M_draws=GP_samples(df_sample)
        else:
            # demand columns
            ## cooling columns
            if 'cooling' in col:
                M_mean=M_sample.mean(axis=1)
                # if at least half of the day-means in the cluster are identically 0, return a zero profile
                if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
                    M_draws=np.repeat(0,24)
                else:
                    # find outliers
                    outlier_detection = DBSCAN(eps=0.5, min_samples=3)
                    outliers = outlier_detection.fit_predict(M_sample)
                    M_valid=M_sample[outliers!=-1]
                    # return all valid samples if they all start at 0
                    if all(M_valid[:,0]==0):
                        M_draws=M_valid
                    else:
                        try:
                            M_draws=mcmc_samples(M_valid)
                        except Exception: # fall back to the valid samples if MCMC sampling fails
                            M_draws=M_valid
            ## other demand columns
            else:
                # if all days in the cluster are sufficiently similar, take the mean
                if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
                    M_draws=M_sample.mean(axis=0)
                # if there are 5 or fewer unique rows, take all rows
                elif np.unique(M_sample, axis=0).shape[0]<=5:
                    M_draws=M_sample
                # return all samples if they all start at 0
                elif all(M_sample[:,0]==0):
                    M_draws=M_sample
                # otherwise try MCMC sampling
                else:
                    try:
                        M_draws=mcmc_samples(M_sample)
                    except Exception: # fall back to the original samples if MCMC sampling fails
                        M_draws=M_sample
        col_draws[dtype]=M_draws
        col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
    draws[col]=col_draws
    valid_draws[col]=col_valid_draws
    print(col) # progress indicator
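# Structure of the results built above: draws[col][dtype] holds either a
# (n_draws, 24) matrix of candidate day profiles or a single length-24 profile;
# valid_draws[col][dtype] mirrors this after filtering via get_valid_draws.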
#######
# OUTPUT
#######
# create the output sub-directory if it does not already exist
if not os.path.exists('intermediate'):
    os.makedirs('intermediate')
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
    pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
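# Example for downstream scripts (a sketch, assuming the same K is used there):
# with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'rb') as handle:
#     draws = pickle.load(handle)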
############
# RANDOM YEARLY TIME SERIES
############
random.seed(2468)
dict_yearly_draws=dict()
N=100 # number of random reconstructions
for col in df_draw.columns:
    if col == 'daylabel':
        continue
    dict_col=dict()
    for dtype in daytypes:
        M_valid=valid_draws[col][dtype]
        if M_valid.ndim==1: # a single valid draw
            dict_col[dtype]=M_valid
        else:
            N_valid_draws=M_valid.shape[0]
            # draw N day profiles (with replacement) from the valid draws
            dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
    dict_yearly_draws[col]=dict_col
# create the output directory if it does not already exist
if not os.path.exists('intermediate/random_yearly_dfs'):
    os.makedirs('intermediate/random_yearly_dfs')
# reconstruct yearly dataframes column by column
for col in df_draw.columns:
    if col == 'daylabel':
        continue
    _df=pd.DataFrame(index=df_draw.index, columns=range(N)) # new dataframe for each column
    for n in range(N):
        v_col=np.array([]) # new vector for each of the N random time series
        for dtype in kmeans_daylabels:
            M_col_dtype=dict_yearly_draws[col][dtype]
            if M_col_dtype.shape==(24,): # a single profile for this day type
                v_append=M_col_dtype
            else:
                v_append=M_col_dtype[n]
            v_col=np.append(v_col, v_append)
        _df[n]=v_col
    # output an individual file for each column of the original dataframe
    _df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
    print(col) # progress indicator
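# Example downstream read (a sketch; '<col>' stands for any column name of the
# original dataframe): each of the N columns 0..N-1 of the saved CSV is one
# random yearly reconstruction of that series.
# df_rand = pd.read_csv('intermediate/random_yearly_dfs/random_yearly_<col>.csv',
#                       index_col='UTC')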