First version of the data reduction procedure.

This commit is contained in:
Carmen 2021-05-26 15:30:49 +01:00
parent 8070f33511
commit 7a41174593
9 changed files with 35527 additions and 0 deletions

8761
input/data_clustering.csv Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

8761
input/data_mod.csv Normal file

File diff suppressed because it is too large

8761
input/data_mod_all.csv Normal file

File diff suppressed because one or more lines are too long

3
input/readme.md Normal file

@@ -0,0 +1,3 @@
The two larger files contain all data for all countries.
The two smaller files contain only UK data, for code development and testing.
Change the file names or paths in main.py as appropriate.
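
For reference, these are the two load calls in main.py (shown later in this commit) where the input paths are set:

df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')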

237
main.py Normal file

@@ -0,0 +1,237 @@
import pandas as pd
import numpy as np
import os
import pickle
import random
import scripts.read as read
from scripts.sampling import *
from scripts.misc import *
from sklearn.cluster import KMeans, DBSCAN
#######
# CLUSTERING
#######
# load clustering data
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
# TRAINING MATRIX
## select heating columns to be repeated 3 times at the endpoints
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
## construct training matrix: one row per day, per-column hourly blocks side by side
M_train=None
for col in cols_others:
block=col_to_M(df_clust[col])
M_train=block if M_train is None else np.concatenate([M_train, block], axis=1)
for col in cols_repeat3:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1)
for col in cols_repeat12:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1)
# K-MEANS CLUSTERING
K=30 # number of clusters
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres=kmeans.cluster_centers_
kmeans_daylabels=kmeans.labels_
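# at this point each row of M_train is one day: 24 values per ordinary column, plus
# 24+2*3 per heating column and 24+2*12 per wind column, concatenated side by side;
# kmeans_daylabels assigns every day of the input year a day-type label in 0..K-1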
#######
# RANDOM DRAWS
#######
# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel() # expand the daily cluster labels to hourly resolution
# main sampling loop
draws=dict()
valid_draws=dict()
daytypes=np.sort(df_draw['daylabel'].unique())
for col in [col for col in df_draw.columns if 'label' not in col]:
col_draws=dict()
col_valid_draws=dict()
for dtype in daytypes:
df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
M_sample=col_to_M(df_sample)
# VRE columns
if '_VRE_' in col:
# PV - return the original samples excluding outliers
if '_rtpv' in col or '_upv' in col:
# exclude outliers
outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_draws=M_sample[np.where(outliers!=-1)]
# wind - Gaussian process
else:
M_draws=GP_samples(df_sample)
else:
# demand columns
## cooling columns
if 'cooling' in col:
M_mean=M_sample.mean(axis=1)
# if at least half the days in the cluster average to 0, return an all-zero profile
if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
M_draws=np.zeros(24)
else:
# find outliers
outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_valid=M_sample[np.where(outliers!=-1)]
# return all samples if all valid samples start at 0
if all(M_valid[:,0]==0):
M_draws=M_valid
else:
try:
M_draws=mcmc_samples(M_valid)
except Exception: # fall back to the valid samples if the chain fails
M_draws=M_valid
else:
# if all days in the cluster are sufficiently similar, take the mean
if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
M_draws=M_sample.mean(axis=0)
# if fewer than 5 unique rows, take all rows
elif np.unique(M_sample, axis=0).shape[0]<=5:
M_draws=M_sample
# return all samples if all samples start at 0
elif all(M_sample[:,0]==0):
M_draws=M_sample
# else try MCMC sampling
else:
try:
M_draws=mcmc_samples(M_sample)
except Exception: # fall back to the original samples if the chain fails
M_draws=M_sample
col_draws[dtype]=M_draws
col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
draws[col]=col_draws
valid_draws[col]=col_valid_draws
print(col)
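# draws[col][dtype] now holds the candidate day profiles for each column and day type;
# valid_draws[col][dtype] keeps only those passing the continuity and range checks
# implemented in get_valid_draws (scripts/misc.py)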
############
# RANDOM YEARLY TIME SERIES
############
random.seed(2468)
dict_yearly_draws=dict()
N=100 # number of random reconstructions
for col in df_draw.columns:
dict_col=dict()
if col != 'daylabel':
for dtype in daytypes:
M_valid=valid_draws[col][dtype]
if M_valid.ndim==1: # a single valid draw
dict_col[dtype]=M_valid
else:
N_valid_draws=M_valid.shape[0]
dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
dict_yearly_draws[col]=dict_col
# create the output directory if it does not already exist
os.makedirs('intermediate/random_yearly_dfs', exist_ok=True)
# reconstruct yearly dataframes column by column
for col in df_draw.columns:
_df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column
if col != 'daylabel':
for n in range(N):
v_col=np.array([]) # create new vector for each of the N random time series
for dtype in kmeans_daylabels:
M_col_dtype=dict_yearly_draws[col][dtype]
if M_col_dtype.shape==(24,):
v_append=M_col_dtype
else:
v_append=M_col_dtype[n]
v_col=np.append(v_col, v_append)
_df[n]=v_col
# output to individual file for each column in the original dataframe
_df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
print(col)
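# each file in intermediate/random_yearly_dfs/ now holds N=100 reconstructed yearly
# series (one column per reconstruction) for one column of the original dataframe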
#######
# OUTPUT
#######
# create the output directory if it does not already exist
os.makedirs('intermediate', exist_ok=True)
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
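
A minimal sketch of loading the pickled outputs back for downstream use, assuming the default K=30 set above:

import pickle

with open('intermediate/kmeans30_daylabels.pickle', 'rb') as handle:
    kmeans_daylabels = pickle.load(handle)
with open('intermediate/kmeans30_draw_samples.pickle', 'rb') as handle:
    draws = pickle.load(handle)
with open('intermediate/kmeans30_valid_draw_samples.pickle', 'rb') as handle:
    valid_draws = pickle.load(handle)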

54
scripts/misc.py Normal file

@@ -0,0 +1,54 @@
import pandas as pd
import numpy as np
# convert a 1-dimensional hourly time series into a numpy array with 24 columns and one row per day
# option to repeat the first and last hours N extra times for every day (used in clustering)
def col_to_M(v_column, repeat=None):
if isinstance(v_column, pd.Series):
N_days=int(v_column.count()/24) # to take leap year into account
series=v_column.values
elif isinstance(v_column, np.ndarray):
N_days=int(len(v_column)/24)
series=v_column
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
if repeat is None:
return series.reshape(N_days, 24)
elif isinstance(repeat,int):
# repeat first and last hours to increase their weights
M=series.reshape(N_days, 24)
return np.concatenate([np.repeat(M[:,0].reshape(N_days,1), repeat, axis=1), M, np.repeat(M[:,-1].reshape(N_days,1), repeat, axis=1)], axis=1)
else:
raise ValueError("The number of repeats must be an integer!")
# select valid draws by start and end values (for continuity)
def get_valid_draws(M_original, M_draw, epsilon=0.05):
if M_draw.ndim == 1:
M_valid = M_draw
elif M_draw.ndim == 2 and M_draw.shape[0]<=5:
M_valid = M_draw
else:
start_mean=M_original[:,0].mean()
end_mean=M_original[:,-1].mean()
M_valid=M_draw[np.where((M_draw[:,0]<start_mean+epsilon) & (M_draw[:,0]>start_mean-epsilon) &
(M_draw[:,-1]<end_mean+epsilon) & (M_draw[:,-1]>end_mean-epsilon))]
# valid draws must also have all values between 0 and 1 (1.1 to allow for scaling back down to original maximum later on)
M_valid=M_valid[np.all((M_valid>=0) & (M_valid<=1.1), axis=1)]
if M_valid.size==0:
M_valid=M_original.mean(axis=0)
return M_valid
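
A quick usage sketch of col_to_M on synthetic data; the values are illustrative, the shapes are what the function returns:

import numpy as np
from scripts.misc import col_to_M

v = np.random.rand(48)      # two synthetic days of hourly values
M = col_to_M(v)             # shape (2, 24): one row per day
M3 = col_to_M(v, repeat=3)  # shape (2, 30): first and last hour each repeated 3 extra times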

23
scripts/read.py Normal file

@@ -0,0 +1,23 @@
import pandas as pd
# load a csv and set a datetime index; index_col='UTC' covers files where the
# timestamps sit either in a 'UTC' column or in the unnamed first column
def load_csv(path, index_col=None):
if index_col is None:
df=pd.read_csv(path, index_col=0)
elif index_col=='UTC':
df=pd.read_csv(path)
if 'UTC' in df.columns:
df['UTC']=pd.to_datetime(df['UTC'])
df=df.set_index('UTC')
else:
df=pd.read_csv(path, index_col=0)
df.index=pd.to_datetime(df.index)
df.index.name='UTC'
else:
df=pd.read_csv(path)
df=df.set_index(index_col)
return df
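
Usage matches the calls in main.py; with index_col='UTC' the timestamps are parsed to datetimes whichever form the file uses:

import scripts.read as read

df = read.load_csv('input/data_mod.csv', index_col='UTC')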

166
scripts/sampling.py Normal file

@@ -0,0 +1,166 @@
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from scipy.stats import norm
from scripts.misc import *
def GP_samples(s, n_std=3, min_noise=0.5): # s can be a pandas Series or an ordered 1-d numpy array
n_days=int(len(s)/24)
x=np.repeat([np.arange(24)], n_days, axis=0).ravel()
# convert input to 1-d np array
if isinstance(s, pd.Series):
y=s.values
elif isinstance(s, np.ndarray):
y=s
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
# set the noise parameter, with min_noise as a floor
mean_hourly_std=col_to_M(s).std(axis=0).mean()
noise=max(n_std*mean_hourly_std, min_noise)
# GP kernel
kernel=1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2) # nu=3/4 for a less smooth kernel
#define GP
gp = GaussianProcessRegressor(kernel=kernel, alpha=noise, random_state=2468)
gp.fit(x.reshape(len(x),1), y)
x_ = np.arange(24)
# mean function (computed for inspection; not used below)
y_mean, y_std = gp.predict(x_.reshape(24,1), return_std=True)
# draw 100 posterior samples
n_y_samples=100
y_samples = gp.sample_y(x_.reshape(24,1), n_y_samples)
return y_samples.T # shape (n_y_samples, 24)
def mcmc_samples(sM, n_draws=100, start='mean'): # input can be a pandas Series, an ordered 1-d array,
# or a matrix with the 24 hours in the columns
np.random.seed(seed=2468) # fix seed for reproducibility
if isinstance(sM, pd.Series):
M=col_to_M(sM)
elif isinstance(sM, np.ndarray):
if sM.ndim ==1:
M=col_to_M(sM)
elif sM.ndim ==2 and sM.shape[1]==24:
M=sM
else:
raise ValueError("Input array must have 24-hour columns")
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
draws=np.array([])
fails=0
while draws.shape[0]<n_draws:
sample=np.array([])
#start from t=0
t=0
while t<24:
if t==0:
data0=M[:,0]
mean0, std0=norm.fit(data0)
#print(mean0, std0)
while len(sample)==0:
if start == 'mean':
cand0 = norm(mean0, 0.01).rvs()
else:
cand0 = norm(mean0, std0).rvs()
if max(0, data0.min()*0.8) <=cand0<= min(1, data0.max()*1.2):
sample=np.array([cand0])
t+=1
prior=norm.pdf(cand0, mean0, std0)
else:
# compute mu and Sigma for conditional normal
data_t=M[:,t]
data_p=M[:,t-1] # _p for "previous" time step
Min_t=data_t.min()
Max_t=data_t.max()
mu_t=data_t.mean()
mu_p=data_p.mean()
Cov=np.cov(data_p, data_t)
mu_= mu_t + Cov[1,0]*(sample[t-1]-mu_p)/Cov[0,0]
Sigma_=Cov[1,1] - Cov[0,1]*Cov[0,1]/Cov[0,0]
# draw a candidate, abort chain after 20 failed attempts
# return to 0th hour and draw a new initial point
attempts=1
while attempts<=20:
cand=norm(mu_, np.sqrt(Sigma_)).rvs()
prior_cand=norm.pdf(cand ,mu_t, np.sqrt(Cov[1,1]))
alpha=min([1, prior_cand/prior])
attempts+=1
if (np.random.uniform(0,1) < alpha) and (Min_t*0.8 <= cand <= Max_t*1.2):
sample=np.append(sample , cand)
prior=prior_cand
t+=1
break
else:
sample=np.array([])
t=0 # reset chain
# abort mcmc sampling after 100 failed chains
fails+=1
if fails>100:
raise TimeoutError("mcmc sampling aborted after 100 failed chains")
break
if sample.size==24 and draws.size ==0:
draws=np.array([sample])
elif sample.size==24 and draws.size >0:
draws=np.vstack([draws, sample])
else:
pass
return draws
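
A minimal sketch exercising both samplers on synthetic data; the values are illustrative, and mcmc_samples may raise TimeoutError on pathological inputs:

import numpy as np
from scripts.sampling import GP_samples, mcmc_samples

rng = np.random.default_rng(0)
s = np.clip(0.5 + 0.1 * rng.standard_normal(10 * 24), 0, 1)  # ten synthetic days in [0, 1]
gp_draws = GP_samples(s)    # shape (100, 24): GP posterior samples of the daily profile
mc_draws = mcmc_samples(s)  # shape (100, 24) on success: Metropolis draws built hour by hour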