First version of the data reduction procedure.
This commit is contained in:
parent
8070f33511
commit
7a41174593
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,3 @@
|
|||
Two larger files contain all data for all countries
|
||||
Two smaller files contain only UK data for code development / testing.
|
||||
Change file names or path in main.py as appropriate
|
|
@ -0,0 +1,237 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
|
||||
|
||||
import scripts.read as read
|
||||
from scripts.sampling import *
|
||||
from scripts.misc import *
|
||||
|
||||
from sklearn.cluster import KMeans, DBSCAN
|
||||
|
||||
|
||||
|
||||
#######
# Clustering
#######

# load clustering data
df_clust = read.load_csv('input/data_clustering.csv', index_col='UTC')


# TRAINING MATRIX

## select heating columns to be repeated 3 times at the endpoints
cols_repeat3 = [col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12 = [col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others = [col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]

## construct training matrix: one row per day, columns are the concatenated
## (optionally endpoint-weighted) 24-hour profiles of every feature.
## Collecting the per-column matrices in a list first removes the original
## try/except bootstrap around the first concatenation — the bare except
## there could hide real errors (e.g. a malformed column).
blocks = [col_to_M(df_clust[col]) for col in cols_others]
blocks += [col_to_M(df_clust[col], repeat=3) for col in cols_repeat3]
blocks += [col_to_M(df_clust[col], repeat=12) for col in cols_repeat12]
M_train = np.concatenate(blocks, axis=1)


# K-MEANS CLUSTERING

K = 30  # number of clusters

# cluster the days into K day types; fixed random_state for reproducibility
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres = kmeans.cluster_centers_
kmeans_daylabels = kmeans.labels_
|
||||
|
||||
|
||||
#######
# RANDOM DRAWS
#######

# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
# expand the per-day cluster labels to hourly resolution (24 repeats per day)
df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel()


# main sampling loop
# draws[col][dtype]      -> sampled 24-hour profiles for column `col`, day type `dtype`
# valid_draws[col][dtype]-> subset of those draws passing the continuity/range filter
draws=dict()
valid_draws=dict()

daytypes=np.sort(df_draw['daylabel'].unique())

for col in [col for col in df_draw.columns if 'label' not in col]:

    col_draws=dict()
    col_valid_draws=dict()

    for dtype in daytypes:

        # all hours of this column belonging to the current day type,
        # reshaped to one row per day
        df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
        M_sample=col_to_M(df_sample)

        # VRE columns
        if '_VRE_' in col:

            # PV - return original samples mod the outliers
            if '_rtpv' in col or '_upv' in col:

                # exclude outliers (DBSCAN label -1 marks noise points)
                outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)
                outliers = outlier_detection.fit_predict(M_sample)
                M_draws=M_sample[np.where(outliers!=-1)]

            # wind - Gaussian process
            else:
                M_draws=GP_samples(df_sample)

        else:
            # demand columns

            ## cooling columns
            if 'cooling' in col:
                M_mean=M_sample.mean(axis=1)
                # if more than half of the entries are identically 0 in a cluster, return 0
                if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
                    M_draws=np.repeat(0,24)

                else:
                    # find outliers
                    outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)
                    outliers = outlier_detection.fit_predict(M_sample)
                    M_valid=M_sample[np.where(outliers!=-1)]

                    # return all samples if all valid samples start at 0
                    if all(M_valid[:,0]==0):
                        M_draws=M_valid
                    else:
                        # NOTE(review): bare except is a deliberate best-effort —
                        # mcmc_samples raises TimeoutError when chains keep failing;
                        # fall back to the valid originals in that case.
                        try:
                            M_draws=mcmc_samples(M_valid)
                        except:
                            M_draws=M_valid

            else:
                # if all days in the cluster are sufficiently similar, take the mean
                if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
                    M_draws=M_sample.mean(axis=0)

                # if fewer than 5 unique rows, take all rows
                elif np.unique(M_sample, axis=0).shape[0]<=5:
                    M_draws=M_sample

                # return all samples if all samples start at 0
                elif all(M_sample[:,0]==0):
                    M_draws=M_sample

                # else try mcmc sampling (best-effort, same fallback as above)
                else:
                    try:
                        M_draws=mcmc_samples(M_sample)
                    except:
                        M_draws=M_sample

        col_draws[dtype]=M_draws
        # keep only draws consistent with the originals' start/end values
        col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)

    draws[col]=col_draws
    valid_draws[col]=col_valid_draws

    # progress indicator
    print(col)
|
||||
|
||||
|
||||
############
# RANDOM YEARLY TIME SERIES
############

random.seed(2468)
dict_yearly_draws=dict()

N=100 # number of random reconstructions

# for every data column, pre-select N random valid draws per day type
for col in df_draw.columns:
    dict_col=dict()
    if col != 'daylabel':
        for dtype in daytypes:
            M_valid=valid_draws[col][dtype]
            if M_valid.ndim==1: # a single valid draw, reused for all N reconstructions
                dict_col[dtype]=M_valid
            else:
                # sample N row indices with replacement from the valid draws
                N_valid_draws=M_valid.shape[0]
                dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
        dict_yearly_draws[col]=dict_col


# check if output directory exists
# (makedirs creates the parent 'intermediate' directory as well)
if not os.path.exists('intermediate/random_yearly_dfs'):
    os.makedirs('intermediate/random_yearly_dfs')


# reconstruct yearly dataframes column by column
for col in df_draw.columns:
    _df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column
    if col != 'daylabel':
        for n in range(N):
            v_col=np.array([]) # create new vector for each of the N random time series
            # walk the year day by day, appending the pre-drawn 24-hour
            # profile for that day's cluster label
            for dtype in kmeans_daylabels:
                M_col_dtype=dict_yearly_draws[col][dtype]
                if M_col_dtype.shape==(24,):
                    v_append=M_col_dtype # single shared draw for this day type
                else:
                    v_append=M_col_dtype[n] # n-th pre-drawn profile
                v_col=np.append(v_col, v_append)
            _df[n]=v_col
        # output to individual file for each column in the original dataframe
        _df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
        print(col)
|
||||
|
||||
|
||||
|
||||
|
||||
#######
# OUTPUT
#######

# create output sub-directory if it does not already exist
# (exist_ok avoids the check-then-create race of an os.path.exists test)
os.makedirs('intermediate', exist_ok=True)

# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
    pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)

# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)

# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
    pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# convert 1-dimensional hourly time series into numpy array with 24 columns and corresponding number of days (rows)
|
||||
# option to repeat first and last hours N times everyday (used in clustering)
|
||||
def col_to_M(v_column, repeat=None):
    """Reshape a 1-D hourly series into a (days, 24) matrix.

    Parameters
    ----------
    v_column : pd.Series or 1-D np.ndarray
        Hourly values whose length is a multiple of 24.
    repeat : int, optional
        If given, the first and last hourly columns are each duplicated
        `repeat` extra times at the matrix edges (used to up-weight the
        endpoints during clustering), giving shape (days, 24 + 2*repeat).

    Returns
    -------
    np.ndarray with one row per day.

    Raises
    ------
    ValueError
        If the input is neither a Series nor an ndarray, or `repeat`
        is neither None nor an int.
    """
    if isinstance(v_column, pd.Series):
        # count() rather than len() — matches the original's handling of leap years
        n_days = int(v_column.count() / 24)
        values = v_column.values
    elif isinstance(v_column, np.ndarray):
        n_days = int(len(v_column) / 24)
        values = v_column
    else:
        raise ValueError("Input vector must be a pandas series or numpy array.")

    M = values.reshape(n_days, 24)

    if repeat is None:
        return M

    if isinstance(repeat, int):
        # duplicate the endpoint hours to increase their weight
        first = np.repeat(M[:, 0].reshape(n_days, 1), repeat, axis=1)
        last = np.repeat(M[:, -1].reshape(n_days, 1), repeat, axis=1)
        return np.concatenate([first, M, last], axis=1)

    raise ValueError("The number of repeats must be an integer!")
|
||||
|
||||
# select valid draws by start and end values (for continuity)
|
||||
def get_valid_draws(M_original, M_draw, epsilon=0.05):
    """Filter drawn daily profiles for continuity with the originals.

    A draw is valid when its first/last hour lies within `epsilon` of the
    mean first/last hour of `M_original` and every value is in [0, 1.1]
    (1.1 allows for later scaling back down to the original maximum).

    Parameters
    ----------
    M_original : (days, 24) array of original profiles.
    M_draw : candidate draws; 1-D (single draw) or 2-D (one draw per row).
    epsilon : tolerance on start/end continuity.

    Returns
    -------
    The valid subset of `M_draw`; a single draw or <=5 draws are accepted
    unfiltered, and if nothing survives filtering the mean original
    profile is returned instead.
    """
    # a single draw, or only a handful of them: accept as-is
    if M_draw.ndim == 1 or (M_draw.ndim == 2 and M_draw.shape[0] <= 5):
        return M_draw

    start_mean = M_original[:, 0].mean()
    end_mean = M_original[:, -1].mean()

    # strict inequality on both sides, identical to the original's
    # (x < mean+eps) & (x > mean-eps) pair
    near_start = np.abs(M_draw[:, 0] - start_mean) < epsilon
    near_end = np.abs(M_draw[:, -1] - end_mean) < epsilon
    M_valid = M_draw[np.where(near_start & near_end)]

    # all values must lie between 0 and 1.1
    M_valid = M_valid[np.all((M_valid >= 0) & (M_valid <= 1.1), axis=1)]

    # fall back to the mean original profile when nothing survives
    if M_valid.size == 0:
        M_valid = M_original.mean(axis=0)

    return M_valid
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
import pandas as pd
|
||||
|
||||
def load_csv(path, index_col=None):
    """Load a CSV into a DataFrame with a chosen index column.

    Parameters
    ----------
    path : file path passed straight to pandas.read_csv.
    index_col : None uses the first column as index; 'UTC' additionally
        parses that column (or, if absent, the first column) to datetimes
        and names the index 'UTC'; any other value sets that column as
        the index without parsing.

    Returns
    -------
    pd.DataFrame
    """
    if index_col == 'UTC':
        df = pd.read_csv(path)
        if 'UTC' in df.columns:
            df['UTC'] = pd.to_datetime(df['UTC'])
            df = df.set_index('UTC')
        else:
            # no explicit UTC column: treat the first column as the
            # timestamp index and name it 'UTC'
            df = pd.read_csv(path, index_col=0)
            df.index = pd.to_datetime(df.index)
            df.index.name = 'UTC'
        return df

    if index_col is None:
        return pd.read_csv(path, index_col=0)

    return pd.read_csv(path).set_index(index_col)
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from sklearn.gaussian_process import GaussianProcessRegressor
|
||||
from sklearn.gaussian_process.kernels import Matern
|
||||
|
||||
from scipy.stats import norm
|
||||
from scripts.misc import *
|
||||
|
||||
|
||||
def GP_samples(s, n_std=3, max_noise=0.5):
    """Draw 100 synthetic 24-hour profiles from a Gaussian process fit.

    All days in `s` are stacked onto a common hour axis (0..23) and a GP
    with a Matern kernel is fitted; 100 posterior sample curves over the
    24 hours are returned.

    Parameters
    ----------
    s : pd.Series or ordered 1-D np.ndarray of hourly values (length a
        multiple of 24).
    n_std : multiplier on the mean hourly standard deviation used for
        the observation-noise level.
    max_noise : lower bound on the noise level. NOTE(review): applied via
        max(), so this is a floor, not the cap its name suggests —
        behaviour kept as-is; confirm whether min() was intended.

    Returns
    -------
    np.ndarray of shape (100, 24), one sampled profile per row.

    Raises
    ------
    ValueError if `s` is neither a Series nor an ndarray.
    """
    n_samples = int(len(s) / 24)
    # hour-of-day feature, repeated for every day in the input
    x = np.repeat([np.arange(24)], n_samples, axis=0).ravel()

    # convert input to 1-d np array
    if isinstance(s, pd.Series):
        y = s.values
    elif isinstance(s, np.ndarray):
        y = s
    else:
        raise ValueError("Input vector must be a pandas series or numpy array.")

    # set noise parameter from the day-to-day spread at each hour
    mean_hourly_std = col_to_M(s).std(axis=0).mean()
    noise = max(n_std * mean_hourly_std, max_noise)

    # GP kernel (nu=3/4 would give a less smooth kernel)
    kernel = 1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2)

    # define and fit GP; fixed random_state for reproducibility
    gp = GaussianProcessRegressor(kernel=kernel, alpha=noise, random_state=2468)
    gp.fit(x.reshape(len(x), 1), y)

    # draw 100 posterior samples over one 24-hour day
    # (the original also computed gp.predict() here, but never used the
    # result — that dead computation has been removed)
    x_ = np.arange(24)
    n_y_samples = 100
    y_samples = gp.sample_y(x_.reshape(24, 1), n_y_samples)

    # transpose so each row is one 24-hour profile
    return y_samples.T
|
||||
|
||||
|
||||
def mcmc_samples(sM, n_draws=100, start='mean'): # input can be pandas series or ordered 1-d array
                                                 # or matrix with the 24 hours in the columns
    """Generate `n_draws` synthetic 24-hour profiles via a Metropolis-style chain.

    Hour 0 is drawn from a normal fitted to the first hour of all days;
    each later hour t is drawn from the normal of hour t conditional on
    hour t-1 (empirical mean/covariance of adjacent hours) and accepted
    with a Metropolis ratio against the marginal prior of hour t.

    Parameters
    ----------
    sM : pd.Series, 1-D np.ndarray (hourly series), or 2-D array with 24 columns.
    n_draws : number of accepted 24-hour samples to return.
    start : 'mean' draws the initial hour tightly around the fitted mean
            (std 0.01); any other value uses the fitted std.

    Returns
    -------
    np.ndarray of shape (n_draws, 24).

    Raises
    ------
    ValueError for unsupported input; TimeoutError after 100 aborted chains.
    """
    np.random.seed(seed=2468) # fix seed for reproducibility

    # normalise input to a (days, 24) matrix
    if isinstance(sM, pd.Series):
        M=col_to_M(sM)
    elif isinstance(sM, np.ndarray):
        if sM.ndim ==1:
            M=col_to_M(sM)
        elif sM.ndim ==2 and sM.shape[1]==24:
            M=sM
        else:
            raise ValueError("Input array must have 24-hour columns")
    else:
        raise ValueError("Input vector must be a pandas series or numpy array.")

    draws=np.array([])

    fails=0  # counts aborted chains across the whole run

    while draws.shape[0]<n_draws:

        sample=np.array([])

        #start from t=0
        t=0

        while t<24:

            if t==0:
                # fit a normal to the first hour across all days
                data0=M[:,0]
                mean0, std0=norm.fit(data0)

                while len(sample)==0:
                    if start == 'mean':
                        cand0 = norm(mean0, 0.01).rvs()
                    else:
                        cand0 = norm(mean0, std0).rvs()
                    # accept only candidates inside a widened empirical range, clipped to [0, 1]
                    if max(0, data0.min()*0.8) <=cand0<= min(1, data0.max()*1.2):
                        sample=np.array([cand0])
                        t+=1
                        prior=norm.pdf(cand0, mean0, std0)

            else:
                # compute mu and Sigma for the normal of hour t conditional on hour t-1

                data_t=M[:,t]
                data_p=M[:,t-1] # _p for "previous" time step

                Min_t=data_t.min()
                Max_t=data_t.max()

                mu_t=data_t.mean()
                mu_p=data_p.mean()

                Cov=np.cov(data_p, data_t)

                mu_= mu_t + Cov[1,0]*(sample[t-1]-mu_p)/Cov[0,0]
                Sigma_=Cov[1,1] - Cov[0,1]*Cov[0,1]/Cov[0,0]

                # draw a candidate, abort chain after 20 failed attempts
                # return to 0th hour and draw a new initial point
                attempts=1
                while attempts<=20:

                    cand=norm(mu_, np.sqrt(Sigma_)).rvs()
                    prior_cand=norm.pdf(cand ,mu_t, np.sqrt(Cov[1,1]))
                    alpha=min([1, prior_cand/prior])  # Metropolis acceptance ratio
                    attempts+=1

                    # accept when the Metropolis test passes and the value stays in the widened range
                    if (np.random.uniform(0,1) < alpha) & (cand>=Min_t*0.8) & (cand<=Max_t*1.2):
                        sample=np.append(sample , cand)
                        prior=prior_cand
                        t+=1
                        break
                else:
                    # NOTE(review): reconstructed as a while-else (runs only when no
                    # candidate was accepted within 20 attempts); the scraped source
                    # lost indentation — confirm against the original file.
                    sample=np.array([])
                    t=0 # reset chain

                    # abort mcmc sampling after 100 failed attempts
                    fails+=1
                    if fails>100:
                        raise TimeoutError
                        break  # NOTE(review): unreachable after raise

        # keep only complete 24-hour samples
        if sample.size==24 and draws.size ==0:
            draws=np.array([sample])
        elif sample.size==24 and draws.size >0:
            draws=np.vstack([draws, sample])
        else:
            pass

    return draws
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue