First version of the data reduction procedure.

This commit is contained in:
Carmen 2021-05-26 15:30:49 +01:00
parent 8070f33511
commit 7a41174593
9 changed files with 35527 additions and 0 deletions

8761
input/data_clustering.csv Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

8761
input/data_mod.csv Normal file

File diff suppressed because it is too large

8761
input/data_mod_all.csv Normal file

File diff suppressed because one or more lines are too long

3
input/readme.md Normal file

@@ -0,0 +1,3 @@
The two larger files contain all data for all countries.
The two smaller files contain only UK data, for code development and testing.
Change the file names or paths in main.py as appropriate.
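
For reference, these are the two load calls in main.py (shown later in this commit) where the input paths are set:

df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')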

237
main.py Normal file

@@ -0,0 +1,237 @@
import pandas as pd
import numpy as np
import os
import pickle
import random
import scripts.read as read
from scripts.sampling import *
from scripts.misc import *
from sklearn.cluster import KMeans, DBSCAN
#######
# CLUSTERING
#######
# load clustering data
df_clust=read.load_csv('input/data_clustering.csv', index_col='UTC')
# TRAINING MATRIX
## select heating columns to be repeated 3 times at the endpoints
cols_repeat3=[col for col in df_clust.columns if 'heating' in col]
## select on- and offshore wind columns to be repeated 12 times at the endpoints
cols_repeat12=[col for col in df_clust.columns if 'shore' in col]
## rest of the columns
cols_others=[col for col in df_clust.columns if col not in cols_repeat3 and col not in cols_repeat12]
## construct training matrix: one row per day, per-column hourly blocks side by side
M_train=None
for col in cols_others:
block=col_to_M(df_clust[col])
M_train=block if M_train is None else np.concatenate([M_train, block], axis=1)
for col in cols_repeat3:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=3)], axis=1)
for col in cols_repeat12:
M_train=np.concatenate([M_train,col_to_M(df_clust[col], repeat=12)], axis=1)
# K-MEANS CLUSTERING
K=30 # number of clusters
kmeans = KMeans(n_clusters=K, random_state=2468).fit(M_train)
kmeans_centres=kmeans.cluster_centers_
kmeans_daylabels=kmeans.labels_
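# at this point each row of M_train is one day: 24 values per ordinary column, plus
# 24+2*3 per heating column and 24+2*12 per wind column, concatenated side by side;
# kmeans_daylabels assigns every day of the input year a day-type label in 0..K-1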
#######
# RANDOM DRAWS
#######
# load data based on which samples are drawn
df_draw = read.load_csv('input/data_mod.csv', index_col='UTC')
df_draw['daylabel']=np.array([np.repeat(i,24) for i in kmeans_daylabels]).ravel() # expand the daily cluster labels to hourly resolution
# main sampling loop
draws=dict()
valid_draws=dict()
daytypes=np.sort(df_draw['daylabel'].unique())
for col in [col for col in df_draw.columns if 'label' not in col]:
col_draws=dict()
col_valid_draws=dict()
for dtype in daytypes:
df_sample=df_draw.loc[df_draw['daylabel']==dtype, col]
M_sample=col_to_M(df_sample)
# VRE columns
if '_VRE_' in col:
# PV - return the original samples excluding outliers
if '_rtpv' in col or '_upv' in col:
# exclude outliers
outlier_detection = DBSCAN(eps = 0.75, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_draws=M_sample[np.where(outliers!=-1)]
# wind - Gaussian process
else:
M_draws=GP_samples(df_sample)
else:
# demand columns
## cooling columns
if 'cooling' in col:
M_mean=M_sample.mean(axis=1)
# if at least half the days in the cluster average to 0, return an all-zero profile
if len(np.where(M_mean==0)[0])>=np.floor(M_mean.size/2):
M_draws=np.zeros(24)
else:
# find outliers
outlier_detection = DBSCAN(eps = 0.5, min_samples = 3)
outliers = outlier_detection.fit_predict(M_sample)
M_valid=M_sample[np.where(outliers!=-1)]
# return all samples if all valid samples start at 0
if all(M_valid[:,0]==0):
M_draws=M_valid
else:
try:
M_draws=mcmc_samples(M_valid)
except Exception: # fall back to the valid samples if the chain fails
M_draws=M_valid
else:
# if all days in the cluster are sufficiently similar, take the mean
if all((M_sample.max(axis=0)-M_sample.min(axis=0))<0.01):
M_draws=M_sample.mean(axis=0)
# if fewer than 5 unique rows, take all rows
elif np.unique(M_sample, axis=0).shape[0]<=5:
M_draws=M_sample
# return all samples if all samples start at 0
elif all(M_sample[:,0]==0):
M_draws=M_sample
# else try MCMC sampling
else:
try:
M_draws=mcmc_samples(M_sample)
except Exception: # fall back to the original samples if the chain fails
M_draws=M_sample
col_draws[dtype]=M_draws
col_valid_draws[dtype]=get_valid_draws(M_sample, M_draws)
draws[col]=col_draws
valid_draws[col]=col_valid_draws
print(col)
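# draws[col][dtype] now holds the candidate day profiles for each column and day type;
# valid_draws[col][dtype] keeps only those passing the continuity and range checks
# implemented in get_valid_draws (scripts/misc.py)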
############
# RANDOM YEARLY TIME SERIES
############
random.seed(2468)
dict_yearly_draws=dict()
N=100 # number of random reconstructions
for col in df_draw.columns:
dict_col=dict()
if col != 'daylabel':
for dtype in daytypes:
M_valid=valid_draws[col][dtype]
if M_valid.ndim==1: # a single valid draw
dict_col[dtype]=M_valid
else:
N_valid_draws=M_valid.shape[0]
dict_col[dtype]=M_valid[[random.randrange(N_valid_draws) for n in range(N)]]
dict_yearly_draws[col]=dict_col
# create the output directory if it does not already exist
os.makedirs('intermediate/random_yearly_dfs', exist_ok=True)
# reconstruct yearly dataframes column by column
for col in df_draw.columns:
_df=pd.DataFrame(index=df_draw.index, columns=range(N)) # create new dataframe for each column
if col != 'daylabel':
for n in range(N):
v_col=np.array([]) # create new vector for each of the N random time series
for dtype in kmeans_daylabels:
M_col_dtype=dict_yearly_draws[col][dtype]
if M_col_dtype.shape==(24,):
v_append=M_col_dtype
else:
v_append=M_col_dtype[n]
v_col=np.append(v_col, v_append)
_df[n]=v_col
# output to individual file for each column in the original dataframe
_df.to_csv('intermediate/random_yearly_dfs/random_yearly_%s.csv' %col)
print(col)
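# each file in intermediate/random_yearly_dfs/ now holds N=100 reconstructed yearly
# series (one column per reconstruction) for one column of the original dataframe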
#######
# OUTPUT
#######
# create the output directory if it does not already exist
os.makedirs('intermediate', exist_ok=True)
# output kmeans daylabels
with open('intermediate/kmeans%d_daylabels.pickle' %K, 'wb') as handle:
pickle.dump(kmeans_daylabels, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output random sampling results
with open('intermediate/kmeans%d_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
# output valid draw samples
with open('intermediate/kmeans%d_valid_draw_samples.pickle' %K, 'wb') as handle:
pickle.dump(valid_draws, handle, protocol=pickle.HIGHEST_PROTOCOL)
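
A minimal sketch of loading the pickled outputs back for downstream use, assuming the default K=30 set above:

import pickle

with open('intermediate/kmeans30_daylabels.pickle', 'rb') as handle:
    kmeans_daylabels = pickle.load(handle)
with open('intermediate/kmeans30_draw_samples.pickle', 'rb') as handle:
    draws = pickle.load(handle)
with open('intermediate/kmeans30_valid_draw_samples.pickle', 'rb') as handle:
    valid_draws = pickle.load(handle)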

54
scripts/misc.py Normal file

@@ -0,0 +1,54 @@
import pandas as pd
import numpy as np
# convert a 1-dimensional hourly time series into a numpy array with 24 columns and one row per day
# option to repeat the first and last hours N extra times for every day (used in clustering)
def col_to_M(v_column, repeat=None):
if isinstance(v_column, pd.Series):
N_days=int(v_column.count()/24) # to take leap year into account
series=v_column.values
elif isinstance(v_column, np.ndarray):
N_days=int(len(v_column)/24)
series=v_column
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
if repeat is None:
return series.reshape(N_days, 24)
elif isinstance(repeat,int):
# repeat first and last hours to increase their weights
M=series.reshape(N_days, 24)
return np.concatenate([np.repeat(M[:,0].reshape(N_days,1), repeat, axis=1), M, np.repeat(M[:,-1].reshape(N_days,1), repeat, axis=1)], axis=1)
else:
raise ValueError("The number of repeats must be an integer!")
# select valid draws by start and end values (for continuity)
def get_valid_draws(M_original, M_draw, epsilon=0.05):
if M_draw.ndim == 1:
M_valid = M_draw
elif M_draw.ndim == 2 and M_draw.shape[0]<=5:
M_valid = M_draw
else:
start_mean=M_original[:,0].mean()
end_mean=M_original[:,-1].mean()
M_valid=M_draw[np.where((M_draw[:,0]<start_mean+epsilon) & (M_draw[:,0]>start_mean-epsilon) &
(M_draw[:,-1]<end_mean+epsilon) & (M_draw[:,-1]>end_mean-epsilon))]
# valid draws must also have all values between 0 and 1 (1.1 to allow for scaling back down to original maximum later on)
M_valid=M_valid[np.all((M_valid>=0) & (M_valid<=1.1), axis=1)]
if M_valid.size==0:
M_valid=M_original.mean(axis=0)
return M_valid
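
A quick usage sketch of col_to_M on synthetic data; the values are illustrative, the shapes are what the function returns:

import numpy as np
from scripts.misc import col_to_M

v = np.random.rand(48)      # two synthetic days of hourly values
M = col_to_M(v)             # shape (2, 24): one row per day
M3 = col_to_M(v, repeat=3)  # shape (2, 30): first and last hour each repeated 3 extra times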

23
scripts/read.py Normal file

@@ -0,0 +1,23 @@
import pandas as pd
# load a csv and set a datetime index; index_col='UTC' covers files where the
# timestamps sit either in a 'UTC' column or in the unnamed first column
def load_csv(path, index_col=None):
if index_col is None:
df=pd.read_csv(path, index_col=0)
elif index_col=='UTC':
df=pd.read_csv(path)
if 'UTC' in df.columns:
df['UTC']=pd.to_datetime(df['UTC'])
df=df.set_index('UTC')
else:
df=pd.read_csv(path, index_col=0)
df.index=pd.to_datetime(df.index)
df.index.name='UTC'
else:
df=pd.read_csv(path)
df=df.set_index(index_col)
return df
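
Usage matches the calls in main.py; with index_col='UTC' the timestamps are parsed to datetimes whichever form the file uses:

import scripts.read as read

df = read.load_csv('input/data_mod.csv', index_col='UTC')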

166
scripts/sampling.py Normal file

@@ -0,0 +1,166 @@
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from scipy.stats import norm
from scripts.misc import *
def GP_samples(s, n_std=3, min_noise=0.5): # s can be a pandas Series or an ordered 1-d numpy array
n_days=int(len(s)/24)
x=np.repeat([np.arange(24)], n_days, axis=0).ravel()
# convert input to 1-d np array
if isinstance(s, pd.Series):
y=s.values
elif isinstance(s, np.ndarray):
y=s
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
# set the noise parameter, with min_noise as a floor
mean_hourly_std=col_to_M(s).std(axis=0).mean()
noise=max(n_std*mean_hourly_std, min_noise)
# GP kernel
kernel=1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2) # nu=3/4 for a less smooth kernel
#define GP
gp = GaussianProcessRegressor(kernel=kernel, alpha=noise, random_state=2468)
gp.fit(x.reshape(len(x),1), y)
x_ = np.arange(24)
# mean function (computed for inspection; not used below)
y_mean, y_std = gp.predict(x_.reshape(24,1), return_std=True)
# draw 100 posterior samples
n_y_samples=100
y_samples = gp.sample_y(x_.reshape(24,1), n_y_samples)
return y_samples.T # shape (n_y_samples, 24)
def mcmc_samples(sM, n_draws=100, start='mean'): # input can be a pandas Series, an ordered 1-d array,
# or a matrix with the 24 hours in the columns
np.random.seed(seed=2468) # fix seed for reproducibility
if isinstance(sM, pd.Series):
M=col_to_M(sM)
elif isinstance(sM, np.ndarray):
if sM.ndim ==1:
M=col_to_M(sM)
elif sM.ndim ==2 and sM.shape[1]==24:
M=sM
else:
raise ValueError("Input array must have 24-hour columns")
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
draws=np.array([])
fails=0
while draws.shape[0]<n_draws:
sample=np.array([])
#start from t=0
t=0
while t<24:
if t==0:
data0=M[:,0]
mean0, std0=norm.fit(data0)
#print(mean0, std0)
while len(sample)==0:
if start == 'mean':
cand0 = norm(mean0, 0.01).rvs()
else:
cand0 = norm(mean0, std0).rvs()
if max(0, data0.min()*0.8) <=cand0<= min(1, data0.max()*1.2):
sample=np.array([cand0])
t+=1
prior=norm.pdf(cand0, mean0, std0)
else:
# compute mu and Sigma for conditional normal
data_t=M[:,t]
data_p=M[:,t-1] # _p for "previous" time step
Min_t=data_t.min()
Max_t=data_t.max()
mu_t=data_t.mean()
mu_p=data_p.mean()
Cov=np.cov(data_p, data_t)
mu_= mu_t + Cov[1,0]*(sample[t-1]-mu_p)/Cov[0,0]
Sigma_=Cov[1,1] - Cov[0,1]*Cov[0,1]/Cov[0,0]
# draw a candidate, abort chain after 20 failed attempts
# return to 0th hour and draw a new initial point
attempts=1
while attempts<=20:
cand=norm(mu_, np.sqrt(Sigma_)).rvs()
prior_cand=norm.pdf(cand ,mu_t, np.sqrt(Cov[1,1]))
alpha=min([1, prior_cand/prior])
attempts+=1
if (np.random.uniform(0,1) < alpha) and (Min_t*0.8 <= cand <= Max_t*1.2):
sample=np.append(sample , cand)
prior=prior_cand
t+=1
break
else:
sample=np.array([])
t=0 # reset chain
# abort mcmc sampling after 100 failed chains
fails+=1
if fails>100:
raise TimeoutError("mcmc sampling aborted after 100 failed chains")
break
if sample.size==24 and draws.size ==0:
draws=np.array([sample])
elif sample.size==24 and draws.size >0:
draws=np.vstack([draws, sample])
else:
pass
return draws
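
A minimal sketch exercising both samplers on synthetic data; the values are illustrative, and mcmc_samples may raise TimeoutError on pathological inputs:

import numpy as np
from scripts.sampling import GP_samples, mcmc_samples

rng = np.random.default_rng(0)
s = np.clip(0.5 + 0.1 * rng.standard_normal(10 * 24), 0, 1)  # ten synthetic days in [0, 1]
gp_draws = GP_samples(s)    # shape (100, 24): GP posterior samples of the daily profile
mc_draws = mcmc_samples(s)  # shape (100, 24) on success: Metropolis draws built hour by hour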