european_data/scripts/sampling.py

167 lines
4.4 KiB
Python

import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from scipy.stats import norm
from scripts.misc import *
def GP_samples(s, n_std=3, max_noise=0.5): # s can be pandas series or ordered 1-d np array
n_samples=int(len(s)/24)
x=np.repeat([np.arange(24)], n_samples, axis=0).ravel()
# convert input to 1-d np array
if isinstance(s, pd.Series):
y=s.values
elif isinstance(s, np.ndarray):
y=s
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
# set noise parameter
mean_hourly_std=col_to_M(s).std(axis=0).mean()
noise=max(n_std*mean_hourly_std, max_noise)
# GP kernel
kernel=1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2) #nu=3/4 for less smooth knernel
#define GP
gp = GaussianProcessRegressor(kernel=kernel, alpha=noise, random_state=2468)
gp.fit(x.reshape(len(x),1), y)
x_ = np.arange(24)
#mean function
y_mean, y_std = gp.predict(x_.reshape(24,1), return_std=True)
# draw 100 posterior samples
n_y_samples=100
y_samples = gp.sample_y(x_.reshape(24,1), n_y_samples)
return (y_samples.T)
def mcmc_samples(sM, n_draws=100, start='mean'): # input can be pandas series or ordered 1-d array
# or maxtrix with the 24 hours in the columns
np.random.seed(seed=2468) # fix seed for reproducibility
if isinstance(sM, pd.Series):
M=col_to_M(sM)
elif isinstance(sM, np.ndarray):
if sM.ndim ==1:
M=col_to_M(sM)
elif sM.ndim ==2 and sM.shape[1]==24:
M=sM
else:
raise ValueError("Input array must have 24-hour columns")
else:
raise ValueError("Input vector must be a pandas series or numpy array.")
draws=np.array([])
fails=0
while draws.shape[0]<n_draws:
sample=np.array([])
#start from t=0
t=0
while t<24:
if t==0:
data0=M[:,0]
mean0, std0=norm.fit(data0)
#print(mean0, std0)
while len(sample)==0:
if start == 'mean':
cand0 = norm(mean0, 0.01).rvs()
else:
cand0 = norm(mean0, std0).rvs()
if max(0, data0.min()*0.8) <=cand0<= min(1, data0.max()*1.2):
sample=np.array([cand0])
t+=1
prior=norm.pdf(cand0, mean0, std0)
else:
# compute mu and Sigma for conditional normal
data_t=M[:,t]
data_p=M[:,t-1] # _p for "previous" time step
Min_t=data_t.min()
Max_t=data_t.max()
mu_t=data_t.mean()
mu_p=data_p.mean()
Cov=np.cov(data_p, data_t)
mu_= mu_t + Cov[1,0]*(sample[t-1]-mu_p)/Cov[0,0]
Sigma_=Cov[1,1] - Cov[0,1]*Cov[0,1]/Cov[0,0]
# draw a candidate, abort chain after 20 failed attempts
# return to 0th hour and draw a new initial point
attempts=1
while attempts<=20:
cand=norm(mu_, np.sqrt(Sigma_)).rvs()
prior_cand=norm.pdf(cand ,mu_t, np.sqrt(Cov[1,1]))
alpha=min([1, prior_cand/prior])
attempts+=1
if (np.random.uniform(0,1) < alpha) & (cand>=Min_t*0.8) & (cand<=Max_t*1.2):
sample=np.append(sample , cand)
prior=prior_cand
t+=1
break
else:
sample=np.array([])
t=0 # rest chain
# abort mcmc sampling after 100 failed attempts
fails+=1
if fails>100:
raise TimeoutError
break
if sample.size==24 and draws.size ==0:
draws=np.array([sample])
elif sample.size==24 and draws.size >0:
draws=np.vstack([draws, sample])
else:
pass
return draws