european_data/scripts/sampling.py

import pandas as pd
import numpy as np

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

from scipy.stats import norm
from scripts.misc import *


def GP_samples(s, n_std=3, max_noise=0.5): # s can be pandas series or ordered 1-d np array

    n_samples=int(len(s)/24)
    x=np.repeat([np.arange(24)], n_samples, axis=0).ravel()

    # convert input to 1-d np array
    if isinstance(s, pd.Series):
        y=s.values
    elif isinstance(s, np.ndarray):
        y=s
    else:
        raise ValueError("Input vector must be a pandas series or numpy array.")

    # set noise parameter
    mean_hourly_std=col_to_M(s).std(axis=0).mean()
    noise=max(n_std*mean_hourly_std, max_noise)

    # GP kernel
    kernel=1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=2) #nu=3/4 for less smooth knernel

    #define GP
    gp = GaussianProcessRegressor(kernel=kernel, alpha=noise, random_state=2468)

    gp.fit(x.reshape(len(x),1), y)
    x_ = np.arange(24)

    #mean function
    y_mean, y_std = gp.predict(x_.reshape(24,1), return_std=True)

    # draw 100 posterior samples
    n_y_samples=100
    y_samples = gp.sample_y(x_.reshape(24,1), n_y_samples)


    return (y_samples.T)


def mcmc_samples(sM, n_draws=100, start='mean'): # input can be pandas series or ordered 1-d array
                                                 # or maxtrix with the 24 hours in the columns

    np.random.seed(seed=2468) # fix seed for reproducibility

    if isinstance(sM, pd.Series):
        M=col_to_M(sM)
    elif isinstance(sM, np.ndarray):
        if sM.ndim ==1:
            M=col_to_M(sM)
        elif sM.ndim ==2 and sM.shape[1]==24:
            M=sM
        else:
            raise ValueError("Input array must have 24-hour columns")

    else:
        raise ValueError("Input vector must be a pandas series or numpy array.")

    draws=np.array([])

    fails=0

    while draws.shape[0]<n_draws:


        sample=np.array([])

        #start from t=0
        t=0

        while t<24:

            if t==0:
                data0=M[:,0]
                mean0, std0=norm.fit(data0)
                #print(mean0, std0)

                while len(sample)==0:
                    if start == 'mean':
                        cand0 = norm(mean0, 0.01).rvs()
                    else:
                        cand0 = norm(mean0, std0).rvs()
                    if max(0, data0.min()*0.8) <=cand0<= min(1, data0.max()*1.2):
                        sample=np.array([cand0])
                        t+=1
                        prior=norm.pdf(cand0, mean0, std0)

            else:
            # compute mu and Sigma for conditional normal

                data_t=M[:,t]
                data_p=M[:,t-1]   # _p for "previous" time step

                Min_t=data_t.min()
                Max_t=data_t.max()

                mu_t=data_t.mean()
                mu_p=data_p.mean()

                Cov=np.cov(data_p, data_t)

                mu_= mu_t + Cov[1,0]*(sample[t-1]-mu_p)/Cov[0,0]
                Sigma_=Cov[1,1] - Cov[0,1]*Cov[0,1]/Cov[0,0]


                # draw a candidate, abort chain after 20 failed attempts
                # return to 0th hour and draw a new initial point
                attempts=1
                while attempts<=20:

                    cand=norm(mu_, np.sqrt(Sigma_)).rvs()
                    prior_cand=norm.pdf(cand ,mu_t, np.sqrt(Cov[1,1]))
                    alpha=min([1, prior_cand/prior])
                    attempts+=1

                    if (np.random.uniform(0,1) < alpha) & (cand>=Min_t*0.8) & (cand<=Max_t*1.2):
                        sample=np.append(sample , cand)
                        prior=prior_cand
                        t+=1

                        break
                else:
                    sample=np.array([])
                    t=0 # rest chain

                    # abort mcmc sampling after 100 failed attempts
                    fails+=1
                    if fails>100:
                        raise TimeoutError
                    break

        if sample.size==24 and draws.size ==0:
            draws=np.array([sample])
        elif sample.size==24 and draws.size >0:
            draws=np.vstack([draws, sample])
        else:
            pass


    return draws