934 KiB
934 KiB
Example: k-means reduced zonal dataset
In [1]:
import pandas as pd import numpy as np import matplotlib.pyplot as plt # custom scripts import scripts.read as read import scripts.prep as prep import scripts.kmeans as kmeans
In [2]:
# Read the Belgian (BE) zonal input data.
df_BE = read.zonal_data('zonal_data/INPUT_DATA_ZONAL_BE.xlsx')

# Max-normalise ND with a factor of 4 (instead of the default) so that it
# carries more weight in the subsequent clustering.
df_BE['ND'] = prep.max_normalise(df_BE['ND'], 4)

# Prefix every column with 'BE' to distinguish between countries once the
# per-country dataframes are merged.
df_BE = prep.add_country_name(df_BE, 'BE')

df_BE.describe()
INTERCONN is identically zero.
Out[2]:
BE_ND | BE_WIND_ONSHORE | BE_WIND_OFFSHORE | BE_SOLAR | BE_HYDRO | BE_BIOMASS | BE_NUCLEAR | BE_OTHER_GEN | |
---|---|---|---|---|---|---|---|---|
count | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 |
mean | 2.975173 | 0.171542 | 0.408345 | 0.102262 | 0.081661 | 0.462982 | 0.478261 | 0.498569 |
std | 0.378897 | 0.160393 | 0.330119 | 0.151439 | 0.067165 | 0.117066 | 0.125715 | 0.045038 |
min | 2.055689 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.060649 | 0.236078 | 0.255435 |
25% | 2.721740 | 0.043827 | 0.099719 | 0.000000 | 0.017391 | 0.444288 | 0.412420 | 0.470109 |
50% | 3.016437 | 0.116049 | 0.334270 | 0.002615 | 0.069565 | 0.469676 | 0.507256 | 0.502717 |
75% | 3.234711 | 0.257407 | 0.745787 | 0.173244 | 0.147826 | 0.554302 | 0.579312 | 0.532609 |
max | 4.000000 | 0.998148 | 0.967697 | 0.660135 | 0.208696 | 0.630465 | 0.822140 | 0.614130 |
In [3]:
# Read the Austrian (AT) zonal input data.
df_AT = read.zonal_data('zonal_data/INPUT_DATA_ZONAL_AT.xlsx')

# Max-normalise AT's ND with a factor of 4 so that it carries more weight
# in the subsequent clustering.
df_AT['ND'] = prep.max_normalise(df_AT['ND'], 4)

# INTERCONN keeps the default normalisation factor of 1.
df_AT['INTERCONN'] = prep.max_normalise(df_AT['INTERCONN'])

# Prefix every column with 'AT' so the columns stay distinguishable after merging.
df_AT = prep.add_country_name(df_AT, 'AT')

df_AT.describe()
WIND_OFFSHORE is identically zero. NUCLEAR is identically zero. OTHER_GEN is identically zero.
Out[3]:
AT_ND | AT_WIND_ONSHORE | AT_SOLAR | AT_HYDRO | AT_BIOMASS | AT_INTERCONN | |
---|---|---|---|---|---|---|
count | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 |
mean | 2.985608 | 0.253861 | 0.159438 | 0.398663 | 0.644243 | 0.237638 |
std | 0.474555 | 0.143749 | 0.224376 | 0.133731 | 0.043179 | 0.247113 |
min | 1.868206 | 0.003713 | 0.000000 | 0.165434 | 0.521552 | -0.697967 |
25% | 2.654007 | 0.142456 | 0.000000 | 0.299161 | 0.620690 | 0.074954 |
50% | 3.034726 | 0.235946 | 0.017462 | 0.364130 | 0.655172 | 0.242514 |
75% | 3.318725 | 0.345979 | 0.281090 | 0.491588 | 0.672414 | 0.409242 |
max | 4.000000 | 0.841730 | 0.867547 | 0.791855 | 0.741379 | 1.000000 |
In [4]:
# Combine the per-country dataframes (BE first, then AT) into the single
# table that is fed to the clustering step.
dfs = [
    df_BE,
    df_AT,
]
df = prep.merge(dfs)

df.describe()
Out[4]:
BE_ND | BE_WIND_ONSHORE | BE_WIND_OFFSHORE | BE_SOLAR | BE_HYDRO | BE_BIOMASS | BE_NUCLEAR | BE_OTHER_GEN | AT_ND | AT_WIND_ONSHORE | AT_SOLAR | AT_HYDRO | AT_BIOMASS | AT_INTERCONN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 | 8760.000000 |
mean | 2.975173 | 0.171542 | 0.408345 | 0.102262 | 0.081661 | 0.462982 | 0.478261 | 0.498569 | 2.985608 | 0.253861 | 0.159438 | 0.398663 | 0.644243 | 0.237638 |
std | 0.378897 | 0.160393 | 0.330119 | 0.151439 | 0.067165 | 0.117066 | 0.125715 | 0.045038 | 0.474555 | 0.143749 | 0.224376 | 0.133731 | 0.043179 | 0.247113 |
min | 2.055689 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.060649 | 0.236078 | 0.255435 | 1.868206 | 0.003713 | 0.000000 | 0.165434 | 0.521552 | -0.697967 |
25% | 2.721740 | 0.043827 | 0.099719 | 0.000000 | 0.017391 | 0.444288 | 0.412420 | 0.470109 | 2.654007 | 0.142456 | 0.000000 | 0.299161 | 0.620690 | 0.074954 |
50% | 3.016437 | 0.116049 | 0.334270 | 0.002615 | 0.069565 | 0.469676 | 0.507256 | 0.502717 | 3.034726 | 0.235946 | 0.017462 | 0.364130 | 0.655172 | 0.242514 |
75% | 3.234711 | 0.257407 | 0.745787 | 0.173244 | 0.147826 | 0.554302 | 0.579312 | 0.532609 | 3.318725 | 0.345979 | 0.281090 | 0.491588 | 0.672414 | 0.409242 |
max | 4.000000 | 0.998148 | 0.967697 | 0.660135 | 0.208696 | 0.630465 | 0.822140 | 0.614130 | 4.000000 | 0.841730 | 0.867547 | 0.791855 | 0.741379 | 1.000000 |
In [5]:
# Fit the k-means model on the merged table; no cluster count is passed,
# so the helper's default of 20 clusters is used.
# NOTE(review): no random seed is visibly passed here - if kmeans.clustering
# initialises randomly, labels/centres may differ between runs; confirm.
labels, centres = kmeans.clustering(df)
In [6]:
# Build the daily day-type table from the cluster labels; 2018 is the
# calendar year passed to the helper (the table below is indexed by 2018 dates).
df_labels = kmeans.df_daily_label(labels, 2018)

df_labels
Out[6]:
daytype | |
---|---|
2018-01-01 | 16 |
2018-01-02 | 17 |
2018-01-03 | 19 |
2018-01-04 | 19 |
2018-01-05 | 19 |
... | ... |
2018-12-27 | 17 |
2018-12-28 | 17 |
2018-12-29 | 17 |
2018-12-30 | 17 |
2018-12-31 | 17 |
365 rows × 1 columns
In [7]:
# Rebuild the full data table from the cluster centres, using the merged
# data and the labels/centres returned by the clustering step.
df_reduced = kmeans.df_centres(df, labels, centres)

df_reduced
Out[7]:
BE_ND | BE_WIND_ONSHORE | BE_WIND_OFFSHORE | BE_SOLAR | BE_HYDRO | BE_BIOMASS | BE_NUCLEAR | BE_OTHER_GEN | AT_ND | AT_WIND_ONSHORE | AT_SOLAR | AT_HYDRO | AT_BIOMASS | AT_INTERCONN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Time | ||||||||||||||
1 | 2.838218 | 0.182099 | 0.794242 | 0.000000e+00 | 0.160870 | 0.227786 | 0.508016 | 0.551630 | 2.546124 | 0.109984 | 0.0 | 0.228238 | 0.659483 | 0.568392 |
2 | 2.616419 | 0.185494 | 0.798455 | 0.000000e+00 | 0.160870 | 0.228491 | 0.506244 | 0.555707 | 2.410350 | 0.104818 | 0.0 | 0.227246 | 0.659483 | 0.629205 |
3 | 2.469548 | 0.183951 | 0.761236 | 0.000000e+00 | 0.160870 | 0.224260 | 0.509450 | 0.551630 | 2.315056 | 0.098545 | 0.0 | 0.224202 | 0.659483 | 0.635860 |
4 | 2.350503 | 0.201852 | 0.556180 | 0.000000e+00 | 0.160870 | 0.226375 | 0.510715 | 0.552989 | 2.206895 | 0.095795 | 0.0 | 0.225893 | 0.658405 | 0.587246 |
5 | 2.281765 | 0.187963 | 0.509129 | 0.000000e+00 | 0.160870 | 0.224965 | 0.510547 | 0.554348 | 2.190770 | 0.103490 | 0.0 | 0.226073 | 0.657328 | 0.544362 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8756 | 3.163095 | 0.325679 | 0.780337 | 0.000000e+00 | 0.129565 | 0.392525 | 0.707239 | 0.538859 | 3.217167 | 0.332767 | 0.0 | 0.224811 | 0.664871 | 0.372569 |
8757 | 2.987369 | 0.320926 | 0.799157 | -3.469447e-18 | 0.126087 | 0.381946 | 0.708826 | 0.539674 | 3.037690 | 0.332606 | 0.0 | 0.213553 | 0.666164 | 0.371608 |
8758 | 2.890612 | 0.300185 | 0.788904 | 0.000000e+00 | 0.122609 | 0.368406 | 0.709467 | 0.540489 | 2.865606 | 0.331732 | 0.0 | 0.204794 | 0.667241 | 0.352717 |
8759 | 2.998012 | 0.296543 | 0.744803 | 0.000000e+00 | 0.123478 | 0.359661 | 0.709838 | 0.541033 | 2.918659 | 0.329311 | 0.0 | 0.198548 | 0.667672 | 0.390684 |
8760 | 2.983878 | 0.257654 | 0.736517 | 0.000000e+00 | 0.125217 | 0.348096 | 0.705451 | 0.537500 | 2.724385 | 0.324402 | 0.0 | 0.196496 | 0.667241 | 0.453457 |
8760 rows × 14 columns
In [ ]:
# output files (optional): uncomment the lines below to write the results to CSV
# df_labels.to_csv('daytype.csv')
# df_reduced.to_csv('reduced_data.csv')
In [8]:
# inspect output graphically
def plot_compare(col_name, data=None, reduced=None, font_size=16):
    """Overlay one column of the original data and of its k-means reconstruction.

    Parameters
    ----------
    col_name : str
        Column to plot; it must be present in both tables. Underscores are
        replaced by spaces in the figure title.
    data : mapping of column name -> series, optional
        Original data. Defaults to the notebook-level ``df``.
    reduced : mapping of column name -> series, optional
        Data reconstructed from the cluster centres. Defaults to the
        notebook-level ``df_reduced``.
    font_size : int, optional
        Font size used for the title, axis labels, tick labels and legend
        (default 16).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``col_name`` is missing from either table.
    """
    # Fall back to the notebook globals only when no table is supplied, so
    # existing calls such as plot_compare('BE_ND') keep working unchanged.
    if data is None:
        data = df
    if reduced is None:
        reduced = df_reduced

    # Validate before creating the figure so a bad name leaves no empty plot.
    for table_name, table in (('data', data), ('reduced', reduced)):
        if col_name not in table:
            raise KeyError(f"column {col_name!r} not found in {table_name} table")

    fig, ax = plt.subplots(figsize=(20, 8))
    ax.plot(data[col_name], label='data')
    ax.plot(reduced[col_name], label='kmeans centre')
    ax.set_title(col_name.replace('_', ' '), fontsize=font_size)
    ax.set_xlabel('time step', fontsize=font_size)  # x-axis label
    ax.set_ylabel('factor', fontsize=font_size)  # y-axis label
    ax.tick_params(axis='both', labelsize=font_size)  # tick label font size
    ax.legend(prop={'size': font_size})
    plt.show()
In [9]:
# Compare the original BE_ND series with its k-means reconstruction.
plot_compare('BE_ND')
In [10]:
# Compare the original AT_ND series with its k-means reconstruction.
plot_compare('AT_ND')
In [11]:
# Compare the original BE_WIND_OFFSHORE series with its k-means reconstruction.
plot_compare('BE_WIND_OFFSHORE')
In [12]:
# Compare the original AT_WIND_ONSHORE series with its k-means reconstruction.
plot_compare('AT_WIND_ONSHORE')
In [ ]: