Source code for sustaingym.data.cogen.load_ambients

"""
Loads the ambient conditions data for the cogen environment:
~9 months worth of temperature, pressure, humidity, fuel price, energy price
"""
from __future__ import annotations

import os

import numpy as np
import pandas as pd

from sustaingym.data.utils import read_csv, read_to_bytesio, save_pickle



[docs]
# Directory holding the raw data files (CSV / XLSX) read by the loaders in this
# module; the cached ambients pickle is also read from and saved to this
# directory. The path is passed as-is to the sustaingym.data.utils helpers.
DATA_DIR = 'data/cogen/ambients_data/'




[docs]
def load_wind_data(n_mw: float) -> np.ndarray:
    """Load wind speed data and convert it to wind power output.

    Reads the wind-speed CSV from ``DATA_DIR``, maps each
    'wind speed at 100m (m/s)' reading to a capacity factor by linearly
    interpolating a turbine power curve, and scales the result by ``n_mw``.

    Args:
        n_mw: number of megawatts worth of wind capacity

    Returns:
        array with one entry per row of the CSV, equal to ``n_mw`` times the
        capacity factor at that row's wind speed
    """
    # points to interpolate for an IEC Class 2 wind turbine; entry i is the
    # capacity factor at x = i (x is the wind speed column, in m/s)
    power_curve = [0, 0, 0, 0.0052, 0.0423, 0.1031, 0.1909,
                   0.3127, 0.4731, 0.6693, 0.8554, 0.9641, 0.9942, 0.9994,
                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
    curve_x = np.arange(len(power_curve))

    wind_df = read_csv(
        os.path.join(DATA_DIR, '0_39.97_-128.77_2019_15min.csv'), header=1)
    wind_speed = wind_df['wind speed at 100m (m/s)']

    # np.interp holds the end values for speeds outside the curve's x-range
    capacity_factors = np.interp(wind_speed, curve_x, power_curve)

    # scale the capacity factors by the installed wind capacity
    return n_mw * capacity_factors




[docs]
def construct_df(renewables_magnitude: float = 0.) -> list[pd.DataFrame]:
    """
    Constructs the ambient conditions data, split into one dataframe per day.

    The combined dataframe is loaded from a cached pickle in ``DATA_DIR`` when
    one exists for the given ``renewables_magnitude``. Otherwise it is built
    from the raw electricity price, gas price and operating data files, and a
    best-effort attempt is made to cache it for future calls.

    Args:
        renewables_magnitude: wind capacity passed to ``load_wind_data``
            (which documents it as megawatts of wind capacity). The resulting
            wind output is subtracted from 'Target Net Power', floored at 0.

    Returns:
        list of dataframes, one per calendar day of 'Timestamp' in order of
        first appearance, excluding the first and last day and any day that
        does not have exactly 96 rows
    """
    renewables_magnitude = float(renewables_magnitude)
    cache_path = os.path.join(DATA_DIR, f'ambients_wind={renewables_magnitude}.pkl')

    # try to load the cached dataframe
    # NOTE: read_pickle must only ever be pointed at trusted files
    try:
        df = pd.read_pickle(read_to_bytesio(cache_path))
    except FileNotFoundError:
        # if it doesn't exist, construct it
        df = _build_ambients_df(renewables_magnitude)

        # caching is best-effort: a failure here must not fail the load
        try:
            save_pickle(df, cache_path)
        except Exception as e:
            print('Saving pkl raised the following Exception:')
            print(e)
            print('This Exception means that we cannot cache files for faster'
                  ' future loads, but it has no other effect.')

    # compute the calendar day of every row once, not once per day
    day_of_row = df['Timestamp'].dt.date
    # drop the first and last days so each day has 96 datapoints
    interior_days = day_of_row.unique()[1:-1]
    dfs = [df[day_of_row == val] for val in interior_days]
    # exclude any day that has more or fewer than 96 intervals due to daylight savings
    return [day_df for day_df in dfs if len(day_df) == 96]


def _load_energy_prices_15min() -> pd.DataFrame:
    """Loads the HB_HOUSTON electricity prices, forward-filled to 15-minute steps."""
    frames = []
    for year in (2021, 2022):
        bytesio = read_to_bytesio(os.path.join(
            DATA_DIR, f'rpt.00013060.0000000000000000.DAMLZHBSPP_{year}.xlsx'))
        # sheet_name=None reads every sheet of the workbook into a dict
        sheets = pd.read_excel(bytesio, sheet_name=None)
        frames.extend(
            sheet[sheet['Settlement Point'] == 'HB_HOUSTON']
            for sheet in sheets.values()
        )
    energy_df = pd.concat(frames).reset_index(drop=True)

    # set Hour Beginning = Hour Ending minus 1 hour
    # convert the date and hour beginning columns to a single datetime
    energy_df['Delivery Date'] = pd.to_datetime(energy_df['Delivery Date'])
    hour_beginning = energy_df['Hour Ending'].map(lambda x: int(x[:2]) - 1)
    energy_df['Hour Beginning'] = (
        energy_df['Delivery Date'] + pd.to_timedelta(hour_beginning, unit='h'))
    energy_df = energy_df.drop(columns=['Hour Ending', 'Delivery Date'])

    # remove days in energy_df with more or fewer than 24 hours (due to daylight savings)
    day = energy_df['Hour Beginning'].dt.date
    hours_per_day = day.value_counts()
    irregular_days = hours_per_day[hours_per_day != 24].index
    energy_df = energy_df[~day.isin(irregular_days)]

    # subsample every 15 minutes
    # NOTE(review): resample spans the whole time range, so the 15-minute slots
    # of the days removed above appear to be re-created and forward-filled
    # from the preceding day - confirm this is intended
    return energy_df.set_index('Hour Beginning').resample('15min').ffill()


def _load_gas_prices_15min() -> pd.DataFrame:
    """Loads the Henry Hub gas spot prices, forward-filled to 15-minute steps."""
    csv_path = os.path.join(DATA_DIR, 'Henry_Hub_Natural_Gas_Spot_Price.csv')
    gas_df = read_csv(csv_path, sep=',', header=4)

    # add missing days, and fill in NaNs with the previous day's price
    gas_df['Day'] = pd.to_datetime(gas_df['Day'])
    gas_df = gas_df.set_index('Day')
    all_days = pd.date_range(
        start=gas_df.index.min(), end=gas_df.index.max(), freq='D')
    gas_df = gas_df.reindex(all_days).ffill()

    # subsample gas_df every 15 minutes
    return gas_df.resample('15min').ffill()


def _build_ambients_df(renewables_magnitude: float) -> pd.DataFrame:
    """Builds the combined ambient conditions dataframe from the raw data files."""
    energy_df_15min = _load_energy_prices_15min()
    gas_df_15min = _load_gas_prices_15min()

    bytesio = read_to_bytesio(os.path.join(DATA_DIR, 'operating_data.xlsx'))
    df = pd.read_excel(bytesio, header=3)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_excel
    df = df[[
        'Timestamp',               # datetime64[ns]
        'Target Net Power',        # float64 [MW]
        'Target Process Steam',    # float64 [klb/h]
        'Ambient Temperature',     # float64 [F]
        'Ambient Pressure',        # float64 [psia]
        'Ambient rel. Humidity'    # float64 [% in range [0, 100]]
    ]].copy()
    df['Ambient rel. Humidity'] /= 100  # convert to a fraction

    # create column for float time of day (as a fraction of overall day)
    df['time'] = (df['Timestamp'].dt.hour * 60 + df['Timestamp'].dt.minute) / (24 * 60)

    # look up both prices at every timestamp with a single label-based
    # selection; a timestamp missing from a price table raises KeyError.
    # to_numpy() assigns by position instead of aligning on the price index
    timestamps = pd.DatetimeIndex(df['Timestamp'])

    # add column "Settlement Point Price" from energy_df_15min
    df['Energy Price'] = energy_df_15min.loc[
        timestamps, 'Settlement Point Price'].to_numpy()

    # add column "Henry Hub Natural Gas Spot Price Dollars per Million Btu" from gas_df_15min
    df['Gas Price'] = gas_df_15min.loc[
        timestamps, 'Henry Hub Natural Gas Spot Price Dollars per Million Btu'].to_numpy()

    # get the wind power data, truncated to the number of rows in df, and
    # subtract it from the power target without letting the target go negative
    wind_data = load_wind_data(renewables_magnitude)[:len(df)]
    df['Target Net Power'] = np.maximum(df['Target Net Power'] - wind_data, 0)

    return df