# demand.py

import os
from datetime import datetime, timedelta, timezone
import pandas as pd
from sklearn.linear_model import LinearRegression

def _lag_trend_slope(path, warmup_sec):
    """Return the slope of a linear fit over one run's lag measurements.

    The CSV at *path* must contain a ``timestamp`` column. Rows recorded
    less than *warmup_sec* after the first row are excluded from the fit.
    """
    lag = pd.read_csv(path)

    # Offset of every sample relative to the first recorded sample.
    lag['sec_start'] = lag['timestamp'] - lag['timestamp'].iloc[0]

    # Drop the warm-up phase so it does not distort the trend.
    steady = lag.loc[lag['sec_start'] >= warmup_sec]

    # Columns are addressed by position: column 2 is the regressor and
    # column 3 the regressand.
    # NOTE(review): presumably these are the timestamp and the lag value;
    # confirm against the schema of the *totallag.csv files.
    x = steady.iloc[:, 2].values.reshape(-1, 1)
    y = steady.iloc[:, 3].values.reshape(-1, 1)

    model = LinearRegression()
    model.fit(x, y)

    # With a single feature and a single target, coef_ has shape (1, 1).
    return model.coef_[0][0]


def demand(exp_id, directory, threshold, warmup_sec):
    """Compute the minimal suitable amount of resources per load intensity.

    Every file in *directory* named ``exp<exp_id>_..._totallag.csv`` is
    treated as one run. The third and fourth underscore-separated fields of
    the file name (without extension) are parsed as the run's load and
    resources (both integers). A run is suitable when the slope of its lag
    trend, fitted after the warm-up phase, is strictly below *threshold*.

    Args:
        exp_id: Experiment identifier used to select result files.
        directory: Directory containing the ``*totallag.csv`` files.
        threshold: Upper bound (exclusive) for the lag trend slope.
        warmup_sec: Offset from the first sample below which rows are
            ignored when fitting the trend.

    Returns:
        A DataFrame with the columns ``load`` and ``resources`` holding, for
        each load with at least one suitable run, the smallest suitable
        resources value. The frame is empty if no run matches or none is
        suitable.
    """
    # The trailing underscore keeps e.g. exp_id=1 from matching "exp10_...".
    prefix = f"exp{exp_id}_"
    suffix = "totallag.csv"

    raw_runs = []
    for filename in os.listdir(directory):
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue

        # Strip ".csv" and split the name into its parameters.
        run_params = filename[:-4].split("_")
        dim_value = run_params[2]
        instances = run_params[3]

        trend_slope = _lag_trend_slope(
            os.path.join(directory, filename), warmup_sec)

        raw_runs.append({
            'load': int(dim_value),
            'resources': int(instances),
            'trend_slope': trend_slope,
        })

    # Without any run there are no columns to evaluate; return the same
    # shape that results when no run is suitable.
    if not raw_runs:
        return pd.DataFrame(columns=['load', 'resources'])

    runs = pd.DataFrame(raw_runs)

    # Set suitable = True if SLOs are met, i.e., lag trend is below threshold
    runs['suitable'] = runs['trend_slope'] < threshold

    # sort_values returns a new frame, so the result has to be kept.
    runs = runs.sort_values(by=['load', 'resources'])

    # Filter only suitable configurations
    filtered = runs[runs['suitable']]

    # Compute demand per load intensity
    demand_per_load = filtered.groupby('load')['resources'].min().reset_index()

    return demand_per_load