import os
from datetime import datetime, timedelta, timezone

import pandas as pd
from pandas.core.frame import DataFrame
from sklearn.linear_model import LinearRegression


def demand(exp_id, directory, threshold, warmup_sec) -> DataFrame:
    """Compute the minimal suitable resource count for each load intensity.

    Every file in ``directory`` whose name starts with ``exp{exp_id}``,
    contains ``lag-trend`` and ends with ``.csv`` is treated as one run.
    The file name (without extension) is split on ``_``; the second token
    is parsed as the load and the third as the number of resources.

    For each run a linear regression is fitted over the rows recorded at
    least ``warmup_sec`` after the first ``timestamp`` in the file, and the
    slope of that fit is the run's lag trend. Repetitions of the same
    (load, resources) pair are reduced to their median slope. A pair is
    suitable when that median slope is strictly below ``threshold``.

    Args:
        exp_id: Experiment identifier, interpolated into the file prefix.
        directory: Directory that holds the lag-trend CSV files.
        threshold: Upper bound (exclusive) on the slope for a configuration
            to count as suitable.
        warmup_sec: Offset from the first timestamp; rows before it are
            excluded from the regression.

    Returns:
        A DataFrame with the columns ``load`` and ``resources``, one row per
        load that has at least one suitable configuration, where
        ``resources`` is the smallest suitable value. The frame is empty
        when no file matches or no configuration is suitable.

    Raises:
        ValueError: If a file has no rows left after the warm-up filter.
    """
    raw_runs = []

    # Compute the SL, i.e., the lag trend, for each tested configuration.
    prefix = f"exp{exp_id}"
    filenames = [
        filename
        for filename in os.listdir(directory)
        if filename.startswith(prefix)
        and "lag-trend" in filename
        and filename.endswith(".csv")
    ]

    for filename in filenames:
        # Strip the ".csv" suffix, then read load and resources from the name.
        run_params = filename[:-4].split("_")
        dim_value = run_params[1]
        instances = run_params[2]

        measurements = pd.read_csv(os.path.join(directory, filename))

        # Offset of every row relative to the first timestamp in the file.
        first_timestamp = measurements.iloc[0]['timestamp']
        measurements['sec_start'] = measurements['timestamp'] - first_timestamp

        # Discard the warm-up phase.
        regress = measurements.loc[measurements['sec_start'] >= warmup_sec]
        if regress.empty:
            raise ValueError(
                f"No measurements left in '{filename}' after applying "
                f"warmup_sec={warmup_sec}"
            )

        # Columns are selected by position: the second column is the
        # regressor and the third the target.
        # NOTE(review): presumably these are the timestamp and the lag
        # value - confirm against the CSV layout.
        X = regress.iloc[:, 1].values.reshape(-1, 1)
        Y = regress.iloc[:, 2].values.reshape(-1, 1)

        linear_regressor = LinearRegression()
        linear_regressor.fit(X, Y)
        trend_slope = linear_regressor.coef_[0][0]

        raw_runs.append({
            'load': int(dim_value),
            'resources': int(instances),
            'trend_slope': trend_slope,
        })

    if not raw_runs:
        # Without any run there are no columns to group by; return the same
        # empty shape that results when no configuration is suitable.
        return pd.DataFrame({
            'load': pd.Series(dtype='int64'),
            'resources': pd.Series(dtype='int64'),
        })

    runs = pd.DataFrame(raw_runs)

    # Group by load and resources to handle repetitions and take the median
    # of the repetitions; for an even number of repetitions the median is
    # the average of the two middle values.
    medians = runs.groupby(by=['load', 'resources'], as_index=False).median()

    # A configuration is suitable if its SLO is met, i.e., its median lag
    # trend is strictly below the threshold.
    medians["suitable"] = medians['trend_slope'] < threshold
    suitable = medians[medians['suitable']]

    # Compute the minimal demand per load intensity.
    demand_per_load = suitable.groupby(by=['load'], as_index=False)['resources'].min()
    return demand_per_load