From 1e8e383ffc12412ffc9418fff0bad35327e26990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B6ren=20Henning?= <soeren.henning@email.uni-kiel.de>
Date: Thu, 14 Jan 2021 15:55:37 +0100
Subject: [PATCH] Add demand metric notebook

---
 analysis/demand-metric.ipynb | 117 +++++++++++++++++++++++++++++++++++
 analysis/src/demand.py       |  59 ++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 analysis/demand-metric.ipynb
 create mode 100644 analysis/src/demand.py

diff --git a/analysis/demand-metric.ipynb b/analysis/demand-metric.ipynb
new file mode 100644
index 000000000..c86f117dd
--- /dev/null
+++ b/analysis/demand-metric.ipynb
@@ -0,0 +1,117 @@
+{
+ "cells": [
+  {
+   "source": [
+    "# Theodolite Analysis - Demand Metric\n",
+    "\n",
+    "This notebook applies Theodolite's *demand* metric to describe the scalability of a SUT based on Theodolite measurement data.\n",
+    "\n",
+    "Theodolite's *demand* metric is a function, mapping load intensities to the minimum resources (e.g., instances) required to process this load. With this notebook, the *demand* metric function is approximated by a map of tested load intensities to their minimum required resources.\n",
+    "\n",
+    "The final output when running this notebook will be a CSV file, providing this mapping. It can be used to create nice plots of a system's scalability using the `demand-metric-plot.ipynb` notebook."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "source": [
+    "In the following cell, we need to specify:\n",
+    "\n",
+    "* `exp_id`: The experiment id that is to be analyzed.\n",
+    "* `warmup_sec`: The number of seconds which are to be ignored in the beginning of each experiment.\n",
+    "* `max_lag_trend_slope`: The maximum tolerable increase in queued messages per second.\n",
+    "* `measurement_dir`: The directory where the measurement data files are to be found.\n",
+    "* `results_dir`: The directory where the computed demand CSV files are to be stored."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exp_id = 200\n",
+    "warmup_sec = 60\n",
+    "max_lag_trend_slope = 2000\n",
+    "measurement_dir = '<path-to>/results'\n",
+    "results_dir = '<path-to>/results-inst'\n"
+   ]
+  },
+  {
+   "source": [
+    "With the following call, we compute our demand mapping."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.demand import demand\n",
+    "\n",
+    "demand = demand(exp_id, measurement_dir, max_lag_trend_slope, warmup_sec)"
+   ]
+  },
+  {
+   "source": [
+    "We might already want to plot a simple visualization here:"
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "demand.plot(kind='line',x='load',y='resources')"
+   ]
+  },
+  {
+   "source": [
+    "Finally we store the results in a CSV file."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "demand.to_csv(f'{results_dir}/exp{exp_id}_demand.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.8.5-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37064bitvenvvenv6c432ee1239d4f3cb23f871068b0267d",
+   "display_name": "Python 3.7.0 64-bit ('.venv': venv)",
+   "language": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/analysis/src/demand.py b/analysis/src/demand.py
new file mode 100644
index 000000000..dfb20c05a
--- /dev/null
+++ b/analysis/src/demand.py
@@ -0,0 +1,59 @@
+import os
+from datetime import datetime, timedelta, timezone
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+
+def demand(exp_id, directory, threshold, warmup_sec):
+    raw_runs = []
+
+    # Compute SL, i.e., lag trend, for each tested configuration
+    filenames = [filename for filename in os.listdir(directory) if filename.startswith(f"exp{exp_id}") and filename.endswith("totallag.csv")]
+    for filename in filenames:
+        #print(filename)
+        run_params = filename[:-4].split("_")
+        dim_value = run_params[2]
+        instances = run_params[3]
+
+        df = pd.read_csv(os.path.join(directory, filename))
+        #input = df.loc[df['topic'] == "input"]
+        input = df
+        #print(input)
+        input['sec_start'] = input.loc[0:, 'timestamp'] - input.iloc[0]['timestamp']
+        #print(input)
+        #print(input.iloc[0, 'timestamp'])
+        regress = input.loc[input['sec_start'] >= warmup_sec] # Warm-Up
+        #regress = input
+
+        #input.plot(kind='line',x='timestamp',y='value',color='red')
+        #plt.show()
+
+        X = regress.iloc[:, 2].values.reshape(-1, 1)  # values converts it into a numpy array
+        Y = regress.iloc[:, 3].values.reshape(-1, 1)  # -1 means that calculate the dimension of rows, but have 1 column
+        linear_regressor = LinearRegression()  # create object for the class
+        linear_regressor.fit(X, Y)  # perform linear regression
+        Y_pred = linear_regressor.predict(X)  # make predictions
+
+        trend_slope = linear_regressor.coef_[0][0]
+        #print(linear_regressor.coef_)
+
+        row = {'load': int(dim_value), 'resources': int(instances), 'trend_slope': trend_slope}
+        #print(row)
+        raw_runs.append(row)
+
+    runs = pd.DataFrame(raw_runs)
+
+    # Set suitable = True if SLOs are met, i.e., lag trend is below threshold
+    runs["suitable"] =  runs.apply(lambda row: row['trend_slope'] < threshold, axis=1)
+
+    # Sort results table (unsure if required)
+    runs.columns = runs.columns.str.strip()
+    runs.sort_values(by=["load", "resources"])
+
+    # Filter only suitable configurations
+    filtered = runs[runs.apply(lambda x: x['suitable'], axis=1)]
+
+    # Compute demand per load intensity
+    grouped = filtered.groupby(['load'])['resources'].min()
+    demand_per_load = grouped.to_frame().reset_index()
+
+    return demand_per_load
-- 
GitLab