Commit e54e610a authored by Sören Henning

Move Jupyter notebooks to analysis directory

parent ca6ec352
# Theodolite Analysis
This directory contains Jupyter notebooks for analyzing and visualizing benchmark execution results. The following notebooks are provided:
* [scalability-graph.ipynb](scalability-graph.ipynb): Creates a scalability graph for a single benchmark execution.
* [scalability-graph-final.ipynb](scalability-graph-final.ipynb): Combines the scalability graphs of multiple benchmark executions (e.g., for comparing different configurations).
* [lag-trend-graph.ipynb](lag-trend-graph.ipynb): Visualizes the consumer lag over time along with the computed trend; a minimal sketch of this evaluation follows the list.
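
All notebooks build on the same evaluation: the total consumer lag of a benchmark run is recorded over time, a linear trend is fitted to the measurements after a warm-up period, and a run is considered able to handle its load if the slope of that trend stays below a threshold. The minimum number of instances with a suitable run per dimension value then yields the scalability graph. A minimal sketch of this idea, assuming a `totallag.csv` file with `timestamp` and `value` columns as read by the notebooks below:

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

warmup_sec = 60   # discard measurements from the warm-up period
threshold = 2000  # maximum tolerated slope of the lag trend

# Each row is one sample of the total consumer lag ('value') at 'timestamp'.
df = pd.read_csv('exp1002_uc3_50000_2_totallag.csv')
df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]
regress = df[df['sec_start'] >= warmup_sec]

# Fit lag = slope * time + intercept; the slope is the lag growth per time unit.
X = regress['sec_start'].values.reshape(-1, 1)
Y = regress['value'].values.reshape(-1, 1)
slope = LinearRegression().fit(X, Y).coef_[0][0]

# The run is considered suitable if the lag does not grow faster than the threshold.
print(slope, slope < threshold)
```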
## Usage
For executing benchmarks and analyzing their results, a **Python 3.7**
installation is required (e.g., in a virtual environment). The notebooks require a few
Python libraries, which can be installed via:
```sh
pip install -r requirements.txt
```
We have tested these
notebooks with [Visual Studio Code](https://code.visualstudio.com/docs/python/jupyter-support);
however, any other Jupyter environment should work as well.
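
The notebooks read CSV result files whose names encode the run parameters, for example `exp1002_uc3_50000_2_totallag.csv` (experiment id, presumably the use case, the dimension value, the number of instances, and the metric). A minimal sketch of how such a filename is split into its parameters; the scalability notebook does essentially this inline, and the helper function here is purely illustrative:

```python
# Hypothetical helper, mirroring the inline parsing in scalability-graph.ipynb.
def parse_result_filename(filename):
    # 'exp1002_uc3_50000_2_totallag.csv' -> ['exp1002', 'uc3', '50000', '2', 'totallag']
    run_params = filename[:-4].split("_")
    return {'dim_value': int(run_params[2]), 'instances': int(run_params[3])}

print(parse_result_filename('exp1002_uc3_50000_2_totallag.csv'))
# {'dim_value': 50000, 'instances': 2}
```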
%% Cell type:code id: tags:
```
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib
```
%% Cell type:code id: tags:
```
directory = '<path-to>/results'
#filename = 'exp1002_uc3_75000_1_totallag.csv'
filename = 'exp1002_uc3_50000_2_totallag.csv'

warmup_sec = 60   # measurements within the first warmup_sec seconds are discarded
threshold = 2000  # maximum tolerated slope of the lag trend
```
%% Cell type:code id: tags:
```
df = pd.read_csv(os.path.join(directory, filename))
input = df.iloc[::3].copy()  # use every third recorded sample; copy to avoid chained-assignment warnings
input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]  # seconds since the start of the run

regress = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

X = regress.iloc[:, 4].values.reshape(-1, 1)  # the 'sec_start' column; reshape(-1, 1) yields one column, rows inferred
Y = regress.iloc[:, 3].values.reshape(-1, 1)  # the 'value' column, i.e., the total consumer lag

linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)            # fit the lag trend
Y_pred = linear_regressor.predict(X)  # predicted lag values along the trend line
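# Illustration: the fitted slope is the rate at which the lag grows, which the
# scalability analysis compares against the `threshold` defined above.
trend_slope = linear_regressor.coef_[0][0]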
```
%% Cell type:code id: tags:
```
print(linear_regressor.coef_)  # the fitted slope, i.e., the lag growth rate
```
%% Cell type:code id: tags:
```
plt.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['axes.edgecolor'] = '555555'
plt.rcParams['grid.color'] = 'dddddd'
plt.rcParams['axes.spines.top'] = 'false'
plt.rcParams['axes.spines.right'] = 'false'
plt.rcParams['legend.frameon'] = 'true'
plt.rcParams['legend.framealpha'] = '1'
plt.rcParams['legend.edgecolor'] = '1'
plt.rcParams['legend.borderpad'] = '1'

plt.figure()
#plt.figure(figsize=(4, 3))
plt.plot(X, Y, c="#348ABD", label="observed")
plt.plot(X, Y_pred, c="#E24A33", label="trend")

# Optionally also plot the warm-up period:
#t_warmup = input.loc[input['sec_start'] <= warmup_sec].iloc[:, 4].values
#y_warmup = input.loc[input['sec_start'] <= warmup_sec].iloc[:, 3].values
#plt.plot(t_warmup, y_warmup)

# Show the y-axis in thousands of messages.
plt.gca().yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, pos: '%1.0fK' % (x * 1e-3)))
plt.ylabel('queued messages')
plt.xlabel('seconds since start')
plt.legend()
plt.savefig("plot.pdf", bbox_inches='tight')
```
%% Cell type:code id: tags:
```
```
%% Cell type:code id: tags:
```
```
......
jupyter==1.0.0
matplotlib==3.2.0
pandas==1.0.1
scikit-learn==0.22.2.post1
%% Cell type:code id: tags:
```
import os
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
```
directory = '<path-to>/results-inst'

# Experiments to compare: maps the label used in the plot to the experiment id.
experiments = {
    'exp1003': 'exp1003',
    'exp1025': 'exp1025',
}
```
%% Cell type:code id: tags:
```
# Read each experiment's minimum suitable instances and join them on the dimension value.
dataframes = [
    pd.read_csv(os.path.join(directory, f'{v}_min-suitable-instances.csv'))
        .set_index('dim_value')
        .rename(columns={"instances": k})
    for k, v in experiments.items()
]
df = reduce(lambda df1, df2: df1.join(df2, how='outer'), dataframes)
df
```
%% Cell type:code id: tags:
```
plt.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['axes.edgecolor'] = '555555'
plt.rcParams['grid.color'] = 'dddddd'
plt.rcParams['axes.spines.top'] = 'false'
plt.rcParams['axes.spines.right'] = 'false'
plt.rcParams['legend.frameon'] = 'true'
plt.rcParams['legend.framealpha'] = '1'
plt.rcParams['legend.edgecolor'] = '1'
plt.rcParams['legend.borderpad'] = '1'

# One line per experiment: minimum number of suitable instances over the dimension value.
ax = df.plot(kind='line', marker='o')
ax.set_ylabel('instances')
ax.set_xlabel('data sources')
ax.set_ylim(ymin=0)
```
%% Cell type:code id: tags:
```
```
......
%% Cell type:code id: tags:
```
print("hello")
```
%% Cell type:code id: tags:
```
import os
import requests
from datetime import datetime, timedelta, timezone
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
```
os.getcwd()
```
%% Cell type:code id: tags:
```
exp_id = 2012

warmup_sec = 60
warmup_partitions_sec = 120
threshold = 2000  # maximum tolerated slope of the lag trend

directory = '<path-to>/results'
directory_out = '<path-to>/results-inst'
```
%% Cell type:code id: tags:outputPrepend,outputPrepend
```
# Collect the lag trend slope of every run of this experiment.
raw_runs = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("totallag.csv")]
for filename in filenames:
    # Filenames look like 'exp<id>_<uc>_<dim_value>_<instances>_totallag.csv'.
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    #input = df.loc[df['topic'] == "input"]
    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]  # seconds since start of the run

    regress = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

    X = regress.iloc[:, 2].values.reshape(-1, 1)  # time axis
    Y = regress.iloc[:, 3].values.reshape(-1, 1)  # total consumer lag (the 'value' column)

    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    trend_slope = linear_regressor.coef_[0][0]  # lag growth per time unit

    raw_runs.append({'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope})

lags = pd.DataFrame(raw_runs)
```
%% Cell type:code id: tags:
```
lags.head()
```
%% Cell type:code id: tags:
```
# Determine the average number of partitions of the input topic per run.
raw_partitions = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("partitions.csv")]
for filename in filenames:
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]

    input = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period
    input = input.loc[input['topic'] == 'input']  # keep only the input topic
    mean = input['value'].mean()

    raw_partitions.append({'dim_value': int(dim_value), 'instances': int(instances), 'partitions': mean})

partitions = pd.DataFrame(raw_partitions)
#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])
```
%% Cell type:code id: tags:
```
# Observed instances per run; the aggregation below is currently disabled and
# the resulting DataFrame stays empty.
raw_obs_instances = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("instances.csv")]
for filename in filenames:
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    if df.empty:
        continue

    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]
    input = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

    #input = input.loc[input['topic'] == 'input']
    #mean = input['value'].mean()
    #raw_obs_instances.append({'dim_value': int(dim_value), 'instances': int(instances), 'obs_instances': mean})

obs_instances = pd.DataFrame(raw_obs_instances)
obs_instances.head()
```
%% Cell type:code id: tags:
```
runs = lags
#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])#.join(obs_instances.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])
#runs["failed"] = runs.apply(lambda row: (abs(row['instances'] - row['obs_instances']) / row['instances']) > 0.1, axis=1)
#runs.loc[runs['failed']==True]
```
%% Cell type:code id: tags:
```
#threshold = 1000
# A run is considered suitable if the slope of its lag trend stays below the threshold.
runs["suitable"] = runs.apply(lambda row: row['trend_slope'] < threshold, axis=1)
runs.columns = runs.columns.str.strip()
runs.sort_values(by=["dim_value", "instances"])
```
%% Cell type:code id: tags:
```
# For each dimension value, the minimum number of instances among all suitable runs.
filtered = runs[runs.apply(lambda x: x['suitable'], axis=1)]
grouped = filtered.groupby(['dim_value'])['instances'].min()
min_suitable_instances = grouped.to_frame().reset_index()
min_suitable_instances
```
%% Cell type:code id: tags:
```
min_suitable_instances.to_csv(os.path.join(directory_out, f'exp{exp_id}_min-suitable-instances.csv'), index=False)
```
%% Cell type:code id: tags:
```
min_suitable_instances.plot(kind='line',x='dim_value',y='instances')
# min_suitable_instances.plot(kind='line',x='dim_value',y='instances', logy=True)
plt.show()
```
%% Cell type:code id: tags:
```
```
......
attrs==19.3.0
backcall==0.1.0
bleach==3.1.1
certifi==2019.11.28
chardet==3.0.4
cycler==0.10.0
decorator==4.4.2
defusedxml==0.6.0
entrypoints==0.3
idna==2.9
importlib-metadata==1.5.0
ipykernel==5.1.4
ipython==7.13.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.16.0
Jinja2==2.11.1
joblib==0.14.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.0.0
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.2.0
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.4
notebook==6.0.3
numpy==1.18.1
pandas==1.0.1
pandocfilters==1.4.2
parso==0.6.2
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.7.1
prompt-toolkit==3.0.4
ptyprocess==0.6.0
Pygments==2.6.1
pyparsing==2.4.6
pyrsistent==0.15.7
python-dateutil==2.8.1
pytz==2019.3
pyzmq==19.0.0
qtconsole==4.7.1
QtPy==1.9.0
requests==2.23.0
scikit-learn==0.22.2.post1
scipy==1.4.1
Send2Trash==1.5.0
six==1.14.0
sklearn==0.0
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
traitlets==4.3.3
urllib3==1.25.8
wcwidth==0.1.8
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==3.1.0