Commit e54e610a authored by Sören Henning

Move Jupyter notebooks to analysis directory

parent ca6ec352
# Theodolite Analysis
This directory contains Jupyter notebooks for analyzing and visualizing benchmark execution results. The following notebooks are provided:
* [scalability-graph.ipynb](scalability-graph.ipynb): Creates a scalability graph for a single benchmark execution.
* [scalability-graph-final.ipynb](scalability-graph-final.ipynb): Combines the scalability graphs of multiple benchmark executions (e.g., for comparing different configurations).
* [lag-trend-graph.ipynb](lag-trend-graph.ipynb): Visualizes the consumer lag over time along with the computed trend; a minimal sketch of this evaluation follows the list.
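
All notebooks build on the same evaluation: the total consumer lag of a benchmark run is recorded over time, a linear trend is fitted to the measurements after a warm-up period, and a run is considered able to handle its load if the slope of that trend stays below a threshold. The minimum number of instances with a suitable run per dimension value then yields the scalability graph. A minimal sketch of this idea, assuming a `totallag.csv` file with `timestamp` and `value` columns as read by the notebooks below:

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

warmup_sec = 60   # discard measurements from the warm-up period
threshold = 2000  # maximum tolerated slope of the lag trend

# Each row is one sample of the total consumer lag ('value') at 'timestamp'.
df = pd.read_csv('exp1002_uc3_50000_2_totallag.csv')
df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]
regress = df[df['sec_start'] >= warmup_sec]

# Fit lag = slope * time + intercept; the slope is the lag growth per time unit.
X = regress['sec_start'].values.reshape(-1, 1)
Y = regress['value'].values.reshape(-1, 1)
slope = LinearRegression().fit(X, Y).coef_[0][0]

# The run is considered suitable if the lag does not grow faster than the threshold.
print(slope, slope < threshold)
```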
## Usage
For executing benchmarks and analyzing their results, a **Python 3.7**
installation is required (e.g., in a virtual environment). The notebooks require a few
Python libraries, which can be installed via:
```sh
pip install -r requirements.txt
```
We have tested these
notebooks with [Visual Studio Code](https://code.visualstudio.com/docs/python/jupyter-support);
however, any other Jupyter environment should work as well.
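
The notebooks read CSV result files whose names encode the run parameters, for example `exp1002_uc3_50000_2_totallag.csv` (experiment id, presumably the use case, the dimension value, the number of instances, and the metric). A minimal sketch of how such a filename is split into its parameters; the scalability notebook does essentially this inline, and the helper function here is purely illustrative:

```python
# Hypothetical helper, mirroring the inline parsing in scalability-graph.ipynb.
def parse_result_filename(filename):
    # 'exp1002_uc3_50000_2_totallag.csv' -> ['exp1002', 'uc3', '50000', '2', 'totallag']
    run_params = filename[:-4].split("_")
    return {'dim_value': int(run_params[2]), 'instances': int(run_params[3])}

print(parse_result_filename('exp1002_uc3_50000_2_totallag.csv'))
# {'dim_value': 50000, 'instances': 2}
```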
%% Cell type:code id: tags:
```
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib
```
%% Cell type:code id: tags:
```
directory = '<path-to>/results'
#filename = 'exp1002_uc3_75000_1_totallag.csv'
filename = 'exp1002_uc3_50000_2_totallag.csv'

warmup_sec = 60   # measurements within the first warmup_sec seconds are discarded
threshold = 2000  # maximum tolerated slope of the lag trend
```
%% Cell type:code id: tags:
```
df = pd.read_csv(os.path.join(directory, filename))
input = df.iloc[::3].copy()  # use every third recorded sample; copy to avoid chained-assignment warnings
input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]  # seconds since the start of the run

regress = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

X = regress.iloc[:, 4].values.reshape(-1, 1)  # the 'sec_start' column; reshape(-1, 1) yields one column, rows inferred
Y = regress.iloc[:, 3].values.reshape(-1, 1)  # the 'value' column, i.e., the total consumer lag

linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)            # fit the lag trend
Y_pred = linear_regressor.predict(X)  # predicted lag values along the trend line
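# Illustration: the fitted slope is the rate at which the lag grows, which the
# scalability analysis compares against the `threshold` defined above.
trend_slope = linear_regressor.coef_[0][0]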
```
%% Cell type:code id: tags:
```
print(linear_regressor.coef_)  # the fitted slope, i.e., the lag growth rate
```
%% Cell type:code id: tags:
```
plt.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['axes.edgecolor'] = '555555'
plt.rcParams['grid.color'] = 'dddddd'
plt.rcParams['axes.spines.top'] = 'false'
plt.rcParams['axes.spines.right'] = 'false'
plt.rcParams['legend.frameon'] = 'true'
plt.rcParams['legend.framealpha'] = '1'
plt.rcParams['legend.edgecolor'] = '1'
plt.rcParams['legend.borderpad'] = '1'

plt.figure()
#plt.figure(figsize=(4, 3))
plt.plot(X, Y, c="#348ABD", label="observed")
plt.plot(X, Y_pred, c="#E24A33", label="trend")

# Optionally also plot the warm-up period:
#t_warmup = input.loc[input['sec_start'] <= warmup_sec].iloc[:, 4].values
#y_warmup = input.loc[input['sec_start'] <= warmup_sec].iloc[:, 3].values
#plt.plot(t_warmup, y_warmup)

# Show the y-axis in thousands of messages.
plt.gca().yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, pos: '%1.0fK' % (x * 1e-3)))
plt.ylabel('queued messages')
plt.xlabel('seconds since start')
plt.legend()
plt.savefig("plot.pdf", bbox_inches='tight')
```
%% Cell type:code id: tags:
```
```
%% Cell type:code id: tags:
```
```
......
jupyter==1.0.0
matplotlib==3.2.0
pandas==1.0.1
scikit-learn==0.22.2.post1
%% Cell type:code id: tags:
```
import os
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
```
directory = '<path-to>/results-inst'

# Experiments to compare: maps the label used in the plot to the experiment id.
experiments = {
    'exp1003': 'exp1003',
    'exp1025': 'exp1025',
}
```
%% Cell type:code id: tags:
```
# Read each experiment's minimum suitable instances and join them on the dimension value.
dataframes = [
    pd.read_csv(os.path.join(directory, f'{v}_min-suitable-instances.csv'))
        .set_index('dim_value')
        .rename(columns={"instances": k})
    for k, v in experiments.items()
]
df = reduce(lambda df1, df2: df1.join(df2, how='outer'), dataframes)
df
```
%% Cell type:code id: tags:
```
plt.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['axes.edgecolor'] = '555555'
plt.rcParams['grid.color'] = 'dddddd'
plt.rcParams['axes.spines.top'] = 'false'
plt.rcParams['axes.spines.right'] = 'false'
plt.rcParams['legend.frameon'] = 'true'
plt.rcParams['legend.framealpha'] = '1'
plt.rcParams['legend.edgecolor'] = '1'
plt.rcParams['legend.borderpad'] = '1'

# One line per experiment: minimum number of suitable instances over the dimension value.
ax = df.plot(kind='line', marker='o')
ax.set_ylabel('instances')
ax.set_xlabel('data sources')
ax.set_ylim(ymin=0)
```
%% Cell type:code id: tags:
```
```
......
%% Cell type:code id: tags:
```
print("hello")
```
%% Cell type:code id: tags:
```
import os
import requests
from datetime import datetime, timedelta, timezone
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
```
os.getcwd()
```
%% Cell type:code id: tags:
```
exp_id = 2012

warmup_sec = 60
warmup_partitions_sec = 120
threshold = 2000  # maximum tolerated slope of the lag trend

directory = '<path-to>/results'
directory_out = '<path-to>/results-inst'
```
%% Cell type:code id: tags:outputPrepend,outputPrepend
```
# Collect the lag trend slope of every run of this experiment.
raw_runs = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("totallag.csv")]
for filename in filenames:
    # Filenames look like 'exp<id>_<uc>_<dim_value>_<instances>_totallag.csv'.
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    #input = df.loc[df['topic'] == "input"]
    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]  # seconds since start of the run

    regress = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

    X = regress.iloc[:, 2].values.reshape(-1, 1)  # time axis
    Y = regress.iloc[:, 3].values.reshape(-1, 1)  # total consumer lag (the 'value' column)

    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)
    trend_slope = linear_regressor.coef_[0][0]  # lag growth per time unit

    raw_runs.append({'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope})

lags = pd.DataFrame(raw_runs)
```
%% Cell type:code id: tags:
```
lags.head()
```
%% Cell type:code id: tags:
```
# Determine the average number of partitions of the input topic per run.
raw_partitions = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("partitions.csv")]
for filename in filenames:
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]

    input = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period
    input = input.loc[input['topic'] == 'input']  # keep only the input topic
    mean = input['value'].mean()

    raw_partitions.append({'dim_value': int(dim_value), 'instances': int(instances), 'partitions': mean})

partitions = pd.DataFrame(raw_partitions)
#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])
```
%% Cell type:code id: tags:
```
# Observed instances per run; the aggregation below is currently disabled and
# the resulting DataFrame stays empty.
raw_obs_instances = []
filenames = [filename for filename in os.listdir(directory)
             if filename.startswith(f"exp{exp_id}") and filename.endswith("instances.csv")]
for filename in filenames:
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))
    if df.empty:
        continue

    input = df
    input['sec_start'] = input['timestamp'] - input['timestamp'].iloc[0]
    input = input.loc[input['sec_start'] >= warmup_sec]  # discard the warm-up period

    #input = input.loc[input['topic'] == 'input']
    #mean = input['value'].mean()
    #raw_obs_instances.append({'dim_value': int(dim_value), 'instances': int(instances), 'obs_instances': mean})

obs_instances = pd.DataFrame(raw_obs_instances)
obs_instances.head()
```
%% Cell type:code id: tags:
```
runs = lags
#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])#.join(obs_instances.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])
#runs["failed"] = runs.apply(lambda row: (abs(row['instances'] - row['obs_instances']) / row['instances']) > 0.1, axis=1)
#runs.loc[runs['failed']==True]
```
%% Cell type:code id: tags:
```
#threshold = 1000
# A run is considered suitable if the slope of its lag trend stays below the threshold.
runs["suitable"] = runs.apply(lambda row: row['trend_slope'] < threshold, axis=1)
runs.columns = runs.columns.str.strip()
runs.sort_values(by=["dim_value", "instances"])
```
%% Cell type:code id: tags:
```
# For each dimension value, the minimum number of instances among all suitable runs.
filtered = runs[runs.apply(lambda x: x['suitable'], axis=1)]
grouped = filtered.groupby(['dim_value'])['instances'].min()
min_suitable_instances = grouped.to_frame().reset_index()
min_suitable_instances
```
%% Cell type:code id: tags:
```
min_suitable_instances.to_csv(os.path.join(directory_out, f'exp{exp_id}_min-suitable-instances.csv'), index=False)
```
%% Cell type:code id: tags:
```
min_suitable_instances.plot(kind='line',x='dim_value',y='instances')
# min_suitable_instances.plot(kind='line',x='dim_value',y='instances', logy=True)
plt.show()
```
%% Cell type:code id: tags:
```
```
......
attrs==19.3.0
backcall==0.1.0
bleach==3.1.1
certifi==2019.11.28
chardet==3.0.4
cycler==0.10.0
decorator==4.4.2
defusedxml==0.6.0
entrypoints==0.3
idna==2.9
importlib-metadata==1.5.0
ipykernel==5.1.4
ipython==7.13.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.16.0
Jinja2==2.11.1
joblib==0.14.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.0.0
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.2.0
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.4
notebook==6.0.3
numpy==1.18.1
pandas==1.0.1
pandocfilters==1.4.2
parso==0.6.2
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.7.1
prompt-toolkit==3.0.4
ptyprocess==0.6.0
Pygments==2.6.1
pyparsing==2.4.6
pyrsistent==0.15.7
python-dateutil==2.8.1
pytz==2019.3
pyzmq==19.0.0
qtconsole==4.7.1
QtPy==1.9.0
requests==2.23.0
scikit-learn==0.22.2.post1
scipy==1.4.1
Send2Trash==1.5.0
six==1.14.0
sklearn==0.0
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
traitlets==4.3.3
urllib3==1.25.8
wcwidth==0.1.8
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==3.1.0