Work on scalability graphs

fb176462 · Sören Henning · 3f6da3a1 · fb176462 · fb176462
Commit fb176462 authored 5 years ago by Sören Henning
--- a/execution/scalability-graph-finish.ipynb
+++ b/execution/scalability-graph-finish.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import requests\n",
+    "from datetime import datetime, timedelta, timezone\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exp_id = 1025\n",
+    "warmup_sec = 60\n",
+    "warmup_partitions_sec = 120\n",
+    "threshold = 2000 #slope\n",
+    "directory = '../results-inst'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_csv(os.path.join(directory, 'exp1025_min-suitable-instances.csv')).set_index('dim_value').rename(columns={\"instances\": \"1025\"})\n",
+    "df2 = pd.read_csv(os.path.join(directory, 'exp1003_min-suitable-instances.csv')).set_index('dim_value').rename(columns={\"instances\": \"1003\"})\n",
+    "\n",
+    "#df1.join(df2, on='instances')\n",
+    "df = df1.join(df2, how='outer')\n",
+    "\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.style.use('ggplot')\n",
+    "plt.rcParams['axes.facecolor']='w'\n",
+    "plt.rcParams['axes.edgecolor']='555555'\n",
+    "#plt.rcParams['ytick.color']='black'\n",
+    "plt.rcParams['grid.color']='dddddd'\n",
+    "plt.rcParams['axes.spines.top']='false'\n",
+    "plt.rcParams['axes.spines.right']='false'\n",
+    "plt.rcParams['legend.frameon']='true'\n",
+    "plt.rcParams['legend.framealpha']='1'\n",
+    "plt.rcParams['legend.edgecolor']='1'\n",
+    "plt.rcParams['legend.borderpad']='1'\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "plt.figure() \n",
+    "ax = df.plot(kind='line', marker='o')\n",
+    "#ax = df.plot(kind='line',x='dim_value', legend=False, use_index=True)\n",
+    "ax.set_ylabel('instances')\n",
+    "ax.set_xlabel('data sources')\n",
+    "ax.set_ylim(ymin=0)\n",
+    "#ax.set_xlim(xmin=0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.0-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37064bitvenvvenv469ea2e0a7854dc7b367eee45386afee",
+   "display_name": "Python 3.7.0 64-bit ('.venv': venv)"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
+%% Cell type:code id: tags:
+
+``` 
+import os
+import requests
+from datetime import datetime, timedelta, timezone
+import pandas as pd
+import matplotlib.pyplot as plt
+```
+
+%% Cell type:code id: tags:
+
+``` 
+exp_id = 1025
+warmup_sec = 60
+warmup_partitions_sec = 120
+threshold = 2000 #slope
+directory = '../results-inst'
+```
+
+%% Cell type:code id: tags:
+
+``` 
+df1 = pd.read_csv(os.path.join(directory, 'exp1025_min-suitable-instances.csv')).set_index('dim_value').rename(columns={"instances": "1025"})
+df2 = pd.read_csv(os.path.join(directory, 'exp1003_min-suitable-instances.csv')).set_index('dim_value').rename(columns={"instances": "1003"})
+
+#df1.join(df2, on='instances')
+df = df1.join(df2, how='outer')
+
+df
+```
+
+%% Cell type:code id: tags:
+
+``` 
+plt.style.use('ggplot')
+plt.rcParams['axes.facecolor']='w'
+plt.rcParams['axes.edgecolor']='555555'
+#plt.rcParams['ytick.color']='black'
+plt.rcParams['grid.color']='dddddd'
+plt.rcParams['axes.spines.top']='false'
+plt.rcParams['axes.spines.right']='false'
+plt.rcParams['legend.frameon']='true'
+plt.rcParams['legend.framealpha']='1'
+plt.rcParams['legend.edgecolor']='1'
+plt.rcParams['legend.borderpad']='1'
+
+
+
+
+
+plt.figure()
+ax = df.plot(kind='line', marker='o')
+#ax = df.plot(kind='line',x='dim_value', legend=False, use_index=True)
+ax.set_ylabel('instances')
+ax.set_xlabel('data sources')
+ax.set_ylim(ymin=0)
+#ax.set_xlim(xmin=0)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+```
--- a/execution/scalability-graph.ipynb
+++ b/execution/scalability-graph.ipynb
@@ -38,11 +38,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "exp_id = 1009\n",
+    "exp_id = 1003\n",
    "warmup_sec = 60\n",
    "warmup_partitions_sec = 120\n",
    "threshold = 2000 #slope\n",
-    "directory = './results-final'\n"
+    "directory = '../results'\n"
   ]
  },
  {
@@ -238,6 +238,15 @@
    "min_suitable_instances"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_suitable_instances.to_csv(f'../results-inst/exp{exp_id}_min-suitable-instances.csv', index=False)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,

 %% Cell type:code id: tags:

 ``` 
 print("hello")
 ```

 %% Cell type:code id: tags:

 ``` 
 import os
 import requests
 from datetime import datetime, timedelta, timezone
 import pandas as pd
 from sklearn.linear_model import LinearRegression
 import matplotlib.pyplot as plt
 ```

 %% Cell type:code id: tags:

 ``` 
 os.getcwd()
 ```

 %% Cell type:code id: tags:

 ``` 
-exp_id = 1009
+exp_id = 1003
 warmup_sec = 60
 warmup_partitions_sec = 120
 threshold = 2000 #slope
-directory = './results-final'
+directory = '../results'
 ```

 %% Cell type:code id: tags:outputPrepend,outputPrepend

 ``` 
 # For every "totallag" CSV of experiment `exp_id`, fit a linear regression
 # over the measurements taken after the warm-up and record its slope.
 # Result: `lags`, one row per run with dim_value, instances and trend_slope.
 raw_runs = []

 filenames = [filename for filename in os.listdir(directory) if filename.startswith(f"exp{exp_id}") and filename.endswith("totallag.csv")]
 for filename in filenames:
    # File name without its 4-character extension, split on "_":
    # field 2 is the dim_value, field 3 the instance count.
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))

    # A file without rows has no first timestamp to measure from; skip it.
    if df.empty:
        continue

    # Offset of every row's timestamp from the first row's timestamp.
    df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]
    regress = df.loc[df['sec_start'] >= warmup_sec]  # drop the warm-up phase

    # Columns are selected by position (3rd -> X, 4th -> Y).
    # NOTE(review): presumably timestamp and lag value - confirm against the CSV layout.
    X = regress.iloc[:, 2].values.reshape(-1, 1)  # column vector, one row per sample
    Y = regress.iloc[:, 3].values.reshape(-1, 1)
    linear_regressor = LinearRegression()
    linear_regressor.fit(X, Y)

    # coef_ is 2-D because Y was passed as a column vector.
    trend_slope = linear_regressor.coef_[0][0]

    row = {'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope}
    raw_runs.append(row)

 lags = pd.DataFrame(raw_runs)
 ```

 %% Cell type:code id: tags:

 ``` 
 lags.head()
 ```

 %% Cell type:code id: tags:

 ``` 

 # For every "partitions" CSV of experiment `exp_id`, average the `value`
 # column of the "input" topic after the warm-up phase.
 # Result: `partitions`, one row per run with dim_value, instances and partitions.
 raw_partitions = []

 filenames = [filename for filename in os.listdir(directory) if filename.startswith(f"exp{exp_id}") and filename.endswith("partitions.csv")]
 for filename in filenames:
    # File name without its 4-character extension, split on "_":
    # field 2 is the dim_value, field 3 the instance count.
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))

    # A file without rows has no first timestamp to measure from; skip it.
    if df.empty:
        continue

    # Offset of every row's timestamp from the first row's timestamp.
    df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]

    # NOTE(review): warmup_partitions_sec is defined in the parameter cell but
    # was never used; this cell looks like its intended consumer - confirm.
    steady = df.loc[df['sec_start'] >= warmup_partitions_sec]

    # Exact match on the topic name: a lexicographic ">=" would also let
    # through every topic whose name sorts after "input".
    steady = steady.loc[steady['topic'] == 'input']
    mean = steady['value'].mean()

    row = {'dim_value': int(dim_value), 'instances': int(instances), 'partitions': mean}
    raw_partitions.append(row)


 partitions = pd.DataFrame(raw_partitions)

 #runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])
 ```

 %% Cell type:code id: tags:

 ``` 
 # For every "instances" CSV of experiment `exp_id`, average the observed
 # value after the warm-up phase.
 # Result: `obs_instances`, one row per run with dim_value, instances and obs_instances.
 raw_obs_instances = []

 filenames = [filename for filename in os.listdir(directory) if filename.startswith(f"exp{exp_id}") and filename.endswith("instances.csv")]
 for filename in filenames:
    # File name without its 4-character extension, split on "_":
    # field 2 is the dim_value, field 3 the instance count.
    run_params = filename[:-4].split("_")
    dim_value = run_params[2]
    instances = run_params[3]

    df = pd.read_csv(os.path.join(directory, filename))

    if df.empty:
        continue

    # Offset of every row's timestamp from the first row's timestamp.
    df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]
    steady = df.loc[df['sec_start'] >= warmup_sec]  # drop the warm-up phase

    # Build the row for THIS file. Previously the row construction was
    # commented out while the append remained, so a stale `row` left over
    # from an earlier cell was appended (or a NameError was raised).
    # NOTE(review): assumes these CSVs have a `value` column like the
    # partitions CSVs - confirm against the file layout.
    mean = steady['value'].mean()

    row = {'dim_value': int(dim_value), 'instances': int(instances), 'obs_instances': mean}
    raw_obs_instances.append(row)


 obs_instances = pd.DataFrame(raw_obs_instances)

 obs_instances.head()
 ```

 %% Cell type:code id: tags:

 ``` 
 # `runs` is the table evaluated by the following cells. It currently is just
 # the lag trend table; the joins with `partitions` / `obs_instances` and the
 # derived "failed" flag below are disabled.
 runs = lags
 #runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])#.join(obs_instances.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])

 #runs["failed"] = runs.apply(lambda row: (abs(row['instances'] - row['obs_instances']) / row['instances']) > 0.1, axis=1)

 #runs.loc[runs['failed']==True]
 ```

 %% Cell type:code id: tags:

 ``` 
 #threshold = 1000

 # A run is suitable when the slope of its lag trend line is below `threshold`.
 # Vectorized comparison: yields a boolean Series directly, no row-wise apply.
 runs["suitable"] = runs['trend_slope'] < threshold

 runs.columns = runs.columns.str.strip()
 # Sorted view for display only; `runs` itself keeps its order.
 runs.sort_values(by=["dim_value", "instances"])
 ```

 %% Cell type:code id: tags:

 ``` 
 # Keep only the suitable runs (the boolean column serves directly as mask),
 # then take the smallest instance count per dim_value.
 filtered = runs[runs['suitable']]

 grouped = filtered.groupby(['dim_value'])['instances'].min()
 # Back to a two-column frame: dim_value, instances.
 min_suitable_instances = grouped.to_frame().reset_index()

 min_suitable_instances
 ```

 %% Cell type:code id: tags:

 ``` 
+min_suitable_instances.to_csv(f'../results-inst/exp{exp_id}_min-suitable-instances.csv', index=False)
+```
+
+%% Cell type:code id: tags:
+
+``` 
 # Line chart: minimal suitable instance count (y) over dim_value (x).
 min_suitable_instances.plot.line(x='dim_value', y='instances')
 # min_suitable_instances.plot(kind='line',x='dim_value',y='instances', logy=True)

 plt.show()
 ```

 %% Cell type:code id: tags:

 ``` 
 ```