{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimum suitable instances per load dimension\n",
    "\n",
    "For every run of experiment `exp_id`, a linear trend is fitted to the data in the run's `*totallag.csv` file after a warm-up period. A run counts as *suitable* when the trend slope is below `threshold`. The result is the smallest suitable `instances` value for each `dim_value`, written to CSV and plotted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"hello\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime, timedelta, timezone\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_id = 2012\n",
    "warmup_sec = 60\n",
    "# NOTE(review): warmup_partitions_sec is not referenced by any cell below;\n",
    "# confirm whether the partitions analysis was meant to use it instead of warmup_sec.\n",
    "warmup_partitions_sec = 120\n",
    "threshold = 2000 #slope\n",
    "#directory = '../results'\n",
    "directory = '<path-to>/results'\n",
    "directory_out = '<path-to>/results-inst'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def iter_run_frames(results_dir, experiment_id, suffix, warmup):\n",
    "    \"\"\"Yield (dim_value, instances, frame) for each result CSV of one experiment.\n",
    "\n",
    "    Files in ``results_dir`` are selected by the ``exp<experiment_id>`` name\n",
    "    prefix and the given ``suffix`` and visited in sorted order. The third and\n",
    "    fourth ``_``-separated parts of the file name (last four characters\n",
    "    stripped) are parsed as the integers ``dim_value`` and ``instances``.\n",
    "\n",
    "    Each frame gets a ``sec_start`` column (``timestamp`` minus the first row's\n",
    "    ``timestamp``) and only rows with ``sec_start >= warmup`` are yielded.\n",
    "    Files without any data rows are skipped.\n",
    "    \"\"\"\n",
    "    prefix = f\"exp{experiment_id}\"\n",
    "    for filename in sorted(os.listdir(results_dir)):\n",
    "        if not (filename.startswith(prefix) and filename.endswith(suffix)):\n",
    "            continue\n",
    "\n",
    "        run_params = filename[:-4].split(\"_\")\n",
    "        dim_value = int(run_params[2])\n",
    "        instances = int(run_params[3])\n",
    "\n",
    "        df = pd.read_csv(os.path.join(results_dir, filename))\n",
    "        if df.empty:\n",
    "            continue  # nothing was recorded for this run\n",
    "\n",
    "        df['sec_start'] = df['timestamp'] - df['timestamp'].iloc[0]\n",
    "        yield dim_value, instances, df.loc[df['sec_start'] >= warmup]  # drop warm-up rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lag trend per run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [],
   "source": [
    "raw_runs = []\n",
    "\n",
    "for dim_value, instances, regress in iter_run_frames(directory, exp_id, \"totallag.csv\", warmup_sec):\n",
    "    if regress.empty:\n",
    "        continue  # no samples after the warm-up period, so no trend can be fitted\n",
    "\n",
    "    # NOTE(review): columns are addressed by position; presumably column 2 is the\n",
    "    # timestamp and column 3 the lag value -- confirm against the CSV schema.\n",
    "    X = regress.iloc[:, 2].values.reshape(-1, 1)  # numpy array with a single feature column\n",
    "    Y = regress.iloc[:, 3].values.reshape(-1, 1)  # -1 lets numpy infer the number of rows\n",
    "    linear_regressor = LinearRegression()\n",
    "    linear_regressor.fit(X, Y)\n",
    "\n",
    "    raw_runs.append({\n",
    "        'dim_value': dim_value,\n",
    "        'instances': instances,\n",
    "        'trend_slope': linear_regressor.coef_[0][0],\n",
    "    })\n",
    "\n",
    "assert raw_runs, f\"no exp{exp_id} *totallag.csv runs with post-warm-up data in {directory}\"\n",
    "\n",
    "lags = pd.DataFrame(raw_runs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lags.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Auxiliary metrics\n",
    "\n",
    "`partitions` and `obs_instances` are computed here but are currently not joined into `runs` (see the commented-out join further down)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_partitions = []\n",
    "\n",
    "for dim_value, instances, frame in iter_run_frames(directory, exp_id, \"partitions.csv\", warmup_sec):\n",
    "    # Exact match on the topic name: an ordering comparison (>=) would also keep\n",
    "    # every topic whose name sorts after 'input'.\n",
    "    input_topic = frame.loc[frame['topic'] == 'input']\n",
    "\n",
    "    raw_partitions.append({\n",
    "        'dim_value': dim_value,\n",
    "        'instances': instances,\n",
    "        'partitions': input_topic['value'].mean(),\n",
    "    })\n",
    "\n",
    "partitions = pd.DataFrame(raw_partitions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_obs_instances = []\n",
    "\n",
    "for dim_value, instances, frame in iter_run_frames(directory, exp_id, \"instances.csv\", warmup_sec):\n",
    "    # NOTE(review): assumes instances.csv has a 'value' column like partitions.csv -- confirm.\n",
    "    raw_obs_instances.append({\n",
    "        'dim_value': dim_value,\n",
    "        'instances': instances,\n",
    "        'obs_instances': frame['value'].mean(),\n",
    "    })\n",
    "\n",
    "obs_instances = pd.DataFrame(raw_obs_instances)\n",
    "\n",
    "obs_instances.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Suitability and result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy so that adding columns below never mutates `lags`.\n",
    "runs = lags.copy()\n",
    "#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])#.join(obs_instances.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])\n",
    "\n",
    "#runs[\"failed\"] = runs.apply(lambda row: (abs(row['instances'] - row['obs_instances']) / row['instances']) > 0.1, axis=1)\n",
    "\n",
    "#runs.loc[runs['failed']==True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# True if the trend line has a slope less than `threshold`\n",
    "runs[\"suitable\"] = runs[\"trend_slope\"] < threshold\n",
    "\n",
    "runs.columns = runs.columns.str.strip()\n",
    "runs.sort_values(by=[\"dim_value\", \"instances\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered = runs[runs['suitable']]\n",
    "\n",
    "# Smallest instance count that was still suitable, per dim_value\n",
    "grouped = filtered.groupby('dim_value')['instances'].min()\n",
    "min_suitable_instances = grouped.to_frame().reset_index()\n",
    "\n",
    "min_suitable_instances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(directory_out, exist_ok=True)  # the output directory may not exist yet\n",
    "min_suitable_instances.to_csv(os.path.join(directory_out, f'exp{exp_id}_min-suitable-instances.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "# Pass logy=True to plot() for a logarithmic instances axis.\n",
    "min_suitable_instances.plot(kind='line', x='dim_value', y='instances', ax=ax)\n",
    "ax.set(xlabel='dim_value', ylabel='instances', title=f'exp{exp_id}: minimum suitable instances')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.0-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37064bitvenvvenv6c432ee1239d4f3cb23f871068b0267d",
   "display_name": "Python 3.7.0 64-bit ('.venv': venv)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}