{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch cell: writes a fixed greeting to stdout.\n",
    "print('hello')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime, timedelta, timezone\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the kernel's current working directory (value of the last expression).\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only result files whose name starts with f\"exp{exp_id}\" are analysed.\n",
    "exp_id = 2012\n",
    "\n",
    "# Warm-up cutoff: rows whose 'sec_start' offset is below this value are excluded.\n",
    "warmup_sec = 60\n",
    "\n",
    "# Not referenced by any cell of this notebook.\n",
    "warmup_partitions_sec = 120\n",
    "\n",
    "# A run counts as suitable when its lag trend slope is below this value.\n",
    "threshold = 2000\n",
    "\n",
    "# Input directory with the result CSVs (alternative: '../results') and output directory.\n",
    "directory = '<path-to>/results'\n",
    "directory_out = '<path-to>/results-inst'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [],
   "source": [
    "# Fit a linear trend to every run's total-lag CSV and record the slope per run.\n",
    "raw_runs = []\n",
    "\n",
    "filenames = [\n",
    "    filename for filename in os.listdir(directory)\n",
    "    if filename.startswith(f\"exp{exp_id}\") and filename.endswith(\"totallag.csv\")\n",
    "]\n",
    "for filename in filenames:\n",
    "    # Drop the '.csv' suffix and split on '_'; fields 2 and 3 are read as dim_value and instances.\n",
    "    run_params = filename[:-4].split(\"_\")\n",
    "    dim_value = run_params[2]\n",
    "    instances = run_params[3]\n",
    "\n",
    "    lag_df = pd.read_csv(os.path.join(directory, filename))\n",
    "    if lag_df.empty:\n",
    "        # An empty file has no first row to take the reference timestamp from.\n",
    "        print(f'Skipping {filename}: file contains no rows')\n",
    "        continue\n",
    "\n",
    "    # Offset of every record from the first record of the run.\n",
    "    lag_df['sec_start'] = lag_df['timestamp'] - lag_df['timestamp'].iloc[0]\n",
    "    regress = lag_df.loc[lag_df['sec_start'] >= warmup_sec]  # discard the warm-up phase\n",
    "    if regress.empty:\n",
    "        # LinearRegression.fit cannot be called with zero samples.\n",
    "        print(f'Skipping {filename}: no rows left after warm-up')\n",
    "        continue\n",
    "\n",
    "    # Columns are selected by position: column 2 is the regressor, column 3 the response.\n",
    "    # reshape(-1, 1) turns each into a single-column 2-D array as expected by fit().\n",
    "    X = regress.iloc[:, 2].values.reshape(-1, 1)\n",
    "    Y = regress.iloc[:, 3].values.reshape(-1, 1)\n",
    "    linear_regressor = LinearRegression()\n",
    "    linear_regressor.fit(X, Y)\n",
    "\n",
    "    # One feature and one target column, so the slope is the single entry of coef_.\n",
    "    trend_slope = linear_regressor.coef_[0][0]\n",
    "\n",
    "    row = {'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope}\n",
    "    raw_runs.append(row)\n",
    "\n",
    "# Explicit columns keep the frame usable downstream even when no run was loaded.\n",
    "lags = pd.DataFrame(raw_runs, columns=['dim_value', 'instances', 'trend_slope'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first five rows of the per-run trend slopes.\n",
    "lags.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every run, take the mean of the 'value' column over the rows of the 'input'\n",
    "# topic (after warm-up) and store it under the key 'partitions'.\n",
    "raw_partitions = []\n",
    "\n",
    "filenames = [\n",
    "    filename for filename in os.listdir(directory)\n",
    "    if filename.startswith(f\"exp{exp_id}\") and filename.endswith(\"partitions.csv\")\n",
    "]\n",
    "for filename in filenames:\n",
    "    # Drop the '.csv' suffix and split on '_'; fields 2 and 3 are read as dim_value and instances.\n",
    "    run_params = filename[:-4].split(\"_\")\n",
    "    dim_value = run_params[2]\n",
    "    instances = run_params[3]\n",
    "\n",
    "    partitions_df = pd.read_csv(os.path.join(directory, filename))\n",
    "    if partitions_df.empty:\n",
    "        # An empty file has no first row to take the reference timestamp from.\n",
    "        print(f'Skipping {filename}: file contains no rows')\n",
    "        continue\n",
    "\n",
    "    # Offset of every record from the first record of the run.\n",
    "    partitions_df['sec_start'] = partitions_df['timestamp'] - partitions_df['timestamp'].iloc[0]\n",
    "    # NOTE(review): warmup_partitions_sec is defined in the config cell but never used;\n",
    "    # confirm that warmup_sec is the intended cutoff here.\n",
    "    after_warmup = partitions_df.loc[partitions_df['sec_start'] >= warmup_sec]\n",
    "\n",
    "    # Exact match on the topic name (a '>=' string comparison would also admit\n",
    "    # every topic that sorts after 'input').\n",
    "    input_topic = after_warmup.loc[after_warmup['topic'] == 'input']\n",
    "    mean = input_topic['value'].mean()\n",
    "\n",
    "    row = {'dim_value': int(dim_value), 'instances': int(instances), 'partitions': mean}\n",
    "    raw_partitions.append(row)\n",
    "\n",
    "# Explicit columns keep the frame well-formed even when no file was loaded.\n",
    "partitions = pd.DataFrame(raw_partitions, columns=['dim_value', 'instances', 'partitions'])\n",
    "\n",
    "#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every run, take the mean of the 'value' column of its instances CSV\n",
    "# (after warm-up) and store it under the key 'obs_instances'.\n",
    "raw_obs_instances = []\n",
    "\n",
    "filenames = [\n",
    "    filename for filename in os.listdir(directory)\n",
    "    if filename.startswith(f\"exp{exp_id}\") and filename.endswith(\"instances.csv\")\n",
    "]\n",
    "for filename in filenames:\n",
    "    # Drop the '.csv' suffix and split on '_'; fields 2 and 3 are read as dim_value and instances.\n",
    "    run_params = filename[:-4].split(\"_\")\n",
    "    dim_value = run_params[2]\n",
    "    instances = run_params[3]\n",
    "\n",
    "    instances_df = pd.read_csv(os.path.join(directory, filename))\n",
    "    if instances_df.empty:\n",
    "        # An empty file has no first row to take the reference timestamp from.\n",
    "        continue\n",
    "\n",
    "    # Offset of every record from the first record of the run.\n",
    "    instances_df['sec_start'] = instances_df['timestamp'] - instances_df['timestamp'].iloc[0]\n",
    "    after_warmup = instances_df.loc[instances_df['sec_start'] >= warmup_sec]  # discard the warm-up phase\n",
    "\n",
    "    # Assumes the instances CSV has a 'value' column like the partitions CSV - TODO confirm.\n",
    "    mean = after_warmup['value'].mean()\n",
    "\n",
    "    # The row must be built here; otherwise the append below would pick up a stale\n",
    "    # `row` left behind by an earlier cell (or fail on a fresh kernel).\n",
    "    row = {'dim_value': int(dim_value), 'instances': int(instances), 'obs_instances': mean}\n",
    "    raw_obs_instances.append(row)\n",
    "\n",
    "# Explicit columns keep the frame well-formed even when no file was loaded.\n",
    "obs_instances = pd.DataFrame(raw_obs_instances, columns=['dim_value', 'instances', 'obs_instances'])\n",
    "\n",
    "obs_instances.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy: a later cell adds a 'suitable' column to `runs`, and with a plain\n",
    "# alias that column would silently appear on `lags` as well.\n",
    "runs = lags.copy()\n",
    "\n",
    "# Disabled: join with the partitions / observed-instances frames and flag runs whose\n",
    "# observed instance count deviates by more than 10% from the requested one.\n",
    "#runs = lags.join(partitions.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])#.join(obs_instances.set_index(['dim_value', 'instances']), on=['dim_value', 'instances'])\n",
    "\n",
    "#runs[\"failed\"] = runs.apply(lambda row: (abs(row['instances'] - row['obs_instances']) / row['instances']) > 0.1, axis=1)\n",
    "\n",
    "#runs.loc[runs['failed']==True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A run is suitable when the slope of its lag trend line is below `threshold`.\n",
    "# The comparison is vectorised over the whole column, so it also works on an empty frame.\n",
    "runs[\"suitable\"] = runs[\"trend_slope\"] < threshold\n",
    "\n",
    "# Trim surrounding whitespace from the column labels.\n",
    "runs.columns = runs.columns.str.strip()\n",
    "\n",
    "# Displayed sorted; `runs` itself keeps its original row order.\n",
    "runs.sort_values(by=[\"dim_value\", \"instances\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the suitable runs; the boolean column is used directly as the row mask.\n",
    "filtered = runs[runs['suitable'].astype(bool)]\n",
    "\n",
    "# Smallest suitable instance count for each dim_value.\n",
    "grouped = filtered.groupby(['dim_value'])['instances'].min()\n",
    "min_suitable_instances = grouped.reset_index()\n",
    "\n",
    "min_suitable_instances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the per-dim_value minimum to the output directory, without the index column.\n",
    "min_suitable_instances.to_csv(\n",
    "    os.path.join(directory_out, f'exp{exp_id}_min-suitable-instances.csv'),\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Line chart of the minimum suitable instance count over dim_value.\n",
    "# (Pass logy=True for a logarithmic y-axis.)\n",
    "min_suitable_instances.plot.line(x='dim_value', y='instances')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.0-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37064bitvenvvenv6c432ee1239d4f3cb23f871068b0267d",
   "display_name": "Python 3.7.0 64-bit ('.venv': venv)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}