Skip to content
Snippets Groups Projects
Commit 7844395b authored by Sören Henning's avatar Sören Henning
Browse files

Use summed lag instead of just for input topic

parent 2d7c11d9
No related branches found
No related tags found
No related merge requests found
...@@ -50,6 +50,8 @@ for result in results: ...@@ -50,6 +50,8 @@ for result in results:
df = pd.DataFrame(d) df = pd.DataFrame(d)
# Do some analysis
input = df.loc[df['topic'] == "input"] input = df.loc[df['topic'] == "input"]
#input.plot(kind='line',x='timestamp',y='value',color='red') #input.plot(kind='line',x='timestamp',y='value',color='red')
...@@ -83,6 +85,30 @@ plt.savefig(f"{filename}_plot.png") ...@@ -83,6 +85,30 @@ plt.savefig(f"{filename}_plot.png")
df.to_csv(f"{filename}_values.csv") df.to_csv(f"{filename}_values.csv")
# Load total lag count
response = requests.get('http://kube1.se.internal:32529/api/v1/query_range', params={
'query': "sum by(group)(kafka_consumergroup_group_lag > 0)",
'start': start.isoformat(),
'end': end.isoformat(),
'step': '5s'})
results = response.json()['data']['result']
d = []
for result in results:
#print(result['metric']['topic'])
group = result['metric']['group']
for value in result['values']:
#print(value)
d.append({'group': group, 'timestamp': int(value[0]), 'value': int(value[1]) if value[1] != 'NaN' else 0})
df = pd.DataFrame(d)
df.to_csv(f"{filename}_totallag.csv")
# Load partition count # Load partition count
response = requests.get('http://kube1.se.internal:32529/api/v1/query_range', params={ response = requests.get('http://kube1.se.internal:32529/api/v1/query_range', params={
......
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
print("hello") print("hello")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
import os import os
import requests import requests
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
import pandas as pd import pandas as pd
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
os.getcwd() os.getcwd()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
exp_id = 35 exp_id = 159
warmup_sec = 60 warmup_sec = 60
threshold = 2000 #slope threshold = 2000 #slope
``` ```
%% Cell type:code id: tags:outputPrepend,outputPrepend %% Cell type:code id: tags:outputPrepend,outputPrepend
``` ```
#exp_id = 35 #exp_id = 35
#os.chdir("./results-new") #os.chdir("./results-new")
raw_runs = [] raw_runs = []
filenames = [filename for filename in os.listdir('.') if filename.startswith(f"exp{exp_id}") and filename.endswith(".csv")] filenames = [filename for filename in os.listdir('.') if filename.startswith(f"exp{exp_id}") and filename.endswith("totallag.csv")]
for filename in filenames: for filename in filenames:
#print(filename) #print(filename)
run_params = filename[:-4].split("_") run_params = filename[:-4].split("_")
dim_value = run_params[2] dim_value = run_params[2]
instances = run_params[3] instances = run_params[3]
df = pd.read_csv(filename) df = pd.read_csv(filename)
input = df.loc[df['topic'] == "input"] #input = df.loc[df['topic'] == "input"]
input = df
#print(input) #print(input)
input['sec_start'] = input.loc[0:, 'timestamp'] - input.iloc[0]['timestamp'] input['sec_start'] = input.loc[0:, 'timestamp'] - input.iloc[0]['timestamp']
#print(input) #print(input)
#print(input.iloc[0, 'timestamp']) #print(input.iloc[0, 'timestamp'])
regress = input.loc[input['sec_start'] >= warmup_sec] # Warm-Up regress = input.loc[input['sec_start'] >= warmup_sec] # Warm-Up
#regress = input #regress = input
#input.plot(kind='line',x='timestamp',y='value',color='red') #input.plot(kind='line',x='timestamp',y='value',color='red')
#plt.show() #plt.show()
X = regress.iloc[:, 2].values.reshape(-1, 1) # values converts it into a numpy array X = regress.iloc[:, 2].values.reshape(-1, 1) # values converts it into a numpy array
Y = regress.iloc[:, 3].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column Y = regress.iloc[:, 3].values.reshape(-1, 1) # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression() # create object for the class linear_regressor = LinearRegression() # create object for the class
linear_regressor.fit(X, Y) # perform linear regression linear_regressor.fit(X, Y) # perform linear regression
Y_pred = linear_regressor.predict(X) # make predictions Y_pred = linear_regressor.predict(X) # make predictions
trend_slope = linear_regressor.coef_[0][0] trend_slope = linear_regressor.coef_[0][0]
#print(linear_regressor.coef_) #print(linear_regressor.coef_)
row = {'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope} row = {'dim_value': int(dim_value), 'instances': int(instances), 'trend_slope': trend_slope}
#print(row) #print(row)
raw_runs.append(row) raw_runs.append(row)
runs = pd.DataFrame(raw_runs) runs = pd.DataFrame(raw_runs)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
runs.head() runs.head()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#threshold = 1000 #threshold = 1000
# Set to true if the trend line has a slope less than # Set to true if the trend line has a slope less than
runs["suitable"] = runs.apply(lambda row: row['trend_slope'] < threshold, axis=1) runs["suitable"] = runs.apply(lambda row: row['trend_slope'] < threshold, axis=1)
runs.columns = runs.columns.str.strip() runs.columns = runs.columns.str.strip()
runs.sort_values(by=["dim_value", "instances"]) runs.sort_values(by=["dim_value", "instances"])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
filtered = runs[runs.apply(lambda x: x['suitable'], axis=1)] filtered = runs[runs.apply(lambda x: x['suitable'], axis=1)]
grouped = filtered.groupby(['dim_value'])['instances'].min() grouped = filtered.groupby(['dim_value'])['instances'].min()
min_suitable_instances = grouped.to_frame().reset_index() min_suitable_instances = grouped.to_frame().reset_index()
min_suitable_instances min_suitable_instances
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
min_suitable_instances.plot(kind='line',x='dim_value',y='instances') min_suitable_instances.plot(kind='line',x='dim_value',y='instances')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment