import pandas as pd
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sea
from IPython.display import display, Markdown, HTML
import inflection
from statsmodels import regression
import statsmodels.api as sm
import math
def printmd(text, fmts=None, all_fmt='{:,.2f}', humanize=True):
    # Render a Series/DataFrame as an HTML table with number formatting,
    # or render a plain string as Markdown.
    fmts = fmts or {}
    if type(text) == pd.Series:
        text = pd.DataFrame(text)
    if type(text) == pd.DataFrame:
        text = text.copy()
        for col in text.columns:
            try:
                text[col] = pd.to_numeric(text[col])
                fmt = fmts.get(col, all_fmt)  # per-column override, else the default
                text[col] = text[col].apply(lambda x: fmt.format(x))
            except Exception:
                pass  # leave non-numeric columns as-is
        if humanize:
            text.columns = text.columns.map(inflection.humanize)
        display(HTML(text.to_html()))
        return
    display(Markdown(text))
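printmd isn't exercised below, but for reference, a quick hypothetical usage with made-up values:
printmd(pd.DataFrame({'rpm': [3500.0], 'afr': [14.6832]}))  # HTML table: 3,500.00 / 14.68
printmd('**bold** text')  # rendered as Markdown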
phi = 1.6180339887498948  # golden ratio, used for figure aspect ratios
import xmltodict
def conv_text(text):
    # Parse a newline-separated block of numbers into a list of floats
    return [float(x.strip()) for x in text.split('\n')]
def load_ve():
    # Parse the exported VE table (XML .table file) into a DataFrame:
    # rows = load (y axis), columns = RPM (x axis)
    with open('ve1.table') as f:
        x = f.read()
    x = xmltodict.parse(x)
    x = x['tableData']['table']
    x_axis = conv_text(x['xAxis']['#text'])
    y_axis = conv_text(x['yAxis']['#text'])
    z_values = x['zValues']['#text']
    rows = z_values.strip().split('\n')
    data = [row.split() for row in rows]  # split() tolerates runs of whitespace
    data = pd.DataFrame(data, index=y_axis, columns=x_axis)
    for col in data.columns:
        data[col] = pd.to_numeric(data[col])
    return data
files = glob('*.msl')  # datalog files in the working directory
dtypes = None  # filled in by get_data() with the units row from the log header
cache = {}
def get_data():
    # Load one or more .msl datalogs (tab-separated; line 2 holds column
    # names, line 3 their units) into a single DataFrame, caching the result.
    global cache
    global dtypes
    if 'data' in cache.keys():
        return cache['data'].copy()
    file_list = dict(zip(list(range(len(files))), files))
    print(file_list)
    num = input('File number?')  # e.g. '0', or '0-2' to combine files 0 and 2
    datas = []
    for i in num.split('-'):
        with open(file_list[int(i)]) as f:
            raw_data = f.read()
        lines = raw_data.split('\n')
        columns = lines[2].split('\t')
        dtypes = lines[3].split('\t')
        data = lines[4:-1]
        data = pd.DataFrame(data=[x.split('\t') for x in data])
        data.columns = columns
        dtypes = dict(zip(columns, dtypes))
        datas.append(data)
    data = pd.concat(datas, ignore_index=True)
    for col in tqdm(data.columns):
        # The logs use the literal string "NaN" for missing samples
        data[col] = pd.to_numeric(data[col].str.replace("NaN", ""))
    cache['data'] = data.copy()
    return data
data = get_data()
data = get_data()
' , '.join(data.columns)  # all logged channels
dtypes['MPH']  # units recorded for the MPH channel
print("Drive Length")
print(round(data['Time'].max()/60), 'Minutes')
print("Average AFR")
print(data['AFR'].mean())
print("Average AFR under Positive TPSDOT Acceleration")
print(data[data['TPSdot'] > 0]['AFR'].mean())
def engine_pic(data):
    # Scatter of operating points in the RPM/Load plane, colored by AFR
    fig, ax = plt.subplots()
    fig.set_size_inches(8 * 1.6, 8)
    sc = ax.scatter(data['RPM'], data['Load'], alpha=.3, c=data['AFR'])
    plt.colorbar(sc)
    ax.set_xlabel("RPM")
    ax.set_ylabel("Load")
    plt.show()
data = get_data()
engine_pic(data)
Color is AFR
data = get_data()
data.set_index("Time")['CLT'].plot()  # coolant temperature over the drive; the warm-up is visible at the start
min_clt = 60  # only keep samples logged after the coolant passed 60 (warm engine)
data = get_data()
data = data[data["CLT"] > min_clt]
# Bin into 100-RPM by 10-load cells
data['rpm_binned'] = (data['RPM'] / 100).apply(int) * 100
data['load_binned'] = (data['Load'] / 10).apply(int) * 10
table = data.pivot_table('AFR', index="load_binned", columns="rpm_binned", aggfunc='mean')
fig, ax = plt.subplots()
fig.set_size_inches(8 * phi, 8)
sea.heatmap(table.loc[reversed(table.index)], center=14.7)  # centered on stoichiometric
We're running pretty rich down low.
data = get_data()
data = data[data["CLT"] > min_clt]
data['rpm_binned'] = (data['RPM'] / 100).apply(int) * 100
data['load_binned'] = (data['Load'] / 10).apply(int) * 10
table = data.pivot_table('VE1', index="load_binned", columns="rpm_binned", aggfunc='mean')
fig, ax = plt.subplots()
fig.set_size_inches(8 * phi, 8)
# Centered on the mean VE so above/below-average cells stand out
sea.heatmap(table.loc[reversed(table.index)], center=data['VE1'].mean())
VE suggested by the computer vs. the target AFR.
data = get_data()
data = data[data["CLT"] > min_clt]
data['rpm_binned'] = (data['RPM'] / 100).apply(int) * 100
data['load_binned'] = (data['Load'] / 10).apply(int) * 10
# Median is more robust than the mean to transient spikes in each bin
table = data.pivot_table('AFR 1 Error', index="load_binned", columns="rpm_binned", aggfunc='median')
fig, ax = plt.subplots()
fig.set_size_inches(8 * phi, 8)
sea.heatmap(table.loc[reversed(table.index)], center=0)
errors = table.copy()  # keep the binned error grid for the adjustment step
I originally thought I would need to do lag-time calculations on this, but averaging the data in each bin should do the trick. I can see a few weird spots, but I don't know if that is due to a low number of observations.
I also count the number of times I have an observation in each bin, so that I can weight my adjustments and avoid over-adjusting an area based on very few observations.
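As an aside, the count-then-cap logic in the next cell can be done without the helper column or the loop; a sketch of an equivalent using group sizes and clip (conf_sketch is a hypothetical name):
# Same confidence weight as the cell below: observations per bin,
# scaled by 1/100 and capped at 1 (100+ samples = full confidence)
counts = data.groupby(['load_binned', 'rpm_binned']).size().unstack()
conf_sketch = (counts / 100).clip(upper=1)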
data = get_data()
data['count'] = 1
data = data[data["CLT"] > min_clt]
data['rpm_binned'] = (data['RPM'] / 100).apply(int) * 100
data['load_binned'] = (data['Load'] / 10).apply(int) * 10
table = data.pivot_table('count', index="load_binned", columns="rpm_binned", aggfunc='sum')
# 100+ observations in a bin counts as full confidence
table = table / 100
for col in table.columns:
    table[col] = table[col].apply(lambda x: min(x, 1))
fig, ax = plt.subplots()
fig.set_size_inches(8 * phi, 8)
sea.heatmap(table.loc[reversed(table.index)], center=0, cmap='Spectral')
conf = table.copy()  # per-bin confidence weights
fig, ax = plt.subplots()
fig.set_size_inches(8 * phi, 8)
# AFR error weighted by confidence, so only well-observed cells stand out
sea.heatmap((errors * conf).loc[reversed(conf.index)], center=0)
ve = load_ve()
# Candidate adjustment factor: error normalized by its largest value,
# scaled by confidence, damped by 4, and centered on 1
((errors / errors.max().max()) * conf / 4) + 1
Adjust based on something like this? TODO: figure out how to apply the adjustment (one possibility is sketched below).
ve
sea.heatmap(ve.loc[reversed(ve.index)])  # the current VE table
Need to translate bins: the error grid lives on 100-RPM by 10-load bins, while the VE table has its own axes.
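A sketch of one way that translation and adjustment could work, under loud assumptions: reindex with method='nearest' snaps each VE-table axis value to the nearest error/confidence bin (both axes must be sorted ascending), and the multiplier assumes 'AFR 1 Error' is measured minus target, so a positive (lean) error raises VE. The 14.7 target and the amount of damping are guesses, not the settled procedure.
target_afr = 14.7  # assumed stoichiometric target
# Snap the binned grids onto the VE table's own axes
aligned_err = (errors.reindex(ve.index, method='nearest')
                     .reindex(columns=ve.columns, method='nearest'))
aligned_conf = (conf.reindex(ve.index, method='nearest')
                    .reindex(columns=ve.columns, method='nearest'))
# Lean (positive) error -> multiplier > 1 -> more fuel; confidence damps
# rarely-visited cells, and never-visited cells fall back to no change
multiplier = 1 + (aligned_err / target_afr) * aligned_conf.fillna(0)
ve_adjusted = ve * multiplier.fillna(1)
sea.heatmap(ve_adjusted.loc[reversed(ve_adjusted.index)])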
def linreg(series_x, series_y):
    # OLS of y on x, plus a scatter of the raw data with the fitted line
    X = series_x.values
    Y = series_y.values
    X = sm.add_constant(X)
    model = regression.linear_model.OLS(Y, X).fit()
    a = model.params[0]  # intercept
    b = model.params[1]  # slope
    X = X[:, 1]
    X2 = np.linspace(X.min(), X.max(), 100)
    Y_hat = X2 * b + a
    fig, ax = plt.subplots()
    fig.set_size_inches(8 * phi, 8)
    plt.scatter(X, Y, alpha=0.3)  # raw data
    plt.plot(X2, Y_hat, 'r', alpha=0.9)  # fitted line in red
    plt.xlabel(series_x.name)
    plt.ylabel(series_y.name)
    return model.summary()
data = get_data()
# Exclude overrun, idle, and slow-deceleration samples
data = data[data['Engine in overrun'] != 1]
data = data[data["Engine idling"] != 1]
data = data[data['Engine decelerating slowly'] != 1]
linreg(data['Load'], data['AFR'])
Question: what is different about the records where we run lean (AFR > 15) at low load (under 60)?
x = data[(data['Load'] < 60) & (data['AFR'] > 15)]
not_x = data[~((data['Load'] < 60) & (data['AFR'] > 15))]  # the complement of x
probs = pd.DataFrame()
for column in x.columns:
    probs.loc[column, 'x'] = x[column].mean()
    probs.loc[column, 'not_x'] = not_x[column].mean()
probs['difference'] = probs['x'] - probs['not_x']
probs['pct_error'] = probs['difference'] / probs['not_x']
probs = probs.sort_values('pct_error')
probs = probs.dropna()
# Cap infinities (from zero means in not_x) just above the largest finite value
probs = probs.replace(np.inf, probs[probs['pct_error'].apply(abs) != np.inf]['pct_error'].max() + 1)
fig, ax = plt.subplots()
fig.set_size_inches(8, 8 * phi)
sea.barplot(x=probs['pct_error'], y=probs.index, ax=ax)
sea.despine(ax=ax)
Estimated by taking the number of records with AFR above 15 and load below 60 and multiplying it by the average time step in the dataset.
time_spent = len(x) * data['Time'].diff().mean()
'{:,.2f} seconds in a {:.0f} minute drive - {:,.2f}% of driving time'.format(time_spent, data['Time'].max() / 60, 100 * time_spent / data["Time"].max())