Gold mining DS Project
Task statement
Prepare a prototype of a machine learning model for Zyfra, a company that develops efficiency solutions for heavy industry.
The model should predict the amount of gold recovered from gold ore, using data on the extraction and purification stages.
The model will help optimize production and eliminate unprofitable parameters.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from numpy.random import RandomState
from scipy import stats as st
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
train = pd.read_csv("/datasets/gold_recovery_train.csv")
test = pd.read_csv("/datasets/gold_recovery_test.csv")
full = pd.read_csv("/datasets/gold_recovery_full.csv")
train.info()
test.info()
rougher_table = train[['rougher.input.feed_au', 'rougher.output.concentrate_au',
'rougher.output.tail_au', 'rougher.output.recovery']]
rougher_table.columns = ['feed', 'concentrate', 'tail', 'recovery']
rougher_table.info()
rougher_table[['feed', 'concentrate', 'tail', 'recovery']].isnull().mean()
rougher_table = rougher_table.dropna().reset_index(drop=True)
rougher_table.info()
rougher_table.head()
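Recovery is recalculated with the standard flotation formula: recovery = C * (F - T) / (F * (C - T)) * 100, where C is the share of gold in the concentrate, F in the feed, and T in the tails. The recovery_cal function below implements exactly this.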
def recovery_cal(table):
    # Recalculate recovery: C * (F - T) / (F * (C - T)) * 100
    r = []
    for i in range(len(table)):
        c = table.loc[i, 'concentrate']  # Au share in the concentrate
        f = table.loc[i, 'feed']         # Au share in the feed
        t = table.loc[i, 'tail']         # Au share in the tails
        if f * (c - t) == 0:
            r.append(0)  # avoid division by zero
        else:
            r.append(c * (f - t) / (f * (c - t)) * 100)
    return r
rougher_table['r_pred'] = recovery_cal(rougher_table)
rougher_table['r_pred'] = rougher_table['r_pred'].fillna(value=0)
mae = mean_absolute_error(rougher_table['recovery'], rougher_table['r_pred'])
mae
Before re-calculating recovery it is better to drop all NaN values from the table to avoid errors.
Overall, the original recovery values differ very little from the re-calculated ones.
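As an optional check, the same formula can be written without an explicit loop; a minimal vectorized sketch over the renamed rougher_table columns (no new data or columns are assumed):
c = rougher_table['concentrate']
f = rougher_table['feed']
t = rougher_table['tail']
# Same formula as recovery_cal; infinities/NaNs from zero denominators are set to 0
r_vec = (c * (f - t) / (f * (c - t)) * 100).replace([np.inf, -np.inf], 0).fillna(0)
mean_absolute_error(rougher_table['recovery'], r_vec)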
not_in_test = [x for x in train.columns if x not in test.columns]
not_in_test
Nearly all the columns missing from the test table are measured or calculated after the process runs, so they are targets or intermediate outputs. The test table consists only of feature values.
target_columns = ['rougher.output.recovery', 'final.output.recovery']
features_columns = [x for x in train.columns if x in test.columns]
features_columns = features_columns[1:]  # drop the 'date' column
# str.replace returns a new string, so the cleaned names must be collected, not discarded
features_columns = [x.replace('\n', '').strip() for x in features_columns]
target_features_columns = target_columns + features_columns
print("We will lose {0:.2%} of the train data".format((len(train) - len(train.dropna(subset=target_features_columns))) / len(train)))
print("We will lose {0:.2%} of the test data".format((len(test) - len(test.dropna())) / len(test)))
train = train.dropna(subset=features_columns)
train = train.dropna(subset=target_columns)
test = test.dropna()
concent_au = full[['rougher.output.concentrate_au',
'primary_cleaner.output.concentrate_au',
'final.output.concentrate_au']]
concent_ag = full[['rougher.output.concentrate_ag',
'primary_cleaner.output.concentrate_ag',
'final.output.concentrate_ag']]
concent_pb = full[['rougher.output.concentrate_pb',
'primary_cleaner.output.concentrate_pb',
'final.output.concentrate_pb']]
# Gold (Au) concentration at each purification stage
plt.hist(concent_au.iloc[:, 0], bins=20, density=True, label='rough', histtype='step')
plt.hist(concent_au.iloc[:, 1], bins=20, density=True, label='primary', histtype='step')
plt.hist(concent_au.iloc[:, 2], bins=20, density=True, label='final', histtype='step')
plt.legend()
plt.show()
# Silver (Ag) concentration at each purification stage
plt.hist(concent_ag.iloc[:, 0], bins=20, density=True, label='rough', histtype='step')
plt.hist(concent_ag.iloc[:, 1], bins=20, density=True, label='primary', histtype='step')
plt.hist(concent_ag.iloc[:, 2], bins=20, density=True, label='final', histtype='step')
plt.legend()
plt.show()
# Lead (Pb) concentration at each purification stage
plt.hist(concent_pb.iloc[:, 0], bins=20, density=True, label='rough', histtype='step')
plt.hist(concent_pb.iloc[:, 1], bins=20, density=True, label='primary', histtype='step')
plt.hist(concent_pb.iloc[:, 2], bins=20, density=True, label='final', histtype='step')
plt.legend()
plt.show()
The histograms show that the gold concentration increases at each purification step, which is expected: the purification process filters out unrelated elements such as silver. Interestingly, the lead concentration increases as well, which would be worth investigating.
feed_columns = ['primary_cleaner.input.feed_size', 'rougher.input.feed_size']
train_feed = train[feed_columns]
test_feed = test[feed_columns]
# primary_cleaner.input.feed_size: train vs test
plt.hist(train_feed.iloc[:, 0], bins=20, density=True, label='train', histtype='step')
plt.hist(test_feed.iloc[:, 0], bins=20, density=True, label='test', histtype='step')
plt.legend()
plt.show()
# rougher.input.feed_size: train vs test
plt.hist(train_feed.iloc[:, 1], bins=20, density=True, label='train', histtype='step')
plt.hist(test_feed.iloc[:, 1], bins=20, density=True, label='test', histtype='step')
plt.legend()
plt.show()
There is almost no difference, so we can consider these distributions the same.
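As an optional sanity check (not required by the task), a two-sample Kolmogorov-Smirnov test from the already-imported scipy.stats can back up this visual impression; a minimal sketch:
for col in feed_columns:
    # Compare the train and test feed-size distributions column by column
    print(col, st.ks_2samp(train_feed[col].dropna(), test_feed[col].dropna()))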
raw_feed = ['rougher.input.feed_ag', 'rougher.input.feed_pb',
'rougher.input.feed_sol', 'rougher.input.feed_au']
rougher_concentrate = ['rougher.output.concentrate_ag', 'rougher.output.concentrate_pb',
'rougher.output.concentrate_sol', 'rougher.output.concentrate_au']
final_concentrate = ['final.output.concentrate_ag', 'final.output.concentrate_pb',
'final.output.concentrate_sol', 'final.output.concentrate_au']
full[raw_feed].sum(1).hist(bins=50)
full[rougher_concentrate].sum(1).hist()
full[final_concentrate].sum(1).hist()
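Rows whose summed metal concentrations are close to zero look like abnormal records (an empty or stalled process line), so they are filtered out of both tables below.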
train = train.loc[train[raw_feed].sum(1) > 1]
test = test.loc[test[raw_feed].sum(1) > 1]
train = train.loc[train[rougher_concentrate].sum(1) > 1]
train = train.loc[train[final_concentrate].sum(1) > 1]
features_r_columns = [x for x in features_columns if 'rougher.input' in x]
features_f_columns = [x for x in features_columns if 'primary' in x]
features_r = train[features_r_columns]
features_f = train[features_f_columns]
target_r = train[target_columns[0]]
target_f = train[target_columns[1]]
def smape(target, prediction):
    # Symmetric mean absolute percentage error, in percent
    s = sum(np.abs(prediction - target) / ((np.abs(target) + np.abs(prediction)) / 2))
    return s * 100 / len(target)
my_scorer = make_scorer(smape, greater_is_better=False)  # cross_val_score will report the negated sMAPE
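The project metric is a weighted combination of the two stages: final sMAPE = 25% * sMAPE(rougher.output.recovery) + 75% * sMAPE(final.output.recovery). This weighting is applied when evaluating on the test set below.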
model = LinearRegression()
print("LinearRegression:", cross_val_score(model, features_r, target_r, cv=3, scoring=my_scorer).mean())
for i in range(1, 10):
    model = DecisionTreeRegressor(max_depth=i, random_state=12345)
    print("DecisionTreeRegressor, max_depth =", i, ":", cross_val_score(model, features_r, target_r, cv=3, scoring=my_scorer).mean())
for i in range(20, 50, 10):
    model = RandomForestRegressor(n_estimators=i, max_depth=3, random_state=12345)
    print("RandomForestRegressor with", i, "estimators:", cross_val_score(model, features_r, target_r, cv=3, scoring=my_scorer).mean())
model = LinearRegression()
print("LinearRegression:", cross_val_score(model, features_f, target_f, cv=3, scoring=my_scorer).mean())
for i in range(1, 10):
    model = DecisionTreeRegressor(max_depth=i, random_state=12345)
    print("DecisionTreeRegressor, max_depth =", i, ":", cross_val_score(model, features_f, target_f, cv=3, scoring=my_scorer).mean())
for i in range(20, 50, 10):
    model = RandomForestRegressor(n_estimators=i, max_depth=1, random_state=12345)
    print("RandomForestRegressor with", i, "estimators:", cross_val_score(model, features_f, target_f, cv=3, scoring=my_scorer).mean())
The best model for the rougher recovery target is LinearRegression; for the final recovery target it is RandomForestRegressor with max_depth=1. We will test them on the test set.
# Attach the two recovery targets from the full dataset to the test features by date
targets_all = full[['date', 'rougher.output.recovery', 'final.output.recovery']]
test_new = test.set_index('date').join(targets_all.set_index('date'), how='left')
test_new = test_new.dropna()
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(features_r, target_r)
dummy_predict_r = dummy_regr.predict(test_new[features_r_columns])
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(features_f, target_f)
dummy_predict_f = dummy_regr.predict(test_new[features_f_columns])
final_smape_dummy = 0.25*smape(test_new['rougher.output.recovery'], dummy_predict_r) + 0.75*smape(test_new['final.output.recovery'], dummy_predict_f)
final_smape_dummy
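This constant-mean baseline is the number to beat: the trained models below should give a noticeably lower weighted sMAPE to be worth deploying.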
model = LinearRegression()
model.fit(features_r, target_r)
predict_r = model.predict(test_new[features_r_columns])
rough_smape = smape(test_new['rougher.output.recovery'], predict_r)
rough_smape
model = RandomForestRegressor(n_estimators=20, max_depth=1, random_state=12345)
model.fit(features_f, target_f)
predict_f = model.predict(test_new[features_f_columns])
final_stage_smape = smape(test_new['final.output.recovery'], predict_f)
final_stage_smape
total_smape = 0.25 * rough_smape + 0.75 * final_stage_smape
total_smape
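Comparing this weighted sMAPE of the chosen models with the dummy baseline above shows whether the prototype actually improves on a constant prediction; a lower value means it does.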