{
"cells": [
{
"cell_type": "code",
"execution_count": 351,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import xgboost as xgb\n",
"#Load Data Prepr Librarier\n",
"import pandas as pd\n",
"import numpy as np\n",
"#Load Plotly\n",
"from plotly import __version__\n",
"from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n",
"import plotly.graph_objs as go\n",
"init_notebook_mode(connected=True)\n",
"from sklearn.model_selection import KFold\n",
"import statsmodels.formula.api as smf"
]
},
{
"cell_type": "code",
"execution_count": 378,
"metadata": {},
"outputs": [],
"source": [
"#Import Data\n",
"pr_folder = \"C:\\\\Users\\\\c10670A\\\\Documents\\\\ProjectLibrary\\\\Datathon_2018\"\n",
"\n",
"#Get the data\n",
"datafile = pd.read_csv(pr_folder + '\\\\Data\\\\All_data.csv', delimiter = \",\").fillna(0)\n",
"\n",
"datafile['Holiday'] = np.where(datafile['Week']==6,1,\n",
" np.where(datafile['Week']==58,1,\n",
" np.where(datafile['Week']==110,1,\n",
" np.where(datafile['Week']==33,1,\n",
" np.where(datafile['Week']==85,1,\n",
" np.where(datafile['Week']==137,1,0))))))\n",
"#Add last discount"
]
},
{
"cell_type": "code",
"execution_count": 379,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"89\n",
"BASE2_ACTUAL_PRICE_y\n",
"BASE2_ACTUAL_PRICE_PROM\n",
"ACTUAL_PRICE\n",
"DISCOUNT2_ACTUAL_PRICE_x\n",
"BASE2_ACTUAL_PRICE_HP_C\n",
"BASE2_ACTUAL_PRICE_HP_T\n",
"DISCOUNT2_ACTUAL_PRICE_y\n",
"LAST_DISCOUNT\n",
"VOLUME_LAG1\n",
"VOLUME_LAG2\n",
"VOLUME_LAG3\n",
"VOLUME_AVGLAG2\n",
"VOLUME_AVGLAG3\n",
"Week from promotion start\n",
"Week 1\n",
"Week 2\n",
"Week 3\n",
"Week 4\n",
"Week 5\n",
"Week 6\n",
"Week 7\n",
"time_from_prev2\n",
"time_from_prev_A\n",
"time_from_prev_B\n",
"time_from_prev_C\n",
"time_from_prev_D\n",
"time_from_prev_E\n",
"Promotion A dummy\n",
"Promotion B dummy\n",
"Promotion C dummy\n",
"Promotion D dummy\n",
"Promotion E dummy\n",
"VOLUMES_LAST_PROM\n",
"TIME_LAST_PROM\n",
"AVG_VOL_LAST_PROM\n",
"Price_Diff_1\n",
"Price_Diff_2\n",
"Price_Diff_3\n",
"Price_Diff_4\n",
"Price_Diff_5\n",
"Price_Diff_6\n",
"Price_log_Diff_1\n",
"Price_log_Diff_2\n",
"Price_log_Diff_3\n",
"Price_log_Diff_4\n",
"Price_log_Diff_5\n",
"Price_log_Diff_6\n",
"TYPE_COMPETITOR1_PRICE_3\n",
"TYPE_COMPETITOR2_PRICE_3\n",
"TYPE_COMPETITOR3_PRICE_3\n",
"TYPE_COMPETITOR4_PRICE_3\n",
"TYPE_COMPETITOR5_PRICE_3\n",
"TYPE_COMPETITOR6_PRICE_3\n",
"TYPE_COMPETITOR7_PRICE_3\n",
"BASE2_COMPETITOR1_PRICE\n",
"BASE2_COMPETITOR2_PRICE\n",
"BASE2_COMPETITOR3_PRICE\n",
"BASE2_COMPETITOR4_PRICE\n",
"BASE2_COMPETITOR5_PRICE\n",
"BASE2_COMPETITOR6_PRICE\n",
"BASE2_COMPETITOR7_PRICE\n",
"DISCOUNT2_COMPETITOR1_PRICE\n",
"DISCOUNT2_COMPETITOR2_PRICE\n",
"DISCOUNT2_COMPETITOR3_PRICE\n",
"DISCOUNT2_COMPETITOR4_PRICE\n",
"DISCOUNT2_COMPETITOR5_PRICE\n",
"DISCOUNT2_COMPETITOR6_PRICE\n",
"DISCOUNT2_COMPETITOR7_PRICE\n",
"BASE1_COMPETITOR1_PRICE\n",
"BASE1_COMPETITOR2_PRICE\n",
"BASE1_COMPETITOR3_PRICE\n",
"BASE1_COMPETITOR4_PRICE\n",
"BASE1_COMPETITOR5_PRICE\n",
"BASE1_COMPETITOR6_PRICE\n",
"BASE1_COMPETITOR7_PRICE\n",
"COMPETITOR1_PRICE\n",
"COMPETITOR2_PRICE\n",
"COMPETITOR3_PRICE\n",
"COMPETITOR4_PRICE\n",
"COMPETITOR5_PRICE\n",
"COMPETITOR6_PRICE\n",
"COMPETITOR7_PRICE\n",
"DISCOUNT1_COMPETITOR1_PRICE\n",
"DISCOUNT1_COMPETITOR2_PRICE\n",
"DISCOUNT1_COMPETITOR3_PRICE\n",
"DISCOUNT1_COMPETITOR4_PRICE\n",
"DISCOUNT1_COMPETITOR5_PRICE\n",
"DISCOUNT1_COMPETITOR6_PRICE\n",
"DISCOUNT1_COMPETITOR7_PRICE\n"
]
}
],
"source": [
"#Uplift factors\n",
"model_list = [\"BASE2_ACTUAL_PRICE_y\",\"BASE2_ACTUAL_PRICE_PROM\", \"ACTUAL_PRICE\",\"DISCOUNT2_ACTUAL_PRICE_x\", \"BASE2_ACTUAL_PRICE_HP_C\", \"BASE2_ACTUAL_PRICE_HP_T\", \"DISCOUNT2_ACTUAL_PRICE_y\",\"LAST_DISCOUNT\"]\n",
"model_list = model_list +[\"VOLUME_LAG1\", \"VOLUME_LAG2\", \"VOLUME_LAG3\" ,\"VOLUME_AVGLAG2\", \"VOLUME_AVGLAG3\"]\n",
"\n",
"#Promotion effectivness\n",
"model_list = model_list +[\"Week from promotion start\"] +['Week ' + str(p) for p in range(1, 8)]\n",
"model_list = model_list +[\"time_from_prev2\"]+[\"time_from_prev_\"+p for p in [\"A\",\"B\",\"C\",\"D\",\"E\"]]\n",
"model_list = model_list +['Promotion ' + p + ' dummy' for p in [\"A\", \"B\", \"C\", \"D\", \"E\"]]\n",
"model_list = model_list +['VOLUMES_LAST_PROM', 'TIME_LAST_PROM', 'AVG_VOL_LAST_PROM']\n",
"\n",
"\n",
"#Canibalization\n",
"model_list = model_list +[\"Price_Diff_\" + str(p) for p in range(1,7)]+ [\"Price_log_Diff_\" + str(p) for p in range(1,7)]\n",
"model_list = model_list +[\"TYPE_COMPETITOR\" + str(p) + \"_PRICE_3\" for p in range(1,8)] + [\"BASE2_COMPETITOR\" + str(p) +\"_PRICE\" for p in range(1,8)] + [\"DISCOUNT2_COMPETITOR\" + str(p) + \"_PRICE\" for p in range(1,8)] +[\"BASE1_COMPETITOR\" + str(p) +\"_PRICE\" for p in range(1,8)]+[\"COMPETITOR\" + str(p) +\"_PRICE\" for p in range(1,8)] + [\"DISCOUNT1_COMPETITOR\" + str(p) + \"_PRICE\" for p in range(1,8)] \n",
"\n",
"\n",
"print(len(model_list))\n",
"for i in model_list:\n",
" print(i)\n",
"X_columns = datafile.loc[:,model_list].columns\n",
"x_train = datafile.loc[:,model_list].values\n",
"y_train = datafile[\"VOLUME_OF_SALES\"].values"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ITERATION 0\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 1\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 2\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 3\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 4\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 5\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 6\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 7\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 8\n",
"0.99049639445\n",
"0.711359651805\n",
"ITERATION 9\n",
"0.99049639445\n",
"0.711359651805\n"
]
}
],
"source": [
"\n",
"\n",
"results = pd.DataFrame()\n",
"#row = pd.DataFrame({\"Sample\" : k, \"0\" : 0, \"1\" : 0, \"2\" : 0, \"3\" : 0, \"4\" : 0, \"5\" : 0, \"6\" : 0, \"7\" : 0, \"8\" : 0, \"9\" : 0}, index = [k],)\n",
"\n",
"for i in range(1):\n",
" print(\"ITERATION \" + str(i))\n",
" #XGB Model\n",
" xgb_params = {}\n",
" xgb_params['booster'] = ['gbtree','gblinear','dart'][2]\n",
" xgb_params['objective'] = ['reg:linear','reg:gamma','reg:tweedie'][0]\n",
" xgb_params['learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1][1]\n",
" xgb_params['eval_metric'] = ['rmse','mae','logloss','error','merror','mlogloss','ndcg','map'][0]\n",
" xgb_params['colsample_bytree'] = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0][8]\n",
" xgb_params['max_depth'] = [1, 2, 4, 6, 8, 10, 12, 14, 16, 20][3]\n",
" xgb_params['subsample'] = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0][5]\n",
" xgb_params['min_child_weight'] = [1, 2, 4, 5, 6, 8, 10, 20, 50, 100][2]\n",
" xgb_params['n_estimators'] = [10,50,100,300,500, 800, 1000, 1500, 2000][2]\n",
" num_boost_round = [5,10,25,50,100,200,250,300,500,900][3]\n",
" \n",
" NFOLDS = 5\n",
" kf = KFold(n_splits = NFOLDS, random_state = 999)\n",
" \n",
" ntrain = x_train.shape[0]\n",
" \n",
" oof_train = np.zeros((ntrain,))\n",
" oof_test_skf = np.empty((NFOLDS, ntrain))\n",
" \n",
" #Go through each FOLD, solve the problem and store the results for the train and test samples\n",
" k = 0\n",
" temp1 = 0\n",
" temp2 = 0\n",
" for train_index, test_index in kf.split(x_train):\n",
" x_tr = x_train[train_index]\n",
" y_tr = y_train[train_index]\n",
" x_te = x_train[test_index]\n",
" y_te = y_train[test_index]\n",
" \n",
" dtrain = xgb.DMatrix(x_tr, y_tr, feature_names = X_columns)\n",
" dtrain_test = xgb.DMatrix(x_te, feature_names = X_columns)\n",
" gbm = xgb.train(dict(xgb_params, silent = 1), dtrain, num_boost_round = num_boost_round)\n",
" \n",
" #Check the results\n",
" temp1 += np.power(pd.concat([pd.DataFrame(y_tr), pd.DataFrame(pd.DataFrame(gbm.predict(dtrain).tolist()).iloc[:,0])], axis = 1).corr().iloc[1,0],2)\n",
" temp2 += np.power(pd.concat([pd.DataFrame(y_te), pd.DataFrame(pd.DataFrame(gbm.predict(dtrain_test).tolist()).iloc[:,0])], axis = 1).corr().iloc[1,0],2)\n",
" k = k + 1\n",
" print(temp1/k)\n",
" print(temp2/k)\n",
"\n",
"\n",
"#table_macro = pd.concat([table_macro, row], axis = 0)\n",
"\n",
"#corr_app_sale = pd.concat([t_predict_s2, t_predict_sale, t_predict_build ,t_predict_macro], axis = 1).corr()"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"VOLUME_LAG3 36\n",
"DISCOUNT2_COMPETITOR1_PRICE 26\n",
"BASE2_ACTUAL_PRICE_HP_C 23\n",
"Price_Diff_3 21\n",
"VOLUME_LAG1 21\n",
"time_from_prev_A 20\n",
"VOLUME_AVGLAG3 19\n",
"VOLUME_AVGLAG2 19\n",
"BASE2_ACTUAL_PRICE_HP_T 19\n",
"LAST_DISCOUNT 15\n",
"BASE1_COMPETITOR1_PRICE 15\n",
"VOLUME_LAG2 15\n",
"time_from_prev_C 13\n",
"Price_Diff_2 13\n",
"Price_log_Diff_1 12\n",
"Price_Diff_1 12\n",
"time_from_prev_B 12\n",
"VOLUMES_LAST_PROM 11\n",
"BASE2_COMPETITOR1_PRICE 11\n",
"DISCOUNT1_COMPETITOR1_PRICE 11\n",
"Price_Diff_4 11\n",
"ACTUAL_PRICE 11\n",
"AVG_VOL_LAST_PROM 10\n",
"BASE2_ACTUAL_PRICE_PROM 9\n",
"COMPETITOR1_PRICE 9\n",
"time_from_prev_D 7\n",
"Week 1 6\n",
"Week from promotion start 6\n",
"BASE2_COMPETITOR3_PRICE 4\n",
"DISCOUNT2_ACTUAL_PRICE_y 4\n",
"time_from_prev_E 4\n",
"Price_log_Diff_5 4\n",
"Price_Diff_5 3\n",
"Promotion A dummy 3\n",
"BASE2_COMPETITOR5_PRICE 3\n",
"Price_log_Diff_4 2\n",
"time_from_prev2 2\n",
"Week 3 2\n",
"BASE1_COMPETITOR4_PRICE 2\n",
"BASE1_COMPETITOR5_PRICE 2\n",
"TYPE_COMPETITOR1_PRICE_3 2\n",
"Price_Diff_6 2\n",
"COMPETITOR4_PRICE 2\n",
"DISCOUNT1_COMPETITOR4_PRICE 2\n",
"DISCOUNT2_ACTUAL_PRICE_x 2\n",
"Promotion C dummy 2\n",
"Promotion D dummy 1\n",
"TIME_LAST_PROM 1\n",
"BASE2_COMPETITOR4_PRICE 1\n",
"TYPE_COMPETITOR5_PRICE_3 1\n",
"COMPETITOR5_PRICE 1\n",
"Name: Score, dtype: int64\n"
]
}
],
"source": [
"fimp = pd.Series(gbm.get_score(), name = \"Score\").sort_values(ascending = False)\n",
"print(fimp)"
]
},
{
"cell_type": "code",
"execution_count": 318,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"=========================================================================================\n",
"Dep. Variable: datafile_2['VOLUME_OF_SALES'] R-squared: 0.856\n",
"Model: OLS Adj. R-squared: 0.783\n",
"Method: Least Squares F-statistic: 11.66\n",
"Date: Sun, 11 Feb 2018 Prob (F-statistic): 4.99e-24\n",
"Time: 11:21:12 Log-Likelihood: -1625.0\n",
"No. Observations: 146 AIC: 3350.\n",
"Df Residuals: 96 BIC: 3499.\n",
"Df Model: 49 \n",
"Covariance Type: nonrobust \n",
"=============================================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------------------------------\n",
"Intercept 1.126e+05 7.85e+04 1.435 0.154 -4.31e+04 2.68e+05\n",
"datafile_2['BASE2_ACTUAL_PRICE_y'] 3.133e+04 1.98e+06 0.016 0.987 -3.89e+06 3.95e+06\n",
"datafile_2['BASE2_ACTUAL_PRICE_PROM'] -4.299e+05 1.54e+06 -0.279 0.781 -3.49e+06 2.63e+06\n",
"datafile_2['DISCOUNT2_ACTUAL_PRICE_x'] 1.413e+06 2.4e+06 0.589 0.557 -3.34e+06 6.17e+06\n",
"datafile_2['BASE2_ACTUAL_PRICE_HP_C'] -1.276e+06 1.25e+06 -1.021 0.310 -3.76e+06 1.2e+06\n",
"datafile_2['BASE2_ACTUAL_PRICE_HP_T'] -1.511e+06 1.25e+06 -1.205 0.231 -4e+06 9.78e+05\n",
"datafile_2['DISCOUNT2_ACTUAL_PRICE_y'] -1.382e+05 2.35e+06 -0.059 0.953 -4.8e+06 4.52e+06\n",
"datafile_2['LAST_DISCOUNT'] -9.067e+04 6.29e+04 -1.441 0.153 -2.16e+05 3.43e+04\n",
"datafile_2['VOLUME_LAG1'] 4.9e+04 3.65e+04 1.344 0.182 -2.34e+04 1.21e+05\n",
"datafile_2['VOLUME_LAG2'] 4.9e+04 3.65e+04 1.344 0.182 -2.34e+04 1.21e+05\n",
"datafile_2['VOLUME_LAG3'] 7399.6246 1.44e+04 0.513 0.609 -2.12e+04 3.6e+04\n",
"datafile_2['VOLUME_AVGLAG2'] -8.321e+04 5.3e+04 -1.569 0.120 -1.88e+05 2.21e+04\n",
"datafile_2['VOLUME_AVGLAG3'] -2.22e+04 4.32e+04 -0.513 0.609 -1.08e+05 6.36e+04\n",
"datafile_2['VOLUMES_LAST_PROM'] 0.0122 0.049 0.248 0.805 -0.085 0.110\n",
"datafile_2['AVG_VOL_LAST_PROM'] 0.2808 0.192 1.466 0.146 -0.099 0.661\n",
"datafile_2['Price_Diff_1'] -1.865e+05 1.77e+06 -0.105 0.916 -3.7e+06 3.33e+06\n",
"datafile_2['Price_Diff_2'] -2.451e+06 1.78e+06 -1.374 0.173 -5.99e+06 1.09e+06\n",
"datafile_2['Price_Diff_3'] -5.87e+06 3.51e+06 -1.675 0.097 -1.28e+07 1.09e+06\n",
"datafile_2['Price_Diff_4'] -1.198e+06 1.52e+06 -0.787 0.433 -4.22e+06 1.82e+06\n",
"datafile_2['Price_Diff_5'] -5.521e+05 1.65e+06 -0.335 0.738 -3.82e+06 2.72e+06\n",
"datafile_2['Price_Diff_6'] 1.154e+07 8.75e+06 1.318 0.191 -5.84e+06 2.89e+07\n",
"datafile_2['BASE2_COMPETITOR1_PRICE'] 4.064e+06 3.45e+06 1.177 0.242 -2.79e+06 1.09e+07\n",
"datafile_2['BASE2_COMPETITOR2_PRICE'] -1.735e+06 2.15e+06 -0.806 0.422 -6.01e+06 2.54e+06\n",
"datafile_2['BASE2_COMPETITOR3_PRICE'] -8.604e+06 6.51e+06 -1.322 0.189 -2.15e+07 4.31e+06\n",
"datafile_2['BASE2_COMPETITOR4_PRICE'] 9.049e+05 9.03e+05 1.003 0.319 -8.87e+05 2.7e+06\n",
"datafile_2['BASE2_COMPETITOR5_PRICE'] 2.085e+06 1.57e+06 1.326 0.188 -1.04e+06 5.21e+06\n",
"datafile_2['BASE2_COMPETITOR6_PRICE'] 2.607e+07 2.01e+07 1.298 0.197 -1.38e+07 6.59e+07\n",
"datafile_2['BASE2_COMPETITOR7_PRICE'] -2.132e+07 6.7e+06 -3.184 0.002 -3.46e+07 -8.03e+06\n",
"datafile_2['DISCOUNT2_COMPETITOR1_PRICE'] -2.585e+06 2.54e+06 -1.018 0.311 -7.62e+06 2.45e+06\n",
"datafile_2['DISCOUNT2_COMPETITOR2_PRICE'] 1.238e+06 1.49e+06 0.829 0.409 -1.73e+06 4.2e+06\n",
"datafile_2['DISCOUNT2_COMPETITOR3_PRICE'] 1.177e+07 8.62e+06 1.366 0.175 -5.33e+06 2.89e+07\n",
"datafile_2['DISCOUNT2_COMPETITOR4_PRICE'] -6.016e+05 5.34e+05 -1.126 0.263 -1.66e+06 4.59e+05\n",
"datafile_2['DISCOUNT2_COMPETITOR5_PRICE'] -1.795e+06 1.39e+06 -1.291 0.200 -4.55e+06 9.65e+05\n",
"datafile_2['DISCOUNT2_COMPETITOR6_PRICE'] -1.575e+07 1.24e+07 -1.267 0.208 -4.04e+07 8.93e+06\n",
"datafile_2['DISCOUNT2_COMPETITOR7_PRICE'] 1.439e+07 5e+06 2.880 0.005 4.47e+06 2.43e+07\n",
"datafile_2['BASE1_COMPETITOR1_PRICE'] -9.588e+05 2.75e+06 -0.349 0.728 -6.41e+06 4.49e+06\n",
"datafile_2['BASE1_COMPETITOR2_PRICE'] 4.903e+04 1.06e+05 0.461 0.645 -1.62e+05 2.6e+05\n",
"datafile_2['BASE1_COMPETITOR3_PRICE'] 8.829e+04 3.3e+04 2.672 0.009 2.27e+04 1.54e+05\n",
"datafile_2['BASE1_COMPETITOR4_PRICE'] -3.09e+04 5.2e+04 -0.594 0.554 -1.34e+05 7.23e+04\n",
"datafile_2['BASE1_COMPETITOR5_PRICE'] -4237.3285 2.51e+04 -0.169 0.866 -5.4e+04 4.55e+04\n",
"datafile_2['BASE1_COMPETITOR6_PRICE'] 2.594e+05 1.45e+05 1.789 0.077 -2.85e+04 5.47e+05\n",
"datafile_2['BASE1_COMPETITOR7_PRICE'] -4.554e+05 1.22e+05 -3.719 0.000 -6.99e+05 -2.12e+05\n",
"datafile_2['COMPETITOR1_PRICE'] -3.03e+06 1.76e+06 -1.723 0.088 -6.52e+06 4.62e+05\n",
"datafile_2['COMPETITOR2_PRICE'] -7.655e+05 1.81e+06 -0.422 0.674 -4.36e+06 2.83e+06\n",
"datafile_2['COMPETITOR3_PRICE'] 2.653e+06 3.6e+06 0.737 0.463 -4.49e+06 9.8e+06\n",
"datafile_2['COMPETITOR4_PRICE'] -2.019e+06 1.53e+06 -1.321 0.190 -5.05e+06 1.02e+06\n",
"datafile_2['COMPETITOR5_PRICE'] -2.665e+06 1.67e+06 -1.598 0.113 -5.97e+06 6.45e+05\n",
"datafile_2['COMPETITOR6_PRICE'] -1.475e+07 1.15e+07 -1.287 0.201 -3.75e+07 8e+06\n",
"datafile_2['COMPETITOR7_PRICE'] 2.178e+07 6.76e+06 3.222 0.002 8.36e+06 3.52e+07\n",
"datafile_2['DISCOUNT1_COMPETITOR1_PRICE'] 7.716e+05 2.07e+06 0.372 0.711 -3.34e+06 4.89e+06\n",
"datafile_2['DISCOUNT1_COMPETITOR2_PRICE'] -1.533e+05 3.23e+05 -0.475 0.636 -7.94e+05 4.87e+05\n",
"datafile_2['DISCOUNT1_COMPETITOR3_PRICE'] -6.458e+05 3.02e+05 -2.139 0.035 -1.25e+06 -4.64e+04\n",
"datafile_2['DISCOUNT1_COMPETITOR4_PRICE'] 1.697e+05 1.46e+05 1.161 0.249 -1.21e+05 4.6e+05\n",
"datafile_2['DISCOUNT1_COMPETITOR5_PRICE'] 6.236e+04 2.05e+05 0.305 0.761 -3.44e+05 4.69e+05\n",
"datafile_2['DISCOUNT1_COMPETITOR6_PRICE'] 9.821e+04 5.72e+05 0.172 0.864 -1.04e+06 1.23e+06\n",
"datafile_2['DISCOUNT1_COMPETITOR7_PRICE'] 1.896e+06 5.41e+05 3.505 0.001 8.22e+05 2.97e+06\n",
"==============================================================================\n",
"Omnibus: 4.827 Durbin-Watson: 1.692\n",
"Prob(Omnibus): 0.089 Jarque-Bera (JB): 6.597\n",
"Skew: -0.065 Prob(JB): 0.0369\n",
"Kurtosis: 4.033 Cond. No. 1.03e+16\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The smallest eigenvalue is 9.91e-20. This might indicate that there are\n",
"strong multicollinearity problems or that the design matrix is singular.\n"
]
}
],
"source": [
"#Copy File\n",
"datafile_2 = datafile.copy()\n",
"\n",
"#Define list of no-log\n",
"nolog_list = [\"Week from promotion start\"] +['Week ' + str(p) for p in range(1, 8)] + [\"TIME_LAST_PROM\"]\n",
"nolog_list = nolog_list +[\"time_from_prev2\"]+[\"time_from_prev_\"+p for p in [\"A\",\"B\",\"C\",\"D\",\"E\"]]\n",
"nolog_list = nolog_list +['Promotion ' + p + ' dummy' for p in [\"A\", \"B\", \"C\", \"D\", \"E\"]]\n",
"nolog_list = nolog_list +[\"TYPE_COMPETITOR\" + str(p) + \"_PRICE_3\" for p in range(1,8)]\n",
"nolog_list = nolog_list +[\"Price_log_Diff_\" + str(p) for p in range(1,7)]\n",
"\n",
"formula = \"datafile_2['VOLUME_OF_SALES'] ~ \"\n",
"#log_list = list(fimp.index)\n",
"\n",
"log_list = []\n",
"for i in model_list:\n",
" if i not in ['ACTUAL_PRICE'] + nolog_list:\n",
" log_list = log_list + [i]\n",
"\n",
"k = 0\n",
"for i in log_list:\n",
" if k == 0:\n",
" formula = formula + \" datafile_2['\" + i + \"']\"\n",
" else:\n",
" formula = formula + \" + datafile_2['\" + i + \"']\"\n",
" \n",
" k = k + 1\n",
" #if i not in nolog_list:\n",
" # datafile_2[i] = np.log(datafile_2[i].replace(0, np.nan))\n",
" # datafile_2[i] = datafile_2[i].replace(np.nan, 0)\n",
"\n",
"#Log volume\n",
"#datafile_2[\"VOLUME_OF_SALES\"] = np.log(datafile_2[\"VOLUME_OF_SALES\"]) \n",
" \n",
"model1 = smf.ols(formula=formula, data=datafile_2).fit()\n",
"print(model1.summary())\n"
]
},
{
"cell_type": "code",
"execution_count": 315,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"=======================================================================================\n",
"Dep. Variable: datafile['VOLUME_OF_SALES'] R-squared: 0.796\n",
"Model: OLS Adj. R-squared: 0.784\n",
"Method: Least Squares F-statistic: 66.63\n",
"Date: Sun, 11 Feb 2018 Prob (F-statistic): 1.78e-43\n",
"Time: 10:59:24 Log-Likelihood: -1650.6\n",
"No. Observations: 146 AIC: 3319.\n",
"Df Residuals: 137 BIC: 3346.\n",
"Df Model: 8 \n",
"Covariance Type: nonrobust \n",
"========================================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------------------\n",
"Intercept 2.771e+05 3.02e+04 9.166 0.000 2.17e+05 3.37e+05\n",
"datafile['BASE2_ACTUAL_PRICE_y'] -1.425e+05 2.74e+04 -5.197 0.000 -1.97e+05 -8.83e+04\n",
"datafile['DISCOUNT2_ACTUAL_PRICE_y'] 2.661e+05 2.47e+04 10.793 0.000 2.17e+05 3.15e+05\n",
"datafile['LAST_DISCOUNT'] -1.167e+05 4.25e+04 -2.744 0.007 -2.01e+05 -3.26e+04\n",
"datafile['VOLUME_LAG1'] -0.2884 0.049 -5.900 0.000 -0.385 -0.192\n",
"datafile['Week 1'] 2.713e+04 5678.470 4.778 0.000 1.59e+04 3.84e+04\n",
"datafile['Promotion A dummy'] 2.225e+04 6502.369 3.422 0.001 9394.642 3.51e+04\n",
"datafile['TIME_LAST_PROM'] -5894.4111 1537.557 -3.834 0.000 -8934.824 -2853.999\n",
"datafile['Price_log_Diff_3'] -8.09e+04 1.46e+04 -5.547 0.000 -1.1e+05 -5.21e+04\n",
"==============================================================================\n",
"Omnibus: 3.461 Durbin-Watson: 1.603\n",
"Prob(Omnibus): 0.177 Jarque-Bera (JB): 3.928\n",
"Skew: 0.055 Prob(JB): 0.140\n",
"Kurtosis: 3.796 Cond. No. 2.59e+06\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 2.59e+06. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n"
]
}
],
"source": [
"formula = \"datafile['VOLUME_OF_SALES'] ~ \"\n",
"#for i in list(fimp.index):\n",
"# formula = formula + \" + datafile['\" + i + \"']\"\n",
"remove_list = ['BASE2_COMPETITOR6_PRICE','Price_log_Diff_5','DISCOUNT2_COMPETITOR5_PRICE','DISCOUNT2_COMPETITOR5_PRICE','BASE2_COMPETITOR1_PRICE','DISCOUNT2_COMPETITOR4_PRICE','VOLUMES_LAST_PROM','Price_log_Diff_4','Price_log_Diff_1','BASE2_COMPETITOR3_PRICE','DISCOUNT2_COMPETITOR3_PRICE','BASE2_COMPETITOR7_PRICE','BASE2_COMPETITOR5_PRICE','DISCOUNT2_COMPETITOR2_PRICE','BASE2_COMPETITOR4_PRICE','DISCOUNT2_COMPETITOR1_PRICE','DISCOUNT2_COMPETITOR6_PRICE','DISCOUNT2_COMPETITOR7_PRICE','BASE2_COMPETITOR2_PRICE','VOLUME_AVGLAG2','VOLUME_LAG2','BASE2_ACTUAL_PRICE_HP_T','Price_Diff_6','COMPETITOR5_PRICE','COMPETITOR7_PRICE','time_from_prev_A','time_from_prev_C','time_from_prev_D','AVG_VOL_LAST_PROM','TYPE_COMPETITOR5_PRICE_3','TYPE_COMPETITOR1_PRICE_3','BASE1_COMPETITOR6_PRICE','BASE1_COMPETITOR7_PRICE','Week 3','Price_Diff_2','Price_Diff_6''Week 3','BASE1_COMPETITOR3_PRICE','DISCOUNT1_COMPETITOR7_PRICE','DISCOUNT1_COMPETITOR3_PRICE','DISCOUNT1_COMPETITOR2_PRICE','Price_Diff_1','DISCOUNT1_COMPETITOR4_PRICE','Price_Diff_3','BASE2_ACTUAL_PRICE_PROM','COMPETITOR4_PRICE','COMPETITOR6_PRICE','Week 7','time_from_prev_B','TYPE_COMPETITOR7_PRICE_3','VOLUME_LAG3','Promotion E dummy','VOLUME_AVGLAG3','BASE2_ACTUAL_PRICE_HP_C','BASE1_COMPETITOR4_PRICE','Price_log_Diff_2','Week from promotion start','TYPE_COMPETITOR3_PRICE_3','Price_Diff_5','COMPETITOR3_PRICE','COMPETITOR2_PRICE','ACTUAL_PRICE','Promotion B dummy','TYPE_COMPETITOR6_PRICE_3','DISCOUNT1_COMPETITOR5_PRICE','TYPE_COMPETITOR4_PRICE_3','TYPE_COMPETITOR6_PRICE_3''DISCOUNT1_COMPETITOR5_PRICE','BASE1_COMPETITOR5_PRICE','Week 4','Price_Diff_4','Week 2','BASE1_COMPETITOR2_PRICE','BASE1_COMPETITOR1_PRICE','DISCOUNT1_COMPETITOR1_PRICE', 'Week 6','DISCOUNT2_ACTUAL_PRICE_x','COMPETITOR1_PRICE','Week 5','Price_log_Diff_6','DISCOUNT1_COMPETITOR6_PRICE', 'Competitor3_Price_Impact','Promotion C dummy', 'Competitor1_Price_Impact', 'TYPE_COMPETITOR2_PRICE_3', 'Promotion D dummy','time_from_prev_E','time_from_prev2']\n",
"\n",
"k = 0\n",
"corr_list = []\n",
"for i in model_list:\n",
" if i not in remove_list:\n",
" corr_list = corr_list + [i]\n",
" if k == 0:\n",
" formula = formula + \" datafile['\" + i + \"']\"\n",
" else:\n",
" formula = formula + \" + datafile['\" + i + \"']\"\n",
"\n",
" k = k + 1\n",
"corr_list = corr_list + [\"VOLUME_OF_SALES\"]\n",
"\n",
"model = smf.ols(formula=formula, data=datafile).fit()\n",
"print(model.summary())\n"
]
},
{
"cell_type": "code",
"execution_count": 384,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"=======================================================================================\n",
"Dep. Variable: datafile['VOLUME_OF_SALES'] R-squared: 0.810\n",
"Model: OLS Adj. R-squared: 0.796\n",
"Method: Least Squares F-statistic: 57.45\n",
"Date: Sun, 11 Feb 2018 Prob (F-statistic): 9.92e-44\n",
"Time: 12:15:35 Log-Likelihood: -1645.4\n",
"No. Observations: 146 AIC: 3313.\n",
"Df Residuals: 135 BIC: 3346.\n",
"Df Model: 10 \n",
"Covariance Type: nonrobust \n",
"========================================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------------------\n",
"Intercept 2.722e+05 2.96e+04 9.195 0.000 2.14e+05 3.31e+05\n",
"datafile['BASE2_ACTUAL_PRICE_y'] -1.386e+05 2.7e+04 -5.129 0.000 -1.92e+05 -8.52e+04\n",
"datafile['DISCOUNT2_ACTUAL_PRICE_y'] 2.568e+05 2.42e+04 10.628 0.000 2.09e+05 3.05e+05\n",
"datafile['LAST_DISCOUNT'] -1.34e+05 4.41e+04 -3.035 0.003 -2.21e+05 -4.67e+04\n",
"datafile['VOLUME_LAG1'] -0.2788 0.048 -5.836 0.000 -0.373 -0.184\n",
"datafile['Week 1'] 2.654e+04 5523.595 4.804 0.000 1.56e+04 3.75e+04\n",
"datafile['Promotion A dummy'] 1.983e+04 6420.318 3.088 0.002 7129.573 3.25e+04\n",
"datafile['TIME_LAST_PROM'] -6055.9625 1495.528 -4.049 0.000 -9013.656 -3098.269\n",
"datafile['Price_Relative_Diff__3'] -1.122e+05 1.83e+04 -6.137 0.000 -1.48e+05 -7.6e+04\n",
"datafile['COMP_1_only'] 7895.9173 4439.175 1.779 0.078 -883.405 1.67e+04\n",
"datafile['Holiday'] 2.323e+04 8632.588 2.691 0.008 6161.366 4.03e+04\n",
"==============================================================================\n",
"Omnibus: 1.845 Durbin-Watson: 1.662\n",
"Prob(Omnibus): 0.398 Jarque-Bera (JB): 1.508\n",
"Skew: -0.056 Prob(JB): 0.470\n",
"Kurtosis: 3.485 Cond. No. 2.77e+06\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 2.77e+06. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n"
]
}
],
"source": [
"\n",
"for i in range(2,3):\n",
" formula = \"datafile['VOLUME_OF_SALES'] ~ \"\n",
" model_list2 = ['BASE2_ACTUAL_PRICE_y','DISCOUNT2_ACTUAL_PRICE_y','LAST_DISCOUNT','VOLUME_LAG1','Week 1','Promotion A dummy','TIME_LAST_PROM']\n",
" model_list2 = model_list2 + [\"Price_Relative_Diff__3\"]\n",
" model_list2 = model_list2 + [\"COMP_1_only\", \"Holiday\"]\n",
"\n",
" \n",
" k = 0\n",
" corr_list = []\n",
" for i in model_list2:\n",
" if k == 0:\n",
" formula = formula + \" datafile['\" + i + \"']\"\n",
" else:\n",
" formula = formula + \" + datafile['\" + i + \"']\"\n",
"\n",
" k = k + 1\n",
" corr_list = corr_list + [\"VOLUME_OF_SALES\"]\n",
"\n",
" model = smf.ols(formula=formula, data=datafile).fit()\n",
" print(model.summary())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}