From 4c2f955d3025df4a11eac93e7c9df5163ceb3ee8 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 18 Jul 2018 16:19:53 -0400 Subject: [PATCH 1/3] initial ipynb file --- .gitignore | 1 + balance.ipynb | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 .gitignore create mode 100644 balance.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..763513e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints diff --git a/balance.ipynb b/balance.ipynb new file mode 100644 index 0000000..d11baa0 --- /dev/null +++ b/balance.ipynb @@ -0,0 +1,48 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import r2_score\n", + "from pandas.plotting import scatter_matrix\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 313af76701b6824fa4f900003ed863cf271a9162 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 18 Jul 2018 16:32:47 -0400 Subject: [PATCH 2/3] initial ipynb file - rene - dev --- balance2.ipynb | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 balance2.ipynb diff --git a/balance2.ipynb b/balance2.ipynb new file mode 100644 index 0000000..8acbe5e --- 
/dev/null +++ b/balance2.ipynb @@ -0,0 +1,48 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import r2_score\n", + "from pandas.plotting import scatter_matrix\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 17ce39ffade1102722654aa5f7408555454d98f0 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 18 Jul 2018 17:42:49 -0400 Subject: [PATCH 3/3] final ipynb file - rene - dev --- balance2.ipynb | 689 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 688 insertions(+), 1 deletion(-) diff --git a/balance2.ipynb b/balance2.ipynb index 8acbe5e..fdeb522 100644 --- a/balance2.ipynb +++ b/balance2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -16,6 +16,693 @@ "%matplotlib inline" ] }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   Income   Limit  Rating  Cards  Age  Education  Gender  Student  Married  Ethnicity  Balance
0  14.891   3606   283     2      34   11         Male    No       Yes      Caucasian  333
1  106.025  6645   483     3      82   15         Female  Yes      Yes      Asian      903
2  104.593  7075   514     4      71   11         Male    No       No       Asian      580
3  148.924  9504   681     3      36   11         Female  No       No       Asian      964
4  55.882   4897   357     2      68   16         Male    No       Yes      Caucasian  331
\n", + "
" + ], + "text/plain": [ + " Income Limit Rating Cards Age Education Gender Student Married \\\n", + "0 14.891 3606 283 2 34 11 Male No Yes \n", + "1 106.025 6645 483 3 82 15 Female Yes Yes \n", + "2 104.593 7075 514 4 71 11 Male No No \n", + "3 148.924 9504 681 3 36 11 Female No No \n", + "4 55.882 4897 357 2 68 16 Male No Yes \n", + "\n", + " Ethnicity Balance \n", + "0 Caucasian 333 \n", + "1 Asian 903 \n", + "2 Asian 580 \n", + "3 Asian 964 \n", + "4 Caucasian 331 " + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('balance.txt')\n", + "df = df.drop(columns=['Unnamed: 0'])\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Education -0.008062\n", + "Age 0.001835\n", + "Cards 0.086456\n", + "Income 0.463656\n", + "Limit 0.861697\n", + "Rating 0.863625\n", + "Balance 1.000000\n", + "Name: Balance, dtype: float64" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr()['Balance'].sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 333\n", + "1 903\n", + "2 580\n", + "3 964\n", + "4 331\n", + "Name: Balance, dtype: int64" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = df.Balance\n", + "y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   Rating  Age  Education  Student
0  283     34   11         No
1  483     82   15         Yes
2  514     71   11         No
3  681     36   11         No
4  357     68   16         No
\n", + "
" + ], + "text/plain": [ + " Rating Age Education Student\n", + "0 283 34 11 No\n", + "1 483 82 15 Yes\n", + "2 514 71 11 No\n", + "3 681 36 11 No\n", + "4 357 68 16 No" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = df.drop(columns=['Balance'])\n", + "X = X.drop(columns=['Limit'])\n", + "X = X.drop(columns=['Income'])\n", + "X = X.drop(columns=['Married'])\n", + "X = X.drop(columns=['Ethnicity'])\n", + "X = X.drop(columns=['Gender'])\n", + "X = X.drop(columns=['Cards'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   Rating  Age  Education  Student_No  Student_Yes
0  283     34   11         1           0
1  483     82   15         0           1
2  514     71   11         1           0
3  681     36   11         1           0
4  357     68   16         1           0
\n", + "
" + ], + "text/plain": [ + " Rating Age Education Student_No Student_Yes\n", + "0 283 34 11 1 0\n", + "1 483 82 15 0 1\n", + "2 514 71 11 1 0\n", + "3 681 36 11 1 0\n", + "4 357 68 16 1 0" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = pd.get_dummies(X)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   Rating  Age  Education  Student_Yes
0  283     34   11         0
1  483     82   15         1
2  514     71   11         0
3  681     36   11         0
4  357     68   16         0
\n", + "
" + ], + "text/plain": [ + " Rating Age Education Student_Yes\n", + "0 283 34 11 0\n", + "1 483 82 15 1\n", + "2 514 71 11 0\n", + "3 681 36 11 0\n", + "4 357 68 16 0" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = X.drop(columns=['Student_No'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((268, 4), (132, 4))" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: Balance R-squared: 0.924
Model: OLS Adj. R-squared: 0.923
Method: Least Squares F-statistic: 807.5
Date: Wed, 18 Jul 2018 Prob (F-statistic): 1.06e-146
Time: 17:40:01 Log-Likelihood: -1796.4
No. Observations: 268 AIC: 3601.
Df Residuals: 264 BIC: 3615.
Df Model: 4
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
Rating 2.5388 0.073 34.852 0.000 2.395 2.682
Age -3.8174 0.606 -6.297 0.000 -5.011 -2.624
Education -13.3397 2.693 -4.954 0.000 -18.642 -8.037
Student_Yes 375.7067 41.147 9.131 0.000 294.688 456.726
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 10.420 Durbin-Watson: 1.904
Prob(Omnibus): 0.005 Jarque-Bera (JB): 10.601
Skew: -0.437 Prob(JB): 0.00499
Kurtosis: 3.430 Cond. No. 1.34e+03


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.34e+03. This might indicate that there are
strong multicollinearity or other numerical problems." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Balance R-squared: 0.924\n", + "Model: OLS Adj. R-squared: 0.923\n", + "Method: Least Squares F-statistic: 807.5\n", + "Date: Wed, 18 Jul 2018 Prob (F-statistic): 1.06e-146\n", + "Time: 17:40:01 Log-Likelihood: -1796.4\n", + "No. Observations: 268 AIC: 3601.\n", + "Df Residuals: 264 BIC: 3615.\n", + "Df Model: 4 \n", + "Covariance Type: nonrobust \n", + "===============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "-------------------------------------------------------------------------------\n", + "Rating 2.5388 0.073 34.852 0.000 2.395 2.682\n", + "Age -3.8174 0.606 -6.297 0.000 -5.011 -2.624\n", + "Education -13.3397 2.693 -4.954 0.000 -18.642 -8.037\n", + "Student_Yes 375.7067 41.147 9.131 0.000 294.688 456.726\n", + "==============================================================================\n", + "Omnibus: 10.420 Durbin-Watson: 1.904\n", + "Prob(Omnibus): 0.005 Jarque-Bera (JB): 10.601\n", + "Skew: -0.437 Prob(JB): 0.00499\n", + "Kurtosis: 3.430 Cond. No. 1.34e+03\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The condition number is large, 1.34e+03. 
This might indicate that there are\n", + "strong multicollinearity or other numerical problems.\n", + "\"\"\"" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = sm.OLS(y_train, X_train)\n", + "results = model.fit()\n", + "results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "scatter_matrix(X);" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = results.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7471322577153838" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_test, y_hat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,