From 8f771f8be3f9bd254d0f89470af253f2464ceea8 Mon Sep 17 00:00:00 2001 From: Caitlin Date: Tue, 3 May 2016 08:55:44 -0400 Subject: [PATCH] Add completed lab --- .../Week4-Lab-1.4-checkpoint.ipynb | 413 ++++++++++++++++++ .../week4-1.4-starter-checkpoint.ipynb} | 90 ++-- 1.4-lab/code/starter-code/Week4-Lab-1.4.ipynb | 413 ++++++++++++++++++ .../code/starter-code/week4-1.4-starter.ipynb | 22 +- 4 files changed, 894 insertions(+), 44 deletions(-) create mode 100644 1.4-lab/code/starter-code/.ipynb_checkpoints/Week4-Lab-1.4-checkpoint.ipynb rename 1.4-lab/code/starter-code/{week4-1.4-starter-cancer.ipynb => .ipynb_checkpoints/week4-1.4-starter-checkpoint.ipynb} (52%) create mode 100644 1.4-lab/code/starter-code/Week4-Lab-1.4.ipynb diff --git a/1.4-lab/code/starter-code/.ipynb_checkpoints/Week4-Lab-1.4-checkpoint.ipynb b/1.4-lab/code/starter-code/.ipynb_checkpoints/Week4-Lab-1.4-checkpoint.ipynb new file mode 100644 index 0000000..75bbe90 --- /dev/null +++ b/1.4-lab/code/starter-code/.ipynb_checkpoints/Week4-Lab-1.4-checkpoint.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import Python Libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.cross_validation import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read in Wisconsin Breast Cancer Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data\", header=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split Dataset Into Data Features and Target" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", + " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],\n", + " dtype='int64')\n", + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 \n", + "\n", + " 9 ... 22 23 24 25 26 27 28 \\\n", + "0 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 \n", + "1 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 \n", + "2 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 \n", + "3 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 \n", + "4 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 \n", + "\n", + " 29 30 31 \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + "\n", + "[5 rows x 32 columns]\n", + " 0 1 2 3 4 5 6 7 8 \\\n", + "564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 \n", + "566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 \n", + "568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 \n", + "\n", + " 9 ... 22 23 24 25 26 27 \\\n", + "564 0.13890 ... 25.450 26.40 166.10 2027.0 0.14100 0.21130 \n", + "565 0.09791 ... 23.690 38.25 155.00 1731.0 0.11660 0.19220 \n", + "566 0.05302 ... 18.980 34.12 126.70 1124.0 0.11390 0.30940 \n", + "567 0.15200 ... 
25.740 39.42 184.60 1821.0 0.16500 0.86810 \n", + "568 0.00000 ... 9.456 30.37 59.16 268.6 0.08996 0.06444 \n", + "\n", + " 28 29 30 31 \n", + "564 0.4107 0.2216 0.2060 0.07115 \n", + "565 0.3215 0.1628 0.2572 0.06637 \n", + "566 0.3403 0.1418 0.2218 0.07820 \n", + "567 0.9387 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.0000 0.2871 0.07039 \n", + "\n", + "[5 rows x 32 columns]\n" + ] + } + ], + "source": [ + "print df.columns\n", + "print df.head()\n", + "print df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We have no column headers, but by referencing the dataset documentation at https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names we can see that column 1 should be our target variable: Benign vs Malignant. We'll need to replace these with 0 for benign and 1 for malignant, since kNN requires numeric values. Also from the linked documentation we can see that column 0 is subject ID, which we do not want to include in features. So, our features will be columns 2 through 31; that is, we'll drop columns 0 and 1."
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(569,)\n", + "(569, 30)\n" + ] + } + ], + "source": [ + "# set target and features\n", + "target = df[1].replace(\"M\", 1).replace(\"B\", 0)\n", + "features = df.drop([0,1], axis = 1)\n", + "\n", + "# check shapes\n", + "print target.shape\n", + "print features.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(455, 30)\n", + "(455,)\n", + "(114, 30)\n", + "(114,)\n" + ] + } + ], + "source": [ + "# split data into training and testing\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n", + "\n", + "print X_train.shape\n", + "print y_train.shape\n", + "print X_test.shape\n", + "print y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", + " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", + " weights='uniform')" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Instantiate kNN model with k = 5, fit to our train data\n", + "knn = KNeighborsClassifier(n_neighbors = 5)\n", + "knn.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0\n", + " 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0\n", + " 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1\n", + " 1 1 0]\n", + "[[ 1. 0. 
]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.6 0.4]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.2 0.8]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.8 0.2]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.6 0.4]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.4 0.6]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. 
]\n", + " [ 0.4 0.6]\n", + " [ 0.2 0.8]\n", + " [ 1. 0. ]]\n" + ] + } + ], + "source": [ + "# Make predictions for test data\n", + "predictions = knn.predict(X_test)\n", + "print predictions\n", + "predictionProbs = knn.predict_proba(X_test)\n", + "print predictionProbs" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1., 0.]])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/1.4-lab/code/starter-code/week4-1.4-starter-cancer.ipynb b/1.4-lab/code/starter-code/.ipynb_checkpoints/week4-1.4-starter-checkpoint.ipynb similarity index 52% rename from 1.4-lab/code/starter-code/week4-1.4-starter-cancer.ipynb rename to 1.4-lab/code/starter-code/.ipynb_checkpoints/week4-1.4-starter-checkpoint.ipynb index 3c8269b..a4fb291 100644 --- a/1.4-lab/code/starter-code/week4-1.4-starter-cancer.ipynb +++ b/1.4-lab/code/starter-code/.ipynb_checkpoints/week4-1.4-starter-checkpoint.ipynb @@ -4,18 +4,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Import Python Libraries\n" + "Use 
SciKit-Learn kNN Classifier Library" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "import pandas as pd\n", "from sklearn.neighbors import KNeighborsClassifier" ] }, @@ -23,76 +22,101 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Read in Wisconsin Breast Cancer Dataset" + "Set X and y to some data" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data\", header=None)" + "X = [[0], [1], [2], [3]]\n", + "y = [0, 0, 1, 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Split Dataset Into Data Features and Target" + "Tell Python that we will use kNN Classifier with a k value of 3" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], - "source": [] + "source": [ + "neigh = KNeighborsClassifier(n_neighbors=3)\n" + ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "collapsed": true + }, "source": [ - "Build a logit model and fit" + "Fit the kNN Classifier to the X & y data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", + " metric_params=None, n_jobs=1, n_neighbors=3, p=2,\n", + " weights='uniform')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Prepare test data and predict" + "neigh.fit(X, y) " ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { - "collapsed": false + "collapsed":
true }, - "outputs": [], - "source": [] + "source": [ + "Predict and Print Results" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0]\n", + "[[ 0.66666667 0.33333333]]\n" + ] + } + ], + "source": [ + "print(neigh.predict([[1.1]]))\n", + "print(neigh.predict_proba([[0.9]]))" + ] }, { "cell_type": "code", @@ -106,21 +130,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.11" } }, "nbformat": 4, diff --git a/1.4-lab/code/starter-code/Week4-Lab-1.4.ipynb b/1.4-lab/code/starter-code/Week4-Lab-1.4.ipynb new file mode 100644 index 0000000..75bbe90 --- /dev/null +++ b/1.4-lab/code/starter-code/Week4-Lab-1.4.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import Python Libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.cross_validation import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read in Wisconsin Breast Cancer Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data\", 
header=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split Dataset Into Data Features and Target" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", + " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],\n", + " dtype='int64')\n", + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 \n", + "1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 \n", + "2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 \n", + "3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 \n", + "4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 \n", + "\n", + " 9 ... 22 23 24 25 26 27 28 \\\n", + "0 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 \n", + "1 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 \n", + "2 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 \n", + "3 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 \n", + "4 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 \n", + "\n", + " 29 30 31 \n", + "0 0.2654 0.4601 0.11890 \n", + "1 0.1860 0.2750 0.08902 \n", + "2 0.2430 0.3613 0.08758 \n", + "3 0.2575 0.6638 0.17300 \n", + "4 0.1625 0.2364 0.07678 \n", + "\n", + "[5 rows x 32 columns]\n", + " 0 1 2 3 4 5 6 7 8 \\\n", + "564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 \n", + "565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 \n", + "566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 \n", + "567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 \n", + "568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 \n", + "\n", + " 9 ... 22 23 24 25 26 27 \\\n", + "564 0.13890 ... 25.450 26.40 166.10 2027.0 0.14100 0.21130 \n", + "565 0.09791 ... 
23.690 38.25 155.00 1731.0 0.11660 0.19220 \n", + "566 0.05302 ... 18.980 34.12 126.70 1124.0 0.11390 0.30940 \n", + "567 0.15200 ... 25.740 39.42 184.60 1821.0 0.16500 0.86810 \n", + "568 0.00000 ... 9.456 30.37 59.16 268.6 0.08996 0.06444 \n", + "\n", + " 28 29 30 31 \n", + "564 0.4107 0.2216 0.2060 0.07115 \n", + "565 0.3215 0.1628 0.2572 0.06637 \n", + "566 0.3403 0.1418 0.2218 0.07820 \n", + "567 0.9387 0.2650 0.4087 0.12400 \n", + "568 0.0000 0.0000 0.2871 0.07039 \n", + "\n", + "[5 rows x 32 columns]\n" + ] + } + ], + "source": [ + "print df.columns\n", + "print df.head()\n", + "print df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We have no column headers, but by referencing the dataset documentation at https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names we can see that column 1 should be our target variable: Benign vs Malignant. We'll need to replace these with 0 for benign and 1 for malignant, since kNN requires numeric values. Also from the linked documentation we can see that column 0 is subject ID, which we do not want to include in features. So, our features will be columns 2 through 31; that is, we'll drop columns 0 and 1."
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(569,)\n", + "(569, 30)\n" + ] + } + ], + "source": [ + "# set target and features\n", + "target = df[1].replace(\"M\", 1).replace(\"B\", 0)\n", + "features = df.drop([0,1], axis = 1)\n", + "\n", + "# check shapes\n", + "print target.shape\n", + "print features.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(455, 30)\n", + "(455,)\n", + "(114, 30)\n", + "(114,)\n" + ] + } + ], + "source": [ + "# split data into training and testing\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n", + "\n", + "print X_train.shape\n", + "print y_train.shape\n", + "print X_test.shape\n", + "print y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", + " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", + " weights='uniform')" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Instantiate kNN model with k = 5, fit to our train data\n", + "knn = KNeighborsClassifier(n_neighbors = 5)\n", + "knn.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0\n", + " 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0\n", + " 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1\n", + " 1 1 0]\n", + "[[ 1. 0. 
]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.6 0.4]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.2 0.8]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.8 0.2]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0.6 0.4]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.4 0.6]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 1. 0. ]\n", + " [ 0.8 0.2]\n", + " [ 1. 0. ]\n", + " [ 0. 1. ]\n", + " [ 0. 1. 
]\n", + " [ 0.4 0.6]\n", + " [ 0.2 0.8]\n", + " [ 1. 0. ]]\n" + ] + } + ], + "source": [ + "# Make predictions for test data\n", + "predictions = knn.predict(X_test)\n", + "print predictions\n", + "predictionProbs = knn.predict_proba(X_test)\n", + "print predictionProbs" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1., 0.]])" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/1.4-lab/code/starter-code/week4-1.4-starter.ipynb b/1.4-lab/code/starter-code/week4-1.4-starter.ipynb index 9020d3b..b70ec24 100644 --- a/1.4-lab/code/starter-code/week4-1.4-starter.ipynb +++ b/1.4-lab/code/starter-code/week4-1.4-starter.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { 
"collapsed": true }, @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -79,7 +79,7 @@ " weights='uniform')" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -130,21 +130,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.11" } }, "nbformat": 4,