1086 lines
89 KiB
Plaintext
1086 lines
89 KiB
Plaintext
|
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 12,
|
||
|
|
"id": "b7ee7a83",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import numpy as np"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "399df849",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# Data Cleaning"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 13,
|
||
|
|
"id": "edda6af5",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
|
"RangeIndex: 131165 entries, 0 to 131164\n",
|
||
|
|
"Data columns (total 12 columns):\n",
|
||
|
|
" # Column Non-Null Count Dtype \n",
|
||
|
|
"--- ------ -------------- ----- \n",
|
||
|
|
" 0 Animal ID 131165 non-null object\n",
|
||
|
|
" 1 Date of Birth 131165 non-null object\n",
|
||
|
|
" 2 Name 93658 non-null object\n",
|
||
|
|
" 3 DateTime 131165 non-null object\n",
|
||
|
|
" 4 MonthYear 131165 non-null object\n",
|
||
|
|
" 5 Outcome Type 131125 non-null object\n",
|
||
|
|
" 6 Outcome Subtype 65810 non-null object\n",
|
||
|
|
" 7 Animal Type 131165 non-null object\n",
|
||
|
|
" 8 Sex upon Outcome 131165 non-null object\n",
|
||
|
|
" 9 Age upon Outcome 131165 non-null object\n",
|
||
|
|
" 10 Breed 131165 non-null object\n",
|
||
|
|
" 11 Color 131165 non-null object\n",
|
||
|
|
"dtypes: object(12)\n",
|
||
|
|
"memory usage: 12.0+ MB\n",
|
||
|
|
"None\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"Unique animal ID's: 121258\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"non-unique id's:\n",
|
||
|
|
" Animal ID Date of Birth Name DateTime MonthYear \\\n",
|
||
|
|
"694 A724077 7/11/15 *Sandy 2016-05-09T00:00:00-05:00 May-16 \n",
|
||
|
|
"1455 A750513 11/28/16 NaN 2017-06-19T00:00:00-05:00 Jun-17 \n",
|
||
|
|
"1544 A755088 9/29/16 Machete 2017-08-19T00:00:00-05:00 Aug-17 \n",
|
||
|
|
"1762 A758320 9/14/07 Sophie 2018-01-31T00:00:00-05:00 Jan-18 \n",
|
||
|
|
"1825 A767543 3/3/16 NaN 2018-04-18T00:00:00-05:00 Apr-18 \n",
|
||
|
|
"\n",
|
||
|
|
" Outcome Type Outcome Subtype Animal Type Sex upon Outcome \\\n",
|
||
|
|
"694 Adoption NaN Dog Spayed Female \n",
|
||
|
|
"1455 Transfer Snr Cat Neutered Male \n",
|
||
|
|
"1544 Adoption NaN Dog Spayed Female \n",
|
||
|
|
"1762 Transfer Partner Cat Spayed Female \n",
|
||
|
|
"1825 Transfer Snr Cat Spayed Female \n",
|
||
|
|
"\n",
|
||
|
|
" Age upon Outcome Breed Color \n",
|
||
|
|
"694 9 months Queensland Heeler Mix Red \n",
|
||
|
|
"1455 6 months Domestic Medium Hair Mix Orange Tabby/White \n",
|
||
|
|
"1544 10 months Australian Kelpie Mix Black/White \n",
|
||
|
|
"1762 10 years Domestic Shorthair Mix Blue Tabby \n",
|
||
|
|
"1825 2 years Domestic Shorthair Mix Black \n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"ID A724077:\n",
|
||
|
|
" Animal ID Date of Birth Name DateTime MonthYear \\\n",
|
||
|
|
"649 A724077 7/11/15 *Sandy 2016-04-24T00:00:00-05:00 Apr-16 \n",
|
||
|
|
"694 A724077 7/11/15 *Sandy 2016-05-09T00:00:00-05:00 May-16 \n",
|
||
|
|
"35974 A724077 7/11/15 *Sandy 2016-06-12T16:20:00 Jun-16 \n",
|
||
|
|
"\n",
|
||
|
|
" Outcome Type Outcome Subtype Animal Type Sex upon Outcome \\\n",
|
||
|
|
"649 Adoption NaN Dog Spayed Female \n",
|
||
|
|
"694 Adoption NaN Dog Spayed Female \n",
|
||
|
|
"35974 Adoption NaN Dog Spayed Female \n",
|
||
|
|
"\n",
|
||
|
|
" Age upon Outcome Breed Color \n",
|
||
|
|
"649 9 months Queensland Heeler Mix Red \n",
|
||
|
|
"694 9 months Queensland Heeler Mix Red \n",
|
||
|
|
"35974 11 months Queensland Heeler Mix Red \n",
|
||
|
|
" 0 1 2 3 4 5 6 7\n",
|
||
|
|
"0 2014 -06 -10 00 00 00 -05 00\n",
|
||
|
|
"1 2014 -07 -11 00 00 00 -05 00\n",
|
||
|
|
"2 2014 -07 -12 00 00 00 -05 00\n",
|
||
|
|
"3 2014 -07 -14 00 00 00 -05 00\n",
|
||
|
|
"4 2014 -08 -20 00 00 00 -05 00\n",
|
||
|
|
" Date of Birth Name DateTime MonthYear Outcome Type \\\n",
|
||
|
|
"2052 7/23/13 *Dudley 2013-10-01T09:31:00 Oct-13 Adoption \n",
|
||
|
|
"2053 9/24/13 NaN 2013-10-01T10:39:00 Oct-13 Transfer \n",
|
||
|
|
"2054 9/24/13 NaN 2013-10-01T10:44:00 Oct-13 Transfer \n",
|
||
|
|
"2055 9/22/10 NaN 2013-10-01T11:12:00 Oct-13 Transfer \n",
|
||
|
|
"2056 9/25/11 NaN 2013-10-01T11:13:00 Oct-13 Transfer \n",
|
||
|
|
"... ... ... ... ... ... \n",
|
||
|
|
"129096 1/31/12 NaN 2014-02-04T10:14:00 Feb-14 Transfer \n",
|
||
|
|
"129097 2/4/09 Toby 2014-02-08T10:14:00 Feb-14 Transfer \n",
|
||
|
|
"129098 7/6/13 NaN 2014-02-07T10:14:00 Feb-14 Transfer \n",
|
||
|
|
"129099 2/16/14 NaN 2014-03-27T08:00:00 Mar-14 Transfer \n",
|
||
|
|
"129100 5/10/11 Gatsby 2014-05-14T08:00:00 May-14 Transfer \n",
|
||
|
|
"\n",
|
||
|
|
" Outcome Subtype Animal Type Sex upon Outcome Age upon Outcome \\\n",
|
||
|
|
"2052 Foster Dog Neutered Male 2 months \n",
|
||
|
|
"2053 Partner Cat Unknown 1 week \n",
|
||
|
|
"2054 Partner Cat Unknown 1 week \n",
|
||
|
|
"2055 Partner Dog Neutered Male 3 years \n",
|
||
|
|
"2056 Partner Dog Spayed Female 2 years \n",
|
||
|
|
"... ... ... ... ... \n",
|
||
|
|
"129096 SCRP Cat Intact Male 2 years \n",
|
||
|
|
"129097 Partner Dog Intact Male 5 years \n",
|
||
|
|
"129098 SCRP Cat Spayed Female 7 months \n",
|
||
|
|
"129099 Partner Dog Intact Male 1 month \n",
|
||
|
|
"129100 Partner Dog Intact Male 3 years \n",
|
||
|
|
"\n",
|
||
|
|
" Breed Color Year Month Hour Zone \n",
|
||
|
|
"2052 Labrador Retriever Mix Black 2013 10 9 100 \n",
|
||
|
|
"2053 Domestic Shorthair Mix Orange/White 2013 10 10 100 \n",
|
||
|
|
"2054 Domestic Shorthair Mix Orange/White 2013 10 10 100 \n",
|
||
|
|
"2055 Toy Poodle Mix White 2013 10 11 100 \n",
|
||
|
|
"2056 Boxer Mix Red/White 2013 10 11 100 \n",
|
||
|
|
"... ... ... ... ... ... ... \n",
|
||
|
|
"129096 Domestic Shorthair Mix Brown Tabby 2014 2 10 100 \n",
|
||
|
|
"129097 Pekingese Red/White 2014 2 10 100 \n",
|
||
|
|
"129098 Domestic Medium Hair Mix Brown Tabby 2014 2 10 100 \n",
|
||
|
|
"129099 Pomeranian Mix White/Brown 2014 3 8 100 \n",
|
||
|
|
"129100 Pekingese/Lhasa Apso Buff 2014 5 8 100 \n",
|
||
|
|
"\n",
|
||
|
|
"[125886 rows x 15 columns]\n",
|
||
|
|
"0\n",
|
||
|
|
" 0 1\n",
|
||
|
|
"0 2 weeks\n",
|
||
|
|
"1 2 months\n",
|
||
|
|
"2 4 weeks\n",
|
||
|
|
"3 2 weeks\n",
|
||
|
|
"4 5 months\n",
|
||
|
|
"0 7\n",
|
||
|
|
"1 30\n",
|
||
|
|
"2 7\n",
|
||
|
|
"3 7\n",
|
||
|
|
"4 30\n",
|
||
|
|
" ... \n",
|
||
|
|
"129096 365\n",
|
||
|
|
"129097 365\n",
|
||
|
|
"129098 30\n",
|
||
|
|
"129099 30\n",
|
||
|
|
"129100 365\n",
|
||
|
|
"Name: 1, Length: 129101, dtype: int64\n",
|
||
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
|
"RangeIndex: 129101 entries, 0 to 129100\n",
|
||
|
|
"Data columns (total 13 columns):\n",
|
||
|
|
" # Column Non-Null Count Dtype \n",
|
||
|
|
"--- ------ -------------- ----- \n",
|
||
|
|
" 0 Outcome Type 129061 non-null object \n",
|
||
|
|
" 1 Animal Type 129101 non-null object \n",
|
||
|
|
" 2 Sex upon Outcome 129101 non-null object \n",
|
||
|
|
" 3 AgeInDays 129101 non-null int64 \n",
|
||
|
|
" 4 Breed 129101 non-null object \n",
|
||
|
|
" 5 Color 129101 non-null object \n",
|
||
|
|
" 6 Year 129101 non-null int16 \n",
|
||
|
|
" 7 Month 129101 non-null float64\n",
|
||
|
|
" 8 Hour 129101 non-null int8 \n",
|
||
|
|
" 9 HourCosine 129101 non-null float64\n",
|
||
|
|
" 10 BirthMonth 129101 non-null int8 \n",
|
||
|
|
" 11 BirthYear 129101 non-null int8 \n",
|
||
|
|
" 12 HasName 129101 non-null bool \n",
|
||
|
|
"dtypes: bool(1), float64(2), int16(1), int64(1), int8(3), object(5)\n",
|
||
|
|
"memory usage: 8.6+ MB\n",
|
||
|
|
"None\n",
|
||
|
|
" Outcome Type Animal Type Sex upon Outcome AgeInDays \\\n",
|
||
|
|
"89032 NaN Dog Spayed Female 365 \n",
|
||
|
|
"89034 NaN Dog Spayed Female 300 \n",
|
||
|
|
"89143 NaN Dog Neutered Male 2190 \n",
|
||
|
|
"89375 NaN Dog Spayed Female 180 \n",
|
||
|
|
"89379 NaN Dog Neutered Male 120 \n",
|
||
|
|
"89491 NaN Dog Neutered Male 365 \n",
|
||
|
|
"89503 NaN Cat Spayed Female 730 \n",
|
||
|
|
"89619 NaN Dog Neutered Male 60 \n",
|
||
|
|
"89636 NaN Dog Spayed Female 1825 \n",
|
||
|
|
"89661 NaN Dog Neutered Male 1460 \n",
|
||
|
|
"89804 NaN Dog Intact Male 365 \n",
|
||
|
|
"89821 NaN Dog Unknown 30 \n",
|
||
|
|
"89936 NaN Dog Intact Male 365 \n",
|
||
|
|
"92129 NaN Cat Intact Male 30 \n",
|
||
|
|
"92298 NaN Dog Intact Male 730 \n",
|
||
|
|
"93308 NaN Cat Spayed Female 730 \n",
|
||
|
|
"94607 NaN Dog Intact Male 730 \n",
|
||
|
|
"108019 NaN Cat Spayed Female 4745 \n",
|
||
|
|
"108706 NaN Bird Unknown 365 \n",
|
||
|
|
"108884 NaN Dog Unknown 365 \n",
|
||
|
|
"111465 NaN Dog Intact Male 730 \n",
|
||
|
|
"111621 NaN Cat Neutered Male 1095 \n",
|
||
|
|
"111732 NaN Dog Neutered Male 730 \n",
|
||
|
|
"116268 NaN Cat Spayed Female 90 \n",
|
||
|
|
"116396 NaN Cat Spayed Female 90 \n",
|
||
|
|
"116522 NaN Dog Spayed Female 730 \n",
|
||
|
|
"119515 NaN Dog Unknown 1825 \n",
|
||
|
|
"120797 NaN Dog Intact Male 330 \n",
|
||
|
|
"120829 NaN Dog Intact Female 730 \n",
|
||
|
|
"122114 NaN Dog Neutered Male 300 \n",
|
||
|
|
"122224 NaN Dog Spayed Female 365 \n",
|
||
|
|
"122267 NaN Cat Spayed Female 90 \n",
|
||
|
|
"123937 NaN Dog Spayed Female 730 \n",
|
||
|
|
"123938 NaN Dog Spayed Female 730 \n",
|
||
|
|
"124535 NaN Cat Neutered Male 60 \n",
|
||
|
|
"127205 NaN Dog Intact Male 300 \n",
|
||
|
|
"128002 NaN Dog Spayed Female 365 \n",
|
||
|
|
"128308 NaN Dog Neutered Male 3285 \n",
|
||
|
|
"128476 NaN Dog Intact Female 270 \n",
|
||
|
|
"128542 NaN Dog Spayed Female 1825 \n",
|
||
|
|
"\n",
|
||
|
|
" Breed Color Year Month \\\n",
|
||
|
|
"89032 American Bulldog Mix White/Black 2021 8.660254e-01 \n",
|
||
|
|
"89034 German Shepherd Tan/Black 2021 8.660254e-01 \n",
|
||
|
|
"89143 Pit Bull Mix Blue 2021 8.660254e-01 \n",
|
||
|
|
"89375 Labrador Retriever/Border Collie Black/White 2021 8.660254e-01 \n",
|
||
|
|
"89379 American Bulldog Mix Black/White 2021 8.660254e-01 \n",
|
||
|
|
"89491 Blue Lacy Mix Blue/White 2021 5.000000e-01 \n",
|
||
|
|
"89503 Domestic Shorthair Black 2021 5.000000e-01 \n",
|
||
|
|
"89619 Pit Bull Blue/White 2021 5.000000e-01 \n",
|
||
|
|
"89636 American Pit Bull Terrier Mix White/Brown 2021 5.000000e-01 \n",
|
||
|
|
"89661 Labrador Retriever Mix Yellow/White 2021 5.000000e-01 \n",
|
||
|
|
"89804 Chihuahua Shorthair Mix Tan 2021 6.123234e-17 \n",
|
||
|
|
"89821 Pug Mix Black/White 2021 6.123234e-17 \n",
|
||
|
|
"89936 Pit Bull Black 2021 6.123234e-17 \n",
|
||
|
|
"92129 Domestic Medium Hair Black Smoke 2021 -1.000000e+00 \n",
|
||
|
|
"92298 Pug Mix Black/White 2021 -1.000000e+00 \n",
|
||
|
|
"93308 Domestic Shorthair Tortie 2021 -8.660254e-01 \n",
|
||
|
|
"94607 German Shepherd Black/Tan 2021 -5.000000e-01 \n",
|
||
|
|
"108019 Domestic Shorthair Tortie 2023 8.660254e-01 \n",
|
||
|
|
"108706 Waxwing Tan/Black 2023 5.000000e-01 \n",
|
||
|
|
"108884 Pekingese Black/White 2023 5.000000e-01 \n",
|
||
|
|
"111465 Doberman Pinsch Mix Black/Brown 2023 -1.000000e+00 \n",
|
||
|
|
"111621 Domestic Shorthair Cream Tabby 2023 -1.000000e+00 \n",
|
||
|
|
"111732 Mastiff Mix Tan 2023 -1.000000e+00 \n",
|
||
|
|
"116268 Domestic Shorthair Black/White 2023 1.000000e+00 \n",
|
||
|
|
"116396 Siamese Mix Lynx Point 2023 1.000000e+00 \n",
|
||
|
|
"116522 Cairn Terrier/Chihuahua Shorthair White 2023 1.000000e+00 \n",
|
||
|
|
"119515 American Bulldog Mix Brown 2024 -5.000000e-01 \n",
|
||
|
|
"120797 Pit Bull Blue/White 2024 -1.000000e+00 \n",
|
||
|
|
"120829 Siberian Husky White 2024 -1.000000e+00 \n",
|
||
|
|
"122114 Wire Hair Fox Terrier Mix Brown 2024 -8.660254e-01 \n",
|
||
|
|
"122224 Siberian Husky Brown/White 2024 -8.660254e-01 \n",
|
||
|
|
"122267 Domestic Shorthair Torbie/White 2024 -8.660254e-01 \n",
|
||
|
|
"123937 Belgian Malinois Brown/Black 2024 -1.836970e-16 \n",
|
||
|
|
"123938 German Shepherd Brown/Black 2024 -1.836970e-16 \n",
|
||
|
|
"124535 Domestic Medium Hair Black 2024 5.000000e-01 \n",
|
||
|
|
"127205 West Highland/Yorkshire Terrier White/Cream 2025 5.000000e-01 \n",
|
||
|
|
"128002 Pit Bull Mix Black/White 2025 6.123234e-17 \n",
|
||
|
|
"128308 German Shepherd Brown/Black 2025 6.123234e-17 \n",
|
||
|
|
"128476 Siberian Husky Mix Black/Cream 2025 -5.000000e-01 \n",
|
||
|
|
"128542 Pit Bull Mix Fawn/White 2025 -5.000000e-01 \n",
|
||
|
|
"\n",
|
||
|
|
" Hour HourCosine BirthMonth BirthYear HasName \n",
|
||
|
|
"89032 17 -2.588190e-01 6 19 True \n",
|
||
|
|
"89034 17 -2.588190e-01 2 20 False \n",
|
||
|
|
"89143 18 -1.836970e-16 4 14 True \n",
|
||
|
|
"89375 14 -8.660254e-01 7 20 True \n",
|
||
|
|
"89379 16 -5.000000e-01 9 20 True \n",
|
||
|
|
"89491 18 -1.836970e-16 1 20 True \n",
|
||
|
|
"89503 15 -7.071068e-01 1 19 False \n",
|
||
|
|
"89619 12 -1.000000e+00 12 20 True \n",
|
||
|
|
"89636 10 -8.660254e-01 10 15 True \n",
|
||
|
|
"89661 12 -1.000000e+00 10 16 True \n",
|
||
|
|
"89804 13 -9.659258e-01 3 20 True \n",
|
||
|
|
"89821 14 -8.660254e-01 1 21 False \n",
|
||
|
|
"89936 13 -9.659258e-01 12 19 True \n",
|
||
|
|
"92129 16 -5.000000e-01 4 21 False \n",
|
||
|
|
"92298 11 -9.659258e-01 6 19 True \n",
|
||
|
|
"93308 18 -1.836970e-16 6 19 True \n",
|
||
|
|
"94607 18 -1.836970e-16 8 19 False \n",
|
||
|
|
"108019 12 -1.000000e+00 5 9 True \n",
|
||
|
|
"108706 9 -7.071068e-01 2 22 False \n",
|
||
|
|
"108884 15 -7.071068e-01 2 22 True \n",
|
||
|
|
"111465 19 2.588190e-01 6 21 True \n",
|
||
|
|
"111621 10 -8.660254e-01 7 19 True \n",
|
||
|
|
"111732 18 -1.836970e-16 6 21 True \n",
|
||
|
|
"116268 14 -8.660254e-01 9 23 True \n",
|
||
|
|
"116396 16 -5.000000e-01 9 23 False \n",
|
||
|
|
"116522 17 -2.588190e-01 12 21 False \n",
|
||
|
|
"119515 10 -8.660254e-01 4 19 False \n",
|
||
|
|
"120797 10 -8.660254e-01 7 23 True \n",
|
||
|
|
"120829 16 -5.000000e-01 5 22 True \n",
|
||
|
|
"122114 12 -1.000000e+00 9 23 True \n",
|
||
|
|
"122224 17 -2.588190e-01 6 23 True \n",
|
||
|
|
"122267 18 -1.836970e-16 4 24 True \n",
|
||
|
|
"123937 10 -8.660254e-01 9 22 True \n",
|
||
|
|
"123938 10 -8.660254e-01 9 22 True \n",
|
||
|
|
"124535 18 -1.836970e-16 8 24 False \n",
|
||
|
|
"127205 11 -9.659258e-01 4 24 False \n",
|
||
|
|
"128002 16 -5.000000e-01 12 23 True \n",
|
||
|
|
"128308 16 -5.000000e-01 2 16 True \n",
|
||
|
|
"128476 16 -5.000000e-01 6 24 True \n",
|
||
|
|
"128542 18 -1.836970e-16 12 19 True \n",
|
||
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
|
"Index: 129061 entries, 0 to 129100\n",
|
||
|
|
"Data columns (total 13 columns):\n",
|
||
|
|
" # Column Non-Null Count Dtype \n",
|
||
|
|
"--- ------ -------------- ----- \n",
|
||
|
|
" 0 Outcome Type 129061 non-null object \n",
|
||
|
|
" 1 Animal Type 129061 non-null object \n",
|
||
|
|
" 2 Sex upon Outcome 129061 non-null object \n",
|
||
|
|
" 3 AgeInDays 129061 non-null int64 \n",
|
||
|
|
" 4 Breed 129061 non-null object \n",
|
||
|
|
" 5 Color 129061 non-null object \n",
|
||
|
|
" 6 Year 129061 non-null int16 \n",
|
||
|
|
" 7 Month 129061 non-null float64\n",
|
||
|
|
" 8 Hour 129061 non-null int8 \n",
|
||
|
|
" 9 HourCosine 129061 non-null float64\n",
|
||
|
|
" 10 BirthMonth 129061 non-null int8 \n",
|
||
|
|
" 11 BirthYear 129061 non-null int8 \n",
|
||
|
|
" 12 HasName 129061 non-null bool \n",
|
||
|
|
"dtypes: bool(1), float64(2), int16(1), int64(1), int8(3), object(5)\n",
|
||
|
|
"memory usage: 9.6+ MB\n",
|
||
|
|
"None\n",
|
||
|
|
" Outcome Type Animal Type Sex upon Outcome AgeInDays Breed \\\n",
|
||
|
|
"0 Transfer Bird Unknown 14 Duck \n",
|
||
|
|
"1 Adoption Dog Spayed Female 60 Black Mouth Cur Mix \n",
|
||
|
|
"2 Transfer Bird Unknown 28 Catbird Mix \n",
|
||
|
|
"3 Transfer Bird Unknown 14 Grackle Mix \n",
|
||
|
|
"4 Transfer Bird Intact Male 150 Silkie Mix \n",
|
||
|
|
"\n",
|
||
|
|
" Color Year Month Hour HourCosine BirthMonth BirthYear \\\n",
|
||
|
|
"0 Yellow/Black 2014 -1.000000 0 1.0 5 14 \n",
|
||
|
|
"1 Brown/Black 2014 -0.866025 0 1.0 4 14 \n",
|
||
|
|
"2 Brown 2014 -0.866025 0 1.0 6 14 \n",
|
||
|
|
"3 Black/White 2014 -0.866025 0 1.0 6 14 \n",
|
||
|
|
"4 Black 2014 -0.500000 0 1.0 3 14 \n",
|
||
|
|
"\n",
|
||
|
|
" HasName \n",
|
||
|
|
"0 False \n",
|
||
|
|
"1 True \n",
|
||
|
|
"2 False \n",
|
||
|
|
"3 False \n",
|
||
|
|
"4 False \n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"data = pd.read_csv('project1.csv')\n",
|
||
|
|
"print(f\"{data.info()}\\n\\n\")\n",
|
||
|
|
"# We can see that there are 12 columns, with 131165 entries each. Shape is (131165, 12)\n",
|
||
|
|
"\n",
|
||
|
|
"# It only makes sense to do univariate analysis after cleaning up the data, since we can't really do that on strings and such.\n",
|
||
|
|
"# Therefore, I will be doing cleanup before visualization.\n",
|
||
|
|
"\n",
|
||
|
|
"# Datatypes of Animal ID, Date of Birth, DateTime, MonthYear, and Age upon outcome would need to be converted to numeric formats.\n",
|
||
|
|
"# However, let's see if Animal ID is unique for every animal...\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"\\nUnique animal ID's: {len(data['Animal ID'].unique())}\")\n",
|
||
|
|
"# Not all id's are unique? Which ones are same?\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"\\n\\nnon-unique id's:\")\n",
|
||
|
|
"newdf = data[data['Animal ID'].duplicated()]\n",
|
||
|
|
"print(newdf.head())\n",
|
||
|
|
"\n",
|
||
|
|
"# Let's choose one\n",
|
||
|
|
"print(\"\\n\\nID A724077:\")\n",
|
||
|
|
"print(data[data['Animal ID'] == \"A724077\"].head())\n",
|
||
|
|
"# looks like the same dog gets adopted multiple times? We can maybe have each animal get a count of how many times they've been adopted.\n",
|
||
|
|
"# However, our model will not be trained using the outcome, so this variable is not significant. We don't know whether the animal was transferred or adopted\n",
|
||
|
|
"# we should therefore drop the Animal ID column.\n",
|
||
|
|
"data.drop(['Animal ID'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# remove duplicates\n",
|
||
|
|
"data.drop_duplicates(inplace=True, ignore_index=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# MonthYear and DateTime are redundant - let's just make outcome month, year, date, time columns\n",
|
||
|
|
"# The following was informed by AI but not directly generated (see [4] and [5])\n",
|
||
|
|
"splitDateTime = pd.DataFrame(data['DateTime'].str.findall(r'(\\-*\\d+)').tolist(), index=data.index)\n",
|
||
|
|
"print(splitDateTime.head())\n",
|
||
|
|
"splitDateTime[6] = splitDateTime[6].transform(lambda x: x.fillna(100)) # arbitrary time zone to avoid conflict with any other time zone\n",
|
||
|
|
"\n",
|
||
|
|
"# seems like most times are 00:00:00...\n",
|
||
|
|
"data['Year'] = splitDateTime[0].astype('int16').abs()\n",
|
||
|
|
"data['Month'] = splitDateTime[1].astype('int8').abs()\n",
|
||
|
|
"data['Hour'] = splitDateTime[3].astype('int8').abs()\n",
|
||
|
|
"data['Zone'] = splitDateTime[6].astype('int8')\n",
|
||
|
|
"\n",
|
||
|
|
"print(data[(data['Hour'] != 0)])\n",
|
||
|
|
"\n",
|
||
|
|
"# there are indeed nonzero times. Let's keep the hour.\n",
|
||
|
|
"# let's check if the time zone is different for any of them...\n",
|
||
|
|
"print(len(data[(data['Zone'] != 100) & (data['Zone'] != -5)]))\n",
|
||
|
|
"# since the time zones are either UTC-5 or not present, we can drop the time zone column.\n",
|
||
|
|
"data.drop(['Zone'], axis=1, inplace=True)\n",
|
||
|
|
"# we can now also drop DateTime and MonthYear, since this data is represented in the Year, Month, and Hour columns.\n",
|
||
|
|
"data.drop(['DateTime', 'MonthYear'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# Saw online that representing hour data as a sinusoid is more effective, since it captures the cyclical nature of the day.\n",
|
||
|
|
"data['HourCosine'] = data['Hour'].transform(lambda x: np.cos(np.pi * x / 12))\n",
|
||
|
|
"\n",
|
||
|
|
"data['Month'] = data['Month'].transform(lambda x: np.cos(np.pi * x / 6))\n",
|
||
|
|
"\n",
|
||
|
|
"# Now, since our outcome subtype cannot be used for inference due to its nature as part of the outcome, \n",
|
||
|
|
"# and since we are only predicting the outcome type, we can drop Outcome Subtype.\n",
|
||
|
|
"data.drop(['Outcome Subtype'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# Let's handle the DOB now.\n",
|
||
|
|
"splitDOB = pd.DataFrame(data['Date of Birth'].str.findall(r'(\\d+)').tolist(), index=data.index)\n",
|
||
|
|
"data['BirthMonth'] = splitDOB[0].astype('int8')\n",
|
||
|
|
"data['BirthYear'] = splitDOB[2].astype('int8')\n",
|
||
|
|
"data.drop(['Date of Birth'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# let's try to convert Age to an age in days.\n",
|
||
|
|
"# The function below was informed by AI, but not directly generated; See [1], [2], [3].\n",
|
||
|
|
"def convertAge(series: pd.Series):\n",
|
||
|
|
" outDF = pd.DataFrame(series.str.split(expand=True))\n",
|
||
|
|
" print(outDF.head())\n",
|
||
|
|
" custom_map = {\n",
|
||
|
|
" 'w': 7,\n",
|
||
|
|
" 'd': 1,\n",
|
||
|
|
" 'y': 365,\n",
|
||
|
|
" 'm': 30\n",
|
||
|
|
" }\n",
|
||
|
|
" print(outDF[1].str[0].map(custom_map))\n",
|
||
|
|
" outSeries = outDF[0].astype(int) * outDF[1].str[0].map(custom_map)\n",
|
||
|
|
" return outSeries\n",
|
||
|
|
"\n",
|
||
|
|
"data['Age upon Outcome'] = data['Age upon Outcome'].transform(convertAge)\n",
|
||
|
|
"data.rename(columns={'Age upon Outcome': 'AgeInDays'}, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"# Let's change the name column to a boolean representing whether or not a name is present.\n",
|
||
|
|
"data['HasName'] = data['Name'].notna()\n",
|
||
|
|
"data.drop(['Name'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"print(data.info())\n",
|
||
|
|
"\n",
|
||
|
|
"# It seems we still have null outcome types... Let's see what those are.\n",
|
||
|
|
"print(data[data['Outcome Type'].isna()])\n",
|
||
|
|
"# It only makes sense to keep data that will help us predict the outcome, and without the outcome the data is useless\n",
|
||
|
|
"# we will therefore drop the rows without outcome type.\n",
|
||
|
|
"data = data[data['Outcome Type'].notna()]\n",
|
||
|
|
"\n",
|
||
|
|
"print(data.info())\n",
|
||
|
|
"print(data.head())"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "0d381aa2",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"['Transfer' 'Adoption']\n",
|
||
|
|
"['Bird' 'Dog' 'Cat' 'Livestock']\n",
|
||
|
|
"['Unknown' 'Spayed Female' 'Intact Male' 'Intact Female' 'Neutered Male']\n",
|
||
|
|
"2526\n",
|
||
|
|
"599\n",
|
||
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
|
|
"Index: 129061 entries, 0 to 129100\n",
|
||
|
|
"Data columns (total 16 columns):\n",
|
||
|
|
" # Column Non-Null Count Dtype \n",
|
||
|
|
"--- ------ -------------- ----- \n",
|
||
|
|
" 0 AgeInDays 129061 non-null int64 \n",
|
||
|
|
" 1 Year 129061 non-null int16 \n",
|
||
|
|
" 2 Month 129061 non-null float64\n",
|
||
|
|
" 3 Hour 129061 non-null int8 \n",
|
||
|
|
" 4 HourCosine 129061 non-null float64\n",
|
||
|
|
" 5 BirthMonth 129061 non-null int8 \n",
|
||
|
|
" 6 BirthYear 129061 non-null int8 \n",
|
||
|
|
" 7 HasName 129061 non-null bool \n",
|
||
|
|
" 8 Type_Cat 129061 non-null bool \n",
|
||
|
|
" 9 Type_Dog 129061 non-null bool \n",
|
||
|
|
" 10 Type_Livestock 129061 non-null bool \n",
|
||
|
|
" 11 Sex_Intact Male 129061 non-null bool \n",
|
||
|
|
" 12 Sex_Neutered Male 129061 non-null bool \n",
|
||
|
|
" 13 Sex_Spayed Female 129061 non-null bool \n",
|
||
|
|
" 14 Sex_Unknown 129061 non-null bool \n",
|
||
|
|
" 15 Outcome_Adoption 129061 non-null bool \n",
|
||
|
|
"dtypes: bool(9), float64(2), int16(1), int64(1), int8(3)\n",
|
||
|
|
"memory usage: 5.7 MB\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Let's now take a look at categorizing the outcome type, animal type, sex, breed, and color data.\n",
|
||
|
|
"print(data['Outcome Type'].unique())\n",
|
||
|
|
"print(data['Animal Type'].unique())\n",
|
||
|
|
"print(data['Sex upon Outcome'].unique())\n",
|
||
|
|
"breeds = data['Breed'].unique()\n",
|
||
|
|
"print(len(breeds))\n",
|
||
|
|
"# breeds are too sparse to deal with, and they are dropped in section two of the project. Let's drop them now.\n",
|
||
|
|
"data.drop(['Breed'], axis=1, inplace=True)\n",
|
||
|
|
"\n",
|
||
|
|
"colors = data['Color'].unique()\n",
|
||
|
|
"print(len(colors))\n",
|
||
|
|
"# colors are also looking very sparse, let's drop them for now and come back to them if needed.\n",
|
||
|
|
"data.drop(['Color'], axis=1, inplace=True)\n",
|
||
|
|
"data.rename(columns={'Outcome Type': 'Outcome', 'Animal Type': 'Type', 'Sex upon Outcome': 'Sex'}, inplace=True)\n",
|
||
|
|
"data = pd.get_dummies(data, columns=['Type', 'Sex'], drop_first=True)\n",
|
||
|
|
"data = pd.get_dummies(data, columns=['Outcome'], drop_first=False)\n",
|
||
|
|
"data.drop(['Outcome_Transfer'], axis=1, inplace=True)\n",
|
||
|
|
"# we can drop the first column since a false in all the other possibilities would signify a true in the first.\n",
|
||
|
|
"\n",
|
||
|
|
"data.info()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "47862576",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# Univariate Analysis"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "c7ae7aef",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"\n",
|
||
|
|
"#### We can see certain trends from the summary statistics below, such as that there seems to be a greater number of very young animals than older animals, since the median age is 270 days while the mean is 583 days.\n",
|
||
|
|
"\n",
|
||
|
|
"\n",
|
||
|
|
"#### Additionally, we see that the adoption or transfer data spans 2013 through 2025, centering around 2018. However, birth year is stored as a two-digit value, so we see the year 99, which signifies 1999, alongside values such as 14 for 2014. This discontinuity might interfere with the linear classification algorithm, and since we already have age data and adoption date data, we should be good to drop this too.\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 15,
|
||
|
|
"id": "ca5e9c28",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
" AgeInDays Year Month Hour \\\n",
|
||
|
|
"count 129061.000000 129061.000000 1.290610e+05 129061.000000 \n",
|
||
|
|
"mean 583.200851 2018.596803 -3.878811e-02 14.328744 \n",
|
||
|
|
"std 880.694863 3.318159 7.180569e-01 3.695877 \n",
|
||
|
|
"min 1.000000 2013.000000 -1.000000e+00 0.000000 \n",
|
||
|
|
"25% 60.000000 2016.000000 -8.660254e-01 12.000000 \n",
|
||
|
|
"50% 270.000000 2018.000000 -1.836970e-16 15.000000 \n",
|
||
|
|
"75% 730.000000 2021.000000 5.000000e-01 17.000000 \n",
|
||
|
|
"max 10950.000000 2025.000000 1.000000e+00 23.000000 \n",
|
||
|
|
"\n",
|
||
|
|
" HourCosine BirthMonth BirthYear \n",
|
||
|
|
"count 129061.000000 129061.000000 129061.000000 \n",
|
||
|
|
"mean -0.503063 6.244295 16.987231 \n",
|
||
|
|
"std 0.454937 3.203606 4.505264 \n",
|
||
|
|
"min -1.000000 1.000000 0.000000 \n",
|
||
|
|
"25% -0.866025 4.000000 14.000000 \n",
|
||
|
|
"50% -0.500000 6.000000 17.000000 \n",
|
||
|
|
"75% -0.258819 9.000000 20.000000 \n",
|
||
|
|
"max 1.000000 12.000000 99.000000 \n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"print(data.describe())\n",
|
||
|
|
"\n",
|
||
|
|
"data.drop(['BirthYear'], axis=1, inplace=True)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "73c09b78",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"#### We can see from the following histogram that an overwhelming number of animals at the shelter are under a year in age (each bin is roughly 2000 days / 7 bins ~= 300 days wide).\n",
|
||
|
|
"#### This is probably a significant factor in which animals actually get adopted, considering that adopters might be more likely to choose younger or older animals."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 16,
|
||
|
|
"id": "daecf558",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"<Axes: xlabel='AgeInDays', ylabel='Count'>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 16,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAO0tJREFUeJzt3Xt0VPW9//9XyGUIMRlCYjJEwsVjjGDAYrAQsAULJFgC9dKiBkepCChCjMJRkfZI/ZWgIpfTolQ5FFSgsadIDxWMCV6okavRVAKI9hQlYEIQJpOAcRKTz+8Pj/vrEAgbDEwCz8daey1nf94z+70/C+G1PrP3niBjjBEAAACa1S7QDQAAALQFhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgQ0igGzifNDY26vPPP1dkZKSCgoIC3Q4AALDBGKOamholJCSoXbuTrycRmlrQ559/rsTExEC3AQAAzkBZWZm6dOly0nFCUwuKjIyU9M2kR0VFBbgbAABgR3V1tRITE61/x0+G0NSCvv1KLioqitAEAEAbc6pLa7gQHAAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYEBLoBmBPbW2tfD6frVqHw6Hw8PCz3BEAABcWQlMbUFtbq8Ru3XX4UKWt+piL41T22acEJwAAWhChqQ3w+Xw6fKhSmbmrFRp+UbO19bVH9eqjN8vn8xGaAABoQYSmNiQ0/CKFdYgMdBsAAFyQuBAcAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbCA0AQAA2EBoAgAAsIHQBAAAYAOhCQAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwIeCh6cCBA7r99tsVExOjDh066Ac/+IGKi4utcWOMZs2apYSEBIWHh2vIkCHauXOn32f4fD5NnTpVsbGxioiI0OjRo7V//36/Go/HI7fbLafTKafTKbfbraqqKr+affv2adSoUYqIiFBsbKyys7NVV1d31s4dAAC0HQENTR6PR4MGDVJoaKhee+017dq1S/PmzVPHjh2tmqeeekrz58/XokWLtH37drlcLg0fPlw1NTVWTU5OjtasWaO8vDwVFRXp6NGjyszMVENDg1WTlZWlkpIS5efnKz8/XyUlJXK73dZ4Q0ODRo4cqWPHjqmoqEh5eXlavXq1pk2bdk7mAgAAtG5BxhgTqIM/8sgjevfdd/XOO++ccNwYo4SEBOXk5Ojhhx+W9M2qUnx8vJ588klNmjRJXq9XF198sV566SXdcsstkqTPP/9ciYmJWr9+vTIyMrR792716tVLW7ZsUf/+/SVJW7ZsUVpamj766CMlJyfrtddeU2ZmpsrKypSQkCBJysvL07hx41RZWamoqKhTnk91dbWcTqe8Xq+teruqqqoUHR2tGxe8rrAOkc3W1n1ZozUPZMjj8fiFTwAAcGJ2//0O6ErT2rVr1a9fP/3iF79QXFyc+vbtqyVLlljje/fuVUVFhdLT0619DodDgwcP1qZNmyRJxcXFqq+v96tJSEhQSkqKVbN582Y5nU4rMEnSgAED5HQ6/WpSUlKswCRJGRkZ8vl8fl8XfpfP51N1dbXfBgAAzk8BDU3/+te/tHjxYiUlJen111/XPffco+zsbL344ouSpIqKCklSfHy83/vi4+OtsYqKCoWFhSk6OrrZmri4uCbHj4uL86s5/jjR0dEKCwuzao43Z84c6xopp9OpxMTE050CAADQRgQ0NDU2Nurqq69Wbm6u+vbtq0mTJmnChAlavHixX11QUJDfa2NMk33HO77mRPVnUvNdM2bMkNfrtbaysrJmew
IAAG1XQENT586d1atXL799PXv21L59+yRJLpdLkpqs9FRWVlqrQi6XS3V1dfJ4PM3WHDx4sMnxDx065Fdz/HE8Ho/q6+ubrEB9y+FwKCoqym8DAADnp4CGpkGDBmnPnj1++z7++GN169ZNktSjRw+5XC4VFhZa43V1ddq4caMGDhwoSUpNTVVoaKhfTXl5uUpLS62atLQ0eb1ebdu2zarZunWrvF6vX01paanKy8utmoKCAjkcDqWmprbwmQMAgLYmJJAHf+CBBzRw4EDl5uZqzJgx2rZtm55//nk9//zzkr75uiwnJ0e5ublKSkpSUlKScnNz1aFDB2VlZUmSnE6nxo8fr2nTpikmJkadOnXS9OnT1bt3bw0bNkzSN6tXI0aM0IQJE/Tcc89JkiZOnKjMzEwlJydLktLT09WrVy+53W7NnTtXR44c0fTp0zVhwgRWkAAAQGBD0zXXXKM1a9ZoxowZevzxx9WjRw8tXLhQY8eOtWoeeugh1dbWavLkyfJ4POrfv78KCgoUGfn/br1fsGCBQkJCNGbMGNXW1mro0KFavny5goODrZqVK1cqOzvbustu9OjRWrRokTUeHBysdevWafLkyRo0aJDCw8OVlZWlp59++hzMBAAAaO0C+pym8w3PaQIAoO1pE89pAgAAaCsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgA6EJAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbCA0AQAA2EBoAgAAsIHQBAAAYAOhCQAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgA6EJAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbAhoaJo1a5aCgoL8NpfLZY0bYzRr1iwlJCQoPDxcQ4YM0c6dO/0+w+fzaerUqYqNjVVERIRGjx6t/fv3+9V4PB653W45nU45nU653W5VVVX51ezbt0+jRo1SRESEYmNjlZ2drbq6urN27gAAoG0J+ErTlVdeqfLycmvbsWOHNfbUU09p/vz5WrRokbZv3y6Xy6Xhw4erpqbGqsnJydGaNWuUl5enoqIiHT16VJmZmWpoaLBqsrKyVFJSovz8fOXn56ukpERut9sab2ho0MiRI3Xs2DEVFRUpLy9Pq1ev1rRp087NJAAAgFYvJOANhIT4rS59yxijhQsXaubMmbrpppskSS+88ILi4+O1atUqTZo0SV6vV0uXLtVLL72kYcOGSZJWrFihxMREbdiwQRkZGdq9e7fy8/O1ZcsW9e/fX5K0ZMkSpaWlac+ePUpOTlZBQYF27dqlsrIyJSQkSJLmzZuncePGafbs2YqKijpHswEAAFqrgK80ffLJJ0pISFCPHj1066236l//+pckae/evaqoqFB6erpV63A4NHjwYG3atEmSVFxcrPr6er+ahIQEpaSkWDWbN2+W0+m0ApMkDRgwQE6n068mJSXFCkySlJGRIZ/Pp+Li4pP27vP5VF1d7bcBAIDzU0BDU//+/fXiiy/q9ddf15IlS1RRUaGBAwfq8OHDqqiokCTFx8f7vSc+Pt4aq6ioUFhYmKKjo5utiYuLa3LsuLg4v5rjjxMdHa2wsDCr5kTmzJljXSfldDqVmJh4mjMAAADaioCGpuuvv14333yzevfurWHDhmndunWSvvka7ltBQUF+7zHGNNl3vONrTlR/JjXHmzFjhrxer7WVlZU12xcAAGi7Av713HdFRESod+/e+uSTT6zrnI5f6amsrLRWhVwul+rq6uTxeJqtOXjwYJNjHTp0yK/m+ON4PB7V19c3WYH6LofDoaioKL8NAACcn1pVaPL5fNq9e7c6d+6sHj16yO
VyqbCw0Bqvq6vTxo0bNXDgQElSamqqQkND/WrKy8tVWlpq1aSlpcnr9Wrbtm1WzdatW+X1ev1qSktLVV5ebtUUFBT
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Let's explore the distribution of animal age more closely.\n",
|
||
|
|
"import seaborn as sns\n",
|
||
|
|
"sns.histplot(data=data['AgeInDays'], bins= 40)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "8b08973f",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"#### We can see in the following histogram that there's considerably more data recorded after noon, so maybe that's interesting for prediction.\n",
|
||
|
|
"#### It's possible that this data will show a higher probability of adoption if the transaction occurs later in the day."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 17,
|
||
|
|
"id": "5521b827",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/plain": [
|
||
|
|
"<Axes: xlabel='Hour', ylabel='Count'>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 17,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAObNJREFUeJzt3X9YlXWe//HXCeGIiCcQ4XA2RKdRVsMaw1bQmbIskAld011tmD2jOw42W8qyyDWN9a2YZtLZTO26dGoct8lSumz2sqzUQTHLxvVntEyhjGszOmKBMAgHsdOBwfv7R9u9HVG8QeAc4Pm4rvu6uO/P+9znfXM6+Opz3/c5NsMwDAEAAKBd1wW6AQAAgN6A0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsGBDoBvqSixcv6tNPP1VkZKRsNlug2wEAABYYhqHz58/L5XLpuuuuPJ9EaOpCn376qRISEgLdBgAA6ITKykrdcMMNVxwnNHWhyMhISV/80ocMGRLgbgAAgBWNjY1KSEgw/x2/EkJTF/rylNyQIUMITQAA9DJXu7SGC8EBAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAUBDU3Lly/XbbfdpsjISMXGxmrmzJk6fvy4X41hGCosLJTL5VJ4eLimTJmio0eP+tX4fD4tXrxYMTExioiI0IwZM3TmzBm/mvr6erndbjkcDjkcDrndbjU0NPjVnD59WtOnT1dERIRiYmKUm5ur5ubmbjl2AMC183q9amhosLR4vd5At4teLqChae/evXrooYd08OBBlZSU6K9//avS09N14cIFs+bpp5/WqlWrtHbtWh05ckROp1P33HOPzp8/b9bk5eXp9ddf1+bNm7Vv3z41NTUpKytLra2tZk12drbKyspUXFys4uJilZWVye12m+Otra269957deHCBe3bt0+bN2/Wli1btGTJkp75ZQAAOsTr9SohcYSioqIsLQmJIwhOuCY2wzCMQDfxpdraWsXGxmrv3r26/fbbZRiGXC6X8vLy9PDDD0v6YlYpLi5O//7v/64HHnhAHo9Hw4YN08aNGzV37lxJ0qeffqqEhATt2LFDGRkZqqio0NixY3Xw4EFNnDhRknTw4EGlpaXpD3/4g5KSkvTb3/5WWVlZqqyslMvlkiRt3rxZ8+fPV01NjYYMGdKmX5/PJ5/PZ643NjYqISFBHo/nsvUAgK7T0NCgqKgoZS3botDwwe3WtnibtO2R2aqvr9f111/fMw2i12hsbJTD4bjqv99BdU2Tx+ORJEVHR0uSTp48qerqaqWnp5s1drtdd9xxh/bv3y9JKi0tVUtLi1+Ny+VScnKyWXPgwAE5HA4zMElSamqqHA6HX01ycrIZmCQpIyNDPp9PpaWll+13+fLl5uk+h8OhhISErvg1AAA6IDR8sMIGRba7XC1UAVYETWgyDEP5+fn65je/qeTkZElSdXW1JCkuLs6vNi4uzhyrrq5WWFiYoqKi2q2JjY1t85yxsbF+NZc+T1RUlMLCwsyaSy1dulQej8dcKisrO3rYAACglxgQ6Aa+tGjRIn344Yfat29fmzGbzea3bhhGm22XurTmcvWdqfkqu90uu93ebh8AAKBvCIqZpsWLF+vNN9/UO++8oxtuuMHc7nQ6JanNTE9NTY05K+R0OtXc3Kz6+vp2a86ePdvmeWtra/1qLn2e+vp6tbS0tJmBAgAA/U9AQ5NhGFq0aJFee+017dmzRyNHjvQbHzlypJxOp0pKSsxtzc3N2rt3ryZNmiRJSklJUWhoqF9NVVWVysvLzZq0tDR5PB4dPnzYrDl06JA8Ho9fTXl5uaqqqsyaXbt2yW63KyUlpesPHgAA9CoBPT330EMP6ZVXXtEbb7yhyMhIc6bH4XAoPD
xcNptNeXl5WrZsmUaNGqVRo0Zp2bJlGjRokLKzs83aBQsWaMmSJRo6dKiio6NVUFCgcePG6e6775YkjRkzRtOmTVNOTo7WrVsnSVq4cKGysrKUlJQkSUpPT9fYsWPldru1YsUKnTt3TgUFBcrJyeFOOAAAENjQ9Pzzz0uSpkyZ4rf9xRdf1Pz58yVJP/rRj+T1evXggw+qvr5eEydO1K5duxQZGWnWr169WgMGDNCcOXPk9Xo1depUbdiwQSEhIWZNUVGRcnNzzbvsZsyYobVr15rjISEh2r59ux588EFNnjxZ4eHhys7O1jPPPNNNRw8AAHqToPqcpt7O6uc8AACu3Zef03Tf6p0KGxTZbm3zZ+f1+r9l8DlNuKxe+TlNAAAAwYrQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYMCHQDAIC+z+v1yufzWaq12+0KDw/v5o6AjiM0AQC6ldfrVULiCNXV1liqHzosVpV/PkVwQtAJ6Om59957T9OnT5fL5ZLNZtPWrVv9xm0222WXFStWmDVTpkxpM37//ff77ae+vl5ut1sOh0MOh0Nut1sNDQ1+NadPn9b06dMVERGhmJgY5ebmqrm5ubsOHQD6DZ/Pp7raGmUt26L7Vu9sd8latkV1tTWWZ6WAnhTQmaYLFy7olltu0T//8z9r9uzZbcarqqr81n/7299qwYIFbWpzcnL05JNPmuuX/t9Jdna2zpw5o+LiYknSwoUL5Xa79dZbb0mSWltbde+992rYsGHat2+f6urqNG/ePBmGoTVr1nTJsQJAfxcaPlhhgyID3QbQaQENTZmZmcrMzLziuNPp9Ft/4403dOedd+prX/ua3/ZBgwa1qf1SRUWFiouLdfDgQU2cOFGStH79eqWlpen48eNKSkrSrl27dOzYMVVWVsrlckmSVq5cqfnz5+upp57SkCFDruUwAQBAH9Br7p47e/astm/frgULFrQZKyoqUkxMjG666SYVFBTo/Pnz5tiBAwfkcDjMwCRJqampcjgc2r9/v1mTnJxsBiZJysjIkM/nU2lp6RV78vl8amxs9FsAAEDf1GsuBH/ppZcUGRmpWbNm+W3/7ne/q5EjR8rpdKq8vFxLly7V73//e5WUlEiSqqurFRsb22Z/sbGxqq6uNmvi4uL8xqOiohQWFmbWXM7y5cv1k5/85FoPDQAA9AK9JjT9+te/1ne/+10NHDjQb3tOTo75c3JyskaNGqUJEybogw8+0K233irpiwvKL2UYht92KzWXWrp0qfLz8831xsZGJSQkWD8oAADQa/SK03O/+93vdPz4cf3gBz+4au2tt96q0NBQnThxQtIX10WdPXu2TV1tba05u+R0OtvMKNXX16ulpaXNDNRX2e12DRkyxG8BAAB9U68ITS+88IJSUlJ0yy23XLX26NGjamlpUXx8vCQpLS1NHo9Hhw8fNmsOHTokj8ejSZMmmTXl5eV+d+vt2rVLdrtdKSkpXXw0AACgNwro6bmmpiZ9/PHH5vrJkydVVlam6OhoDR8+XNIXp7z+8z//UytXrmzz+D/+8Y8qKirSt7/9bcXExOjYsWNasmSJxo8fr8mTJ0uSxowZo2nTpiknJ0fr1q2T9MVHDmRlZSkpKUmSlJ6errFjx8rtdmvFihU6d+6cCgoKlJOTw+wRAACQFOCZpvfff1/jx4/X+PHjJUn5+fkaP368Hn/8cbNm8+bNMgxD3/nOd9o8PiwsTG+//bYyMjKUlJSk3Nxcpaena/fu3QoJCTHrioqKNG7cOKWnpys9PV0333yzNm7caI
6HhIRo+/btGjhwoCZPnqw5c+Zo5syZeuaZZ7rx6AEAQG8S0JmmKVOmyDCMdmsWLlyohQsXXnYsISFBe/fuverzREd
|
||
|
|
"text/plain": [
|
||
|
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "display_data"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Finally, let's also explore the hour of the day.\n",
|
||
|
|
"sns.histplot(data=data['Hour'], bins= 40)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 18,
|
||
|
|
"id": "3f9a5c00",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"data": {
|
||
|
|
"text/html": [
|
||
|
|
"<div>\n",
|
||
|
|
"<style scoped>\n",
|
||
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
|
" vertical-align: middle;\n",
|
||
|
|
" }\n",
|
||
|
|
"\n",
|
||
|
|
" .dataframe tbody tr th {\n",
|
||
|
|
" vertical-align: top;\n",
|
||
|
|
" }\n",
|
||
|
|
"\n",
|
||
|
|
" .dataframe thead th {\n",
|
||
|
|
" text-align: right;\n",
|
||
|
|
" }\n",
|
||
|
|
"</style>\n",
|
||
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
|
" <thead>\n",
|
||
|
|
" <tr style=\"text-align: right;\">\n",
|
||
|
|
" <th></th>\n",
|
||
|
|
" <th>AgeInDays</th>\n",
|
||
|
|
" <th>Year</th>\n",
|
||
|
|
" <th>Month</th>\n",
|
||
|
|
" <th>HourCosine</th>\n",
|
||
|
|
" <th>BirthMonth</th>\n",
|
||
|
|
" <th>HasName</th>\n",
|
||
|
|
" <th>Type_Cat</th>\n",
|
||
|
|
" <th>Type_Dog</th>\n",
|
||
|
|
" <th>Type_Livestock</th>\n",
|
||
|
|
" <th>Sex_Intact Male</th>\n",
|
||
|
|
" <th>Sex_Neutered Male</th>\n",
|
||
|
|
" <th>Sex_Spayed Female</th>\n",
|
||
|
|
" <th>Sex_Unknown</th>\n",
|
||
|
|
" <th>Outcome_Adoption</th>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" </thead>\n",
|
||
|
|
" <tbody>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>0</th>\n",
|
||
|
|
" <td>14</td>\n",
|
||
|
|
" <td>2014</td>\n",
|
||
|
|
" <td>-1.000000</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>5</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>1</th>\n",
|
||
|
|
" <td>60</td>\n",
|
||
|
|
" <td>2014</td>\n",
|
||
|
|
" <td>-0.866025</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>4</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>2</th>\n",
|
||
|
|
" <td>28</td>\n",
|
||
|
|
" <td>2014</td>\n",
|
||
|
|
" <td>-0.866025</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>6</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>3</th>\n",
|
||
|
|
" <td>14</td>\n",
|
||
|
|
" <td>2014</td>\n",
|
||
|
|
" <td>-0.866025</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>6</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" <tr>\n",
|
||
|
|
" <th>4</th>\n",
|
||
|
|
" <td>150</td>\n",
|
||
|
|
" <td>2014</td>\n",
|
||
|
|
" <td>-0.500000</td>\n",
|
||
|
|
" <td>1.0</td>\n",
|
||
|
|
" <td>3</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>True</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" <td>False</td>\n",
|
||
|
|
" </tr>\n",
|
||
|
|
" </tbody>\n",
|
||
|
|
"</table>\n",
|
||
|
|
"</div>"
|
||
|
|
],
|
||
|
|
"text/plain": [
|
||
|
|
" AgeInDays Year Month HourCosine BirthMonth HasName Type_Cat \\\n",
|
||
|
|
"0 14 2014 -1.000000 1.0 5 False False \n",
|
||
|
|
"1 60 2014 -0.866025 1.0 4 True False \n",
|
||
|
|
"2 28 2014 -0.866025 1.0 6 False False \n",
|
||
|
|
"3 14 2014 -0.866025 1.0 6 False False \n",
|
||
|
|
"4 150 2014 -0.500000 1.0 3 False False \n",
|
||
|
|
"\n",
|
||
|
|
" Type_Dog Type_Livestock Sex_Intact Male Sex_Neutered Male \\\n",
|
||
|
|
"0 False False False False \n",
|
||
|
|
"1 True False False False \n",
|
||
|
|
"2 False False False False \n",
|
||
|
|
"3 False False False False \n",
|
||
|
|
"4 False False True False \n",
|
||
|
|
"\n",
|
||
|
|
" Sex_Spayed Female Sex_Unknown Outcome_Adoption \n",
|
||
|
|
"0 False True False \n",
|
||
|
|
"1 True False True \n",
|
||
|
|
"2 False True False \n",
|
||
|
|
"3 False True False \n",
|
||
|
|
"4 False False False "
|
||
|
|
]
|
||
|
|
},
|
||
|
|
"execution_count": 18,
|
||
|
|
"metadata": {},
|
||
|
|
"output_type": "execute_result"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Now for PART 2\n",
|
||
|
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
|
"data.drop(['Hour'], axis=1, inplace=True)\n",
|
||
|
|
"data.head()\n",
|
||
|
|
"# Let's first get rid of the normal hour count, since we have the cosine version and have visualized the distribution of hours."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "17384e1b",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
" AgeInDays Year Month HourCosine BirthMonth HasName Type_Cat \\\n",
|
||
|
|
"0 14 2014 -1.000000 1.0 5 False False \n",
|
||
|
|
"1 60 2014 -0.866025 1.0 4 True False \n",
|
||
|
|
"2 28 2014 -0.866025 1.0 6 False False \n",
|
||
|
|
"3 14 2014 -0.866025 1.0 6 False False \n",
|
||
|
|
"4 150 2014 -0.500000 1.0 3 False False \n",
|
||
|
|
"\n",
|
||
|
|
" Type_Dog Type_Livestock Sex_Intact Male Sex_Neutered Male \\\n",
|
||
|
|
"0 False False False False \n",
|
||
|
|
"1 True False False False \n",
|
||
|
|
"2 False False False False \n",
|
||
|
|
"3 False False False False \n",
|
||
|
|
"4 False False True False \n",
|
||
|
|
"\n",
|
||
|
|
" Sex_Spayed Female Sex_Unknown \n",
|
||
|
|
"0 False True \n",
|
||
|
|
"1 True False \n",
|
||
|
|
"2 False True \n",
|
||
|
|
"3 False True \n",
|
||
|
|
"4 False False \n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Some of the following was informed by AI, but not directly generated. See [6]\n",
|
||
|
|
"x = data[data.loc[:, 'AgeInDays':'Sex_Unknown'].columns.tolist()]\n",
|
||
|
|
"y = data['Outcome_Adoption']\n",
|
||
|
|
"print(x.head())\n",
|
||
|
|
"x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=1)\n",
|
||
|
|
"# stratify=y ensures the same general proportion of transfer to adoption in testing and training data.\n",
|
||
|
|
"# random_state=1 ensures the results are reproducible, randomizing the input data in a controlled manner."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "b4104bd2",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Performance on TRAINING\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.35 1.00 0.52 31916\n",
|
||
|
|
" True 0.00 0.00 0.00 58426\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.35 90342\n",
|
||
|
|
" macro avg 0.18 0.50 0.26 90342\n",
|
||
|
|
"weighted avg 0.12 0.35 0.18 90342\n",
|
||
|
|
"\n",
|
||
|
|
"Performance on TEST\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.35 1.00 0.52 13678\n",
|
||
|
|
" True 0.00 0.00 0.00 25041\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.35 38719\n",
|
||
|
|
" macro avg 0.18 0.50 0.26 38719\n",
|
||
|
|
"weighted avg 0.12 0.35 0.18 38719\n",
|
||
|
|
"\n"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"name": "stderr",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
|
||
|
|
"/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||
|
|
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# We will first perform linear classification via Stochastic Gradient Descent.\n",
|
||
|
|
"\n",
|
||
|
|
"from sklearn.linear_model import SGDClassifier\n",
|
||
|
|
"from sklearn.metrics import classification_report\n",
|
||
|
|
"\n",
|
||
|
|
"clf = SGDClassifier(loss=\"perceptron\", alpha=0.05, random_state=1)\n",
|
||
|
|
"\n",
|
||
|
|
"clf.fit(x_train, y_train)\n",
|
||
|
|
"\n",
|
||
|
|
"def report(classifier):\n",
|
||
|
|
" # Accuracy, Precision, Recall, and F1 scores\n",
|
||
|
|
" print(f\"Performance on TRAINING\\n*****************\\n{classification_report(y_train,classifier.predict(x_train))}\")\n",
|
||
|
|
" print(f\"Performance on TEST\\n*****************\\n{classification_report(y_test,classifier.predict(x_test))}\")\n",
|
||
|
|
"\n",
|
||
|
|
"report(clf)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 21,
|
||
|
|
"id": "b82c29be",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Performance on TRAINING\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.91 0.83 0.87 31916\n",
|
||
|
|
" True 0.91 0.96 0.93 58426\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.91 90342\n",
|
||
|
|
" macro avg 0.91 0.89 0.90 90342\n",
|
||
|
|
"weighted avg 0.91 0.91 0.91 90342\n",
|
||
|
|
"\n",
|
||
|
|
"Performance on TEST\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.81 0.71 0.76 13678\n",
|
||
|
|
" True 0.85 0.91 0.88 25041\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.84 38719\n",
|
||
|
|
" macro avg 0.83 0.81 0.82 38719\n",
|
||
|
|
"weighted avg 0.84 0.84 0.84 38719\n",
|
||
|
|
"\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# now, we will perform k-nearest neighbors classification without cross-validation. \n",
|
||
|
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||
|
|
"\n",
|
||
|
|
"knn = KNeighborsClassifier(n_neighbors=3) # choosing arbitrary number of neighbors, \n",
|
||
|
|
"\n",
|
||
|
|
"# fit the model to the training set\n",
|
||
|
|
"knn.fit(x_train, y_train)\n",
|
||
|
|
"\n",
|
||
|
|
"report(knn)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"id": "8ed14aaf",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"Performance on TRAINING\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.81 1.00 0.89 31916\n",
|
||
|
|
" True 1.00 0.87 0.93 58426\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.91 90342\n",
|
||
|
|
" macro avg 0.90 0.93 0.91 90342\n",
|
||
|
|
"weighted avg 0.93 0.91 0.92 90342\n",
|
||
|
|
"\n",
|
||
|
|
"Performance on TEST\n",
|
||
|
|
"*****************\n",
|
||
|
|
" precision recall f1-score support\n",
|
||
|
|
"\n",
|
||
|
|
" False 0.67 0.82 0.74 13678\n",
|
||
|
|
" True 0.89 0.78 0.83 25041\n",
|
||
|
|
"\n",
|
||
|
|
" accuracy 0.80 38719\n",
|
||
|
|
" macro avg 0.78 0.80 0.79 38719\n",
|
||
|
|
"weighted avg 0.81 0.80 0.80 38719\n",
|
||
|
|
"\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"# Finally, we will perform k-nearest neighbors classification with cross-validation. \n",
|
||
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
||
|
|
"\n",
|
||
|
|
"knn_search = KNeighborsClassifier()\n",
|
||
|
|
"param_grid = {\"n_neighbors\": np.arange(1, 100)}\n",
|
||
|
|
"knn_gscv = GridSearchCV(knn_search, param_grid, cv=5, scoring=\"precision\") # most important metric from business perspective.\n",
|
||
|
|
"\n",
|
||
|
|
"# fit the model to the training set\n",
|
||
|
|
"knn_gscv.fit(x_train, y_train)\n",
|
||
|
|
"best_knn = knn_gscv.best_estimator_\n",
|
||
|
|
"\n",
|
||
|
|
"report(best_knn)\n",
|
||
|
|
"k_param = knn_gscv.best_params_"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": 23,
|
||
|
|
"id": "22d2e4ae",
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [
|
||
|
|
{
|
||
|
|
"name": "stdout",
|
||
|
|
"output_type": "stream",
|
||
|
|
"text": [
|
||
|
|
"{'n_neighbors': np.int64(2)}\n"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"source": [
|
||
|
|
"print(k_param)"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"id": "24412ca1",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"from a business perspective, the most important metric is the tp/tp+fp, or precision metric, since it is more costly for a shelter to maintain an animal that will get transferred rather than transferring it as soon as possible. False positives would cause the retention of animals that have low chance of getting adopted, rather than transferring them to other shelters, where they may have higher chances of adoption."
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": "base",
|
||
|
|
"language": "python",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"codemirror_mode": {
|
||
|
|
"name": "ipython",
|
||
|
|
"version": 3
|
||
|
|
},
|
||
|
|
"file_extension": ".py",
|
||
|
|
"mimetype": "text/x-python",
|
||
|
|
"name": "python",
|
||
|
|
"nbconvert_exporter": "python",
|
||
|
|
"pygments_lexer": "ipython3",
|
||
|
|
"version": "3.13.5"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 5
|
||
|
|
}
|