Project%201.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b7ee7a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "399df849",
   "metadata": {},
   "source": [
    "# Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "edda6af5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 131165 entries, 0 to 131164\n",
      "Data columns (total 12 columns):\n",
      " #   Column            Non-Null Count   Dtype \n",
      "---  ------            --------------   ----- \n",
      " 0   Animal ID         131165 non-null  object\n",
      " 1   Date of Birth     131165 non-null  object\n",
      " 2   Name              93658 non-null   object\n",
      " 3   DateTime          131165 non-null  object\n",
      " 4   MonthYear         131165 non-null  object\n",
      " 5   Outcome Type      131125 non-null  object\n",
      " 6   Outcome Subtype   65810 non-null   object\n",
      " 7   Animal Type       131165 non-null  object\n",
      " 8   Sex upon Outcome  131165 non-null  object\n",
      " 9   Age upon Outcome  131165 non-null  object\n",
      " 10  Breed             131165 non-null  object\n",
      " 11  Color             131165 non-null  object\n",
      "dtypes: object(12)\n",
      "memory usage: 12.0+ MB\n",
      "None\n",
      "\n",
      "\n",
      "\n",
      "Unique animal ID's: 121258\n",
      "\n",
      "\n",
      "non-unique id's:\n",
      "     Animal ID Date of Birth     Name                   DateTime MonthYear  \\\n",
      "694    A724077       7/11/15   *Sandy  2016-05-09T00:00:00-05:00    May-16   \n",
      "1455   A750513      11/28/16      NaN  2017-06-19T00:00:00-05:00    Jun-17   \n",
      "1544   A755088       9/29/16  Machete  2017-08-19T00:00:00-05:00    Aug-17   \n",
      "1762   A758320       9/14/07   Sophie  2018-01-31T00:00:00-05:00    Jan-18   \n",
      "1825   A767543        3/3/16      NaN  2018-04-18T00:00:00-05:00    Apr-18   \n",
      "\n",
      "     Outcome Type Outcome Subtype Animal Type Sex upon Outcome  \\\n",
      "694      Adoption             NaN         Dog    Spayed Female   \n",
      "1455     Transfer             Snr         Cat    Neutered Male   \n",
      "1544     Adoption             NaN         Dog    Spayed Female   \n",
      "1762     Transfer         Partner         Cat    Spayed Female   \n",
      "1825     Transfer             Snr         Cat    Spayed Female   \n",
      "\n",
      "     Age upon Outcome                     Breed               Color  \n",
      "694          9 months     Queensland Heeler Mix                 Red  \n",
      "1455         6 months  Domestic Medium Hair Mix  Orange Tabby/White  \n",
      "1544        10 months     Australian Kelpie Mix         Black/White  \n",
      "1762         10 years    Domestic Shorthair Mix          Blue Tabby  \n",
      "1825          2 years    Domestic Shorthair Mix               Black  \n",
      "\n",
      "\n",
      "ID A724077:\n",
      "      Animal ID Date of Birth    Name                   DateTime MonthYear  \\\n",
      "649     A724077       7/11/15  *Sandy  2016-04-24T00:00:00-05:00    Apr-16   \n",
      "694     A724077       7/11/15  *Sandy  2016-05-09T00:00:00-05:00    May-16   \n",
      "35974   A724077       7/11/15  *Sandy        2016-06-12T16:20:00    Jun-16   \n",
      "\n",
      "      Outcome Type Outcome Subtype Animal Type Sex upon Outcome  \\\n",
      "649       Adoption             NaN         Dog    Spayed Female   \n",
      "694       Adoption             NaN         Dog    Spayed Female   \n",
      "35974     Adoption             NaN         Dog    Spayed Female   \n",
      "\n",
      "      Age upon Outcome                  Breed Color  \n",
      "649           9 months  Queensland Heeler Mix   Red  \n",
      "694           9 months  Queensland Heeler Mix   Red  \n",
      "35974        11 months  Queensland Heeler Mix   Red  \n",
      "      0    1    2   3   4   5    6   7\n",
      "0  2014  -06  -10  00  00  00  -05  00\n",
      "1  2014  -07  -11  00  00  00  -05  00\n",
      "2  2014  -07  -12  00  00  00  -05  00\n",
      "3  2014  -07  -14  00  00  00  -05  00\n",
      "4  2014  -08  -20  00  00  00  -05  00\n",
      "       Date of Birth     Name             DateTime MonthYear Outcome Type  \\\n",
      "2052         7/23/13  *Dudley  2013-10-01T09:31:00    Oct-13     Adoption   \n",
      "2053         9/24/13      NaN  2013-10-01T10:39:00    Oct-13     Transfer   \n",
      "2054         9/24/13      NaN  2013-10-01T10:44:00    Oct-13     Transfer   \n",
      "2055         9/22/10      NaN  2013-10-01T11:12:00    Oct-13     Transfer   \n",
      "2056         9/25/11      NaN  2013-10-01T11:13:00    Oct-13     Transfer   \n",
      "...              ...      ...                  ...       ...          ...   \n",
      "129096       1/31/12      NaN  2014-02-04T10:14:00    Feb-14     Transfer   \n",
      "129097        2/4/09     Toby  2014-02-08T10:14:00    Feb-14     Transfer   \n",
      "129098        7/6/13      NaN  2014-02-07T10:14:00    Feb-14     Transfer   \n",
      "129099       2/16/14      NaN  2014-03-27T08:00:00    Mar-14     Transfer   \n",
      "129100       5/10/11   Gatsby  2014-05-14T08:00:00    May-14     Transfer   \n",
      "\n",
      "       Outcome Subtype Animal Type Sex upon Outcome Age upon Outcome  \\\n",
      "2052            Foster         Dog    Neutered Male         2 months   \n",
      "2053           Partner         Cat          Unknown           1 week   \n",
      "2054           Partner         Cat          Unknown           1 week   \n",
      "2055           Partner         Dog    Neutered Male          3 years   \n",
      "2056           Partner         Dog    Spayed Female          2 years   \n",
      "...                ...         ...              ...              ...   \n",
      "129096            SCRP         Cat      Intact Male          2 years   \n",
      "129097         Partner         Dog      Intact Male          5 years   \n",
      "129098            SCRP         Cat    Spayed Female         7 months   \n",
      "129099         Partner         Dog      Intact Male          1 month   \n",
      "129100         Partner         Dog      Intact Male          3 years   \n",
      "\n",
      "                           Breed         Color  Year  Month  Hour  Zone  \n",
      "2052      Labrador Retriever Mix         Black  2013     10     9   100  \n",
      "2053      Domestic Shorthair Mix  Orange/White  2013     10    10   100  \n",
      "2054      Domestic Shorthair Mix  Orange/White  2013     10    10   100  \n",
      "2055              Toy Poodle Mix         White  2013     10    11   100  \n",
      "2056                   Boxer Mix     Red/White  2013     10    11   100  \n",
      "...                          ...           ...   ...    ...   ...   ...  \n",
      "129096    Domestic Shorthair Mix   Brown Tabby  2014      2    10   100  \n",
      "129097                 Pekingese     Red/White  2014      2    10   100  \n",
      "129098  Domestic Medium Hair Mix   Brown Tabby  2014      2    10   100  \n",
      "129099            Pomeranian Mix   White/Brown  2014      3     8   100  \n",
      "129100      Pekingese/Lhasa Apso          Buff  2014      5     8   100  \n",
      "\n",
      "[125886 rows x 15 columns]\n",
      "0\n",
      "   0       1\n",
      "0  2   weeks\n",
      "1  2  months\n",
      "2  4   weeks\n",
      "3  2   weeks\n",
      "4  5  months\n",
      "0           7\n",
      "1          30\n",
      "2           7\n",
      "3           7\n",
      "4          30\n",
      "         ... \n",
      "129096    365\n",
      "129097    365\n",
      "129098     30\n",
      "129099     30\n",
      "129100    365\n",
      "Name: 1, Length: 129101, dtype: int64\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 129101 entries, 0 to 129100\n",
      "Data columns (total 13 columns):\n",
      " #   Column            Non-Null Count   Dtype  \n",
      "---  ------            --------------   -----  \n",
      " 0   Outcome Type      129061 non-null  object \n",
      " 1   Animal Type       129101 non-null  object \n",
      " 2   Sex upon Outcome  129101 non-null  object \n",
      " 3   AgeInDays         129101 non-null  int64  \n",
      " 4   Breed             129101 non-null  object \n",
      " 5   Color             129101 non-null  object \n",
      " 6   Year              129101 non-null  int16  \n",
      " 7   Month             129101 non-null  float64\n",
      " 8   Hour              129101 non-null  int8   \n",
      " 9   HourCosine        129101 non-null  float64\n",
      " 10  BirthMonth        129101 non-null  int8   \n",
      " 11  BirthYear         129101 non-null  int8   \n",
      " 12  HasName           129101 non-null  bool   \n",
      "dtypes: bool(1), float64(2), int16(1), int64(1), int8(3), object(5)\n",
      "memory usage: 8.6+ MB\n",
      "None\n",
      "       Outcome Type Animal Type Sex upon Outcome  AgeInDays  \\\n",
      "89032           NaN         Dog    Spayed Female        365   \n",
      "89034           NaN         Dog    Spayed Female        300   \n",
      "89143           NaN         Dog    Neutered Male       2190   \n",
      "89375           NaN         Dog    Spayed Female        180   \n",
      "89379           NaN         Dog    Neutered Male        120   \n",
      "89491           NaN         Dog    Neutered Male        365   \n",
      "89503           NaN         Cat    Spayed Female        730   \n",
      "89619           NaN         Dog    Neutered Male         60   \n",
      "89636           NaN         Dog    Spayed Female       1825   \n",
      "89661           NaN         Dog    Neutered Male       1460   \n",
      "89804           NaN         Dog      Intact Male        365   \n",
      "89821           NaN         Dog          Unknown         30   \n",
      "89936           NaN         Dog      Intact Male        365   \n",
      "92129           NaN         Cat      Intact Male         30   \n",
      "92298           NaN         Dog      Intact Male        730   \n",
      "93308           NaN         Cat    Spayed Female        730   \n",
      "94607           NaN         Dog      Intact Male        730   \n",
      "108019          NaN         Cat    Spayed Female       4745   \n",
      "108706          NaN        Bird          Unknown        365   \n",
      "108884          NaN         Dog          Unknown        365   \n",
      "111465          NaN         Dog      Intact Male        730   \n",
      "111621          NaN         Cat    Neutered Male       1095   \n",
      "111732          NaN         Dog    Neutered Male        730   \n",
      "116268          NaN         Cat    Spayed Female         90   \n",
      "116396          NaN         Cat    Spayed Female         90   \n",
      "116522          NaN         Dog    Spayed Female        730   \n",
      "119515          NaN         Dog          Unknown       1825   \n",
      "120797          NaN         Dog      Intact Male        330   \n",
      "120829          NaN         Dog    Intact Female        730   \n",
      "122114          NaN         Dog    Neutered Male        300   \n",
      "122224          NaN         Dog    Spayed Female        365   \n",
      "122267          NaN         Cat    Spayed Female         90   \n",
      "123937          NaN         Dog    Spayed Female        730   \n",
      "123938          NaN         Dog    Spayed Female        730   \n",
      "124535          NaN         Cat    Neutered Male         60   \n",
      "127205          NaN         Dog      Intact Male        300   \n",
      "128002          NaN         Dog    Spayed Female        365   \n",
      "128308          NaN         Dog    Neutered Male       3285   \n",
      "128476          NaN         Dog    Intact Female        270   \n",
      "128542          NaN         Dog    Spayed Female       1825   \n",
      "\n",
      "                                    Breed         Color  Year         Month  \\\n",
      "89032                American Bulldog Mix   White/Black  2021  8.660254e-01   \n",
      "89034                     German Shepherd     Tan/Black  2021  8.660254e-01   \n",
      "89143                        Pit Bull Mix          Blue  2021  8.660254e-01   \n",
      "89375    Labrador Retriever/Border Collie   Black/White  2021  8.660254e-01   \n",
      "89379                American Bulldog Mix   Black/White  2021  8.660254e-01   \n",
      "89491                       Blue Lacy Mix    Blue/White  2021  5.000000e-01   \n",
      "89503                  Domestic Shorthair         Black  2021  5.000000e-01   \n",
      "89619                            Pit Bull    Blue/White  2021  5.000000e-01   \n",
      "89636       American Pit Bull Terrier Mix   White/Brown  2021  5.000000e-01   \n",
      "89661              Labrador Retriever Mix  Yellow/White  2021  5.000000e-01   \n",
      "89804             Chihuahua Shorthair Mix           Tan  2021  6.123234e-17   \n",
      "89821                             Pug Mix   Black/White  2021  6.123234e-17   \n",
      "89936                            Pit Bull         Black  2021  6.123234e-17   \n",
      "92129                Domestic Medium Hair   Black Smoke  2021 -1.000000e+00   \n",
      "92298                             Pug Mix   Black/White  2021 -1.000000e+00   \n",
      "93308                  Domestic Shorthair        Tortie  2021 -8.660254e-01   \n",
      "94607                     German Shepherd     Black/Tan  2021 -5.000000e-01   \n",
      "108019                 Domestic Shorthair        Tortie  2023  8.660254e-01   \n",
      "108706                            Waxwing     Tan/Black  2023  5.000000e-01   \n",
      "108884                          Pekingese   Black/White  2023  5.000000e-01   \n",
      "111465                Doberman Pinsch Mix   Black/Brown  2023 -1.000000e+00   \n",
      "111621                 Domestic Shorthair   Cream Tabby  2023 -1.000000e+00   \n",
      "111732                        Mastiff Mix           Tan  2023 -1.000000e+00   \n",
      "116268                 Domestic Shorthair   Black/White  2023  1.000000e+00   \n",
      "116396                        Siamese Mix    Lynx Point  2023  1.000000e+00   \n",
      "116522  Cairn Terrier/Chihuahua Shorthair         White  2023  1.000000e+00   \n",
      "119515               American Bulldog Mix         Brown  2024 -5.000000e-01   \n",
      "120797                           Pit Bull    Blue/White  2024 -1.000000e+00   \n",
      "120829                     Siberian Husky         White  2024 -1.000000e+00   \n",
      "122114          Wire Hair Fox Terrier Mix         Brown  2024 -8.660254e-01   \n",
      "122224                     Siberian Husky   Brown/White  2024 -8.660254e-01   \n",
      "122267                 Domestic Shorthair  Torbie/White  2024 -8.660254e-01   \n",
      "123937                   Belgian Malinois   Brown/Black  2024 -1.836970e-16   \n",
      "123938                    German Shepherd   Brown/Black  2024 -1.836970e-16   \n",
      "124535               Domestic Medium Hair         Black  2024  5.000000e-01   \n",
      "127205    West Highland/Yorkshire Terrier   White/Cream  2025  5.000000e-01   \n",
      "128002                       Pit Bull Mix   Black/White  2025  6.123234e-17   \n",
      "128308                    German Shepherd   Brown/Black  2025  6.123234e-17   \n",
      "128476                 Siberian Husky Mix   Black/Cream  2025 -5.000000e-01   \n",
      "128542                       Pit Bull Mix    Fawn/White  2025 -5.000000e-01   \n",
      "\n",
      "        Hour    HourCosine  BirthMonth  BirthYear  HasName  \n",
      "89032     17 -2.588190e-01           6         19     True  \n",
      "89034     17 -2.588190e-01           2         20    False  \n",
      "89143     18 -1.836970e-16           4         14     True  \n",
      "89375     14 -8.660254e-01           7         20     True  \n",
      "89379     16 -5.000000e-01           9         20     True  \n",
      "89491     18 -1.836970e-16           1         20     True  \n",
      "89503     15 -7.071068e-01           1         19    False  \n",
      "89619     12 -1.000000e+00          12         20     True  \n",
      "89636     10 -8.660254e-01          10         15     True  \n",
      "89661     12 -1.000000e+00          10         16     True  \n",
      "89804     13 -9.659258e-01           3         20     True  \n",
      "89821     14 -8.660254e-01           1         21    False  \n",
      "89936     13 -9.659258e-01          12         19     True  \n",
      "92129     16 -5.000000e-01           4         21    False  \n",
      "92298     11 -9.659258e-01           6         19     True  \n",
      "93308     18 -1.836970e-16           6         19     True  \n",
      "94607     18 -1.836970e-16           8         19    False  \n",
      "108019    12 -1.000000e+00           5          9     True  \n",
      "108706     9 -7.071068e-01           2         22    False  \n",
      "108884    15 -7.071068e-01           2         22     True  \n",
      "111465    19  2.588190e-01           6         21     True  \n",
      "111621    10 -8.660254e-01           7         19     True  \n",
      "111732    18 -1.836970e-16           6         21     True  \n",
      "116268    14 -8.660254e-01           9         23     True  \n",
      "116396    16 -5.000000e-01           9         23    False  \n",
      "116522    17 -2.588190e-01          12         21    False  \n",
      "119515    10 -8.660254e-01           4         19    False  \n",
      "120797    10 -8.660254e-01           7         23     True  \n",
      "120829    16 -5.000000e-01           5         22     True  \n",
      "122114    12 -1.000000e+00           9         23     True  \n",
      "122224    17 -2.588190e-01           6         23     True  \n",
      "122267    18 -1.836970e-16           4         24     True  \n",
      "123937    10 -8.660254e-01           9         22     True  \n",
      "123938    10 -8.660254e-01           9         22     True  \n",
      "124535    18 -1.836970e-16           8         24    False  \n",
      "127205    11 -9.659258e-01           4         24    False  \n",
      "128002    16 -5.000000e-01          12         23     True  \n",
      "128308    16 -5.000000e-01           2         16     True  \n",
      "128476    16 -5.000000e-01           6         24     True  \n",
      "128542    18 -1.836970e-16          12         19     True  \n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 129061 entries, 0 to 129100\n",
      "Data columns (total 13 columns):\n",
      " #   Column            Non-Null Count   Dtype  \n",
      "---  ------            --------------   -----  \n",
      " 0   Outcome Type      129061 non-null  object \n",
      " 1   Animal Type       129061 non-null  object \n",
      " 2   Sex upon Outcome  129061 non-null  object \n",
      " 3   AgeInDays         129061 non-null  int64  \n",
      " 4   Breed             129061 non-null  object \n",
      " 5   Color             129061 non-null  object \n",
      " 6   Year              129061 non-null  int16  \n",
      " 7   Month             129061 non-null  float64\n",
      " 8   Hour              129061 non-null  int8   \n",
      " 9   HourCosine        129061 non-null  float64\n",
      " 10  BirthMonth        129061 non-null  int8   \n",
      " 11  BirthYear         129061 non-null  int8   \n",
      " 12  HasName           129061 non-null  bool   \n",
      "dtypes: bool(1), float64(2), int16(1), int64(1), int8(3), object(5)\n",
      "memory usage: 9.6+ MB\n",
      "None\n",
      "  Outcome Type Animal Type Sex upon Outcome  AgeInDays                Breed  \\\n",
      "0     Transfer        Bird          Unknown         14                 Duck   \n",
      "1     Adoption         Dog    Spayed Female         60  Black Mouth Cur Mix   \n",
      "2     Transfer        Bird          Unknown         28          Catbird Mix   \n",
      "3     Transfer        Bird          Unknown         14          Grackle Mix   \n",
      "4     Transfer        Bird      Intact Male        150           Silkie Mix   \n",
      "\n",
      "          Color  Year     Month  Hour  HourCosine  BirthMonth  BirthYear  \\\n",
      "0  Yellow/Black  2014 -1.000000     0         1.0           5         14   \n",
      "1   Brown/Black  2014 -0.866025     0         1.0           4         14   \n",
      "2         Brown  2014 -0.866025     0         1.0           6         14   \n",
      "3   Black/White  2014 -0.866025     0         1.0           6         14   \n",
      "4         Black  2014 -0.500000     0         1.0           3         14   \n",
      "\n",
      "   HasName  \n",
      "0    False  \n",
      "1     True  \n",
      "2    False  \n",
      "3    False  \n",
      "4    False  \n"
     ]
    }
   ],
   "source": [
    "data = pd.read_csv('project1.csv')\n",
    "print(f\"{data.info()}\\n\\n\")\n",
    "# We can see that there are 12 columns, with 131165 entries each. Shape is (131165, 12)\n",
    "\n",
    "# It only makes sense to do univariate analysis after cleaning up the data, since we can't really do that on strings and such.\n",
    "# Therefore, I will be doing cleanup before visualization.\n",
    "\n",
    "# Datatypes of Animal ID, Date of Birth, DateTime, MonthYear, and Age upon outcome would need to be converted to numeric formats.\n",
    "# However, let's see if Animal ID is unique for every animal...\n",
    "\n",
    "print(f\"\\nUnique animal ID's: {len(data['Animal ID'].unique())}\")\n",
    "# Not all id's are unique? Which ones are same?\n",
    "\n",
    "print(\"\\n\\nnon-unique id's:\")\n",
    "newdf = data[data['Animal ID'].duplicated()]\n",
    "print(newdf.head())\n",
    "\n",
    "# Let's choose one\n",
    "print(\"\\n\\nID A724077:\")\n",
    "print(data[data['Animal ID'] == \"A724077\"].head())\n",
    "# looks like the same dog gets adopted multiple times? We can maybe have each animal get a count of how many times they've been adopted.\n",
    "# However, our model will not be trained using the outcome, so this varible is not significant. We don't know whether the animal was transferred or adopted\n",
    "# we should therefore drop the Animal ID column.\n",
    "data.drop(['Animal ID'], axis=1, inplace=True)\n",
    "\n",
    "# remove duplicates\n",
    "data.drop_duplicates(inplace=True, ignore_index=True)\n",
    "\n",
    "# MonthYear and DateTime are redundant - let's just make outcome month, year, date, time columns\n",
    "# The following was informed by AI but not directly generated (see [4] and [5])\n",
    "splitDateTime = pd.DataFrame(data['DateTime'].str.findall(r'(\\-*\\d+)').tolist(), index=data.index)\n",
    "print(splitDateTime.head())\n",
    "splitDateTime[6] = splitDateTime[6].transform(lambda x: x.fillna(100)) # arbitrary time zone to avoid conflict with any other time zone\n",
    "\n",
    "# seems like most times are 00:00:00...\n",
    "data['Year'] = splitDateTime[0].astype('int16').abs()\n",
    "data['Month'] = splitDateTime[1].astype('int8').abs()\n",
    "data['Hour'] = splitDateTime[3].astype('int8').abs()\n",
    "data['Zone'] = splitDateTime[6].astype('int8')\n",
    "\n",
    "print(data[(data['Hour'] != 0)])\n",
    "\n",
    "# there are indeed nonzero times. Let's keep the hour.\n",
    "# let's check if the time zone is different for any of them...\n",
    "print(len(data[(data['Zone'] != 100) & (data['Zone'] != -5)]))\n",
    "# since the time zones are either UTC-5 or not present, we can drop the time zone column.\n",
    "data.drop(['Zone'], axis=1, inplace=True)\n",
    "# we can now also drop DateTime and MonthYear, since this data is reprsented in the year, month, date, hour, minute, and second columns.\n",
    "data.drop(['DateTime', 'MonthYear'], axis=1, inplace=True)\n",
    "\n",
    "# Saw online that representing hour data as a sinusoid is more effective, representing cyclical nature of the day.\n",
    "data['HourCosine'] = data['Hour'].transform(lambda x: np.cos(np.pi * x / 12))\n",
    "\n",
    "data['Month'] = data['Month'].transform(lambda x: np.cos(np.pi * x / 6))\n",
    "\n",
    "# Now, since our outcome subtype cannot be used for inference due to its nature as part of the outcome, \n",
    "# and since we are only predicting the outcome type, we can drop Outcome Subtype.\n",
    "data.drop(['Outcome Subtype'], axis=1, inplace=True)\n",
    "\n",
    "# Let's handle the DOB now.\n",
    "splitDOB = pd.DataFrame(data['Date of Birth'].str.findall(r'(\\d+)').tolist(), index=data.index)\n",
    "data['BirthMonth'] = splitDOB[0].astype('int8')\n",
    "data['BirthYear'] = splitDOB[2].astype('int8')\n",
    "data.drop(['Date of Birth'], axis=1, inplace=True)\n",
    "\n",
    "# let's try to convert Age to an age in days.\n",
    "# The function below was informed by AI, but not directly generated; See [1], [2], [3].\n",
    "def convertAge(series: pd.Series):\n",
    "    outDF = pd.DataFrame(series.str.split(expand=True))\n",
    "    print(outDF.head())\n",
    "    custom_map = {\n",
    "        'w': 7,\n",
    "        'd': 1,\n",
    "        'y': 365,\n",
    "        'm': 30\n",
    "    }\n",
    "    print(outDF[1].str[0].map(custom_map))\n",
    "    outSeries = outDF[0].astype(int) * outDF[1].str[0].map(custom_map)\n",
    "    return outSeries\n",
    "\n",
    "data['Age upon Outcome'] = data['Age upon Outcome'].transform(convertAge)\n",
    "data.rename(columns={'Age upon Outcome': 'AgeInDays'}, inplace=True)\n",
    "\n",
    "# Let's change the name column to a boolean representing whether or not a name is present.\n",
    "data['HasName'] = data['Name'].notna()\n",
    "data.drop(['Name'], axis=1, inplace=True)\n",
    "\n",
    "print(data.info())\n",
    "\n",
    "# It seems we still have null outcome types... Let's see what those are.\n",
    "print(data[data['Outcome Type'].isna()])\n",
    "# It only makes sense to keep data that will help us predict the outcome, and without the outcome the data is useless\n",
    "# we will therefore drop the rows without outcome type.\n",
    "data = data[data['Outcome Type'].notna()]\n",
    "\n",
    "print(data.info())\n",
    "print(data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d381aa2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Transfer' 'Adoption']\n",
      "['Bird' 'Dog' 'Cat' 'Livestock']\n",
      "['Unknown' 'Spayed Female' 'Intact Male' 'Intact Female' 'Neutered Male']\n",
      "2526\n",
      "599\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 129061 entries, 0 to 129100\n",
      "Data columns (total 16 columns):\n",
      " #   Column             Non-Null Count   Dtype  \n",
      "---  ------             --------------   -----  \n",
      " 0   AgeInDays          129061 non-null  int64  \n",
      " 1   Year               129061 non-null  int16  \n",
      " 2   Month              129061 non-null  float64\n",
      " 3   Hour               129061 non-null  int8   \n",
      " 4   HourCosine         129061 non-null  float64\n",
      " 5   BirthMonth         129061 non-null  int8   \n",
      " 6   BirthYear          129061 non-null  int8   \n",
      " 7   HasName            129061 non-null  bool   \n",
      " 8   Type_Cat           129061 non-null  bool   \n",
      " 9   Type_Dog           129061 non-null  bool   \n",
      " 10  Type_Livestock     129061 non-null  bool   \n",
      " 11  Sex_Intact Male    129061 non-null  bool   \n",
      " 12  Sex_Neutered Male  129061 non-null  bool   \n",
      " 13  Sex_Spayed Female  129061 non-null  bool   \n",
      " 14  Sex_Unknown        129061 non-null  bool   \n",
      " 15  Outcome_Adoption   129061 non-null  bool   \n",
      "dtypes: bool(9), float64(2), int16(1), int64(1), int8(3)\n",
      "memory usage: 5.7 MB\n"
     ]
    }
   ],
   "source": [
    "# Let's now take a look at categorizing the outcome type, animal type, sex, breed, and color data.\n",
    "print(data['Outcome Type'].unique())\n",
    "print(data['Animal Type'].unique())\n",
    "print(data['Sex upon Outcome'].unique())\n",
    "breeds = data['Breed'].unique()\n",
    "print(len(breeds))\n",
    "# breeds are too sparse to deal with, and they are dropped in section two of the project. Let's drop them now.\n",
    "data.drop(['Breed'], axis=1, inplace=True)\n",
    "\n",
    "colors = data['Color'].unique()\n",
    "print(len(colors))\n",
    "# colors are also looking very sparse, let's drop them for now and come back to them if needed.\n",
    "data.drop(['Color'], axis=1, inplace=True)\n",
    "data.rename(columns={'Outcome Type': 'Outcome', 'Animal Type': 'Type', 'Sex upon Outcome': 'Sex'}, inplace=True)\n",
    "data = pd.get_dummies(data, columns=['Type', 'Sex'], drop_first=True)\n",
    "data = pd.get_dummies(data, columns=['Outcome'], drop_first=False)\n",
    "data.drop(['Outcome_Transfer'], axis=1, inplace=True)\n",
    "# we can drop the first column since a false in all the other possibilities would signify a true in the first.\n",
    "\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47862576",
   "metadata": {},
   "source": [
    "# Univariate Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7ae7aef",
   "metadata": {},
   "source": [
    "\n",
    "#### We can see certain trends from this readout, such as that there seem to be a greater amount of very young animals than older animals, since the median is 270 while the mean is 583 days.\n",
    "\n",
    "\n",
    "#### Additionally, we see that the adoption or transfer data spans 2013 through 2025, centering around 2018. However, when it comes to birth year, we see the year 99, which signifies 1999. This discrepancy might interfere with the linear classification algorithm, and since we already have age data and adoption date data, we should be good to drop this too.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ca5e9c28",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           AgeInDays           Year         Month           Hour  \\\n",
      "count  129061.000000  129061.000000  1.290610e+05  129061.000000   \n",
      "mean      583.200851    2018.596803 -3.878811e-02      14.328744   \n",
      "std       880.694863       3.318159  7.180569e-01       3.695877   \n",
      "min         1.000000    2013.000000 -1.000000e+00       0.000000   \n",
      "25%        60.000000    2016.000000 -8.660254e-01      12.000000   \n",
      "50%       270.000000    2018.000000 -1.836970e-16      15.000000   \n",
      "75%       730.000000    2021.000000  5.000000e-01      17.000000   \n",
      "max     10950.000000    2025.000000  1.000000e+00      23.000000   \n",
      "\n",
      "          HourCosine     BirthMonth      BirthYear  \n",
      "count  129061.000000  129061.000000  129061.000000  \n",
      "mean       -0.503063       6.244295      16.987231  \n",
      "std         0.454937       3.203606       4.505264  \n",
      "min        -1.000000       1.000000       0.000000  \n",
      "25%        -0.866025       4.000000      14.000000  \n",
      "50%        -0.500000       6.000000      17.000000  \n",
      "75%        -0.258819       9.000000      20.000000  \n",
      "max         1.000000      12.000000      99.000000  \n"
     ]
    }
   ],
   "source": [
    "# Summary statistics, printed before BirthYear is removed below.\n",
    "print(data.describe())\n",
    "\n",
    "# errors='ignore' keeps this cell safe to re-run: without it, a second run raises KeyError\n",
    "# because 'BirthYear' has already been removed from `data` in place.\n",
    "data.drop(columns=['BirthYear'], inplace=True, errors='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73c09b78",
   "metadata": {},
   "source": [
    "#### We can see from the following histogram that an overwhelming number of animals at the shelter are under a year old: each bin spans roughly 300 days (about 2000 days / 7 bins), and the first bin dominates.\n",
    "#### Age is probably a significant factor in which animals actually get adopted, since adopters may favor either younger or older animals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "daecf558",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Axes: xlabel='AgeInDays', ylabel='Count'>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAO0tJREFUeJzt3Xt0VPW9//9XyGUIMRlCYjJEwsVjjGDAYrAQsAULJFgC9dKiBkepCChCjMJRkfZI/ZWgIpfTolQ5FFSgsadIDxWMCV6okavRVAKI9hQlYEIQJpOAcRKTz+8Pj/vrEAgbDEwCz8daey1nf94z+70/C+G1PrP3niBjjBEAAACa1S7QDQAAALQFhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgQ0igGzifNDY26vPPP1dkZKSCgoIC3Q4AALDBGKOamholJCSoXbuTrycRmlrQ559/rsTExEC3AQAAzkBZWZm6dOly0nFCUwuKjIyU9M2kR0VFBbgbAABgR3V1tRITE61/x0+G0NSCvv1KLioqitAEAEAbc6pLa7gQHAAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYEBLoBmBPbW2tfD6frVqHw6Hw8PCz3BEAABcWQlMbUFtbq8Ru3XX4UKWt+piL41T22acEJwAAWhChqQ3w+Xw6fKhSmbmrFRp+UbO19bVH9eqjN8vn8xGaAABoQYSmNiQ0/CKFdYgMdBsAAFyQuBAcAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbCA0AQAA2EBoAgAAsIHQBAAAYAOhCQAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwIeCh6cCBA7r99tsVExOjDh066Ac/+IGKi4utcWOMZs2apYSEBIWHh2vIkCHauXOn32f4fD5NnTpVsbGxioiI0OjRo7V//36/Go/HI7fbLafTKafTKbfbraqqKr+affv2adSoUYqIiFBsbKyys7NVV1d31s4dAAC0HQENTR6PR4MGDVJoaKhee+017dq1S/PmzVPHjh2tmqeeekrz58/XokWLtH37drlcLg0fPlw1NTVWTU5OjtasWaO8vDwVFRXp6NGjyszMVENDg1WTlZWlkpIS5efnKz8/XyUlJXK73dZ4Q0ODRo4cqWPHjqmoqEh5eXlavXq1pk2bdk7mAgAAtG5BxhgTqIM/8sgjevfdd/XOO++ccNwYo4SEBOXk5Ojhhx+W9M2qUnx8vJ588klNmjRJXq9XF198sV566SXdcsstkqTPP/9ciYmJWr9+vTIyMrR792716tVLW7ZsUf/+/SVJW7ZsUVpamj766CMlJyfrtddeU2ZmpsrKypSQkCBJysvL07hx41RZWamoqKhTnk91dbWcTqe8Xq+teruqqqoUHR2tGxe8rrAOkc3W1n1ZozUPZMjj8fiFTwAAcGJ2//0O6ErT2rVr1a9fP/3iF79QXFyc+vbtqyVLlljje/fuVUVFhdLT0619DodDgwcP1qZNmyRJxcXFqq+v96tJSEhQSkqKVbN582Y5nU4rMEnSgAED5HQ6/WpSUlKswCRJGRkZ8vl8fl8XfpfP51N1dbXfBgAAzk8BDU3/+te/tHjxYiUlJen111/XPffco+zsbL344ouSpIqKCklSfHy83/vi4+OtsYqKCoWFhSk6OrrZmri4uCbHj4uL86s5/jjR0dEKCwuzao43Z84c6xopp9OpxMTE050CAADQRgQ0NDU2Nurqq69Wbm6u+vbtq0mTJmnChAlavHixX11QUJDfa2NMk33HO77mRPVnUvNdM2bMkNfrtbay
srJmewIAAG1XQENT586d1atXL799PXv21L59+yRJLpdLkpqs9FRWVlqrQi6XS3V1dfJ4PM3WHDx4sMnxDx065Fdz/HE8Ho/q6+ubrEB9y+FwKCoqym8DAADnp4CGpkGDBmnPnj1++z7++GN169ZNktSjRw+5XC4VFhZa43V1ddq4caMGDhwoSUpNTVVoaKhfTXl5uUpLS62atLQ0eb1ebdu2zarZunWrvF6vX01paanKy8utmoKCAjkcDqWmprbwmQMAgLYmJJAHf+CBBzRw4EDl5uZqzJgx2rZtm55//nk9//zzkr75uiwnJ0e5ublKSkpSUlKScnNz1aFDB2VlZUmSnE6nxo8fr2nTpikmJkadOnXS9OnT1bt3bw0bNkzSN6tXI0aM0IQJE/Tcc89JkiZOnKjMzEwlJydLktLT09WrVy+53W7NnTtXR44c0fTp0zVhwgRWkAAAQGBD0zXXXKM1a9ZoxowZevzxx9WjRw8tXLhQY8eOtWoeeugh1dbWavLkyfJ4POrfv78KCgoUGfn/br1fsGCBQkJCNGbMGNXW1mro0KFavny5goODrZqVK1cqOzvbustu9OjRWrRokTUeHBysdevWafLkyRo0aJDCw8OVlZWlp59++hzMBAAAaO0C+pym8w3PaQIAoO1pE89pAgAAaCsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgA6EJAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbCA0AQAA2EBoAgAAsIHQBAAAYAOhCQAAwAZCEwAAgA2EJgAAABsITQAAADYQmgAAAGwgNAEAANhAaAIAALCB0AQAAGADoQkAAMAGQhMAAIANhCYAAAAbCE0AAAA2EJoAAABsIDQBAADYQGgCAACwgdAEAABgA6EJAADABkITAACADYQmAAAAGwhNAAAANhCaAAAAbAhoaJo1a5aCgoL8NpfLZY0bYzRr1iwlJCQoPDxcQ4YM0c6dO/0+w+fzaerUqYqNjVVERIRGjx6t/fv3+9V4PB653W45nU45nU653W5VVVX51ezbt0+jRo1SRESEYmNjlZ2drbq6urN27gAAoG0J+ErTlVdeqfLycmvbsWOHNfbUU09p/vz5WrRokbZv3y6Xy6Xhw4erpqbGqsnJydGaNWuUl5enoqIiHT16VJmZmWpoaLBqsrKyVFJSovz8fOXn56ukpERut9sab2ho0MiRI3Xs2DEVFRUpLy9Pq1ev1rRp087NJAAAgFYvJOANhIT4rS59yxijhQsXaubMmbrpppskSS+88ILi4+O1atUqTZo0SV6vV0uXLtVLL72kYcOGSZJWrFihxMREbdiwQRkZGdq9e7fy8/O1ZcsW9e/fX5K0ZMkSpaWlac+ePUpOTlZBQYF27dqlsrIyJSQkSJLmzZuncePGafbs2YqKijpHswEAAFqrgK80ffLJJ0pISFCPHj1066236l//+pckae/evaqoqFB6erpV63A4NHjwYG3atEmSVFxcrPr6er+ahIQEpaSkWDWbN2+W0+m0ApMkDRgwQE6n068mJSXFCkySlJGRIZ/Pp+Li4pP27vP5VF1d7bcBAIDzU0BDU//+/fXiiy/q9ddf15IlS1RRUaGBAwfq8OHDqqiokCTFx8f7vSc+Pt4aq6ioUFhYmKKjo5utiYuLa3LsuLg4v5rjjxMdHa2wsDCr5kTmzJljXSfldDqVmJh4mjMAAADaioCGpuuvv14333yzevfurWHDhmndunWSvvka7ltBQUF+7zHGNNl3vONrTlR/JjXHmzFjhrxer7WVlZU12xcAAGi7Av713HdFRESod+/e+uSTT6zrnI5f6amsrLRWhVwul+rq6uTxeJqtOXjwYJNjHTp0yK/m+ON4PB7V19c3WYH6LofDoaioKL8NAACcn1pVaPL5fNq9e7c6d+6s
Hj16yOVyqbCw0Bqvq6vTxo0bNXDgQElSamqqQkND/WrKy8tVWlpq1aSlpcnr9Wrbtm1WzdatW+X1ev1qSktLVV5ebtUUFBT
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Take a closer look at how animal age (in days) is distributed.\n",
    "import seaborn as sns\n",
    "\n",
    "# 40 equal-width bins over the AgeInDays column.\n",
    "sns.histplot(data=data, x='AgeInDays', bins=40)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b08973f",
   "metadata": {},
   "source": [
    "#### We can see in the following histogram that considerably more outcomes are recorded after noon, which may be useful for prediction.\n",
    "#### It's possible that the data will show a higher probability of adoption when the transaction occurs later in the day."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "5521b827",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Axes: xlabel='Hour', ylabel='Count'>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAObNJREFUeJzt3X9YlXWe//HXCeGIiCcQ4XA2RKdRVsMaw1bQmbIskAld011tmD2jOw42W8qyyDWN9a2YZtLZTO26dGoct8lSumz2sqzUQTHLxvVntEyhjGszOmKBMAgHsdOBwfv7R9u9HVG8QeAc4Pm4rvu6uO/P+9znfXM6+Opz3/c5NsMwDAEAAKBd1wW6AQAAgN6A0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsGBDoBvqSixcv6tNPP1VkZKRsNlug2wEAABYYhqHz58/L5XLpuuuuPJ9EaOpCn376qRISEgLdBgAA6ITKykrdcMMNVxwnNHWhyMhISV/80ocMGRLgbgAAgBWNjY1KSEgw/x2/EkJTF/rylNyQIUMITQAA9DJXu7SGC8EBAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAUBDU3Lly/XbbfdpsjISMXGxmrmzJk6fvy4X41hGCosLJTL5VJ4eLimTJmio0eP+tX4fD4tXrxYMTExioiI0IwZM3TmzBm/mvr6erndbjkcDjkcDrndbjU0NPjVnD59WtOnT1dERIRiYmKUm5ur5ubmbjl2AMC183q9amhosLR4vd5At4teLqChae/evXrooYd08OBBlZSU6K9//avS09N14cIFs+bpp5/WqlWrtHbtWh05ckROp1P33HOPzp8/b9bk5eXp9ddf1+bNm7Vv3z41NTUpKytLra2tZk12drbKyspUXFys4uJilZWVye12m+Otra269957deHCBe3bt0+bN2/Wli1btGTJkp75ZQAAOsTr9SohcYSioqIsLQmJIwhOuCY2wzCMQDfxpdraWsXGxmrv3r26/fbbZRiGXC6X8vLy9PDDD0v6YlYpLi5O//7v/64HHnhAHo9Hw4YN08aNGzV37lxJ0qeffqqEhATt2LFDGRkZqqio0NixY3Xw4EFNnDhRknTw4EGlpaXpD3/4g5KSkvTb3/5WWVlZqqyslMvlkiRt3rxZ8+fPV01NjYYMGdKmX5/PJ5/PZ643NjYqISFBHo/nsvUAgK7T0NCgqKgoZS3botDwwe3WtnibtO2R2aqvr9f111/fMw2i12hsbJTD4bjqv99BdU2Tx+ORJEVHR0uSTp48qerqaqWnp5s1drtdd9xxh/bv3y9JKi0tVUtLi1+Ny+VScnKyWXPgwAE5HA4zMElSamqqHA6HX01ycrIZmCQpIyNDPp9PpaWll+13+fLl5uk+h8OhhISErvg1AAA6IDR8sMIGRba7XC1UAVYETWgyDEP5+fn65je/qeTkZElSdXW1JCkuLs6vNi4uzhyrrq5WWFiYoqKi2q2JjY1t85yxsbF+NZc+T1RUlMLCwsyaSy1dulQej8dcKisrO3rYAACglxgQ6Aa+tGjRIn344Yfat29fmzGbzea3bhhGm22XurTmcvWdqfkqu90uu93ebh8AAKBvCIqZpsWLF+vNN9/UO++8oxtuuMHc7nQ6JanNTE9NTY05K+R0OtXc3Kz6+vp2a86ePdvmeWtra/1qLn2e+vp6tbS0tJmBAgAA/U9AQ5NhGFq0aJFee+017dmzRyNHjvQbHzlypJxOp0pKSsxtzc3N2rt3ryZNmiRJSklJUWhoqF9NVVWVysvLzZq0tDR5PB4dPnzYrDl06JA8Ho9fTXl5uaqqqsyaXbt2yW63KyUlpesPHgAA9CoBPT330EMP6ZVXXtEbb7yhyMhIc6bH
4XAoPDxcNptNeXl5WrZsmUaNGqVRo0Zp2bJlGjRokLKzs83aBQsWaMmSJRo6dKiio6NVUFCgcePG6e6775YkjRkzRtOmTVNOTo7WrVsnSVq4cKGysrKUlJQkSUpPT9fYsWPldru1YsUKnTt3TgUFBcrJyeFOOAAAENjQ9Pzzz0uSpkyZ4rf9xRdf1Pz58yVJP/rRj+T1evXggw+qvr5eEydO1K5duxQZGWnWr169WgMGDNCcOXPk9Xo1depUbdiwQSEhIWZNUVGRcnNzzbvsZsyYobVr15rjISEh2r59ux588EFNnjxZ4eHhys7O1jPPPNNNRw8AAHqToPqcpt7O6uc8AACu3Zef03Tf6p0KGxTZbm3zZ+f1+r9l8DlNuKxe+TlNAAAAwYrQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYQmgAAACwgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWEJoAAAAsIDQBAABYQGgCAACwgNAEAABgAaEJAADAAkITAACABYQmAAAACwhNAAAAFhCaAAAALCA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABYMCHQDAIC+z+v1yufzWaq12+0KDw/v5o6AjiM0AQC6ldfrVULiCNXV1liqHzosVpV/PkVwQtAJ6Om59957T9OnT5fL5ZLNZtPWrVv9xm0222WXFStWmDVTpkxpM37//ff77ae+vl5ut1sOh0MOh0Nut1sNDQ1+NadPn9b06dMVERGhmJgY5ebmqrm5ubsOHQD6DZ/Pp7raGmUt26L7Vu9sd8latkV1tTWWZ6WAnhTQmaYLFy7olltu0T//8z9r9uzZbcarqqr81n/7299qwYIFbWpzcnL05JNPmuuX/t9Jdna2zpw5o+LiYknSwoUL5Xa79dZbb0mSWltbde+992rYsGHat2+f6urqNG/ePBmGoTVr1nTJsQJAfxcaPlhhgyID3QbQaQENTZmZmcrMzLziuNPp9Ft/4403dOedd+prX/ua3/ZBgwa1qf1SRUWFiouLdfDgQU2cOFGStH79eqWlpen48eNKSkrSrl27dOzYMVVWVsrlckmSVq5cqfnz5+upp57SkCFDruUwAQBAH9Br7p47e/astm/frgULFrQZKyoqUkxMjG666SYVFBTo/Pnz5tiBAwfkcDjMwCRJqampcjgc2r9/v1mTnJxsBiZJysjIkM/nU2lp6RV78vl8amxs9FsAAEDf1GsuBH/ppZcUGRmpWbNm+W3/7ne/q5EjR8rpdKq8vFxLly7V73//e5WUlEiSqqurFRsb22Z/sbGxqq6uNmvi4uL8xqOiohQWFmbWXM7y5cv1k5/85FoPDQAA9AK9JjT9+te/1ne/+10NHDjQb3tOTo75c3JyskaNGqUJEybogw8+0K233irpiwvKL2UYht92KzWXWrp0qfLz8831xsZGJSQkWD8oAADQa/SK03O/+93vdPz4cf3gBz+4au2tt96q0NBQnThxQtIX10WdPXu2TV1tba05u+R0OtvMKNXX16ulpaXNDNRX2e12DRkyxG8BAAB9U68ITS+88IJSUlJ0yy23XLX26NGjamlpUXx8vCQpLS1NHo9Hhw8fNmsOHTokj8ejSZMmmTXl5eV+d+vt2rVLdrtdKSkpXXw0AACgNwro6bmmpiZ9/PHH5vrJkydVVlam6OhoDR8+XNIXp7z+8z//UytXrmzz+D/+8Y8qKirSt7/9bcXExOjYsWNasmSJxo8fr8mTJ0uSxowZo2nTpiknJ0fr1q2T9MVHDmRlZSkpKUmSlJ6errFjx8rtdmvFihU6d+6cCgoKlJOTw+wRAACQFOCZpvfff1/jx4/X+PHjJUn5+fkaP368Hn/8cbNm8+bNMgxD3/nOd9o8PiwsTG+//bYyMjKUlJSk3Nxcpaena/fu3QoJCTHrioqKNG7cOKWnpys9PV0333yz
Nm7caI6HhIRo+/btGjhwoCZPnqw5c+Zo5syZeuaZZ7rx6AEAQG8S0JmmKVOmyDCMdmsWLlyohQsXXnYsISFBe/fuverzREd
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Finally, let's also explore the hour of the day.\n",
    "# Hour ranges over 0-23 (presumably whole hours), so draw one bar per hour with discrete=True\n",
    "# instead of 40 bins, which leaves empty bins between hours and makes the bars look unevenly spaced.\n",
    "sns.histplot(data=data['Hour'], discrete=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "3f9a5c00",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AgeInDays</th>\n",
       "      <th>Year</th>\n",
       "      <th>Month</th>\n",
       "      <th>HourCosine</th>\n",
       "      <th>BirthMonth</th>\n",
       "      <th>HasName</th>\n",
       "      <th>Type_Cat</th>\n",
       "      <th>Type_Dog</th>\n",
       "      <th>Type_Livestock</th>\n",
       "      <th>Sex_Intact Male</th>\n",
       "      <th>Sex_Neutered Male</th>\n",
       "      <th>Sex_Spayed Female</th>\n",
       "      <th>Sex_Unknown</th>\n",
       "      <th>Outcome_Adoption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14</td>\n",
       "      <td>2014</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>60</td>\n",
       "      <td>2014</td>\n",
       "      <td>-0.866025</td>\n",
       "      <td>1.0</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28</td>\n",
       "      <td>2014</td>\n",
       "      <td>-0.866025</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14</td>\n",
       "      <td>2014</td>\n",
       "      <td>-0.866025</td>\n",
       "      <td>1.0</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>150</td>\n",
       "      <td>2014</td>\n",
       "      <td>-0.500000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   AgeInDays  Year     Month  HourCosine  BirthMonth  HasName  Type_Cat  \\\n",
       "0         14  2014 -1.000000         1.0           5    False     False   \n",
       "1         60  2014 -0.866025         1.0           4     True     False   \n",
       "2         28  2014 -0.866025         1.0           6    False     False   \n",
       "3         14  2014 -0.866025         1.0           6    False     False   \n",
       "4        150  2014 -0.500000         1.0           3    False     False   \n",
       "\n",
       "   Type_Dog  Type_Livestock  Sex_Intact Male  Sex_Neutered Male  \\\n",
       "0     False           False            False              False   \n",
       "1      True           False            False              False   \n",
       "2     False           False            False              False   \n",
       "3     False           False            False              False   \n",
       "4     False           False             True              False   \n",
       "\n",
       "   Sex_Spayed Female  Sex_Unknown  Outcome_Adoption  \n",
       "0              False         True             False  \n",
       "1               True        False              True  \n",
       "2              False         True             False  \n",
       "3              False         True             False  \n",
       "4              False        False             False  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now for PART 2\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Let's first get rid of the normal hour count, since we have the cosine version and have\n",
    "# visualized the distribution of hours.\n",
    "# errors='ignore' keeps the cell re-runnable: without it, a second run raises KeyError because\n",
    "# 'Hour' has already been dropped from `data` in place.\n",
    "data.drop(columns=['Hour'], inplace=True, errors='ignore')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17384e1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   AgeInDays  Year     Month  HourCosine  BirthMonth  HasName  Type_Cat  \\\n",
      "0         14  2014 -1.000000         1.0           5    False     False   \n",
      "1         60  2014 -0.866025         1.0           4     True     False   \n",
      "2         28  2014 -0.866025         1.0           6    False     False   \n",
      "3         14  2014 -0.866025         1.0           6    False     False   \n",
      "4        150  2014 -0.500000         1.0           3    False     False   \n",
      "\n",
      "   Type_Dog  Type_Livestock  Sex_Intact Male  Sex_Neutered Male  \\\n",
      "0     False           False            False              False   \n",
      "1      True           False            False              False   \n",
      "2     False           False            False              False   \n",
      "3     False           False            False              False   \n",
      "4     False           False             True              False   \n",
      "\n",
      "   Sex_Spayed Female  Sex_Unknown  \n",
      "0              False         True  \n",
      "1               True        False  \n",
      "2              False         True  \n",
      "3              False         True  \n",
      "4              False        False  \n"
     ]
    }
   ],
   "source": [
    "# Some of the following was informed by AI, but not directly generated. See [6]\n",
    "# Features: the contiguous run of columns from 'AgeInDays' through 'Sex_Unknown'\n",
    "# (label slices include both endpoints).\n",
    "x = data.loc[:, 'AgeInDays':'Sex_Unknown']\n",
    "# Target: the Outcome_Adoption column.\n",
    "y = data['Outcome_Adoption']\n",
    "print(x.head())\n",
    "\n",
    "# Hold out 30% of the rows for testing.\n",
    "# stratify=y keeps the transfer-to-adoption proportion roughly the same in the training and test sets.\n",
    "# random_state=1 fixes the shuffle so the split is reproducible.\n",
    "x_train, x_test, y_train, y_test = train_test_split(\n",
    "    x, y, test_size=0.3, random_state=1, stratify=y\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4104bd2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Performance on TRAINING\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.35      1.00      0.52     31916\n",
      "        True       0.00      0.00      0.00     58426\n",
      "\n",
      "    accuracy                           0.35     90342\n",
      "   macro avg       0.18      0.50      0.26     90342\n",
      "weighted avg       0.12      0.35      0.18     90342\n",
      "\n",
      "Performance on TEST\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.35      1.00      0.52     13678\n",
      "        True       0.00      0.00      0.00     25041\n",
      "\n",
      "    accuracy                           0.35     38719\n",
      "   macro avg       0.18      0.50      0.26     38719\n",
      "weighted avg       0.12      0.35      0.18     38719\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n",
      "/opt/miniconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py:1731: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", result.shape[0])\n"
     ]
    }
   ],
   "source": [
    "# We will first perform linear classification via Stochastic Gradient Descent.\n",
    "\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# SGD is sensitive to feature scale. Fed the raw columns, where AgeInDays reaches the thousands and\n",
    "# Year is around 2000 while the indicator columns are only True/False, the classifier predicted a\n",
    "# single class for every row. Standardizing inside a pipeline puts the features on a comparable\n",
    "# scale, and the scaler is fitted on the training split only, so no test-set statistics leak in.\n",
    "clf = make_pipeline(\n",
    "    StandardScaler(),\n",
    "    SGDClassifier(loss=\"perceptron\", alpha=0.05, random_state=1),\n",
    ")\n",
    "\n",
    "clf.fit(x_train, y_train)\n",
    "\n",
    "def report(classifier):\n",
    "  \"\"\"Print accuracy, precision, recall, and F1 for `classifier` on the training and test splits.\n",
    "\n",
    "  Reads the notebook-level x_train / y_train and x_test / y_test, so the cell that creates the\n",
    "  split must have run first. `classifier` must already be fitted and provide predict().\n",
    "  \"\"\"\n",
    "  splits = ((\"TRAINING\", x_train, y_train), (\"TEST\", x_test, y_test))\n",
    "  for split_name, features, labels in splits:\n",
    "    # zero_division=0 reports 0.0 for a class that is never predicted (the value sklearn already\n",
    "    # falls back to) without emitting an UndefinedMetricWarning for each affected metric.\n",
    "    summary = classification_report(labels, classifier.predict(features), zero_division=0)\n",
    "    print(f\"Performance on {split_name}\\n*****************\\n{summary}\")\n",
    "\n",
    "report(clf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b82c29be",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Performance on TRAINING\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.91      0.83      0.87     31916\n",
      "        True       0.91      0.96      0.93     58426\n",
      "\n",
      "    accuracy                           0.91     90342\n",
      "   macro avg       0.91      0.89      0.90     90342\n",
      "weighted avg       0.91      0.91      0.91     90342\n",
      "\n",
      "Performance on TEST\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.81      0.71      0.76     13678\n",
      "        True       0.85      0.91      0.88     25041\n",
      "\n",
      "    accuracy                           0.84     38719\n",
      "   macro avg       0.83      0.81      0.82     38719\n",
      "weighted avg       0.84      0.84      0.84     38719\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Next: k-nearest neighbors classification with a fixed k and no cross-validation.\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "# k=3 is an arbitrary choice. fit() returns the estimator itself, so building the model and\n",
    "# fitting it to the training set can be chained.\n",
    "knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)\n",
    "\n",
    "# NOTE(review): the features are passed in unscaled, so neighbor distances are presumably dominated\n",
    "# by the large-valued columns (AgeInDays, Year) - consider standardizing; confirm the effect on scores.\n",
    "report(knn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ed14aaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Performance on TRAINING\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.81      1.00      0.89     31916\n",
      "        True       1.00      0.87      0.93     58426\n",
      "\n",
      "    accuracy                           0.91     90342\n",
      "   macro avg       0.90      0.93      0.91     90342\n",
      "weighted avg       0.93      0.91      0.92     90342\n",
      "\n",
      "Performance on TEST\n",
      "*****************\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.67      0.82      0.74     13678\n",
      "        True       0.89      0.78      0.83     25041\n",
      "\n",
      "    accuracy                           0.80     38719\n",
      "   macro avg       0.78      0.80      0.79     38719\n",
      "weighted avg       0.81      0.80      0.80     38719\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Finally, we will perform k-nearest neighbors classification with cross-validation.\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "knn_search = KNeighborsClassifier()\n",
    "param_grid = {\"n_neighbors\": np.arange(1, 100)}  # candidate k values: 1 through 99\n",
    "\n",
    "# 99 candidates x 5 folds = 495 fits. n_jobs=-1 spreads them over all CPU cores; it only changes\n",
    "# how long the search takes, not which k is selected.\n",
    "# scoring=\"precision\": the most important metric from a business perspective.\n",
    "knn_gscv = GridSearchCV(knn_search, param_grid, cv=5, scoring=\"precision\", n_jobs=-1)\n",
    "\n",
    "# Run the search on the training set only. With GridSearchCV's default refit=True, best_estimator_\n",
    "# is the model refit on the whole training set using the winning k.\n",
    "knn_gscv.fit(x_train, y_train)\n",
    "best_knn = knn_gscv.best_estimator_\n",
    "\n",
    "report(best_knn)\n",
    "k_param = knn_gscv.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "22d2e4ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'n_neighbors': np.int64(2)}\n"
     ]
    }
   ],
   "source": [
    "# Show the number of neighbors chosen by the grid search.\n",
    "print(knn_gscv.best_params_)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24412ca1",
   "metadata": {},
   "source": [
    "From a business perspective, the most important metric is precision, TP / (TP + FP), since it is more costly for a shelter to keep an animal that will eventually be transferred than to transfer it as soon as possible. False positives would cause the shelter to retain animals that have a low chance of being adopted, rather than transferring them to other shelters, where they may have a higher chance of adoption."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}