diff --git a/data/raw/Travel details dataset.csv b/data/raw/Travel details dataset.csv
new file mode 100644
index 00000000..f3c1b7e5
--- /dev/null
+++ b/data/raw/Travel details dataset.csv
@@ -0,0 +1,140 @@
+Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
+1,"London, UK",5/1/2023,5/8/2023,7,John Smith,35,Male,American,Hotel,1200,Flight,600
+2,"Phuket, Thailand",6/15/2023,6/20/2023,5,Jane Doe,28,Female,Canadian,Resort,800,Flight,500
+3,"Bali, Indonesia",7/1/2023,7/8/2023,7,David Lee,45,Male,Korean,Villa,1000,Flight,700
+4,"New York, USA",8/15/2023,8/29/2023,14,Sarah Johnson,29,Female,British,Hotel,2000,Flight,1000
+5,"Tokyo, Japan",9/10/2023,9/17/2023,7,Kim Nguyen,26,Female,Vietnamese,Airbnb,700,Train,200
+6,"Paris, France",10/5/2023,10/10/2023,5,Michael Brown,42,Male,American,Hotel,1500,Flight,800
+7,"Sydney, Australia",11/20/2023,11/30/2023,10,Emily Davis,33,Female,Australian,Hostel,500,Flight,1200
+8,"Rio de Janeiro, Brazil",1/5/2024,1/12/2024,7,Lucas Santos,25,Male,Brazilian,Airbnb,900,Flight,600
+9,"Amsterdam, Netherlands",2/14/2024,2/21/2024,7,Laura Janssen,31,Female,Dutch,Hotel,1200,Train,200
+10,"Dubai, United Arab Emirates",3/10/2024,3/17/2024,7,Mohammed Ali,39,Male,Emirati,Resort,2500,Flight,800
+11,"Cancun, Mexico",4/1/2024,4/8/2024,7,Ana Hernandez,27,Female,Mexican,Hotel,1000,Flight,500
+12,"Barcelona, Spain",5/15/2024,5/22/2024,7,Carlos Garcia,36,Male,Spanish,Airbnb,800,Train,100
+13,"Honolulu, Hawaii",6/10/2024,6/18/2024,8,Lily Wong,29,Female,Chinese,Resort,3000,Flight,1200
+14,"Berlin, Germany",7/1/2024,7/10/2024,9,Hans Mueller,48,Male,German,Hotel,1400,Flight,700
+15,"Marrakech, Morocco",8/20/2024,8/27/2024,7,Fatima Khouri,26,Female,Moroccan,Riad,600,Flight,400
+16,"Edinburgh, Scotland",9/5/2024,9/12/2024,7,James MacKenzie,32,Male,Scottish,Hotel,900,Train,150
+17,Paris,9/1/2023,9/10/2023,9,Sarah Johnson,30,Female,American,Hotel,$900 ,Plane,$400
+18,Bali,8/15/2023,8/25/2023,10,Michael Chang,28,Male,Chinese,Resort,"$1,500 ",Plane,$700
+19,London,7/22/2023,7/28/2023,6,Olivia Rodriguez,35,Female,British,Hotel,"$1,200 ",Train,$150
+20,Tokyo,10/5/2023,10/15/2023,10,Kenji Nakamura,45,Male,Japanese,Hotel,"$1,200 ",Plane,$800
+21,New York,11/20/2023,11/25/2023,5,Emily Lee,27,Female,American,Airbnb,$600 ,Bus,$100
+22,Sydney,12/5/2023,12/12/2023,7,James Wilson,32,Male,Australian,Hotel,"$1,000 ",Plane,$600
+23,Rome,11/1/2023,11/8/2023,7,Sofia Russo,29,Female,Italian,Airbnb,$700 ,Train,$80
+24,Bangkok,9/15/2023,9/23/2023,8,Raj Patel,40,Male,Indian,Hostel,$400 ,Plane,$500
+25,Paris,12/22/2023,12/28/2023,6,Lily Nguyen,24,Female,Vietnamese,Hotel,"$1,400 ",Train,$100
+26,Hawaii,8/1/2023,8/10/2023,9,David Kim,34,Male,Korean,Resort,"$2,000 ",Plane,$800
+27,Barcelona,10/20/2023,10/28/2023,8,Maria Garcia,31,Female,Spanish,Hotel,"$1,100 ",Train,$150
+28,Japan,5/10/2022,5/18/2022,8,Alice Smith,30,Female,American,Hotel,$800 ,Plane,$500
+29,Thailand,6/15/2022,6/22/2022,7,Bob Johnson,45,Male,Canadian,Hostel,$200 ,Train,$150
+30,France,7/2/2022,7/11/2022,9,Charlie Lee,25,Male,Korean,Airbnb,$600 ,Car rental,$300
+31,Australia,8/20/2022,9/2/2022,13,Emma Davis,28,Female,British,Hotel,"$1,000 ",Car rental,$500
+32,Brazil,9/5/2022,9/14/2022,9,Olivia Martin,33,Female,Australian,Hostel,$150 ,Bus,$50
+33,Greece,10/12/2022,10/20/2022,8,Harry Wilson,20,Male,American,Airbnb,$400 ,Plane,$600
+34,Egypt,11/8/2022,11/15/2022,7,Sophia Lee,37,Female,Canadian,Hotel,$700 ,Train,$100
+35,Mexico,1/5/2023,1/15/2023,10,James Brown,42,Male,British,Airbnb,$500 ,Plane,$800
+36,Italy,2/14/2023,2/20/2023,6,Mia Johnson,31,Female,American,Hostel,$180 ,Train,$120
+37,Spain,3/23/2023,3/31/2023,8,William Davis,27,Male,Korean,Hotel,$900 ,Car rental,$400
+38,Canada,4/19/2023,4/26/2023,7,Amelia Brown,38,Female,Australian,Airbnb,$350 ,Bus,$75
+39,"Paris, France",6/12/2022,6/19/2022,7,Mia Johnson,25,Female,American,Hotel,1400,Plane,600
+40,"Sydney, Australia",1/2/2023,1/9/2023,7,Adam Lee,33,Male,Canadian,Airbnb,800,Train,150
+41,"Tokyo, Japan",12/10/2022,12/18/2022,8,Sarah Wong,28,Female,Chinese,Hostel,500,Plane,900
+42,"Cancun, Mexico",7/1/2023,7/8/2023,7,John Smith,45,Male,American,Resort,2200,Plane,800
+43,"Rio de Janeiro, Brazil",11/20/2022,11/27/2022,7,Maria Silva,30,Female,Brazilian,Hotel,1200,Plane,700
+44,"London, UK",3/5/2023,3/12/2023,7,Peter Brown,55,Male,British,Airbnb,900,Train,100
+45,"Barcelona, Spain",8/18/2023,8/25/2023,7,Emma Garcia,27,Female,Spanish,Hostel,600,Plane,600
+46,"New York City, USA",9/15/2022,9/22/2022,7,Michael Davis,41,Male,American,Hotel,1500,Plane,500
+47,"Bangkok, Thailand",5/1/2023,5/7/2023,6,Nina Patel,29,Female,Indian,Airbnb,500,Bus,50
+48,"Vancouver, Canada",7/10/2022,7/17/2022,7,Kevin Kim,24,Male,Korean,Hostel,400,Train,150
+49,"Amsterdam, Netherlands",6/20/2023,6/28/2023,8,Laura van den Berg,31,Female,Dutch,Hotel,1100,Plane,700
+50,"Paris, France",8/15/2023,8/22/2023,7,Jennifer Nguyen,31,Female,Canadian,Hotel,"$1,200 ",Train,$300
+51,"Tokyo, Japan",10/10/2023,10/20/2023,10,David Kim,25,Male,American,Hostel,$500 ,Bus,$100
+52,"Sydney, AUS",11/5/2023,11/12/2023,7,Rachel Lee,27,Female,South Korean,Airbnb,$900 ,Car rental,$200
+53,"New York, USA",12/24/2023,12/31/2023,7,Jessica Wong,28,Female,Canadian,Hotel,"$1,400 ",Flight,$800
+54,"Rio de Janeiro, Brazil",1/15/2024,1/24/2024,9,Felipe Almeida,30,Male,Brazilian,Airbnb,$800 ,Train,$150
+55,"Bangkok, Thailand",2/1/2024,2/9/2024,8,Nisa Patel,23,Female,Indian,Hostel,$400 ,Bus,$50
+56,"London, UK",3/15/2024,3/23/2024,8,Ben Smith,35,Male,British,Hotel,"$1,000 ",Train,$200
+57,"Barcelona, Spain",4/5/2024,4/13/2024,8,Laura Gomez,29,Female,Spanish,Airbnb,$700 ,Car rental,$250
+58,"Seoul, South Korea",5/10/2024,5/18/2024,8,Park Min Woo,27,Male,South Korean,Hostel,$500 ,Subway,$20
+59,"Los Angeles, USA",6/20/2024,6/27/2024,7,Michael Chen,26,Male,Chinese,Hotel,"$1,200 ",Car rental,$300
+60,"Rome, Italy",7/15/2024,7/23/2024,8,Sofia Rossi,33,Female,Italian,Airbnb,$800 ,Train,$100
+61,Paris,7/12/2022,7/18/2022,6,Rachel Sanders,35,Female,American,Hotel,1200,Plane,800
+62,Tokyo,9/3/2022,9/10/2022,7,Kenji Nakamura,28,Male,Japanese,Hostel,400,Train,300
+63,Cape Town,1/7/2023,1/16/2023,9,Emily Watson,29,Female,British,Vacation rental,800,Car rental,200
+64,Sydney,6/23/2023,6/29/2023,6,David Lee,43,Male,Australian,Hotel,1500,Plane,1200
+65,Barcelona,8/18/2023,8/25/2023,7,Ana Rodriguez,31,Female,Spanish,Vacation rental,900,Plane,700
+66,Bali,2/1/2024,2/8/2024,7,Tom Wilson,27,Male,American,Resort,2200,Plane,1000
+67,Paris,5/6/2024,5/12/2024,6,Olivia Green,39,Female,French,Hotel,1100,Train,200
+68,New York,7/20/2024,7/26/2024,6,James Chen,25,Male,American,Vacation rental,1000,Plane,800
+69,Bangkok,9/8/2024,9/16/2024,8,Lila Patel,33,Female,Indian,Hostel,300,Plane,700
+70,Rome,2/14/2025,2/20/2025,6,Marco Rossi,41,Male,Italian,Hotel,1300,Train,100
+71,Bali,5/21/2025,5/29/2025,8,Sarah Brown,37,Female,British,Resort,1800,Plane,1000
+72,,,,,,,,,,,,
+73,"Bali, Indonesia",8/5/2022,8/12/2022,7,Sarah Lee,35,Female,South Korean,Resort,500 USD,Plane,800 USD
+74,"Tokyo, Japan",1/1/2023,1/9/2023,8,Alex Kim,29,Male,American,Hotel,1000 USD,Train,200 USD
+75,"Cancun, Mexico",4/15/2023,4/22/2023,7,Maria Hernandez,42,Female,Mexican,Resort,800 USD,Plane,500 USD
+76,"Paris, France",6/7/2023,6/14/2023,7,John Smith,46,Male,British,Hotel,1200 USD,Plane,700 USD
+77,"Cape Town, SA",9/1/2023,9/10/2023,9,Mark Johnson,31,Male,South African,Guesthouse,400 USD,Car,300 USD
+78,"Bali, Indonesia",11/12/2023,11/19/2023,7,Amanda Chen,25,Female,Taiwanese,Resort,600 USD,Plane,700 USD
+79,"Sydney, Aus",2/5/2024,2/12/2024,7,David Lee,38,Male,Australian,Hotel,900 USD,Plane,600 USD
+80,"Bangkok, Thai",5/15/2024,5/22/2024,7,Nana Kwon,27,Female,Korean,Hotel,400 USD,Plane,400 USD
+81,"New York, USA",8/20/2024,8/27/2024,7,Tom Hanks,60,Male,American,Hotel,1500 USD,Plane,1000 USD
+82,"Phuket, Thai",1/1/2025,1/8/2025,7,Emma Watson,32,Female,British,Resort,700 USD,Plane,800 USD
+83,"Rome, Italy",4/15/2025,4/22/2025,7,James Kim,41,Male,American,Hotel,100,,
+84,Paris,6/15/2021,6/20/2021,6,John Smith,35,Male,American,Hotel,800 USD,Plane,500 USD
+85,Tokyo,7/1/2021,7/10/2021,10,Sarah Lee,28,Female,Korean,Airbnb,500 USD,Train,300 USD
+86,Bali,8/10/2021,8/20/2021,11,Maria Garcia,42,Female,Spanish,Resort,1200 USD,Plane,700 USD
+87,Sydney,9/1/2021,9/10/2021,9,David Lee,45,Male,Australian,Hotel,900 USD,Plane,600 USD
+88,New York,10/15/2021,10/20/2021,6,Emily Davis,31,Female,American,Airbnb,700 USD,Car rental,200 USD
+89,London,11/20/2021,11/30/2021,11,James Wilson,29,Male,British,Hostel,300 USD,Plane,400 USD
+90,Dubai,1/1/2022,1/8/2022,8,Fatima Ahmed,24,Female,Emirati,Hotel,1000 USD,Plane,800 USD
+91,Bangkok,2/14/2022,2/20/2022,7,Liam Nguyen,26,Male,Vietnamese,Airbnb,400 USD,Train,100 USD
+92,Rome,3/10/2022,3/20/2022,11,Giulia Rossi,30,Female,Italian,Hostel,200 USD,Plane,350 USD
+93,Bali,4/15/2022,4/25/2022,11,Putra Wijaya,33,Male,Indonesian,Villa,1500 USD,Car rental,300 USD
+94,Seoul,5/1/2022,5/10/2022,10,Kim Min-ji,27,Female,Korean,Hotel,800 USD,Train,150 USD
+95,Paris,6/15/2022,6/20/2022,5,John Smith,35,Male,USA,Hotel,$500 ,Plane,$800
+96,Tokyo,9/1/2022,9/10/2022,9,Emily Johnson,28,Female,Canada,Airbnb,$400 ,Train,$200
+97,Sydney,11/23/2022,12/2/2022,9,David Lee,45,Male,South Korea,Hostel,$200 ,Plane,"$1,200 "
+98,London,2/14/2023,2/19/2023,5,Sarah Brown,37,Female,UK,Hotel,$600 ,Plane,$700
+99,New York,5/8/2023,5/14/2023,6,Michael Wong,50,Male,China,Airbnb,$800 ,Car rental,$300
+100,Rome,8/20/2023,8/27/2023,7,Jessica Chen,31,Female,Taiwan,Hotel,$700 ,Plane,$900
+101,Bangkok,11/12/2023,11/20/2023,8,Ken Tanaka,42,Male,Japan,Hostel,$300 ,Train,$100
+102,Cape Town,1/6/2024,1/14/2024,8,Maria Garcia,27,Female,Spain,Airbnb,$500 ,Plane,"$1,500 "
+103,Rio de Janeiro,4/3/2024,4/10/2024,7,Rodrigo Oliveira,33,Male,Brazil,Hotel,$900 ,Car rental,$400
+104,Bali,7/22/2024,7/28/2024,6,Olivia Kim,29,Female,South Korea,Villa,"$1,200 ",Plane,"$1,000 "
+105,Amsterdam,10/10/2024,10/17/2024,7,Robert Mueller,41,Male,Germany,Hotel,$600 ,Train,$150
+106,Paris,5/15/2022,5/20/2022,5,John Smith,35,Male,USA,Hotel,1000,Plane,800
+107,Tokyo,9/1/2022,9/10/2022,9,Sarah Lee,28,Female,South Korea,Airbnb,800,Train,500
+108,New York,6/20/2022,6/25/2022,5,Michael Wong,42,Male,Hong Kong,Hotel,1200,Car rental,200
+109,Bali,8/12/2022,8/20/2022,8,Lisa Chen,30,Female,Taiwan,Resort,1500,Plane,1200
+110,Sydney,7/1/2022,7/10/2022,9,David Kim,26,Male,Canada,Hostel,300,Plane,900
+111,London,6/10/2022,6/15/2022,5,Emily Wong,38,Female,United Kingdom,Hotel,900,Train,150
+112,Phuket,9/5/2022,9/12/2022,7,Mark Tan,45,Male,Singapore,Villa,2000,Plane,700
+113,Rome,5/1/2022,5/8/2022,7,Emma Lee,31,Female,Italy,Hotel,1100,Train,250
+114,Santorini,7/15/2022,7/22/2022,7,George Chen,27,Male,Greece,Airbnb,1000,Ferry,150
+115,Dubai,8/25/2022,8/30/2022,5,Sophia Kim,29,Female,United Arab Emirates,Hotel,1500,Car rental,300
+116,Phnom Penh,9/10/2022,9/15/2022,5,Alex Ng,33,Male,Cambodia,Hostel,200,Plane,500
+117,"Tokyo, Japan",2/5/2022,2/14/2022,9,Alice Smith,32,Female,American,Hotel,1000,Plane,700
+118,"Paris, France",3/15/2022,3/22/2022,7,Bob Johnson,47,Male,Canadian,Hotel,1200,Train,500
+119,"Sydney, Aus",5/1/2022,5/12/2022,11,Cindy Chen,26,Female,Chinese,Airbnb,800,Plane,1000
+120,"Rome, Italy",6/10/2022,6/17/2022,7,David Lee,38,Male,Korean,Hotel,900,Train,400
+121,"Bali, Indonesia",7/20/2022,7/30/2022,10,Emily Kim,29,Female,Korean,Hostel,500,Plane,800
+122,"Cancun, Mexico",8/8/2022,8/16/2022,8,Frank Li,41,Male,American,Hotel,1300,Plane,600
+123,"Athens, Greece",9/20/2022,9/30/2022,10,Gina Lee,35,Female,Korean,Airbnb,700,Plane,900
+124,"Tokyo, Japan",10/5/2022,10/13/2022,8,Henry Kim,24,Male,Korean,Hotel,1200,Plane,700
+125,"Sydney, Aus",11/11/2022,11/21/2022,10,Isabella Chen,30,Female,Chinese,Airbnb,900,Plane,1000
+126,"Paris, France",12/24/2022,1/1/2023,8,Jack Smith,28,Male,American,Hostel,400,Plane,700
+127,"Bali, Indonesia",2/10/2023,2/18/2023,8,Katie Johnson,33,Female,Canadian,Hotel,800,Plane,800
+128,,,,,,,,,,,,
+129,"Paris, France",5/1/2023,5/7/2023,6,John Doe,35,Male,American,Hotel,5000,Airplane,2500
+130,"Tokyo, Japan",5/15/2023,5/22/2023,7,Jane Smith,28,Female,British,Airbnb,7000,Train,1500
+131,"Cape Town, South Africa",6/1/2023,6/10/2023,9,Michael Johnson,45,Male,South African,Hostel,3000,Car,2000
+132,"Sydney, Australia",6/15/2023,6/21/2023,6,Sarah Lee,31,Female,Australian,Hotel,6000,Airplane,3000
+133,"Rome, Italy",7/1/2023,7/8/2023,7,David Kim,42,Male,Korean,Airbnb,4000,Train,1500
+134,"New York City, USA",7/15/2023,7/22/2023,7,Emily Davis,27,Female,American,Hotel,8000,Airplane,2500
+135,"Rio de Janeiro, Brazil",8/1/2023,8/10/2023,9,Jose Perez,37,Male,Brazilian,Hostel,2500,Car,2000
+136,"Vancouver, Canada",8/15/2023,8/21/2023,6,Emma Wilson,29,Female,Canadian,Hotel,5000,Airplane,3000
+137,"Bangkok, Thailand",9/1/2023,9/8/2023,7,Ryan Chen,34,Male,Chinese,Hostel,2000,Train,1000
+138,"Barcelona, Spain",9/15/2023,9/22/2023,7,Sofia Rodriguez,25,Female,Spanish,Airbnb,6000,Airplane,2500
+139,"Auckland, New Zealand",10/1/2023,10/8/2023,7,William Brown,39,Male,New Zealander,Hotel,7000,Train,2500
diff --git a/notebooks/traveler_trip_dataset_hoaithuong.ipynb b/notebooks/traveler_trip_dataset_hoaithuong.ipynb
new file mode 100644
index 00000000..bf217058
--- /dev/null
+++ b/notebooks/traveler_trip_dataset_hoaithuong.ipynb
@@ -0,0 +1,1984 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "id": "eaa9e1ba-9051-4a18-8857-97b56f4b8140",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Trip ID | \n",
+ " Destination | \n",
+ " Start date | \n",
+ " End date | \n",
+ " Duration (days) | \n",
+ " Traveler name | \n",
+ " Traveler age | \n",
+ " Traveler gender | \n",
+ " Traveler nationality | \n",
+ " Accommodation type | \n",
+ " Accommodation cost | \n",
+ " Transportation type | \n",
+ " Transportation cost | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " London, UK | \n",
+ " 5/1/2023 | \n",
+ " 5/8/2023 | \n",
+ " 7.0 | \n",
+ " John Smith | \n",
+ " 35.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Phuket, Thailand | \n",
+ " 6/15/2023 | \n",
+ " 6/20/2023 | \n",
+ " 5.0 | \n",
+ " Jane Doe | \n",
+ " 28.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Resort | \n",
+ " 800 | \n",
+ " Flight | \n",
+ " 500 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " Bali, Indonesia | \n",
+ " 7/1/2023 | \n",
+ " 7/8/2023 | \n",
+ " 7.0 | \n",
+ " David Lee | \n",
+ " 45.0 | \n",
+ " Male | \n",
+ " Korean | \n",
+ " Villa | \n",
+ " 1000 | \n",
+ " Flight | \n",
+ " 700 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " New York, USA | \n",
+ " 8/15/2023 | \n",
+ " 8/29/2023 | \n",
+ " 14.0 | \n",
+ " Sarah Johnson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " British | \n",
+ " Hotel | \n",
+ " 2000 | \n",
+ " Flight | \n",
+ " 1000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Tokyo, Japan | \n",
+ " 9/10/2023 | \n",
+ " 9/17/2023 | \n",
+ " 7.0 | \n",
+ " Kim Nguyen | \n",
+ " 26.0 | \n",
+ " Female | \n",
+ " Vietnamese | \n",
+ " Airbnb | \n",
+ " 700 | \n",
+ " Train | \n",
+ " 200 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " Paris, France | \n",
+ " 10/5/2023 | \n",
+ " 10/10/2023 | \n",
+ " 5.0 | \n",
+ " Michael Brown | \n",
+ " 42.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1500 | \n",
+ " Flight | \n",
+ " 800 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 7 | \n",
+ " Sydney, Australia | \n",
+ " 11/20/2023 | \n",
+ " 11/30/2023 | \n",
+ " 10.0 | \n",
+ " Emily Davis | \n",
+ " 33.0 | \n",
+ " Female | \n",
+ " Australian | \n",
+ " Hostel | \n",
+ " 500 | \n",
+ " Flight | \n",
+ " 1200 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8 | \n",
+ " Rio de Janeiro, Brazil | \n",
+ " 1/5/2024 | \n",
+ " 1/12/2024 | \n",
+ " 7.0 | \n",
+ " Lucas Santos | \n",
+ " 25.0 | \n",
+ " Male | \n",
+ " Brazilian | \n",
+ " Airbnb | \n",
+ " 900 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 9 | \n",
+ " Amsterdam, Netherlands | \n",
+ " 2/14/2024 | \n",
+ " 2/21/2024 | \n",
+ " 7.0 | \n",
+ " Laura Janssen | \n",
+ " 31.0 | \n",
+ " Female | \n",
+ " Dutch | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Train | \n",
+ " 200 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 10 | \n",
+ " Dubai, United Arab Emirates | \n",
+ " 3/10/2024 | \n",
+ " 3/17/2024 | \n",
+ " 7.0 | \n",
+ " Mohammed Ali | \n",
+ " 39.0 | \n",
+ " Male | \n",
+ " Emirati | \n",
+ " Resort | \n",
+ " 2500 | \n",
+ " Flight | \n",
+ " 800 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Trip ID Destination Start date End date \\\n",
+ "0 1 London, UK 5/1/2023 5/8/2023 \n",
+ "1 2 Phuket, Thailand 6/15/2023 6/20/2023 \n",
+ "2 3 Bali, Indonesia 7/1/2023 7/8/2023 \n",
+ "3 4 New York, USA 8/15/2023 8/29/2023 \n",
+ "4 5 Tokyo, Japan 9/10/2023 9/17/2023 \n",
+ "5 6 Paris, France 10/5/2023 10/10/2023 \n",
+ "6 7 Sydney, Australia 11/20/2023 11/30/2023 \n",
+ "7 8 Rio de Janeiro, Brazil 1/5/2024 1/12/2024 \n",
+ "8 9 Amsterdam, Netherlands 2/14/2024 2/21/2024 \n",
+ "9 10 Dubai, United Arab Emirates 3/10/2024 3/17/2024 \n",
+ "\n",
+ " Duration (days) Traveler name Traveler age Traveler gender \\\n",
+ "0 7.0 John Smith 35.0 Male \n",
+ "1 5.0 Jane Doe 28.0 Female \n",
+ "2 7.0 David Lee 45.0 Male \n",
+ "3 14.0 Sarah Johnson 29.0 Female \n",
+ "4 7.0 Kim Nguyen 26.0 Female \n",
+ "5 5.0 Michael Brown 42.0 Male \n",
+ "6 10.0 Emily Davis 33.0 Female \n",
+ "7 7.0 Lucas Santos 25.0 Male \n",
+ "8 7.0 Laura Janssen 31.0 Female \n",
+ "9 7.0 Mohammed Ali 39.0 Male \n",
+ "\n",
+ " Traveler nationality Accommodation type Accommodation cost \\\n",
+ "0 American Hotel 1200 \n",
+ "1 Canadian Resort 800 \n",
+ "2 Korean Villa 1000 \n",
+ "3 British Hotel 2000 \n",
+ "4 Vietnamese Airbnb 700 \n",
+ "5 American Hotel 1500 \n",
+ "6 Australian Hostel 500 \n",
+ "7 Brazilian Airbnb 900 \n",
+ "8 Dutch Hotel 1200 \n",
+ "9 Emirati Resort 2500 \n",
+ "\n",
+ " Transportation type Transportation cost \n",
+ "0 Flight 600 \n",
+ "1 Flight 500 \n",
+ "2 Flight 700 \n",
+ "3 Flight 1000 \n",
+ "4 Train 200 \n",
+ "5 Flight 800 \n",
+ "6 Flight 1200 \n",
+ "7 Flight 600 \n",
+ "8 Train 200 \n",
+ "9 Flight 800 "
+ ]
+ },
+ "execution_count": 142,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "travel_trip_data = \"../data/raw/Travel details dataset.csv\"\n",
+ "travel_trip_hoaithuong_df = pd.read_csv(travel_trip_data, encoding='ISO-8859-1')\n",
+ "travel_trip_hoaithuong_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "id": "5215913a-b54e-4a6d-a07e-5e702c75ede4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(139, 13)"
+ ]
+ },
+ "execution_count": 143,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "id": "d0ae93e3-d769-4e0b-b4b1-9a8f7dee6b9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Trip ID', 'Destination', 'Start date', 'End date',\n",
+ " 'Duration (days)', 'Traveler name', 'Traveler age', 'Traveler gender',\n",
+ " 'Traveler nationality', 'Accommodation type', 'Accommodation cost',\n",
+ " 'Transportation type', 'Transportation cost'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 144,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "id": "f6b0a039-7cdc-461c-958f-4ee1634601be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "travel_trip_hoaithuong_df.columns = travel_trip_hoaithuong_df.columns.str.replace(\"\", \"\", regex=False).str.replace(r\"[()]\", \"\", regex=True).str.strip(\")\").str.replace(\" \",\"_\").str.lower()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "id": "6785b96f-411c-4f9e-82a6-d00dbc033aaf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['trip_id', 'destination', 'start_date', 'end_date', 'duration_days',\n",
+ " 'traveler_name', 'traveler_age', 'traveler_gender',\n",
+ " 'traveler_nationality', 'accommodation_type', 'accommodation_cost',\n",
+ " 'transportation_type', 'transportation_cost'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 146,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 147,
+ "id": "4835cbff-9092-476f-bd78-1ba752ecec33",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " destination | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " duration_days | \n",
+ " traveler_name | \n",
+ " traveler_age | \n",
+ " traveler_gender | \n",
+ " traveler_nationality | \n",
+ " accommodation_type | \n",
+ " accommodation_cost | \n",
+ " transportation_type | \n",
+ " transportation_cost | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " London, UK | \n",
+ " 5/1/2023 | \n",
+ " 5/8/2023 | \n",
+ " 7.0 | \n",
+ " John Smith | \n",
+ " 35.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Phuket, Thailand | \n",
+ " 6/15/2023 | \n",
+ " 6/20/2023 | \n",
+ " 5.0 | \n",
+ " Jane Doe | \n",
+ " 28.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Resort | \n",
+ " 800 | \n",
+ " Flight | \n",
+ " 500 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " Bali, Indonesia | \n",
+ " 7/1/2023 | \n",
+ " 7/8/2023 | \n",
+ " 7.0 | \n",
+ " David Lee | \n",
+ " 45.0 | \n",
+ " Male | \n",
+ " Korean | \n",
+ " Villa | \n",
+ " 1000 | \n",
+ " Flight | \n",
+ " 700 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " New York, USA | \n",
+ " 8/15/2023 | \n",
+ " 8/29/2023 | \n",
+ " 14.0 | \n",
+ " Sarah Johnson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " British | \n",
+ " Hotel | \n",
+ " 2000 | \n",
+ " Flight | \n",
+ " 1000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Tokyo, Japan | \n",
+ " 9/10/2023 | \n",
+ " 9/17/2023 | \n",
+ " 7.0 | \n",
+ " Kim Nguyen | \n",
+ " 26.0 | \n",
+ " Female | \n",
+ " Vietnamese | \n",
+ " Airbnb | \n",
+ " 700 | \n",
+ " Train | \n",
+ " 200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id destination start_date end_date duration_days \\\n",
+ "0 1 London, UK 5/1/2023 5/8/2023 7.0 \n",
+ "1 2 Phuket, Thailand 6/15/2023 6/20/2023 5.0 \n",
+ "2 3 Bali, Indonesia 7/1/2023 7/8/2023 7.0 \n",
+ "3 4 New York, USA 8/15/2023 8/29/2023 14.0 \n",
+ "4 5 Tokyo, Japan 9/10/2023 9/17/2023 7.0 \n",
+ "\n",
+ " traveler_name traveler_age traveler_gender traveler_nationality \\\n",
+ "0 John Smith 35.0 Male American \n",
+ "1 Jane Doe 28.0 Female Canadian \n",
+ "2 David Lee 45.0 Male Korean \n",
+ "3 Sarah Johnson 29.0 Female British \n",
+ "4 Kim Nguyen 26.0 Female Vietnamese \n",
+ "\n",
+ " accommodation_type accommodation_cost transportation_type \\\n",
+ "0 Hotel 1200 Flight \n",
+ "1 Resort 800 Flight \n",
+ "2 Villa 1000 Flight \n",
+ "3 Hotel 2000 Flight \n",
+ "4 Airbnb 700 Train \n",
+ "\n",
+ " transportation_cost \n",
+ "0 600 \n",
+ "1 500 \n",
+ "2 700 \n",
+ "3 1000 \n",
+ "4 200 "
+ ]
+ },
+ "execution_count": 147,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "id": "eacb80fb-b3a0-49a8-81b9-259bcb54911b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean the start_date column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "id": "d02c1da1-0f38-4260-92be-bb53a6a8791a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['5/1/2023', '6/15/2023', '7/1/2023', '8/15/2023', '9/10/2023',\n",
+ " '10/5/2023', '11/20/2023', '1/5/2024', '2/14/2024', '3/10/2024',\n",
+ " '4/1/2024', '5/15/2024', '6/10/2024', '7/1/2024', '8/20/2024',\n",
+ " '9/5/2024', '9/1/2023', '7/22/2023', '12/5/2023', '11/1/2023',\n",
+ " '9/15/2023', '12/22/2023', '8/1/2023', '10/20/2023', '5/10/2022',\n",
+ " '6/15/2022', '7/2/2022', '8/20/2022', '9/5/2022', '10/12/2022',\n",
+ " '11/8/2022', '1/5/2023', '2/14/2023', '3/23/2023', '4/19/2023',\n",
+ " '6/12/2022', '1/2/2023', '12/10/2022', '11/20/2022', '3/5/2023',\n",
+ " '8/18/2023', '9/15/2022', '7/10/2022', '6/20/2023', '10/10/2023',\n",
+ " '11/5/2023', '12/24/2023', '1/15/2024', '2/1/2024', '3/15/2024',\n",
+ " '4/5/2024', '5/10/2024', '6/20/2024', '7/15/2024', '7/12/2022',\n",
+ " '9/3/2022', '1/7/2023', '6/23/2023', '5/6/2024', '7/20/2024',\n",
+ " '9/8/2024', '2/14/2025', '5/21/2025', nan, '8/5/2022', '1/1/2023',\n",
+ " '4/15/2023', '6/7/2023', '11/12/2023', '2/5/2024', '1/1/2025',\n",
+ " '4/15/2025', '6/15/2021', '7/1/2021', '8/10/2021', '9/1/2021',\n",
+ " '10/15/2021', '11/20/2021', '1/1/2022', '2/14/2022', '3/10/2022',\n",
+ " '4/15/2022', '5/1/2022', '9/1/2022', '11/23/2022', '5/8/2023',\n",
+ " '8/20/2023', '1/6/2024', '4/3/2024', '7/22/2024', '10/10/2024',\n",
+ " '5/15/2022', '6/20/2022', '8/12/2022', '7/1/2022', '6/10/2022',\n",
+ " '7/15/2022', '8/25/2022', '9/10/2022', '2/5/2022', '3/15/2022',\n",
+ " '7/20/2022', '8/8/2022', '9/20/2022', '10/5/2022', '11/11/2022',\n",
+ " '12/24/2022', '2/10/2023', '5/15/2023', '6/1/2023', '7/15/2023',\n",
+ " '10/1/2023'], dtype=object)"
+ ]
+ },
+ "execution_count": 149,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['start_date'].unique()\n",
+ "# Extract the list of unique start_date values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "id": "ce61b023-32f0-4052-b222-3e0dce231312",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dtype('O')"
+ ]
+ },
+ "execution_count": 150,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['start_date'].dtype"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "id": "146c5905-e3ef-435a-a341-603a057d1b58",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " destination | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " duration_days | \n",
+ " traveler_name | \n",
+ " traveler_age | \n",
+ " traveler_gender | \n",
+ " traveler_nationality | \n",
+ " accommodation_type | \n",
+ " accommodation_cost | \n",
+ " transportation_type | \n",
+ " transportation_cost | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 71 | \n",
+ " 72 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 127 | \n",
+ " 128 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id destination start_date end_date duration_days traveler_name \\\n",
+ "71 72 NaN NaN NaN NaN NaN \n",
+ "127 128 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " traveler_age traveler_gender traveler_nationality accommodation_type \\\n",
+ "71 NaN NaN NaN NaN \n",
+ "127 NaN NaN NaN NaN \n",
+ "\n",
+ " accommodation_cost transportation_type transportation_cost \n",
+ "71 NaN NaN NaN \n",
+ "127 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 151,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df[travel_trip_hoaithuong_df['start_date'].isna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "id": "caf55e0d-37c1-4211-8978-43f323f88eb7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "np.int64(2)"
+ ]
+ },
+ "execution_count": 152,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df[\"start_date\"].isna().sum()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "id": "14139d28-be33-449f-9b2b-6c1266dad526",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "111"
+ ]
+ },
+ "execution_count": 153,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df[\"start_date\"].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "id": "622ecee8-8cc1-495e-a9d2-4e3c2fb6b05d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 2023-05-01\n",
+ "1 2023-06-15\n",
+ "2 2023-07-01\n",
+ "3 2023-08-15\n",
+ "4 2023-09-10\n",
+ " ... \n",
+ "134 2023-08-01\n",
+ "135 2023-08-15\n",
+ "136 2023-09-01\n",
+ "137 2023-09-15\n",
+ "138 2023-10-01\n",
+ "Name: start_date, Length: 139, dtype: datetime64[ns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(travel_trip_hoaithuong_df['start_date'], format='%m/%d/%Y')\n",
+ "print(travel_trip_hoaithuong_df['start_date'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
+ "id": "9517424f-c607-4eca-a11b-18de715df887",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " start_date\n",
+ "0 2023-05\n",
+ "1 2023-06\n",
+ "2 2023-07\n",
+ "3 2023-08\n",
+ "4 2023-09\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Convert the 'start_date' column to datetime first (if not already done)\n",
+ "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(\n",
+ " travel_trip_hoaithuong_df['start_date'], \n",
+ " errors='coerce'\n",
+ ")\n",
+ "\n",
+ "# Use .dt.strftime('%Y-%m') to keep only year and month; the column becomes 'YYYY-MM' strings.\n",
+ "travel_trip_hoaithuong_df['start_date'] = travel_trip_hoaithuong_df['start_date'].dt.strftime('%Y-%m')\n",
+ "\n",
+ "# Optional: preview the first few reformatted values\n",
+ "print(travel_trip_hoaithuong_df[['start_date']].head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 156,
+ "id": "6c20800d-1a44-4695-a500-70bbaf5f1399",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " destination | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " duration_days | \n",
+ " traveler_name | \n",
+ " traveler_age | \n",
+ " traveler_gender | \n",
+ " traveler_nationality | \n",
+ " accommodation_type | \n",
+ " accommodation_cost | \n",
+ " transportation_type | \n",
+ " transportation_cost | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " London, UK | \n",
+ " 2023-05 | \n",
+ " 5/8/2023 | \n",
+ " 7.0 | \n",
+ " John Smith | \n",
+ " 35.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Phuket, Thailand | \n",
+ " 2023-06 | \n",
+ " 6/20/2023 | \n",
+ " 5.0 | \n",
+ " Jane Doe | \n",
+ " 28.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Resort | \n",
+ " 800 | \n",
+ " Flight | \n",
+ " 500 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " Bali, Indonesia | \n",
+ " 2023-07 | \n",
+ " 7/8/2023 | \n",
+ " 7.0 | \n",
+ " David Lee | \n",
+ " 45.0 | \n",
+ " Male | \n",
+ " Korean | \n",
+ " Villa | \n",
+ " 1000 | \n",
+ " Flight | \n",
+ " 700 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " New York, USA | \n",
+ " 2023-08 | \n",
+ " 8/29/2023 | \n",
+ " 14.0 | \n",
+ " Sarah Johnson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " British | \n",
+ " Hotel | \n",
+ " 2000 | \n",
+ " Flight | \n",
+ " 1000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Tokyo, Japan | \n",
+ " 2023-09 | \n",
+ " 9/17/2023 | \n",
+ " 7.0 | \n",
+ " Kim Nguyen | \n",
+ " 26.0 | \n",
+ " Female | \n",
+ " Vietnamese | \n",
+ " Airbnb | \n",
+ " 700 | \n",
+ " Train | \n",
+ " 200 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 134 | \n",
+ " 135 | \n",
+ " Rio de Janeiro, Brazil | \n",
+ " 2023-08 | \n",
+ " 8/10/2023 | \n",
+ " 9.0 | \n",
+ " Jose Perez | \n",
+ " 37.0 | \n",
+ " Male | \n",
+ " Brazilian | \n",
+ " Hostel | \n",
+ " 2500 | \n",
+ " Car | \n",
+ " 2000 | \n",
+ "
\n",
+ " \n",
+ " | 135 | \n",
+ " 136 | \n",
+ " Vancouver, Canada | \n",
+ " 2023-08 | \n",
+ " 8/21/2023 | \n",
+ " 6.0 | \n",
+ " Emma Wilson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Hotel | \n",
+ " 5000 | \n",
+ " Airplane | \n",
+ " 3000 | \n",
+ "
\n",
+ " \n",
+ " | 136 | \n",
+ " 137 | \n",
+ " Bangkok, Thailand | \n",
+ " 2023-09 | \n",
+ " 9/8/2023 | \n",
+ " 7.0 | \n",
+ " Ryan Chen | \n",
+ " 34.0 | \n",
+ " Male | \n",
+ " Chinese | \n",
+ " Hostel | \n",
+ " 2000 | \n",
+ " Train | \n",
+ " 1000 | \n",
+ "
\n",
+ " \n",
+ " | 137 | \n",
+ " 138 | \n",
+ " Barcelona, Spain | \n",
+ " 2023-09 | \n",
+ " 9/22/2023 | \n",
+ " 7.0 | \n",
+ " Sofia Rodriguez | \n",
+ " 25.0 | \n",
+ " Female | \n",
+ " Spanish | \n",
+ " Airbnb | \n",
+ " 6000 | \n",
+ " Airplane | \n",
+ " 2500 | \n",
+ "
\n",
+ " \n",
+ " | 138 | \n",
+ " 139 | \n",
+ " Auckland, New Zealand | \n",
+ " 2023-10 | \n",
+ " 10/8/2023 | \n",
+ " 7.0 | \n",
+ " William Brown | \n",
+ " 39.0 | \n",
+ " Male | \n",
+ " New Zealander | \n",
+ " Hotel | \n",
+ " 7000 | \n",
+ " Train | \n",
+ " 2500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
139 rows × 13 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id destination start_date end_date duration_days \\\n",
+ "0 1 London, UK 2023-05 5/8/2023 7.0 \n",
+ "1 2 Phuket, Thailand 2023-06 6/20/2023 5.0 \n",
+ "2 3 Bali, Indonesia 2023-07 7/8/2023 7.0 \n",
+ "3 4 New York, USA 2023-08 8/29/2023 14.0 \n",
+ "4 5 Tokyo, Japan 2023-09 9/17/2023 7.0 \n",
+ ".. ... ... ... ... ... \n",
+ "134 135 Rio de Janeiro, Brazil 2023-08 8/10/2023 9.0 \n",
+ "135 136 Vancouver, Canada 2023-08 8/21/2023 6.0 \n",
+ "136 137 Bangkok, Thailand 2023-09 9/8/2023 7.0 \n",
+ "137 138 Barcelona, Spain 2023-09 9/22/2023 7.0 \n",
+ "138 139 Auckland, New Zealand 2023-10 10/8/2023 7.0 \n",
+ "\n",
+ " traveler_name traveler_age traveler_gender traveler_nationality \\\n",
+ "0 John Smith 35.0 Male American \n",
+ "1 Jane Doe 28.0 Female Canadian \n",
+ "2 David Lee 45.0 Male Korean \n",
+ "3 Sarah Johnson 29.0 Female British \n",
+ "4 Kim Nguyen 26.0 Female Vietnamese \n",
+ ".. ... ... ... ... \n",
+ "134 Jose Perez 37.0 Male Brazilian \n",
+ "135 Emma Wilson 29.0 Female Canadian \n",
+ "136 Ryan Chen 34.0 Male Chinese \n",
+ "137 Sofia Rodriguez 25.0 Female Spanish \n",
+ "138 William Brown 39.0 Male New Zealander \n",
+ "\n",
+ " accommodation_type accommodation_cost transportation_type \\\n",
+ "0 Hotel 1200 Flight \n",
+ "1 Resort 800 Flight \n",
+ "2 Villa 1000 Flight \n",
+ "3 Hotel 2000 Flight \n",
+ "4 Airbnb 700 Train \n",
+ ".. ... ... ... \n",
+ "134 Hostel 2500 Car \n",
+ "135 Hotel 5000 Airplane \n",
+ "136 Hostel 2000 Train \n",
+ "137 Airbnb 6000 Airplane \n",
+ "138 Hotel 7000 Train \n",
+ "\n",
+ " transportation_cost \n",
+ "0 600 \n",
+ "1 500 \n",
+ "2 700 \n",
+ "3 1000 \n",
+ "4 200 \n",
+ ".. ... \n",
+ "134 2000 \n",
+ "135 3000 \n",
+ "136 1000 \n",
+ "137 2500 \n",
+ "138 2500 \n",
+ "\n",
+ "[139 rows x 13 columns]"
+ ]
+ },
+ "execution_count": 156,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ " travel_trip_hoaithuong_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "id": "9b426ca7-b105-4e6c-a965-a3d2f9e1794d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 2023-05-01\n",
+ "1 2023-06-01\n",
+ "2 2023-07-01\n",
+ "3 2023-08-01\n",
+ "4 2023-09-01\n",
+ " ... \n",
+ "134 2023-08-01\n",
+ "135 2023-08-01\n",
+ "136 2023-09-01\n",
+ "137 2023-09-01\n",
+ "138 2023-10-01\n",
+ "Name: start_date, Length: 139, dtype: datetime64[ns]"
+ ]
+ },
+ "execution_count": 157,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(travel_trip_hoaithuong_df['start_date'], errors='coerce')\n",
+ "travel_trip_hoaithuong_df['start_date']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "id": "8ed625ad-7d44-4c07-a604-8d2e7f84843c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 May\n",
+ "1 June\n",
+ "2 July\n",
+ "3 August\n",
+ "4 September\n",
+ " ... \n",
+ "134 August\n",
+ "135 August\n",
+ "136 September\n",
+ "137 September\n",
+ "138 October\n",
+ "Name: start_month, Length: 139, dtype: object"
+ ]
+ },
+ "execution_count": 158,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['start_month'] = travel_trip_hoaithuong_df['start_date'].dt.strftime('%B')\n",
+ "travel_trip_hoaithuong_df['start_month']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "id": "f450c9cf-f145-4683-be6d-966e7b034201",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " destination | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " duration_days | \n",
+ " traveler_name | \n",
+ " traveler_age | \n",
+ " traveler_gender | \n",
+ " traveler_nationality | \n",
+ " accommodation_type | \n",
+ " accommodation_cost | \n",
+ " transportation_type | \n",
+ " transportation_cost | \n",
+ " start_month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " London, UK | \n",
+ " 2023-05-01 | \n",
+ " 5/8/2023 | \n",
+ " 7.0 | \n",
+ " John Smith | \n",
+ " 35.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ " May | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Phuket, Thailand | \n",
+ " 2023-06-01 | \n",
+ " 6/20/2023 | \n",
+ " 5.0 | \n",
+ " Jane Doe | \n",
+ " 28.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Resort | \n",
+ " 800 | \n",
+ " Flight | \n",
+ " 500 | \n",
+ " June | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " Bali, Indonesia | \n",
+ " 2023-07-01 | \n",
+ " 7/8/2023 | \n",
+ " 7.0 | \n",
+ " David Lee | \n",
+ " 45.0 | \n",
+ " Male | \n",
+ " Korean | \n",
+ " Villa | \n",
+ " 1000 | \n",
+ " Flight | \n",
+ " 700 | \n",
+ " July | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " New York, USA | \n",
+ " 2023-08-01 | \n",
+ " 8/29/2023 | \n",
+ " 14.0 | \n",
+ " Sarah Johnson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " British | \n",
+ " Hotel | \n",
+ " 2000 | \n",
+ " Flight | \n",
+ " 1000 | \n",
+ " August | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Tokyo, Japan | \n",
+ " 2023-09-01 | \n",
+ " 9/17/2023 | \n",
+ " 7.0 | \n",
+ " Kim Nguyen | \n",
+ " 26.0 | \n",
+ " Female | \n",
+ " Vietnamese | \n",
+ " Airbnb | \n",
+ " 700 | \n",
+ " Train | \n",
+ " 200 | \n",
+ " September | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id destination start_date end_date duration_days \\\n",
+ "0 1 London, UK 2023-05-01 5/8/2023 7.0 \n",
+ "1 2 Phuket, Thailand 2023-06-01 6/20/2023 5.0 \n",
+ "2 3 Bali, Indonesia 2023-07-01 7/8/2023 7.0 \n",
+ "3 4 New York, USA 2023-08-01 8/29/2023 14.0 \n",
+ "4 5 Tokyo, Japan 2023-09-01 9/17/2023 7.0 \n",
+ "\n",
+ " traveler_name traveler_age traveler_gender traveler_nationality \\\n",
+ "0 John Smith 35.0 Male American \n",
+ "1 Jane Doe 28.0 Female Canadian \n",
+ "2 David Lee 45.0 Male Korean \n",
+ "3 Sarah Johnson 29.0 Female British \n",
+ "4 Kim Nguyen 26.0 Female Vietnamese \n",
+ "\n",
+ " accommodation_type accommodation_cost transportation_type \\\n",
+ "0 Hotel 1200 Flight \n",
+ "1 Resort 800 Flight \n",
+ "2 Villa 1000 Flight \n",
+ "3 Hotel 2000 Flight \n",
+ "4 Airbnb 700 Train \n",
+ "\n",
+ " transportation_cost start_month \n",
+ "0 600 May \n",
+ "1 500 June \n",
+ "2 700 July \n",
+ "3 1000 August \n",
+ "4 200 September "
+ ]
+ },
+ "execution_count": 159,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "id": "90974fd4-9cbc-414b-a0c9-790f5685badc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " destination | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " duration_days | \n",
+ " traveler_name | \n",
+ " traveler_age | \n",
+ " traveler_gender | \n",
+ " traveler_nationality | \n",
+ " accommodation_type | \n",
+ " accommodation_cost | \n",
+ " transportation_type | \n",
+ " transportation_cost | \n",
+ " start_month | \n",
+ " start_year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " London, UK | \n",
+ " 2023-05-01 | \n",
+ " 5/8/2023 | \n",
+ " 7.0 | \n",
+ " John Smith | \n",
+ " 35.0 | \n",
+ " Male | \n",
+ " American | \n",
+ " Hotel | \n",
+ " 1200 | \n",
+ " Flight | \n",
+ " 600 | \n",
+ " May | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Phuket, Thailand | \n",
+ " 2023-06-01 | \n",
+ " 6/20/2023 | \n",
+ " 5.0 | \n",
+ " Jane Doe | \n",
+ " 28.0 | \n",
+ " Female | \n",
+ " Canadian | \n",
+ " Resort | \n",
+ " 800 | \n",
+ " Flight | \n",
+ " 500 | \n",
+ " June | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " Bali, Indonesia | \n",
+ " 2023-07-01 | \n",
+ " 7/8/2023 | \n",
+ " 7.0 | \n",
+ " David Lee | \n",
+ " 45.0 | \n",
+ " Male | \n",
+ " Korean | \n",
+ " Villa | \n",
+ " 1000 | \n",
+ " Flight | \n",
+ " 700 | \n",
+ " July | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " New York, USA | \n",
+ " 2023-08-01 | \n",
+ " 8/29/2023 | \n",
+ " 14.0 | \n",
+ " Sarah Johnson | \n",
+ " 29.0 | \n",
+ " Female | \n",
+ " British | \n",
+ " Hotel | \n",
+ " 2000 | \n",
+ " Flight | \n",
+ " 1000 | \n",
+ " August | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Tokyo, Japan | \n",
+ " 2023-09-01 | \n",
+ " 9/17/2023 | \n",
+ " 7.0 | \n",
+ " Kim Nguyen | \n",
+ " 26.0 | \n",
+ " Female | \n",
+ " Vietnamese | \n",
+ " Airbnb | \n",
+ " 700 | \n",
+ " Train | \n",
+ " 200 | \n",
+ " September | \n",
+ " 2023 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id destination start_date end_date duration_days \\\n",
+ "0 1 London, UK 2023-05-01 5/8/2023 7.0 \n",
+ "1 2 Phuket, Thailand 2023-06-01 6/20/2023 5.0 \n",
+ "2 3 Bali, Indonesia 2023-07-01 7/8/2023 7.0 \n",
+ "3 4 New York, USA 2023-08-01 8/29/2023 14.0 \n",
+ "4 5 Tokyo, Japan 2023-09-01 9/17/2023 7.0 \n",
+ "\n",
+ " traveler_name traveler_age traveler_gender traveler_nationality \\\n",
+ "0 John Smith 35.0 Male American \n",
+ "1 Jane Doe 28.0 Female Canadian \n",
+ "2 David Lee 45.0 Male Korean \n",
+ "3 Sarah Johnson 29.0 Female British \n",
+ "4 Kim Nguyen 26.0 Female Vietnamese \n",
+ "\n",
+ " accommodation_type accommodation_cost transportation_type \\\n",
+ "0 Hotel 1200 Flight \n",
+ "1 Resort 800 Flight \n",
+ "2 Villa 1000 Flight \n",
+ "3 Hotel 2000 Flight \n",
+ "4 Airbnb 700 Train \n",
+ "\n",
+ " transportation_cost start_month start_year \n",
+ "0 600 May 2023 \n",
+ "1 500 June 2023 \n",
+ "2 700 July 2023 \n",
+ "3 1000 August 2023 \n",
+ "4 200 September 2023 "
+ ]
+ },
+ "execution_count": 160,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df[\"start_date\"] = pd.to_datetime(\n",
+ " travel_trip_hoaithuong_df[\"start_date\"], errors=\"coerce\"\n",
+ ")\n",
+ "\n",
+ "travel_trip_hoaithuong_df[\"start_year\"] = (\n",
+ " travel_trip_hoaithuong_df[\"start_date\"].dt.year.astype(\"Int64\")\n",
+ ")\n",
+ "travel_trip_hoaithuong_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "id": "2cd814fb-da83-4583-9f34-97372e41ee74",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean column: traveler_nationality"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "id": "d36774ee-16ad-4b7c-8b92-5b4ac81c589f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['American', 'Canadian', 'Korean', 'British', 'Vietnamese',\n",
+ " 'Australian', 'Brazilian', 'Dutch', 'Emirati', 'Mexican',\n",
+ " 'Spanish', 'Chinese', 'German', 'Moroccan', 'Scottish', 'Japanese',\n",
+ " 'Italian', 'Indian', 'South Korean', 'French', nan,\n",
+ " 'South African', 'Taiwanese', 'Indonesian', 'USA', 'Canada',\n",
+ " 'South Korea', 'UK', 'China', 'Taiwan', 'Japan', 'Spain', 'Brazil',\n",
+ " 'Germany', 'Hong Kong', 'United Kingdom', 'Singapore', 'Italy',\n",
+ " 'Greece', 'United Arab Emirates', 'Cambodia', 'New Zealander'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 162,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ " travel_trip_hoaithuong_df['traveler_nationality'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "id": "2e165506-1ee0-44c9-91a5-907d9e6884bf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 American\n",
+ "1 Canadian\n",
+ "2 Korean\n",
+ "3 British\n",
+ "4 Vietnamese\n",
+ " ... \n",
+ "134 Brazilian\n",
+ "135 Canadian\n",
+ "136 Chinese\n",
+ "137 Spanish\n",
+ "138 New Zealander\n",
+ "Name: traveler_nationality, Length: 139, dtype: object"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Persist the whitespace clean-up: the .str accessor returns a new Series,\n",
+ "# so without the assignment the column is left unchanged.\n",
+ "# Title-casing is intentionally not applied: it would turn 'USA'/'UK' into\n",
+ "# 'Usa'/'Uk', which the nationality mapping keys would no longer match.\n",
+ "travel_trip_hoaithuong_df['traveler_nationality'] = (\n",
+ "    travel_trip_hoaithuong_df['traveler_nationality'].str.strip()\n",
+ ")\n",
+ "travel_trip_hoaithuong_df['traveler_nationality']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "id": "2ea7355a-8df1-4bc6-b29b-e893e1221394",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "traveler_nationality\n",
+ "American 24\n",
+ "Korean 13\n",
+ "British 12\n",
+ "Canadian 9\n",
+ "Australian 8\n",
+ "Spanish 7\n",
+ "Chinese 7\n",
+ "Italian 4\n",
+ "Brazilian 4\n",
+ "Indian 4\n",
+ "Vietnamese 3\n",
+ "South Korea 3\n",
+ "South Korean 3\n",
+ "Taiwan 2\n",
+ "Canada 2\n",
+ "USA 2\n",
+ "South African 2\n",
+ "Japanese 2\n",
+ "Mexican 2\n",
+ "Emirati 2\n",
+ "Dutch 2\n",
+ "Brazil 1\n",
+ "Cambodia 1\n",
+ "United Arab Emirates 1\n",
+ "Greece 1\n",
+ "Italy 1\n",
+ "Singapore 1\n",
+ "United Kingdom 1\n",
+ "Hong Kong 1\n",
+ "Germany 1\n",
+ "Japan 1\n",
+ "Spain 1\n",
+ "Scottish 1\n",
+ "China 1\n",
+ "UK 1\n",
+ "German 1\n",
+ "Indonesian 1\n",
+ "Taiwanese 1\n",
+ "Moroccan 1\n",
+ "French 1\n",
+ "New Zealander 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 164,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_nationality'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "id": "b4b7ab06-6da3-40c9-9bbf-62a0d321cd76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Map every nationality label (demonym, abbreviation or country name) to a\n",
+ "# single canonical country name so value_counts() does not split one country\n",
+ "# across several spellings.\n",
+ "mapping = {\n",
+ "    'American': 'United States',\n",
+ "    'USA': 'United States',\n",
+ "    'Canada': 'Canada',\n",
+ "    'Canadian': 'Canada',\n",
+ "    'UK': 'United Kingdom',\n",
+ "    'United Kingdom': 'United Kingdom',\n",
+ "    'British': 'United Kingdom',\n",
+ "    'Scottish': 'United Kingdom',\n",
+ "    'Korean': 'South Korea',\n",
+ "    'South Korea': 'South Korea',\n",
+ "    'South Korean': 'South Korea',\n",
+ "    'China': 'China',\n",
+ "    'Chinese': 'China',\n",
+ "    'Italy': 'Italy',\n",
+ "    'Italian': 'Italy',\n",
+ "    'Brazil': 'Brazil',\n",
+ "    'Brazilian': 'Brazil',\n",
+ "    'Emirati': 'United Arab Emirates',\n",
+ "    'United Arab Emirates': 'United Arab Emirates',\n",
+ "    'Taiwan': 'Taiwan',\n",
+ "    'Taiwanese': 'Taiwan',\n",
+ "    'Germany': 'Germany',\n",
+ "    'German': 'Germany',\n",
+ "    'Spain': 'Spain',\n",
+ "    'Spanish': 'Spain',\n",
+ "    'Japan': 'Japan',\n",
+ "    'Japanese': 'Japan',\n",
+ "    # Demonyms that were previously left unmapped and therefore survived\n",
+ "    # into the cleaned column next to proper country names.\n",
+ "    'Australian': 'Australia',\n",
+ "    'Indian': 'India',\n",
+ "    'Vietnamese': 'Vietnam',\n",
+ "    'Dutch': 'Netherlands',\n",
+ "    'Mexican': 'Mexico',\n",
+ "    'South African': 'South Africa',\n",
+ "    'Moroccan': 'Morocco',\n",
+ "    'Indonesian': 'Indonesia',\n",
+ "    'French': 'France',\n",
+ "    'New Zealander': 'New Zealand'\n",
+ "}\n",
+ "\n",
+ "# replace() (rather than map()) keeps values that have no mapping entry,\n",
+ "# e.g. 'Singapore' or 'Hong Kong', and leaves NaN untouched.\n",
+ "travel_trip_hoaithuong_df['traveler_nationality_clean'] = travel_trip_hoaithuong_df['traveler_nationality'].replace(mapping)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "id": "6044065c-4763-4f33-8e14-4d404c9de96c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "traveler_nationality_clean\n",
+ "United States 26\n",
+ "South Korea 19\n",
+ "United Kingdom 15\n",
+ "Canada 11\n",
+ "China 8\n",
+ "Spain 8\n",
+ "Australian 8\n",
+ "Brazil 5\n",
+ "Italy 5\n",
+ "Indian 4\n",
+ "United Arab Emirates 3\n",
+ "Vietnamese 3\n",
+ "Japan 3\n",
+ "Taiwan 3\n",
+ "Dutch 2\n",
+ "Mexican 2\n",
+ "Germany 2\n",
+ "South African 2\n",
+ "Singapore 1\n",
+ "Cambodia 1\n",
+ "Greece 1\n",
+ "Moroccan 1\n",
+ "Hong Kong 1\n",
+ "Indonesian 1\n",
+ "French 1\n",
+ "New Zealander 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 166,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_nationality_clean'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "id": "f473b554-a78f-4431-8c5e-06980976e343",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dtype('O')"
+ ]
+ },
+ "execution_count": 167,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_nationality'].dtype"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "id": "5bfed24f-5b63-4330-9e2d-7d6edfce67a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 American\n",
+ "1 Canadian\n",
+ "2 Korean\n",
+ "3 British\n",
+ "4 Vietnamese\n",
+ " ... \n",
+ "134 Brazilian\n",
+ "135 Canadian\n",
+ "136 Chinese\n",
+ "137 Spanish\n",
+ "138 New Zealander\n",
+ "Name: traveler_nationality, Length: 139, dtype: string"
+ ]
+ },
+ "execution_count": 168,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_nationality'].astype('string')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "id": "9c363536-ca86-417a-aa9a-59b37db0119c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "np.int64(2)"
+ ]
+ },
+ "execution_count": 169,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_nationality'].isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "id": "85f9faa5-c85d-48b3-b0b0-3dcea796fdaf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean column: traveler_gender"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "id": "68ceab2f-3ae5-425d-ba28-704a44289be7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['Male', 'Female', nan], dtype=object)"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_gender'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "id": "0e794a05-5879-4857-84f9-1674d4212a1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "traveler_gender\n",
+ "Female 70\n",
+ "Male 67\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_gender'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "id": "e87e63d4-c180-4263-820b-abf80afe31e4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "np.int64(2)"
+ ]
+ },
+ "execution_count": 173,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df['traveler_gender'].isna().sum() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 174,
+ "id": "8a073849-ae87-44aa-bd84-3916171634f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Male\n",
+ "1 Female\n",
+ "2 Male\n",
+ "3 Female\n",
+ "4 Female\n",
+ " ... \n",
+ "134 Male\n",
+ "135 Female\n",
+ "136 Male\n",
+ "137 Female\n",
+ "138 Male\n",
+ "Name: traveler_gender, Length: 139, dtype: object"
+ ]
+ },
+ "execution_count": 174,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Assign the normalised labels back to the column; the .str accessor\n",
+ "# returns a new Series, so the bare expression alone changed nothing.\n",
+ "travel_trip_hoaithuong_df['traveler_gender'] = (\n",
+ "    travel_trip_hoaithuong_df['traveler_gender'].str.strip().str.title()\n",
+ ")\n",
+ "travel_trip_hoaithuong_df['traveler_gender']\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 175,
+ "id": "7a432295-fa6c-474e-b3ad-98835f8dc5db",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "traveler_gender\n",
+ "Female 70\n",
+ "Male 67\n",
+ "Unknown 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 175,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "travel_trip_hoaithuong_df[\"traveler_gender\"] = (\n",
+ " travel_trip_hoaithuong_df[\"traveler_gender\"].fillna(\"Unknown\")\n",
+ ")\n",
+ "travel_trip_hoaithuong_df['traveler_gender'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb6b32cd-c090-4b88-97a6-f9c8d647e7b7",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9a0312b-3d0f-4561-bd4c-7591e903ce7d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}