diff --git a/data/raw/Travel details dataset.csv b/data/raw/Travel details dataset.csv new file mode 100644 index 00000000..f3c1b7e5 --- /dev/null +++ b/data/raw/Travel details dataset.csv @@ -0,0 +1,140 @@ +Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost +1,"London, UK",5/1/2023,5/8/2023,7,John Smith,35,Male,American,Hotel,1200,Flight,600 +2,"Phuket, Thailand",6/15/2023,6/20/2023,5,Jane Doe,28,Female,Canadian,Resort,800,Flight,500 +3,"Bali, Indonesia",7/1/2023,7/8/2023,7,David Lee,45,Male,Korean,Villa,1000,Flight,700 +4,"New York, USA",8/15/2023,8/29/2023,14,Sarah Johnson,29,Female,British,Hotel,2000,Flight,1000 +5,"Tokyo, Japan",9/10/2023,9/17/2023,7,Kim Nguyen,26,Female,Vietnamese,Airbnb,700,Train,200 +6,"Paris, France",10/5/2023,10/10/2023,5,Michael Brown,42,Male,American,Hotel,1500,Flight,800 +7,"Sydney, Australia",11/20/2023,11/30/2023,10,Emily Davis,33,Female,Australian,Hostel,500,Flight,1200 +8,"Rio de Janeiro, Brazil",1/5/2024,1/12/2024,7,Lucas Santos,25,Male,Brazilian,Airbnb,900,Flight,600 +9,"Amsterdam, Netherlands",2/14/2024,2/21/2024,7,Laura Janssen,31,Female,Dutch,Hotel,1200,Train,200 +10,"Dubai, United Arab Emirates",3/10/2024,3/17/2024,7,Mohammed Ali,39,Male,Emirati,Resort,2500,Flight,800 +11,"Cancun, Mexico",4/1/2024,4/8/2024,7,Ana Hernandez,27,Female,Mexican,Hotel,1000,Flight,500 +12,"Barcelona, Spain",5/15/2024,5/22/2024,7,Carlos Garcia,36,Male,Spanish,Airbnb,800,Train,100 +13,"Honolulu, Hawaii",6/10/2024,6/18/2024,8,Lily Wong,29,Female,Chinese,Resort,3000,Flight,1200 +14,"Berlin, Germany",7/1/2024,7/10/2024,9,Hans Mueller,48,Male,German,Hotel,1400,Flight,700 +15,"Marrakech, Morocco",8/20/2024,8/27/2024,7,Fatima Khouri,26,Female,Moroccan,Riad,600,Flight,400 +16,"Edinburgh, Scotland",9/5/2024,9/12/2024,7,James MacKenzie,32,Male,Scottish,Hotel,900,Train,150 +17,Paris,9/1/2023,9/10/2023,9,Sarah 
Johnson,30,Female,American,Hotel,$900 ,Plane,$400 +18,Bali,8/15/2023,8/25/2023,10,Michael Chang,28,Male,Chinese,Resort,"$1,500 ",Plane,$700 +19,London,7/22/2023,7/28/2023,6,Olivia Rodriguez,35,Female,British,Hotel,"$1,200 ",Train,$150 +20,Tokyo,10/5/2023,10/15/2023,10,Kenji Nakamura,45,Male,Japanese,Hotel,"$1,200 ",Plane,$800 +21,New York,11/20/2023,11/25/2023,5,Emily Lee,27,Female,American,Airbnb,$600 ,Bus,$100 +22,Sydney,12/5/2023,12/12/2023,7,James Wilson,32,Male,Australian,Hotel,"$1,000 ",Plane,$600 +23,Rome,11/1/2023,11/8/2023,7,Sofia Russo,29,Female,Italian,Airbnb,$700 ,Train,$80 +24,Bangkok,9/15/2023,9/23/2023,8,Raj Patel,40,Male,Indian,Hostel,$400 ,Plane,$500 +25,Paris,12/22/2023,12/28/2023,6,Lily Nguyen,24,Female,Vietnamese,Hotel,"$1,400 ",Train,$100 +26,Hawaii,8/1/2023,8/10/2023,9,David Kim,34,Male,Korean,Resort,"$2,000 ",Plane,$800 +27,Barcelona,10/20/2023,10/28/2023,8,Maria Garcia,31,Female,Spanish,Hotel,"$1,100 ",Train,$150 +28,Japan,5/10/2022,5/18/2022,8,Alice Smith,30,Female,American,Hotel,$800 ,Plane,$500 +29,Thailand,6/15/2022,6/22/2022,7,Bob Johnson,45,Male,Canadian,Hostel,$200 ,Train,$150 +30,France,7/2/2022,7/11/2022,9,Charlie Lee,25,Male,Korean,Airbnb,$600 ,Car rental,$300 +31,Australia,8/20/2022,9/2/2022,13,Emma Davis,28,Female,British,Hotel,"$1,000 ",Car rental,$500 +32,Brazil,9/5/2022,9/14/2022,9,Olivia Martin,33,Female,Australian,Hostel,$150 ,Bus,$50 +33,Greece,10/12/2022,10/20/2022,8,Harry Wilson,20,Male,American,Airbnb,$400 ,Plane,$600 +34,Egypt,11/8/2022,11/15/2022,7,Sophia Lee,37,Female,Canadian,Hotel,$700 ,Train,$100 +35,Mexico,1/5/2023,1/15/2023,10,James Brown,42,Male,British,Airbnb,$500 ,Plane,$800 +36,Italy,2/14/2023,2/20/2023,6,Mia Johnson,31,Female,American,Hostel,$180 ,Train,$120 +37,Spain,3/23/2023,3/31/2023,8,William Davis,27,Male,Korean,Hotel,$900 ,Car rental,$400 +38,Canada,4/19/2023,4/26/2023,7,Amelia Brown,38,Female,Australian,Airbnb,$350 ,Bus,$75 +39,"Paris, France",6/12/2022,6/19/2022,7,Mia 
Johnson,25,Female,American,Hotel,1400,Plane,600 +40,"Sydney, Australia",1/2/2023,1/9/2023,7,Adam Lee,33,Male,Canadian,Airbnb,800,Train,150 +41,"Tokyo, Japan",12/10/2022,12/18/2022,8,Sarah Wong,28,Female,Chinese,Hostel,500,Plane,900 +42,"Cancun, Mexico",7/1/2023,7/8/2023,7,John Smith,45,Male,American,Resort,2200,Plane,800 +43,"Rio de Janeiro, Brazil",11/20/2022,11/27/2022,7,Maria Silva,30,Female,Brazilian,Hotel,1200,Plane,700 +44,"London, UK",3/5/2023,3/12/2023,7,Peter Brown,55,Male,British,Airbnb,900,Train,100 +45,"Barcelona, Spain",8/18/2023,8/25/2023,7,Emma Garcia,27,Female,Spanish,Hostel,600,Plane,600 +46,"New York City, USA",9/15/2022,9/22/2022,7,Michael Davis,41,Male,American,Hotel,1500,Plane,500 +47,"Bangkok, Thailand",5/1/2023,5/7/2023,6,Nina Patel,29,Female,Indian,Airbnb,500,Bus,50 +48,"Vancouver, Canada",7/10/2022,7/17/2022,7,Kevin Kim,24,Male,Korean,Hostel,400,Train,150 +49,"Amsterdam, Netherlands",6/20/2023,6/28/2023,8,Laura van den Berg,31,Female,Dutch,Hotel,1100,Plane,700 +50,"Paris, France",8/15/2023,8/22/2023,7,Jennifer Nguyen,31,Female,Canadian,Hotel,"$1,200 ",Train,$300 +51,"Tokyo, Japan",10/10/2023,10/20/2023,10,David Kim,25,Male,American,Hostel,$500 ,Bus,$100 +52,"Sydney, AUS",11/5/2023,11/12/2023,7,Rachel Lee,27,Female,South Korean,Airbnb,$900 ,Car rental,$200 +53,"New York, USA",12/24/2023,12/31/2023,7,Jessica Wong,28,Female,Canadian,Hotel,"$1,400 ",Flight,$800 +54,"Rio de Janeiro, Brazil",1/15/2024,1/24/2024,9,Felipe Almeida,30,Male,Brazilian,Airbnb,$800 ,Train,$150 +55,"Bangkok, Thailand",2/1/2024,2/9/2024,8,Nisa Patel,23,Female,Indian,Hostel,$400 ,Bus,$50 +56,"London, UK",3/15/2024,3/23/2024,8,Ben Smith,35,Male,British,Hotel,"$1,000 ",Train,$200 +57,"Barcelona, Spain",4/5/2024,4/13/2024,8,Laura Gomez,29,Female,Spanish,Airbnb,$700 ,Car rental,$250 +58,"Seoul, South Korea",5/10/2024,5/18/2024,8,Park Min Woo,27,Male,South Korean,Hostel,$500 ,Subway,$20 +59,"Los Angeles, USA",6/20/2024,6/27/2024,7,Michael Chen,26,Male,Chinese,Hotel,"$1,200 ",Car 
rental,$300 +60,"Rome, Italy",7/15/2024,7/23/2024,8,Sofia Rossi,33,Female,Italian,Airbnb,$800 ,Train,$100 +61,Paris,7/12/2022,7/18/2022,6,Rachel Sanders,35,Female,American,Hotel,1200,Plane,800 +62,Tokyo,9/3/2022,9/10/2022,7,Kenji Nakamura,28,Male,Japanese,Hostel,400,Train,300 +63,Cape Town,1/7/2023,1/16/2023,9,Emily Watson,29,Female,British,Vacation rental,800,Car rental,200 +64,Sydney,6/23/2023,6/29/2023,6,David Lee,43,Male,Australian,Hotel,1500,Plane,1200 +65,Barcelona,8/18/2023,8/25/2023,7,Ana Rodriguez,31,Female,Spanish,Vacation rental,900,Plane,700 +66,Bali,2/1/2024,2/8/2024,7,Tom Wilson,27,Male,American,Resort,2200,Plane,1000 +67,Paris,5/6/2024,5/12/2024,6,Olivia Green,39,Female,French,Hotel,1100,Train,200 +68,New York,7/20/2024,7/26/2024,6,James Chen,25,Male,American,Vacation rental,1000,Plane,800 +69,Bangkok,9/8/2024,9/16/2024,8,Lila Patel,33,Female,Indian,Hostel,300,Plane,700 +70,Rome,2/14/2025,2/20/2025,6,Marco Rossi,41,Male,Italian,Hotel,1300,Train,100 +71,Bali,5/21/2025,5/29/2025,8,Sarah Brown,37,Female,British,Resort,1800,Plane,1000 +72,,,,,,,,,,,, +73,"Bali, Indonesia",8/5/2022,8/12/2022,7,Sarah Lee,35,Female,South Korean,Resort,500 USD,Plane,800 USD +74,"Tokyo, Japan",1/1/2023,1/9/2023,8,Alex Kim,29,Male,American,Hotel,1000 USD,Train,200 USD +75,"Cancun, Mexico",4/15/2023,4/22/2023,7,Maria Hernandez,42,Female,Mexican,Resort,800 USD,Plane,500 USD +76,"Paris, France",6/7/2023,6/14/2023,7,John Smith,46,Male,British,Hotel,1200 USD,Plane,700 USD +77,"Cape Town, SA",9/1/2023,9/10/2023,9,Mark Johnson,31,Male,South African,Guesthouse,400 USD,Car,300 USD +78,"Bali, Indonesia",11/12/2023,11/19/2023,7,Amanda Chen,25,Female,Taiwanese,Resort,600 USD,Plane,700 USD +79,"Sydney, Aus",2/5/2024,2/12/2024,7,David Lee,38,Male,Australian,Hotel,900 USD,Plane,600 USD +80,"Bangkok, Thai",5/15/2024,5/22/2024,7,Nana Kwon,27,Female,Korean,Hotel,400 USD,Plane,400 USD +81,"New York, USA",8/20/2024,8/27/2024,7,Tom Hanks,60,Male,American,Hotel,1500 USD,Plane,1000 USD +82,"Phuket, 
Thai",1/1/2025,1/8/2025,7,Emma Watson,32,Female,British,Resort,700 USD,Plane,800 USD +83,"Rome, Italy",4/15/2025,4/22/2025,7,James Kim,41,Male,American,Hotel,100,, +84,Paris,6/15/2021,6/20/2021,6,John Smith,35,Male,American,Hotel,800 USD,Plane,500 USD +85,Tokyo,7/1/2021,7/10/2021,10,Sarah Lee,28,Female,Korean,Airbnb,500 USD,Train,300 USD +86,Bali,8/10/2021,8/20/2021,11,Maria Garcia,42,Female,Spanish,Resort,1200 USD,Plane,700 USD +87,Sydney,9/1/2021,9/10/2021,9,David Lee,45,Male,Australian,Hotel,900 USD,Plane,600 USD +88,New York,10/15/2021,10/20/2021,6,Emily Davis,31,Female,American,Airbnb,700 USD,Car rental,200 USD +89,London,11/20/2021,11/30/2021,11,James Wilson,29,Male,British,Hostel,300 USD,Plane,400 USD +90,Dubai,1/1/2022,1/8/2022,8,Fatima Ahmed,24,Female,Emirati,Hotel,1000 USD,Plane,800 USD +91,Bangkok,2/14/2022,2/20/2022,7,Liam Nguyen,26,Male,Vietnamese,Airbnb,400 USD,Train,100 USD +92,Rome,3/10/2022,3/20/2022,11,Giulia Rossi,30,Female,Italian,Hostel,200 USD,Plane,350 USD +93,Bali,4/15/2022,4/25/2022,11,Putra Wijaya,33,Male,Indonesian,Villa,1500 USD,Car rental,300 USD +94,Seoul,5/1/2022,5/10/2022,10,Kim Min-ji,27,Female,Korean,Hotel,800 USD,Train,150 USD +95,Paris,6/15/2022,6/20/2022,5,John Smith,35,Male,USA,Hotel,$500 ,Plane,$800 +96,Tokyo,9/1/2022,9/10/2022,9,Emily Johnson,28,Female,Canada,Airbnb,$400 ,Train,$200 +97,Sydney,11/23/2022,12/2/2022,9,David Lee,45,Male,South Korea,Hostel,$200 ,Plane,"$1,200 " +98,London,2/14/2023,2/19/2023,5,Sarah Brown,37,Female,UK,Hotel,$600 ,Plane,$700 +99,New York,5/8/2023,5/14/2023,6,Michael Wong,50,Male,China,Airbnb,$800 ,Car rental,$300 +100,Rome,8/20/2023,8/27/2023,7,Jessica Chen,31,Female,Taiwan,Hotel,$700 ,Plane,$900 +101,Bangkok,11/12/2023,11/20/2023,8,Ken Tanaka,42,Male,Japan,Hostel,$300 ,Train,$100 +102,Cape Town,1/6/2024,1/14/2024,8,Maria Garcia,27,Female,Spain,Airbnb,$500 ,Plane,"$1,500 " +103,Rio de Janeiro,4/3/2024,4/10/2024,7,Rodrigo Oliveira,33,Male,Brazil,Hotel,$900 ,Car rental,$400 
+104,Bali,7/22/2024,7/28/2024,6,Olivia Kim,29,Female,South Korea,Villa,"$1,200 ",Plane,"$1,000 " +105,Amsterdam,10/10/2024,10/17/2024,7,Robert Mueller,41,Male,Germany,Hotel,$600 ,Train,$150 +106,Paris,5/15/2022,5/20/2022,5,John Smith,35,Male,USA,Hotel,1000,Plane,800 +107,Tokyo,9/1/2022,9/10/2022,9,Sarah Lee,28,Female,South Korea,Airbnb,800,Train,500 +108,New York,6/20/2022,6/25/2022,5,Michael Wong,42,Male,Hong Kong,Hotel,1200,Car rental,200 +109,Bali,8/12/2022,8/20/2022,8,Lisa Chen,30,Female,Taiwan,Resort,1500,Plane,1200 +110,Sydney,7/1/2022,7/10/2022,9,David Kim,26,Male,Canada,Hostel,300,Plane,900 +111,London,6/10/2022,6/15/2022,5,Emily Wong,38,Female,United Kingdom,Hotel,900,Train,150 +112,Phuket,9/5/2022,9/12/2022,7,Mark Tan,45,Male,Singapore,Villa,2000,Plane,700 +113,Rome,5/1/2022,5/8/2022,7,Emma Lee,31,Female,Italy,Hotel,1100,Train,250 +114,Santorini,7/15/2022,7/22/2022,7,George Chen,27,Male,Greece,Airbnb,1000,Ferry,150 +115,Dubai,8/25/2022,8/30/2022,5,Sophia Kim,29,Female,United Arab Emirates,Hotel,1500,Car rental,300 +116,Phnom Penh,9/10/2022,9/15/2022,5,Alex Ng,33,Male,Cambodia,Hostel,200,Plane,500 +117,"Tokyo, Japan",2/5/2022,2/14/2022,9,Alice Smith,32,Female,American,Hotel,1000,Plane,700 +118,"Paris, France",3/15/2022,3/22/2022,7,Bob Johnson,47,Male,Canadian,Hotel,1200,Train,500 +119,"Sydney, Aus",5/1/2022,5/12/2022,11,Cindy Chen,26,Female,Chinese,Airbnb,800,Plane,1000 +120,"Rome, Italy",6/10/2022,6/17/2022,7,David Lee,38,Male,Korean,Hotel,900,Train,400 +121,"Bali, Indonesia",7/20/2022,7/30/2022,10,Emily Kim,29,Female,Korean,Hostel,500,Plane,800 +122,"Cancun, Mexico",8/8/2022,8/16/2022,8,Frank Li,41,Male,American,Hotel,1300,Plane,600 +123,"Athens, Greece",9/20/2022,9/30/2022,10,Gina Lee,35,Female,Korean,Airbnb,700,Plane,900 +124,"Tokyo, Japan",10/5/2022,10/13/2022,8,Henry Kim,24,Male,Korean,Hotel,1200,Plane,700 +125,"Sydney, Aus",11/11/2022,11/21/2022,10,Isabella Chen,30,Female,Chinese,Airbnb,900,Plane,1000 +126,"Paris, France",12/24/2022,1/1/2023,8,Jack 
Smith,28,Male,American,Hostel,400,Plane,700 +127,"Bali, Indonesia",2/10/2023,2/18/2023,8,Katie Johnson,33,Female,Canadian,Hotel,800,Plane,800 +128,,,,,,,,,,,, +129,"Paris, France",5/1/2023,5/7/2023,6,John Doe,35,Male,American,Hotel,5000,Airplane,2500 +130,"Tokyo, Japan",5/15/2023,5/22/2023,7,Jane Smith,28,Female,British,Airbnb,7000,Train,1500 +131,"Cape Town, South Africa",6/1/2023,6/10/2023,9,Michael Johnson,45,Male,South African,Hostel,3000,Car,2000 +132,"Sydney, Australia",6/15/2023,6/21/2023,6,Sarah Lee,31,Female,Australian,Hotel,6000,Airplane,3000 +133,"Rome, Italy",7/1/2023,7/8/2023,7,David Kim,42,Male,Korean,Airbnb,4000,Train,1500 +134,"New York City, USA",7/15/2023,7/22/2023,7,Emily Davis,27,Female,American,Hotel,8000,Airplane,2500 +135,"Rio de Janeiro, Brazil",8/1/2023,8/10/2023,9,Jose Perez,37,Male,Brazilian,Hostel,2500,Car,2000 +136,"Vancouver, Canada",8/15/2023,8/21/2023,6,Emma Wilson,29,Female,Canadian,Hotel,5000,Airplane,3000 +137,"Bangkok, Thailand",9/1/2023,9/8/2023,7,Ryan Chen,34,Male,Chinese,Hostel,2000,Train,1000 +138,"Barcelona, Spain",9/15/2023,9/22/2023,7,Sofia Rodriguez,25,Female,Spanish,Airbnb,6000,Airplane,2500 +139,"Auckland, New Zealand",10/1/2023,10/8/2023,7,William Brown,39,Male,New Zealander,Hotel,7000,Train,2500 diff --git a/notebooks/traveler_trip_dataset_hoaithuong.ipynb b/notebooks/traveler_trip_dataset_hoaithuong.ipynb new file mode 100644 index 00000000..bf217058 --- /dev/null +++ b/notebooks/traveler_trip_dataset_hoaithuong.ipynb @@ -0,0 +1,1984 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 142, + "id": "eaa9e1ba-9051-4a18-8857-97b56f4b8140", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Trip IDDestinationStart dateEnd dateDuration (days)Traveler nameTraveler ageTraveler genderTraveler nationalityAccommodation typeAccommodation costTransportation typeTransportation cost
01London, UK5/1/20235/8/20237.0John Smith35.0MaleAmericanHotel1200Flight600
12Phuket, Thailand6/15/20236/20/20235.0Jane Doe28.0FemaleCanadianResort800Flight500
23Bali, Indonesia7/1/20237/8/20237.0David Lee45.0MaleKoreanVilla1000Flight700
34New York, USA8/15/20238/29/202314.0Sarah Johnson29.0FemaleBritishHotel2000Flight1000
45Tokyo, Japan9/10/20239/17/20237.0Kim Nguyen26.0FemaleVietnameseAirbnb700Train200
56Paris, France10/5/202310/10/20235.0Michael Brown42.0MaleAmericanHotel1500Flight800
67Sydney, Australia11/20/202311/30/202310.0Emily Davis33.0FemaleAustralianHostel500Flight1200
78Rio de Janeiro, Brazil1/5/20241/12/20247.0Lucas Santos25.0MaleBrazilianAirbnb900Flight600
89Amsterdam, Netherlands2/14/20242/21/20247.0Laura Janssen31.0FemaleDutchHotel1200Train200
910Dubai, United Arab Emirates3/10/20243/17/20247.0Mohammed Ali39.0MaleEmiratiResort2500Flight800
\n", + "
" + ], + "text/plain": [ + " Trip ID Destination Start date End date \\\n", + "0 1 London, UK 5/1/2023 5/8/2023 \n", + "1 2 Phuket, Thailand 6/15/2023 6/20/2023 \n", + "2 3 Bali, Indonesia 7/1/2023 7/8/2023 \n", + "3 4 New York, USA 8/15/2023 8/29/2023 \n", + "4 5 Tokyo, Japan 9/10/2023 9/17/2023 \n", + "5 6 Paris, France 10/5/2023 10/10/2023 \n", + "6 7 Sydney, Australia 11/20/2023 11/30/2023 \n", + "7 8 Rio de Janeiro, Brazil 1/5/2024 1/12/2024 \n", + "8 9 Amsterdam, Netherlands 2/14/2024 2/21/2024 \n", + "9 10 Dubai, United Arab Emirates 3/10/2024 3/17/2024 \n", + "\n", + " Duration (days) Traveler name Traveler age Traveler gender \\\n", + "0 7.0 John Smith 35.0 Male \n", + "1 5.0 Jane Doe 28.0 Female \n", + "2 7.0 David Lee 45.0 Male \n", + "3 14.0 Sarah Johnson 29.0 Female \n", + "4 7.0 Kim Nguyen 26.0 Female \n", + "5 5.0 Michael Brown 42.0 Male \n", + "6 10.0 Emily Davis 33.0 Female \n", + "7 7.0 Lucas Santos 25.0 Male \n", + "8 7.0 Laura Janssen 31.0 Female \n", + "9 7.0 Mohammed Ali 39.0 Male \n", + "\n", + " Traveler nationality Accommodation type Accommodation cost \\\n", + "0 American Hotel 1200 \n", + "1 Canadian Resort 800 \n", + "2 Korean Villa 1000 \n", + "3 British Hotel 2000 \n", + "4 Vietnamese Airbnb 700 \n", + "5 American Hotel 1500 \n", + "6 Australian Hostel 500 \n", + "7 Brazilian Airbnb 900 \n", + "8 Dutch Hotel 1200 \n", + "9 Emirati Resort 2500 \n", + "\n", + " Transportation type Transportation cost \n", + "0 Flight 600 \n", + "1 Flight 500 \n", + "2 Flight 700 \n", + "3 Flight 1000 \n", + "4 Train 200 \n", + "5 Flight 800 \n", + "6 Flight 1200 \n", + "7 Flight 600 \n", + "8 Train 200 \n", + "9 Flight 800 " + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "travel_trip_data = \"../data/raw/Travel details dataset.csv\"\n", + "travel_trip_hoaithuong_df = pd.read_csv(travel_trip_data, encoding='ISO-8859-1')\n", + 
"travel_trip_hoaithuong_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "5215913a-b54e-4a6d-a07e-5e702c75ede4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(139, 13)" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "d0ae93e3-d769-4e0b-b4b1-9a8f7dee6b9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Trip ID', 'Destination', 'Start date', 'End date',\n", + " 'Duration (days)', 'Traveler name', 'Traveler age', 'Traveler gender',\n", + " 'Traveler nationality', 'Accommodation type', 'Accommodation cost',\n", + " 'Transportation type', 'Transportation cost'],\n", + " dtype='object')" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "f6b0a039-7cdc-461c-958f-4ee1634601be", + "metadata": {}, + "outputs": [], + "source": [ + "travel_trip_hoaithuong_df.columns = travel_trip_hoaithuong_df.columns.str.replace(\"\", \"\", regex=False).str.replace(r\"[()]\", \"\", regex=True).str.strip(\")\").str.replace(\" \",\"_\").str.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "6785b96f-411c-4f9e-82a6-d00dbc033aaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['trip_id', 'destination', 'start_date', 'end_date', 'duration_days',\n", + " 'traveler_name', 'traveler_age', 'traveler_gender',\n", + " 'traveler_nationality', 'accommodation_type', 'accommodation_cost',\n", + " 'transportation_type', 'transportation_cost'],\n", + " dtype='object')" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df.columns" + ] + }, + { + 
"cell_type": "code", + "execution_count": 147, + "id": "4835cbff-9092-476f-bd78-1ba752ecec33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_iddestinationstart_dateend_dateduration_daystraveler_nametraveler_agetraveler_gendertraveler_nationalityaccommodation_typeaccommodation_costtransportation_typetransportation_cost
01London, UK5/1/20235/8/20237.0John Smith35.0MaleAmericanHotel1200Flight600
12Phuket, Thailand6/15/20236/20/20235.0Jane Doe28.0FemaleCanadianResort800Flight500
23Bali, Indonesia7/1/20237/8/20237.0David Lee45.0MaleKoreanVilla1000Flight700
34New York, USA8/15/20238/29/202314.0Sarah Johnson29.0FemaleBritishHotel2000Flight1000
45Tokyo, Japan9/10/20239/17/20237.0Kim Nguyen26.0FemaleVietnameseAirbnb700Train200
\n", + "
" + ], + "text/plain": [ + " trip_id destination start_date end_date duration_days \\\n", + "0 1 London, UK 5/1/2023 5/8/2023 7.0 \n", + "1 2 Phuket, Thailand 6/15/2023 6/20/2023 5.0 \n", + "2 3 Bali, Indonesia 7/1/2023 7/8/2023 7.0 \n", + "3 4 New York, USA 8/15/2023 8/29/2023 14.0 \n", + "4 5 Tokyo, Japan 9/10/2023 9/17/2023 7.0 \n", + "\n", + " traveler_name traveler_age traveler_gender traveler_nationality \\\n", + "0 John Smith 35.0 Male American \n", + "1 Jane Doe 28.0 Female Canadian \n", + "2 David Lee 45.0 Male Korean \n", + "3 Sarah Johnson 29.0 Female British \n", + "4 Kim Nguyen 26.0 Female Vietnamese \n", + "\n", + " accommodation_type accommodation_cost transportation_type \\\n", + "0 Hotel 1200 Flight \n", + "1 Resort 800 Flight \n", + "2 Villa 1000 Flight \n", + "3 Hotel 2000 Flight \n", + "4 Airbnb 700 Train \n", + "\n", + " transportation_cost \n", + "0 600 \n", + "1 500 \n", + "2 700 \n", + "3 1000 \n", + "4 200 " + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "eacb80fb-b3a0-49a8-81b9-259bcb54911b", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean columns start_date" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "d02c1da1-0f38-4260-92be-bb53a6a8791a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['5/1/2023', '6/15/2023', '7/1/2023', '8/15/2023', '9/10/2023',\n", + " '10/5/2023', '11/20/2023', '1/5/2024', '2/14/2024', '3/10/2024',\n", + " '4/1/2024', '5/15/2024', '6/10/2024', '7/1/2024', '8/20/2024',\n", + " '9/5/2024', '9/1/2023', '7/22/2023', '12/5/2023', '11/1/2023',\n", + " '9/15/2023', '12/22/2023', '8/1/2023', '10/20/2023', '5/10/2022',\n", + " '6/15/2022', '7/2/2022', '8/20/2022', '9/5/2022', '10/12/2022',\n", + " '11/8/2022', '1/5/2023', '2/14/2023', '3/23/2023', '4/19/2023',\n", + " '6/12/2022', 
'1/2/2023', '12/10/2022', '11/20/2022', '3/5/2023',\n", + " '8/18/2023', '9/15/2022', '7/10/2022', '6/20/2023', '10/10/2023',\n", + " '11/5/2023', '12/24/2023', '1/15/2024', '2/1/2024', '3/15/2024',\n", + " '4/5/2024', '5/10/2024', '6/20/2024', '7/15/2024', '7/12/2022',\n", + " '9/3/2022', '1/7/2023', '6/23/2023', '5/6/2024', '7/20/2024',\n", + " '9/8/2024', '2/14/2025', '5/21/2025', nan, '8/5/2022', '1/1/2023',\n", + " '4/15/2023', '6/7/2023', '11/12/2023', '2/5/2024', '1/1/2025',\n", + " '4/15/2025', '6/15/2021', '7/1/2021', '8/10/2021', '9/1/2021',\n", + " '10/15/2021', '11/20/2021', '1/1/2022', '2/14/2022', '3/10/2022',\n", + " '4/15/2022', '5/1/2022', '9/1/2022', '11/23/2022', '5/8/2023',\n", + " '8/20/2023', '1/6/2024', '4/3/2024', '7/22/2024', '10/10/2024',\n", + " '5/15/2022', '6/20/2022', '8/12/2022', '7/1/2022', '6/10/2022',\n", + " '7/15/2022', '8/25/2022', '9/10/2022', '2/5/2022', '3/15/2022',\n", + " '7/20/2022', '8/8/2022', '9/20/2022', '10/5/2022', '11/11/2022',\n", + " '12/24/2022', '2/10/2023', '5/15/2023', '6/1/2023', '7/15/2023',\n", + " '10/1/2023'], dtype=object)" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['start_date'].unique()\n", + "#extract the list of unique values" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "ce61b023-32f0-4052-b222-3e0dce231312", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['start_date'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "146c5905-e3ef-435a-a341-603a057d1b58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_iddestinationstart_dateend_dateduration_daystraveler_nametraveler_agetraveler_gendertraveler_nationalityaccommodation_typeaccommodation_costtransportation_typetransportation_cost
7172NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
127128NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " trip_id destination start_date end_date duration_days traveler_name \\\n", + "71 72 NaN NaN NaN NaN NaN \n", + "127 128 NaN NaN NaN NaN NaN \n", + "\n", + " traveler_age traveler_gender traveler_nationality accommodation_type \\\n", + "71 NaN NaN NaN NaN \n", + "127 NaN NaN NaN NaN \n", + "\n", + " accommodation_cost transportation_type transportation_cost \n", + "71 NaN NaN NaN \n", + "127 NaN NaN NaN " + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df[travel_trip_hoaithuong_df['start_date'].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "caf55e0d-37c1-4211-8978-43f323f88eb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(2)" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df[\"start_date\"].isna().sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "14139d28-be33-449f-9b2b-6c1266dad526", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "111" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df[\"start_date\"].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "622ecee8-8cc1-495e-a9d2-4e3c2fb6b05d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2023-05-01\n", + "1 2023-06-15\n", + "2 2023-07-01\n", + "3 2023-08-15\n", + "4 2023-09-10\n", + " ... 
\n", + "134 2023-08-01\n", + "135 2023-08-15\n", + "136 2023-09-01\n", + "137 2023-09-15\n", + "138 2023-10-01\n", + "Name: start_date, Length: 139, dtype: datetime64[ns]\n" + ] + } + ], + "source": [ + "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(travel_trip_hoaithuong_df['start_date'], format='%m/%d/%Y')\n", + "print(travel_trip_hoaithuong_df['start_date'])" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "9517424f-c607-4eca-a11b-18de715df887", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " start_date\n", + "0 2023-05\n", + "1 2023-06\n", + "2 2023-07\n", + "3 2023-08\n", + "4 2023-09\n" + ] + } + ], + "source": [ + "# Convert the 'start_date' column to datetime first (if not already done)\n", + "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(\n", + " travel_trip_hoaithuong_df['start_date'], \n", + " errors='coerce'\n", + ")\n", + "\n", + "# Use .dt.strftime('%Y-%m') to format the date to Year and Month only.\n", + "travel_trip_hoaithuong_df['start_date'] = travel_trip_hoaithuong_df['start_date'].dt.strftime('%Y-%m')\n", + "\n", + "# Optional: Display the first few reformatted values\n", + "print(travel_trip_hoaithuong_df[['start_date']].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "6c20800d-1a44-4695-a500-70bbaf5f1399", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_iddestinationstart_dateend_dateduration_daystraveler_nametraveler_agetraveler_gendertraveler_nationalityaccommodation_typeaccommodation_costtransportation_typetransportation_cost
01London, UK2023-055/8/20237.0John Smith35.0MaleAmericanHotel1200Flight600
12Phuket, Thailand2023-066/20/20235.0Jane Doe28.0FemaleCanadianResort800Flight500
23Bali, Indonesia2023-077/8/20237.0David Lee45.0MaleKoreanVilla1000Flight700
34New York, USA2023-088/29/202314.0Sarah Johnson29.0FemaleBritishHotel2000Flight1000
45Tokyo, Japan2023-099/17/20237.0Kim Nguyen26.0FemaleVietnameseAirbnb700Train200
..........................................
134135Rio de Janeiro, Brazil2023-088/10/20239.0Jose Perez37.0MaleBrazilianHostel2500Car2000
135136Vancouver, Canada2023-088/21/20236.0Emma Wilson29.0FemaleCanadianHotel5000Airplane3000
136137Bangkok, Thailand2023-099/8/20237.0Ryan Chen34.0MaleChineseHostel2000Train1000
137138Barcelona, Spain2023-099/22/20237.0Sofia Rodriguez25.0FemaleSpanishAirbnb6000Airplane2500
138139Auckland, New Zealand2023-1010/8/20237.0William Brown39.0MaleNew ZealanderHotel7000Train2500
\n", + "

139 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " trip_id destination start_date end_date duration_days \\\n", + "0 1 London, UK 2023-05 5/8/2023 7.0 \n", + "1 2 Phuket, Thailand 2023-06 6/20/2023 5.0 \n", + "2 3 Bali, Indonesia 2023-07 7/8/2023 7.0 \n", + "3 4 New York, USA 2023-08 8/29/2023 14.0 \n", + "4 5 Tokyo, Japan 2023-09 9/17/2023 7.0 \n", + ".. ... ... ... ... ... \n", + "134 135 Rio de Janeiro, Brazil 2023-08 8/10/2023 9.0 \n", + "135 136 Vancouver, Canada 2023-08 8/21/2023 6.0 \n", + "136 137 Bangkok, Thailand 2023-09 9/8/2023 7.0 \n", + "137 138 Barcelona, Spain 2023-09 9/22/2023 7.0 \n", + "138 139 Auckland, New Zealand 2023-10 10/8/2023 7.0 \n", + "\n", + " traveler_name traveler_age traveler_gender traveler_nationality \\\n", + "0 John Smith 35.0 Male American \n", + "1 Jane Doe 28.0 Female Canadian \n", + "2 David Lee 45.0 Male Korean \n", + "3 Sarah Johnson 29.0 Female British \n", + "4 Kim Nguyen 26.0 Female Vietnamese \n", + ".. ... ... ... ... \n", + "134 Jose Perez 37.0 Male Brazilian \n", + "135 Emma Wilson 29.0 Female Canadian \n", + "136 Ryan Chen 34.0 Male Chinese \n", + "137 Sofia Rodriguez 25.0 Female Spanish \n", + "138 William Brown 39.0 Male New Zealander \n", + "\n", + " accommodation_type accommodation_cost transportation_type \\\n", + "0 Hotel 1200 Flight \n", + "1 Resort 800 Flight \n", + "2 Villa 1000 Flight \n", + "3 Hotel 2000 Flight \n", + "4 Airbnb 700 Train \n", + ".. ... ... ... \n", + "134 Hostel 2500 Car \n", + "135 Hotel 5000 Airplane \n", + "136 Hostel 2000 Train \n", + "137 Airbnb 6000 Airplane \n", + "138 Hotel 7000 Train \n", + "\n", + " transportation_cost \n", + "0 600 \n", + "1 500 \n", + "2 700 \n", + "3 1000 \n", + "4 200 \n", + ".. ... 
\n", + "134 2000 \n", + "135 3000 \n", + "136 1000 \n", + "137 2500 \n", + "138 2500 \n", + "\n", + "[139 rows x 13 columns]" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " travel_trip_hoaithuong_df" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "9b426ca7-b105-4e6c-a965-a3d2f9e1794d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2023-05-01\n", + "1 2023-06-01\n", + "2 2023-07-01\n", + "3 2023-08-01\n", + "4 2023-09-01\n", + " ... \n", + "134 2023-08-01\n", + "135 2023-08-01\n", + "136 2023-09-01\n", + "137 2023-09-01\n", + "138 2023-10-01\n", + "Name: start_date, Length: 139, dtype: datetime64[ns]" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['start_date'] = pd.to_datetime(travel_trip_hoaithuong_df['start_date'], errors='coerce')\n", + "travel_trip_hoaithuong_df['start_date']" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "8ed625ad-7d44-4c07-a604-8d2e7f84843c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 May\n", + "1 June\n", + "2 July\n", + "3 August\n", + "4 September\n", + " ... \n", + "134 August\n", + "135 August\n", + "136 September\n", + "137 September\n", + "138 October\n", + "Name: start_month, Length: 139, dtype: object" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['start_month'] = travel_trip_hoaithuong_df['start_date'].dt.strftime('%B')\n", + "travel_trip_hoaithuong_df['start_month']" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "f450c9cf-f145-4683-be6d-966e7b034201", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_iddestinationstart_dateend_dateduration_daystraveler_nametraveler_agetraveler_gendertraveler_nationalityaccommodation_typeaccommodation_costtransportation_typetransportation_coststart_month
01London, UK2023-05-015/8/20237.0John Smith35.0MaleAmericanHotel1200Flight600May
12Phuket, Thailand2023-06-016/20/20235.0Jane Doe28.0FemaleCanadianResort800Flight500June
23Bali, Indonesia2023-07-017/8/20237.0David Lee45.0MaleKoreanVilla1000Flight700July
34New York, USA2023-08-018/29/202314.0Sarah Johnson29.0FemaleBritishHotel2000Flight1000August
45Tokyo, Japan2023-09-019/17/20237.0Kim Nguyen26.0FemaleVietnameseAirbnb700Train200September
\n", + "
" + ], + "text/plain": [ + " trip_id destination start_date end_date duration_days \\\n", + "0 1 London, UK 2023-05-01 5/8/2023 7.0 \n", + "1 2 Phuket, Thailand 2023-06-01 6/20/2023 5.0 \n", + "2 3 Bali, Indonesia 2023-07-01 7/8/2023 7.0 \n", + "3 4 New York, USA 2023-08-01 8/29/2023 14.0 \n", + "4 5 Tokyo, Japan 2023-09-01 9/17/2023 7.0 \n", + "\n", + " traveler_name traveler_age traveler_gender traveler_nationality \\\n", + "0 John Smith 35.0 Male American \n", + "1 Jane Doe 28.0 Female Canadian \n", + "2 David Lee 45.0 Male Korean \n", + "3 Sarah Johnson 29.0 Female British \n", + "4 Kim Nguyen 26.0 Female Vietnamese \n", + "\n", + " accommodation_type accommodation_cost transportation_type \\\n", + "0 Hotel 1200 Flight \n", + "1 Resort 800 Flight \n", + "2 Villa 1000 Flight \n", + "3 Hotel 2000 Flight \n", + "4 Airbnb 700 Train \n", + "\n", + " transportation_cost start_month \n", + "0 600 May \n", + "1 500 June \n", + "2 700 July \n", + "3 1000 August \n", + "4 200 September " + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "90974fd4-9cbc-414b-a0c9-790f5685badc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_iddestinationstart_dateend_dateduration_daystraveler_nametraveler_agetraveler_gendertraveler_nationalityaccommodation_typeaccommodation_costtransportation_typetransportation_coststart_monthstart_year
01London, UK2023-05-015/8/20237.0John Smith35.0MaleAmericanHotel1200Flight600May2023
12Phuket, Thailand2023-06-016/20/20235.0Jane Doe28.0FemaleCanadianResort800Flight500June2023
23Bali, Indonesia2023-07-017/8/20237.0David Lee45.0MaleKoreanVilla1000Flight700July2023
34New York, USA2023-08-018/29/202314.0Sarah Johnson29.0FemaleBritishHotel2000Flight1000August2023
45Tokyo, Japan2023-09-019/17/20237.0Kim Nguyen26.0FemaleVietnameseAirbnb700Train200September2023
\n", + "
" + ], + "text/plain": [ + " trip_id destination start_date end_date duration_days \\\n", + "0 1 London, UK 2023-05-01 5/8/2023 7.0 \n", + "1 2 Phuket, Thailand 2023-06-01 6/20/2023 5.0 \n", + "2 3 Bali, Indonesia 2023-07-01 7/8/2023 7.0 \n", + "3 4 New York, USA 2023-08-01 8/29/2023 14.0 \n", + "4 5 Tokyo, Japan 2023-09-01 9/17/2023 7.0 \n", + "\n", + " traveler_name traveler_age traveler_gender traveler_nationality \\\n", + "0 John Smith 35.0 Male American \n", + "1 Jane Doe 28.0 Female Canadian \n", + "2 David Lee 45.0 Male Korean \n", + "3 Sarah Johnson 29.0 Female British \n", + "4 Kim Nguyen 26.0 Female Vietnamese \n", + "\n", + " accommodation_type accommodation_cost transportation_type \\\n", + "0 Hotel 1200 Flight \n", + "1 Resort 800 Flight \n", + "2 Villa 1000 Flight \n", + "3 Hotel 2000 Flight \n", + "4 Airbnb 700 Train \n", + "\n", + " transportation_cost start_month start_year \n", + "0 600 May 2023 \n", + "1 500 June 2023 \n", + "2 700 July 2023 \n", + "3 1000 August 2023 \n", + "4 200 September 2023 " + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df[\"start_date\"] = pd.to_datetime(\n", + " travel_trip_hoaithuong_df[\"start_date\"], errors=\"coerce\"\n", + ")\n", + "\n", + "travel_trip_hoaithuong_df[\"start_year\"] = (\n", + " travel_trip_hoaithuong_df[\"start_date\"].dt.year.astype(\"Int64\")\n", + ")\n", + "travel_trip_hoaithuong_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "id": "2cd814fb-da83-4583-9f34-97372e41ee74", + "metadata": {}, + "outputs": [], + "source": [ + "#Clean columns: traveler_nationality" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "d36774ee-16ad-4b7c-8b92-5b4ac81c589f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['American', 'Canadian', 'Korean', 'British', 'Vietnamese',\n", + " 'Australian', 'Brazilian', 'Dutch', 'Emirati', 'Mexican',\n", + " 
'Spanish', 'Chinese', 'German', 'Moroccan', 'Scottish', 'Japanese',\n", + " 'Italian', 'Indian', 'South Korean', 'French', nan,\n", + " 'South African', 'Taiwanese', 'Indonesian', 'USA', 'Canada',\n", + " 'South Korea', 'UK', 'China', 'Taiwan', 'Japan', 'Spain', 'Brazil',\n", + " 'Germany', 'Hong Kong', 'United Kingdom', 'Singapore', 'Italy',\n", + " 'Greece', 'United Arab Emirates', 'Cambodia', 'New Zealander'],\n", + " dtype=object)" + ] + }, + "execution_count": 162, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " travel_trip_hoaithuong_df['traveler_nationality'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "2e165506-1ee0-44c9-91a5-907d9e6884bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 American\n", + "1 Canadian\n", + "2 Korean\n", + "3 British\n", + "4 Vietnamese\n", + " ... \n", + "134 Brazilian\n", + "135 Canadian\n", + "136 Chinese\n", + "137 Spanish\n", + "138 New Zealander\n", + "Name: traveler_nationality, Length: 139, dtype: object" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality'].str.strip().str.lower().str.title()" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "2ea7355a-8df1-4bc6-b29b-e893e1221394", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "traveler_nationality\n", + "American 24\n", + "Korean 13\n", + "British 12\n", + "Canadian 9\n", + "Australian 8\n", + "Spanish 7\n", + "Chinese 7\n", + "Italian 4\n", + "Brazilian 4\n", + "Indian 4\n", + "Vietnamese 3\n", + "South Korea 3\n", + "South Korean 3\n", + "Taiwan 2\n", + "Canada 2\n", + "USA 2\n", + "South African 2\n", + "Japanese 2\n", + "Mexican 2\n", + "Emirati 2\n", + "Dutch 2\n", + "Brazil 1\n", + "Cambodia 1\n", + "United Arab Emirates 1\n", + "Greece 1\n", + "Italy 1\n", + "Singapore 1\n", + "United Kingdom 1\n", + "Hong Kong 1\n", + 
"Germany 1\n", + "Japan 1\n", + "Spain 1\n", + "Scottish 1\n", + "China 1\n", + "UK 1\n", + "German 1\n", + "Indonesian 1\n", + "Taiwanese 1\n", + "Moroccan 1\n", + "French 1\n", + "New Zealander 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "b4b7ab06-6da3-40c9-9bbf-62a0d321cd76", + "metadata": {}, + "outputs": [], + "source": [ + "mapping = {\n", + " 'American': 'United States',\n", + " 'USA': 'United States',\n", + " 'Canada': 'Canada',\n", + " 'Canadian': 'Canada',\n", + " 'UK': 'United Kingdom',\n", + " 'United Kingdom': 'United Kingdom',\n", + " 'British': 'United Kingdom',\n", + " 'Scottish': 'United Kingdom',\n", + " 'Korean': 'South Korea',\n", + " 'South Korea': 'South Korea',\n", + " 'South Korean': 'South Korea',\n", + " 'China': 'China',\n", + " 'Chinese': 'China',\n", + " 'Italy': 'Italy',\n", + " 'Italian': 'Italy',\n", + " 'Brazil': 'Brazil',\n", + " 'Brazilian': 'Brazil',\n", + " 'Emirati': 'United Arab Emirates',\n", + " 'United Arab Emirates': 'United Arab Emirates',\n", + " 'Taiwan': 'Taiwan',\n", + " 'Taiwanese': 'Taiwan',\n", + " 'Germany': 'Germany',\n", + " 'German': 'Germany',\n", + " 'Spain': 'Spain',\n", + " 'Spanish': 'Spain',\n", + " 'Japan': 'Japan',\n", + " 'Japanese': 'Japan'\n", + "}\n", + "\n", + "travel_trip_hoaithuong_df['traveler_nationality_clean'] = travel_trip_hoaithuong_df['traveler_nationality'].replace(mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "6044065c-4763-4f33-8e14-4d404c9de96c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "traveler_nationality_clean\n", + "United States 26\n", + "South Korea 19\n", + "United Kingdom 15\n", + "Canada 11\n", + "China 8\n", + "Spain 8\n", + "Australian 8\n", + "Brazil 5\n", + "Italy 5\n", + 
"Indian 4\n", + "United Arab Emirates 3\n", + "Vietnamese 3\n", + "Japan 3\n", + "Taiwan 3\n", + "Dutch 2\n", + "Mexican 2\n", + "Germany 2\n", + "South African 2\n", + "Singapore 1\n", + "Cambodia 1\n", + "Greece 1\n", + "Moroccan 1\n", + "Hong Kong 1\n", + "Indonesian 1\n", + "French 1\n", + "New Zealander 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality_clean'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "f473b554-a78f-4431-8c5e-06980976e343", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "5bfed24f-5b63-4330-9e2d-7d6edfce67a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 American\n", + "1 Canadian\n", + "2 Korean\n", + "3 British\n", + "4 Vietnamese\n", + " ... 
\n", + "134 Brazilian\n", + "135 Canadian\n", + "136 Chinese\n", + "137 Spanish\n", + "138 New Zealander\n", + "Name: traveler_nationality, Length: 139, dtype: string" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality'].astype('string')" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "9c363536-ca86-417a-aa9a-59b37db0119c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(2)" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_nationality'].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "85f9faa5-c85d-48b3-b0b0-3dcea796fdaf", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean columns traveler_gender" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "68ceab2f-3ae5-425d-ba28-704a44289be7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Male', 'Female', nan], dtype=object)" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_gender'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "0e794a05-5879-4857-84f9-1674d4212a1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "traveler_gender\n", + "Female 70\n", + "Male 67\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_gender'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "e87e63d4-c180-4263-820b-abf80afe31e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(2)" + ] + }, + "execution_count": 173, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_gender'].isna().sum() " + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "8a073849-ae87-44aa-bd84-3916171634f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Male\n", + "1 Female\n", + "2 Male\n", + "3 Female\n", + "4 Female\n", + " ... \n", + "134 Male\n", + "135 Female\n", + "136 Male\n", + "137 Female\n", + "138 Male\n", + "Name: traveler_gender, Length: 139, dtype: object" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df['traveler_gender'].str.strip().str.title()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "7a432295-fa6c-474e-b3ad-98835f8dc5db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "traveler_gender\n", + "Female 70\n", + "Male 67\n", + "Unknown 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "travel_trip_hoaithuong_df[\"traveler_gender\"] = (\n", + " travel_trip_hoaithuong_df[\"traveler_gender\"].fillna(\"Unknown\")\n", + ")\n", + "travel_trip_hoaithuong_df['traveler_gender'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6b32cd-c090-4b88-97a6-f9c8d647e7b7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9a0312b-3d0f-4561-bd4c-7591e903ce7d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.13.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}