In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [3]:
# ===================================================
# Einlesen und erste Analyse des Datensatzes
# -- Der Hotelbuchungs-Datensatz wird geladen
# -- Überblick über die Struktur, Form und fehlende Werte
# ===================================================
In [4]:
df = pd.read_csv(r'C:\Users\Miso\Desktop\Data Analysis 2/hotel_bookings.csv')
df.head()
Out[4]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 |
5 rows × 32 columns
In [5]:
df.shape
Out[5]:
(119390, 32)
In [6]:
df.isnull().values.any()
Out[6]:
np.True_
In [7]:
df.isnull().sum()
Out[7]:
hotel 0 is_canceled 0 lead_time 0 arrival_date_year 0 arrival_date_month 0 arrival_date_week_number 0 arrival_date_day_of_month 0 stays_in_weekend_nights 0 stays_in_week_nights 0 adults 0 children 4 babies 0 meal 0 country 488 market_segment 0 distribution_channel 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 reserved_room_type 0 assigned_room_type 0 booking_changes 0 deposit_type 0 agent 16340 company 112593 days_in_waiting_list 0 customer_type 0 adr 0 required_car_parking_spaces 0 total_of_special_requests 0 reservation_status 0 reservation_status_date 0 dtype: int64
In [8]:
df.fillna(0, inplace=True)
In [9]:
df['meal'].value_counts()
Out[9]:
meal BB 92310 HB 14463 SC 10650 Undefined 1169 FB 798 Name: count, dtype: int64
In [10]:
df['children'].unique()
df['adults'].unique()
df['babies'].unique()
Out[10]:
array([ 0, 1, 2, 10, 9])
In [11]:
# ===================================================
# Ungültige Buchungen filtern
# -- Entfernt Buchungen ohne jegliche Gäste (Kinder, Erwachsene, Babys = 0)
# ===================================================
In [12]:
filter=(df['children']==0) & (df['adults']==0) & (df['babies']==0)
df[filter]
Out[12]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2224 | Resort Hotel | 0 | 1 | 2015 | October | 41 | 6 | 0 | 3 | 0 | ... | No Deposit | 0.0 | 174.0 | 0 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 10/6/2015 |
2409 | Resort Hotel | 0 | 0 | 2015 | October | 42 | 12 | 0 | 0 | 0 | ... | No Deposit | 0.0 | 174.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 10/12/2015 |
3181 | Resort Hotel | 0 | 36 | 2015 | November | 47 | 20 | 1 | 2 | 0 | ... | No Deposit | 38.0 | 0.0 | 0 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 11/23/2015 |
3684 | Resort Hotel | 0 | 165 | 2015 | December | 53 | 30 | 1 | 4 | 0 | ... | No Deposit | 308.0 | 0.0 | 122 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 1/4/2016 |
3708 | Resort Hotel | 0 | 165 | 2015 | December | 53 | 30 | 2 | 4 | 0 | ... | No Deposit | 308.0 | 0.0 | 122 | Transient-Party | 0.00 | 0 | 0 | Check-Out | 1/5/2016 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
115029 | City Hotel | 0 | 107 | 2017 | June | 26 | 27 | 0 | 3 | 0 | ... | No Deposit | 7.0 | 0.0 | 0 | Transient | 100.80 | 0 | 0 | Check-Out | 6/30/2017 |
115091 | City Hotel | 0 | 1 | 2017 | June | 26 | 30 | 0 | 1 | 0 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.00 | 1 | 1 | Check-Out | 7/1/2017 |
116251 | City Hotel | 0 | 44 | 2017 | July | 28 | 15 | 1 | 1 | 0 | ... | No Deposit | 425.0 | 0.0 | 0 | Transient | 73.80 | 0 | 0 | Check-Out | 7/17/2017 |
116534 | City Hotel | 0 | 2 | 2017 | July | 28 | 15 | 2 | 5 | 0 | ... | No Deposit | 9.0 | 0.0 | 0 | Transient-Party | 22.86 | 0 | 1 | Check-Out | 7/22/2017 |
117087 | City Hotel | 0 | 170 | 2017 | July | 30 | 27 | 0 | 2 | 0 | ... | No Deposit | 52.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 7/29/2017 |
180 rows × 32 columns
In [13]:
data=df[~filter]
In [14]:
# ===================================================
# Gästeherkunft analysieren (nur gültige, nicht stornierte Buchungen)
# -- Filtert Daten nach Hoteltyp und ob storniert wurde
# ===================================================
In [15]:
resort=data[ (data['hotel']=='Resort Hotel') & (data['is_canceled']==0) ]
city=data[(data['hotel']=='City Hotel') & (data['is_canceled']==0)]
In [16]:
resort.head()
city.head()
Out[16]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
40060 | City Hotel | 0 | 6 | 2015 | July | 27 | 1 | 0 | 2 | 1 | ... | No Deposit | 6.0 | 0.0 | 0 | Transient | 0.00 | 0 | 0 | Check-Out | 7/3/2015 |
40066 | City Hotel | 0 | 3 | 2015 | July | 27 | 2 | 0 | 3 | 1 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 58.67 | 0 | 0 | Check-Out | 7/5/2015 |
40070 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 7/5/2015 |
40071 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 43.00 | 0 | 0 | Check-Out | 7/5/2015 |
40072 | City Hotel | 0 | 43 | 2015 | July | 27 | 3 | 0 | 2 | 2 | ... | No Deposit | 1.0 | 0.0 | 0 | Transient-Party | 86.00 | 0 | 0 | Check-Out | 7/5/2015 |
5 rows × 32 columns
In [17]:
!pip install plotly
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
Requirement already satisfied: plotly in c:\users\miso\anaconda3\lib\site-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\miso\anaconda3\lib\site-packages (from plotly) (9.0.0) Requirement already satisfied: packaging in c:\users\miso\anaconda3\lib\site-packages (from plotly) (24.2)
In [18]:
# ===================================================
# Kreisdiagramm zur Länderherkunft mit Plotly
# -- Visualisiert, aus welchen Ländern Gäste im Resort Hotel stammen
# ===================================================
In [19]:
labels=resort['country'].value_counts().index
values=resort['country'].value_counts()
In [20]:
trace=go.Pie(labels=labels, values=values,hoverinfo='label+percent', textinfo='value')
In [21]:
iplot([trace])
In [22]:
# ===================================================
# Choroplethenkarte: Anzahl Gäste nach Herkunftsland
# -- Visualisiert die geografische Verteilung der Gäste
# ===================================================
In [23]:
country_wise_data=data[data['is_canceled']==0]['country'].value_counts().reset_index()
country_wise_data.columns=['country','Number of Guests']
country_wise_data.head()
Out[23]:
country | Number of Guests | |
---|---|---|
0 | PRT | 20977 |
1 | GBR | 9668 |
2 | FRA | 8468 |
3 | ESP | 6383 |
4 | DEU | 6067 |
In [24]:
px.choropleth(country_wise_data,
locations=country_wise_data['country'],
color=country_wise_data['Number of Guests'],
hover_name=country_wise_data['country'],
title='Home country of Guests')
In [25]:
data.head()
Out[25]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 |
5 rows × 32 columns
In [26]:
# ===================================================
# Zimmerpreise pro Zimmerart und Hoteltyp
# -- Vergleicht durchschnittliche Tagespreise für Zimmer
# ===================================================
In [27]:
data2=data[data['is_canceled']==0]
In [28]:
plt.figure(figsize=(12,8))
sns.boxplot(x='reserved_room_type', y='adr', data=data2, hue='hotel')
plt.title('Price of Room Types per Night and per person', fontsize=16)
plt.xlabel('Room_type')
plt.ylabel('Price in [EUR]')
plt.show()
In [29]:
# ===================================================
# Preisentwicklung übers Jahr hinweg
# -- Zeigt, wie sich die durchschnittlichen Preise pro Monat verändern
# ===================================================
In [30]:
data_resort=resort[resort['is_canceled']==0]
data_city=city[city['is_canceled']==0]
In [31]:
resort_hotel=data_resort.groupby('arrival_date_month')['adr'].mean().reset_index()
city_hotel=data_city.groupby('arrival_date_month')['adr'].mean().reset_index()
In [32]:
city_hotel
Out[32]:
arrival_date_month | adr | |
---|---|---|
0 | April | 111.962267 |
1 | August | 118.674598 |
2 | December | 88.401855 |
3 | February | 86.520062 |
4 | January | 82.330983 |
5 | July | 115.818019 |
6 | June | 117.874360 |
7 | March | 90.658533 |
8 | May | 120.669827 |
9 | November | 86.946592 |
10 | October | 102.004672 |
11 | September | 112.776582 |
In [33]:
final=resort_hotel.merge(city_hotel,on='arrival_date_month')
final.columns=['month','price_for_resort','price_for_city_hotel']
final.head()
Out[33]:
month | price_for_resort | price_for_city_hotel | |
---|---|---|---|
0 | April | 75.867816 | 111.962267 |
1 | August | 181.205892 | 118.674598 |
2 | December | 68.410104 | 88.401855 |
3 | February | 54.147478 | 86.520062 |
4 | January | 48.761125 | 82.330983 |
In [34]:
!pip install sorted-months-weekdays
!pip install sort-dataframeby-monthorweek
Requirement already satisfied: sorted-months-weekdays in c:\users\miso\anaconda3\lib\site-packages (0.2) Requirement already satisfied: sort-dataframeby-monthorweek in c:\users\miso\anaconda3\lib\site-packages (0.4)
In [35]:
from sort_dataframeby_monthorweek import Sort_Dataframeby_Month
In [36]:
final2=Sort_Dataframeby_Month(final,'month')
final2.head()
Out[36]:
month | price_for_resort | price_for_city_hotel | |
---|---|---|---|
0 | January | 48.761125 | 82.330983 |
1 | February | 54.147478 | 86.520062 |
2 | March | 57.056838 | 90.658533 |
3 | April | 75.867816 | 111.962267 |
4 | May | 76.657558 | 120.669827 |
In [37]:
px.line(final2,x='month', y=['price_for_resort','price_for_city_hotel'],title='Room price per night over the year')
In [38]:
# ===================================================
# Analyse der Aufenthaltsdauer nach Marktsegment
# -- Veranschaulicht, wie lange Gäste je nach Marktsegment bleiben
# ===================================================
In [39]:
plt.figure(figsize=(15,10))
sns.boxplot(x='market_segment',y='stays_in_weekend_nights', data=data, hue='hotel')
Out[39]:
<Axes: xlabel='market_segment', ylabel='stays_in_weekend_nights'>
In [40]:
data['meal'].value_counts()
Out[40]:
meal BB 92236 HB 14458 SC 10549 Undefined 1169 FB 798 Name: count, dtype: int64
In [41]:
# ===================================================
# Verteilung der gebuchten Mahlzeiten
# -- Zeigt, welche Verpflegungsarten am häufigsten gebucht wurden
# ===================================================
In [42]:
px.pie(data_frame=
data['meal'].value_counts().rename_axis('meal').reset_index(name='count'),
values='count',
names='meal',
hole=0.5)
In [43]:
data.head()
Out[43]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 |
5 rows × 32 columns
In [44]:
# ===================================================
# Sonderwünsche und Stornierungen
# -- Zusammenhang zwischen Anzahl Sonderwünsche und Stornoverhalten
# ===================================================
In [52]:
sns.countplot(data=data.sample(1000), x='total_of_special_requests')
plt.show()
In [53]:
data.columns
Out[53]:
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'reservation_status_date'], dtype='object')
In [57]:
pivot=data.groupby(['total_of_special_requests','is_canceled']).agg({'total_of_special_requests':'count'}).rename({'total_of_special_requests':'count'}).unstack()
pivot
Out[57]:
total_of_special_requests | ||
---|---|---|
is_canceled | 0 | 1 |
total_of_special_requests | ||
0 | 36667 | 33534 |
1 | 25867 | 7316 |
2 | 10086 | 2866 |
3 | 2049 | 445 |
4 | 304 | 36 |
5 | 38 | 2 |
In [58]:
pivot.plot(kind='bar')
Out[58]:
<Axes: xlabel='total_of_special_requests'>
In [59]:
# ===================================================
# Gästeströme über Monate analysieren
# -- Vergleich der Gästezahlen im Resort und City Hotel
# ===================================================
In [65]:
rush_resort=data_resort['arrival_date_month'].value_counts().reset_index()
rush_resort.columns=['month','number of guests']
rush_resort
Out[65]:
month | number of guests | |
---|---|---|
0 | August | 3257 |
1 | July | 3137 |
2 | October | 2575 |
3 | March | 2571 |
4 | April | 2550 |
5 | May | 2535 |
6 | February | 2308 |
7 | September | 2102 |
8 | June | 2037 |
9 | December | 2014 |
10 | November | 1975 |
11 | January | 1866 |
In [69]:
rush_city=data_city['arrival_date_month'].value_counts().reset_index()
rush_city.columns=['month','number of guests']
rush_city
Out[69]:
month | number of guests | |
---|---|---|
0 | August | 5367 |
1 | July | 4770 |
2 | May | 4568 |
3 | June | 4358 |
4 | October | 4326 |
5 | September | 4283 |
6 | March | 4049 |
7 | April | 4010 |
8 | February | 3051 |
9 | November | 2676 |
10 | December | 2377 |
11 | January | 2249 |
In [70]:
final_rush=rush_resort.merge(rush_city,on='month')
final_rush.columns=['month','number of guests in resort','number of guests in city hotel']
final_rush
Out[70]:
month | number of guests in resort | number of guests in city hotel | |
---|---|---|---|
0 | August | 3257 | 5367 |
1 | July | 3137 | 4770 |
2 | October | 2575 | 4326 |
3 | March | 2571 | 4049 |
4 | April | 2550 | 4010 |
5 | May | 2535 | 4568 |
6 | February | 2308 | 3051 |
7 | September | 2102 | 4283 |
8 | June | 2037 | 4358 |
9 | December | 2014 | 2377 |
10 | November | 1975 | 2676 |
11 | January | 1866 | 2249 |
In [71]:
import sort_dataframeby_monthorweek as sd
In [72]:
final_rush2=sd.Sort_Dataframeby_Month(df=final_rush,monthcolumnname='month')
final_rush2
Out[72]:
month | number of guests in resort | number of guests in city hotel | |
---|---|---|---|
0 | January | 1866 | 2249 |
1 | February | 2308 | 3051 |
2 | March | 2571 | 4049 |
3 | April | 2550 | 4010 |
4 | May | 2535 | 4568 |
5 | June | 2037 | 4358 |
6 | July | 3137 | 4770 |
7 | August | 3257 | 5367 |
8 | September | 2102 | 4283 |
9 | October | 2575 | 4326 |
10 | November | 1975 | 2676 |
11 | December | 2014 | 2377 |
In [73]:
final_rush2.columns
Out[73]:
Index(['month', 'number of guests in resort', 'number of guests in city hotel'], dtype='object')
In [74]:
px.line(data_frame=final_rush2,x='month',y=['number of guests in resort','number of guests in city hotel'],title='Total number of guests per month')
In [75]:
filter=data['is_canceled']==0
clean_data=data[filter]
In [76]:
clean_data.head()
Out[76]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 |
2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 |
4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 |
5 rows × 32 columns
In [77]:
# ===================================================
# Aufenthaltsdauer vs. Hoteltyp
# -- Wie lange Gäste durchschnittlich bleiben – nach Hoteltyp
# ===================================================
In [78]:
clean_data['total_nights']=clean_data['stays_in_weekend_nights'] + clean_data['stays_in_week_nights']
C:\Users\Miso\AppData\Local\Temp\ipykernel_9756\3138704394.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [79]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')
In [80]:
clean_data.head()
Out[80]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | total_nights | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 | 0 |
1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | 0.0 | 0.0 | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 7/1/2015 | 0 |
2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 0.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 | 1 |
3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | 304.0 | 0.0 | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 7/2/2015 | 1 |
4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | 240.0 | 0.0 | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 7/3/2015 | 2 |
5 rows × 33 columns
In [81]:
stay=clean_data.groupby(['total_nights','hotel']).agg('count').reset_index()
stay=stay.iloc[:,0:3]
stay.head()
Out[81]:
total_nights | hotel | is_canceled | |
---|---|---|---|
0 | 0 | City Hotel | 251 |
1 | 0 | Resort Hotel | 371 |
2 | 1 | City Hotel | 9155 |
3 | 1 | Resort Hotel | 6579 |
4 | 2 | City Hotel | 10983 |
In [83]:
stay=stay.rename(columns={'is_canceled':'number of stays'})
stay.head()
Out[83]:
total_nights | hotel | number of stays | |
---|---|---|---|
0 | 0 | City Hotel | 251 |
1 | 0 | Resort Hotel | 371 |
2 | 1 | City Hotel | 9155 |
3 | 1 | Resort Hotel | 6579 |
4 | 2 | City Hotel | 10983 |
In [100]:
plt.figure(figsize=(15,8))
sns.barplot(x='total_nights',y='number of stays',hue='hotel',hue_order=['City Hotel','Resort Hotel'],data=stay)
Out[100]:
<Axes: xlabel='total_nights', ylabel='number of stays'>
In [85]:
clean_data.columns
Out[85]:
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'reservation_status_date', 'total_nights'], dtype='object')
In [86]:
clean_data['market_segment'].value_counts()
Out[86]:
market_segment Online TA 35673 Offline TA/TO 15880 Direct 10648 Groups 7697 Corporate 4291 Complementary 639 Aviation 183 Name: count, dtype: int64
In [87]:
# ===================================================
# Buchungen nach Marktsegment
# -- Verteilung und durchschnittlicher Preis je Segment und Zimmerart
# ===================================================
In [89]:
segment_counts = clean_data['market_segment'].value_counts().reset_index()
segment_counts.columns = ['market_segment', 'count']
# Step 2: Pass this to px.pie
fig = px.pie(
segment_counts,
names='market_segment',
values='count',
title='Bookings per Market Segment'
)
fig.show()
In [90]:
clean_data.columns
Out[90]:
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'reservation_status_date', 'total_nights'], dtype='object')
In [91]:
plt.figure(figsize=(15,8))
sns.barplot(x='market_segment',y='adr',hue='reserved_room_type',data=clean_data)
Out[91]:
<Axes: xlabel='market_segment', ylabel='adr'>
In [92]:
cancel=data[data['is_canceled']==1]
cancel.head()
Out[92]:
hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | Resort Hotel | 1 | 85 | 2015 | July | 27 | 1 | 0 | 3 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 82.0 | 0 | 1 | Canceled | 5/6/2015 |
9 | Resort Hotel | 1 | 75 | 2015 | July | 27 | 1 | 0 | 3 | 2 | ... | No Deposit | 15.0 | 0.0 | 0 | Transient | 105.5 | 0 | 0 | Canceled | 4/22/2015 |
10 | Resort Hotel | 1 | 23 | 2015 | July | 27 | 1 | 0 | 4 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 123.0 | 0 | 0 | Canceled | 6/23/2015 |
27 | Resort Hotel | 1 | 60 | 2015 | July | 27 | 1 | 2 | 5 | 2 | ... | No Deposit | 240.0 | 0.0 | 0 | Transient | 107.0 | 0 | 2 | Canceled | 5/11/2015 |
32 | Resort Hotel | 1 | 96 | 2015 | July | 27 | 1 | 2 | 8 | 2 | ... | No Deposit | 0.0 | 0.0 | 0 | Transient | 108.3 | 0 | 2 | Canceled | 5/29/2015 |
5 rows × 32 columns
In [93]:
# ===================================================
# Stornierungen nach Hoteltyp
# -- Wie viele Buchungen wurden je Hotel storniert?
# ===================================================
In [94]:
len(cancel[cancel['hotel']=='Resort Hotel'])
len(cancel[cancel['hotel']=='City Hotel'])
Out[94]:
33079
In [99]:
rh_cancellations = 11120
ch_cancellations = 33079
# Labels as strings
fig = px.pie(
values=[rh_cancellations, ch_cancellations],
names=['Resort Hotel', 'City Hotel'], # or ['RH', 'CH'] if you prefer short
title='Cancellations by Hotel Type'
)
fig.show()
In [ ]: