In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import os
In [4]:
# =======================================================
# 📂 Laden und Filtern der CSV-Dateien
# - Listet die letzten 8 Dateien im Uber-Datenordner auf
# - Entfernt ungewünschte Dateien aus der Analyse
# =======================================================
In [5]:
files = os.listdir(r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet')[-8:]
files
Out[5]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [6]:
files.remove('uber-raw-data-janjune-15_sample.csv')
In [7]:
files.remove('uber-raw-data-janjune-15.csv')
In [8]:
files
Out[8]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [9]:
# =======================================================
# 📁 Zusammenführen der ausgewählten CSV-Dateien
# - Liest jede Datei ein
# - Hängt alle Daten zu einem großen DataFrame `final` zusammen
# =======================================================
In [10]:
path = r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet'
final=pd.DataFrame()
for file in files:
df=pd.read_csv(path+"/"+file,encoding='utf-8')
final=pd.concat([df,final])
In [11]:
final.shape
Out[11]:
(4534327, 4)
In [12]:
df = final.copy()
In [13]:
df.head()
Out[13]:
Date/Time | Lat | Lon | Base | |
---|---|---|---|---|
0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
In [14]:
df.dtypes
Out[14]:
Date/Time object Lat float64 Lon float64 Base object dtype: object
In [15]:
df['Date/Time'] = pd.to_datetime(df['Date/Time'],format='%m/%d/%Y %H:%M:%S')
In [16]:
df.dtypes
Out[16]:
Date/Time datetime64[ns] Lat float64 Lon float64 Base object dtype: object
In [17]:
df.head()
Out[17]:
Date/Time | Lat | Lon | Base | |
---|---|---|---|---|
0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 |
1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 |
2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 |
3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 |
4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 |
In [18]:
# =======================================================
# 🧹 Datenvorverarbeitung
# - Umwandlung der Spalte 'Date/Time' in ein datetime-Format
# - Extraktion von Zeitkomponenten für spätere Analysen
# =======================================================
In [19]:
df['Weekday'] = df['Date/Time'].dt.day_name()
df['Month'] = df['Date/Time'].dt.month
df['Day'] = df['Date/Time'].dt.day
df['Hour'] = df['Date/Time'].dt.hour
df['Minute'] = df['Date/Time'].dt.minute
In [20]:
df.head()
Out[20]:
Date/Time | Lat | Lon | Base | Weekday | Month | Day | Hour | Minute | |
---|---|---|---|---|---|---|---|---|---|
0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 | Monday | 9 | 1 | 0 | 1 |
1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 | Monday | 9 | 1 | 0 | 1 |
2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 | Monday | 9 | 1 | 0 | 3 |
3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 | Monday | 9 | 1 | 0 | 6 |
4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 | Monday | 9 | 1 | 0 | 11 |
In [21]:
df['Weekday'].value_counts()
Out[21]:
Weekday Thursday 755145 Friday 741139 Wednesday 696488 Tuesday 663789 Saturday 646114 Monday 541472 Sunday 490180 Name: count, dtype: int64
In [22]:
# =======================================================
# 📊 Wochentagsverteilung visualisieren
# - Zählt Fahrten pro Wochentag
# - Erstellt eine interaktive Balkengrafik mit Plotly
# =======================================================
In [23]:
px.bar(x=df['Weekday'].value_counts().index,
y=df['Weekday'].value_counts()
)
In [24]:
# =======================================================
# 🕒 Stundenverteilung pro Monat
# - Zeigt in Histogrammen, wann (zu welcher Uhrzeit) im jeweiligen Monat die meisten Fahrten stattfanden
# =======================================================
In [25]:
plt.figure(figsize=(40,20))
for i, month in enumerate(df['Month'].unique()):
plt.subplot(3,2,i+1)
df[df['Month']==month]['Hour'].hist()
In [26]:
!pip install chart-studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
Requirement already satisfied: chart-studio in c:\users\miso\anaconda3\lib\site-packages (1.1.0) Requirement already satisfied: plotly in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (5.24.1) Requirement already satisfied: requests in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (2.32.3) Requirement already satisfied: retrying>=1.3.3 in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (1.4.1) Requirement already satisfied: six in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (1.17.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\miso\anaconda3\lib\site-packages (from plotly->chart-studio) (9.0.0) Requirement already satisfied: packaging in c:\users\miso\anaconda3\lib\site-packages (from plotly->chart-studio) (24.2) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (2.3.0) Requirement already satisfied: certifi>=2017.4.17 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (2025.7.14)
In [27]:
df.groupby('Month')['Hour'].count()
Out[27]:
Month 4 564516 5 652435 6 663844 7 796121 8 829275 9 1028136 Name: Hour, dtype: int64
In [28]:
# =======================================================
# 📈 Monatsverteilung der Fahrtenanzahl
# - Zeigt, wie viele Fahrten es pro Monat gibt
# - Interaktive Darstellung mit Plotly
# =======================================================
In [29]:
trace=go.Bar(
x = df.groupby('Month')['Hour'].count().index,
y = df.groupby('Month')['Hour'].count(),
name = 'Priority'
)
In [30]:
iplot([trace])
In [31]:
# =======================================================
# 📅 Verteilung der Fahrten innerhalb eines Monats
# - Histogramm der Tagesanzahl pro Monat
# =======================================================
In [32]:
plt.figure(figsize=(10,8))
plt.hist(df['Day'],bins=30, rwidth=0.5) # adding range=(0.5,30.5)) erzeugt die gleiche wie unten
plt.xlabel('Date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by month day')
Out[32]:
Text(0.5, 1.0, 'Journeys by month day')
In [33]:
sns.displot(df['Day'])
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x1b7857d16a0>
In [34]:
# =======================================================
# 📅📦 Tagesverteilung pro Monat
# - Histogramm für jeden Monat: Welche Tage im Monat waren besonders fahrtenintensiv?
# =======================================================
In [35]:
plt.figure(figsize=(40,20))
for i, Month in enumerate (df['Month'].unique(),1):
plt.subplot(3,2,i)
df_out=df[df['Month']==Month]
plt.hist(df_out['Day'])
plt.xlabel('Days in Month{}'.format(Month))
plt.ylabel('total_rides')
In [36]:
# =======================================================
# 📍 Zusammenhang von Uhrzeit und geographischer Breite
# - Scatterplot zur Untersuchung des Zusammenhangs zwischen Uhrzeit und Ort
# =======================================================
In [39]:
plt.figure(figsize=(12,6))
sns.pointplot(x='Hour', y='Lat', data=df, hue='Weekday')
#plt.set_title('Hours off day vs Latitude of Passenger')
Out[39]:
<Axes: xlabel='Hour', ylabel='Lat'>
In [ ]:
# =======================================================
# 📊 Fahrtenanzahl nach Monat und Basisstation
# - Gruppiert Fahrten nach Monat und Base-Code
# - Liniendiagramm zur Visualisierung
# =======================================================
In [31]:
base = df.groupby(['Base', 'Month'])['Date/Time'].count().reset_index()
base
Out[31]:
Base | Month | Date/Time | |
---|---|---|---|
0 | B02512 | 4 | 35536 |
1 | B02512 | 5 | 36765 |
2 | B02512 | 6 | 32509 |
3 | B02512 | 7 | 35021 |
4 | B02512 | 8 | 31472 |
5 | B02512 | 9 | 34370 |
6 | B02598 | 4 | 183263 |
7 | B02598 | 5 | 260549 |
8 | B02598 | 6 | 242975 |
9 | B02598 | 7 | 245597 |
10 | B02598 | 8 | 220129 |
11 | B02598 | 9 | 240600 |
12 | B02617 | 4 | 108001 |
13 | B02617 | 5 | 122734 |
14 | B02617 | 6 | 184460 |
15 | B02617 | 7 | 310160 |
16 | B02617 | 8 | 355803 |
17 | B02617 | 9 | 377695 |
18 | B02682 | 4 | 227808 |
19 | B02682 | 5 | 222883 |
20 | B02682 | 6 | 194926 |
21 | B02682 | 7 | 196754 |
22 | B02682 | 8 | 173280 |
23 | B02682 | 9 | 197138 |
24 | B02764 | 4 | 9908 |
25 | B02764 | 5 | 9504 |
26 | B02764 | 6 | 8974 |
27 | B02764 | 7 | 8589 |
28 | B02764 | 8 | 48591 |
29 | B02764 | 9 | 178333 |
In [32]:
plt.figure(figsize=(10,6))
sns.lineplot(x='Month', y='Date/Time', hue='Base', data=base)
Out[32]:
<Axes: xlabel='Month', ylabel='Date/Time'>
In [ ]:
# =======================================================
# 🔥 Heatmap: Anzahl der Fahrten pro Wochentag und Stunde
# - Gruppiert Fahrten nach Tag und Stunde
# - Erstellt eine Heatmap
# =======================================================
In [33]:
def count_rows(rows):
return len (rows)
In [34]:
by_cross = df.groupby(['Weekday', 'Hour']).apply(count_rows)
by_cross
C:\Users\Miso\AppData\Local\Temp\ipykernel_11320\3792439153.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
Out[34]:
Weekday Hour Friday 0 13716 1 8163 2 5350 3 6930 4 8806 ... Wednesday 19 47017 20 47772 21 44553 22 32868 23 18146 Length: 168, dtype: int64
In [35]:
pivot = by_cross.unstack()
pivot
Out[35]:
Hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Weekday | |||||||||||||||||||||
Friday | 13716 | 8163 | 5350 | 6930 | 8806 | 13450 | 23412 | 32061 | 31509 | 25230 | ... | 36206 | 43673 | 48169 | 51961 | 54762 | 49595 | 43542 | 48323 | 49409 | 41260 |
Monday | 6436 | 3737 | 2938 | 6232 | 9640 | 15032 | 23746 | 31159 | 29265 | 22197 | ... | 28157 | 32744 | 38770 | 42023 | 37000 | 34159 | 32849 | 28925 | 20158 | 11811 |
Saturday | 27633 | 19189 | 12710 | 9542 | 6846 | 7084 | 8579 | 11014 | 14411 | 17669 | ... | 31418 | 38769 | 43512 | 42844 | 45883 | 41098 | 38714 | 43826 | 47951 | 43174 |
Sunday | 32877 | 23015 | 15436 | 10597 | 6374 | 6169 | 6596 | 8728 | 12128 | 16401 | ... | 28151 | 31112 | 33038 | 31521 | 28291 | 25948 | 25076 | 23967 | 19566 | 12166 |
Thursday | 9293 | 5290 | 3719 | 5637 | 8505 | 14169 | 27065 | 37038 | 35431 | 27812 | ... | 36699 | 44442 | 50560 | 56704 | 55825 | 51907 | 51990 | 51953 | 44194 | 27764 |
Tuesday | 6237 | 3509 | 2571 | 4494 | 7548 | 14241 | 26872 | 36599 | 33934 | 25023 | ... | 34846 | 41338 | 48667 | 55500 | 50186 | 44789 | 44661 | 39913 | 27712 | 14869 |
Wednesday | 7644 | 4324 | 3141 | 4855 | 7511 | 13794 | 26943 | 36495 | 33826 | 25635 | ... | 35148 | 43388 | 50684 | 55637 | 52732 | 47017 | 47772 | 44553 | 32868 | 18146 |
7 rows × 24 columns
In [36]:
plt.figure(figsize=(12,6))
sns.heatmap(pivot)
Out[36]:
<Axes: xlabel='Hour', ylabel='Weekday'>
In [37]:
def heatmap(col1, col2):
by_cross = df.groupby([col1, col2]).apply(count_rows)
pivot = by_cross.unstack()
plt.figure(figsize=(12,6))
return sns.heatmap(pivot)
In [38]:
heatmap('Day', 'Hour')
C:\Users\Miso\AppData\Local\Temp\ipykernel_11320\3216948876.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
Out[38]:
<Axes: xlabel='Hour', ylabel='Day'>
In [ ]:
# =======================================================
# 🗺️ Darstellung aller Uber-Fahrten als Punktekarte
# - Sehr fein aufgelöstes Scatter-Plot der GPS-Koordinaten
# =======================================================
In [39]:
plt.figure(figsize=(12,6))
plt.plot(df['Lon'], df['Lat'], 'r+', ms=0.5)
plt.xlim(-74.2, -73.7)
plt.ylim(40.6,41)
Out[39]:
(40.6, 41.0)
In [ ]:
# =======================================================
# 🗺️ Heatmap für Fahrten an Sonntagen
# - Gruppiert nach GPS-Koordinaten
# - Nutzt Folium zur Erstellung einer interaktiven Karte
# =======================================================
In [45]:
df_out=df[df['Weekday']=='Sunday']
#df_out.shape
#df_out.head()
rush=df_out.groupby(['Lat','Lon'])['Weekday'].count().reset_index()
rush.columns=['Lat', 'Lon', 'Number of Trips']
rush
Out[45]:
Lat | Lon | Number of Trips | |
---|---|---|---|
0 | 39.9374 | -74.0722 | 1 |
1 | 39.9378 | -74.0721 | 1 |
2 | 39.9384 | -74.0742 | 1 |
3 | 39.9385 | -74.0734 | 1 |
4 | 39.9415 | -74.0736 | 1 |
... | ... | ... | ... |
209225 | 41.3141 | -74.1249 | 1 |
209226 | 41.3180 | -74.1298 | 1 |
209227 | 41.3195 | -73.6905 | 1 |
209228 | 41.3197 | -73.6903 | 1 |
209229 | 42.1166 | -72.0666 | 1 |
209230 rows × 3 columns
In [48]:
!pip install folium
from folium.plugins import HeatMap
import folium
basemap=folium.Map()
Requirement already satisfied: folium in c:\users\miso\anaconda3\lib\site-packages (0.20.0) Requirement already satisfied: branca>=0.6.0 in c:\users\miso\anaconda3\lib\site-packages (from folium) (0.8.1) Requirement already satisfied: jinja2>=2.9 in c:\users\miso\anaconda3\lib\site-packages (from folium) (3.1.6) Requirement already satisfied: numpy in c:\users\miso\anaconda3\lib\site-packages (from folium) (2.1.3) Requirement already satisfied: requests in c:\users\miso\anaconda3\lib\site-packages (from folium) (2.32.3) Requirement already satisfied: xyzservices in c:\users\miso\anaconda3\lib\site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\miso\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (3.0.2) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (2.3.0) Requirement already satisfied: certifi>=2017.4.17 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (2025.7.14)
In [49]:
HeatMap(rush,zoom=20,radius=15).add_to(basemap)
basemap
Out[49]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
# =======================================================
# 🗺️ Allgemeine Funktion zur Erstellung von Heatmaps je Wochentag
# - Erstellt interaktive Karten für beliebige Tage
# =======================================================
In [62]:
def plot(df, day):
basemap=folium.Map()
df_out=df[df['Weekday']==day]
HeatMap(df_out.groupby(['Lat', 'Lon'])['Weekday'].count().reset_index(),zoom=20, radius=15).add_to(basemap)
return basemap
In [63]:
plot(df, 'Saturday')
Out[63]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
uber_15=pd.read_csv(r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet\uber-raw-data-janjune-15_sample.csv')