In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import os
In [4]:
# =======================================================
# 📂 Laden und Filtern der CSV-Dateien
# - Listet die letzten 8 Dateien im Uber-Datenordner auf
# - Entfernt ungewünschte Dateien aus der Analyse
# =======================================================
In [5]:
files = os.listdir(r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet')[-8:]
files
Out[5]:
['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-janjune-15.csv',
 'uber-raw-data-janjune-15_sample.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [6]:
files.remove('uber-raw-data-janjune-15_sample.csv')
In [7]:
files.remove('uber-raw-data-janjune-15.csv')
In [8]:
files
Out[8]:
['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']
In [9]:
# =======================================================
# 📁 Zusammenführen der ausgewählten CSV-Dateien
# - Liest jede Datei ein
# - Hängt alle Daten zu einem großen DataFrame `final` zusammen
# =======================================================
In [10]:
path = r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet'

final=pd.DataFrame()

for file in files:
    df=pd.read_csv(path+"/"+file,encoding='utf-8')
    final=pd.concat([df,final])
In [11]:
final.shape
Out[11]:
(4534327, 4)
In [12]:
df = final.copy()
In [13]:
df.head()
Out[13]:
Date/Time Lat Lon Base
0 9/1/2014 0:01:00 40.2201 -74.0021 B02512
1 9/1/2014 0:01:00 40.7500 -74.0027 B02512
2 9/1/2014 0:03:00 40.7559 -73.9864 B02512
3 9/1/2014 0:06:00 40.7450 -73.9889 B02512
4 9/1/2014 0:11:00 40.8145 -73.9444 B02512
In [14]:
df.dtypes
Out[14]:
Date/Time     object
Lat          float64
Lon          float64
Base          object
dtype: object
In [15]:
df['Date/Time'] = pd.to_datetime(df['Date/Time'],format='%m/%d/%Y %H:%M:%S')
In [16]:
df.dtypes
Out[16]:
Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object
In [17]:
df.head()
Out[17]:
Date/Time Lat Lon Base
0 2014-09-01 00:01:00 40.2201 -74.0021 B02512
1 2014-09-01 00:01:00 40.7500 -74.0027 B02512
2 2014-09-01 00:03:00 40.7559 -73.9864 B02512
3 2014-09-01 00:06:00 40.7450 -73.9889 B02512
4 2014-09-01 00:11:00 40.8145 -73.9444 B02512
In [18]:
# =======================================================
# 🧹 Datenvorverarbeitung
# - Umwandlung der Spalte 'Date/Time' in ein datetime-Format
# - Extraktion von Zeitkomponenten für spätere Analysen
# =======================================================
In [19]:
df['Weekday'] = df['Date/Time'].dt.day_name()
df['Month'] = df['Date/Time'].dt.month
df['Day'] = df['Date/Time'].dt.day
df['Hour'] = df['Date/Time'].dt.hour
df['Minute'] = df['Date/Time'].dt.minute
In [20]:
df.head()
Out[20]:
Date/Time Lat Lon Base Weekday Month Day Hour Minute
0 2014-09-01 00:01:00 40.2201 -74.0021 B02512 Monday 9 1 0 1
1 2014-09-01 00:01:00 40.7500 -74.0027 B02512 Monday 9 1 0 1
2 2014-09-01 00:03:00 40.7559 -73.9864 B02512 Monday 9 1 0 3
3 2014-09-01 00:06:00 40.7450 -73.9889 B02512 Monday 9 1 0 6
4 2014-09-01 00:11:00 40.8145 -73.9444 B02512 Monday 9 1 0 11
In [21]:
df['Weekday'].value_counts()
Out[21]:
Weekday
Thursday     755145
Friday       741139
Wednesday    696488
Tuesday      663789
Saturday     646114
Monday       541472
Sunday       490180
Name: count, dtype: int64
In [22]:
# =======================================================
# 📊 Wochentagsverteilung visualisieren
# - Zählt Fahrten pro Wochentag
# - Erstellt eine interaktive Balkengrafik mit Plotly
# =======================================================
In [23]:
px.bar(x=df['Weekday'].value_counts().index,
       y=df['Weekday'].value_counts()
      )
In [24]:
# =======================================================
# 🕒 Stundenverteilung pro Monat
# - Zeigt in Histogrammen, wann (zu welcher Uhrzeit) im jeweiligen Monat die meisten Fahrten stattfanden
# =======================================================
In [25]:
plt.figure(figsize=(40,20))

for i, month in enumerate(df['Month'].unique()):
    plt.subplot(3,2,i+1)
    df[df['Month']==month]['Hour'].hist()
No description has been provided for this image
In [26]:
!pip install chart-studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
Requirement already satisfied: chart-studio in c:\users\miso\anaconda3\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (5.24.1)
Requirement already satisfied: requests in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (2.32.3)
Requirement already satisfied: retrying>=1.3.3 in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (1.4.1)
Requirement already satisfied: six in c:\users\miso\anaconda3\lib\site-packages (from chart-studio) (1.17.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\miso\anaconda3\lib\site-packages (from plotly->chart-studio) (9.0.0)
Requirement already satisfied: packaging in c:\users\miso\anaconda3\lib\site-packages (from plotly->chart-studio) (24.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\miso\anaconda3\lib\site-packages (from requests->chart-studio) (2025.7.14)
In [27]:
df.groupby('Month')['Hour'].count()
Out[27]:
Month
4     564516
5     652435
6     663844
7     796121
8     829275
9    1028136
Name: Hour, dtype: int64
In [28]:
# =======================================================
# 📈 Monatsverteilung der Fahrtenanzahl
# - Zeigt, wie viele Fahrten es pro Monat gibt
# - Interaktive Darstellung mit Plotly
# =======================================================
In [29]:
trace=go.Bar(
    x = df.groupby('Month')['Hour'].count().index,
    y = df.groupby('Month')['Hour'].count(),
    name = 'Priority'
)
In [30]:
iplot([trace])
In [31]:
# =======================================================
# 📅 Verteilung der Fahrten innerhalb eines Monats
# - Histogramm der Tagesanzahl pro Monat
# =======================================================
In [32]:
plt.figure(figsize=(10,8))
plt.hist(df['Day'],bins=30, rwidth=0.5) # adding range=(0.5,30.5)) erzeugt die gleiche wie unten
plt.xlabel('Date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by month day')
Out[32]:
Text(0.5, 1.0, 'Journeys by month day')
No description has been provided for this image
In [33]:
sns.displot(df['Day'])
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x1b7857d16a0>
No description has been provided for this image
In [34]:
# =======================================================
# 📅📦 Tagesverteilung pro Monat
# - Histogramm für jeden Monat: Welche Tage im Monat waren besonders fahrtenintensiv?
# =======================================================
In [35]:
plt.figure(figsize=(40,20))

for i, Month in enumerate (df['Month'].unique(),1):
    plt.subplot(3,2,i)
    df_out=df[df['Month']==Month]
    plt.hist(df_out['Day'])
    plt.xlabel('Days in Month{}'.format(Month))
    plt.ylabel('total_rides')
No description has been provided for this image
In [36]:
# =======================================================
# 📍 Zusammenhang von Uhrzeit und geographischer Breite
# - Scatterplot zur Untersuchung des Zusammenhangs zwischen Uhrzeit und Ort
# =======================================================
In [39]:
plt.figure(figsize=(12,6))
sns.pointplot(x='Hour', y='Lat', data=df, hue='Weekday')
#plt.set_title('Hours off day vs Latitude of Passenger')
Out[39]:
<Axes: xlabel='Hour', ylabel='Lat'>
No description has been provided for this image
In [ ]:
# =======================================================
# 📊 Fahrtenanzahl nach Monat und Basisstation
# - Gruppiert Fahrten nach Monat und Base-Code
# - Liniendiagramm zur Visualisierung
# =======================================================
In [31]:
base = df.groupby(['Base', 'Month'])['Date/Time'].count().reset_index()
base
Out[31]:
Base Month Date/Time
0 B02512 4 35536
1 B02512 5 36765
2 B02512 6 32509
3 B02512 7 35021
4 B02512 8 31472
5 B02512 9 34370
6 B02598 4 183263
7 B02598 5 260549
8 B02598 6 242975
9 B02598 7 245597
10 B02598 8 220129
11 B02598 9 240600
12 B02617 4 108001
13 B02617 5 122734
14 B02617 6 184460
15 B02617 7 310160
16 B02617 8 355803
17 B02617 9 377695
18 B02682 4 227808
19 B02682 5 222883
20 B02682 6 194926
21 B02682 7 196754
22 B02682 8 173280
23 B02682 9 197138
24 B02764 4 9908
25 B02764 5 9504
26 B02764 6 8974
27 B02764 7 8589
28 B02764 8 48591
29 B02764 9 178333
In [32]:
plt.figure(figsize=(10,6))
sns.lineplot(x='Month', y='Date/Time', hue='Base', data=base)
Out[32]:
<Axes: xlabel='Month', ylabel='Date/Time'>
No description has been provided for this image
In [ ]:
# =======================================================
# 🔥 Heatmap: Anzahl der Fahrten pro Wochentag und Stunde
# - Gruppiert Fahrten nach Tag und Stunde
# - Erstellt eine Heatmap
# =======================================================
In [33]:
def count_rows(rows):
    return len (rows)
In [34]:
by_cross = df.groupby(['Weekday', 'Hour']).apply(count_rows)
by_cross
C:\Users\Miso\AppData\Local\Temp\ipykernel_11320\3792439153.py:1: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[34]:
Weekday    Hour
Friday     0       13716
           1        8163
           2        5350
           3        6930
           4        8806
                   ...  
Wednesday  19      47017
           20      47772
           21      44553
           22      32868
           23      18146
Length: 168, dtype: int64
In [35]:
pivot = by_cross.unstack()
pivot
Out[35]:
Hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
Weekday
Friday 13716 8163 5350 6930 8806 13450 23412 32061 31509 25230 ... 36206 43673 48169 51961 54762 49595 43542 48323 49409 41260
Monday 6436 3737 2938 6232 9640 15032 23746 31159 29265 22197 ... 28157 32744 38770 42023 37000 34159 32849 28925 20158 11811
Saturday 27633 19189 12710 9542 6846 7084 8579 11014 14411 17669 ... 31418 38769 43512 42844 45883 41098 38714 43826 47951 43174
Sunday 32877 23015 15436 10597 6374 6169 6596 8728 12128 16401 ... 28151 31112 33038 31521 28291 25948 25076 23967 19566 12166
Thursday 9293 5290 3719 5637 8505 14169 27065 37038 35431 27812 ... 36699 44442 50560 56704 55825 51907 51990 51953 44194 27764
Tuesday 6237 3509 2571 4494 7548 14241 26872 36599 33934 25023 ... 34846 41338 48667 55500 50186 44789 44661 39913 27712 14869
Wednesday 7644 4324 3141 4855 7511 13794 26943 36495 33826 25635 ... 35148 43388 50684 55637 52732 47017 47772 44553 32868 18146

7 rows × 24 columns

In [36]:
plt.figure(figsize=(12,6))
sns.heatmap(pivot)
Out[36]:
<Axes: xlabel='Hour', ylabel='Weekday'>
No description has been provided for this image
In [37]:
def heatmap(col1, col2):
    by_cross = df.groupby([col1, col2]).apply(count_rows)
    pivot = by_cross.unstack()
    plt.figure(figsize=(12,6))
    return sns.heatmap(pivot)
In [38]:
heatmap('Day', 'Hour')
C:\Users\Miso\AppData\Local\Temp\ipykernel_11320\3216948876.py:2: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Out[38]:
<Axes: xlabel='Hour', ylabel='Day'>
No description has been provided for this image
In [ ]:
# =======================================================
# 🗺️ Darstellung aller Uber-Fahrten als Punktekarte
# - Sehr fein aufgelöstes Scatter-Plot der GPS-Koordinaten
# =======================================================
In [39]:
plt.figure(figsize=(12,6))
plt.plot(df['Lon'], df['Lat'], 'r+', ms=0.5)
plt.xlim(-74.2, -73.7)
plt.ylim(40.6,41)
Out[39]:
(40.6, 41.0)
No description has been provided for this image
In [ ]:
# =======================================================
# 🗺️ Heatmap für Fahrten an Sonntagen
# - Gruppiert nach GPS-Koordinaten
# - Nutzt Folium zur Erstellung einer interaktiven Karte
# =======================================================
In [45]:
df_out=df[df['Weekday']=='Sunday']
#df_out.shape
#df_out.head()
rush=df_out.groupby(['Lat','Lon'])['Weekday'].count().reset_index()
rush.columns=['Lat', 'Lon', 'Number of Trips']
rush
Out[45]:
Lat Lon Number of Trips
0 39.9374 -74.0722 1
1 39.9378 -74.0721 1
2 39.9384 -74.0742 1
3 39.9385 -74.0734 1
4 39.9415 -74.0736 1
... ... ... ...
209225 41.3141 -74.1249 1
209226 41.3180 -74.1298 1
209227 41.3195 -73.6905 1
209228 41.3197 -73.6903 1
209229 42.1166 -72.0666 1

209230 rows × 3 columns

In [48]:
!pip install folium
from folium.plugins import HeatMap
import folium
basemap=folium.Map()
Requirement already satisfied: folium in c:\users\miso\anaconda3\lib\site-packages (0.20.0)
Requirement already satisfied: branca>=0.6.0 in c:\users\miso\anaconda3\lib\site-packages (from folium) (0.8.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\miso\anaconda3\lib\site-packages (from folium) (3.1.6)
Requirement already satisfied: numpy in c:\users\miso\anaconda3\lib\site-packages (from folium) (2.1.3)
Requirement already satisfied: requests in c:\users\miso\anaconda3\lib\site-packages (from folium) (2.32.3)
Requirement already satisfied: xyzservices in c:\users\miso\anaconda3\lib\site-packages (from folium) (2022.9.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\miso\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (3.0.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\miso\anaconda3\lib\site-packages (from requests->folium) (2025.7.14)
In [49]:
HeatMap(rush,zoom=20,radius=15).add_to(basemap)
basemap
Out[49]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
# =======================================================
# 🗺️ Allgemeine Funktion zur Erstellung von Heatmaps je Wochentag
# - Erstellt interaktive Karten für beliebige Tage
# =======================================================
In [62]:
def plot(df, day):
    basemap=folium.Map()
    df_out=df[df['Weekday']==day]
    HeatMap(df_out.groupby(['Lat', 'Lon'])['Weekday'].count().reset_index(),zoom=20, radius=15).add_to(basemap)
    return basemap
In [63]:
plot(df, 'Saturday')
Out[63]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
uber_15=pd.read_csv(r'C:\Users\Miso\Desktop\Data Analysis 1\UberDataSet\uber-raw-data-janjune-15_sample.csv')