-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_visualization.py
More file actions
201 lines (184 loc) · 7.54 KB
/
data_visualization.py
File metadata and controls
201 lines (184 loc) · 7.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# ================================================================
# 4. Data Visualization Module
# Create charts to explore patterns and extract insights
# ================================================================
import seaborn as sns
import matplotlib.pyplot as plt
# ----------------------------------------------------------------
# Distribution of Pickup Points
# ----------------------------------------------------------------
def plot_pickup_point_distribution(df):
    """
    Draw a bar chart with one bar per distinct 'Pickup point' value.

    Args:
        df: Data passed to seaborn; it must expose a 'Pickup point' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(data=df, x='Pickup point')
    axes.set_title('Distribution of Pickup Points')
    axes.set_xlabel('Pickup Point')
    axes.set_ylabel('Count')
    plt.show()
# ----------------------------------------------------------------
# Distribution of Trip Status
# ----------------------------------------------------------------
def plot_trip_status(df):
    """
    Draw a bar chart counting the rows for each distinct 'Status' value.

    Args:
        df: Data passed to seaborn; it must expose a 'Status' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(data=df, x='Status')
    axes.set(
        title='Distribution of Trip Status',
        xlabel='Status',
        ylabel='Count',
    )
    plt.show()
# ----------------------------------------------------------------
# Trip Status broken down by Pickup Point
# ----------------------------------------------------------------
def plot_status_by_pickup_point(df):
    """
    Draw a grouped bar chart of trip status split by pickup point.

    Each 'Status' value sits on the x axis and is subdivided by
    'Pickup point' through the ``hue`` argument. seaborn places the hue
    levels side by side by default; the bars are not stacked.

    Args:
        df: Data passed to seaborn; it must expose 'Status' and
            'Pickup point' columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(data=df, x='Status', hue='Pickup point')
    axes.set_title('Trip Status by Pickup Point')
    axes.set_xlabel('Trip Status')
    axes.set_ylabel('Count')
    plt.show()
# ----------------------------------------------------------------
# Daily Request Count by Day of the Week
# ----------------------------------------------------------------
def plot_daily_requests(df):
    """
    Draw a bar chart counting the rows for each 'Request day' value.

    No explicit category order is given, so the bar order is whatever
    seaborn derives from the column itself.

    Args:
        df: Data passed to seaborn; it must expose a 'Request day' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(data=df, x='Request day')
    axes.set(
        title='Number of Daily Requests by Day',
        xlabel='Day of the Week',
        ylabel='Number of Requests',
    )
    plt.show()
# ----------------------------------------------------------------
# Requests per Time Slot
# ----------------------------------------------------------------
def plot_time_slot_distribution(df):
    """
    Draw a bar chart counting the rows for each 'Time slot' value.

    Args:
        df: Data passed to seaborn; it must expose a 'Time slot' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # NOTE(review): seaborn 0.13+ deprecates `palette` without `hue`;
    # confirm the pinned seaborn version before changing this call.
    axes = sns.countplot(data=df, x='Time slot', palette='crest')
    axes.set_title('Requests per Time Slot')
    axes.set_xlabel('Time Slot')
    axes.set_ylabel('Number of Requests')
    plt.show()
# ----------------------------------------------------------------
# Requests per Hour
# ----------------------------------------------------------------
def plot_hourly_requests(df):
    """
    Draw a bar chart counting the rows for each 'Request hour' value.

    Args:
        df: Data passed to seaborn; it must expose a 'Request hour' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # NOTE(review): seaborn 0.13+ deprecates `palette` without `hue`;
    # confirm the pinned seaborn version before changing this call.
    axes = sns.countplot(data=df, x='Request hour', palette='crest')
    axes.set(
        title='Requests per Hour',
        xlabel='Hour of the Day',
        ylabel='Number of Requests',
    )
    plt.show()
# ----------------------------------------------------------------
# Heatmap of Requests by Hour and Day
# ----------------------------------------------------------------
def plot_request_heatmap(df):
    """
    Draw a heatmap of request counts by hour (rows) and day (columns).

    The non-null 'Request id' values are counted for every
    ('Request hour', 'Request day') pair, and each cell is coloured by
    that count.

    Args:
        df: pandas DataFrame with 'Request id', 'Request hour' and
            'Request day' columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']

    # fill_value=0: an hour/day pair without requests is a genuine zero.
    # Left as NaN, the heatmap would show it as a blank cell instead.
    pivot_table = df.pivot_table(
        values='Request id',
        index='Request hour',
        columns='Request day',
        aggfunc='count',
        fill_value=0,
    )

    # pivot_table sorts its column labels, so day names come out
    # alphabetically. Restore calendar order for recognised names and
    # keep any other labels at the end so no data is dropped.
    known_days = [day for day in weekday_order if day in pivot_table.columns]
    other_labels = [label for label in pivot_table.columns
                    if label not in weekday_order]
    pivot_table = pivot_table[known_days + other_labels]

    sns.heatmap(pivot_table, cmap='Spectral')
    plt.title('Requests by Hour and Day')
    plt.xlabel('Day of the Week')
    plt.ylabel('Hour of the Day')
    plt.show()
# ----------------------------------------------------------------
# Trip Duration Distribution per Hour (Boxplot)
# ----------------------------------------------------------------
def plot_trip_duration_boxplot(df):
    """
    Draw one box of 'Trip Duration Mins' per 'Request hour' value.

    Only rows whose duration is greater than zero are plotted. The y axis
    is fixed to the 0-100 range, so anything above 100 minutes falls
    outside the visible area.

    Args:
        df: pandas DataFrame with 'Trip Duration Mins' and 'Request hour'
            columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    has_positive_duration = df['Trip Duration Mins'] > 0
    trips = df[has_positive_duration]

    # NOTE(review): seaborn 0.13+ deprecates `palette` without `hue`;
    # confirm the pinned seaborn version before changing this call.
    axes = sns.boxplot(
        data=trips,
        x='Request hour',
        y='Trip Duration Mins',
        palette='rainbow',
    )
    axes.set_title('Trip Duration Distribution by Request Hour')
    axes.set_xlabel('Request Hour')
    axes.set_ylabel('Trip Duration (mins)')
    axes.set_ylim(0, 100)
    plt.show()
# ----------------------------------------------------------------
# Trip Duration vs. Request Hour (Scatter plot)
# ----------------------------------------------------------------
def plot_trip_duration_scatter(df):
    """
    Draw a scatter plot of 'Trip Duration Mins' against 'Request hour'.

    Only rows with a duration in the half-open range (0, 180] minutes are
    plotted; everything else is treated as an outlier and left out.

    Args:
        df: pandas DataFrame with 'Trip Duration Mins' and 'Request hour'
            columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    minutes = df['Trip Duration Mins']
    within_range = (minutes <= 180) & (minutes > 0)
    trips = df[within_range]

    axes = sns.scatterplot(data=trips, x='Request hour', y='Trip Duration Mins')
    axes.set_xlabel('Request Hour')
    axes.set_ylabel('Trip Duration (mins)')
    axes.set_title('Trip Duration vs. Request Hour')
    plt.show()
# ----------------------------------------------------------------
# Top 10 Drivers with Most Trips
# ----------------------------------------------------------------
def plot_top_drivers(df):
    """
    Draw a bar chart of the ten 'Driver id' values with the most rows.

    Rows whose 'Driver id' is not greater than zero are ignored. The
    remaining rows are counted per driver and the ten largest counts are
    plotted in descending order.

    Args:
        df: pandas DataFrame with a numeric 'Driver id' column.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # NOTE(review): presumably non-positive ids mark requests without an
    # assigned driver - confirm against the cleaning step.
    with_driver = df[df['Driver id'] > 0]
    trips_per_driver = with_driver.groupby('Driver id').size()
    top_drivers = trips_per_driver.sort_values(ascending=False).head(10)

    driver_ids = top_drivers.index
    # A float-typed id column would produce tick labels such as '84.0'.
    # Whole-number float ids are cast to int so the label reads '84'.
    if driver_ids.dtype.kind == 'f' and (driver_ids % 1 == 0).all():
        driver_ids = driver_ids.astype('int64')

    # NOTE(review): seaborn 0.13+ deprecates `palette` without `hue`;
    # confirm the pinned seaborn version before changing this call.
    sns.barplot(x=driver_ids.astype(str), y=top_drivers.values, palette='viridis')
    plt.title('Top 10 Drivers by Number of Trips')
    plt.xlabel('Driver ID')
    plt.ylabel('Number of Trips')
    plt.xticks(rotation=45)
    plt.show()
# ----------------------------------------------------------------
# Driver Availability by Day of Week
# ----------------------------------------------------------------
def plot_driver_availability_by_day(df):
    """
    Draw a grouped bar chart of driver availability split by request day.

    Each 'Driver Available' value sits on the x axis and is subdivided by
    'Request day' through the ``hue`` argument. seaborn places the hue
    levels side by side by default; the bars are not stacked.

    Args:
        df: Data passed to seaborn; it must expose 'Driver Available' and
            'Request day' columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(
        data=df,
        x='Driver Available',
        hue='Request day',
        palette='rainbow',
    )
    axes.set(
        title='Driver Availability by Day of Week',
        xlabel='Driver Available?',
        ylabel='Number of Requests',
    )
    plt.show()
# ----------------------------------------------------------------
# Driver Availability by Time Slot
# ----------------------------------------------------------------
def plot_driver_availability_by_slot(df):
    """
    Draw a grouped bar chart of driver availability split by time slot.

    Each 'Driver Available' value sits on the x axis and is subdivided by
    'Time slot' through the ``hue`` argument. seaborn places the hue
    levels side by side by default; the bars are not stacked.

    Args:
        df: Data passed to seaborn; it must expose 'Driver Available' and
            'Time slot' columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.countplot(
        data=df,
        x='Driver Available',
        hue='Time slot',
        palette='crest',
    )
    axes.set_title('Driver Availability by Time Slot')
    axes.set_xlabel('Driver Available?')
    axes.set_ylabel('Number of Requests')
    plt.show()
# ----------------------------------------------------------------
# Number of Requests Per Day of the Week
# ----------------------------------------------------------------
def plot_requests_per_weekday(df):
    """
    Draw a bar chart of request counts for Monday through Friday.

    The bars follow the fixed Monday-to-Friday order below. 'Request day'
    values outside that list are not drawn.

    Args:
        df: Data passed to seaborn; it must expose a 'Request day' column
            holding English day names.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
    axes = sns.countplot(data=df, x='Request day', order=weekdays)
    axes.set(
        title='Amount of Daily Requests per Week',
        xlabel='Week Day',
        ylabel='Total Requests',
    )
    plt.show()
# ----------------------------------------------------------------
# KMeans Clustering Plot (after clustering is done externally)
# ----------------------------------------------------------------
def plot_location_clusters(df):
    """
    Draw a scatter plot of locations coloured by their cluster label.

    'Longitude' is plotted on x and 'Latitude' on y; marker colour comes
    from the 'cluster' column. This function only plots: the 'cluster'
    column must already be present in ``df``.

    Args:
        df: Data passed to seaborn; it must expose 'Longitude',
            'Latitude' and 'cluster' columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    axes = sns.scatterplot(
        data=df,
        x='Longitude',
        y='Latitude',
        hue='cluster',
        palette='viridis',
        s=15,
    )
    axes.set_title('K-Means Clustering of Pickup Locations')
    axes.set_xlabel('Longitude')
    axes.set_ylabel('Latitude')
    axes.grid(True)
    plt.show()