@@ -34,14 +34,14 @@ def _():
3434 "category" : np .random .choice (["A" , "B" , "C" , "D" ], size = n_rows ),
3535 "value" : np .random .rand (n_rows ) * 1000 ,
3636 }
37- df = pd .DataFrame (data )
38- df .head (10 )
39- return data , df , n_rows , np , pd
37+ pandas_df = pd .DataFrame (data )
38+ pandas_df .head (10 )
39+ return data , n_rows , np , pandas_df , pd
4040
4141
4242@app .cell
43- def _ (df ):
44- df .to_csv ("large_file.csv" , index = False )
43+ def _ (pandas_df ):
44+ pandas_df .to_csv ("large_file.csv" , index = False )
4545 return
4646
4747
@@ -61,11 +61,11 @@ def _(mo):
6161def _ (pd ):
6262 import time
6363
64- start = time .time ()
64+ start_read_pd = time .time ()
6565 df_pd = pd .read_csv ("large_file.csv" )
66- end = time .time ()
67- print (f"Pandas read_csv took { end - start :.2f} seconds" )
68- return df_pd , end , start , time
66+ end_read_pd = time .time ()
67+ print (f"Pandas read_csv took { end_read_pd - start_read_pd :.2f} seconds" )
68+ return df_pd , end_read_pd , start_read_pd , time
6969
7070
7171@app .cell (hide_code = True )
@@ -78,11 +78,11 @@ def _(mo):
7878def _ (time ):
7979 import polars as pl
8080
81- start_1 = time .time ()
82- df_pl = pl .read_csv ("large_file.csv" )
83- end_1 = time .time ()
84- print (f"Polars read_csv took { end_1 - start_1 :.2f} seconds" )
85- return df_pl , end_1 , pl , start_1
81+ start_read_pl = time .time ()
82+ polars_df = pl .read_csv ("large_file.csv" )
83+ end_read_pl = time .time ()
84+ print (f"Polars read_csv took { end_read_pl - start_read_pl :.2f} seconds" )
85+ return end_read_pl , pl , polars_df , start_read_pl
8686
8787
8888@app .cell (hide_code = True )
@@ -92,16 +92,16 @@ def _(mo):
9292
9393
9494@app .cell
95- def _ (df_pl , pl ):
96- lazy_df = df_pl .lazy ()
95+ def _ (pl , polars_df ):
96+ lazy_polars_df = polars_df .lazy ()
9797 result = (
98- lazy_df .filter (pl .col ("value" ) > 100 )
98+ lazy_polars_df .filter (pl .col ("value" ) > 100 )
9999 .group_by ("category" )
100100 .agg (pl .col ("value" ).mean ().alias ("avg_value" ))
101101 .collect ()
102102 )
103103 result .head (10 )
104- return lazy_df , result
104+ return lazy_polars_df , result
105105
106106
107107@app .cell (hide_code = True )
@@ -112,39 +112,109 @@ def _(mo):
112112
113113@app .cell
114114def _ (data , pd , pl ):
115- df_pd_1 = pd .DataFrame (data )
116- df_pl_1 = pl .DataFrame (data )
117- return df_pd_1 , df_pl_1
115+ pandas_groupby_df = pd .DataFrame (data )
116+ polars_groupby_df = pl .DataFrame (data )
117+ return pandas_groupby_df , polars_groupby_df
118118
119119
120120@app .cell (hide_code = True )
121121def _ (mo ):
122- mo .md (r"""### Pandas """ )
122+ mo .md (r"""### Groupby Mean """ )
123123 return
124124
125125
126126@app .cell
127- def _ (df_pd_1 , time ):
128- start_2 = time .time ()
129- df_pd_1 .groupby ("category" )["value" ].mean ()
130- end_2 = time .time ()
131- print (f"Pandas groupby took { end_2 - start_2 :.2f} seconds" )
132- return end_2 , start_2
127+ def _ (pandas_groupby_df , time ):
128+ start_groupby_pd = time .time ()
129+ pandas_groupby_df .groupby ("category" )["value" ].mean ()
130+ end_groupby_pd = time .time ()
131+ print (f"Pandas groupby took { end_groupby_pd - start_groupby_pd :.2f} seconds" )
132+ return end_groupby_pd , start_groupby_pd
133+
134+
135+ @app .cell
136+ def _ (pl , polars_groupby_df , time ):
137+ start_groupby_pl = time .time ()
138+ polars_groupby_df .group_by ("category" ).agg (pl .col ("value" ).mean ())
139+ end_groupby_pl = time .time ()
140+ print (f"Polars groupby took { end_groupby_pl - start_groupby_pl :.2f} seconds" )
141+ return end_groupby_pl , start_groupby_pl
133142
134143
135144@app .cell (hide_code = True )
136145def _ (mo ):
137- mo .md (r"""### Polars """ )
146+ mo .md (r"""### Filter Rows """ )
138147 return
139148
140149
141150@app .cell
142- def _ (df_pl_1 , pl , time ):
143- start_3 = time .time ()
144- df_pl_1 .group_by ("category" ).agg (pl .col ("value" ).mean ())
145- end_3 = time .time ()
146- print (f"Polars groupby took { end_3 - start_3 :.2f} seconds" )
147- return end_3 , start_3
151+ def _ (pandas_groupby_df , time ):
152+ start_filter_pd = time .time ()
153+ pandas_filtered_df = pandas_groupby_df [pandas_groupby_df ["value" ] > 500 ]
154+ end_filter_pd = time .time ()
155+ print (f"Pandas filter took { end_filter_pd - start_filter_pd :.2f} seconds" )
156+ return end_filter_pd , pandas_filtered_df , start_filter_pd
157+
158+
159+ @app .cell
160+ def _ (pl , polars_groupby_df , time ):
161+ start_filter_pl = time .time ()
162+ polars_filtered_df = polars_groupby_df .filter (pl .col ("value" ) > 500 )
163+ end_filter_pl = time .time ()
164+ print (f"Polars filter took { end_filter_pl - start_filter_pl :.2f} seconds" )
165+ return end_filter_pl , polars_filtered_df , start_filter_pl
166+
167+
168+ @app .cell (hide_code = True )
169+ def _ (mo ):
170+ mo .md (r"""### Sort by Column""" )
171+ return
172+
173+
174+ @app .cell
175+ def _ (pandas_groupby_df , time ):
176+ start_sort_pd = time .time ()
177+ pandas_sorted_df = pandas_groupby_df .sort_values ("value" )
178+ end_sort_pd = time .time ()
179+ print (f"Pandas sort took { end_sort_pd - start_sort_pd :.2f} seconds" )
180+ return end_sort_pd , pandas_sorted_df , start_sort_pd
181+
182+
183+ @app .cell
184+ def _ (polars_groupby_df , time ):
185+ start_sort_pl = time .time ()
186+ polars_sorted_df = polars_groupby_df .sort ("value" )
187+ end_sort_pl = time .time ()
188+ print (f"Polars sort took { end_sort_pl - start_sort_pl :.2f} seconds" )
189+ return end_sort_pl , polars_sorted_df , start_sort_pl
190+
191+
192+ @app .cell (hide_code = True )
193+ def _ (mo ):
194+ mo .md (r"""### Join on Key""" )
195+ return
196+
197+
198+ @app .cell
199+ def _ (pd , time ):
200+ pandas_df1 = pd .DataFrame ({"key" : range (5_000_000 ), "val1" : range (5_000_000 )})
201+ pandas_df2 = pd .DataFrame ({"key" : range (5_000_000 ), "val2" : range (5_000_000 )})
202+ start_join_pd = time .time ()
203+ pandas_joined_df = pd .merge (pandas_df1 , pandas_df2 , on = "key" )
204+ end_join_pd = time .time ()
205+ print (f"Pandas join took { end_join_pd - start_join_pd :.2f} seconds" )
206+ return end_join_pd , pandas_df1 , pandas_df2 , pandas_joined_df , start_join_pd
207+
208+
209+ @app .cell
210+ def _ (pl , time ):
211+ polars_df1 = pl .DataFrame ({"key" : range (5_000_000 ), "val1" : range (5_000_000 )})
212+ polars_df2 = pl .DataFrame ({"key" : range (5_000_000 ), "val2" : range (5_000_000 )})
213+ start_join_pl = time .time ()
214+ polars_joined_df = polars_df1 .join (polars_df2 , on = "key" , how = "inner" )
215+ end_join_pl = time .time ()
216+ print (f"Polars join took { end_join_pl - start_join_pl :.2f} seconds" )
217+ return end_join_pl , polars_df1 , polars_df2 , polars_joined_df , start_join_pl
148218
149219
150220@app .cell (hide_code = True )
@@ -160,15 +230,15 @@ def _(mo):
160230
161231
162232@app .cell
163- def _ (df_pd_1 ):
164- df_pd_1 [ df_pd_1 ["value" ] > 100 ]
165- return
233+ def _ (pandas_groupby_df ):
234+ pandas_filtered_rows_df = pandas_groupby_df [ pandas_groupby_df ["value" ] > 100 ]
235+ return ( pandas_filtered_rows_df ,)
166236
167237
168238@app .cell
169- def _ (df_pl_1 , pl ):
170- df_pl_1 .filter (pl .col ("value" ) > 100 )
171- return
239+ def _ (pl , polars_groupby_df ):
240+ polars_filtered_rows_df = polars_groupby_df .filter (pl .col ("value" ) > 100 )
241+ return ( polars_filtered_rows_df ,)
172242
173243
174244@app .cell (hide_code = True )
@@ -178,15 +248,15 @@ def _(mo):
178248
179249
180250@app .cell
181- def _ (df_pd_1 ):
182- df_pd_1 [["category" , "value" ]]
183- return
251+ def _ (pandas_groupby_df ):
252+ pandas_selected_columns_df = pandas_groupby_df [["category" , "value" ]]
253+ return ( pandas_selected_columns_df ,)
184254
185255
186256@app .cell
187- def _ (df_pl_1 ):
188- df_pl_1 .select (["category" , "value" ])
189- return
257+ def _ (polars_groupby_df ):
258+ polars_selected_columns_df = polars_groupby_df .select (["category" , "value" ])
259+ return ( polars_selected_columns_df ,)
190260
191261
192262@app .cell (hide_code = True )
@@ -196,20 +266,21 @@ def _(mo):
196266
197267
198268@app .cell
199- def _ (df_pd_1 ):
200- df_result = df_pd_1 [df_pd_1 ["value" ] > 1000 ]
201- df_result = df_result .groupby ("category" )["value" ].mean ().reset_index ()
202- return (df_result ,)
269+ def _ (pandas_groupby_df ):
270+ pandas_chained_operations_df = pandas_groupby_df [pandas_groupby_df ["value" ] > 1000 ]
271+ pandas_chained_operations_df = (
272+ pandas_chained_operations_df .groupby ("category" )["value" ].mean ().reset_index ()
273+ )
274+ return (pandas_chained_operations_df ,)
203275
204276
205277@app .cell
206- def _ (df_pl_1 , pl ):
207- df_result_1 = (
208- df_pl_1 .filter (pl .col ("value" ) > 1000 )
209- .group_by ("category" )
210- .agg (pl .col ("value" ).mean ().alias ("avg_value" ))
211- )
212- return (df_result_1 ,)
278+ def _ (pl , polars_groupby_df ):
279+ polars_chained_operations_df = polars_groupby_df .filter (pl .col ("value" ) > 1000 )
280+ polars_chained_operations_df = polars_chained_operations_df .group_by (
281+ "category"
282+ ).agg (pl .col ("value" ).mean ().alias ("avg_value" ))
283+ return (polars_chained_operations_df ,)
213284
214285
215286@app .cell (hide_code = True )
@@ -219,9 +290,18 @@ def _(mo):
219290
220291
221292@app .cell
222- def _ (df_pd_1 , df_pl_1 ):
223- print (df_pd_1 .memory_usage (deep = True ).sum () / 1000000.0 , "MB" )
224- print (df_pl_1 .estimated_size () / 1000000.0 , "MB" )
293+ def _ (pandas_groupby_df , polars_groupby_df ):
294+ print (
295+ f"Pandas DataFrame memory usage: { pandas_groupby_df .memory_usage (deep = True ).sum () / 1000000.0 :.2f} MB"
296+ )
297+ print (
298+ f"Polars DataFrame estimated size: { polars_groupby_df .estimated_size () / 1000000.0 } MB"
299+ )
300+ return
301+
302+
303+ @app .cell
304+ def _ ():
225305 return
226306
227307
0 commit comments