@@ -143,7 +143,7 @@ df = pd.DataFrame({ # DataFrame is created from a dictionary
143143``` {python}
144144#| eval: true
145145#| output-location: fragment
146- df = pd.read_csv("employees.csv") # Reads the CSV file
146+ df = pd.read_csv("data/employees.csv") # Reads the CSV file
147147print(df)
148148```
149149
@@ -157,7 +157,7 @@ print(df)
157157``` {python}
158158#| eval: true
159159#| output-location: fragment
160- df = pd.read_csv("employees.csv")
160+ df = pd.read_csv("data/employees.csv")
161161print(df.tail())
162162```
163163
@@ -171,7 +171,7 @@ print(df.tail())
171171``` {python}
172172#| eval: true
173173#| output-location: fragment
174- df = pd.read_csv("employees.csv")
174+ df = pd.read_csv("data/employees.csv")
175175print(df.info())
176176```
177177
@@ -185,7 +185,7 @@ print(df.info())
185185``` {python}
186186#| eval: true
187187#| output-location: fragment
188- df = pd.read_csv("employees.csv")
188+ df = pd.read_csv("data/employees.csv")
189189print(df.describe())
190190```
191191
@@ -199,7 +199,7 @@ print(df.describe())
199199``` {python}
200200#| eval: true
201201#| output-location: fragment
202- df = pd.read_csv("employees.csv")
202+ df = pd.read_csv("data/employees.csv")
203203df_high_salary = df[df['Salary'] >= 67000]
204204print(df_high_salary)
205205print(df_high_salary.iloc[2]["Name"]) #Access the third row and the "Name" column
@@ -211,8 +211,7 @@ print(df_high_salary.loc[40]["Name"]) #Access the label 40 and the "Name" column
211211[ Task] {.task}: Complete the following task:
212212
213213``` {python}
214- #| eval: true
215- #| output-location: fragment
214+ #| eval: false
216215
217216# TODO: Load the employees.csv located in the git repository into a DataFrame
218217# First, filter the DataFrame for employees with a manager position
@@ -240,9 +239,25 @@ Note, that we can use the `mean()` method on the `Salary` column, as it is a num
240239``` {python}
241240#| eval: true
242241#| output-location: slide
243- df = pd.read_csv("employees.csv")
244- df = df.drop(columns=["Name", "Department"])
245- df.groupby(['Position']).mean() # Mean per position
242+ df = pd.read_csv("data/employees.csv")
243+ df.groupby(['Position']).sum() # Sum per position
244+ ```
245+
246+ ## Grouping Numeric Columns
247+
248+ - To prevent errors, we can [ select numeric columns first] {.highlight}
249+ - Afterwards, perform the operation on the ** selected columns**
250+ - Helps to avoid errors when grouping by non-numeric columns
251+ - Or ** drop columns** by ` df.drop(columns=["column"]) `
252+
253+ . . .
254+
255+ ``` {python}
256+ #| eval: true
257+ #| output-location: slide
258+ df = pd.read_csv("data/employees.csv")
259+ numeric_cols = df.select_dtypes(include=['number']).columns
260+ print(df.groupby("Position")[numeric_cols].sum())
246261```
247262
248263## Grouping by Multiple Columns
@@ -255,15 +270,15 @@ df.groupby(['Position']).mean() # Mean per position
255270``` {python}
256271#| eval: true
257272#| output-location: slide
258- df = pd.read_csv("employees.csv")
273+ df = pd.read_csv("data/employees.csv")
259274df = df.drop(columns=["Name"])
260275# Max per position and department
261276df.groupby(['Position', "Department"]).max()
262277```
263278
264279## Grouping with Aggregations
265280
266- - As seen, we can use aggregation functions:
281+ - We can use different aggregation functions:
267282 - ` sum() ` : sum of the values
268283 - ` mean() ` : mean of the values
269284 - ` max() ` : maximum of the values
@@ -277,17 +292,11 @@ df.groupby(['Position', "Department"]).max()
277292#| eval: false
278293# TODO: Load the employees.csv again into a DataFrame
279294# First, group by the "Position" column and count the employees per position
280- # Then, group by the "Department" column and calculate the sum of all other columns per department
281- df = pd.read_csv("employees.csv")
295+ # Then, group by the "Department" column and calculate the mean of all other columns per department
296+ df = pd.read_csv("data/employees.csv")
282297# Your code here
283298```
284299
285- . . .
286-
287- :::{.callout-note}
288- Do you notice any [ irregularities] {.highlight} while calculating the sum per department?
289- :::
290-
291300# [ Combining DataFrames] {.flow} {.title}
292301
293302## Concatenating DataFrames
@@ -342,21 +351,15 @@ print(df_merged)
342351[ Task] {.task}: Complete the following task:
343352
344353``` {python}
345- #| eval: true
346- #| output-location: fragment
347-
348- # Create two sample DataFrames
354+ #| eval: false
349355df1 = pd.DataFrame({
350356 "Name": ["John", "Alice", "Bob", "Carol"],
351357 "Department": ["Sales", "IT", "HR", "Sales"],
352- "Salary": [50000, 60000, 55000, 52000]
353- })
354-
358+ "Salary": [50000, 60000, 55000, 52000]})
355359df2 = pd.DataFrame({
356360 "Name": ["Alice", "Bob", "Dave", "Eve"],
357361 "Position": ["Developer", "Manager", "Analyst", "Developer"],
358- "Years": [5, 8, 3, 4]
359- })
362+ "Years": [5, 8, 3, 4]})
360363
361364# TODO: Merge the two DataFrames on the "Name" column
362365# Try different types of merges (inner, outer, left, right)
@@ -376,8 +379,8 @@ df2 = pd.DataFrame({
376379``` {python}
377380#| eval: true
378381import pandas as pd
379- df = pd.read_csv("employees.csv")
380- df.to_excel("employees.xlsx", index=False)
382+ df = pd.read_csv("data/employees.csv")
383+ df.to_excel("data/employees.xlsx", index=False)
381384```
382385
383386. . .
@@ -388,15 +391,23 @@ Note, that you likely need to install the `openpyxl` package to be able to write
388391
389392## Advanced Excel file handling
390393
394+ We can also [ specify the sheet name] {.highlight} when reading and writing
395+
391396``` {python}
392397#| eval: true
393- df = pd.read_excel("employees.xlsx")
394398
395399# Writes to the Employees sheet and does not include row indices
396- df.to_excel("employees.xlsx", sheet_name="Employees", index=False)
400+ df.to_excel("data/employees.xlsx", sheet_name="Employees", index=False)
401+ ```
402+
403+ . . .
397404
405+ ``` {python}
406+ #| eval: true
407+ #| output-location: fragment
398408# Reads from the Employees sheet
399- df = pd.read_excel("employees.xlsx", sheet_name="Employees")
409+ df = pd.read_excel("data/employees.xlsx", sheet_name="Employees")
410+ print(df.head())
400411```
401412
402413## Excel in Action
@@ -430,7 +441,7 @@ For example, the following DataFrame is in [wide format]{.highlight}:
430441``` {python}
431442#| eval: true
432443#| echo: false
433- df = pd.read_excel("temperatures.xlsx")
444+ df = pd.read_excel("data/temperatures.xlsx")
434445print(df)
435446```
436447
@@ -441,7 +452,7 @@ The melting process transforms it into the following [long format]{.highlight}:
441452``` {python}
442453#| eval: true
443454#| echo: false
444- df = pd.read_excel("temperatures.xlsx")
455+ df = pd.read_excel("data/temperatures.xlsx")
445456df = pd.melt(df, id_vars=['Date'], var_name='City', value_name='Temperature')
446457print(df)
447458```
@@ -459,7 +470,7 @@ print(df)
459470``` {python}
460471#| eval: true
461472#| output-location: slide
462- df = pd.read_csv("employees.csv")
473+ df = pd.read_csv("data/employees.csv")
463474df = pd.melt(df, id_vars=['Position'], var_name='Variables', value_name='Values')
464475print(df)
465476```
@@ -476,10 +487,10 @@ print(df)
476487# 0 2024-03-01 Hamburg 7.2
477488# 1 2024-03-01 Los_Angeles 18.5
478489# 2 2024-03-01 Tokyo 12.3
490+ # Then, print the maximum temperature per city by grouping by the "City" column
479491```
480492
481493
482-
483494# [ Programming with AI] {.flow} {.title}
484495
485496## Using AI to generate code
0 commit comments