# main.py
# A demonstration of data ingestion, exploration, and wrangling using pandas.
#
# Before running, you may need to install pandas and openpyxl:
# pip install pandas openpyxl
import pandas as pd
import numpy as np
import os
print("--- Starting Data Ingestion and Wrangling Demonstration ---")
# --- Setup: Create dummy files for demonstration ---
CSV_FILENAME = "employee_data.csv"
EXCEL_FILENAME = "location_data.xlsx"
# Create a sample CSV file with missing data
try:
with open(CSV_FILENAME, 'w') as f:
f.write("EmployeeID,Name,Age,Department,Salary\n")
f.write("E01,Alice,34,HR,70000\n")
f.write("E02,Bob,45,Engineering,95000\n")
f.write("E03,Charlie,28,Sales,65000\n")
f.write("E04,David,,Engineering,80000\n") # Missing Age
f.write("E05,Eve,40,HR,\n") # Missing Salary
f.write("E06,Frank,52,Sales,120000\n")
f.write("E07,Grace,30,Marketing,68000\n")
print(f"\nSuccessfully created dummy file: '{CSV_FILENAME}'")
except IOError as e:
print(f"Error creating CSV file: {e}")
# Create a sample Excel file
try:
excel_data = {
'Department': ['HR', 'Engineering', 'Sales', 'Marketing'],
'Location': ['New York', 'London', 'Tokyo', 'Paris']
}
df_for_excel = pd.DataFrame(excel_data)
df_for_excel.to_excel(EXCEL_FILENAME, index=False, engine='openpyxl')
print(f"Successfully created dummy file: '{EXCEL_FILENAME}'")
except Exception as e:
print(f"Error creating Excel file: {e}")
# Main logic wrapped in a try...finally block to ensure cleanup
try:
# --- Section 1: Data Ingestion ---
print("\n--- 1. Data Ingestion ---")
df_employees = pd.DataFrame()
df_locations = pd.DataFrame()
# Read from CSV
try:
df_employees = pd.read_csv(CSV_FILENAME)
print("\nSuccessfully loaded data from CSV. Displaying the DataFrame:")
print(df_employees)
except FileNotFoundError:
print(f"Error: The file '{CSV_FILENAME}' was not found.")
# Read from Excel
try:
df_locations = pd.read_excel(EXCEL_FILENAME)
print("\nSuccessfully loaded data from Excel. Displaying the DataFrame:")
print(df_locations)
except FileNotFoundError:
print(f"Error: The file '{EXCEL_FILENAME}' was not found.")
# --- Section 2: Data Exploration ---
if not df_employees.empty:
print("\n--- 2. Initial Exploration of Employee Data ---")
print("\n[Checking for missing values (NaN)]")
# .isnull() returns a DataFrame of booleans; .sum() counts the True values per column.
print(df_employees.isnull().sum())
# --- Section 3: Data Cleaning - Handling Missing Values ---
if not df_employees.empty:
print("\n--- 3. Handling Missing Values ---")
# Strategy 1: Fill missing 'Age' with the mean age.
mean_age = df_employees['Age'].mean()
print(f"\nMean age is: {mean_age:.2f}. Filling missing 'Age' with this value.")
df_employees['Age'].fillna(mean_age, inplace=True) # inplace=True modifies the DataFrame directly
# Strategy 2: Fill missing 'Salary' with the median salary of the respective department.
print("Filling missing 'Salary' with the median salary of the employee's department.")
df_employees['Salary'].fillna(df_employees.groupby('Department')['Salary'].transform('median'), inplace=True)
print("\n[DataFrame after handling missing values]")
print(df_employees)
# --- Section 4: Data Wrangling - Binning and Merging ---
if not df_employees.empty:
print("\n--- 4. Data Wrangling: Binning and Merging ---")
# Binning: Transform numerical 'Age' into categorical 'AgeGroup'.
print("\n[Binning ages into groups]")
bins = [20, 30, 40, 50, 60]
labels = ['20-30', '31-40', '41-50', '51-60']
df_employees['AgeGroup'] = pd.cut(df_employees['Age'], bins=bins, labels=labels, right=True)
print(df_employees[['Name', 'Age', 'AgeGroup']].head())
# Merging: Combine the two DataFrames
if not df_locations.empty:
print("\n[Merging employee and location data]")
df_merged = pd.merge(df_employees, df_locations, on='Department', how='left')
print(df_merged)
finally:
# --- Clean up the created files ---
# This block will run whether an error occurred or not.
print("\n--- Cleaning up created files ---")
if os.path.exists(CSV_FILENAME):
os.remove(CSV_FILENAME)
print(f"Removed '{CSV_FILENAME}'")
if os.path.exists(EXCEL_FILENAME):
os.remove(EXCEL_FILENAME)
print(f"Removed '{EXCEL_FILENAME}'")
print("\n--- End of Demonstration ---")