Study | StudyLover

Program for Data Exploration and Wrangling

Download

Program for Data Ingestion with Pandas : Program for Data Visualization with Matplotlib

Programs

# main.py

# A demonstration of data ingestion, exploration, and wrangling using pandas.

# Before running, you may need to install pandas and openpyxl:

# pip install pandas openpyxl

import pandas as pd

import numpy as np

import os

print("--- Starting Data Ingestion and Wrangling Demonstration ---")

# --- Setup: Create dummy files for demonstration ---

CSV_FILENAME = "employee_data.csv"

EXCEL_FILENAME = "location_data.xlsx"

# Create a sample CSV file with missing data

try:

with open(CSV_FILENAME, 'w') as f:

f.write("EmployeeID,Name,Age,Department,Salary\n")

f.write("E01,Alice,34,HR,70000\n")

f.write("E02,Bob,45,Engineering,95000\n")

f.write("E03,Charlie,28,Sales,65000\n")

f.write("E04,David,,Engineering,80000\n") # Missing Age

f.write("E05,Eve,40,HR,\n") # Missing Salary

f.write("E06,Frank,52,Sales,120000\n")

f.write("E07,Grace,30,Marketing,68000\n")

print(f"\nSuccessfully created dummy file: '{CSV_FILENAME}'")

except IOError as e:

print(f"Error creating CSV file: {e}")

# Create a sample Excel file

try:

excel_data = {

'Department': ['HR', 'Engineering', 'Sales', 'Marketing'],

'Location': ['New York', 'London', 'Tokyo', 'Paris']

}

df_for_excel = pd.DataFrame(excel_data)

df_for_excel.to_excel(EXCEL_FILENAME, index=False, engine='openpyxl')

print(f"Successfully created dummy file: '{EXCEL_FILENAME}'")

except Exception as e:

print(f"Error creating Excel file: {e}")

# Main logic wrapped in a try...finally block to ensure cleanup

try:

# --- Section 1: Data Ingestion ---

print("\n--- 1. Data Ingestion ---")

df_employees = pd.DataFrame()

df_locations = pd.DataFrame()

# Read from CSV

try:

df_employees = pd.read_csv(CSV_FILENAME)

print("\nSuccessfully loaded data from CSV. Displaying the DataFrame:")

print(df_employees)

except FileNotFoundError:

print(f"Error: The file '{CSV_FILENAME}' was not found.")

# Read from Excel

try:

df_locations = pd.read_excel(EXCEL_FILENAME)

print("\nSuccessfully loaded data from Excel. Displaying the DataFrame:")

print(df_locations)

except FileNotFoundError:

print(f"Error: The file '{EXCEL_FILENAME}' was not found.")

# --- Section 2: Data Exploration ---

if not df_employees.empty:

print("\n--- 2. Initial Exploration of Employee Data ---")

print("\n[Checking for missing values (NaN)]")

# .isnull() returns a DataFrame of booleans; .sum() counts the True values per column.

print(df_employees.isnull().sum())

# --- Section 3: Data Cleaning - Handling Missing Values ---

if not df_employees.empty:

print("\n--- 3. Handling Missing Values ---")

# Strategy 1: Fill missing 'Age' with the mean age.

mean_age = df_employees['Age'].mean()

print(f"\nMean age is: {mean_age:.2f}. Filling missing 'Age' with this value.")

df_employees['Age'].fillna(mean_age, inplace=True) # inplace=True modifies the DataFrame directly

# Strategy 2: Fill missing 'Salary' with the median salary of the respective department.

print("Filling missing 'Salary' with the median salary of the employee's department.")

df_employees['Salary'].fillna(df_employees.groupby('Department')['Salary'].transform('median'), inplace=True)

print("\n[DataFrame after handling missing values]")

print(df_employees)

# --- Section 4: Data Wrangling - Binning and Merging ---

if not df_employees.empty:

print("\n--- 4. Data Wrangling: Binning and Merging ---")

# Binning: Transform numerical 'Age' into categorical 'AgeGroup'.

print("\n[Binning ages into groups]")

bins = [20, 30, 40, 50, 60]

labels = ['20-30', '31-40', '41-50', '51-60']

df_employees['AgeGroup'] = pd.cut(df_employees['Age'], bins=bins, labels=labels, right=True)

print(df_employees[['Name', 'Age', 'AgeGroup']].head())

# Merging: Combine the two DataFrames

if not df_locations.empty:

print("\n[Merging employee and location data]")

df_merged = pd.merge(df_employees, df_locations, on='Department', how='left')

print(df_merged)

finally:

# --- Clean up the created files ---

# This block will run whether an error occurred or not.

print("\n--- Cleaning up created files ---")

if os.path.exists(CSV_FILENAME):

os.remove(CSV_FILENAME)

print(f"Removed '{CSV_FILENAME}'")

if os.path.exists(EXCEL_FILENAME):

os.remove(EXCEL_FILENAME)

print(f"Removed '{EXCEL_FILENAME}'")

print("\n--- End of Demonstration ---")

Program for Data Ingestion with Pandas Program for Data Visualization with Matplotlib