StudyLover
  • Home
  • Study Zone
  • Profiles
  • Contact us
  • Sign in
StudyLover Program for Data Exploration and Wrangling
Download
  1. Python
  2. Pyhton MCA (Machine Learning using Python)
  3. Programs
Program for Data Ingestion with Pandas : Program for Data Visualization with Matplotlib
Programs

# main.py

# A demonstration of data ingestion, exploration, and wrangling using pandas.

#

# Before running, you may need to install pandas and openpyxl:

# pip install pandas openpyxl


import pandas as pd

import numpy as np

import os


print("--- Starting Data Ingestion and Wrangling Demonstration ---")


# --- Setup: Create dummy files for demonstration ---

CSV_FILENAME = "employee_data.csv"

EXCEL_FILENAME = "location_data.xlsx"


# Create a sample CSV file with missing data

try:

    with open(CSV_FILENAME, 'w') as f:

        f.write("EmployeeID,Name,Age,Department,Salary\n")

        f.write("E01,Alice,34,HR,70000\n")

        f.write("E02,Bob,45,Engineering,95000\n")

        f.write("E03,Charlie,28,Sales,65000\n")

        f.write("E04,David,,Engineering,80000\n") # Missing Age

        f.write("E05,Eve,40,HR,\n") # Missing Salary

        f.write("E06,Frank,52,Sales,120000\n")

        f.write("E07,Grace,30,Marketing,68000\n")

    print(f"\nSuccessfully created dummy file: '{CSV_FILENAME}'")

except IOError as e:

    print(f"Error creating CSV file: {e}")


# Create a sample Excel file

try:

    excel_data = {

        'Department': ['HR', 'Engineering', 'Sales', 'Marketing'],

        'Location': ['New York', 'London', 'Tokyo', 'Paris']

    }

    df_for_excel = pd.DataFrame(excel_data)

    df_for_excel.to_excel(EXCEL_FILENAME, index=False, engine='openpyxl')

    print(f"Successfully created dummy file: '{EXCEL_FILENAME}'")

except Exception as e:

    print(f"Error creating Excel file: {e}")



# Main logic wrapped in a try...finally block to ensure cleanup

try:

    # --- Section 1: Data Ingestion ---

    print("\n--- 1. Data Ingestion ---")

    df_employees = pd.DataFrame()

    df_locations = pd.DataFrame()

    

    # Read from CSV

    try:

        df_employees = pd.read_csv(CSV_FILENAME)

        print("\nSuccessfully loaded data from CSV. Displaying the DataFrame:")

        print(df_employees)

    except FileNotFoundError:

        print(f"Error: The file '{CSV_FILENAME}' was not found.")

    

    # Read from Excel

    try:

        df_locations = pd.read_excel(EXCEL_FILENAME)

        print("\nSuccessfully loaded data from Excel. Displaying the DataFrame:")

        print(df_locations)

    except FileNotFoundError:

        print(f"Error: The file '{EXCEL_FILENAME}' was not found.")


    # --- Section 2: Data Exploration ---

    if not df_employees.empty:

        print("\n--- 2. Initial Exploration of Employee Data ---")

        print("\n[Checking for missing values (NaN)]")

        # .isnull() returns a DataFrame of booleans; .sum() counts the True values per column.

        print(df_employees.isnull().sum())


    # --- Section 3: Data Cleaning - Handling Missing Values ---

    if not df_employees.empty:

        print("\n--- 3. Handling Missing Values ---")

        

        # Strategy 1: Fill missing 'Age' with the mean age.

        mean_age = df_employees['Age'].mean()

        print(f"\nMean age is: {mean_age:.2f}. Filling missing 'Age' with this value.")

        df_employees['Age'].fillna(mean_age, inplace=True) # inplace=True modifies the DataFrame directly

        

        # Strategy 2: Fill missing 'Salary' with the median salary of the respective department.

        print("Filling missing 'Salary' with the median salary of the employee's department.")

        df_employees['Salary'].fillna(df_employees.groupby('Department')['Salary'].transform('median'), inplace=True)

        

        print("\n[DataFrame after handling missing values]")

        print(df_employees)


    # --- Section 4: Data Wrangling - Binning and Merging ---

    if not df_employees.empty:

        print("\n--- 4. Data Wrangling: Binning and Merging ---")

        

        # Binning: Transform numerical 'Age' into categorical 'AgeGroup'.

        print("\n[Binning ages into groups]")

        bins = [20, 30, 40, 50, 60]

        labels = ['20-30', '31-40', '41-50', '51-60']

        df_employees['AgeGroup'] = pd.cut(df_employees['Age'], bins=bins, labels=labels, right=True)

        print(df_employees[['Name', 'Age', 'AgeGroup']].head())


        # Merging: Combine the two DataFrames

        if not df_locations.empty:

            print("\n[Merging employee and location data]")

            df_merged = pd.merge(df_employees, df_locations, on='Department', how='left')

            print(df_merged)


finally:

    # --- Clean up the created files ---

    # This block will run whether an error occurred or not.

    print("\n--- Cleaning up created files ---")

    if os.path.exists(CSV_FILENAME):

        os.remove(CSV_FILENAME)

        print(f"Removed '{CSV_FILENAME}'")

    if os.path.exists(EXCEL_FILENAME):

        os.remove(EXCEL_FILENAME)

        print(f"Removed '{EXCEL_FILENAME}'")


print("\n--- End of Demonstration ---")


Program for Data Ingestion with Pandas Program for Data Visualization with Matplotlib
Our Products & Services
  • Home
Connect with us
  • Contact us
  • +91 82955 87844
  • Rk6yadav@gmail.com

StudyLover - About us

The Best knowledge for Best people.

Copyright © StudyLover
Powered by Odoo - Create a free website