Create automated data cleaning scripts

Automated data cleaning scripts help streamline the process of preparing raw data for analysis by identifying and handling inconsistencies, missing values, duplicates, and formatting issues. Below is a comprehensive Python-based script using pandas and numpy, designed to clean CSV datasets automatically. This script can be customized and extended as needed.

python
import pandas as pd
import numpy as np


def load_data(file_path):
    """Load a CSV file into a pandas DataFrame."""
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except Exception as e:
        print(f"Error loading file: {e}")
        return None


def standardize_column_names(df):
    """Standardize column names to lowercase and replace spaces with underscores."""
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    return df


def handle_missing_values(df, strategy='mean'):
    """Fill missing values in numeric columns using the given strategy: mean, median, or mode."""
    for column in df.columns:
        if df[column].isnull().sum() > 0:
            if df[column].dtype in ['float64', 'int64']:
                if strategy == 'mean':
                    df[column] = df[column].fillna(df[column].mean())
                elif strategy == 'median':
                    df[column] = df[column].fillna(df[column].median())
                else:
                    df[column] = df[column].fillna(df[column].mode()[0])
    return df


def remove_duplicates(df):
    """Remove duplicate rows."""
    before = df.shape[0]
    df = df.drop_duplicates()
    after = df.shape[0]
    print(f"Removed {before - after} duplicate rows.")
    return df


def correct_data_types(df):
    """Try to correct column data types automatically (numeric first, then datetime)."""
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            try:
                df[column] = pd.to_datetime(df[column])
            except (ValueError, TypeError):
                pass
    return df


def strip_whitespace(df):
    """Strip leading and trailing whitespace from string columns."""
    str_cols = df.select_dtypes(include=['object']).columns
    for col in str_cols:
        df[col] = df[col].str.strip()
    return df


def detect_outliers(df, z_thresh=3):
    """Remove rows with numerical outliers based on the z-score."""
    numeric_cols = df.select_dtypes(include=[np.number])
    z_scores = (numeric_cols - numeric_cols.mean()) / numeric_cols.std()
    mask = (np.abs(z_scores) < z_thresh).all(axis=1)
    before = df.shape[0]
    df = df[mask]
    after = df.shape[0]
    print(f"Removed {before - after} outlier rows.")
    return df


def save_clean_data(df, output_path):
    """Save the cleaned DataFrame to a new CSV file."""
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to: {output_path}")


def clean_data_pipeline(input_file, output_file, missing_strategy='mean'):
    """Full data cleaning pipeline."""
    df = load_data(input_file)
    if df is None:
        return
    df = standardize_column_names(df)
    df = strip_whitespace(df)
    df = remove_duplicates(df)
    df = correct_data_types(df)
    df = handle_missing_values(df, strategy=missing_strategy)
    df = detect_outliers(df)
    save_clean_data(df, output_file)


# Example usage
if __name__ == "__main__":
    input_path = "raw_data.csv"       # Replace with actual input file
    output_path = "cleaned_data.csv"  # Replace with desired output file
    clean_data_pipeline(input_path, output_path)
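As a quick usage sketch, the snippet below builds a small, deliberately messy CSV and runs the pipeline on it. The file names and sample values are made up for illustration, and the functions are assumed to be defined in the same module or session.

python
import pandas as pd

# Illustrative only: a tiny CSV with inconsistent headers, stray whitespace,
# a duplicate row, and a missing value.
sample = pd.DataFrame({
    "Customer Name ": ["  Alice", "Bob ", "Bob ", "Carol"],
    "Order Total": [10.5, None, None, 12.0],
})
sample.to_csv("messy_example.csv", index=False)

# Runs the full pipeline defined above and writes the cleaned file.
clean_data_pipeline("messy_example.csv", "cleaned_example.csv",
                    missing_strategy="median")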

Features of the Script:

  • Flexible Missing Value Handling: Choose to fill with mean, median, or mode.

  • Automatic Type Conversion: Attempts to infer correct data types (numeric, datetime).

  • Duplicate and Outlier Removal: Removes exact duplicates and statistical outliers using the z-score (a standalone sketch of the z-score rule follows this list).

  • Whitespace Trimming: Cleans up textual columns.

  • Column Name Normalization: Makes column headers consistent and machine-friendly.
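For intuition about the z-score rule used by detect_outliers, here is a minimal, self-contained sketch on made-up numbers; with the default threshold of 3, only the extreme value is dropped.

python
import numpy as np
import pandas as pd

# Made-up values: the 500 is far from the rest and exceeds the z-score threshold.
values = pd.DataFrame({"amount": [9, 10, 11, 10, 9, 11, 10, 10, 9, 11, 10, 500]})
z_scores = (values - values.mean()) / values.std()
keep = (np.abs(z_scores) < 3).all(axis=1)

print(values[keep])    # rows within 3 standard deviations
print(values[~keep])   # the single flagged outlier row (500)

Note that with very few rows, a single extreme value inflates the standard deviation enough that its own z-score may stay below the threshold, so this rule works best on reasonably sized datasets.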

Customization Ideas:

  • Add logging instead of print() statements (a combined logging and CLI sketch follows this list).

  • Add a GUI or CLI interface.

  • Handle JSON, Excel, or database inputs.

  • Add validation rules or custom exceptions.
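One possible way to combine the first two ideas is a thin wrapper that routes messages through the standard logging module and exposes clean_data_pipeline as a command-line tool. The module name clean_data and the wrapper file name are assumptions about how the script above is saved, not part of the original script.

python
# Hypothetical wrapper (e.g. clean_cli.py); assumes the script above is saved
# as clean_data.py so clean_data_pipeline can be imported from it.
import argparse
import logging

from clean_data import clean_data_pipeline

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(
        description="Run the automated data cleaning pipeline on a CSV file.")
    parser.add_argument("input_file", help="Path to the raw CSV file")
    parser.add_argument("output_file", help="Path for the cleaned CSV file")
    parser.add_argument("--missing-strategy", choices=["mean", "median", "mode"],
                        default="mean", help="How to fill missing numeric values")
    args = parser.parse_args()

    logger.info("Cleaning %s -> %s", args.input_file, args.output_file)
    clean_data_pipeline(args.input_file, args.output_file,
                        missing_strategy=args.missing_strategy)


if __name__ == "__main__":
    main()

It could then be invoked as, for example, python clean_cli.py raw_data.csv cleaned_data.csv --missing-strategy median. Replacing the print() calls inside the pipeline functions with logger calls would complete the switch to logging.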

The same script can be adapted for other file formats or integrated into a web API.
