Back to Projects
    Churn Prediction
    Survival Analysis
    XGBoost

    Customer Churn Prediction Model

    ML ensemble predicting customer churn with 89% accuracy using behavioral features and survival analysis techniques.

    Overview

    This project is a customer churn prediction system that combines gradient boosting with survival analysis. The model identifies customers at risk of churning and estimates their time-to-churn probabilities. It includes advanced feature engineering, model interpretability with SHAP, and automated retraining pipelines.

    Code Highlight

    Churn Prediction with Survival Analysis
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    import shap
    from lifelines import CoxPHFitter
    class ChurnPredictionModel:
        """Churn classifier (XGBoost) paired with a Cox time-to-churn model.

        ``fit`` trains an ``XGBClassifier`` on engineered numeric features, fits
        a ``CoxPHFitter`` on the same features plus duration/event columns, and
        builds a SHAP ``TreeExplainer`` for the classifier.
        ``predict_churn_risk`` then reports, per customer, a churn probability,
        a median time-to-churn estimate, a risk band and the three features with
        the largest absolute SHAP values.
        """

        # Edges and names of the churn-probability risk bands.
        RISK_BINS = [0, 0.3, 0.7, 1.0]
        RISK_LABELS = ['Low', 'Medium', 'High']
        # Quantile of support-ticket counts above which a user is flagged.
        HIGH_SUPPORT_QUANTILE = 0.9

        def __init__(self):
            """Create the unfitted classifier and survival model."""
            self.xgb_model = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
            )
            self.cox_model = CoxPHFitter()
            self.feature_columns = []
            self.explainer = None
            # Learned in fit() so training and scoring share one cut-off.
            self._support_ticket_threshold = None

        @staticmethod
        def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
            """Divide element-wise, yielding NaN instead of +/-inf for a zero denominator."""
            return numerator / denominator.where(denominator != 0)

        @classmethod
        def _categorize_risk(cls, churn_prob):
            """Bucket churn probabilities into the Low / Medium / High bands."""
            # include_lowest=True keeps a probability of exactly 0.0 in 'Low'
            # instead of falling outside the first (open-left) interval.
            return pd.cut(
                churn_prob,
                bins=cls.RISK_BINS,
                labels=cls.RISK_LABELS,
                include_lowest=True,
            )

        @staticmethod
        def _top_risk_factors(shap_values, feature_columns, k: int = 3) -> list:
            """Return, per row, the ``k`` features with the largest |SHAP| value.

            Names are comma-joined in ascending order of absolute contribution
            (the most influential feature comes last).
            """
            return [
                ', '.join(feature_columns[j] for j in np.argsort(np.abs(row))[-k:])
                for row in np.asarray(shap_values)
            ]

        def engineer_features(
            self,
            df: pd.DataFrame,
            reference_date: "pd.Timestamp | None" = None,
        ) -> pd.DataFrame:
            """Return a copy of ``df`` with derived churn features appended.

            Args:
                df: Raw customer data. Must contain every column read below
                    (``total_session_time``, ``session_count``,
                    ``last_login_date``, ``support_tickets``, ...).
                reference_date: Date used to compute ``days_since_last_login``.
                    Defaults to ``pd.Timestamp.now()``; pass a fixed value for
                    reproducible features.

            Returns:
                A new DataFrame; ``df`` itself is not modified. Ratio features
                are NaN wherever their denominator is 0.
            """
            features = df.copy()
            if reference_date is None:
                reference_date = pd.Timestamp.now()
            else:
                reference_date = pd.Timestamp(reference_date)
            ratio = self._safe_ratio

            # Behavioral features
            features['avg_session_duration'] = ratio(
                features['total_session_time'], features['session_count']
            )
            features['days_since_last_login'] = (
                reference_date - features['last_login_date']
            ).dt.days
            features['support_ticket_rate'] = ratio(
                features['support_tickets'], features['account_age_days']
            )

            # Engagement trends (last 30 days relative to the previous 30)
            features['login_trend_30d'] = ratio(
                features['logins_last_30d'], features['logins_prev_30d']
            )
            features['usage_trend_30d'] = ratio(
                features['usage_last_30d'], features['usage_prev_30d']
            )

            # Monetary features
            features['ltv_to_acquisition_cost'] = ratio(
                features['lifetime_value'], features['acquisition_cost']
            )
            features['payment_failure_rate'] = ratio(
                features['failed_payments'], features['total_payments']
            )

            # Risk indicators. Prefer the threshold learned in fit(); a quantile
            # taken from the scoring batch would shift with batch composition
            # (and can never flag a single-row batch).
            threshold = getattr(self, '_support_ticket_threshold', None)
            if threshold is None:
                threshold = features['support_tickets'].quantile(self.HIGH_SUPPORT_QUANTILE)
            features['high_support_user'] = (features['support_tickets'] > threshold).astype(int)
            features['price_sensitive'] = (features['discount_usage'] > 3).astype(int)
            return features

        def fit(
            self,
            X_train: pd.DataFrame,
            y_train: pd.Series,
            duration_train: pd.Series,
            reference_date: "pd.Timestamp | None" = None,
        ):
            """Train both the classification and the survival model.

            Args:
                X_train: Raw training features (see ``engineer_features``).
                y_train: Churn indicator; also used as the Cox event column.
                duration_train: Observed duration per customer, used as the Cox
                    duration column. Assigned by index, so it should share
                    ``X_train``'s index.
                reference_date: Forwarded to ``engineer_features``.

            NOTE(review): ratio features are NaN for zero denominators; the Cox
            fit presumably needs complete rows, so impute or drop such rows
            upstream -- confirm against the lifelines version in use.
            """
            # Learn the support-ticket cut-off from training data only.
            self._support_ticket_threshold = X_train['support_tickets'].quantile(
                self.HIGH_SUPPORT_QUANTILE
            )

            X_engineered = self.engineer_features(X_train, reference_date=reference_date)
            self.feature_columns = X_engineered.select_dtypes(include=[np.number]).columns.tolist()
            model_inputs = X_engineered[self.feature_columns]

            # Train XGBoost classifier
            self.xgb_model.fit(model_inputs, y_train)

            # Train Cox proportional hazards model for time-to-churn
            survival_data = model_inputs.copy()
            survival_data['duration'] = duration_train
            survival_data['event'] = y_train
            self.cox_model.fit(survival_data, duration_col='duration', event_col='event')

            # Initialize SHAP explainer
            self.explainer = shap.TreeExplainer(self.xgb_model)

        def predict_churn_risk(
            self,
            X: pd.DataFrame,
            reference_date: "pd.Timestamp | None" = None,
        ) -> pd.DataFrame:
            """Predict churn probability and time-to-churn for each row of ``X``.

            Args:
                X: Raw customer data with the same columns used in ``fit``.
                reference_date: Forwarded to ``engineer_features``.

            Returns:
                DataFrame indexed like ``X`` with columns ``customer_id``
                (``X.index``), ``churn_probability``,
                ``estimated_days_to_churn`` (median survival time from the Cox
                model), ``risk_category`` and ``top_risk_factors``.

            Raises:
                NotFittedError: if called before ``fit``.
            """
            if self.explainer is None:
                # Imported here so the guard adds no file-level dependency.
                from sklearn.exceptions import NotFittedError
                raise NotFittedError('Call fit() before predict_churn_risk().')

            X_engineered = self.engineer_features(X, reference_date=reference_date)
            X_features = X_engineered[self.feature_columns]

            # Churn probability
            churn_prob = self.xgb_model.predict_proba(X_features)[:, 1]

            # Median survival *time*: quantile() over the survival function would
            # take the median of the survival probabilities instead.
            # ravel() normalises the scalar / Series / frame shapes the call may return.
            median_survival_time = np.ravel(
                np.asarray(self.cox_model.predict_median(X_features), dtype=float)
            )

            # Per-prediction feature attributions
            shap_values = self.explainer.shap_values(X_features)

            results = pd.DataFrame(
                {
                    'customer_id': X.index,
                    'churn_probability': churn_prob,
                    'estimated_days_to_churn': median_survival_time,
                    'risk_category': self._categorize_risk(churn_prob),
                },
                index=X.index,
            )
            # Positional assignment: linear time and correct with repeated index values.
            results['top_risk_factors'] = self._top_risk_factors(
                shap_values, self.feature_columns
            )
            return results

    Key Results

    89% churn prediction accuracy
    Identified 25% of churners early
    Reduced churn rate by 15%
    ROI of 300% on retention campaigns

    Technologies Used

    Python
    XGBoost
    Scikit-learn
    SHAP
    Pandas
    Matplotlib
    Optuna

    Project Category

    data science

    Repository

    View on GitHub