Credit card fraud detection

# End-to-end script: load the Kaggle credit card fraud dataset, explore it,
# split it, scale 'Time'/'Amount', rebalance the training set with SMOTE,
# fit a logistic regression model and evaluate it on the untouched test set.
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- 1. Load the Dataset ---
# The dataset can be downloaded from Kaggle:
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
try:
    data = pd.read_csv('creditcard.csv')
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found.")
    print("Please download the dataset from Kaggle and place it in the same directory as this script.")
    # Exit with a non-zero status so callers/shells can detect the failure;
    # sys.exit is used because the bare exit() helper is meant for the REPL.
    sys.exit(1)

# --- 2. Data Exploration ---
print("First 5 rows of the dataset:")
print(data.head())

print("\nDataset Information:")
data.info()

print("\nChecking for missing values:")
# Largest per-column null count: 0 means no column has missing values.
print(data.isnull().sum().max())

# The dataset is highly imbalanced. Let's visualize this.
print("\nClass Distribution:")
print(data['Class'].value_counts())

plt.figure(figsize=(7, 5))
sns.countplot(x='Class', data=data)
plt.title('Class Distribution (0: Non-Fraudulent, 1: Fraudulent)')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

# --- 3. Splitting the Data ---
# Separate features (X) and target (y).
X = data.drop('Class', axis=1)
y = data['Class']

# Split into training and testing sets BEFORE any fitted preprocessing.
# Using stratify=y ensures that the class proportion is the same in train
# and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 4. Scaling 'Time' and 'Amount' ---
# The 'Time' and 'Amount' columns are not scaled like the other 'V' columns
# (which are from PCA), so we standardize them.
# IMPORTANT: the scaler is fitted on the training rows only, so no statistics
# of the test set leak into preprocessing.
RAW_COLUMNS = ['Time', 'Amount']
SCALED_COLUMNS = ['Scaled_Time', 'Scaled_Amount']

scaler = StandardScaler()
scaler.fit(X_train[RAW_COLUMNS])


def scale_time_and_amount(features):
    """Return a copy of *features* with 'Time'/'Amount' replaced by scaled columns.

    The scaled columns ('Scaled_Time', 'Scaled_Amount') are placed first,
    followed by the remaining feature columns in their original order. The
    already-fitted module-level ``scaler`` is applied; it is never refitted
    here, so the same training-set statistics are used for every split.
    """
    scaled = pd.DataFrame(
        scaler.transform(features[RAW_COLUMNS]),
        columns=SCALED_COLUMNS,
        index=features.index,  # keep row alignment with the other columns
    )
    return pd.concat([scaled, features.drop(columns=RAW_COLUMNS)], axis=1)


X_train = scale_time_and_amount(X_train)
X_test = scale_time_and_amount(X_test)

print("\nDataset after scaling 'Amount' and 'Time':")
print(X_train.head())

# --- 5. Handling Class Imbalance with SMOTE ---
print("\nShape of training data before SMOTE:")
print(X_train.shape)
print("Distribution of training labels before SMOTE:")
print(y_train.value_counts())

# IMPORTANT: Apply SMOTE only on the training data to avoid data leakage.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nShape of training data after SMOTE:")
print(X_train_resampled.shape)
print("Distribution of training labels after SMOTE:")
print(y_train_resampled.value_counts())

# --- 6. Model Building and Training ---
# We will use Logistic Regression on the resampled data.
model = LogisticRegression(solver='liblinear')  # 'liblinear' is good for smaller datasets
model.fit(X_train_resampled, y_train_resampled)

# --- 7. Model Evaluation ---
# Make predictions on the original, non-resampled test set.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Confusion Matrix
print("\n--- Model Evaluation ---")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Non-Fraudulent', 'Fraudulent'],
    yticklabels=['Non-Fraudulent', 'Fraudulent'],
)
plt.title('Confusion Matrix')
plt.ylabel('Actual Class')
plt.xlabel('Predicted Class')
plt.show()

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Non-Fraudulent (0)', 'Fraudulent (1)']))

# AUC-ROC Score
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"Area Under the ROC Curve (AUC-ROC): {auc_score:.4f}")

# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'AUC = {auc_score:.4f}')
plt.plot([0, 1], [0, 1], color='darkred', linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

Key Features:

Interactive Dashboard: A modern web interface with tabbed navigation for different stages of the analysis

Data Upload: File upload functionality for CSV datasets with validation

Data Exploration: Visual analysis of class distribution, feature information, and preprocessing steps

Model Training: Detailed view of the training pipeline, SMOTE resampling, and model configuration

Model Evaluation: Comprehensive evaluation metrics with interactive charts including ROC curves and confusion matrix

Feature Highlights:

Real-time Progress Tracking: Shows processing steps with progress indicators

Interactive Visualizations: ROC curves, confusion matrices, and distribution charts

Performance Metrics: Accuracy, precision, recall, F1-score, and AUC-ROC

SMOTE Analysis: Before/after comparison of class distribution

Model Insights: Detailed explanations of strengths and areas for improvement

The application maintains all the functionality of your original Python script while providing a user-friendly interface for fraud detection analysis.

To configure the generation, complete these steps:

Setup Integrations

Key Features:

Interactive Dashboard: A modern web interface with tabbed navigation for different stages of the analysis

Data Upload: File upload functionality for CSV datasets with validation

Data Exploration: Visual analysis of class distribution, feature information, and preprocessing steps

Model Training: Detailed view of the training pipeline, SMOTE resampling, and model configuration

Model Evaluation: Comprehensive evaluation metrics with interactive charts including ROC curves and confusion matrix

Feature Highlights:

Real-time Progress Tracking: Shows processing steps with progress indicators

Interactive Visualizations: ROC curves, confusion matrices, and distribution charts

Performance Metrics: Accuracy, precision, recall, F1-score, and AUC-ROC

SMOTE Analysis: Before/after comparison of class distribution

Model Insights: Detailed explanations of strengths and areas for improvement

The application maintains all the functionality of your original Python script while providing a user-friendly interface for fraud detection analysis.

To configure the generation, complete these steps:

Setup Integrations

Key Features:

Technical Implementation:

Feature Highlights:

Key Features:

Technical Implementation:

Feature Highlights: