{
 "cells": [
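  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Propensity Model with MLflow Tracking\n",
    "This notebook generates a synthetic dataset (age, income, treatment), trains a simple logistic regression model to predict treatment propensity, and logs parameters, metrics, and artifacts using MLflow. The scored test set is also saved as a CSV and uploaded to S3 on a best-effort basis."
   ]
  },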
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tempfile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score\n",
    "import boto3\n",
    "import mlflow\n",
    "import mlflow.sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MLflow setup: every run of this cell is recorded under one named experiment.\n",
    "mlflow.set_experiment(\"Propensity_Model\")\n",
    "\n",
    "with mlflow.start_run():\n",
    "    # Parameters: named once here and logged so the run can be reproduced\n",
    "    # from its MLflow record alone.\n",
    "    solver = 'liblinear'\n",
    "    seed = 42\n",
    "    n_samples = 1000\n",
    "    test_size = 0.3\n",
    "    threshold = 0.5  # propensity cut-off that turns scores into class labels\n",
    "    mlflow.log_param(\"solver\", solver)\n",
    "    mlflow.log_params({\n",
    "        \"seed\": seed,\n",
    "        \"n_samples\": n_samples,\n",
    "        \"test_size\": test_size,\n",
    "        \"threshold\": threshold,\n",
    "    })\n",
    "\n",
    "    # Data generation: synthetic covariates, with treatment drawn from a\n",
    "    # Bernoulli whose probability is a logistic function of age and income.\n",
    "    # The order of the random draws is significant for reproducibility.\n",
    "    np.random.seed(seed)\n",
    "    age = np.random.normal(40, 10, n_samples)\n",
    "    income = np.random.normal(60000, 15000, n_samples)\n",
    "    beta0, beta1, beta2 = -15, 0.2, 0.0001\n",
    "    linear_combination = beta0 + beta1 * age + beta2 * income\n",
    "    probability = 1 / (1 + np.exp(-linear_combination))\n",
    "    treatment = np.random.binomial(1, probability)\n",
    "\n",
    "    data = pd.DataFrame({\n",
    "        'age': age,\n",
    "        'income': income,\n",
    "        'treatment': treatment\n",
    "    })\n",
    "\n",
    "    X = data[['age', 'income']]\n",
    "    y = data['treatment']\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=test_size, random_state=seed\n",
    "    )\n",
    "\n",
    "    # Model training: column 1 of predict_proba is P(treatment == 1).\n",
    "    model = LogisticRegression(solver=solver)\n",
    "    model.fit(X_train, y_train)\n",
    "    y_proba = model.predict_proba(X_test)[:, 1]\n",
    "    y_pred = (y_proba >= threshold).astype(int)\n",
    "\n",
    "    # Metrics: label-based scores use y_pred, the ROC/AUC uses raw scores.\n",
    "    acc = accuracy_score(y_test, y_pred)\n",
    "    prec = precision_score(y_test, y_pred)\n",
    "    rec = recall_score(y_test, y_pred)\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_proba)\n",
    "    roc_auc = auc(fpr, tpr)\n",
    "\n",
    "    mlflow.log_metric(\"accuracy\", acc)\n",
    "    mlflow.log_metric(\"precision\", prec)\n",
    "    mlflow.log_metric(\"recall\", rec)\n",
    "    mlflow.log_metric(\"auc\", roc_auc)\n",
    "\n",
    "    print(\"Confusion Matrix:\")\n",
    "    print(confusion_matrix(y_test, y_pred))\n",
    "    print(\"\\nClassification Report:\")\n",
    "    print(classification_report(y_test, y_pred))\n",
    "\n",
    "    # Artifacts are staged in the platform temp directory rather than a\n",
    "    # hard-coded /tmp, which does not exist on Windows.\n",
    "    artifact_dir = tempfile.gettempdir()\n",
    "\n",
    "    # ROC Curve: drawn on an explicit figure so it can be saved and closed\n",
    "    # without relying on pyplot's implicit current figure.\n",
    "    fig, ax = plt.subplots(figsize=(8, 6))\n",
    "    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')\n",
    "    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal\n",
    "    ax.set_xlabel('False Positive Rate')\n",
    "    ax.set_ylabel('True Positive Rate')\n",
    "    ax.set_title('Receiver Operating Characteristic (ROC)')\n",
    "    ax.legend(loc='lower right')\n",
    "    roc_path = os.path.join(artifact_dir, \"roc_curve.png\")\n",
    "    fig.savefig(roc_path)\n",
    "    plt.close(fig)\n",
    "    mlflow.log_artifact(roc_path)\n",
    "\n",
    "    # Save results: test rows with their true label and predicted propensity,\n",
    "    # highest propensity first.\n",
    "    results = X_test.copy()\n",
    "    results['Actual_Treatment'] = y_test.values\n",
    "    results['Propensity_Score'] = y_proba\n",
    "    results = results.sort_values(by='Propensity_Score', ascending=False)\n",
    "\n",
    "    local_path = os.path.join(artifact_dir, \"propensity_model_results.csv\")\n",
    "    results.to_csv(local_path, index=False)\n",
    "    print(f\"Saved local CSV: {local_path}\")\n",
    "    mlflow.log_artifact(local_path)\n",
    "\n",
    "    # Upload to S3 (best effort: a failure is reported but does not abort the\n",
    "    # run, so the model below is still logged).\n",
    "    bucket_name = 'YOUR_BUCKET'  # 🔁 Replace\n",
    "    output_path = 'PATH/TO/YOUR/FILE/propensity_results.csv'  # 🔁 Replace\n",
    "    try:\n",
    "        # Client construction may itself raise (e.g. a broken AWS profile or\n",
    "        # config), so it is kept inside the try together with the upload.\n",
    "        s3 = boto3.client('s3')\n",
    "        s3.upload_file(local_path, bucket_name, output_path)\n",
    "        print(f\"Uploaded to: s3://{bucket_name}/{output_path}\")\n",
    "    except Exception as e:\n",
    "        print(\"Failed to upload to S3:\", e)\n",
    "\n",
    "    # Optionally log model\n",
    "    mlflow.sklearn.log_model(model, \"logistic_model\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.x"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
