# Source: lss-logistic-regression/logistic_regression.py (GitHub repository samirsaci/lss-logistic-regression, branch main)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Lean Six Sigma - Logistic Regression

This script fits a logistic regression to estimate the minimum daily
incentive (bonus) needed for a 75% probability of meeting the productivity
target.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


def load_data():
    """Read the incentive dataset from ``data/df_incentive.xlsx``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` for that workbook.

    Any exception raised by ``pd.read_excel`` (for example when the file
    does not exist) is propagated to the caller unchanged.
    """
    return pd.read_excel('data/df_incentive.xlsx')


def create_sample_data():
    """Generate a reproducible synthetic incentive dataset.

    For every incentive amount from 1 to 20 a group of 14 to 19 observations
    is simulated. Each observation reaches the target with a probability
    given by a logistic curve centred on an incentive of 15.

    Returns:
        A DataFrame with one row per observation and the columns
        ``Incentive`` and ``Target`` (1 when the target was reached, else 0).

    Note:
        Seeds NumPy's global random generator with 42, so repeated calls
        return the same data.
    """
    np.random.seed(42)

    incentives = []
    targets = []
    for amount in range(1, 21):
        # Success probability rises with the incentive amount
        success_prob = 1 / (1 + np.exp(-(amount - 15) / 3))
        # Draw the group size first, then one uniform draw per observation,
        # so the sequence of random numbers consumed stays fixed.
        group_size = np.random.randint(14, 20)
        outcomes = [
            int(np.random.random() < success_prob) for _ in range(group_size)
        ]
        incentives.extend([amount] * len(outcomes))
        targets.extend(outcomes)

    return pd.DataFrame({'Incentive': incentives, 'Target': targets})


def plot_boxplot(df):
    """Draw a boxplot of ``Incentive`` grouped by ``Target`` and save it.

    Args:
        df: DataFrame providing the ``Incentive`` and ``Target`` columns.

    The figure is written to ``boxplot_incentive.png`` in the current
    working directory, then closed, and a confirmation line is printed.
    """
    figure, axes = plt.subplots(figsize=(10, 7))
    df.boxplot(column=['Incentive'], by=['Target'], ax=axes)

    axes.set_xlabel('Target Reached (1: True, 0: False)')
    axes.set_ylabel('Incentive (Euros/Day)')
    axes.set_title('Incentive Distribution by Target Achievement')
    # Blank out the figure-level heading that the grouped boxplot adds
    figure.suptitle('')

    figure.tight_layout()
    figure.savefig('boxplot_incentive.png', dpi=150, bbox_inches='tight')
    plt.close(figure)
    print("Saved: boxplot_incentive.png")


def plot_logistic_regression(df):
    """Plot a logistic fit of ``Target`` against ``Incentive`` and save it.

    Args:
        df: DataFrame providing the ``Incentive`` and ``Target`` columns.

    The chart shows seaborn's logistic regression plot plus a dashed red
    reference line at a probability of 0.75. It is written to
    ``logistic_regression.png`` in the current working directory, then the
    figure is closed and a confirmation line is printed.
    """
    figure, axes = plt.subplots(figsize=(12, 6))
    sns.regplot(data=df, x='Incentive', y='Target', logistic=True, ax=axes)

    axes.set_xlabel('Productivity Incentive (Euros/Day)')
    axes.set_ylabel('Probability of meeting the productivity target')
    axes.set_title('Logistic Regression: Incentive vs Target Achievement')

    # Reference line marking the 75% probability level
    axes.axhline(y=0.75, color='r', linestyle='--', alpha=0.7, label='75% target')
    axes.legend()

    figure.tight_layout()
    figure.savefig('logistic_regression.png', dpi=150, bbox_inches='tight')
    plt.close(figure)
    print("Saved: logistic_regression.png")


def perform_logistic_regression(df):
    """Fit a logistic regression of ``Target`` on ``Incentive``.

    The data is split 70/30 into training and test sets (fixed
    ``random_state=0``), a scikit-learn ``LogisticRegression`` is fitted on
    the training part, and a Wald test is run on the incentive coefficient.

    Args:
        df: DataFrame providing the ``Incentive`` and ``Target`` columns.

    Returns:
        dict with the keys:
            ``model``: the fitted ``LogisticRegression`` instance.
            ``intercept``: fitted intercept b0.
            ``coefficient``: fitted incentive coefficient b1.
            ``p_value``: two-tailed Wald p-value of b1.
            ``incentive_75``: incentive at which the predicted probability
                of meeting the target is 0.75 (``nan`` when b1 is zero).
            ``accuracy_train``: accuracy on the training set.
            ``accuracy_test``: accuracy on the test set.

    Raises:
        numpy.linalg.LinAlgError: if the Fisher information matrix is
            singular (for example when ``Incentive`` is constant).

    Note:
        ``LogisticRegression()`` applies L2 regularisation by default, so
        the coefficients are not exact maximum-likelihood estimates and the
        Wald p-value is an approximation.
    """
    # Define X, y
    X = df[['Incentive']]
    y = df['Target']

    # Training/Test Sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    # Instantiate and fit model
    log_regression = LogisticRegression()
    log_regression.fit(X_train, y_train)

    # Wald test, evaluated on the data the model was actually fitted on.
    # Observation weights of the Fisher information are p * (1 - p).
    prob_train = log_regression.predict_proba(X_train)[:, 1]
    weights = prob_train * (1.0 - prob_train)
    # The model fits an intercept, so the design matrix needs a column of
    # ones; leaving it out understates the standard error of the slope.
    design = np.column_stack(
        [np.ones(len(X_train)), X_train.to_numpy(dtype=float)]
    )
    # Fisher Information Matrix: X^T W X
    fisher_information = design.T @ (design * weights[:, np.newaxis])
    # Inverse Information Matrix (Cramer-Rao bound on the covariance)
    covariance = np.linalg.inv(fisher_information)
    std_errors = np.sqrt(np.diagonal(covariance))
    # z-score for each model coefficient (index 0 belongs to the intercept)
    z_scores = log_regression.coef_[0] / std_errors[1:]
    # two tailed test for p-values
    p_values = stat.norm.sf(np.abs(z_scores)) * 2

    # Find incentive for 75% probability
    # P = 1 / (1 + exp(-(b0 + b1*x)))
    # 0.75 = 1 / (1 + exp(-(b0 + b1*x)))
    # Solving: x = (log(0.75/0.25) - b0) / b1
    b0 = log_regression.intercept_[0]
    b1 = log_regression.coef_[0][0]
    if b1 == 0:
        # A flat model never crosses 0.75; report "undefined" rather than
        # an infinite value that would break downstream rounding.
        incentive_75 = np.nan
    else:
        incentive_75 = (np.log(0.75 / 0.25) - b0) / b1

    return {
        'model': log_regression,
        'intercept': b0,
        'coefficient': b1,
        'p_value': p_values[0],
        'incentive_75': incentive_75,
        'accuracy_train': log_regression.score(X_train, y_train),
        'accuracy_test': log_regression.score(X_test, y_test)
    }


def display_results(df, results):
    """Print a text report of the data and the regression outcome.

    Args:
        df: DataFrame providing the ``Incentive`` and ``Target`` columns.
        results: mapping with the keys ``intercept``, ``coefficient``,
            ``p_value``, ``accuracy_train``, ``accuracy_test`` and
            ``incentive_75``.

    The report contains a data summary, the success rate per incentive
    amount, the fitted parameters, a significance verdict at the 5% level
    and the incentive recommended for a 75% success probability.
    """
    banner = "=" * 60
    print(banner)
    print("LOGISTIC REGRESSION - INCENTIVE POLICY ANALYSIS")
    print(banner)

    n_records = len(df)
    n_reached = df['Target'].sum()
    print("\n--- DATA SUMMARY ---")
    print(f"Total Records: {n_records:,}")
    print(f"Target Reached: {n_reached}")
    print(f"Target Not Reached: {n_records - n_reached}")
    print(f"Success Rate: {df['Target'].mean()*100:.1f}%")

    # Per-incentive breakdown: successes, failures, totals and success rate
    per_incentive = df.groupby(['Incentive'])['Target']
    reached = per_incentive.sum()
    totals = per_incentive.count()
    df_calc = reached.to_frame(name='Target')
    df_calc['No Target'] = totals - reached
    df_calc['Total'] = totals
    df_calc['Success Rate'] = (reached / totals * 100).round(1)

    print("\n--- SUCCESS RATE BY INCENTIVE ---")
    print(df_calc.to_string())

    print("\n--- LOGISTIC REGRESSION RESULTS ---")
    print(f"Intercept (b0): {results['intercept']:.4f}")
    print(f"Coefficient (b1): {results['coefficient']:.4f}")
    print(f"p-value: {results['p_value']:.2e}")
    print(f"Training Accuracy: {results['accuracy_train']*100:.1f}%")
    print(f"Test Accuracy: {results['accuracy_test']*100:.1f}%")

    print("\n--- CONCLUSION ---")
    alpha = 0.05
    significant = results['p_value'] < alpha
    if significant:
        print(f"The coefficient is statistically significant (p < {alpha}).")
        print("Incentive amount DOES affect target achievement probability.")
    else:
        print(f"The coefficient is NOT statistically significant (p >= {alpha}).")

    threshold = results['incentive_75']
    print("\n--- RECOMMENDATION ---")
    print("To achieve 75% probability of meeting the target:")
    print(f"Minimum incentive needed: {threshold:.2f} Euros/Day")

    # A rounded figure is only shown for a positive threshold
    if threshold > 0:
        print(f"\nRounded up: {int(np.ceil(threshold))} Euros/Day")


def main():
    """Run the full analysis: load data, draw plots, fit and report.

    The Excel data set is used when it can be read; if it is missing
    (``FileNotFoundError``), synthetic sample data is generated instead.
    """
    # Prefer the real data set, fall back to generated sample data
    try:
        df = load_data()
    except FileNotFoundError:
        print("Data file not found. Using sample data.")
        df = create_sample_data()
    else:
        print("Data loaded from file.")

    # Save both charts to disk
    plot_boxplot(df)
    plot_logistic_regression(df)

    # Fit the model and print the report
    display_results(df, perform_logistic_regression(df))


# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()