-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlogistic_regression.py
More file actions
186 lines (149 loc) · 6.07 KB
/
logistic_regression.py
File metadata and controls
186 lines (149 loc) · 6.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Lean Six Sigma - Logistic Regression
This script performs logistic regression to estimate the minimum daily bonus
needed for a 75% probability of reaching the productivity target.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
def load_data(path='data/df_incentive.xlsx'):
    """Load incentive data from an Excel file.

    Args:
        path: Location of the Excel workbook. Defaults to the file this
            script has always read, so existing ``load_data()`` calls are
            unaffected.

    Returns:
        The DataFrame produced by ``pandas.read_excel``. The rest of this
        script reads the ``'Incentive'`` and ``'Target'`` columns from it.

    Raises:
        FileNotFoundError: If ``path`` does not exist. ``main`` catches this
            to fall back to generated sample data.
    """
    return pd.read_excel(path)
def create_sample_data(seed=42):
    """Create reproducible sample data for demonstration.

    For every incentive level from 1 to 20 a random number of observations
    (14 to 19) is generated. Each observation reaches the target with a
    probability that follows a logistic curve centred on an incentive of 15.

    Args:
        seed: Seed for the private random generator. The default reproduces
            the data set this function has always returned.

    Returns:
        A DataFrame with one row per observation and the columns
        ``'Incentive'`` (int, 1..20) and ``'Target'`` (0 or 1).
    """
    # A private RandomState yields the same draw sequence as seeding the
    # module-level generator, without reseeding NumPy's global RNG as a side
    # effect for every other caller in the process.
    rng = np.random.RandomState(seed)

    records = []
    for incentive in range(1, 21):
        # Higher incentive = higher probability of reaching the target.
        prob = 1 / (1 + np.exp(-(incentive - 15) / 3))
        n_samples = rng.randint(14, 20)
        # Draws are taken one at a time to keep the sequence identical to
        # the historical output.
        for _ in range(n_samples):
            target = 1 if rng.random_sample() < prob else 0
            records.append({'Incentive': incentive, 'Target': target})
    return pd.DataFrame(records)
def plot_boxplot(df):
    """Draw the incentive distribution per target outcome and save it as PNG.

    Reads the ``'Incentive'`` and ``'Target'`` columns of ``df``, writes
    ``boxplot_incentive.png`` to the current working directory and prints a
    confirmation line.
    """
    figure, axes = plt.subplots(figsize=(10, 7))
    df.boxplot(column=['Incentive'], by=['Target'], ax=axes)

    axes.set_xlabel('Target Reached (1: True, 0: False)')
    axes.set_ylabel('Incentive (Euros/Day)')
    axes.set_title('Incentive Distribution by Target Achievement')
    # Blank out the figure-level title that the grouped boxplot adds.
    figure.suptitle('')

    figure.tight_layout()
    figure.savefig('boxplot_incentive.png', dpi=150, bbox_inches='tight')
    plt.close(figure)
    print("Saved: boxplot_incentive.png")
def plot_logistic_regression(df):
    """Plot the fitted logistic curve of Target against Incentive as PNG.

    Reads the ``'Incentive'`` and ``'Target'`` columns of ``df``, writes
    ``logistic_regression.png`` to the current working directory and prints
    a confirmation line.
    """
    figure, axes = plt.subplots(figsize=(12, 6))
    sns.regplot(data=df, x='Incentive', y='Target', logistic=True, ax=axes)

    axes.set_xlabel('Productivity Incentive (Euros/Day)')
    axes.set_ylabel('Probability of meeting the productivity target')
    axes.set_title('Logistic Regression: Incentive vs Target Achievement')

    # Dashed reference line marking the 75% probability level.
    axes.axhline(
        y=0.75, color='r', linestyle='--', alpha=0.7, label='75% target'
    )
    axes.legend()

    figure.tight_layout()
    figure.savefig('logistic_regression.png', dpi=150, bbox_inches='tight')
    plt.close(figure)
    print("Saved: logistic_regression.png")
def perform_logistic_regression(df):
    """Fit Target ~ Incentive with logistic regression and summarise the fit.

    The data is split 70/30 into training and test sets (``random_state=0``),
    a ``LogisticRegression`` is fitted on the training part, and a Wald test
    is computed for the slope.

    Args:
        df: DataFrame with an ``'Incentive'`` feature column and a binary
            ``'Target'`` column.

    Returns:
        A dict with the keys ``'model'`` (the fitted estimator),
        ``'intercept'`` (b0), ``'coefficient'`` (b1), ``'p_value'``
        (two-sided Wald p-value of b1), ``'incentive_75'`` (incentive at
        which the fitted probability equals 0.75, or NaN when b1 is zero),
        ``'accuracy_train'`` and ``'accuracy_test'``.

    Raises:
        numpy.linalg.LinAlgError: If the Fisher information matrix is
            singular and cannot be inverted.
    """
    # Define X, y
    X = df[['Incentive']]
    y = df['Target']

    # Training/Test Sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    # Instantiate and fit model
    # NOTE: scikit-learn applies L2 regularisation by default, so the
    # coefficients are slightly shrunk and the Wald p-value below is an
    # approximation rather than an exact maximum-likelihood result.
    log_regression = LogisticRegression()
    log_regression.fit(X_train, y_train)

    b0 = log_regression.intercept_[0]
    b1 = log_regression.coef_[0][0]

    # The model fits an intercept, so the design matrix used for the
    # standard errors needs the matching column of ones; leaving it out
    # understates the slope's variance. It is built from the training rows
    # because those are the observations the coefficients were estimated on.
    design = np.column_stack([
        np.ones(len(X_train)),
        X_train.to_numpy(dtype=float),
    ])

    # Per-row weight p * (1 - p), written as 1 / (2 * (1 + cosh(z))) where
    # z is the linear predictor b0 + b1 * x.
    linear_predictor = log_regression.decision_function(X_train)
    weights = 1.0 / (2.0 * (1.0 + np.cosh(linear_predictor)))

    # Fisher Information Matrix: X^T W X
    fisher_information = design.T @ (design * weights[:, np.newaxis])
    # Inverse Information Matrix (Cramer-Rao bound on the covariance)
    covariance = np.linalg.inv(fisher_information)
    std_errors = np.sqrt(np.diagonal(covariance))

    # z-score and two-tailed p-value for the slope (index 0 is the intercept)
    z_slope = b1 / std_errors[1]
    p_value = 2 * stat.norm.sf(abs(z_slope))

    # Find incentive for 75% probability
    # P = 1 / (1 + exp(-(b0 + b1*x)))
    # 0.75 = 1 / (1 + exp(-(b0 + b1*x)))
    # Solving: x = (log(0.75/0.25) - b0) / b1
    if b1 != 0:
        incentive_75 = (np.log(0.75 / 0.25) - b0) / b1
    else:
        # A flat curve never crosses 0.75; report "undefined" instead of
        # dividing by zero.
        incentive_75 = float('nan')

    return {
        'model': log_regression,
        'intercept': b0,
        'coefficient': b1,
        'p_value': p_value,
        'incentive_75': incentive_75,
        'accuracy_train': log_regression.score(X_train, y_train),
        'accuracy_test': log_regression.score(X_test, y_test)
    }
def display_results(df, results):
    """Print the data summary, regression results and recommendation.

    Args:
        df: DataFrame with ``'Incentive'`` and ``'Target'`` columns.
        results: Mapping with the keys ``'intercept'``, ``'coefficient'``,
            ``'p_value'``, ``'accuracy_train'``, ``'accuracy_test'`` and
            ``'incentive_75'``.
    """
    rule = "=" * 60
    print(rule)
    print("LOGISTIC REGRESSION - INCENTIVE POLICY ANALYSIS")
    print(rule)

    reached_total = df['Target'].sum()
    print("\n--- DATA SUMMARY ---")
    print(f"Total Records: {len(df):,}")
    print(f"Target Reached: {reached_total}")
    print(f"Target Not Reached: {len(df) - reached_total}")
    print(f"Success Rate: {df['Target'].mean()*100:.1f}%")

    # Summary by incentive: group once and derive every column from it.
    by_incentive = df.groupby('Incentive')['Target']
    reached = by_incentive.sum()
    total = by_incentive.count()
    df_calc = pd.DataFrame({
        'Target': reached,
        'No Target': total - reached,
        'Total': total,
    })
    df_calc['Success Rate'] = (
        df_calc['Target'] / df_calc['Total'] * 100
    ).round(1)
    print("\n--- SUCCESS RATE BY INCENTIVE ---")
    print(df_calc.to_string())

    print("\n--- LOGISTIC REGRESSION RESULTS ---")
    print(f"Intercept (b0): {results['intercept']:.4f}")
    print(f"Coefficient (b1): {results['coefficient']:.4f}")
    print(f"p-value: {results['p_value']:.2e}")
    print(f"Training Accuracy: {results['accuracy_train']*100:.1f}%")
    print(f"Test Accuracy: {results['accuracy_test']*100:.1f}%")

    print("\n--- CONCLUSION ---")
    alpha = 0.05
    if results['p_value'] < alpha:
        print(f"The coefficient is statistically significant (p < {alpha}).")
        print("Incentive amount DOES affect target achievement probability.")
    else:
        print(f"The coefficient is NOT statistically significant (p >= {alpha}).")

    incentive_75 = results['incentive_75']
    print("\n--- RECOMMENDATION ---")
    print("To achieve 75% probability of meeting the target:")
    print(f"Minimum incentive needed: {incentive_75:.2f} Euros/Day")
    # int() cannot convert inf or NaN, which is what a zero slope produces,
    # so only round values that are finite and positive.
    if np.isfinite(incentive_75) and incentive_75 > 0:
        print(f"\nRounded up: {int(np.ceil(incentive_75))} Euros/Day")
def main():
    """Run the full analysis: load data, plot, fit the model, report."""
    # Prefer the Excel file; fall back to generated data when it is absent.
    try:
        df = load_data()
    except FileNotFoundError:
        print("Data file not found. Using sample data.")
        df = create_sample_data()
    else:
        print("Data loaded from file.")

    # Visualizations are written to disk before the numeric analysis runs.
    plot_boxplot(df)
    plot_logistic_regression(df)

    # Fit the model and print the report.
    display_results(df, perform_logistic_regression(df))


if __name__ == "__main__":
    main()