15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def train(args: Namespace, X, y, numeric_vars, trial=None, test_size=0.2, random_seed=42) -> Dict:
    """
    Fit a scorecard (binning process + logistic regression) and report
    performance for both the scorecard and the underlying estimator.

    Args:
        args (Namespace): hyperparameters for the binning process, the
            logistic regression, and the scorecard scaling.
        X: dataframe of features.
        y: target variable.
        numeric_vars: names of the numeric columns of X; every other
            column is treated as categorical.
        trial: forwarded to the evaluation helpers (when truthy, mlflow
            logging is skipped).
        test_size: fraction of the data held out for testing.
        random_seed: seed for the train/test split. Defaults to 42.

    Returns:
        Dict holding the args, the fitted logistic regression, the fitted
        scorecard model, and the performance metrics of both.
    """
    # Setup ####
    utils.set_seeds()
    # Everything not declared numeric is handled as categorical.
    cat_vars = [c for c in X.columns if c not in numeric_vars]
    # Full feature list (categorical + numeric) handed to the binning process.
    feature_names = X.columns.values
    # Variable-selection rule for BinningProcess, driven by information value.
    iv_criteria = {
        "iv": {"min": args.iv_min, "max": args.iv_max, "strategy": args.iv_strategy}
    }
    binning_process = BinningProcess(
        categorical_variables=cat_vars,
        variable_names=feature_names,
        selection_criteria=iv_criteria,
        special_codes=[args.special_code],
    )
    # Split ####
    X_train, X_test, y_train, y_test = data.get_data_splits(
        X, y, test_frac=test_size, seed=random_seed
    )
    # Bin the training data; the fitted process then transforms the test split
    # with the same metric settings so both sides are encoded identically.
    X_train_binned = binning_process.fit_transform(
        X_train,
        y_train,
        sample_weight=args.binning_sample_weight,
        metric=args.binning_metric,
        metric_special=args.binning_metric_special,
        metric_missing=args.binning_metric_missing,
        show_digits=args.binning_show_digits,
        check_input=args.binning_check_input,
    )
    X_test_binned = binning_process.transform(
        X_test,
        metric=args.binning_metric,
        metric_special=args.binning_metric_special,
        metric_missing=args.binning_metric_missing,
        show_digits=args.binning_show_digits,
        check_input=args.binning_check_input,
    )
    # Estimator that the scorecard wraps.
    lr_model = LogisticRegression(
        penalty=args.lr_penalty,
        C=args.C,
        l1_ratio=args.l1_ratio,
        solver=args.solver,
        max_iter=args.max_iter,
        n_jobs=-1,
    )
    sc_model = Scorecard(
        binning_process=binning_process,
        estimator=lr_model,
        scaling_method=args.scaling_method,
        rounding=args.rounding,
        scaling_method_params={"min": args.scaling_method_min, "max": args.scaling_method_max},
        reverse_scorecard=args.reverse_scorecard,
        intercept_based=args.intercept_based,
    )
    # NOTE(review): metric_special appears required whenever special values
    # exist; "empirical" uses WoE — confirm against optbinning docs.
    sc_model.fit(
        X_train,
        y_train,
        metric_missing=args.binning_metric_missing,
        metric_special=args.binning_metric_special,
    )
    # Evaluate ####
    scorecard_metrics = evaluate.get_scorecard_metrics(
        sc_model, X_train, X_test, y_train, y_test, trial
    )
    lr_metrics = evaluate.get_lr_metrics(
        lr_model=sc_model.estimator_,
        X_train_binned=X_train_binned,
        X_test_binned=X_test_binned,
        y_train=y_train,
        y_test=y_test,
        trial=trial,
    )
    performance = {"scorecard": scorecard_metrics, "lr": lr_metrics}
    print(json.dumps(performance, indent=2))
    return {
        "args": args,
        "lr_model": lr_model,
        "scorecard_model": sc_model,
        "performance": performance,
    }
|