# Columns removed before modeling (per the 3/14 modeling meeting), together
# with the raw outcome columns that are only used to derive escalation flags.
_SCORECARD_DROP_COLUMNS = (
    "SurgicalSpecialty",
    "LastPACU_HR",
    "LastPACU_SBP",
    "LastPACU_RR",
    "LastPACU_PainScore",
    "LastPACU_Aldrete",
    "SchedRecAnesType",
    "Urine",
    "NaCl",
    "LR",
    "PreOpGlucose",
    "PreOpHematocrit",
    "PostOpGlucose",
    "MinPACUTemp",
    "MaxPACUTemp",
    "LastSpO2lte92",
    "LastSBPlte100",
    "LastSBPgte100",
    "LastPaingte5",
    "anes_duration",
    "procedure_duration",
    "LastO2Flowgt2",
    "LastHRgte90",
    "LastMAPlte60",
    "LastMAPgte85",
    "LastRRgte20",
    "CurrentSmoker",  # counterintuitive results
    "Total_BloodProducts",  # duplicate
    "ICU_AfterStepDown_NoOrderBeforePacuDepart_Days",  # part of response
    "MET_Team",
    "ICUafterPACU_Days",
    "ICU_Bed_Order",
    "StepDownUnitAfterGeneralCareTime_Days",
    "ICU_AfterStepDown_NoOrderBeforePacuDepart",
)


def _escalation_flags(frame: pd.DataFrame) -> pd.DataFrame:
    """Build the per-row MET / ICU / stepdown boolean flags from the raw columns."""
    met = frame["MET_Team"] == 1
    # ICU flag: (ICUafterPACU_Days == 1 with ICU_Bed_Order == 0), or
    # ICU_AfterStepDown_NoOrderBeforePacuDepart_Days == 1.
    icu_after_pacu = (frame["ICUafterPACU_Days"] == 1) & (frame["ICU_Bed_Order"] == 0)
    icu_after_stepdown = frame["ICU_AfterStepDown_NoOrderBeforePacuDepart_Days"] == 1
    # Stepdown flag: True wherever the stepdown column is not missing.
    stepdown = pd.notna(frame["StepDownUnitAfterGeneralCareTime_Days"])
    return pd.DataFrame(
        {
            "pt_idx": frame.index,
            "MET": met,
            "ICU": icu_after_pacu | icu_after_stepdown,
            "stepdown": stepdown,
        }
    )


def py_clean_data(data_path: Path) -> "tuple[pd.DataFrame, pd.Series]":
    """
    Load the scorecard CSV and split it into predictors and the response.

    Besides returning ``X`` and ``y``, this writes the MET / ICU / stepdown
    flags derived from the raw outcome columns to ``other_escalations.csv``
    under ``config.INTERMEDIATE_DATA`` and logs that it did so.

    Args:
        data_path (Path): Location of the scorecard CSV file to read.

    Returns:
        A tuple ``(X, y)``: ``X`` is the dataframe left after removing the
        columns in ``_SCORECARD_DROP_COLUMNS`` and ``"escalation"``; ``y`` is
        the ``"escalation"`` column as a Series.

    Raises:
        KeyError: If an expected column is missing from the CSV.
    """
    raw = pd.read_csv(data_path)

    # NOTE: the former scrub of numeric SurgicalSpecialty codes was removed.
    # That column is dropped below before anything reads it, so the scrub
    # could not affect X, y, or the saved CSV, yet its `.str` access would
    # raise if the column was not parsed as strings. Reinstate the scrub if
    # SurgicalSpecialty is ever kept as a predictor.

    # Derive the flags first: they need outcome columns that are dropped next.
    other_escalations = _escalation_flags(raw)

    # drop() treats a tuple as a single label, so hand it a list.
    modeling_frame = raw.drop(columns=list(_SCORECARD_DROP_COLUMNS))
    y = modeling_frame["escalation"]
    X = modeling_frame.drop(columns="escalation")

    other_escalations.to_csv(Path(config.INTERMEDIATE_DATA, "other_escalations.csv"))
    logger.info("Other escalation types saved to intermediate data path.")
    return X, y