The Evaluator class calculates and displays evaluation metrics for classification tasks, including accuracy, precision, recall, F1 score, the confusion matrix, and a classification report.

Evaluator

This Python class Evaluator provides methods to calculate and display evaluation metrics for classification tasks, including precision, recall, F1 score, accuracy, confusion matrix, and classification report.

Source code in LabeLMaker/Evaluate/evaluator.py
class Evaluator:
    """
    This Python class `Evaluator` provides methods to calculate and display evaluation metrics for
    classification tasks, including precision, recall, F1 score, accuracy, confusion matrix, and
    classification report.
    """

    def __init__(self, y_true: List, y_pred: List) -> None:
        """
        Initializes the Evaluator with true and predicted labels.
        Parameters:
            y_true (List): Ground truth labels.
            y_pred (List): Predicted labels.
        """
        self.y_true = y_true
        self.y_pred = y_pred
        self.metrics: Dict[str, Any] = {}

    @staticmethod
    def _format_numeric(value: Any) -> Any:
        """
        Format a numeric value to 4 significant figures. If the value is not numeric,
        it is returned unchanged.
        """
        if isinstance(value, (int, float)):
            # Format to 4 significant figures, then convert back to float.
            return float(format(value, ".4g"))
        return value

    @classmethod
    def _format_dict(cls, d: Dict[Any, Any]) -> Dict[Any, Any]:
        """
        Recursively format all numeric entries in a dictionary to 4 significant figures.
        """
        formatted = {}
        for k, v in d.items():
            if isinstance(v, dict):
                # Recursively process nested dictionaries.
                formatted[k] = cls._format_dict(v)
            else:
                formatted[k] = cls._format_numeric(v)
        return formatted

    def calculate_metrics(self, average_options: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Calculates evaluation metrics.
        Parameters:
            average_options (List[str], optional): Averaging methods
                (e.g., ['macro', 'weighted']).
        Returns:
            Dict[str, Any]: Dictionary of calculated metrics.
        """
        if average_options is None:
            average_options = ["macro", "weighted"]

        # Calculate and format accuracy.
        self.metrics["Accuracy"] = self._format_numeric(accuracy_score(self.y_true, self.y_pred))

        # Calculate precision, recall, and f1 scores for each averaging option.
        for avg in average_options:
            self.metrics[f"Precision ({avg})"] = self._format_numeric(
                precision_score(self.y_true, self.y_pred, average=avg, zero_division=0)
            )
            self.metrics[f"Recall ({avg})"] = self._format_numeric(
                recall_score(self.y_true, self.y_pred, average=avg, zero_division=0)
            )
            self.metrics[f"F1 Score ({avg})"] = self._format_numeric(
                f1_score(self.y_true, self.y_pred, average=avg, zero_division=0)
            )

        # Compute and save the confusion matrix without formatting.
        self.metrics["Confusion Matrix"] = confusion_matrix(self.y_true, self.y_pred)

        # Compute and then recursively format the classification report.
        raw_report = classification_report(
            self.y_true, self.y_pred, output_dict=True, zero_division=0
        )
        self.metrics["Classification Report"] = self._format_dict(raw_report)

        return self.metrics

    def display_metrics(self) -> pd.DataFrame:
        """
        Returns calculated metrics as a DataFrame.
        """
        metrics_to_display = {
            k: v
            for k, v in self.metrics.items()
            if k not in ["Confusion Matrix", "Classification Report"]
        }
        df = pd.DataFrame(list(metrics_to_display.items()), columns=["Metric", "Value"])
        return df

    def plot_confusion_matrix(self, class_labels: Optional[List[str]] = None) -> plt.Figure:
        """
        Plots the confusion matrix.
        Parameters:
            class_labels (List[str], optional): Labels for the classes.
        Raises:
            ValueError: If confusion matrix is not calculated.
        """
        cm = self.metrics.get("Confusion Matrix")
        if cm is None:
            raise ValueError("Confusion Matrix not calculated. Call calculate_metrics() first.")
        if class_labels is None:
            class_labels = sorted(set(self.y_true) | set(self.y_pred))
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax,
        )
        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        plt.close(fig)  # Close the figure to prevent it from displaying automatically
        return fig
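
Example usage

A minimal sketch of how the class is typically used end to end. It assumes the module is importable as LabeLMaker.Evaluate.evaluator and that scikit-learn, pandas, matplotlib, and seaborn are installed; the labels below are purely illustrative.

from LabeLMaker.Evaluate.evaluator import Evaluator

# Hypothetical ground-truth and predicted labels for a three-class problem.
y_true = ["cat", "dog", "cat", "bird", "dog", "bird"]
y_pred = ["cat", "dog", "bird", "bird", "dog", "cat"]

evaluator = Evaluator(y_true, y_pred)
metrics = evaluator.calculate_metrics()    # populates evaluator.metrics
print(evaluator.display_metrics())         # scalar metrics as a pandas DataFrame
fig = evaluator.plot_confusion_matrix()    # returns a matplotlib Figure
fig.savefig("confusion_matrix.png")        # save it; the figure is closed before return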

__init__(y_true, y_pred)

Initializes the Evaluator with true and predicted labels.

Parameters:
    y_true (List): Ground truth labels.
    y_pred (List): Predicted labels.

Source code in LabeLMaker/Evaluate/evaluator.py
def __init__(self, y_true: List, y_pred: List) -> None:
    """
    Initializes the Evaluator with true and predicted labels.
    Parameters:
        y_true (List): Ground truth labels.
        y_pred (List): Predicted labels.
    """
    self.y_true = y_true
    self.y_pred = y_pred
    self.metrics: Dict[str, Any] = {}
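
A minimal construction sketch; labels may be strings or integers, and y_true and y_pred are expected to have the same length (the values below are hypothetical):

evaluator = Evaluator(
    y_true=["spam", "ham", "spam"],
    y_pred=["spam", "spam", "spam"],
)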

calculate_metrics(average_options=None)

Calculates evaluation metrics.

Parameters:
    average_options (List[str], optional): Averaging methods (e.g., ['macro', 'weighted']).

Returns:
    Dict[str, Any]: Dictionary of calculated metrics.

Source code in LabeLMaker/Evaluate/evaluator.py
def calculate_metrics(self, average_options: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Calculates evaluation metrics.
    Parameters:
        average_options (List[str], optional): Averaging methods
            (e.g., ['macro', 'weighted']).
    Returns:
        Dict[str, Any]: Dictionary of calculated metrics.
    """
    if average_options is None:
        average_options = ["macro", "weighted"]

    # Calculate and format accuracy.
    self.metrics["Accuracy"] = self._format_numeric(accuracy_score(self.y_true, self.y_pred))

    # Calculate precision, recall, and f1 scores for each averaging option.
    for avg in average_options:
        self.metrics[f"Precision ({avg})"] = self._format_numeric(
            precision_score(self.y_true, self.y_pred, average=avg, zero_division=0)
        )
        self.metrics[f"Recall ({avg})"] = self._format_numeric(
            recall_score(self.y_true, self.y_pred, average=avg, zero_division=0)
        )
        self.metrics[f"F1 Score ({avg})"] = self._format_numeric(
            f1_score(self.y_true, self.y_pred, average=avg, zero_division=0)
        )

    # Compute and save the confusion matrix without formatting.
    self.metrics["Confusion Matrix"] = confusion_matrix(self.y_true, self.y_pred)

    # Compute and then recursively format the classification report.
    raw_report = classification_report(
        self.y_true, self.y_pred, output_dict=True, zero_division=0
    )
    self.metrics["Classification Report"] = self._format_dict(raw_report)

    return self.metrics
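
A sketch of the keys produced for a single averaging option, assuming Evaluator is imported as in the class-level example and is constructed fresh here (labels are hypothetical):

evaluator = Evaluator(y_true=["a", "b", "a"], y_pred=["a", "a", "a"])
metrics = evaluator.calculate_metrics(average_options=["macro"])
print(sorted(metrics.keys()))
# Expected keys:
# ['Accuracy', 'Classification Report', 'Confusion Matrix',
#  'F1 Score (macro)', 'Precision (macro)', 'Recall (macro)']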

display_metrics()

Returns calculated metrics as a DataFrame.

Source code in LabeLMaker/Evaluate/evaluator.py
def display_metrics(self) -> pd.DataFrame:
    """
    Returns calculated metrics as a DataFrame.
    """
    metrics_to_display = {
        k: v
        for k, v in self.metrics.items()
        if k not in ["Confusion Matrix", "Classification Report"]
    }
    df = pd.DataFrame(list(metrics_to_display.items()), columns=["Metric", "Value"])
    return df
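
Because display_metrics() returns a plain pandas DataFrame, it can be printed or exported with the usual pandas calls once calculate_metrics() has been run, for example:

df = evaluator.display_metrics()
print(df.to_string(index=False))       # render as a plain text table
df.to_csv("metrics.csv", index=False)  # or persist to disk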

plot_confusion_matrix(class_labels=None)

Plots the confusion matrix.

Parameters:
    class_labels (List[str], optional): Labels for the classes.

Raises:
    ValueError: If confusion matrix is not calculated.

Source code in LabeLMaker/Evaluate/evaluator.py
def plot_confusion_matrix(self, class_labels: Optional[List[str]] = None) -> plt.Figure:
    """
    Plots the confusion matrix.
    Parameters:
        class_labels (List[str], optional): Labels for the classes.
    Raises:
        ValueError: If confusion matrix is not calculated.
    """
    cm = self.metrics.get("Confusion Matrix")
    if cm is None:
        raise ValueError("Confusion Matrix not calculated. Call calculate_metrics() first.")
    if class_labels is None:
        class_labels = sorted(set(self.y_true) | set(self.y_pred))
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.close(fig)  # Close the figure to prevent it from displaying automatically
    return fig
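
The returned figure is closed before it is handed back, so it will not display on its own; a typical pattern, after calculate_metrics() has been called, is to save it explicitly. Note that class_labels only relabels the axes: the matrix rows and columns follow scikit-learn's sorted label order, so any custom list should be given in that order. A hedged example with hypothetical labels:

fig = evaluator.plot_confusion_matrix(class_labels=["bird", "cat", "dog"])
fig.savefig("confusion_matrix.png", dpi=150, bbox_inches="tight")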