DataLoader

Source code in LabeLMaker/Evaluate/data_loader.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class DataLoader:
    """Thin wrapper around a pandas DataFrame with chainable cleaning helpers.

    The wrapped frame is exposed as the ``df`` attribute.
    """

    def __init__(
        self, file: Optional[str] = None, dataframe: Optional[pd.DataFrame] = None
    ) -> None:
        """
        Builds a DataLoader from a CSV path or from a DataFrame already in memory.

        When both arguments are supplied, ``file`` wins and ``dataframe`` is
        ignored. A supplied DataFrame is stored as-is (no copy is made).

        Parameters:
            file (str, optional): Location of the CSV file to read.
            dataframe (pd.DataFrame, optional): DataFrame to wrap directly.

        Raises:
            ValueError: If both file and dataframe are None.
        """
        # Guard clause: reject the "nothing supplied" case up front.
        if file is None and dataframe is None:
            raise ValueError("Either 'file' or 'dataframe' must be provided.")
        self.df = dataframe if file is None else self.load_csv_file(file)

    def load_csv_file(self, file: str) -> pd.DataFrame:
        """
        Reads a CSV file with pandas, decoding it as UTF-8.

        Parameters:
            file (str): Location of the CSV file.

        Returns:
            pd.DataFrame: The parsed contents of the file.
        """
        loaded = pd.read_csv(file, encoding="utf-8")
        return loaded

    def preprocess_text_columns(self, columns: List[str]) -> "DataLoader":
        """
        Normalizes the given columns in place: trims surrounding whitespace
        and lowercases every value.

        Each column is cast with ``astype(str)`` first, so non-string values
        are converted to their string form before being cleaned.

        Parameters:
            columns (List[str]): Names of the columns to normalize.

        Returns:
            DataLoader: This instance, so calls can be chained.
        """
        for name in columns:
            as_text = self.df[name].astype(str)
            self.df[name] = as_text.str.strip().str.lower()
        return self

    def drop_duplicates(self, subset: List[str], keep: str = "first") -> "DataLoader":
        """
        Removes rows that are duplicated across the given columns and stores
        the result back on ``self.df``.

        Parameters:
            subset (List[str]): Columns compared when looking for duplicates.
            keep (str, optional): Forwarded to ``DataFrame.drop_duplicates``
                ('first', 'last', or False).

        Returns:
            DataLoader: This instance, so calls can be chained.
        """
        deduplicated = self.df.drop_duplicates(subset=subset, keep=keep)
        self.df = deduplicated
        return self

__init__(file=None, dataframe=None)

Initializes the DataLoader with a file path or an existing DataFrame.

Parameters:
  • file (str, default: None ) –

    Path to the CSV file to load.

  • dataframe (DataFrame, default: None ) –

    An existing DataFrame.

Raises:
  • ValueError

    If neither file nor dataframe is provided.

Source code in LabeLMaker/Evaluate/data_loader.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def __init__(
    self, file: Optional[str] = None, dataframe: Optional[pd.DataFrame] = None
) -> None:
    """
    Builds a DataLoader from a CSV path or from a DataFrame already in memory.

    When both arguments are supplied, ``file`` wins and ``dataframe`` is
    ignored. A supplied DataFrame is stored as-is (no copy is made).

    Parameters:
        file (str, optional): Location of the CSV file to read.
        dataframe (pd.DataFrame, optional): DataFrame to wrap directly.

    Raises:
        ValueError: If both file and dataframe are None.
    """
    # Guard clause: reject the "nothing supplied" case up front.
    if file is None and dataframe is None:
        raise ValueError("Either 'file' or 'dataframe' must be provided.")
    self.df = dataframe if file is None else self.load_csv_file(file)

drop_duplicates(subset, keep='first')

Drops duplicate rows based on specified columns.

Parameters:
  • subset (List[str]) –

    Columns to consider for identifying duplicates.

  • keep (str, default: 'first' ) –

    Which duplicates to keep ('first', 'last', or False).

Returns:
  • DataLoader –

    Returns self for method chaining.

Source code in LabeLMaker/Evaluate/data_loader.py
54
55
56
57
58
59
60
61
62
63
64
65
66
def drop_duplicates(self, subset: List[str], keep: str = "first") -> "DataLoader":
    """
    Removes rows that are duplicated across the given columns and stores
    the result back on ``self.df``.

    Parameters:
        subset (List[str]): Columns compared when looking for duplicates.
        keep (str, optional): Forwarded to ``DataFrame.drop_duplicates``
            ('first', 'last', or False).

    Returns:
        DataLoader: This instance, so calls can be chained.
    """
    deduplicated = self.df.drop_duplicates(subset=subset, keep=keep)
    self.df = deduplicated
    return self

load_csv_file(file)

Loads a CSV file into a pandas DataFrame.

Parameters:
  • file (str) –

    Path to the CSV file.

Returns:
  • DataFrame

    Loaded DataFrame.

Source code in LabeLMaker/Evaluate/data_loader.py
28
29
30
31
32
33
34
35
36
37
38
def load_csv_file(self, file: str) -> pd.DataFrame:
    """
    Reads a CSV file with pandas, decoding it as UTF-8.

    Parameters:
        file (str): Location of the CSV file.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    loaded = pd.read_csv(file, encoding="utf-8")
    return loaded

preprocess_text_columns(columns)

Preprocesses text columns by stripping whitespace and converting to lowercase.

Parameters:
  • columns (List[str]) –

    List of column names to preprocess.

Returns:
  • DataLoader –

    Returns self for method chaining.

Source code in LabeLMaker/Evaluate/data_loader.py
40
41
42
43
44
45
46
47
48
49
50
51
52
def preprocess_text_columns(self, columns: List[str]) -> "DataLoader":
    """
    Normalizes the given columns in place: trims surrounding whitespace
    and lowercases every value.

    Each column is cast with ``astype(str)`` first, so non-string values
    are converted to their string form before being cleaned.

    Parameters:
        columns (List[str]): Names of the columns to normalize.

    Returns:
        DataLoader: This instance, so calls can be chained.
    """
    for name in columns:
        as_text = self.df[name].astype(str)
        self.df[name] = as_text.str.strip().str.lower()
    return self