MultiLabel-PFN/data_preprocessing.py at main · FloAvis/MultiLabel-PFN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""File for Data preprocessing of the Stanford HIV database"""


# Setup Imports
import pandas as pd
import numpy as np
from scipy.io import arff


# Resistance score thresholds per drug, stored as [lower, upper].
# Keeping name and values side by side avoids two parallel lists drifting
# apart; the insertion order of this mapping fixes the row order below.
_THRESHOLDS_BY_DRUG = {
    "FPV": [3, 15],
    "ATV": [3, 15],
    "IDV": [3, 15],
    "LPV": [9, 55],
    "NFV": [3, 6],
    "SQV": [3, 15],
    "TPV": [2, 8],
    "DRV": [10, 90],
    "3TC": [5, 25],
    "ABC": [2, 6],
    "AZT": [3, 15],
    "D4T": [1.5, 3],
    "DDI": [1.5, 3],
    "TDF": [1.5, 3],
    "EFV": [3, 10],
    "NVP": [3, 10],
    "ETR": [3, 10],
    "RPV": [3, 10],
    "BIC": [2.5, 10],
    "DTG": [4, 13],
    "EVG": [2.5, 10],  # upper threshold guessed
    "RAL": [1.5, 10],  # upper threshold guessed
}

# Row names (drugs) and the threshold rows, in matching order
THRESHOLD_INDICES = list(_THRESHOLDS_BY_DRUG)
THRESHOLDS = list(_THRESHOLDS_BY_DRUG.values())

# Column names of the threshold table
THRESHOLD_COLUMNS = ["lower", "upper"]


def get_thresholds():
    """
        Build the lookup table of drug resistance score thresholds

        :return: DataFrame with one row per entry of THRESHOLD_INDICES (the drug names)
                 and the columns named in THRESHOLD_COLUMNS ('lower', 'upper'),
                 filled row by row from THRESHOLDS.
                 A new DataFrame is constructed on every call.
    """

    threshold_table = pd.DataFrame(
        data=THRESHOLDS,
        index=THRESHOLD_INDICES,
        columns=THRESHOLD_COLUMNS,
    )
    return threshold_table


def get_classes(df, drugs, mode="binary"):
    """
        Function to classify resistance scores for a given drug or list of drugs
        into binary or multiclass levels

        :param df: DataFrame containing resistance scores for various drugs
        :param drugs: Drug name, or list / tuple / pandas Index / numpy array of drug names
                      (each name must be a column of df and a row of the threshold table)
        :param mode: Classification mode, either "multiclass" or "binary". Default is "binary"
        :return: DataFrame of categorical classes for the given drugs, one column per drug,
                 carrying the same index as df so it stays aligned with it row by row
                 - In 'multiclass' mode: 0 = susceptible (score < lower),
                   1 = intermediate, 2 = resistant (score >= upper)
                 - In 'binary' mode: 0 = susceptible (score < lower), 1 = resistant
                 - Rows with a missing score are NaN in both modes
        :raises ValueError: if mode is neither "multiclass" nor "binary"
        :raises KeyError: if a drug is missing from df or from the threshold table
    """

    # Validate up front so an invalid mode is reported even when `drugs` is empty
    if mode not in ("multiclass", "binary"):
        raise ValueError("mode must be either 'multiclass' or 'binary'")

    # A single name is wrapped; any common sequence of names is used as given
    if isinstance(drugs, (list, tuple, pd.Index, np.ndarray)):
        drugs = list(drugs)
    else:
        drugs = [drugs]

    # The threshold table does not depend on the drug, so build it only once
    thresholds = get_thresholds()

    classes = {}

    for drug in drugs:

        lower, upper = thresholds.loc[drug, ["lower", "upper"]]
        scores = df[drug]

        # Comparisons against NaN are False, so the conditions never overlap
        conditions = [scores < lower]
        choices = [0]

        if mode == "multiclass":
            conditions.append(scores >= upper)
            choices.append(2)

        # Missing scores stay missing instead of falling into the default class
        conditions.append(scores.isna())
        choices.append(np.nan)

        # Everything not matched above is class 1
        classes[drug] = np.select(conditions, choices, default=1)

    # Reuse the input index: a fresh RangeIndex would no longer line up with
    # df (or features derived from it) once rows have been dropped
    return pd.DataFrame(classes, index=df.index)


def hq_hiv_loader(filename, drop_na=False, class_mode="binary"):
    """
        Load a high-quality HIV dataset from the Stanford HIV database and split it
        into input features and class labels

        :param filename: Filename of the tab-separated dataset to be processed
        :param drop_na: Whether to drop all rows containing NaNs in one or more targets. Default is False
        :param class_mode: Classification mode, either "multiclass" or "binary". Default is "binary"
        :return:
                 X (DataFrame): DataFrame of input features with shape (T, H),
                                where T is the number of examples and H is the number of features
                 Y (DataFrame): DataFrame of categorical classes with shape (T, L),
                                where T is the number of examples and L is the number of labels
                                - In 'multiclass' mode: 0 = susceptible, 1 = intermediate, 2 = resistant
                                - In 'binary' mode: 0 = susceptible, 1 = resistant
                 drugs (list[str]): List of label names in the dataset
    """

    # Read the tab-separated file; the first (index) and last (summary) columns are not used
    raw = pd.read_csv(filename, sep='\t')
    df = raw.iloc[:, 1:-1]

    # Every remaining column whose name does not start with "P" is treated as a drug (label)
    drugs = [column for column in df.columns if not column.startswith("P")]

    # Drugs with 10 or fewer labels present are removed from the data and the label list
    unusable_drugs = [drug for drug in drugs if df[drug].count() <= 10]
    if unusable_drugs:
        df = df.drop(columns=unusable_drugs)
        drugs = [drug for drug in drugs if drug not in unusable_drugs]

    # Optionally keep only rows where every remaining label is present
    if drop_na:
        df = df.dropna(subset=drugs)

    # Features are whatever is left after removing the label columns
    X = df.drop(columns=drugs)

    # Class labels are derived from the laboratory values
    Y = get_classes(df, drugs, mode=class_mode)

    return X, Y, drugs