DeepHit-PyTorch/import_data.py at main · nderus/DeepHit-PyTorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import pandas as pd
import torch

##### USER-DEFINED FUNCTIONS #####
def f_get_Normalization(X, norm_mode):
    """
    Normalize every feature (column) of the data matrix X.

    Parameters:
        X: 2-D numpy array with one row per sample and one column per feature.
           Floating-point arrays are normalized in place and returned; arrays of
           any other dtype (e.g. integers) are first copied to float so that the
           normalized values are not truncated on assignment.
        norm_mode: str, either 'standard' (zero mean, unit variance) or
                   'normal' (min-max normalization to [0, 1]).

    Returns:
        The normalized array (the same object as X when X is floating point).

    Raises:
        ValueError: if norm_mode is neither 'standard' nor 'normal'.

    Constant features (zero standard deviation / zero range) are only shifted,
    which maps them to all zeros instead of producing NaN from a 0/0 division.
    """
    num_Patient, num_Feature = X.shape

    if norm_mode not in ('standard', 'normal'):
        raise ValueError("Invalid normalization mode selected!")

    # Writing float results into an integer array would silently truncate them.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(float)

    if norm_mode == 'standard':  # Zero mean unit variance
        for j in range(num_Feature):
            column = X[:, j]
            mean = np.mean(column)
            std = np.std(column)
            if std != 0:
                X[:, j] = (column - mean) / std
            else:
                X[:, j] = column - mean
    else:  # Min-max normalization
        for j in range(num_Feature):
            column = X[:, j]
            low = np.min(column)
            span = np.max(column) - low
            if span != 0:
                X[:, j] = (column - low) / span
            else:
                # Constant feature: avoid 0/0 and map it to zeros.
                X[:, j] = column - low

    return X

### MASK FUNCTIONS ###
def f_get_fc_mask2(time, label, num_Event, num_Category):
    """
    Build a per-sample mask of shape [N, num_Event, num_Category].

    Parameters:
        time: array indexed as time[i, 0], the event/censoring time of sample i
              (converted to an integer time index with int()).
        label: array indexed as label[i, 0]; 0 means censored, a non-zero value k
               selects event row k - 1.
        num_Event: number of event rows in the mask.
        num_Category: number of time bins in the mask.

    Returns:
        numpy array of zeros and ones:
        - uncensored sample: a single 1 at [event - 1, event time];
        - censored sample: 1 for every event at all time bins strictly after
          the censoring time.

    Side effect: prints the first (up to two) per-sample masks for debugging.
    """
    num_samples = time.shape[0]
    mask = np.zeros([num_samples, num_Event, num_Category])

    for i in range(num_samples):
        if label[i, 0] != 0:  # If not censored
            mask[i, int(label[i, 0] - 1), int(time[i, 0])] = 1
        else:  # If censored
            mask[i, :, int(time[i, 0] + 1):] = 1  # Fill 1 after censoring time

    # Debugging: print some examples of the mask. Only rows that exist are
    # printed, so inputs with fewer than two samples do not raise IndexError.
    for i in range(min(2, num_samples)):
        print(f"Mask 1 [{i}]:\n{mask[i]}")

    return mask


def f_get_fc_mask3(time, meas_time, num_Category):
    '''
        Build a [N, num_Category] mask of zeros and ones (the original notes say
        it is needed for the pair-wise ranking loss).

        - meas_time is a non-empty numpy array (longitudinal measurements):
             row i has 1's from the bin after meas_time[i, 0] (last measurement,
             excluded) up to time[i, 0] (event/censoring time, included).
        - anything else, e.g. the scalar -1 (single measurement):
             row i has 1's from bin 0 up to time[i, 0] (included).
    '''
    num_rows = np.shape(time)[0]
    mask = np.zeros([num_rows, num_Category])

    # Decide once whether per-sample last-measurement times were supplied.
    longitudinal = isinstance(meas_time, np.ndarray) and np.shape(meas_time)[0] > 0

    for row in range(num_rows):
        # Slice start: just past the last measurement, or open-ended (from bin 0).
        first = int(meas_time[row, 0]) + 1 if longitudinal else None
        # Slice stop is exclusive, so +1 keeps the event/censoring bin itself.
        last = int(time[row, 0]) + 1
        mask[row, first:last] = 1

    return mask

### DATA IMPORT FUNCTIONS ###
def import_dataset_SYNTHETIC(norm_mode='standard'):
    """
    Read the synthetic competing-risks CSV and prepare features, targets and masks.

    norm_mode: str, forwarded to f_get_Normalization
               ('standard' = zero mean / unit variance, 'normal' = min-max).

    Returns: (DIM, DATA, MASK) where
        DIM  is the number of feature columns (a plain int),
        DATA is (data, time, label),
        MASK is (mask1, mask2) built by f_get_fc_mask2 / f_get_fc_mask3.
    """
    frame = pd.read_csv('./sample data/SYNTHETIC/synthetic_comprisk.csv', sep=',')

    # Double brackets keep each target as an (N, 1) column array.
    label = np.asarray(frame[['label']])
    time = np.asarray(frame[['time']])

    # Every column from position 4 onward is treated as a feature.
    features = f_get_Normalization(np.asarray(frame.iloc[:, 4:]), norm_mode)
    x_dim = features.shape[1]

    # 20% head-room beyond the largest observed time, to have enough time-horizon.
    num_Category = int(np.max(time) * 1.2)
    # Distinct label values minus one: censoring is not counted as an event.
    num_Event = int(len(np.unique(label)) - 1)

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)  # -1 selects the single-measurement branch

    return x_dim, (features, time, label), (mask1, mask2)


def import_dataset_METABRIC(norm_mode='standard'):
    """
    Read the METABRIC feature and label CSVs and prepare features, targets and masks.

    norm_mode: str, forwarded to f_get_Normalization
               ('standard' = zero mean / unit variance, 'normal' = min-max).

    Returns: (DIM, DATA, MASK) where
        DIM  is the number of feature columns (a plain int),
        DATA is (data, time, label),
        MASK is (mask1, mask2) built by f_get_fc_mask2 / f_get_fc_mask3.

    Side effect: prints several debugging lines (first times, dimensions, mask shapes).
    """
    feature_frame = pd.read_csv('./sample data/METABRIC/cleaned_features_final.csv', sep=',')
    target_frame = pd.read_csv('./sample data/METABRIC/label.csv', sep=',')

    # The whole feature file is used as the design matrix.
    features = f_get_Normalization(np.asarray(feature_frame), norm_mode)

    # Double brackets keep each target as an (N, 1) column array.
    time = np.asarray(target_frame[['event_time']])
    # Debugging: show the first few raw times
    print(f"Original Time (Before Rounding): {time[:5]}")

    label = np.asarray(target_frame[['label']])

    # 20% head-room beyond the largest observed time, to have enough time-horizon.
    num_Category = int(np.max(time) * 1.2)
    # Distinct label values minus one: censoring is not counted as an event.
    num_Event = int(len(np.unique(label)) - 1)

    print(f"num_Category: {num_Category}, num_Event: {num_Event}, x_dim: {features.shape[1]}")

    x_dim = features.shape[1]

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)  # -1 selects the single-measurement branch

    # Debugging: report the mask shapes
    print(f"Mask 1 Shape: {mask1.shape}")
    print(f"Mask 2 Shape: {mask2.shape}")

    return x_dim, (features, time, label), (mask1, mask2)