diff --git a/requirements.txt b/requirements.txt index cdb95bd..65b8543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,9 @@ cycler==0.10.0 docx2txt==0.8 joblib==0.14.1 kiwisolver==1.2.0 -matplotlib==3.1.0 -numpy==1.18.4 -pandas==1.0.3 +matplotlib +numpy +pandas pdfminer==20191125 Pillow==9.0.1 pycryptodome==3.9.7 @@ -12,8 +12,8 @@ pyEDFlib==0.1.17 pyparsing==2.4.7 python-dateutil==2.8.1 pytz==2020.1 -scikit-learn==0.22.2.post1 -scipy==1.4.1 +scikit-learn +scipy seaborn==0.10.1 six==1.14.0 -sklearn==0.0 +sklearn diff --git a/source/analysis/analysis_runner.py b/source/analysis/analysis_runner.py index 9f34693..b9b047a 100644 --- a/source/analysis/analysis_runner.py +++ b/source/analysis/analysis_runner.py @@ -64,7 +64,7 @@ def figures_mc_sleep_wake(): classifiers = utils.get_classifiers() feature_sets = utils.get_base_feature_sets() - trial_count = 20 + trial_count = 21 for attributed_classifier in classifiers: if Constants.VERBOSE: @@ -181,9 +181,113 @@ def figures_compare_time_based_features(): CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_time_only_sw_roc') +def figures_train_health_test_disordered(): + classifiers = utils.get_classifiers() + + feature_sets = utils.get_base_feature_sets() + + for attributed_classifier in classifiers: + if Constants.VERBOSE: + print('Running ' + attributed_classifier.name + '...') + classifier_summary = SleepWakeClassifierSummaryBuilder.build_train_healthy_test_disordered(attributed_classifier, feature_sets) + + CurvePlotBuilder.make_roc_sw(classifier_summary) + CurvePlotBuilder.make_pr_sw(classifier_summary) + TableBuilder.print_table_sw(classifier_summary) + + CurvePlotBuilder.combine_plots_as_grid(classifiers, 1, '_sw_pr') + CurvePlotBuilder.combine_plots_as_grid(classifiers, 1, '_sw_roc') + + +def figures_mc_sleep_wake_disordered(): + classifiers = utils.get_classifiers() + + feature_sets = utils.get_base_feature_sets() + trial_count = 20 + + for attributed_classifier in classifiers: + if 
Constants.VERBOSE: + print('Running ' + attributed_classifier.name + '...') + classifier_summary = \ + SleepWakeClassifierSummaryBuilder.build_monte_carlo_disordered( + attributed_classifier, feature_sets, trial_count) + + CurvePlotBuilder.make_roc_sw(classifier_summary) + CurvePlotBuilder.make_pr_sw(classifier_summary) + TableBuilder.print_table_sw(classifier_summary) + + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_pr') + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_roc') + +def figures_mc_train_apnea_test_narcolepsy(): + classifiers = utils.get_classifiers() + + feature_sets = utils.get_base_feature_sets() + + for attributed_classifier in classifiers: + if Constants.VERBOSE: + print('Running ' + attributed_classifier.name + '...') + classifier_summary = \ + SleepWakeClassifierSummaryBuilder.build_train_apnea_test_narcolepsy( + attributed_classifier, feature_sets) + + CurvePlotBuilder.make_roc_sw(classifier_summary) + CurvePlotBuilder.make_pr_sw(classifier_summary) + TableBuilder.print_table_sw(classifier_summary) + + CurvePlotBuilder.combine_plots_as_grid(classifiers, 1, '_sw_pr') + CurvePlotBuilder.combine_plots_as_grid(classifiers, 1, '_sw_roc') + +def figures_mc_sleep_wake_disordered_apnea_only(): + classifiers = utils.get_classifiers() + + feature_sets = utils.get_base_feature_sets() + trial_count = 22 + + for attributed_classifier in classifiers: + if Constants.VERBOSE: + print('Running ' + attributed_classifier.name + '...') + classifier_summary = \ + SleepWakeClassifierSummaryBuilder.build_monte_carlo_apnea_only( + attributed_classifier, feature_sets, trial_count) + + CurvePlotBuilder.make_roc_sw(classifier_summary) + CurvePlotBuilder.make_pr_sw(classifier_summary) + TableBuilder.print_table_sw(classifier_summary) + + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_pr') + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_roc') + +def 
figures_mc_disordered_and_healthy_together(): + classifiers = utils.get_classifiers() + feature_sets = utils.get_base_feature_sets() + trial_count = 23 + + for attributed_classifier in classifiers: + if Constants.VERBOSE: + print('Running ' + attributed_classifier.name + '...') + classifier_summary = \ + SleepWakeClassifierSummaryBuilder.build_monte_carlo_everyone( + attributed_classifier, feature_sets, trial_count) + + CurvePlotBuilder.make_roc_sw(classifier_summary) + CurvePlotBuilder.make_pr_sw(classifier_summary) + TableBuilder.print_table_sw(classifier_summary) + + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_pr') + CurvePlotBuilder.combine_plots_as_grid(classifiers, trial_count, '_sw_roc') + + if __name__ == "__main__": start_time = time.time() - figure_leave_one_out_roc_and_pr() + # figures_mc_train_apnea_test_narcolepsy() + # figures_train_health_test_disordered() + # figures_mc_sleep_wake_disordered_apnea_only() + # figures_mc_sleep_wake_disordered() + figures_mc_sleep_wake() + # figures_mc_disordered_and_healthy_together() + + # figure_leave_one_out_roc_and_pr() # # figures_mc_sleep_wake() # figures_mc_three_class() diff --git a/source/analysis/classification/classifier_summary_builder.py b/source/analysis/classification/classifier_summary_builder.py index 22ae111..7b28ec2 100644 --- a/source/analysis/classification/classifier_summary_builder.py +++ b/source/analysis/classification/classifier_summary_builder.py @@ -11,42 +11,151 @@ class SleepWakeClassifierSummaryBuilder(object): @staticmethod - def build_monte_carlo(attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]], + def build_monte_carlo(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]], number_of_splits: int) -> ClassifierSummary: subject_ids = SubjectBuilder.get_all_subject_ids() subject_dictionary = SubjectBuilder.get_subject_dictionary() - data_splits = TrainTestSplitter.by_fraction(subject_ids, test_fraction=0.3, 
number_of_splits=number_of_splits) + data_splits = TrainTestSplitter.by_fraction(subject_ids, + test_fraction=0.3, + number_of_splits=number_of_splits) - return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, subject_dictionary, + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, + attributed_classifier, + feature_sets) + @staticmethod + def build_monte_carlo_everyone(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]], + number_of_splits: int) -> ClassifierSummary: + subject_ids_healthy = SubjectBuilder.get_all_subject_ids() + subject_ids_disordered = \ + SubjectBuilder.get_all_disordered_subject_ids() + + subject_dictionary = SubjectBuilder.get_subject_dictionary() + subject_dictionary_disordered = \ + SubjectBuilder.get_subject_dictionary_disordered() + + subject_ids = subject_ids_healthy + subject_ids_disordered + overall_dictionary = subject_dictionary | subject_dictionary_disordered + + data_splits = TrainTestSplitter.by_fraction(subject_ids, + test_fraction=0.3, + number_of_splits=number_of_splits) + + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + overall_dictionary, + attributed_classifier, + feature_sets) + + + @staticmethod + def build_monte_carlo_disordered(attributed_classifier: + AttributedClassifier, + feature_sets: [[FeatureType]], + number_of_splits: int) -> \ + ClassifierSummary: + subject_ids = SubjectBuilder.get_all_disordered_subject_ids() + subject_dictionary = SubjectBuilder.get_subject_dictionary_disordered() + + data_splits = TrainTestSplitter.by_fraction(subject_ids, + test_fraction=0.3, + number_of_splits=number_of_splits) + + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, + attributed_classifier, + feature_sets) + + @staticmethod + def build_monte_carlo_apnea_only(attributed_classifier: + AttributedClassifier, + feature_sets: [[FeatureType]], + number_of_splits: int) -> \ + 
ClassifierSummary: + subject_ids = SubjectBuilder.get_apnea_only_sleepers() + subject_dictionary = SubjectBuilder.get_subject_dictionary_disordered() + + data_splits = TrainTestSplitter.by_fraction(subject_ids, + test_fraction=0.3, + number_of_splits=number_of_splits) + + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, + attributed_classifier, + feature_sets) + + @staticmethod + def build_train_healthy_test_disordered(attributed_classifier, + feature_sets: [[FeatureType]]) -> \ + ClassifierSummary: + subject_ids_healthy = SubjectBuilder.get_all_subject_ids() + subject_ids_disordered = \ + SubjectBuilder.get_all_disordered_subject_ids() + + subject_dictionary = SubjectBuilder.get_subject_dictionary() + subject_dictionary_disordered = \ + SubjectBuilder.get_subject_dictionary_disordered() + + data_splits = [DataSplit(training_set=subject_ids_healthy, + testing_set=subject_ids_disordered)] + overall_dictionary = subject_dictionary | subject_dictionary_disordered + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + overall_dictionary, + attributed_classifier, + feature_sets) + @staticmethod + def build_train_apnea_test_narcolepsy(attributed_classifier, + feature_sets: [[FeatureType]]) -> \ + ClassifierSummary: + subject_ids_apnea = SubjectBuilder.get_apnea_only_sleepers() + subject_ids_narcolepsy = SubjectBuilder.get_narcolepsy_only_sleepers() + + subject_dictionary = SubjectBuilder.get_subject_dictionary() + subject_dictionary_disordered = \ + SubjectBuilder.get_subject_dictionary_disordered() + + data_splits = [DataSplit(training_set=subject_ids_apnea, + testing_set=subject_ids_narcolepsy)] + overall_dictionary = subject_dictionary | subject_dictionary_disordered + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + overall_dictionary, attributed_classifier, feature_sets) @staticmethod def build_leave_one_out(attributed_classifier: AttributedClassifier, - feature_sets: 
[[FeatureType]]) -> ClassifierSummary: + feature_sets: [ + [FeatureType]]) -> ClassifierSummary: subject_ids = SubjectBuilder.get_all_subject_ids() subject_dictionary = SubjectBuilder.get_subject_dictionary() data_splits = TrainTestSplitter.leave_one_out(subject_ids) - return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, subject_dictionary, + return SleepWakeClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, attributed_classifier, feature_sets) @staticmethod - def run_feature_sets(data_splits: [DataSplit], subject_dictionary, attributed_classifier: AttributedClassifier, + def run_feature_sets(data_splits: [DataSplit], subject_dictionary, + attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]]): performance_dictionary = {} for feature_set in feature_sets: - raw_performance_results = ClassifierService.run_sw(data_splits, attributed_classifier, - subject_dictionary, feature_set) - performance_dictionary[tuple(feature_set)] = raw_performance_results + raw_performance_results = ClassifierService.run_sw(data_splits, + attributed_classifier, + subject_dictionary, + feature_set) + performance_dictionary[ + tuple(feature_set)] = raw_performance_results return ClassifierSummary(attributed_classifier, performance_dictionary) @staticmethod - def build_mesa(attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]]): + def build_mesa(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]]): apple_watch_subjects = SubjectBuilder.get_subject_dictionary() mesa_subjects = MesaDataService.get_all_subjects() training_set = [] @@ -61,10 +170,12 @@ def build_mesa(attributed_classifier: AttributedClassifier, feature_sets: [[Feat testing_set.append(mesa_subject.subject_id) mesa_dictionary[mesa_subject.subject_id] = mesa_subject - data_split = DataSplit(training_set=training_set, testing_set=testing_set) + data_split = DataSplit(training_set=training_set, + testing_set=testing_set) 
apple_watch_subjects.update(mesa_dictionary) - return SleepWakeClassifierSummaryBuilder.run_feature_sets([data_split], apple_watch_subjects, + return SleepWakeClassifierSummaryBuilder.run_feature_sets([data_split], + apple_watch_subjects, attributed_classifier, feature_sets) @@ -72,48 +183,60 @@ def build_mesa(attributed_classifier: AttributedClassifier, feature_sets: [[Feat class ThreeClassClassifierSummaryBuilder(object): @staticmethod - def build_monte_carlo(attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]], + def build_monte_carlo(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]], number_of_splits: int) -> ClassifierSummary: subject_ids = SubjectBuilder.get_all_subject_ids() subject_dictionary = SubjectBuilder.get_subject_dictionary() - data_splits = TrainTestSplitter.by_fraction(subject_ids, test_fraction=0.3, number_of_splits=number_of_splits) + data_splits = TrainTestSplitter.by_fraction(subject_ids, + test_fraction=0.3, + number_of_splits=number_of_splits) - return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, subject_dictionary, + return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, attributed_classifier, feature_sets) @staticmethod def build_leave_one_out(attributed_classifier: AttributedClassifier, - feature_sets: [[FeatureType]]) -> ClassifierSummary: + feature_sets: [ + [FeatureType]]) -> ClassifierSummary: subject_ids = SubjectBuilder.get_all_subject_ids() subject_dictionary = SubjectBuilder.get_subject_dictionary() data_splits = TrainTestSplitter.leave_one_out(subject_ids) - return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, subject_dictionary, + return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, + subject_dictionary, attributed_classifier, feature_sets) @staticmethod - def run_feature_sets(data_splits: [DataSplit], subject_dictionary, attributed_classifier: AttributedClassifier, + def 
run_feature_sets(data_splits: [DataSplit], subject_dictionary, + attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]], use_preloaded=False): performance_dictionary = {} for feature_set in feature_sets: if use_preloaded: - raw_performance_results = ClassifierService.run_three_class_with_loaded_model(data_splits, - attributed_classifier, - subject_dictionary, - feature_set) + raw_performance_results = \ + ClassifierService.run_three_class_with_loaded_model( + data_splits, + attributed_classifier, + subject_dictionary, + feature_set) else: - raw_performance_results = ClassifierService.run_three_class(data_splits, attributed_classifier, - subject_dictionary, feature_set) - performance_dictionary[tuple(feature_set)] = raw_performance_results + raw_performance_results = ClassifierService.run_three_class( + data_splits, attributed_classifier, + subject_dictionary, feature_set) + performance_dictionary[ + tuple(feature_set)] = raw_performance_results return ClassifierSummary(attributed_classifier, performance_dictionary) @staticmethod - def build_mesa_leave_one_out(attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]]): + def build_mesa_leave_one_out(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]]): apple_watch_subjects = SubjectBuilder.get_subject_dictionary() mesa_subjects = MesaDataService.get_all_subjects() training_set = [] @@ -127,17 +250,21 @@ def build_mesa_leave_one_out(attributed_classifier: AttributedClassifier, featur mesa_subject.subject_id = 'mesa' + mesa_subject.subject_id mesa_dictionary[mesa_subject.subject_id] = mesa_subject testing_set = [mesa_subject.subject_id] - data_split = DataSplit(training_set=training_set, testing_set=testing_set) + data_split = DataSplit(training_set=training_set, + testing_set=testing_set) data_splits.append(data_split) apple_watch_subjects.update(mesa_dictionary) - return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, 
apple_watch_subjects, + return ThreeClassClassifierSummaryBuilder.run_feature_sets(data_splits, + apple_watch_subjects, attributed_classifier, - feature_sets, True) + feature_sets, + True) @staticmethod - def build_mesa_all_combined(attributed_classifier: AttributedClassifier, feature_sets: [[FeatureType]]): + def build_mesa_all_combined(attributed_classifier: AttributedClassifier, + feature_sets: [[FeatureType]]): apple_watch_subjects = SubjectBuilder.get_subject_dictionary() mesa_subjects = MesaDataService.get_all_subjects() training_set = [] @@ -152,9 +279,11 @@ def build_mesa_all_combined(attributed_classifier: AttributedClassifier, feature mesa_dictionary[mesa_subject.subject_id] = mesa_subject testing_set.append(mesa_subject.subject_id) - data_split = DataSplit(training_set=training_set, testing_set=testing_set) + data_split = DataSplit(training_set=training_set, + testing_set=testing_set) apple_watch_subjects.update(mesa_dictionary) - return ThreeClassClassifierSummaryBuilder.run_feature_sets([data_split], apple_watch_subjects, - attributed_classifier, - feature_sets) + return ThreeClassClassifierSummaryBuilder.run_feature_sets( + [data_split], apple_watch_subjects, + attributed_classifier, + feature_sets) diff --git a/source/analysis/figures/data_plot_builder.py b/source/analysis/figures/data_plot_builder.py index 1a3627e..cc9bca3 100644 --- a/source/analysis/figures/data_plot_builder.py +++ b/source/analysis/figures/data_plot_builder.py @@ -48,13 +48,11 @@ def tidy_data_plot(x_min, x_max, dt, ax): def make_data_demo(subject_id="16", snippet=False): hr_color = [0.8, 0.2, 0.1] motion_color = [0.3, 0.2, 0.8] - circ_color = [0.9, 0.7, 0] psg_color = [0.1, 0.7, 0.1] font_size = 16 font_name = "Arial" data_path = str(Constants.CROPPED_FILE_PATH) + '/' - circadian_data_path = str(utils.get_project_root().joinpath('data/circadian_predictions/')) + '/' output_path = str(Constants.FIGURE_FILE_PATH) + '/' if snippet is False: @@ -62,21 +60,20 @@ def 
make_data_demo(subject_id="16", snippet=False): else: fig = plt.figure(figsize=(3, 12)) - num_v_plots = 5 + num_v_plots = 4 fig.patch.set_facecolor('white') if (os.path.isfile(data_path + subject_id + '_cleaned_hr.out') and os.path.isfile( data_path + subject_id + '_cleaned_motion.out') and os.path.isfile( data_path + subject_id + '_cleaned_psg.out') and os.path.isfile(data_path + subject_id + '_cleaned_counts.out') and - os.stat(data_path + subject_id + '_cleaned_motion.out').st_size > 0) and os.path.isfile( - circadian_data_path + subject_id + '_clock_proxy.txt'): + os.stat(data_path + subject_id + '_cleaned_motion.out').st_size + > 0): hr = np.genfromtxt(data_path + subject_id + '_cleaned_hr.out', delimiter=' ') motion = np.genfromtxt(data_path + subject_id + '_cleaned_motion.out', delimiter=' ') scores = np.genfromtxt(data_path + subject_id + '_cleaned_psg.out', delimiter=' ') counts = np.genfromtxt(data_path + subject_id + '_cleaned_counts.out', delimiter=',') - circ_model = np.genfromtxt(circadian_data_path + subject_id + '_clock_proxy.txt', delimiter=',') min_time = min(scores[:, 0]) max_time = max(scores[:, 0]) @@ -160,26 +157,7 @@ def make_data_demo(subject_id="16", snippet=False): plt.xlabel(str(window_size) + ' sec window', fontsize=font_size, fontname=font_name) plt.ylim(35, 100) - else: - y_min = 40 - y_max = 130 - plt.ylim(y_min, y_max) - current_axis = plt.gca() - current_axis.add_patch( - Rectangle((sample_point, y_min), window_size, y_max - y_min, alpha=0.35, facecolor="gray")) - plt.ylim(40, 130) - ax = plt.subplot(num_v_plots, 1, 4) - ax.plot(circ_model[:, 0], -circ_model[:, 1], color=circ_color) - plt.ylabel('Clock Proxy', fontsize=font_size, fontname=font_name) - DataPlotBuilder.tidy_data_plot(min_time, max_time, dt, ax) - if snippet: - plt.axis('off') - plt.ylim(-1, -1) - else: - plt.ylim(.2, 1.2) - - ax = plt.subplot(num_v_plots, 1, 5) relabeled_scores = DataPlotBuilder.convert_labels_for_hypnogram(scores[:, 1]) ax.step(scores[:, 0], 
relabeled_scores, color=psg_color) diff --git a/source/analysis/setup/subject_builder.py b/source/analysis/setup/subject_builder.py index 09fce60..e2fdf85 100644 --- a/source/analysis/setup/subject_builder.py +++ b/source/analysis/setup/subject_builder.py @@ -1,10 +1,15 @@ + from source.analysis.setup.feature_type import FeatureType from source.analysis.setup.subject import Subject from source.constants import Constants -from source.preprocessing.activity_count.activity_count_feature_service import ActivityCountFeatureService -from source.preprocessing.heart_rate.heart_rate_feature_service import HeartRateFeatureService +from source.preprocessing.activity_count.activity_count_feature_service \ + import \ + ActivityCountFeatureService +from source.preprocessing.heart_rate.heart_rate_feature_service import \ + HeartRateFeatureService from source.preprocessing.psg.psg_label_service import PSGLabelService -from source.preprocessing.time.time_based_feature_service import TimeBasedFeatureService +from source.preprocessing.time.time_based_feature_service import \ + TimeBasedFeatureService class SubjectBuilder(object): @@ -12,9 +17,12 @@ class SubjectBuilder(object): @staticmethod def get_all_subject_ids(): - subjects_as_ints = [3509524, 5132496, 1066528, 5498603, 2638030, 2598705, 5383425, 1455390, 4018081, 9961348, - 1449548, 8258170, 781756, 9106476, 8686948, 8530312, 3997827, 4314139, 1818471, 4426783, - 8173033, 7749105, 5797046, 759667, 8000685, 6220552, 844359, 9618981, 1360686, 46343, + subjects_as_ints = [3509524, 5132496, 1066528, 5498603, 2638030, + 2598705, 5383425, 1455390, 4018081, 9961348, + 1449548, 8258170, 781756, 9106476, 8686948, + 8530312, 3997827, 4314139, 1818471, 4426783, + 8173033, 7749105, 5797046, 759667, 8000685, + 6220552, 844359, 9618981, 1360686, 46343, 8692923] subjects_as_strings = [] @@ -23,6 +31,44 @@ def get_all_subject_ids(): subjects_as_strings.append(str(subject)) return subjects_as_strings + @staticmethod + def 
get_all_disordered_subject_ids(): + # This is excluding anyone without a 100% full night, and all the nones + return ["d02", "d03", "d04", "d05", "d08", "d09", + "d10", "d11", "d12", "d13", "d15", "d16", "d18", + "d19", "d21", "d23", "d24", "d25", "d28", + "d29", "d30", "d32", "d34", "d35", "d36", "d37", + "d38", "d39", "d40"] + + # return ["d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", + # "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", + # "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d28", + # "d29", "d30", "d31", "d32", "d33", "d34", "d35", "d36", "d37", + # "d38", "d39", "d40"] + + @staticmethod + def get_apnea_only_sleepers(): + base_ids = SubjectBuilder.get_all_disordered_subject_ids() + apnea_only_ids = [] + for id in base_ids: + diagnoses = SubjectBuilder.subject_to_disorder(id) + if 'modosa' in diagnoses or 'mildosa' in diagnoses or 'sevosa' in\ + diagnoses: + apnea_only_ids.append(id) + + return apnea_only_ids + + @staticmethod + def get_narcolepsy_only_sleepers(): + base_ids = SubjectBuilder.get_all_disordered_subject_ids() + narco_only_ids = [] + for id in base_ids: + diagnoses = SubjectBuilder.subject_to_disorder(id) + if 'narcolepsy' in diagnoses: + narco_only_ids.append(id) + + return narco_only_ids + @staticmethod def get_subject_dictionary(): subject_dictionary = {} @@ -32,13 +78,23 @@ def get_subject_dictionary(): return subject_dictionary + @staticmethod + def get_subject_dictionary_disordered(): + subject_dictionary = {} + all_subject_ids = SubjectBuilder.get_all_disordered_subject_ids() + for subject_id in all_subject_ids: + subject_dictionary[subject_id] = SubjectBuilder.build(subject_id) + + return subject_dictionary + @staticmethod def build(subject_id): feature_count = ActivityCountFeatureService.load(subject_id) feature_hr = HeartRateFeatureService.load(subject_id) feature_time = TimeBasedFeatureService.load_time(subject_id) if Constants.INCLUDE_CIRCADIAN: - feature_circadian = 
TimeBasedFeatureService.load_circadian_model(subject_id) + feature_circadian = TimeBasedFeatureService.load_circadian_model( + subject_id) else: feature_circadian = None feature_cosine = TimeBasedFeatureService.load_cosine(subject_id) @@ -66,6 +122,53 @@ def build(subject_id): # ax = plt.subplot(5, 1, 5) # ax.plot(range(len(labeled_sleep)), labeled_sleep) # - # plt.savefig(str(Constants.FIGURE_FILE_PATH.joinpath(subject_id + '_applewatch.png'))) + # plt.savefig(str(Constants.FIGURE_FILE_PATH.joinpath(subject_id + + # '_applewatch.png'))) # plt.close() return subject + + @staticmethod + def subject_to_disorder(subject_id): + + subject_disorder_dictionary = { + 'd01' : [], + 'd02' : ['narcolepsy', 'modosa'], + 'd03': ['narcolepsy', 'modosa'], + 'd04': ['modosa'], + 'd05': ['narcolepsy'], + 'd06': ['sevosa'], + 'd07': ['sevosa'], + 'd08': ['mildosa'], + 'd09': ['modosa'], + 'd10': ['mildosa'], + 'd11': ['mildosa'], + 'd12': ['mildosa'], + 'd13': ['modosa'], + 'd14': ['modosa'], + 'd15': ['mildosa'], + 'd16': ['mildosa'], + 'd17': ['mildosa'], + 'd18': ['mildosa'], + 'd19': ['modosa'], + 'd20': ['mildosa'], + 'd21': ['modosa'], + 'd22': ['mildosa'], + 'd23': ['mildosa'], + 'd24': ['modosa'], + 'd25': ['modosa'], + 'd26': [], + 'd28': ['mildosa'], + 'd29': ['mildosa'], + 'd30': ['mildosa'], + 'd31': [], + 'd32': ['mildosa'], + 'd33': [], + 'd34': ['mildosa'], + 'd35': ['mildosa'], + 'd36': ['mildosa'], + 'd37': ['modosa'], + 'd38': ['sevosa'], + 'd39': ['modosa'], + 'd40': ['modosa'] + } + return subject_disorder_dictionary[subject_id] diff --git a/source/preprocessing/heart_rate/heart_rate_service.py b/source/preprocessing/heart_rate/heart_rate_service.py index 794b528..4551981 100644 --- a/source/preprocessing/heart_rate/heart_rate_service.py +++ b/source/preprocessing/heart_rate/heart_rate_service.py @@ -1,9 +1,12 @@ +import numpy as np + import numpy as np import pandas as pd from source import utils from source.constants import Constants -from 
source.preprocessing.heart_rate.heart_rate_collection import HeartRateCollection +from source.preprocessing.heart_rate.heart_rate_collection import \ + HeartRateCollection class HeartRateService(object): @@ -13,22 +16,43 @@ def load_raw(subject_id): raw_hr_path = HeartRateService.get_raw_file_path(subject_id) heart_rate_array = HeartRateService.load(raw_hr_path, ",") heart_rate_array = utils.remove_repeats(heart_rate_array) - return HeartRateCollection(subject_id=subject_id, data=heart_rate_array) + return HeartRateCollection(subject_id=subject_id, + data=heart_rate_array) + + @staticmethod + def load_raw_disordered(subject_id): + subject_id = subject_id[1:] + + if len(subject_id) == 1: + subject_id = "0" + subject_id + + raw_hr_path = str(utils.get_project_root().joinpath( + 'data/disordered_sleepers/AWS0' + subject_id + + ' hr_data.csv')) + df = pd.read_csv(raw_hr_path) + heart_rate_array = df.values + + heart_rate_array = utils.remove_repeats(heart_rate_array) + return HeartRateCollection(subject_id="d" + subject_id, + data=heart_rate_array) @staticmethod def load_cropped(subject_id): cropped_hr_path = HeartRateService.get_cropped_file_path(subject_id) heart_rate_array = HeartRateService.load(cropped_hr_path) - return HeartRateCollection(subject_id=subject_id, data=heart_rate_array) + return HeartRateCollection(subject_id=subject_id, + data=heart_rate_array) @staticmethod def load(hr_file, delimiter=" "): - heart_rate_array = pd.read_csv(str(hr_file), delimiter=delimiter).values + heart_rate_array = pd.read_csv(str(hr_file), + delimiter=delimiter).values return heart_rate_array @staticmethod def write(heart_rate_collection): - hr_output_path = HeartRateService.get_cropped_file_path(heart_rate_collection.subject_id) + hr_output_path = HeartRateService.get_cropped_file_path( + heart_rate_collection.subject_id) np.savetxt(hr_output_path, heart_rate_collection.data, fmt='%f') @staticmethod @@ -43,7 +67,8 @@ def crop(heart_rate_collection, interval): @staticmethod 
def get_cropped_file_path(subject_id): - return Constants.CROPPED_FILE_PATH.joinpath(subject_id + "_cleaned_hr.out") + return Constants.CROPPED_FILE_PATH.joinpath( + subject_id + "_cleaned_hr.out") @staticmethod def get_raw_file_path(subject_id): diff --git a/source/preprocessing/motion/motion_service.py b/source/preprocessing/motion/motion_service.py index 87ec191..047f221 100644 --- a/source/preprocessing/motion/motion_service.py +++ b/source/preprocessing/motion/motion_service.py @@ -15,6 +15,23 @@ def load_raw(subject_id): motion_array = utils.remove_repeats(motion_array) return MotionCollection(subject_id=subject_id, data=motion_array) + @staticmethod + def load_raw_disordered(subject_id): + subject_id = subject_id[1:] + + if len(subject_id) == 1: + subject_id = "0" + subject_id + + raw_motion_path = str(utils.get_project_root().joinpath( + 'data/disordered_sleepers/AWS0' + subject_id + + ' motion_data.csv')) + df = pd.read_csv(raw_motion_path) + motion_array = df.values + + motion_array = utils.remove_repeats(motion_array) + return MotionCollection(subject_id="d" + subject_id, data=motion_array) + + @staticmethod def load_cropped(subject_id): cropped_motion_path = MotionService.get_cropped_file_path(subject_id) diff --git a/source/preprocessing/preprocessing_runner.py b/source/preprocessing/preprocessing_runner.py index 6c96e0d..1567bad 100644 --- a/source/preprocessing/preprocessing_runner.py +++ b/source/preprocessing/preprocessing_runner.py @@ -3,7 +3,8 @@ from source.analysis.figures.data_plot_builder import DataPlotBuilder from source.analysis.setup.subject_builder import SubjectBuilder from source.constants import Constants -from source.preprocessing.activity_count.activity_count_service import ActivityCountService +from source.preprocessing.activity_count.activity_count_service import \ + ActivityCountService from source.preprocessing.feature_builder import FeatureBuilder from source.preprocessing.raw_data_processor import RawDataProcessor from 
source.preprocessing.time.circadian_service import CircadianService @@ -17,9 +18,26 @@ def run_preprocessing(subject_set): RawDataProcessor.crop_all(str(subject)) if Constants.INCLUDE_CIRCADIAN: - ActivityCountService.build_activity_counts() # This uses MATLAB, but has been replaced with a python implementation - CircadianService.build_circadian_model() # Both of the circadian lines require MATLAB to run - CircadianService.build_circadian_mesa() # INCLUDE_CIRCADIAN = False by default because most people don't have MATLAB + ActivityCountService.build_activity_counts() # This uses MATLAB, + # but has been replaced with a python implementation + CircadianService.build_circadian_model() # Both of the circadian + # lines require MATLAB to run + CircadianService.build_circadian_mesa() # INCLUDE_CIRCADIAN = False + # by default because most people don't have MATLAB + + for subject in subject_set: + FeatureBuilder.build(str(subject)) + + end_time = time.time() + print("Execution took " + str((end_time - start_time) / 60) + " minutes") + + +def run_preprocessing_disordered_sleepers(subject_set): + start_time = time.time() + + for subject in subject_set: + print("Cropping data from subject " + str(subject) + "...") + RawDataProcessor.crop_all_disordered(str(subject)) for subject in subject_set: FeatureBuilder.build(str(subject)) @@ -30,6 +48,11 @@ def run_preprocessing(subject_set): subject_ids = SubjectBuilder.get_all_subject_ids() run_preprocessing(subject_ids) - for subject_id in subject_ids: DataPlotBuilder.make_data_demo(subject_id, False) + +disordered_subject_ids = SubjectBuilder.get_all_disordered_subject_ids() +run_preprocessing_disordered_sleepers(disordered_subject_ids) + +for subject_id in disordered_subject_ids: + DataPlotBuilder.make_data_demo(subject_id, False) diff --git a/source/preprocessing/psg/psg_converter.py b/source/preprocessing/psg/psg_converter.py index cf7905a..1462f9d 100644 --- a/source/preprocessing/psg/psg_converter.py +++ 
b/source/preprocessing/psg/psg_converter.py @@ -26,6 +26,16 @@ class PSGConverter(object): 5: SleepStage.rem, 6: SleepStage.unscored} + ints_to_labels_disordered = { + -1: SleepStage.unscored, + 0: SleepStage.wake, + 1: SleepStage.n1, + 2: SleepStage.n2, + 3: SleepStage.n3, + 4: SleepStage.rem, + 5: SleepStage.unscored, + 6: SleepStage.unscored} + @staticmethod def get_label_from_string(stage_string): if stage_string in PSGConverter.strings_to_labels: @@ -35,3 +45,8 @@ def get_label_from_string(stage_string): def get_label_from_int(stage_int): if stage_int in PSGConverter.ints_to_labels: return PSGConverter.ints_to_labels[stage_int] + + @staticmethod + def get_label_from_int_disordered(stage_int): + if stage_int in PSGConverter.ints_to_labels_disordered: + return PSGConverter.ints_to_labels_disordered[stage_int] diff --git a/source/preprocessing/psg/psg_service.py b/source/preprocessing/psg/psg_service.py index 6fcb61d..fedeed1 100644 --- a/source/preprocessing/psg/psg_service.py +++ b/source/preprocessing/psg/psg_service.py @@ -1,7 +1,9 @@ import csv +from datetime import datetime import numpy as np import pandas as pd +import pytz from source import utils from source.constants import Constants @@ -9,7 +11,8 @@ from source.preprocessing.psg.compumedics_processor import CompumedicsProcessor from source.preprocessing.psg.psg_converter import PSGConverter from source.preprocessing.psg.psg_file_type import PSGFileType -from source.preprocessing.psg.psg_raw_data_collection import PSGRawDataCollection +from source.preprocessing.psg.psg_raw_data_collection import \ + PSGRawDataCollection from source.preprocessing.psg.psg_report_processor import PSGReportProcessor from source.preprocessing.psg.stage_item import StageItem from source.preprocessing.psg.vitaport_processor import VitaportProcessor @@ -20,11 +23,13 @@ class PSGService(object): @staticmethod def get_path_to_file(subject_id): psg_dir = utils.get_project_root().joinpath('data/psg') - compumedics_file = 
psg_dir.joinpath('compumedics/AW0' + subject_id.zfill(2) + '.TXT') + compumedics_file = psg_dir.joinpath( + 'compumedics/AW0' + subject_id.zfill(2) + '.TXT') if compumedics_file.is_file(): return compumedics_file - txt_file = psg_dir.joinpath('vitaport/AW0' + subject_id.zfill(2) + '011.txt') + txt_file = psg_dir.joinpath( + 'vitaport/AW0' + subject_id.zfill(2) + '011.txt') if txt_file.is_file(): return txt_file @@ -32,11 +37,13 @@ def get_path_to_file(subject_id): def get_type_and_report(subject_id): report_dir = utils.get_project_root().joinpath('data/reports') - pdf_file = report_dir.joinpath('AW0' + subject_id.zfill(2) + '011_REPORT.pdf') + pdf_file = report_dir.joinpath( + 'AW0' + subject_id.zfill(2) + '011_REPORT.pdf') if pdf_file.is_file(): return pdf_file, PSGFileType.Compumedics - docx_file = report_dir.joinpath('AW00' + subject_id + '011 Study Sleep Log.docx') + docx_file = report_dir.joinpath( + 'AW00' + subject_id + '011 Study Sleep Log.docx') if docx_file.is_file(): return docx_file, PSGFileType.Vitaport @@ -46,20 +53,25 @@ def read_raw(subject_id): psg_report_path, psg_type = PSGService.get_type_and_report(subject_id) if psg_type == PSGFileType.Compumedics: - report_summary = PSGReportProcessor.get_summary_from_pdf(psg_report_path) - report_summary.start_epoch = PSGReportProcessor.get_start_epoch_for_subject(subject_id) + report_summary = PSGReportProcessor.get_summary_from_pdf( + psg_report_path) + report_summary.start_epoch = \ + PSGReportProcessor.get_start_epoch_for_subject( + subject_id) data = CompumedicsProcessor.parse(report_summary, psg_stage_path) return PSGRawDataCollection(subject_id=subject_id, data=data) if psg_type == PSGFileType.Vitaport: - report_summary = PSGReportProcessor.get_summary_from_docx(psg_report_path) + report_summary = PSGReportProcessor.get_summary_from_docx( + psg_report_path) data = VitaportProcessor.parse(report_summary, psg_stage_path) return PSGRawDataCollection(subject_id=subject_id, data=data) @staticmethod def 
read_precleaned(subject_id): - psg_path = str(utils.get_project_root().joinpath('data/labels/' + subject_id + '_labeled_sleep.txt')) + psg_path = str(utils.get_project_root().joinpath( + 'data/labels/' + subject_id + '_labeled_sleep.txt')) data = [] with open(psg_path, 'rt') as csv_file: @@ -71,17 +83,54 @@ def read_precleaned(subject_id): start_time = float(row[0]) start_score = int(row[1]) epoch = Epoch(timestamp=start_time, index=1) - data.append(StageItem(epoch=epoch, stage=PSGConverter.get_label_from_int(start_score))) + data.append(StageItem(epoch=epoch, + stage=PSGConverter.get_label_from_int( + start_score))) else: timestamp = start_time + count * 30 score = int(row[1]) epoch = Epoch(timestamp=timestamp, - index=(1 + int(np.floor(count / rows_per_epoch)))) + index=(1 + int( + np.floor(count / rows_per_epoch)))) - data.append(StageItem(epoch=epoch, stage=PSGConverter.get_label_from_int(score))) + data.append(StageItem(epoch=epoch, + stage=PSGConverter.get_label_from_int( + score))) count = count + 1 return PSGRawDataCollection(subject_id=subject_id, data=data) + @staticmethod + def read_precleaned_disordered(subject_id): + data = [] + subject_id = subject_id[1:] + if len(subject_id) == 1: + subject_id = "0" + subject_id + tz = pytz.timezone('America/Detroit') + + psg_path = str(utils.get_project_root().joinpath( + 'data/disordered_sleepers/AWS0' + subject_id + + '_stg_data.csv')) + df = pd.read_csv(psg_path) + + df["FullDate"] = df.Date.map(str) + " " + df.Time + df["DateTime"] = pd.to_datetime(df["FullDate"]) + + df["LocalizedDateTime"] = df.DateTime.dt.tz_localize('America/Detroit') + + epoch_times = (df["LocalizedDateTime"] - tz.localize(datetime.utcfromtimestamp(-3600 * 5))).dt.total_seconds() + epoch_times = epoch_times.values + stages = df.Stg.values + index = 0 + for epoch_time, stage in zip(epoch_times, stages): + epoch = Epoch(timestamp=epoch_time, + index=index) + index = index + 1 + data.append(StageItem(epoch=epoch, + 
stage=PSGConverter.get_label_from_int_disordered( + stage))) + + return PSGRawDataCollection(subject_id="d" + subject_id, data=data) + @staticmethod def crop(psg_raw_collection, interval): subject_id = psg_raw_collection.subject_id @@ -100,16 +149,19 @@ def write(psg_raw_data_collection): for index in range(len(psg_raw_data_collection.data)): stage_item = psg_raw_data_collection.data[index] - data_array.append([stage_item.epoch.timestamp, stage_item.stage.value]) + data_array.append( + [stage_item.epoch.timestamp, stage_item.stage.value]) np_psg_array = np.array(data_array) - psg_output_path = Constants.CROPPED_FILE_PATH.joinpath(psg_raw_data_collection.subject_id + "_cleaned_psg.out") + psg_output_path = Constants.CROPPED_FILE_PATH.joinpath( + psg_raw_data_collection.subject_id + "_cleaned_psg.out") np.savetxt(psg_output_path, np_psg_array, fmt='%f') @staticmethod def load_cropped_array(subject_id): - cropped_psg_path = Constants.CROPPED_FILE_PATH.joinpath(subject_id + "_cleaned_psg.out") + cropped_psg_path = Constants.CROPPED_FILE_PATH.joinpath( + subject_id + "_cleaned_psg.out") return pd.read_csv(str(cropped_psg_path), delimiter=' ').values @staticmethod @@ -119,7 +171,8 @@ def load_cropped(subject_id): for row in range(np.shape(cropped_array)[0]): value = cropped_array[row, 1] - stage_items.append(StageItem(epoch=Epoch(timestamp=cropped_array[row, 0], index=row), - stage=PSGConverter.get_label_from_int(value))) + stage_items.append(StageItem( + epoch=Epoch(timestamp=cropped_array[row, 0], index=row), + stage=PSGConverter.get_label_from_int(value))) return PSGRawDataCollection(subject_id=subject_id, data=stage_items) diff --git a/source/preprocessing/raw_data_processor.py b/source/preprocessing/raw_data_processor.py index 685d03e..0b695b4 100644 --- a/source/preprocessing/raw_data_processor.py +++ b/source/preprocessing/raw_data_processor.py @@ -33,6 +33,27 @@ def crop_all(subject_id): HeartRateService.write(heart_rate_collection) 
ActivityCountService.build_activity_counts_without_matlab(subject_id, motion_collection.data) # Builds activity counts with python, not MATLAB + @staticmethod + def crop_all_disordered(subject_id): + psg_raw_collection = PSGService.read_precleaned_disordered(subject_id) + motion_collection = MotionService.load_raw_disordered(subject_id) + heart_rate_collection = HeartRateService.load_raw_disordered( + subject_id) + + valid_interval = RawDataProcessor.get_intersecting_interval([psg_raw_collection, + motion_collection, + heart_rate_collection]) + + psg_raw_collection = PSGService.crop(psg_raw_collection, valid_interval) + motion_collection = MotionService.crop(motion_collection, valid_interval) + heart_rate_collection = HeartRateService.crop(heart_rate_collection, valid_interval) + + PSGService.write(psg_raw_collection) + MotionService.write(motion_collection) + HeartRateService.write(heart_rate_collection) + ActivityCountService.build_activity_counts_without_matlab(subject_id, motion_collection.data) # Builds activity counts with python, not MATLAB + + @staticmethod def get_intersecting_interval(collection_list): start_times = [] diff --git a/source/utils.py b/source/utils.py index c5784f2..d170524 100644 --- a/source/utils.py +++ b/source/utils.py @@ -20,6 +20,18 @@ def get_project_root() -> Path: def get_classifiers(): + + return [AttributedClassifier(name='Random Forest', + classifier=RandomForestClassifier(n_estimators=100, max_features=1.0, + max_depth=10, + min_samples_split=10, min_samples_leaf=32, + bootstrap=True)), + AttributedClassifier(name='Logistic Regression', + classifier=LogisticRegression(penalty='l1', solver='liblinear', verbose=0, + multi_class='auto')), + AttributedClassifier(name='k-Nearest Neighbors', + classifier=KNeighborsClassifier( + weights='distance'))] return [AttributedClassifier(name='Random Forest', classifier=RandomForestClassifier(n_estimators=100, max_features=1.0, max_depth=10, @@ -37,6 +49,14 @@ def get_classifiers(): def 
get_base_feature_sets(): + + return [[FeatureType.count], + [FeatureType.heart_rate], + [FeatureType.count, FeatureType.heart_rate]] + + return [[FeatureType.count]] + + return [[FeatureType.count, FeatureType.heart_rate]] return [[FeatureType.count], [FeatureType.heart_rate], [FeatureType.count, FeatureType.heart_rate],