Commit 9225a36

Adding debug hooks and inference pipeline support (#27)
1 parent a305498

File tree

4 files changed: +143 -0 lines changed

  src/stepfunctions/steps/sagemaker.py
  src/stepfunctions/template/pipeline/inference.py
  tests/unit/test_pipeline.py
  tests/unit/test_sagemaker_steps.py

src/stepfunctions/steps/sagemaker.py

Lines changed: 6 additions & 0 deletions
@@ -66,6 +66,12 @@ def __init__(self, state_id, estimator, job_name, data=None, hyperparameters=Non
         else:
             parameters = training_config(estimator=estimator, inputs=data, mini_batch_size=mini_batch_size)
 
+        if estimator.debugger_hook_config != None:
+            parameters['DebugHookConfig'] = estimator.debugger_hook_config._to_request_dict()
+
+        if estimator.rules != None:
+            parameters['DebugRuleConfigurations'] = [rule.to_debugger_rule_config_dict() for rule in estimator.rules]
+
         if isinstance(job_name, (ExecutionInput, StepInput)):
             parameters['TrainingJobName'] = job_name
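
With this change, a debugger hook configuration and any rules attached to the estimator are carried into the TrainingStep parameters. A minimal sketch of how that surfaces to a caller, assuming the pre-2.0 SageMaker Python SDK used by this repository; the image URI and role ARN below are placeholders, and building the step needs an AWS region configured (or a mocked sagemaker_session, as in the unit tests further down):

    import sagemaker
    from sagemaker.debugger import DebuggerHookConfig, Rule, rule_configs
    from stepfunctions.steps.sagemaker import TrainingStep

    # Placeholder values for illustration only.
    PCA_IMAGE = '<account>.dkr.ecr.us-east-1.amazonaws.com/pca:1'
    EXECUTION_ROLE = 'arn:aws:iam::123456789012:role/SageMakerRole'

    estimator = sagemaker.estimator.Estimator(
        PCA_IMAGE,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path='s3://sagemaker/models',
        # Both attributes below are now read by TrainingStep.
        debugger_hook_config=DebuggerHookConfig(s3_output_path='s3://sagemaker/models/debug'),
        rules=[Rule.sagemaker(rule_configs.loss_not_decreasing())]
    )

    # The generated state should now include DebugHookConfig and DebugRuleConfigurations.
    step = TrainingStep('Training', estimator=estimator, job_name='TrainingJob')
    print(step.to_dict()['Parameters']['DebugHookConfig'])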

src/stepfunctions/template/pipeline/inference.py

Lines changed: 8 additions & 0 deletions
@@ -204,6 +204,10 @@ def execute(self, job_name=None, hyperparameters=None):
             s3_bucket=self.s3_bucket,
             pipeline_name=self.workflow.name
         )
+        inputs[StepId.TrainPreprocessor.value]['DebugHookConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models/debug'.format(
+            s3_bucket=self.s3_bucket,
+            pipeline_name=self.workflow.name
+        )
         inputs[StepId.CreatePreprocessorModel.value]['PrimaryContainer']['ModelDataUrl'] = '{s3_uri}/{job}/output/model.tar.gz'.format(
             s3_uri=inputs[StepId.TrainPreprocessor.value]['OutputDataConfig']['S3OutputPath'],
             job=inputs[StepId.TrainPreprocessor.value]['TrainingJobName']

@@ -236,6 +240,10 @@ def execute(self, job_name=None, hyperparameters=None):
             s3_bucket=self.s3_bucket,
             pipeline_name=self.workflow.name
         )
+        inputs[StepId.Train.value]['DebugHookConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models/debug'.format(
+            s3_bucket=self.s3_bucket,
+            pipeline_name=self.workflow.name
+        )
         inputs[StepId.CreatePipelineModel.value]['ModelName'] = job_name
         self.replace_sagemaker_job_name(inputs[StepId.Train.value], inputs[StepId.Train.value]['TrainingJobName'])
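
Both training steps in the inference pipeline now receive a debugger output location derived from the pipeline's S3 bucket and workflow name, next to the existing model output path. A rough sketch of the value this format string produces, reusing the bucket and workflow name that the unit tests below assert against (illustrative only):

    s3_bucket = 'sagemaker-us-east-1'
    pipeline_name = 'inference-pipeline'

    debug_path = 's3://{s3_bucket}/{pipeline_name}/models/debug'.format(
        s3_bucket=s3_bucket,
        pipeline_name=pipeline_name
    )
    assert debug_path == 's3://sagemaker-us-east-1/inference-pipeline/models/debug'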

tests/unit/test_pipeline.py

Lines changed: 14 additions & 0 deletions
@@ -18,6 +18,7 @@
 from sagemaker.sklearn.estimator import SKLearn
 from unittest.mock import MagicMock, patch
 from stepfunctions.template import TrainingPipeline, InferencePipeline
+from sagemaker.debugger import DebuggerHookConfig
 
 from tests.unit.utils import mock_boto_api_call

@@ -65,6 +66,10 @@ def sklearn_preprocessor():
         source_dir=source_dir,
         sagemaker_session=sagemaker_session
     )
+
+    sklearn_preprocessor.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/source/debug'
+    )
 
     return sklearn_preprocessor

@@ -86,6 +91,10 @@ def linear_learner_estimator():
         sagemaker_session=sagemaker_session
     )
 
+    ll_estimator.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/models/debug'
+    )
+
     ll_estimator.set_hyperparameters(feature_dim=10, predictor_type='regressor', mini_batch_size=32)
 
     return ll_estimator

@@ -238,6 +247,7 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
     assert result['States']['Train Preprocessor'] == {
         'Parameters': {
             'AlgorithmSpecification.$': "$$.Execution.Input['Train Preprocessor'].AlgorithmSpecification",
+            'DebugHookConfig.$': "$$.Execution.Input['Train Preprocessor'].DebugHookConfig",
             'HyperParameters.$': "$$.Execution.Input['Train Preprocessor'].HyperParameters",
             'InputDataConfig.$': "$$.Execution.Input['Train Preprocessor'].InputDataConfig",
             'OutputDataConfig.$': "$$.Execution.Input['Train Preprocessor'].OutputDataConfig",

@@ -342,6 +352,9 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
         'OutputDataConfig': {
             'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models'
         },
+        'DebugHookConfig': {
+            'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models/debug'
+        },
         'ResourceConfig': {
             'InstanceCount': 1,
             'InstanceType': 'ml.c4.xlarge',

@@ -406,6 +419,7 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
             }
         }],
         'OutputDataConfig': { 'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models' },
+        'DebugHookConfig': { 'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models/debug' },
         'ResourceConfig': {
             'InstanceCount': 1,
             'InstanceType': 'ml.c4.xlarge',
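
The new 'DebugHookConfig.$' key asserted above follows the Amazon States Language convention: a key ending in '.$' takes its value from a path, and '$$.Execution.Input' points into the execution input supplied at run time. A minimal sketch of the execution-input shape that reference resolves against, with the S3 paths taken from the assertions above and a hypothetical job name:

    # Illustrative only: the shape of the execution input that the
    # 'DebugHookConfig.$' parameter reference resolves against at run time.
    execution_input = {
        'Train Preprocessor': {
            'TrainingJobName': 'example-preprocessor-job',  # hypothetical value
            'OutputDataConfig': {
                'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models'
            },
            'DebugHookConfig': {
                'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models/debug'
            }
        }
    }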

tests/unit/test_sagemaker_steps.py

Lines changed: 115 additions & 0 deletions
@@ -21,6 +21,7 @@
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.pipeline import PipelineModel
 from sagemaker.model_monitor import DataCaptureConfig
+from sagemaker.debugger import Rule, rule_configs, DebuggerHookConfig, CollectionConfig
 
 from unittest.mock import MagicMock, patch
 from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointStep, EndpointConfigStep

@@ -58,6 +59,54 @@ def pca_estimator():
 
     return pca
 
+@pytest.fixture
+def pca_estimator_with_debug_hook():
+    s3_output_location = 's3://sagemaker/models'
+
+    hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/output/debug',
+        hook_parameters={
+            "save_interval": "1"
+        },
+        collection_configs=[
+            CollectionConfig("hyperparameters"),
+            CollectionConfig("metrics")
+        ]
+    )
+
+    rules = [Rule.sagemaker(rule_configs.confusion(),
+        rule_parameters={
+            "category_no": "15",
+            "min_diag": "0.7",
+            "max_off_diag": "0.3",
+            "start_step": "17",
+            "end_step": "19"}
+    )]
+
+    pca = sagemaker.estimator.Estimator(
+        PCA_IMAGE,
+        role=EXECUTION_ROLE,
+        train_instance_count=1,
+        train_instance_type='ml.c4.xlarge',
+        output_path=s3_output_location,
+        debugger_hook_config=hook_config,
+        rules=rules
+    )
+
+    pca.set_hyperparameters(
+        feature_dim=50000,
+        num_components=10,
+        subtract_mean=True,
+        algorithm_mode='randomized',
+        mini_batch_size=200
+    )
+
+    pca.sagemaker_session = MagicMock()
+    pca.sagemaker_session.boto_region_name = 'us-east-1'
+    pca.sagemaker_session._default_bucket = 'sagemaker'
+
+    return pca
+
 @pytest.fixture
 def pca_model():
     model_data = 's3://sagemaker/models/pca.tar.gz'

@@ -95,6 +144,10 @@ def tensorflow_estimator():
         checkpoint_path='s3://sagemaker/models/sagemaker-tensorflow/checkpoints'
     )
 
+    estimator.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/models/debug'
+    )
+
     estimator.sagemaker_session = MagicMock()
     estimator.sagemaker_session.boto_region_name = 'us-east-1'
     estimator.sagemaker_session._default_bucket = 'sagemaker'

@@ -148,6 +201,65 @@ def test_training_step_creation(pca_estimator):
         'End': True
     }
 
+@patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
+def test_training_step_creation_with_debug_hook(pca_estimator_with_debug_hook):
+    step = TrainingStep('Training',
+        estimator=pca_estimator_with_debug_hook,
+        job_name='TrainingJob')
+    assert step.to_dict() == {
+        'Type': 'Task',
+        'Parameters': {
+            'AlgorithmSpecification': {
+                'TrainingImage': PCA_IMAGE,
+                'TrainingInputMode': 'File'
+            },
+            'OutputDataConfig': {
+                'S3OutputPath': 's3://sagemaker/models'
+            },
+            'StoppingCondition': {
+                'MaxRuntimeInSeconds': 86400
+            },
+            'ResourceConfig': {
+                'InstanceCount': 1,
+                'InstanceType': 'ml.c4.xlarge',
+                'VolumeSizeInGB': 30
+            },
+            'RoleArn': EXECUTION_ROLE,
+            'HyperParameters': {
+                'feature_dim': '50000',
+                'num_components': '10',
+                'subtract_mean': 'True',
+                'algorithm_mode': 'randomized',
+                'mini_batch_size': '200'
+            },
+            'DebugHookConfig': {
+                'S3OutputPath': 's3://sagemaker/output/debug',
+                'HookParameters': {'save_interval': '1'},
+                'CollectionConfigurations': [
+                    {'CollectionName': 'hyperparameters'},
+                    {'CollectionName': 'metrics'}
+                ]
+            },
+            'DebugRuleConfigurations': [
+                {
+                    'RuleConfigurationName': 'Confusion',
+                    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
+                    'RuleParameters': {
+                        'rule_to_invoke': 'Confusion',
+                        'category_no': '15',
+                        'min_diag': '0.7',
+                        'max_off_diag': '0.3',
+                        'start_step': '17',
+                        'end_step': '19'
+                    }
+                }
+            ],
+            'TrainingJobName': 'TrainingJob'
+        },
+        'Resource': 'arn:aws:states:::sagemaker:createTrainingJob.sync',
+        'End': True
+    }
+
 @patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
 def test_training_step_creation_with_model(pca_estimator):
     training_step = TrainingStep('Training', estimator=pca_estimator, job_name='TrainingJob')

@@ -231,6 +343,9 @@ def test_training_step_creation_with_framework(tensorflow_estimator):
         'OutputDataConfig': {
             'S3OutputPath': 's3://sagemaker/models'
         },
+        'DebugHookConfig': {
+            'S3OutputPath': 's3://sagemaker/models/debug'
+        },
         'StoppingCondition': {
             'MaxRuntimeInSeconds': 86400
         },
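
The DebugHookConfig block asserted in the new test is the SDK's request serialization of the fixture's DebuggerHookConfig: _to_request_dict(), the same call TrainingStep now makes, maps the snake_case constructor arguments onto the CamelCase API keys. A small sketch reusing the fixture's values; the printed dict is expected to match the structure asserted above:

    from sagemaker.debugger import CollectionConfig, DebuggerHookConfig

    hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/output/debug',
        hook_parameters={'save_interval': '1'},
        collection_configs=[
            CollectionConfig('hyperparameters'),
            CollectionConfig('metrics')
        ]
    )

    # Expected: S3OutputPath, HookParameters and CollectionConfigurations keys,
    # as asserted in test_training_step_creation_with_debug_hook above.
    print(hook_config._to_request_dict())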
