1+ import time
2+
13from dpdispatcher .machine import Machine
24from dpdispatcher import dlog
35from dpdispatcher .JobStatus import JobStatus
@@ -62,14 +64,25 @@ def gen_script_header(self, job):
6264
6365 return lsf_script_header
6466
65- def do_submit (self , job ):
67+ def do_submit (self , job , retry = 0 , max_retry = 3 ):
6668 script_file_name = job .script_file_name
6769 script_str = self .gen_script (job )
6870 job_id_name = job .job_hash + '_job_id'
6971 self .context .write_file (fname = script_file_name , write_str = script_str )
70- stdin , stdout , stderr = self .context .block_checkcall (
71- 'cd %s && %s %s' % (self .context .remote_root , 'bsub < ' , script_file_name )
72- )
72+
73+ try :
74+ stdin , stdout , stderr = self .context .block_checkcall (
75+ 'cd %s && %s %s' % (self .context .remote_root , 'bsub < ' , script_file_name )
76+ )
77+ except RuntimeError as err :
78+ if retry < max_retry :
79+ dlog .warning (err )
80+ dlog .warning ("Sleep 60 s and retry submitting..." )
81+ # wait 60 s before retrying the submission
82+ time .sleep (60 )
83+ return self .do_submit (job , retry = retry + 1 , max_retry = max_retry )
84+ raise
85+
7386 subret = (stdout .readlines ())
7487 job_id = subret [0 ].split ()[1 ][1 :- 1 ]
7588 self .context .write_file (job_id_name , job_id )
@@ -85,7 +98,7 @@ def sub_script_cmd(self, res):
8598 def sub_script_head (self , res ):
 # No-op: `res` is not used and the method implicitly returns None.
8699 pass
87100
88- def check_status (self , job ):
101+ def check_status (self , job , retry = 0 , max_retry = 3 ):
89102 try :
90103 job_id = job .job_id
91104 except AttributeError :
@@ -101,6 +114,14 @@ def check_status(self, job):
101114 else :
102115 return JobStatus .terminated
103116 elif ret != 0 :
117+ # Just retry when any unknown error is raised.
118+ if retry < max_retry :
119+ dlog .warning ("Get error code %d in checking status through ssh with job: %s . message: %s" %
120+ (ret , job .job_hash , err_str ))
121+ dlog .warning ("Sleep 60 s and retry checking..." )
122+ # wait 60 s before checking the status again
123+ time .sleep (60 )
124+ return self .check_status (job , retry = retry + 1 , max_retry = max_retry )
104125 raise RuntimeError ("status command bjobs fails to execute.\n error info: %s \n return code %d\n "
105126 % (err_str , ret ))
106127 status_out = stdout .read ().decode ('utf-8' ).split ('\n ' )
0 commit comments