11#!/usr/bin/env python
22# coding: utf-8
33# %%
4+ import time
45import uuid
56
67from dargs .dargs import Argument
1415from .dpcloudserver import zip_file
1516import shutil
1617import tqdm
18+
1719# from zip_file import zip_files
20+ from .dpcloudserver .config import ALI_OSS_BUCKET_URL
21+
1822DP_CLOUD_SERVER_HOME_DIR = os .path .join (
1923 os .path .expanduser ('~' ),
2024 '.dpdispatcher/' ,
2327ENDPOINT = 'http://oss-cn-shenzhen.aliyuncs.com'
2428BUCKET_NAME = 'dpcloudserver'
2529
30+
2631class DpCloudServerContext (BaseContext ):
27- def __init__ (self ,
28- local_root ,
29- remote_root = None ,
30- remote_profile = {},
31- * args ,
32- ** kwargs ,
33- ):
32+ def __init__ (self ,
33+ local_root ,
34+ remote_root = None ,
35+ remote_profile = {},
36+ * args ,
37+ ** kwargs ,
38+ ):
3439 self .init_local_root = local_root
3540 self .init_remote_root = remote_root
3641 self .temp_local_root = os .path .abspath (local_root )
@@ -83,6 +88,43 @@ def _gen_oss_path(self, job, zip_filename):
8388 setattr (job , 'upload_path' , path )
8489 return path
8590
91+ def upload_job (self , job , common_files = None ):
92+ MAX_RETRY = 3
93+ if common_files is None :
94+ common_files = []
95+ self .machine .gen_local_script (job )
96+ zip_filename = job .job_hash + '.zip'
97+ oss_task_zip = self ._gen_oss_path (job , zip_filename )
98+ zip_task_file = os .path .join (self .local_root , zip_filename )
99+
100+ upload_file_list = [job .script_file_name , ]
101+ upload_file_list .extend (common_files )
102+
103+ for task in job .job_task_list :
104+ for file in task .forward_files :
105+ upload_file_list .append (
106+ os .path .join (
107+ task .task_work_path , file
108+ )
109+ )
110+
111+ upload_zip = zip_file .zip_file_list (
112+ self .local_root ,
113+ zip_task_file ,
114+ file_list = upload_file_list
115+ )
116+ result = self .api .upload (oss_task_zip , upload_zip , ENDPOINT , BUCKET_NAME )
117+ retry_count = 0
118+ while True :
119+ if self .api .check_file_has_uploaded (ALI_OSS_BUCKET_URL + oss_task_zip ):
120+ self ._backup (self .local_root , upload_zip )
121+ break
122+ elif retry_count < MAX_RETRY :
123+ time .sleep (1 + retry_count )
124+ retry_count += 1
125+ else :
126+ raise ValueError (f"upload retried excess { MAX_RETRY } terminate." )
127+
86128 def upload (self , submission ):
87129 # oss_task_dir = os.path.join('%s/%s/%s.zip' % ('indicate', file_uuid, file_uuid))
88130 # zip_filename = submission.submission_hash + '.zip'
@@ -100,30 +142,8 @@ def upload(self, submission):
100142 if len (job_to_be_uploaded ) == 0 :
101143 dlog .info ("all job has been uploaded, continue" )
102144 return result
103- for job in tqdm .tqdm (job_to_be_uploaded , desc = "Uploading to Lebesgue" , bar_format = bar_format ):
104- self .machine .gen_local_script (job )
105- zip_filename = job .job_hash + '.zip'
106- oss_task_zip = self ._gen_oss_path (job , zip_filename )
107- zip_task_file = os .path .join (self .local_root , zip_filename )
108-
109- upload_file_list = [job .script_file_name , ]
110- upload_file_list .extend (submission .forward_common_files )
111-
112- for task in job .job_task_list :
113- for file in task .forward_files :
114- upload_file_list .append (
115- os .path .join (
116- task .task_work_path , file
117- )
118- )
119-
120- upload_zip = zip_file .zip_file_list (
121- self .local_root ,
122- zip_task_file ,
123- file_list = upload_file_list
124- )
125- result = self .api .upload (oss_task_zip , upload_zip , ENDPOINT , BUCKET_NAME )
126- self ._backup (self .local_root , upload_zip )
145+ for job in tqdm .tqdm (job_to_be_uploaded , desc = "Uploading to Lebesgue" , bar_format = bar_format , leave = False ):
146+ self .upload_job (job , submission .forward_common_files )
127147 return result
128148 # return oss_task_zip
129149 # api.upload(self.oss_task_dir, zip_task_file)
@@ -151,7 +171,8 @@ def download(self, submission):
151171 job_hash = job_hashs [each ['task_id' ]]
152172 job_infos [job_hash ] = each
153173 bar_format = "{l_bar}{bar}| {n:.02f}/{total:.02f} % [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
154- for job_hash , info in tqdm .tqdm (job_infos .items (), desc = "Validating download file from Lebesgue" , bar_format = bar_format ):
174+ for job_hash , info in tqdm .tqdm (job_infos .items (), desc = "Validating download file from Lebesgue" ,
175+ bar_format = bar_format , leave = False ):
155176 result_filename = job_hash + '_back.zip'
156177 target_result_zip = os .path .join (self .local_root , result_filename )
157178 if self ._check_if_job_has_already_downloaded (target_result_zip , self .local_root ):
@@ -234,7 +255,7 @@ def clean(self):
234255 # retcode = cmd_pipes['stdout'].channel.recv_exit_status()
235256 # return retcode, cmd_pipes['stdout'], cmd_pipes['stderr']
236257
237- def kill (self , cmd_pipes ) :
258+ def kill (self , cmd_pipes ):
238259 pass
239260
240261 @classmethod
@@ -251,11 +272,12 @@ def machine_subfields(cls) -> List[Argument]:
251272 Argument ("email" , str , optional = False , doc = "Email" ),
252273 Argument ("password" , str , optional = False , doc = "Password" ),
253274 Argument ("program_id" , int , optional = False , doc = "Program ID" ),
275+ Argument ("keep_backup" , bool , optional = True , doc = "keep download and upload zip" ),
254276 Argument ("input_data" , dict , optional = False , doc = "Configuration of job" ),
255277 ], doc = doc_remote_profile )]
256278
257279
class LebesgueContext(DpCloudServerContext):
    """Alias of ``DpCloudServerContext`` under the Lebesgue name; adds no
    behavior of its own."""
260282
261- #%%
283+ # %%
0 commit comments