1010from copy import deepcopy
1111from datetime import datetime
1212from threading import Condition , Lock
13- from typing import Optional
13+ from typing import Optional , Dict
1414
1515import requests
1616
1919from .fetch import make_fetcher
2020from .logs import get_log
2121from .parse import parse_resource
22- from .utils import Watchable , hex_digest , img_to_data , non_blocking_lock , url_get , utc_now
22+
23+ from .utils import (
24+ Watchable ,
25+ hex_digest ,
26+ img_to_data ,
27+ non_blocking_lock ,
28+ url_get ,
29+ utc_now ,
30+ resource_string ,
31+ resource_filename ,
32+ safe_write ,
33+ hash_id ,
34+ )
2335
2436requests .packages .urllib3 .disable_warnings ()
2537
@@ -169,6 +181,10 @@ def __getstate__(self):
def __setstate__(self, state):
    """Refuse to restore this object from pickled state.

    :param state: the pickled state; it is ignored.
    :raises ValueError: always.
    """
    message = "this object should not be unpickled"
    raise ValueError(message)
171183
@property
def local_copy_fn(self):
    """Path of the local copy for this resource.

    The path is ``config.local_copy_dir`` joined with the value of
    ``hash_id(self.url, 'sha256', False)``.
    """
    backup_dir = config.local_copy_dir
    file_name = hash_id(self.url, 'sha256', False)
    return os.path.join(backup_dir, file_name)
187+
@property
def post(self):
    """Return the ``'via'`` entry of this resource's options."""
    options = self.opts
    return options['via']
@@ -231,8 +247,12 @@ def is_expired(self) -> bool:
def is_valid(self) -> bool:
    """Tell whether the resource is currently usable.

    A resource is valid when it is not expired and both ``last_seen`` and
    ``last_parser`` have been set (are not None).
    """
    if self.is_expired():
        return False
    if self.last_seen is None:
        return False
    return self.last_parser is not None
233249
def add_info(self) -> Dict:
    """Create a fresh info record for this resource, register it and return it.

    The record starts with ``'State'`` set to None and ``'Resource'`` set to
    the resource URL; it is appended to ``self._infos`` before being returned
    so later updates by the caller are visible through that list.
    """
    record = {'State': None, 'Resource': self.url}
    self._infos.append(record)
    return record
236256
237257 def _replace (self , r ):
238258 for i in range (0 , len (self .children )):
@@ -275,35 +295,76 @@ def errors(self):
275295 else :
276296 return []
277297
278- def parse (self , getter ):
279- info = dict ()
280- info ['Resource' ] = self .url
281- self .add_info (info )
282- data = None
283- log .debug ("getting {}" .format (self .url ))
def load_backup(self, r):
    """Load the local backup copy of this resource.

    :param r: the HTTP response that triggered the fallback, or None when the
        fetch failed before any response was obtained. It is only used for
        log messages.
    :return: the result of ``resource_string(self.local_copy_fn)``, or None
        when reading the local copy raised IOError.
    """
    try:
        data = resource_string(self.local_copy_fn)
    except IOError as ex:
        # Fall back to our own URL when there is no response object to ask.
        log.warn(
            "Caught an exception trying to load local backup for {} via {}: {}".format(
                getattr(r, 'url', self.url), self.local_copy_fn, ex
            )
        )
        return None

    # This warning used to sit after the return statement and never ran.
    status_code = getattr(r, 'status_code', None)
    if data and status_code is not None:
        log.warn("Got status={:d} while getting {}. Fallback to local copy.".format(status_code, self.url))
    return data


def load_resource(self, getter):
    """Fetch the resource content, falling back to the local copy on failure.

    :param getter: callable invoked as ``getter(self.url)``; the result is
        used as an HTTP response (``headers``, ``status_code``, ``encoding``,
        ``reason``, ``ok`` and ``text`` are read from it).
    :return: a ``(data, info)`` tuple where ``info`` is the record created by
        ``self.add_info()``.
    :raises ResourceException: when neither the fetch nor the local copy
        produced any data.
    """
    info = self.add_info()
    data: Optional[str] = None
    status: Optional[int] = None
    # Pre-bind both names: the getter itself may raise, and the fallback and
    # error paths below must not hit an unbound local.
    r = None
    failure = None

    log.debug("Loading resource {}".format(self.url))

    try:
        r = getter(self.url)

        info['HTTP Response Headers'] = r.headers
        log.debug(
            "got status_code={:d}, encoding={} from_cache={} from {}".format(
                r.status_code, r.encoding, getattr(r, "from_cache", False), self.url
            )
        )
        status = r.status_code
        info['Reason'] = r.reason

        if r.ok:
            data = r.text
            self.etag = r.headers.get('ETag', None) or hex_digest(r.text, 'sha256')
        elif self.local_copy_fn is not None:
            data = self.load_backup(r)
            if data is not None and len(data) > 0:
                info['Reason'] = "Retrieved from local cache because status: {} != 200".format(status)
                # 218 marks content that was served from the local copy.
                status = 218

    except IOError as ex:
        failure = ex
        log.warn("caught exception from {}: {}".format(self.url, ex))
        if self.local_copy_fn is not None:
            data = self.load_backup(r)
            if data is not None and len(data) > 0:
                info['Reason'] = "Retrieved from local cache because exception: {}".format(ex)
                status = 218

    # Record the status on every path, including the exception/backup path.
    if status is not None:
        info['Status Code'] = str(status)

    if data is None or len(data) == 0:
        if failure is not None:
            # There may be no response at all, so report the exception.
            raise ResourceException(
                "Got exception while getting {}: {}".format(self.url, failure)
            ) from failure
        raise ResourceException("Got status={:d} while getting {}".format(status, self.url))

    if status == 200:
        # Only a direct 200 refreshes last_seen and rewrites the local copy.
        self.last_seen = utc_now().replace(microsecond=0)
        safe_write(self.local_copy_fn, data, True)

    info['State'] = 'Fetched'

    return data, info
358+
359+ def parse (self , getter ):
360+ data , info = self .load_resource (getter )
361+ info ['State' ] = 'Parsing'
301362 parse_info = parse_resource (self , data )
302363 if parse_info is not None and isinstance (parse_info , dict ):
303364 info .update (parse_info )
304365
366+ info ['State' ] = 'Parsed'
305367 if self .t is not None :
306- self .last_seen = utc_now ().replace (microsecond = 0 )
307368 if self .post and isinstance (self .post , list ):
308369 for cb in self .post :
309370 if self .t is not None :
@@ -318,6 +379,6 @@ def parse(self, getter):
318379 for (eid , error ) in list (info ['Validation Errors' ].items ()):
319380 log .error (error )
320381
321- self . etag = r . headers . get ( 'ETag' , None ) or hex_digest ( r . text , 'sha256' )
382+ info [ 'State' ] = 'Ready'
322383
323384 return self .children
0 commit comments