diff --git a/CHANGELOG.md b/CHANGELOG.md index b54346c..bce7d8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,35 @@ Changelog ========= +0.7.6 +----- + +* Fix: run\_workers command will no longer run into an infinite loop. +* Fix: Google Analytics detection did not handle the case when aip is not + set correctly. + +0.7.5 +----- + +* Fix: POST data extraction failed under certain circumstances. + +0.7.4 +----- + +* Fix: Also look into POST data for Google Analytics request to find aip=1 + +0.7.3 +----- + +* Fix: Check for \_\_utm.gif in Google Analytics check instead of utm.gif +* Fix: Do not consider gtm/js requests as tracking requests for Google + Analytics, since they just load the GTM configuration. This fixes + a bug where a site is mistakenly detected as not using the anonymize IP + extension. +* Start counter for numeric locks at zero instead of one. This makes the + remote debugging ports for Google Chrome used by the "scan" command + consistent with those used by the "run\_workers" command. 
+ 0.7.2 ----- diff --git a/privacyscanner/scanmodules/chromedevtools/__init__.py b/privacyscanner/scanmodules/chromedevtools/__init__.py index f7baffb..a6aeb2e 100644 --- a/privacyscanner/scanmodules/chromedevtools/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/__init__.py @@ -9,7 +9,7 @@ TLSDetailsExtractor, CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, \ FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, \ CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, ImprintExtractor, \ - HSTSPreloadExtractor, FingerprintingExtractor + HSTSPreloadExtractor, FingerprintingExtractor, CookieSyncExtractor from privacyscanner.scanmodules.chromedevtools.utils import TLDEXTRACT_CACHE_FILE, parse_domain from privacyscanner.utils import file_is_outdated, set_default_options, calculate_jaccard_index @@ -19,7 +19,8 @@ CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, - ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor] + ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor, + CookieSyncExtractor] EXTRACTOR_CLASSES_HTTPS_RUN = [FinalUrlExtractor, TLSDetailsExtractor, CertificateExtractor, InsecureContentExtractor, SecurityHeadersExtractor, diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index 61d2462..7e8ba82 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -19,6 +19,9 @@ from privacyscanner.scanmodules.chromedevtools.utils import scripts_disabled from privacyscanner.utils import kill_everything + +CHANGE_WAIT_TIME = 15 + # See https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md # See also 
https://peter.sh/experiments/chromium-command-line-switches/ CHROME_OPTIONS = [ @@ -137,6 +140,8 @@ })(); """.lstrip() +# TODO: There are still some contexts in which this JavaScript snippet does not +# run properly. This requires more research. EXTRACT_ARGUMENTS_JAVASCRIPT = ''' (function(logArguments) { let retval = 'null'; @@ -349,10 +354,10 @@ def scan(self, browser, result, logger, options): # because page_loaded event is already set. self._page_loaded.wait(load_max_wait) self._page_interaction() - # We wait 5 seconds after the page has loaded, so that any + # We wait 15 seconds after the page has loaded, so that any # resources can load. This includes JavaScript which might # issue further requests. - if not self._document_will_change.wait(5): + if not self._document_will_change.wait(CHANGE_WAIT_TIME): # OK, our page should be stable now. So we will disable any # further requests by just intercepting them and not # taking care of them. @@ -374,9 +379,14 @@ def scan(self, browser, result, logger, options): raise NotReachableError('No stable page to scan.') response = self._page.final_response - res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'], - url=response['url']) - content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode() + # If there is no frameId, there is no content that was rendered. + # This is usually the case, when the site has a redirect. 
+ if 'frameId' in response['extra']: + res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'], + url=response['url']) + content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode() + else: + content = b'' else: self._tab.stop() browser.close_tab(self._tab) @@ -409,6 +419,16 @@ def _cb_request_will_be_sent(self, request, requestId, **kwargs): request['requestId'] = requestId request['document_url'] = kwargs.get('documentURL') request['extra'] = kwargs + if request.get('hasPostData', False): + if 'postData' in request: + request['post_data'] = request['postData'] + else: + post_data = self._tab.Network.getRequestPostData(requestId=requestId) + # To avoid a too high memory usage by single requests + # we just store the first 64 KiB of the post data + request['post_data'] = post_data['postData'][:65536] + else: + request['post_data'] = None self._page.add_request(request) # Redirect requests don't have a received response but issue another @@ -447,9 +467,15 @@ def _cb_paused(self, **info): if self._log_breakpoint in info['hitBreakpoints']: call_frames = [] for call_frame in info['callFrames']: - args = json.loads(self._tab.Debugger.evaluateOnCallFrame( + javascript_result = self._tab.Debugger.evaluateOnCallFrame( callFrameId=call_frame['callFrameId'], - expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result']['value']) + expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result'] + if 'value' in javascript_result: + args = json.loads(javascript_result['value']) + else: + # TODO: We should look for the error here and handle those + # cases to reliably extract the arguments. 
+ args = ['error', None] call_frames.append({ 'url': call_frame['url'], 'functionName': call_frame['functionName'], diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py index ee3c5f0..b5f93b0 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py @@ -15,4 +15,5 @@ from .screenshot import ScreenshotExtractor from .imprint import ImprintExtractor from .hstspreload import HSTSPreloadExtractor -from .fingerprinting import FingerprintingExtractor \ No newline at end of file +from .fingerprinting import FingerprintingExtractor +from .cookiesync import CookieSyncExtractor diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py new file mode 100644 index 0000000..763585d --- /dev/null +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -0,0 +1,85 @@ +from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor +from datetime import datetime + + +class CookieSyncExtractor(Extractor): + + def extract_information(self): + cookies_synced = dict(cookie_sync_occurred=None, number_sync_relations=0, number_sync_domains=0, + sync_relation=[], sync_domains=[]) + tracker_requests = [] + tracker_cookies = [] + + for request in self.page.request_log: + if request['is_thirdparty']: + tracker_requests.append(request) + + for cookie in self.result['cookies']: + if cookie['is_tracker']: + tracker_cookies.append(cookie) + + if len(tracker_cookies) == 0: + cookies_synced['cookie_sync_occurred'] = False + + for cookie in tracker_cookies: + for request in tracker_requests: + if len(cookie['value']) > 6: + if cookie['value'] in request['url']: + cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + if cookie_domain not in request['url']: + + try: + 
target_domain = request['url'].split('/')[2] + except IndexError: + target_domain = request['url'] + if target_domain not in cookies_synced['sync_domains']: + cookies_synced['sync_domains'].append(target_domain) + + try: + origin_domain = cookie['domain'] + except IndexError: + origin_domain = cookie['domain'] + if origin_domain not in cookies_synced['sync_domains']: + cookies_synced['sync_domains'].append(origin_domain) + + strikeout_count = 0 + if len(cookies_synced) > 0: + for element in cookies_synced['sync_relation']: + strikeout_subcount = 0 + if target_domain in element['target']: + strikeout_subcount += 1 + if origin_domain in element['target']: + strikeout_subcount += 1 + if origin_domain in element['origin']: + strikeout_subcount += 1 + if strikeout_subcount > 1: + strikeout_count = 1 + + if len(cookie['value']) == 10: + possible_time_cookie = None + utcstamp = None + try: + possible_time_cookie = datetime.utcfromtimestamp(int(cookie['value'])) + utcstamp = datetime.utcnow() + except ValueError: + strikeout_count += 0 + if possible_time_cookie is not None: + if possible_time_cookie.date().year == utcstamp.date().year: + if possible_time_cookie.date().month == utcstamp.date().month: + strikeout_count += 1 + + if strikeout_count == 0: + cookies_synced['cookie_sync_occurred'] = True + cookies_synced['sync_relation'].append({'origin': cookie['domain'], + 'target': request['url'], + 'value': cookie['value']}) + + if cookies_synced['cookie_sync_occurred'] is None: + cookies_synced['cookie_sync_occurred'] = False + cookies_synced['sync_domains'] = None + + if cookies_synced['sync_domains'] and cookies_synced['sync_relation'] is not None: + cookies_synced['number_sync_relations'] = len(cookies_synced['sync_relation']) + cookies_synced['number_sync_domains'] = len(cookies_synced['sync_domains']) + + self.result['cookiesync'] = cookies_synced diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py 
b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py index 30c0e08..6cf9ca5 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py @@ -66,10 +66,8 @@ def extract_information(self): num_requests_no_aip = 0 has_ga_requests = False for request in self.page.request_log: - parsed_url = request['parsed_url'] - if self._is_google_request(parsed_url): - qs = parse_qs(parsed_url.query) - if 'aip' in qs and qs['aip'][-1] in ('1', 'true'): + if self._is_google_request(request['parsed_url']): + if self._is_anonymized(request): num_requests_aip += 1 else: num_requests_no_aip += 1 @@ -110,4 +108,20 @@ def _is_google_request(parsed_url): ga_domains = ('www.google-analytics.com', 'ssl.google-analytics.com', 'stats.g.doubleclick.net') if parsed_url.netloc in ga_domains: - return any(p in parsed_url.path for p in ('collect', 'utm.gif', 'gtm/js')) + return any(p in parsed_url.path for p in ('collect', '__utm.gif')) + + @staticmethod + def _is_anonymized(request): + # There could be conflicting aip options, e.g., when a POST request + # contains aip=0 in their post data, but aip=1 in the URL. + # In this case, post data takes precedence. 
+ aip = None + if request['method'] == 'POST' and request['post_data']: + qs = parse_qs(request['post_data']) + aip = qs.get('aip') + if aip is None: + qs = parse_qs(request['parsed_url'].query) + aip = qs.get('aip') + if aip and aip[-1] in ('1', 'true'): + return True + return False diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py b/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py index 0ab5e98..883c53f 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py @@ -57,7 +57,7 @@ def _parse_hsts(header_value): try: max_age = int(max_age) except ValueError: - pass + max_age = None break return { 'header_value': header_value, diff --git a/privacyscanner/worker.py b/privacyscanner/worker.py index b42cde2..1a29659 100644 --- a/privacyscanner/worker.py +++ b/privacyscanner/worker.py @@ -235,6 +235,7 @@ def _execute_sql_autocommit(self, query, params): with self._conn.cursor() as c: c.execute(query, params) self._conn.commit() + break except psycopg2.OperationalError: print('Database operational error. Retrying after 10 seconds.') time.sleep(10) diff --git a/setup.cfg b/setup.cfg index b6904a8..9adecac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = privacyscanner -version = 0.7.2 +version = 0.7.6 home-page = https://github.com/PrivacyScore/privacyscanner license = MIT license-file = LICENSE