From 920e31cc6c9d697c923c644bc70c5582bc66b3a6 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 21 May 2019 11:21:26 +0200 Subject: [PATCH 01/37] Init CookieSyncExtractor --- privacyscanner/scanmodules/chromedevtools/__init__.py | 5 +++-- .../scanmodules/chromedevtools/extractors/__init__.py | 3 ++- .../scanmodules/chromedevtools/extractors/cookiesync.py | 8 ++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py diff --git a/privacyscanner/scanmodules/chromedevtools/__init__.py b/privacyscanner/scanmodules/chromedevtools/__init__.py index f7baffb..a6aeb2e 100644 --- a/privacyscanner/scanmodules/chromedevtools/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/__init__.py @@ -9,7 +9,7 @@ TLSDetailsExtractor, CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, \ FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, \ CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, ImprintExtractor, \ - HSTSPreloadExtractor, FingerprintingExtractor + HSTSPreloadExtractor, FingerprintingExtractor, CookieSyncExtractor from privacyscanner.scanmodules.chromedevtools.utils import TLDEXTRACT_CACHE_FILE, parse_domain from privacyscanner.utils import file_is_outdated, set_default_options, calculate_jaccard_index @@ -19,7 +19,8 @@ CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, - ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor] + ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor, + CookieSyncExtractor] EXTRACTOR_CLASSES_HTTPS_RUN = [FinalUrlExtractor, TLSDetailsExtractor, CertificateExtractor, InsecureContentExtractor, SecurityHeadersExtractor, diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py 
b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py index ee3c5f0..b5f93b0 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py @@ -15,4 +15,5 @@ from .screenshot import ScreenshotExtractor from .imprint import ImprintExtractor from .hstspreload import HSTSPreloadExtractor -from .fingerprinting import FingerprintingExtractor \ No newline at end of file +from .fingerprinting import FingerprintingExtractor +from .cookiesync import CookieSyncExtractor diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py new file mode 100644 index 0000000..e0c4e16 --- /dev/null +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -0,0 +1,8 @@ +from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor + + +class CookieSyncExtractor(Extractor): + + def extract_information(self): + cookies_synced = [] + self.result['cookiesync'] = cookies_synced From 20da0c13e473954c0fb93abfca4e80a98a5ebce1 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 21 May 2019 13:54:51 +0200 Subject: [PATCH 02/37] Finding first instances where an cookie value is broadcasted in a request --- .../chromedevtools/extractors/cookiesync.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e0c4e16..e5f3d74 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -4,5 +4,39 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = [] + cookies_synced = {} + cookies_synced['cookie_sync_occured'] = None + cookies_synced['sync_occurence_counter'] = 0 + # 
cookies_synced['cookie_sync_origin'] = None + # cookies_synced['cookie_sync_target'] = None + cookies_synced['sync_relation'] = [] + + tracker_requests = [] + tracker_cookies = [] + user_ids = [] + + for request in self.page.request_log: + if request['is_thirdparty']: + tracker_requests.append(request) + + for cookie in self.result['cookies']: + if cookie['is_tracker']: + tracker_cookies.append(cookie) + + if len(tracker_cookies) == 0: + cookies_synced['cookie_sync_occured'] = False + + for cookie in tracker_cookies: + for request in tracker_requests: + if len(cookie['value']) > 10: + if cookie['value'] in request['url']: + cookies_synced['cookie_sync_occured'] = True + cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], + 'cookie_sync_target': request['url'], + 'cookie_sync_value': cookie['value']}) + if cookies_synced['cookie_sync_occured'] is None: + cookies_synced['cookie_sync_occured'] = False + + cookies_synced['sync_occurence_counter'] = len(cookies_synced['sync_relation']) + self.result['cookiesync'] = cookies_synced From 412d487fe17f5f8d8bc002fa22d1cc5e5da3310d Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 11:11:43 +0200 Subject: [PATCH 03/37] Fixing double entries in final message (cookiesync) --- .../chromedevtools/extractors/cookiesync.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e5f3d74..e7fe564 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -30,10 +30,25 @@ def extract_information(self): for request in tracker_requests: if len(cookie['value']) > 10: if cookie['value'] in request['url']: - cookies_synced['cookie_sync_occured'] = True - cookies_synced['sync_relation'].append({'cookie_sync_origin': 
cookie['domain'], - 'cookie_sync_target': request['url'], - 'cookie_sync_value': cookie['value']}) + if not cookie['domain'] in request['url']: + try: + t_url = request['url'].split('/')[2] + d_name = t_url.split('.') + company = d_name[len(d_name)-2] + except IndexError: + company = request['url'] + if len(cookies_synced) > 0: + domaincounter = 0 + for element in cookies_synced['sync_relation']: + if company in element['cookie_sync_target']: + domaincounter += 1 + if cookie['domain'] in element['cookie_sync_origin']: + domaincounter += 1 + if domaincounter == 0: + cookies_synced['cookie_sync_occured'] = True + cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], + 'cookie_sync_target': request['url'], + 'cookie_sync_value': cookie['value']}) if cookies_synced['cookie_sync_occured'] is None: cookies_synced['cookie_sync_occured'] = False From ab8789f820a9bc317ec9add38d388c7aa3286505 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 11:45:26 +0200 Subject: [PATCH 04/37] Fixing the problem of cookiesyncing between the same third party on different (sub)domains --- .../chromedevtools/extractors/cookiesync.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e7fe564..5588c75 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -30,7 +30,8 @@ def extract_information(self): for request in tracker_requests: if len(cookie['value']) > 10: if cookie['value'] in request['url']: - if not cookie['domain'] in request['url']: + cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + if cookie_domain not in request['url']: try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') @@ -38,13 +39,16 @@ def extract_information(self): 
except IndexError: company = request['url'] if len(cookies_synced) > 0: - domaincounter = 0 + strikeout_count = 0 for element in cookies_synced['sync_relation']: + strikeout_subcount = 0 if company in element['cookie_sync_target']: - domaincounter += 1 + strikeout_subcount += 1 if cookie['domain'] in element['cookie_sync_origin']: - domaincounter += 1 - if domaincounter == 0: + strikeout_subcount += 1 + if strikeout_subcount > 0: + strikeout_count = 1 + if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], 'cookie_sync_target': request['url'], From 76c597c8067cbacdd050b14358196f547c2df5ee Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 12:10:31 +0200 Subject: [PATCH 05/37] cleanup --- .../chromedevtools/extractors/cookiesync.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 5588c75..71a7af9 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -4,16 +4,10 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = {} - cookies_synced['cookie_sync_occured'] = None - cookies_synced['sync_occurence_counter'] = 0 - # cookies_synced['cookie_sync_origin'] = None - # cookies_synced['cookie_sync_target'] = None - cookies_synced['sync_relation'] = [] + cookies_synced = {'cookie_sync_occured': None, 'sync_occurence_counter': 0, 'sync_relation': []} tracker_requests = [] tracker_cookies = [] - user_ids = [] for request in self.page.request_log: if request['is_thirdparty']: @@ -38,8 +32,8 @@ def extract_information(self): company = d_name[len(d_name)-2] except IndexError: company = request['url'] + strikeout_count = 0 if len(cookies_synced) > 0: - strikeout_count 
= 0 for element in cookies_synced['sync_relation']: strikeout_subcount = 0 if company in element['cookie_sync_target']: From 4024fa69d0ea1a7736c8f0c25be6b3fe02cfccb3 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 12:34:23 +0200 Subject: [PATCH 06/37] More fixes for adding cookiesync-events to the list. --- .../chromedevtools/extractors/cookiesync.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 71a7af9..106db85 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -29,18 +29,28 @@ def extract_information(self): try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') - company = d_name[len(d_name)-2] + target_company_name = d_name[len(d_name)-2] except IndexError: - company = request['url'] + target_company_name = request['url'] + + try: + origin_company_name = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + except IndexError: + origin_company_name = cookie['domain'] + strikeout_count = 0 if len(cookies_synced) > 0: for element in cookies_synced['sync_relation']: strikeout_subcount = 0 - if company in element['cookie_sync_target']: + if target_company_name in element['cookie_sync_target']: + strikeout_subcount += 1 + if origin_company_name in element['cookie_sync_target']: strikeout_subcount += 1 - if cookie['domain'] in element['cookie_sync_origin']: + if origin_company_name in element['cookie_sync_origin']: strikeout_subcount += 1 - if strikeout_subcount > 0: + # if cookie['domain'] in element['cookie_sync_origin']: + # strikeout_subcount += 1 + if strikeout_subcount > 1: strikeout_count = 1 if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True From e86c976bb5b5fc65fbecd837a2567c594b2748f5 Mon Sep 17 00:00:00 2001 
From: Matthias Kraus Date: Tue, 2 Jul 2019 12:44:30 +0200 Subject: [PATCH 07/37] cleanup --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 106db85..a1505a9 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -5,7 +5,6 @@ class CookieSyncExtractor(Extractor): def extract_information(self): cookies_synced = {'cookie_sync_occured': None, 'sync_occurence_counter': 0, 'sync_relation': []} - tracker_requests = [] tracker_cookies = [] @@ -26,6 +25,7 @@ def extract_information(self): if cookie['value'] in request['url']: cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] if cookie_domain not in request['url']: + try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') @@ -48,15 +48,15 @@ def extract_information(self): strikeout_subcount += 1 if origin_company_name in element['cookie_sync_origin']: strikeout_subcount += 1 - # if cookie['domain'] in element['cookie_sync_origin']: - # strikeout_subcount += 1 if strikeout_subcount > 1: strikeout_count = 1 + if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], 'cookie_sync_target': request['url'], 'cookie_sync_value': cookie['value']}) + if cookies_synced['cookie_sync_occured'] is None: cookies_synced['cookie_sync_occured'] = False From 89af02cebe77c2ba57eca3beba71f72dbcd99554 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 21 May 2019 11:21:26 +0200 Subject: [PATCH 08/37] Init CookieSyncExtractor --- privacyscanner/scanmodules/chromedevtools/__init__.py | 5 +++-- .../scanmodules/chromedevtools/extractors/__init__.py | 3 ++- 
.../scanmodules/chromedevtools/extractors/cookiesync.py | 8 ++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py diff --git a/privacyscanner/scanmodules/chromedevtools/__init__.py b/privacyscanner/scanmodules/chromedevtools/__init__.py index f7baffb..a6aeb2e 100644 --- a/privacyscanner/scanmodules/chromedevtools/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/__init__.py @@ -9,7 +9,7 @@ TLSDetailsExtractor, CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, \ FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, \ CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, ImprintExtractor, \ - HSTSPreloadExtractor, FingerprintingExtractor + HSTSPreloadExtractor, FingerprintingExtractor, CookieSyncExtractor from privacyscanner.scanmodules.chromedevtools.utils import TLDEXTRACT_CACHE_FILE, parse_domain from privacyscanner.utils import file_is_outdated, set_default_options, calculate_jaccard_index @@ -19,7 +19,8 @@ CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, - ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor] + ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor, + CookieSyncExtractor] EXTRACTOR_CLASSES_HTTPS_RUN = [FinalUrlExtractor, TLSDetailsExtractor, CertificateExtractor, InsecureContentExtractor, SecurityHeadersExtractor, diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py index ee3c5f0..b5f93b0 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/__init__.py @@ -15,4 +15,5 @@ from .screenshot import ScreenshotExtractor from .imprint import 
ImprintExtractor from .hstspreload import HSTSPreloadExtractor -from .fingerprinting import FingerprintingExtractor \ No newline at end of file +from .fingerprinting import FingerprintingExtractor +from .cookiesync import CookieSyncExtractor diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py new file mode 100644 index 0000000..e0c4e16 --- /dev/null +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -0,0 +1,8 @@ +from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor + + +class CookieSyncExtractor(Extractor): + + def extract_information(self): + cookies_synced = [] + self.result['cookiesync'] = cookies_synced From 99cda3877c90539db8daecdb083a1324ee2e95d3 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 21 May 2019 13:54:51 +0200 Subject: [PATCH 09/37] Finding first instances where an cookie value is broadcasted in a request --- .../chromedevtools/extractors/cookiesync.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e0c4e16..e5f3d74 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -4,5 +4,39 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = [] + cookies_synced = {} + cookies_synced['cookie_sync_occured'] = None + cookies_synced['sync_occurence_counter'] = 0 + # cookies_synced['cookie_sync_origin'] = None + # cookies_synced['cookie_sync_target'] = None + cookies_synced['sync_relation'] = [] + + tracker_requests = [] + tracker_cookies = [] + user_ids = [] + + for request in self.page.request_log: + if request['is_thirdparty']: + tracker_requests.append(request) + + for cookie in 
self.result['cookies']: + if cookie['is_tracker']: + tracker_cookies.append(cookie) + + if len(tracker_cookies) == 0: + cookies_synced['cookie_sync_occured'] = False + + for cookie in tracker_cookies: + for request in tracker_requests: + if len(cookie['value']) > 10: + if cookie['value'] in request['url']: + cookies_synced['cookie_sync_occured'] = True + cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], + 'cookie_sync_target': request['url'], + 'cookie_sync_value': cookie['value']}) + if cookies_synced['cookie_sync_occured'] is None: + cookies_synced['cookie_sync_occured'] = False + + cookies_synced['sync_occurence_counter'] = len(cookies_synced['sync_relation']) + self.result['cookiesync'] = cookies_synced From 6b1d327c970a16318716cd7ff6a8b4b1148914d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Fri, 24 May 2019 14:30:52 +0200 Subject: [PATCH 10/37] chromedevtools: Remove gtm/js path for Google Analytics check gtm/js loads the configuration for Google Tag Manager. It is not a tracking request itself. You can actually include gtm/js without having Google Analytics. Moreover, we checked whether gtm/js has a aip=1 parameter set, which is not a valid parameter for gtm/js. 
--- .../scanmodules/chromedevtools/extractors/googleanalytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py index 30c0e08..d8cd235 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py @@ -110,4 +110,4 @@ def _is_google_request(parsed_url): ga_domains = ('www.google-analytics.com', 'ssl.google-analytics.com', 'stats.g.doubleclick.net') if parsed_url.netloc in ga_domains: - return any(p in parsed_url.path for p in ('collect', 'utm.gif', 'gtm/js')) + return any(p in parsed_url.path for p in ('collect', 'utm.gif')) From 1925b1b7b982f74e3bec06da0a0fdcba24b7bcb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Fri, 24 May 2019 14:33:22 +0200 Subject: [PATCH 11/37] chromedevtools: Change utf.gif to __utm.gif in Google Analytics There is no utm.gif file in Google Analytics but only a __utm.gif. 
--- .../scanmodules/chromedevtools/extractors/googleanalytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py index d8cd235..fd0ad14 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py @@ -110,4 +110,4 @@ def _is_google_request(parsed_url): ga_domains = ('www.google-analytics.com', 'ssl.google-analytics.com', 'stats.g.doubleclick.net') if parsed_url.netloc in ga_domains: - return any(p in parsed_url.path for p in ('collect', 'utm.gif')) + return any(p in parsed_url.path for p in ('collect', '__utm.gif')) From 9fa141c22b47c1a6d1824ce149f0c310fedd6b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Fri, 24 May 2019 14:39:06 +0200 Subject: [PATCH 12/37] Release 0.7.3 --- CHANGELOG.md | 12 ++++++++++++ setup.cfg | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b54346c..60111f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,18 @@ Changelog ========= +0.7.3 +----- + +* Fix: Check for \_\_utm.gif in Google Analytics check instead of utm.gif +* Fix: Do not consider gtm/js requests as tracking requests for Google + Analytics, since they just load the GTM configuration. This fixes + a bug where a site is mistakenly detected as not using the anonymize IP + extension. +* Start counter for numeric locks at zero instead of one. This makes the + remote debugging ports for Google Chrome used by the "scan" command + consistent with those used the "run\_workers" command. 
+ 0.7.2 ----- diff --git a/setup.cfg b/setup.cfg index b6904a8..b379ace 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = privacyscanner -version = 0.7.2 +version = 0.7.3 home-page = https://github.com/PrivacyScore/privacyscanner license = MIT license-file = LICENSE From 419638fe129e0488c4d42cc27b648871d59a6b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Wed, 29 May 2019 22:11:39 +0200 Subject: [PATCH 13/37] Store POST data of requests (up to 64 KiB per request) --- .../scanmodules/chromedevtools/chromescan.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index 61d2462..aa86a20 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -409,6 +409,16 @@ def _cb_request_will_be_sent(self, request, requestId, **kwargs): request['requestId'] = requestId request['document_url'] = kwargs.get('documentURL') request['extra'] = kwargs + if request.get('hasPostData', False): + if 'postData' in request: + request['post_data'] = request['postData'] + else: + post_data = self._tab.Network.getRequestPostData(requestId=requestId) + # To avoid a too high memory usage by single requests + # we just store the first 64 KiB of the post data + request['post_data'] = post_data[:65536] + else: + request['post_data'] = None self._page.add_request(request) # Redirect requests don't have a received response but issue another From 08f955aedb9a3e2df68375339be79eeeac578b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Wed, 29 May 2019 22:25:31 +0200 Subject: [PATCH 14/37] chromedevtools: Also look into post data for aip=1 in Google Analytics --- .../extractors/googleanalytics.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git 
a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py index fd0ad14..79a871e 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py @@ -66,10 +66,8 @@ def extract_information(self): num_requests_no_aip = 0 has_ga_requests = False for request in self.page.request_log: - parsed_url = request['parsed_url'] - if self._is_google_request(parsed_url): - qs = parse_qs(parsed_url.query) - if 'aip' in qs and qs['aip'][-1] in ('1', 'true'): + if self._is_google_request(request['parsed_url']): + if self._is_anonymized(request): num_requests_aip += 1 else: num_requests_no_aip += 1 @@ -111,3 +109,19 @@ def _is_google_request(parsed_url): 'stats.g.doubleclick.net') if parsed_url.netloc in ga_domains: return any(p in parsed_url.path for p in ('collect', '__utm.gif')) + + @staticmethod + def _is_anonymized(request): + # There could be conflicting aip options, e.g., when a POST request + # contains aip=0 in their post data, but aip=1 in the URL. + # In this case, post data takes precedence. 
+ aip = None + if request['method'] == 'POST' and request['post_data']: + qs = parse_qs(request['post_data']) + aip = qs.get('aip') + if aip is None: + qs = parse_qs(request['parsed_url'].query) + aip = qs.get('aip') + if aip[-1] in ('1', 'true'): + return True + return False From 14f9dafb9352d204562cde1ef5892d2f3a057b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Wed, 29 May 2019 22:29:13 +0200 Subject: [PATCH 15/37] Release 0.7.4 --- CHANGELOG.md | 5 +++++ setup.cfg | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60111f2..f0cee37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +0.7.4 +----- + +* Fix: Also look into POST data for Google Analytics request to find aip=1 + 0.7.3 ----- diff --git a/setup.cfg b/setup.cfg index b379ace..d076171 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = privacyscanner -version = 0.7.3 +version = 0.7.4 home-page = https://github.com/PrivacyScore/privacyscanner license = MIT license-file = LICENSE From ce260b0f59c62a1bcb6b0637e73e3539aab7f71f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Fri, 31 May 2019 11:20:55 +0200 Subject: [PATCH 16/37] chromedevtools: Fix post data extraction --- privacyscanner/scanmodules/chromedevtools/chromescan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index aa86a20..06b8292 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -416,7 +416,7 @@ def _cb_request_will_be_sent(self, request, requestId, **kwargs): post_data = self._tab.Network.getRequestPostData(requestId=requestId) # To avoid a too high memory usage by single requests # we just store the first 64 KiB of the post data - request['post_data'] = post_data[:65536] + 
request['post_data'] = post_data['postData'][:65536] else: request['post_data'] = None self._page.add_request(request) From 172149d2d895d66edb7e1d6d39b67a4bc13cba6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Fri, 31 May 2019 11:22:44 +0200 Subject: [PATCH 17/37] Release 0.7.5 --- CHANGELOG.md | 5 +++++ setup.cfg | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0cee37..3f03c6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +0.7.5 +----- + +* Fix: POST data extraction failed under certain circumstances. + 0.7.4 ----- diff --git a/setup.cfg b/setup.cfg index d076171..984b954 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = privacyscanner -version = 0.7.4 +version = 0.7.5 home-page = https://github.com/PrivacyScore/privacyscanner license = MIT license-file = LICENSE From 38b8314e48b50d7252d07c51721e28482bd01df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Mon, 3 Jun 2019 18:36:30 +0200 Subject: [PATCH 18/37] Fix run_workers command (infinite loop in _execute_sql_autocommit) --- privacyscanner/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/privacyscanner/worker.py b/privacyscanner/worker.py index b42cde2..1a29659 100644 --- a/privacyscanner/worker.py +++ b/privacyscanner/worker.py @@ -235,6 +235,7 @@ def _execute_sql_autocommit(self, query, params): with self._conn.cursor() as c: c.execute(query, params) self._conn.commit() + break except psycopg2.OperationalError: print('Database operational error. 
Retrying after 10 seconds.') time.sleep(10) From 3e58a94611c5bcbc49b38c0d530a7ab70b13cc0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Mon, 3 Jun 2019 18:38:09 +0200 Subject: [PATCH 19/37] chromedevtools: Fix handling of missing aip key in Google Analytics --- .../scanmodules/chromedevtools/extractors/googleanalytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py index 79a871e..6cf9ca5 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/googleanalytics.py @@ -122,6 +122,6 @@ def _is_anonymized(request): if aip is None: qs = parse_qs(request['parsed_url'].query) aip = qs.get('aip') - if aip[-1] in ('1', 'true'): + if aip and aip[-1] in ('1', 'true'): return True return False From 7681627d67b952930f2bcd1e31aecb640586eda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Mon, 3 Jun 2019 18:40:15 +0200 Subject: [PATCH 20/37] Release 0.7.6 --- CHANGELOG.md | 7 +++++++ setup.cfg | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f03c6b..bce7d8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ Changelog ========= +0.7.6 +----- + +* Fix: run\_workers command will no longer run into an infinite loop. +* Fix: Google Analytics detection did not handle the case when aip is not + set correctly. 
+ 0.7.5 ----- diff --git a/setup.cfg b/setup.cfg index 984b954..9adecac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = privacyscanner -version = 0.7.5 +version = 0.7.6 home-page = https://github.com/PrivacyScore/privacyscanner license = MIT license-file = LICENSE From 9af51767836945a6e7962ee7cf0b1597b0a550da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Sat, 22 Jun 2019 17:02:41 +0200 Subject: [PATCH 21/37] chromedevtools: Handle incorrect HSTS header correctly --- .../scanmodules/chromedevtools/extractors/securityheaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py b/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py index 0ab5e98..883c53f 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/securityheaders.py @@ -57,7 +57,7 @@ def _parse_hsts(header_value): try: max_age = int(max_age) except ValueError: - pass + max_age = None break return { 'header_value': header_value, From 3a510837ea2bfa173a195ab29173a5082f49823b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Sat, 22 Jun 2019 17:53:41 +0200 Subject: [PATCH 22/37] chromedevtools: Fix resource content with pages without frameId To be able to fetch the resource content the content must actually be rendered in a frame. However, some pages, e.g. redirects, do not render in a frame and have no content. 
--- .../scanmodules/chromedevtools/chromescan.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index 06b8292..2621e36 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -374,9 +374,14 @@ def scan(self, browser, result, logger, options): raise NotReachableError('No stable page to scan.') response = self._page.final_response - res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'], - url=response['url']) - content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode() + # If there is no frameId, there is no content that was rendered. + # This is usually the case, when the site has a redirect. + if 'frameId' in response['extra']: + res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'], + url=response['url']) + content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode() + else: + content = b'' else: self._tab.stop() browser.close_tab(self._tab) From 1ab7a6fa0ed55f8874c8ca982d0c375734834fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Sat, 22 Jun 2019 17:59:18 +0200 Subject: [PATCH 23/37] chromedevtools: Fix error handling of argument extraction for JavaScript We run some JavaScript snippet inside a call frame to extract the arguments of certain JavaScript function calls. However, our JavaScript snippet may throw errors in a few cases. We should fix the snippet, but for now we just handle those errors properly and go without arguments. 
--- .../scanmodules/chromedevtools/chromescan.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index 2621e36..df2d8af 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -137,6 +137,8 @@ })(); """.lstrip() +# TODO: There are still some contexts in which this JavaScript snippet does not +# run properly. This requires more research. EXTRACT_ARGUMENTS_JAVASCRIPT = ''' (function(logArguments) { let retval = 'null'; @@ -462,9 +464,15 @@ def _cb_paused(self, **info): if self._log_breakpoint in info['hitBreakpoints']: call_frames = [] for call_frame in info['callFrames']: - args = json.loads(self._tab.Debugger.evaluateOnCallFrame( + javascript_result = self._tab.Debugger.evaluateOnCallFrame( callFrameId=call_frame['callFrameId'], - expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result']['value']) + expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result'] + if 'value' in javascript_result: + args = json.loads(javascript_result['value']) + else: + # TODO: We should look for the error here and handle those + # cases to reliably extract the arguments. 
+ args = [] call_frames.append({ 'url': call_frame['url'], 'functionName': call_frame['functionName'], From 979867d74fbd3967d040228f7dee687653fed2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Sat, 22 Jun 2019 18:04:42 +0200 Subject: [PATCH 24/37] chromedevtools: Send JavaScript argument extraction errors as error log_type --- privacyscanner/scanmodules/chromedevtools/chromescan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index df2d8af..03dc3b3 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -472,7 +472,7 @@ def _cb_paused(self, **info): else: # TODO: We should look for the error here and handle those # cases to reliably extract the arguments. - args = [] + args = ['error', None] call_frames.append({ 'url': call_frame['url'], 'functionName': call_frame['functionName'], From 0ac880e18a5f02d47a45a85934ee7d62425ac87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henning=20Prid=C3=B6hl?= Date: Sat, 22 Jun 2019 18:14:33 +0200 Subject: [PATCH 25/37] chromedevtools: Increase change wait time to 15 seconds --- privacyscanner/scanmodules/chromedevtools/chromescan.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/chromescan.py b/privacyscanner/scanmodules/chromedevtools/chromescan.py index 03dc3b3..7e8ba82 100644 --- a/privacyscanner/scanmodules/chromedevtools/chromescan.py +++ b/privacyscanner/scanmodules/chromedevtools/chromescan.py @@ -19,6 +19,9 @@ from privacyscanner.scanmodules.chromedevtools.utils import scripts_disabled from privacyscanner.utils import kill_everything + +CHANGE_WAIT_TIME = 15 + # See https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md # See also 
https://peter.sh/experiments/chromium-command-line-switches/ CHROME_OPTIONS = [ @@ -351,10 +354,10 @@ def scan(self, browser, result, logger, options): # because page_loaded event is already set. self._page_loaded.wait(load_max_wait) self._page_interaction() - # We wait 5 seconds after the page has loaded, so that any + # We wait 15 seconds after the page has loaded, so that any # resources can load. This includes JavaScript which might # issue further requests. - if not self._document_will_change.wait(5): + if not self._document_will_change.wait(CHANGE_WAIT_TIME): # OK, our page should be stable now. So we will disable any # further requests by just intercepting them and not # taking care of them. From 51ef96d68ca46458a15baa6359c6e1949f982e45 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 11:11:43 +0200 Subject: [PATCH 26/37] Fixing double entries in final message (cookiesync) --- .../chromedevtools/extractors/cookiesync.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e5f3d74..e7fe564 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -30,10 +30,25 @@ def extract_information(self): for request in tracker_requests: if len(cookie['value']) > 10: if cookie['value'] in request['url']: - cookies_synced['cookie_sync_occured'] = True - cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], - 'cookie_sync_target': request['url'], - 'cookie_sync_value': cookie['value']}) + if not cookie['domain'] in request['url']: + try: + t_url = request['url'].split('/')[2] + d_name = t_url.split('.') + company = d_name[len(d_name)-2] + except IndexError: + company = request['url'] + if len(cookies_synced) > 0: + domaincounter = 0 + for element in 
cookies_synced['sync_relation']: + if company in element['cookie_sync_target']: + domaincounter += 1 + if cookie['domain'] in element['cookie_sync_origin']: + domaincounter += 1 + if domaincounter == 0: + cookies_synced['cookie_sync_occured'] = True + cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], + 'cookie_sync_target': request['url'], + 'cookie_sync_value': cookie['value']}) if cookies_synced['cookie_sync_occured'] is None: cookies_synced['cookie_sync_occured'] = False From 7865130455016265fb5b8042453bd0f7021395fd Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 11:45:26 +0200 Subject: [PATCH 27/37] Fixing the problem of cookiesyncing between the same third party on different (sub)domains --- .../chromedevtools/extractors/cookiesync.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index e7fe564..5588c75 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -30,7 +30,8 @@ def extract_information(self): for request in tracker_requests: if len(cookie['value']) > 10: if cookie['value'] in request['url']: - if not cookie['domain'] in request['url']: + cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + if cookie_domain not in request['url']: try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') @@ -38,13 +39,16 @@ def extract_information(self): except IndexError: company = request['url'] if len(cookies_synced) > 0: - domaincounter = 0 + strikeout_count = 0 for element in cookies_synced['sync_relation']: + strikeout_subcount = 0 if company in element['cookie_sync_target']: - domaincounter += 1 + strikeout_subcount += 1 if cookie['domain'] in element['cookie_sync_origin']: - domaincounter += 1 - if 
domaincounter == 0: + strikeout_subcount += 1 + if strikeout_subcount > 0: + strikeout_count = 1 + if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], 'cookie_sync_target': request['url'], From 21cb3e4b16c5f124fa80d799a7ddadf9220f69b3 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 12:10:31 +0200 Subject: [PATCH 28/37] cleanup --- .../chromedevtools/extractors/cookiesync.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 5588c75..71a7af9 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -4,16 +4,10 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = {} - cookies_synced['cookie_sync_occured'] = None - cookies_synced['sync_occurence_counter'] = 0 - # cookies_synced['cookie_sync_origin'] = None - # cookies_synced['cookie_sync_target'] = None - cookies_synced['sync_relation'] = [] + cookies_synced = {'cookie_sync_occured': None, 'sync_occurence_counter': 0, 'sync_relation': []} tracker_requests = [] tracker_cookies = [] - user_ids = [] for request in self.page.request_log: if request['is_thirdparty']: @@ -38,8 +32,8 @@ def extract_information(self): company = d_name[len(d_name)-2] except IndexError: company = request['url'] + strikeout_count = 0 if len(cookies_synced) > 0: - strikeout_count = 0 for element in cookies_synced['sync_relation']: strikeout_subcount = 0 if company in element['cookie_sync_target']: From 10b5bad54a11c3bf628cd433101c677310c59421 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 12:34:23 +0200 Subject: [PATCH 29/37] More fixes for adding cookiesync-events to the list. 
--- .../chromedevtools/extractors/cookiesync.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 71a7af9..106db85 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -29,18 +29,28 @@ def extract_information(self): try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') - company = d_name[len(d_name)-2] + target_company_name = d_name[len(d_name)-2] except IndexError: - company = request['url'] + target_company_name = request['url'] + + try: + origin_company_name = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + except IndexError: + origin_company_name = cookie['domain'] + strikeout_count = 0 if len(cookies_synced) > 0: for element in cookies_synced['sync_relation']: strikeout_subcount = 0 - if company in element['cookie_sync_target']: + if target_company_name in element['cookie_sync_target']: + strikeout_subcount += 1 + if origin_company_name in element['cookie_sync_target']: strikeout_subcount += 1 - if cookie['domain'] in element['cookie_sync_origin']: + if origin_company_name in element['cookie_sync_origin']: strikeout_subcount += 1 - if strikeout_subcount > 0: + # if cookie['domain'] in element['cookie_sync_origin']: + # strikeout_subcount += 1 + if strikeout_subcount > 1: strikeout_count = 1 if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True From 67f7c1a3350c0194e5d6e16c0c094e065ce33978 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 12:44:30 +0200 Subject: [PATCH 30/37] cleanup --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py 
b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 106db85..a1505a9 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -5,7 +5,6 @@ class CookieSyncExtractor(Extractor): def extract_information(self): cookies_synced = {'cookie_sync_occured': None, 'sync_occurence_counter': 0, 'sync_relation': []} - tracker_requests = [] tracker_cookies = [] @@ -26,6 +25,7 @@ def extract_information(self): if cookie['value'] in request['url']: cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] if cookie_domain not in request['url']: + try: t_url = request['url'].split('/')[2] d_name = t_url.split('.') @@ -48,15 +48,15 @@ def extract_information(self): strikeout_subcount += 1 if origin_company_name in element['cookie_sync_origin']: strikeout_subcount += 1 - # if cookie['domain'] in element['cookie_sync_origin']: - # strikeout_subcount += 1 if strikeout_subcount > 1: strikeout_count = 1 + if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], 'cookie_sync_target': request['url'], 'cookie_sync_value': cookie['value']}) + if cookies_synced['cookie_sync_occured'] is None: cookies_synced['cookie_sync_occured'] = False From 32c9d6acd8b322efc291b0b38271b903da03f23a Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 13:24:33 +0200 Subject: [PATCH 31/37] Added enumeration of companies that are part of the syncing --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index a1505a9..766258b 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ 
b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -4,7 +4,7 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = {'cookie_sync_occured': None, 'sync_occurence_counter': 0, 'sync_relation': []} + cookies_synced = dict(cookie_sync_occured=None, sync_occurence_counter=0, sync_relation=[], sync_companies=[]) tracker_requests = [] tracker_cookies = [] @@ -32,11 +32,15 @@ def extract_information(self): target_company_name = d_name[len(d_name)-2] except IndexError: target_company_name = request['url'] + if target_company_name not in cookies_synced['sync_companies']: + cookies_synced['sync_companies'].append(target_company_name) try: origin_company_name = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] except IndexError: origin_company_name = cookie['domain'] + if origin_company_name not in cookies_synced['sync_companies']: + cookies_synced['sync_companies'].append(origin_company_name) strikeout_count = 0 if len(cookies_synced) > 0: From c6280548aa9551f39d8c32e9903c6310fad2f0f3 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Tue, 2 Jul 2019 14:04:51 +0200 Subject: [PATCH 32/37] Allow shorter cookie values (length > 6), check for timestamp values and sort those out --- .../chromedevtools/extractors/cookiesync.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 766258b..c4d0eab 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -1,4 +1,6 @@ from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor +from datetime import datetime +from datetime import date class CookieSyncExtractor(Extractor): @@ -21,7 +23,7 @@ def extract_information(self): for cookie in tracker_cookies: for request in tracker_requests: - if len(cookie['value'])
> 10: + if len(cookie['value']) > 6: if cookie['value'] in request['url']: cookie_domain = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] if cookie_domain not in request['url']: @@ -55,6 +57,14 @@ def extract_information(self): if strikeout_subcount > 1: strikeout_count = 1 + if len(cookie['value']) == 10: + try: + dateval = datetime.utcfromtimestamp(int(cookie['value'])).strftime('%Y-%m-%d %H:%M:%S') + except ValueError: + strikeout_count += 0 + if str(date.today()).split(' ')[0] in dateval: + strikeout_count += 1 + if strikeout_count == 0: cookies_synced['cookie_sync_occured'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], From eb4dd6b06d13f30cd4148c9bf585397ed5fb5116 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Fri, 12 Jul 2019 08:30:38 +0200 Subject: [PATCH 33/37] Clean up UTC Timestamp check --- .../chromedevtools/extractors/cookiesync.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index c4d0eab..0e2837f 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -1,6 +1,5 @@ from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor from datetime import datetime -from datetime import date class CookieSyncExtractor(Extractor): @@ -58,22 +57,27 @@ def extract_information(self): strikeout_count = 1 if len(cookie['value']) == 10: + possible_time_cookie = None + utcstamp = None try: - dateval = datetime.utcfromtimestamp(int(cookie['value'])).strftime('%Y-%m-%d %H:%M:%S') + possible_time_cookie = datetime.utcfromtimestamp(int(cookie['value'])) + utcstamp = datetime.utcnow() except ValueError: strikeout_count += 0 - if str(date.today()).split(' ')[0] in dateval: - strikeout_count += 1 + if possible_time_cookie is 
not None: + if possible_time_cookie.date().year == utcstamp.date().year: + if possible_time_cookie.date().month == utcstamp.date().month: + strikeout_count += 1 if strikeout_count == 0: - cookies_synced['cookie_sync_occured'] = True + cookies_synced['cookie_sync_occurred'] = True cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], 'cookie_sync_target': request['url'], 'cookie_sync_value': cookie['value']}) - if cookies_synced['cookie_sync_occured'] is None: - cookies_synced['cookie_sync_occured'] = False + if cookies_synced['cookie_sync_occurred'] is None: + cookies_synced['cookie_sync_occurred'] = False - cookies_synced['sync_occurence_counter'] = len(cookies_synced['sync_relation']) + cookies_synced['sync_occurrence_counter'] = len(cookies_synced['sync_relation']) self.result['cookiesync'] = cookies_synced From 98edd8f8482ce256acac2f573500c30fef0defe6 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Fri, 12 Jul 2019 08:31:24 +0200 Subject: [PATCH 34/37] Spelling --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 0e2837f..17a06ec 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -5,7 +5,7 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = dict(cookie_sync_occured=None, sync_occurence_counter=0, sync_relation=[], sync_companies=[]) + cookies_synced = dict(cookie_sync_occurred=None, sync_occurence_counter=0, sync_relation=[], sync_companies=[]) tracker_requests = [] tracker_cookies = [] @@ -18,7 +18,7 @@ def extract_information(self): tracker_cookies.append(cookie) if len(tracker_cookies) == 0: - cookies_synced['cookie_sync_occured'] = False + 
cookies_synced['cookie_sync_occurred'] = False for cookie in tracker_cookies: for request in tracker_requests: From 3624266784f353a8752089aefd658700795731d5 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Fri, 12 Jul 2019 09:24:05 +0200 Subject: [PATCH 35/37] Removing "sync_companies" entries if there was no syncing, only trackers --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 1 + 1 file changed, 1 insertion(+) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 17a06ec..2558b75 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -77,6 +77,7 @@ def extract_information(self): if cookies_synced['cookie_sync_occurred'] is None: cookies_synced['cookie_sync_occurred'] = False + cookies_synced['sync_companies'] = None cookies_synced['sync_occurrence_counter'] = len(cookies_synced['sync_relation']) From 7067f2cd8916456e4ffacfdb92c5b91bb957a303 Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Fri, 12 Jul 2019 09:52:02 +0200 Subject: [PATCH 36/37] Spelling fix --- .../scanmodules/chromedevtools/extractors/cookiesync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 2558b75..9bb18d1 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -5,7 +5,7 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = dict(cookie_sync_occurred=None, sync_occurence_counter=0, sync_relation=[], sync_companies=[]) + cookies_synced = dict(cookie_sync_occurred=None, sync_occurrence_counter=0, sync_relation=[], sync_companies=[]) tracker_requests = [] tracker_cookies = [] From
a162304a1bfd040a5ebcde5f62b401d6d45fdfea Mon Sep 17 00:00:00 2001 From: Matthias Kraus Date: Wed, 5 Feb 2020 18:45:38 +0100 Subject: [PATCH 37/37] Reworked to specifications in change request --- .../chromedevtools/extractors/cookiesync.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py index 9bb18d1..763585d 100644 --- a/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py +++ b/privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py @@ -5,7 +5,8 @@ class CookieSyncExtractor(Extractor): def extract_information(self): - cookies_synced = dict(cookie_sync_occurred=None, sync_occurrence_counter=0, sync_relation=[], sync_companies=[]) + cookies_synced = dict(cookie_sync_occurred=None, number_sync_relations=0, number_sync_domains=0, + sync_relation=[], sync_domains=[]) tracker_requests = [] tracker_cookies = [] @@ -28,30 +29,28 @@ def extract_information(self): if cookie_domain not in request['url']: try: - t_url = request['url'].split('/')[2] - d_name = t_url.split('.') - target_company_name = d_name[len(d_name)-2] + target_domain = request['url'].split('/')[2] except IndexError: - target_company_name = request['url'] - if target_company_name not in cookies_synced['sync_companies']: - cookies_synced['sync_companies'].append(target_company_name) + target_domain = request['url'] + if target_domain not in cookies_synced['sync_domains']: + cookies_synced['sync_domains'].append(target_domain) try: - origin_company_name = cookie['domain'].split('.')[len(cookie['domain'].split('.'))-2] + origin_domain = cookie['domain'] except IndexError: - origin_company_name = cookie['domain'] - if origin_company_name not in cookies_synced['sync_companies']: - cookies_synced['sync_companies'].append(origin_company_name) + origin_domain = cookie['domain'] + if origin_domain not in 
cookies_synced['sync_domains']: + cookies_synced['sync_domains'].append(origin_domain) strikeout_count = 0 if len(cookies_synced) > 0: for element in cookies_synced['sync_relation']: strikeout_subcount = 0 - if target_company_name in element['cookie_sync_target']: + if target_domain in element['target']: strikeout_subcount += 1 - if origin_company_name in element['cookie_sync_target']: + if origin_domain in element['target']: strikeout_subcount += 1 - if origin_company_name in element['cookie_sync_origin']: + if origin_domain in element['origin']: strikeout_subcount += 1 if strikeout_subcount > 1: strikeout_count = 1 @@ -71,14 +70,16 @@ def extract_information(self): if strikeout_count == 0: cookies_synced['cookie_sync_occurred'] = True - cookies_synced['sync_relation'].append({'cookie_sync_origin': cookie['domain'], - 'cookie_sync_target': request['url'], - 'cookie_sync_value': cookie['value']}) + cookies_synced['sync_relation'].append({'origin': cookie['domain'], + 'target': request['url'], + 'value': cookie['value']}) if cookies_synced['cookie_sync_occurred'] is None: cookies_synced['cookie_sync_occurred'] = False - cookies_synced['sync_companies'] = None + cookies_synced['sync_domains'] = None - cookies_synced['sync_occurrence_counter'] = len(cookies_synced['sync_relation']) + if cookies_synced['sync_domains'] and cookies_synced['sync_relation'] is not None: + cookies_synced['number_sync_relations'] = len(cookies_synced['sync_relation']) + cookies_synced['number_sync_domains'] = len(cookies_synced['sync_domains']) self.result['cookiesync'] = cookies_synced