Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
920e31c
Init CookieSyncExtractor
mattkrau May 21, 2019
20da0c1
Finding first instances where an cookie value is broadcasted in a req…
mattkrau May 21, 2019
412d487
Fixing double entries in final message (cookiesync)
mattkrau Jul 2, 2019
ab8789f
Fixing the problem of cookiesyncing between the same third party on d…
mattkrau Jul 2, 2019
76c597c
cleanup
mattkrau Jul 2, 2019
4024fa6
More fixes for adding cookiesync-events to the list.
mattkrau Jul 2, 2019
e86c976
cleanup
mattkrau Jul 2, 2019
89af02c
Init CookieSyncExtractor
mattkrau May 21, 2019
99cda38
Finding first instances where an cookie value is broadcasted in a req…
mattkrau May 21, 2019
6b1d327
chromedevtools: Remove gtm/js path for Google Analytics check
hprid May 24, 2019
1925b1b
chromedevtools: Change utf.gif to __utm.gif in Google Analytics
hprid May 24, 2019
9fa141c
Release 0.7.3
hprid May 24, 2019
419638f
Store POST data of requests (up to 64 KiB per request)
hprid May 29, 2019
08f955a
chromedevtools: Also look into post data for aip=1 in Google Analytics
hprid May 29, 2019
14f9daf
Release 0.7.4
hprid May 29, 2019
ce260b0
chromedevtools: Fix post data extraction
hprid May 31, 2019
172149d
Release 0.7.5
hprid May 31, 2019
38b8314
Fix run_workers command (infinite loop in _execute_sql_autocommit)
hprid Jun 3, 2019
3e58a94
chromedevtools: Fix handling of missing aip key in Google Analytics
hprid Jun 3, 2019
7681627
Release 0.7.6
hprid Jun 3, 2019
9af5176
chromedevtools: Handle incorrect HSTS header correctly
hprid Jun 22, 2019
3a51083
chromedevtools: Fix resource content with pages without frameId
hprid Jun 22, 2019
1ab7a6f
chromedevtools: Fix error handling of argument extraction for JavaScript
hprid Jun 22, 2019
979867d
chromedevtools: Send JavaScript argument extraction errors as error l…
hprid Jun 22, 2019
0ac880e
chromedevtools: Increase change wait time to 15 seconds
hprid Jun 22, 2019
51ef96d
Fixing double entries in final message (cookiesync)
mattkrau Jul 2, 2019
7865130
Fixing the problem of cookiesyncing between the same third party on d…
mattkrau Jul 2, 2019
21cb3e4
cleanup
mattkrau Jul 2, 2019
10b5bad
More fixes for adding cookiesync-events to the list.
mattkrau Jul 2, 2019
67f7c1a
cleanup
mattkrau Jul 2, 2019
f4fe671
Merge branch 'cookiesync' of github.com:PrivacyScore/privacyscanner i…
mattkrau Jul 2, 2019
32c9d6a
Added enumeration of companies that are part of the syncing
mattkrau Jul 2, 2019
c628054
Allow CookieValues < 10 , check for timevalues, sort those out
mattkrau Jul 2, 2019
eb4dd6b
Clean up UTC Timestamp check
mattkrau Jul 12, 2019
98edd8f
Spelling
mattkrau Jul 12, 2019
3624266
Removing "syncing_companies" entries if there was no syncing, only tr…
mattkrau Jul 12, 2019
7067f2c
Spelling fix
mattkrau Jul 12, 2019
a162304
Reworked to specifications in change request
mattkrau Feb 5, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,35 @@
Changelog
=========

0.7.6
-----

* Fix: run\_workers command will no longer run into an infinite loop.
* Fix: Google Analytics detection did not correctly handle the case when
aip is not set.

0.7.5
-----

* Fix: POST data extraction failed under certain circumstances.

0.7.4
-----

* Fix: Also look into the POST data of Google Analytics requests to find aip=1.

0.7.3
-----

* Fix: Check for \_\_utm.gif in Google Analytics check instead of utm.gif
* Fix: Do not consider gtm/js requests as tracking requests for Google
Analytics, since they just load the GTM configuration. This fixes
a bug where a site is mistakenly detected as not using the anonymize IP
extension.
* Start counter for numeric locks at zero instead of one. This makes the
remote debugging ports for Google Chrome used by the "scan" command
consistent with those used by the "run\_workers" command.

0.7.2
-----

Expand Down
5 changes: 3 additions & 2 deletions privacyscanner/scanmodules/chromedevtools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
TLSDetailsExtractor, CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor, \
FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor, \
CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor, ImprintExtractor, \
HSTSPreloadExtractor, FingerprintingExtractor
HSTSPreloadExtractor, FingerprintingExtractor, CookieSyncExtractor
from privacyscanner.scanmodules.chromedevtools.utils import TLDEXTRACT_CACHE_FILE, parse_domain
from privacyscanner.utils import file_is_outdated, set_default_options, calculate_jaccard_index

Expand All @@ -19,7 +19,8 @@
CertificateExtractor, ThirdPartyExtractor, InsecureContentExtractor,
FailedRequestsExtractor, SecurityHeadersExtractor, TrackerDetectExtractor,
CookieStatsExtractor, JavaScriptLibsExtractor, ScreenshotExtractor,
ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor]
ImprintExtractor, HSTSPreloadExtractor, FingerprintingExtractor,
CookieSyncExtractor]

EXTRACTOR_CLASSES_HTTPS_RUN = [FinalUrlExtractor, TLSDetailsExtractor, CertificateExtractor,
InsecureContentExtractor, SecurityHeadersExtractor,
Expand Down
40 changes: 33 additions & 7 deletions privacyscanner/scanmodules/chromedevtools/chromescan.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from privacyscanner.scanmodules.chromedevtools.utils import scripts_disabled
from privacyscanner.utils import kill_everything


CHANGE_WAIT_TIME = 15

# See https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md
# See also https://peter.sh/experiments/chromium-command-line-switches/
CHROME_OPTIONS = [
Expand Down Expand Up @@ -137,6 +140,8 @@
})();
""".lstrip()

# TODO: There are still some contexts in which this JavaScript snippet does not
# run properly. This requires more research.
EXTRACT_ARGUMENTS_JAVASCRIPT = '''
(function(logArguments) {
let retval = 'null';
Expand Down Expand Up @@ -349,10 +354,10 @@ def scan(self, browser, result, logger, options):
# because page_loaded event is already set.
self._page_loaded.wait(load_max_wait)
self._page_interaction()
# We wait 5 seconds after the page has loaded, so that any
# We wait 15 seconds after the page has loaded, so that any
# resources can load. This includes JavaScript which might
# issue further requests.
if not self._document_will_change.wait(5):
if not self._document_will_change.wait(CHANGE_WAIT_TIME):
# OK, our page should be stable now. So we will disable any
# further requests by just intercepting them and not
# taking care of them.
Expand All @@ -374,9 +379,14 @@ def scan(self, browser, result, logger, options):
raise NotReachableError('No stable page to scan.')

response = self._page.final_response
res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'],
url=response['url'])
content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode()
# If there is no frameId, there is no content that was rendered.
# This is usually the case, when the site has a redirect.
if 'frameId' in response['extra']:
res = self._tab.Page.getResourceContent(frameId=response['extra']['frameId'],
url=response['url'])
content = b64decode(res['content']) if res['base64Encoded'] else res['content'].encode()
else:
content = b''
else:
self._tab.stop()
browser.close_tab(self._tab)
Expand Down Expand Up @@ -409,6 +419,16 @@ def _cb_request_will_be_sent(self, request, requestId, **kwargs):
request['requestId'] = requestId
request['document_url'] = kwargs.get('documentURL')
request['extra'] = kwargs
if request.get('hasPostData', False):
if 'postData' in request:
request['post_data'] = request['postData']
else:
post_data = self._tab.Network.getRequestPostData(requestId=requestId)
# To avoid a too high memory usage by single requests
# we just store the first 64 KiB of the post data
request['post_data'] = post_data['postData'][:65536]
else:
request['post_data'] = None
self._page.add_request(request)

# Redirect requests don't have a received response but issue another
Expand Down Expand Up @@ -447,9 +467,15 @@ def _cb_paused(self, **info):
if self._log_breakpoint in info['hitBreakpoints']:
call_frames = []
for call_frame in info['callFrames']:
args = json.loads(self._tab.Debugger.evaluateOnCallFrame(
javascript_result = self._tab.Debugger.evaluateOnCallFrame(
callFrameId=call_frame['callFrameId'],
expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result']['value'])
expression=EXTRACT_ARGUMENTS_JAVASCRIPT)['result']
if 'value' in javascript_result:
args = json.loads(javascript_result['value'])
else:
# TODO: We should look for the error here and handle those
# cases to reliably extract the arguments.
args = ['error', None]
call_frames.append({
'url': call_frame['url'],
'functionName': call_frame['functionName'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
from .screenshot import ScreenshotExtractor
from .imprint import ImprintExtractor
from .hstspreload import HSTSPreloadExtractor
from .fingerprinting import FingerprintingExtractor
from .fingerprinting import FingerprintingExtractor
from .cookiesync import CookieSyncExtractor
85 changes: 85 additions & 0 deletions privacyscanner/scanmodules/chromedevtools/extractors/cookiesync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from datetime import datetime, timezone

from privacyscanner.scanmodules.chromedevtools.extractors.base import Extractor


class CookieSyncExtractor(Extractor):
    """Detect tracker cookie values that are handed over to other third parties.

    A sync relation is reported when the value of a cookie flagged with
    ``is_tracker`` shows up in the URL of a third-party request and that URL
    does not contain the cookie's own domain label. The findings are stored
    in ``self.result['cookiesync']``.
    """

    # Cookie values shorter than this are never matched against request URLs.
    _MIN_VALUE_LENGTH = 7
    # Length of a Unix timestamp in seconds, e.g. "1562057423".
    _TIMESTAMP_LENGTH = 10

    def extract_information(self):
        """Collect cookie sync relations and store them in the scan result.

        The stored dictionary has the keys ``cookie_sync_occurred`` (bool),
        ``sync_relation`` (list of ``origin``/``target``/``value`` dicts),
        ``sync_domains`` and the two counters ``number_sync_relations`` and
        ``number_sync_domains``.

        ``sync_domains`` is ``None`` when tracker cookies exist but none of
        them was synced, and an empty list when there are no tracker cookies
        at all. This asymmetry is kept for backward compatibility.
        """
        cookies_synced = dict(cookie_sync_occurred=None, number_sync_relations=0,
                              number_sync_domains=0, sync_relation=[], sync_domains=[])
        tracker_requests = [request for request in self.page.request_log
                            if request['is_thirdparty']]
        tracker_cookies = [cookie for cookie in self.result['cookies']
                           if cookie['is_tracker']]

        if not tracker_cookies:
            cookies_synced['cookie_sync_occurred'] = False

        relations = cookies_synced['sync_relation']
        sync_domains = cookies_synced['sync_domains']

        for cookie in tracker_cookies:
            value = cookie['value']
            if len(value) < self._MIN_VALUE_LENGTH:
                continue
            # Values that are just a current timestamp show up in many
            # unrelated URLs and are therefore not treated as identifiers.
            if self._is_current_timestamp(value):
                continue

            origin_domain = cookie['domain']
            labels = origin_domain.split('.')
            # Second-to-last label ("example" for "www.example.com"). For a
            # single-label domain the index is -1, i.e. the label itself.
            cookie_domain = labels[len(labels) - 2]

            for request in tracker_requests:
                url = request['url']
                # Only consider requests that carry the cookie value but do
                # not go to the party that owns the cookie.
                if value not in url or cookie_domain in url:
                    continue

                try:
                    target_domain = url.split('/')[2]
                except IndexError:
                    target_domain = url

                if self._is_duplicate_relation(relations, origin_domain, target_domain):
                    continue

                # Domains are recorded only for accepted relations, so that
                # ``sync_domains`` never lists parties of rejected matches.
                for domain in (target_domain, origin_domain):
                    if domain not in sync_domains:
                        sync_domains.append(domain)

                cookies_synced['cookie_sync_occurred'] = True
                relations.append({'origin': origin_domain,
                                  'target': url,
                                  'value': value})

        if cookies_synced['cookie_sync_occurred'] is None:
            cookies_synced['cookie_sync_occurred'] = False
            cookies_synced['sync_domains'] = None

        if cookies_synced['sync_domains']:
            cookies_synced['number_sync_relations'] = len(relations)
            cookies_synced['number_sync_domains'] = len(cookies_synced['sync_domains'])

        self.result['cookiesync'] = cookies_synced

    @classmethod
    def _is_current_timestamp(cls, value):
        """Return True if value is a Unix timestamp within the current UTC month."""
        if len(value) != cls._TIMESTAMP_LENGTH:
            return False
        try:
            parsed = datetime.fromtimestamp(int(value), tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            # Not an integer, or outside the range the platform can convert.
            return False
        now = datetime.now(timezone.utc)
        return (parsed.year, parsed.month) == (now.year, now.month)

    @staticmethod
    def _is_duplicate_relation(relations, origin_domain, target_domain):
        """Return True if an equivalent relation has already been recorded.

        A recorded relation counts as equivalent when at least two of these
        substring checks succeed: the target domain is in its target URL, the
        origin domain is in its target URL, the origin domain is in its origin.
        """
        for relation in relations:
            hits = sum((target_domain in relation['target'],
                        origin_domain in relation['target'],
                        origin_domain in relation['origin']))
            if hits > 1:
                return True
        return False
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,8 @@ def extract_information(self):
num_requests_no_aip = 0
has_ga_requests = False
for request in self.page.request_log:
parsed_url = request['parsed_url']
if self._is_google_request(parsed_url):
qs = parse_qs(parsed_url.query)
if 'aip' in qs and qs['aip'][-1] in ('1', 'true'):
if self._is_google_request(request['parsed_url']):
if self._is_anonymized(request):
num_requests_aip += 1
else:
num_requests_no_aip += 1
Expand Down Expand Up @@ -110,4 +108,20 @@ def _is_google_request(parsed_url):
ga_domains = ('www.google-analytics.com', 'ssl.google-analytics.com',
'stats.g.doubleclick.net')
if parsed_url.netloc in ga_domains:
return any(p in parsed_url.path for p in ('collect', 'utm.gif', 'gtm/js'))
return any(p in parsed_url.path for p in ('collect', '__utm.gif'))

@staticmethod
def _is_anonymized(request):
    """Return True if the request enables IP anonymization via ``aip``.

    The ``aip`` parameter is looked up in the post data of POST requests
    first and in the URL query string second. If the post data contains an
    ``aip`` key, its value decides and a conflicting value in the URL is
    ignored. Only the last occurrence is checked, and only the values
    ``1`` and ``true`` count as anonymized.
    """
    # NOTE(review): the Measurement Protocol reference reportedly treats any
    # present aip parameter (even aip=0) as anonymizing - confirm whether
    # the stricter check below is intended.
    enabled_values = ('1', 'true')

    if request['method'] == 'POST' and request['post_data']:
        from_post = parse_qs(request['post_data']).get('aip')
        if from_post is not None:
            return from_post[-1] in enabled_values

    from_url = parse_qs(request['parsed_url'].query).get('aip')
    return from_url is not None and from_url[-1] in enabled_values
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _parse_hsts(header_value):
try:
max_age = int(max_age)
except ValueError:
pass
max_age = None
break
return {
'header_value': header_value,
Expand Down
1 change: 1 addition & 0 deletions privacyscanner/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def _execute_sql_autocommit(self, query, params):
with self._conn.cursor() as c:
c.execute(query, params)
self._conn.commit()
break
except psycopg2.OperationalError:
print('Database operational error. Retrying after 10 seconds.')
time.sleep(10)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = privacyscanner
version = 0.7.2
version = 0.7.6
home-page = https://github.com/PrivacyScore/privacyscanner
license = MIT
license-file = LICENSE
Expand Down