-
Notifications
You must be signed in to change notification settings - Fork 24
feat: optionally split FIND_ITEMS' query in 2 #119
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,6 +53,9 @@ | |
|
|
||
| EPOCH = datetime.datetime.utcfromtimestamp(0) | ||
|
|
||
| # a bogus FIND_ITEMS bind param | ||
| BOGUS_ID = object() | ||
|
|
||
|
|
||
| def dt2ts(dt): | ||
| """Convert a Python datetime to seconds""" | ||
|
|
@@ -121,8 +124,18 @@ def __init__(self, sqluri, standard_collections=False, **dbkwds): | |
| target_size=int(dbkwds.get("pool_size", 100)) | ||
| ) | ||
| self._database = self._instance.database(database_id, pool=pool) | ||
|
|
||
| # for debugging: support disabling FORCE_INDEX=BsoLastModified | ||
| self._force_bsolm_index = dbkwds.get("_force_bsolm_index", True) | ||
| # split FIND_ITEMS' query in 2 (first query BsoLastModified | ||
| # then query bso with its results) | ||
| self._bsolm_index_separate = dbkwds.get( | ||
| "_bsolm_index_separate", | ||
| False | ||
| ) | ||
| if self._bsolm_index_separate: | ||
| assert self._force_bsolm_index, \ | ||
| "_bsolm_index_separate requires _force_bsolm_index" | ||
|
|
||
| self.standard_collections = standard_collections | ||
| if self.standard_collections and dbkwds.get("create_tables", False): | ||
|
|
@@ -358,13 +371,68 @@ def _find_items(self, session, user, collection, **params): | |
| bind_types[ts_var] = param_types.TIMESTAMP | ||
|
|
||
| # Generate the query | ||
| fields = params.get("fields") | ||
| offset_params = params | ||
|
|
||
| if self._bsolm_index_separate and fields is None: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not obvious why Depends how much effort you want to invest in polishing this though, as opposed to just getting it out in stage to loadtest.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's right, to only apply it to get_items. I like the refactor suggestion but it's a little further complicated by encode_next_offset's need for the original params ( I will keep it in mind if this is revisited. This is definitely a "get it out to stage and see" patch |
||
| # Split the get_items query (TODO: get_item_ids could | ||
| # reference solely the index) query in 2: | ||
| # #1: query BsoLastModified directly for bso ids | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the intended goal of using this 2 query version? I assume its an optimization to avoid the multiple executions that occur otherwise?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, see my description in the issue #118
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and if this solution looks alright under load testing I'd like to relay those results back to Google, in case they have an alternative or a better explanation for the existing single query's perf (I still hope we won't have to do this..) |
||
| # #2: query bso w/ 'id IN UNNEST([<ids from #1>])' | ||
|
|
||
| bsolm_params = params.copy() | ||
| bsolm_params["fields"] = ["id"] | ||
| bsolm_query = FIND_ITEMS(bso, bsolm_params) | ||
| bsolm_query = str(bsolm_query.compile()) | ||
| bsolm_query = bsolm_query.replace( | ||
| "FROM bso", | ||
| "FROM bso@{FORCE_INDEX=BsoLastModified}" | ||
| ) | ||
| bsolm_query = bsolm_query.replace(":", "@") | ||
|
|
||
| result = session.transaction.execute_sql( | ||
| bsolm_query, | ||
| params=bind, | ||
| param_types=bind_types, | ||
| ) | ||
| ids = [row[0] for row in list(result)] | ||
|
|
||
| # keep the orig. for encode_next_offset | ||
| offset_params = params.copy() | ||
|
|
||
| # Setup a 'id IN (:id_1)' bind param for #2 (replacing it | ||
| # below) | ||
| params["ids"] = [BOGUS_ID] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What stops us from just putting the returned
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is specifically to utilize spanner's I do have a TODO for the FIND_ITEMS/sqlalchemy generation piece to do this for us. params["ids"] from the client and/or this case should both utilize UNNEST. Unfortunately coercing the sqlalchemy level to generate it for us is a bit involved, thus the current str replace. Another quick solution for this is having FIND_ITEMS handle UNNEST by returning a str with the appropriate replacements (also not ideal, further straying from the original FIND_ITEMS code..) 🤷♂ |
||
| bind["ids"] = ids | ||
| bind_types["ids"] = param_types.Array(param_types.STRING) | ||
|
|
||
| # offset/limit are accomplished in query #1 and don't make | ||
| # sense for #2 | ||
| if limit is not None: | ||
| del params["limit"] | ||
| del bind["limit"] | ||
| del bind_types["limit"] | ||
| if offset is not None: | ||
| # restore this later | ||
| del params["offset"] | ||
| del bind["offset"] | ||
| del bind_types["offset"] | ||
| # simiarly modified ranges/ttl aren't needed in #2 | ||
| for param in ("newer", "newer_eq", "older", "older_eq", "ttl"): | ||
| params.pop(param, None) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's quite a big chunk of code in this |
||
|
|
||
| query = FIND_ITEMS(bso, params) | ||
| query = str(query.compile()) | ||
| if self._force_bsolm_index: | ||
| if self._force_bsolm_index and not self._bsolm_index_separate: | ||
| query = query.replace( | ||
| "FROM bso", | ||
| "FROM bso@{FORCE_INDEX=BsoLastModified}" | ||
| ) | ||
| if self._bsolm_index_separate and fields is None: | ||
| query = query.replace( | ||
| "bso.id IN (:id_1)", | ||
| "bso.id IN UNNEST(@ids)" | ||
| ) | ||
| query = query.replace(":", "@") | ||
|
|
||
| result = session.transaction.execute_sql( | ||
|
|
@@ -390,7 +458,7 @@ def _find_items(self, session, user, collection, **params): | |
| next_offset = None | ||
| if limit is not None and len(items) > limit: | ||
| items = items[:-1] | ||
| next_offset = self.encode_next_offset(params, items) | ||
| next_offset = self.encode_next_offset(offset_params, items) | ||
| return { | ||
| "items": items, | ||
| "next_offset": next_offset, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this used (or planned to be used) anywhere outside of the SpannerStorage.init()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's only used in _find_items as basically a placeholder