Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 50 additions & 4 deletions web-app/django/VIM/apps/instruments/views/instrument_list.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Union
import logging
import re

import pysolr
import requests
Expand Down Expand Up @@ -28,7 +29,7 @@ def count(self):

# Helper classes to normalize Solr results
class SolrInstrument:
def __init__(self, data: dict, lang_code: str = "en"):
def __init__(self, data: dict, lang_code: str = "en", highlight_dict: dict = None):
sid = data.get("sid")
self.pk = sid.replace("instrument-", "") if sid else ""
self.umil_id = data.get("umil_id_s", "")
Expand All @@ -38,6 +39,10 @@ def __init__(self, data: dict, lang_code: str = "en"):
self.instrumentname_set = InstrumentNameSet(
data.get(name_field, []), data.get(umil_label_name_field, None)
)
self.highlight_info: list[str] = []

if highlight_dict:
self.instrumentname_set.apply_highlights(highlight_dict)


class ThumbnailStub:
Expand All @@ -63,6 +68,23 @@ def __init__(self, names: Union[list[str], str], umil_label_name: str = None):
def all(self) -> list[InstrumentNameStub]:
    """Return one InstrumentNameStub per stored name, each built with the set's UMIL label name."""
    stubs = []
    for name in self._names:
        stubs.append(InstrumentNameStub(name, self._umil_label_name))
    return stubs

def apply_highlights(self, highlight_dict: dict):
    """
    Apply Solr highlights to each name in the set.

    highlight_dict: dict[str, str] mapping original text -> highlighted text
    (e.g. ``{"guitar": "<b>guitar</b>"}``). Every whole-word, case-sensitive
    occurrence of a key inside a name is replaced by its mapped value.
    Empty keys are ignored, and a mapping with no usable keys leaves the
    names untouched.

    Rewrites ``self._names`` in place and returns None.
    """
    # Drop empty keys: an empty alternative in the regex would match at
    # every word boundary and scatter empty highlight tags through each name.
    replacements = {
        term: marked for term, marked in highlight_dict.items() if term
    }
    if not replacements:
        return

    # Sort longer terms first to prevent partial overlaps
    # ("bass guitar" must win over "bass").
    terms = sorted(replacements, key=len, reverse=True)

    # Single regex matching any of the terms as a whole word
    pattern = re.compile(r"\b(" + "|".join(map(re.escape, terms)) + r")\b")

    # NOTE(review): names are not HTML-escaped before markup is inserted,
    # and the list templates render the joined names with |safe - confirm
    # that names coming from the index can never contain raw HTML.
    self._names = [
        pattern.sub(lambda m: replacements[m.group(1)], name)
        for name in self._names
    ]

def get_display_names_str(self) -> str:
sorted_names = sorted(self.all(), key=lambda x: not x.umil_label)
name_list = [n.name for n in sorted_names]
Expand Down Expand Up @@ -268,14 +290,38 @@ def _get_solr_page_results(
**query_params,
"rows": page_size,
"start": start,
"hl": "true",
"hl.fl": "text",
"hl.simple.pre": "<b>",
"hl.simple.post": "</b>",
"hl.snippets": 1000,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1000 is too much...50 should be sufficient for most instruments?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some instruments, like guitar, have labels in 200 languages, and adding aliases can increase that number even more (many of them share a common ar). Although for one language we might not have more than 10 labels, we are not sure if our highlight query hits all of the labels in the selected language.

As an example, 50 would not be enough for a query of ar in guitar, as it hits more than 100 times.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, switching to token-level highlights makes more sense.

"hl.fragsize": 1,
}
# Remove our custom params
lang_code = solr_params.pop("lang_code")

solr_response = solr.search(**solr_params)
instruments = [
SolrInstrument(doc, lang_code=lang_code) for doc in solr_response.docs
]

# Extract highlight info
highlight_info = getattr(solr_response, "highlighting", {})

instruments = []
for doc in solr_response.docs:
pk = doc.get("sid", "").replace("instrument-", "")
hl_snippets = highlight_info.get(f"instrument-{pk}", {}).get("text", [])
# Map original -> highlighted
highlight_map = {}

for snippet in hl_snippets:
for term in re.findall(r"<b>(.*?)</b>", snippet):
highlight_map[term] = f"<b>{term}</b>"

inst = SolrInstrument(
doc, lang_code=lang_code, highlight_dict=highlight_map
)
inst.highlight_info = hl_snippets
instruments.append(inst)

total_count = solr_response.hits # pysolr's hits corresponds to Solr's numFound

# Return facet data if available
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
onerror="this.onerror=null;this.src='{% static "assets/images/instruments/no-image.svg" %}';" />
<div class="card-body pb-0 pt-0">
<p class="card-title text-center notranslate ">
{{ instrument.instrumentname_set.get_display_names_str }}
{{ instrument.instrumentname_set.get_display_names_str|safe }}
</p>
</div>
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
</div>
<div class="card-body pb-0 pt-0">
<p class="card-title text-center notranslate">
{{ instrument.instrumentname_set.get_display_names_str }}
{{ instrument.instrumentname_set.get_display_names_str|safe }}
</p>
</div>
</div>
Expand Down
Loading