Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
d4e66c7
Fix tests for cql grammar
danamansana Apr 1, 2026
8adffea
&ai Update grammar and tests
danamansana Apr 1, 2026
8bcab30
&ai Add case insensitivity in grammar operators
danamansana Apr 2, 2026
6cac1c1
Add support for case-insensitive scope/connective/relation
danamansana Apr 2, 2026
29351f8
Add support for upper and lowercase in cql reserved terms
danamansana Apr 2, 2026
670561c
Add CqlQuery class
danamansana Apr 2, 2026
4723233
Add more whitespace to whitespace test
danamansana Apr 2, 2026
dae24b4
Add unquoted_words as top-level phrases
danamansana Apr 2, 2026
da35b34
Add test for single word query without quotes
danamansana Apr 2, 2026
79ae5eb
Add date validation
danamansana Apr 2, 2026
dc4b62d
&ai add test for date validation
danamansana Apr 2, 2026
230731a
&ai Add tests for displayParsed and fix some bugs
danamansana Apr 2, 2026
5c6542e
&ai add some more tests for displayParsed
danamansana Apr 2, 2026
3b49d33
&ai fix linting
danamansana Apr 2, 2026
304c191
Remove cql check for addInnerHits
danamansana Apr 2, 2026
e335227
Remove python version
danamansana Apr 2, 2026
ecb35d0
&ai Fix top-level should to must
danamansana Apr 2, 2026
48392a0
Remove extraneous comment
danamansana Apr 2, 2026
19b2e66
Add trimming of whitespace within query terms
danamansana Apr 3, 2026
3640f7d
Merge pull request #696 from NYPL/scc-5296/8
danamansana Apr 16, 2026
a9240d6
Add config to exclude marcTag 340 subfield a
danamansana Apr 17, 2026
32caebf
Add comment about marc rules fork
danamansana Apr 21, 2026
4b17003
Merge pull request #727 from NYPL/scc-5199
danamansana Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/annotated-marc-rules.json
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@
"marcIndicatorRegExp": "^340",
"subfieldSpec": {
"subfields": [
"6"
"6", "2"
],
"directive": "exclude"
},
Expand Down
52 changes: 29 additions & 23 deletions lib/elasticsearch/cql_grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,39 @@ function reverseGrammar (grammar) {
const leftCql = `
query ::= query whitespace connective whitespace sub_query | sub_query
connective ::= "AND NOT" | "AND" | "OR" | "NOT"
sub_query ::= atomic_query | "(" query ")"
atomic_query ::= scope relation quoted_term
sub_query ::= atomic_query | lparen_space query rparen_space
atomic_query ::= scope relation search_term
search_term ::= quoted_term | unquoted_word
scope ::= scope_term whitespace | scope_term
relation ::= relation_term whitespace | relation_term
scope_term ::= "title" | "author" | "keyword" | "callnumber" | "identifier" | "subject" | "language" | "date" | "series"| "genre" | "center" | "division" | "format"
relation_term ::= "any" | "adj" | "all" | "<=" | ">=" | "<" | ">" | "==" | "=" | "within" | "encloses"
quoted_term ::= quote phrase quote
phrase ::= phrase whitespace word | word
phrase ::= phrase whitespace_or_word | whitespace_or_word
whitespace_or_word ::= whitespace | word
whitespace ::= [#x20#x09#x0A#x0D]+
word ::= word escaped_char | word regular_char | escaped_char | regular_char
regular_char ::= [^#x22#x5c#x20#x09#x0A#x0D]
unquoted_word ::= unquoted_word escaped_char | unquoted_word unquoted_char | escaped_char | unquoted_char
unquoted_char ::= [^#x22#x5c#x20#x09#x0A#x0D=<>()]
escaped_char ::= slash char
slash ::= [#x5c]
char ::= [a-z]|[^a-z]
quote ::= [#x22]
lparen_space ::= lparen whitespace | lparen
rparen_space ::= whitespace rparen | rparen
lparen ::= [#x28]
rparen ::= [#x29]
`
/**
 * Rewrite the letter-only quoted literals of a grammar so they match in any case.
 *
 * Every double-quoted literal consisting solely of ASCII letters and spaces
 * (e.g. "AND NOT") is replaced by a space-separated sequence of symbols:
 * one two-character class per letter (`[aA]`) and the rule name `whitespace`
 * for each space. Literals containing any other character (e.g. "<=") do not
 * match the pattern and are left as they are.
 *
 * @param {string} grammar - Grammar source text.
 * @returns {string} The grammar with matching literals expanded.
 */
function makeCaseInsensitiveLiterals (grammar) {
  // Map one character of a literal to the symbol that stands in for it.
  const toSymbol = (ch) => {
    if (ch === ' ') return 'whitespace'
    return '[' + ch.toLowerCase() + ch.toUpperCase() + ']'
  }

  return grammar.replace(/"([a-zA-Z ]+)"/g, (_literal, inner) => {
    const symbols = []
    for (const ch of inner) {
      symbols.push(toSymbol(ch))
    }
    return symbols.join(' ')
  })
}

const rightCql = reverseGrammar(leftCql)
const processedLeftCql = makeCaseInsensitiveLiterals(leftCql)
const rightCql = reverseGrammar(processedLeftCql)

function simplify (ast) {
switch (ast.type) {
Expand All @@ -56,14 +71,19 @@ function simplify (ast) {
return ast.text
case 'relation_term':
return ast.text
case 'search_term':
return simplify(ast.children.find(child => child.type.includes('quoted_term') || child.type.includes('word')))
case 'quoted_term':
return simplify(ast.children.find(child => child.type.includes('phrase')))
case 'phrase': {
const word = ast.children.find(child => child.type === 'word')
const word = ast.children.find(child => child.type === 'whitespace_or_word')
const phrase = ast.children.find(child => child.type === 'phrase')
return [simplify(word)].concat(phrase ? simplify(phrase) : [])
return [simplify(word)].filter(x => x).concat(phrase ? simplify(phrase) : [])
}
case 'whitespace_or_word':
return simplify(ast.children.find(child => child.type === 'word'))
case 'word':
case 'unquoted_word':
return ast.text
default:
break
Expand Down Expand Up @@ -94,12 +114,12 @@ function parseWithRightCql (string) {

function parsedASTtoNestedArray (ast) {
if (!ast.type.includes('query')) {
return reverseString(ast.text)
return ast.text.trim()
}

const childTypes = [
'atomic_query', 'sub_query', 'query', 'connective',
'scope', 'relation', 'quoted_term'
'scope', 'relation', 'search_term'
]

const children = ast.children
Expand All @@ -113,18 +133,4 @@ function parsedASTtoNestedArray (ast) {
return children
}

/**
 * Parse a CQL string and report either its nested-array form or its errors.
 *
 * The leftover text attached to each parse error is passed through
 * `reverseString` here because `parseWithRightCql` does not do that itself.
 *
 * @param {string} string - CQL query text to parse.
 * @returns {object} `{}` when the parser yields nothing, `{ error }` with one
 *   line per parse error, or `{ parsed }` holding the nested-array form.
 */
function displayParsed (string) {
  const ast = parseWithRightCql(string)
  if (!ast) return {}

  if (!ast.errors.length) {
    return { parsed: parsedASTtoNestedArray(ast) }
  }

  const describeFailure = (failure) =>
    `Parsing error likely near end of "${reverseString(failure.token.rest)}"`
  return { error: ast.errors.map(describeFailure).join('\n') }
}

module.exports = { simplify, reverseAST, reverseGrammar, parseRight, parseWithRightCql, rightCqlParser, reverseString, displayParsed }
module.exports = { simplify, reverseAST, reverseGrammar, parseRight, parseWithRightCql, rightCqlParser, reverseString, parsedASTtoNestedArray }
67 changes: 51 additions & 16 deletions lib/elasticsearch/cql_query_builder.js
Original file line number Diff line number Diff line change
@@ -1,19 +1,45 @@
const { parseWithRightCql } = require('./cql_grammar')
const { parseWithRightCql, reverseString, parsedASTtoNestedArray } = require('./cql_grammar')
const { indexMapping } = require('./cql/index-mapping')
const ElasticQueryBuilder = require('./elastic-query-builder')
const { InvalidParameterError } = require('../errors')

function buildEsQuery (cqlQuery, request = null) {
const filterQuery = buildFilterQuery(request)
return {
bool: {
should: [
buildEsQueryFromTree(
parseWithRightCql(cqlQuery.trim())
)
],
...filterQuery
/**
 * Wraps one CQL query string and keeps hold of its parse result, so that the
 * Elasticsearch query and the human-readable parse output are both derived
 * from the same stored tree.
 */
class CqlQuery {
  /**
   * @param {string} [queryStr] - Raw CQL text. A falsy value is treated as
   *   the empty string; surrounding whitespace is trimmed.
   */
  constructor (queryStr) {
    const raw = queryStr || ''
    this.queryStr = raw.trim()
    this.parsedAST = null
  }

  /**
   * Return the parse result for the query, calling `parseWithRightCql` only
   * while no truthy result has been stored yet.
   *
   * @returns {object} Whatever `parseWithRightCql` produced for the query.
   */
  parse () {
    if (this.parsedAST) return this.parsedAST
    this.parsedAST = parseWithRightCql(this.queryStr)
    return this.parsedAST
  }

  /**
   * Build the Elasticsearch query: the translated CQL tree as the single
   * `must` clause, with the entries of the filter query spread alongside it.
   *
   * @param {object|null} [request=null] - Passed through to `buildFilterQuery`.
   * @returns {object} An object of the form `{ bool: { must: [...], ... } }`.
   */
  buildEsQuery (request = null) {
    const filterQuery = buildFilterQuery(request)
    const must = [buildEsQueryFromTree(this.parse(), this.queryStr)]
    return { bool: { must, ...filterQuery } }
  }

  /**
   * Describe the outcome of parsing the query.
   *
   * @returns {object} `{ error }` when parsing yields nothing or reports
   *   errors (one line per error), otherwise `{ parsed }` holding the result
   *   of `parsedASTtoNestedArray`.
   */
  displayParsed () {
    const ast = this.parse()
    if (!ast) {
      return { error: 'Unknown parsing error. Error most likely near end of string' }
    }

    const failures = ast.errors
    if (!failures || !failures.length) {
      return { parsed: parsedASTtoNestedArray(ast) }
    }

    const describeFailure = (failure) =>
      `Parsing error likely near end of "${reverseString(failure.token.rest)}"`
    return { error: failures.map(describeFailure).join('\n') }
  }
}

function buildFilterQuery (request) {
Expand Down Expand Up @@ -52,6 +78,7 @@ function buildEsQueryFromTree (tree) {
}

function buildBoolean (operator, queries) {
operator = operator.toUpperCase()
if (['NOT', 'AND NOT'].includes(operator)) return buildNegation(queries)
const esOperator = operator === 'AND' ? 'must' : 'should'
return {
Expand Down Expand Up @@ -89,7 +116,7 @@ function atomicQueryParams (atomicQuery) {
for Hamlet Shakespeare, Hamlet, and Shakespeare, and this will return Hamlet Shakespeare
*/
function findTopPhrase (tree) {
if (tree.type === 'phrase') return tree.text
if (tree.type === 'phrase' || tree.type === 'unquoted_word') return tree.text.trim()
const topPhrases = tree.children.map(child => findTopPhrase(child)).filter(x => x)
return topPhrases.length ? topPhrases[0] : null
}
Expand All @@ -100,7 +127,7 @@ function findTopPhrase (tree) {
of word nodes for H, Ha, Ham, etc...
*/
function findTopWords (tree) {
if (tree.type === 'word') return [tree.text]
if (tree.type === 'word' || tree.type === 'unquoted_word') return [tree.text.trim()]
return tree.children.map(child => findTopWords(child)).flat()
}

Expand Down Expand Up @@ -139,6 +166,16 @@ function hasFields (obj) {
*/

function buildAtomic ({ scope, relation, terms, term }) {
scope = scope.toLowerCase()
relation = relation.toLowerCase()

if (scope === 'date') {
const dateRegex = /^\d{4}(?:[-/]\d{2})?(?:[-/]\d{2})?$/
if (!terms.every(t => dateRegex.test(t))) {
throw new InvalidParameterError('Dates must be of the form YYYY, YYYY/MM, or YYYY/MM/DD ')
}
}

const allFields = nestedFilterAndMap(
indexMapping[scope],
field => typeof field === 'string' || field.on(term),
Expand Down Expand Up @@ -214,7 +251,6 @@ function buildAtomicMain ({ fields, relation, terms, term }) {
}

function anyAllQueries ({ fields, relation, terms }) {
if (!['any', 'all'].includes(relation)) { return null }
const operator = (relation === 'any' ? 'should' : 'must')
return {
bool: {
Expand All @@ -224,7 +260,6 @@ function anyAllQueries ({ fields, relation, terms }) {
}

function adjEqQueries ({ fields, relation, terms, term }) {
if (!['=', '==', 'adj'].includes(relation)) { return null }
const type = (relation === '==') ? 'exact' : 'phrase'
return matchTermWithFields(fields, term, type)
}
Expand Down Expand Up @@ -335,7 +370,7 @@ function multiMatch (fields, term, type) {
}

module.exports = {
buildEsQuery,
CqlQuery,
buildEsQueryFromTree,
buildBoolean,
buildAtomic,
Expand Down
21 changes: 10 additions & 11 deletions lib/resources.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ const { parseParams, deepValue } = require('../lib/util')

const ApiRequest = require('./api-request')
const ElasticQueryBuilder = require('./elasticsearch/elastic-query-builder')
const cqlQueryBuilder = require('./elasticsearch/cql_query_builder')
const { displayParsed } = require('./elasticsearch/cql_grammar')
const { CqlQuery } = require('./elasticsearch/cql_query_builder')
const { FILTER_CONFIG, SEARCH_SCOPES, AGGREGATIONS_SPEC } = require('./elasticsearch/config')

const errors = require('./errors')
Expand Down Expand Up @@ -651,10 +650,12 @@ module.exports = function (app, _private = null) {
app.logger.debug('Parsed params: ', params)

let parsed = {}
let cqlQuery = null

if (params.search_scope === 'cql') {
cqlQuery = new CqlQuery(params.q)
try {
parsed = displayParsed(params.q) // ?
parsed = cqlQuery.displayParsed()
} catch (e) {
throw new IndexSearchError('Unknown parsing error. Error most likely near end of string')
}
Expand All @@ -666,16 +667,14 @@ module.exports = function (app, _private = null) {
}
}

let body = buildElasticBody(params)
let body = buildElasticBody(params, cqlQuery)

// Strip unnecessary _source fields
body._source = {
excludes: EXCLUDE_FIELDS.concat(['items'])
}

if (params.search_scope !== 'cql') {
body = addInnerHits(body, { merge_checkin_card_items: params.merge_checkin_card_items })
}
body = addInnerHits(body, { merge_checkin_card_items: params.merge_checkin_card_items })

app.logger.debug('Resources#search', RESOURCES_INDEX, body)

Expand Down Expand Up @@ -878,13 +877,13 @@ module.exports = function (app, _private = null) {
*
* @return {object} An object that can be posted directly to ES
*/
const buildElasticBody = function (params) {
const buildElasticBody = function (params, cqlQuery = null) {
const body = {
from: (params.per_page * (params.page - 1)),
size: params.per_page
}

body.query = buildElasticQuery(params)
body.query = buildElasticQuery(params, cqlQuery)

// Apply sort:
let direction
Expand All @@ -910,10 +909,10 @@ const buildElasticBody = function (params) {
*
* @return {object} ES query object suitable to be POST'd to ES endpoint
*/
const buildElasticQuery = function (params) {
const buildElasticQuery = function (params, cqlQuery = null) {
const request = ApiRequest.fromParams(params)
if (params.search_scope === 'cql') {
const query = cqlQueryBuilder.buildEsQuery(params.q, request)
const query = (cqlQuery || new CqlQuery(params.q)).buildEsQuery(request)
return query
}

Expand Down
2 changes: 2 additions & 0 deletions scripts/update-annotated-marc-rules.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env node
/**
* This file rebuilds data/annotated-marc-rules.json from data/webpub.def
* Note that currently data/annotated-marc-rules.json intentionally diverges
* from webpub.def, in particular 340 excludes subfield 2
*
* Webpub.def is a Sierra configuration file, which controls how specific marc
* fields are rendered in the catalog. We use it to build our own "annotated-
Expand Down
14 changes: 12 additions & 2 deletions test/cql_grammar.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ function validateAtomicQuery (parsed, scope, relation, quotedTerm) {
const relationNode = atomicQuery.children.find(child => child.type === 'relation')
const relationTerm = relationNode.children.find(child => child.type === 'relation_term')
expect(relationTerm.text).to.equal(relation)
const quotedTermNode = atomicQuery.children.find(child => child.type === 'quoted_term')
expect(quotedTermNode.text).to.equal(quotedTerm)
const searchTermNode = atomicQuery.children.find(child => child.type === 'search_term')
expect(searchTermNode.text).to.equal(quotedTerm)
}

describe('CQL Grammar', function () {
Expand All @@ -29,6 +29,16 @@ describe('CQL Grammar', function () {
validateAtomicQuery(parseWithRightCql('subject all "hamlet shakespeare"'), 'subject', 'all', '"hamlet shakespeare"')
})

it('parses single-word atomic queries without quotes', function () {
validateAtomicQuery(parseWithRightCql('title=hamlet'), 'title', '=', 'hamlet')
})

it('parses quoted queries containing special characters', function () {
validateAtomicQuery(parseWithRightCql('title="hamlet=prince"'), 'title', '=', '"hamlet=prince"')
validateAtomicQuery(parseWithRightCql('date > "1990 > 1980"'), 'date', '>', '"1990 > 1980"')
validateAtomicQuery(parseWithRightCql('author adj "shakespeare (william)"'), 'author', 'adj', '"shakespeare (william)"')
})

it('allows whitespace variants', function () {
validateAtomicQuery(parseWithRightCql('title ="hamlet"'), 'title', '=', '"hamlet"')
validateAtomicQuery(parseWithRightCql('title= "hamlet"'), 'title', '=', '"hamlet"')
Expand Down
Loading
Loading