diff --git a/services/Search/SearchStructure.php b/services/Search/SearchStructure.php index fb0b68fd..b093c4a8 100644 --- a/services/Search/SearchStructure.php +++ b/services/Search/SearchStructure.php @@ -85,6 +85,7 @@ static function fromHash($hash) $c = __CLASS__; $obj = new $c(true); $obj->_fillFromHash($hash); + return $obj; } @@ -231,6 +232,7 @@ private function _fillFromHash($hash) } elseif (count($ss) == 1) { $index = $ss[0][0]; + if (isset($this->force_standard[$index]) && $this->force_standard[$index]) { $this->use_dismax = false; } @@ -909,6 +911,9 @@ static function displayStrip($v) } return "Between $start and $end"; } else { + // Remove square brackets and double quote from facet display. + // Dec 2025 Note: this is just a UI string, so this replacement should be unnecessary. + // return $v; return preg_replace('/[\[\]\"]/', '', $v); } diff --git a/sys/Solr.php b/sys/Solr.php index ae53bbc7..46d29267 100755 --- a/sys/Solr.php +++ b/sys/Solr.php @@ -104,7 +104,11 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) { $limit = isset($_REQUEST['pagesize']) ? $_REQUEST['pagesize'] : $configArray['Site']['itemsPerPage']; } - + error_log("SearchStructure: " . print_r($ss) . "\n"); + echo "\n"; + print_r($ss); + echo "\n"; + // The initial query if ($ss->use_dismax) { $ss->action = 'edismax'; @@ -129,12 +133,12 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) { $args = array_merge($args, $this->spellcheckComponents($ss)); } - if ($raw) { return $this->rawSolrSearch($args, $action); } // Otherwise... + print_r('22222222222222222222222222222222 '); $rv = $this->solrSearch($args, $action); return $rv; } @@ -230,7 +234,8 @@ function facetlist($ss, $fields, $sort = 'index', $skip = 0, $limit = 20) { $values = $body['facet_counts']['facet_fields'][$field]; $rv['values'][$field] = array(); - // skip the hidden ones + // Filter out facet values that match the hidden pattern defined on config.ini. + // e.g. skip hlbgeneral = "hlb_both:^U\.S\. National and" to hide "U.S. National..." facets foreach ($values as $valcnt) { if (isset($hide, $hide[$field])) { foreach ($hide[$field] as $regexp) { @@ -269,14 +274,20 @@ function searchArguments($ss) { * @param SearchStructure $ss A fille-in search structure * @return array An array of (key,value) duples for sending to Solr **/ - + // TODO: Remove this function that is never used because the field type is not defined in conf/dismaxsearchspecs.yaml + // TODO: Check by function used by this that could be removed too. function dismaxSearchArguments($ss) { $rv = array(); // Should just be on "lookfor" and "type" $tvb = isset($ss->search[0]) ? $ss->search[0] : array('all', '*:*'); $type = $tvb[0]; + print_r('**************type****************'); + print_r($type); + print_r('******************************'); + // $value is the search string $value = $tvb[1]; + // If search is empty/whitespace-only, default to *:* (match-all) if (!preg_match('/\S/', $value)) { $value = '*:*'; } @@ -284,10 +295,10 @@ function dismaxSearchArguments($ss) { $allspecs = yaml_parse_file('conf/dismaxsearchspecs.yaml'); // If the type isn't set, back up to normal arguments + // Lianet's notes: $type is extracted from conf/dismaxsearchspecs.yaml so the function always return the args in searchArguments if (!isset($allspecs[$type])) { $args = $this->searchArguments($ss); - // print_r($args); return $args; } @@ -334,10 +345,11 @@ function standardSearchComponents($ss) { $searchComponents = array(); + // Lianet's notes: conf/searchspecs is the config used to build the Solr query. $specs = yaml_parse_file('conf/searchspecs.yaml'); $query = ''; - foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolen AND or OR + foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolean AND or OR $type = $tvb[0]; $values = $this->build_and_or_onephrase($tvb[1]); $bool = isset($tvb[2]) ? $tvb[2] : false; @@ -356,8 +368,8 @@ function standardSearchComponents($ss) { } $query .= "id:(" . implode(' OR ', $ss->extraIDs()) . ')'; } - $ids = $this->tagIDs($ss); + // Check if the query has content, otherwise use *:* to match all if (preg_match('/\S/', $query)) { $searchComponents[] = array('q', $query); } @@ -370,7 +382,7 @@ function standardSearchComponents($ss) { /** Quote a filter value, skipping it if it starts with a '[' (and hence is assumed - * to be a range) + * to be a range). Detect date range **/ function quoteFilterValue($v) { @@ -378,7 +390,11 @@ function quoteFilterValue($v) { return $v; } else { - return '"' . $v . '"'; + // Escape internal quotes before wrapping + // input: He said "hello, the output: He said \"hello + // $escaped = str_replace('"', '\\"', $v); + // String ready to Solr "He said \"hello" + return '"' . $this->lucene_escape_fq($v) . '"'; } } @@ -664,6 +680,9 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { } if ($val == 'stdnum') { + // Extract standard number from asis input + // Strips leading 0s. Captures digits, dashes, dots: 978-0-123-45678-9 + // e.g. 0000978-0-12-345678-9 → 978-0-12-345678-9 if (preg_match('/^\s*0*([\d\-\.]+[xX]?).*$/', $values['asis'], $match)) { $stdnum = $match[1]; // $stdnum = preg_replace('/[\.\-]/', '', $stdnum); @@ -675,7 +694,12 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { if (!isset($values[$val]) || ($values[$val] == "")) { continue; } - $sstring = $field . ':(' . $values[$val] . ')'; + // Lianet's notes: Escape the value for safe embedding in field:value syntax + + $escaped_value = $this->lucene_escape_literal($values[$val]); + $sstring = $field . ':(' . $escaped_value . ')'; + + // $sstring = $field . ':(' . $values[$val] . ')'; if (isset($weight) && $weight > 0) { $sstring .= '^' . $weight; } @@ -688,7 +712,7 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { /** - * Turn solr output into a record structure (which shouuld probably be its own class...) + * Turn solr output into a record structure (which should probably be its own class...) * * @param string $result The XML returned by solr * @param string $xslfile The path of the XSL file to use to convert the data @@ -700,6 +724,7 @@ function _process($result, $xslfile = 'xsl/solr-convert.xsl') { global $configArray; if (is_string($result) && preg_match('/^ to capture fuzzy searches like "hello world"~5 - matches a double-quoted string followed by ~ and a number + // "[^"]*" --> to capture exact phrases like "hello world" - matches a double-quoted string + // [^ ]+ --> to capture single words like hello - matches sequences of non-space characters preg_match_all('/"[^"]*"[~[0-9]+]*|"[^"]*"|[^ ]+/', $input, $words); $words = $words[0]; @@ -794,30 +823,159 @@ public function tokenizeInput($input) { return $fixedwords; } + public function remove_wildcards_add_beginning($input) { + // Ensure wildcards are not at beginning of input + // Performance guard, not a security guard. Prevent expensive queries (*table, ?table) + return substr($input, 1); + } + + public function remove_unbalanced_parentheses($input) { + // Ensure all parens match - parentheses balancing + // Prevents Solr parser errors. Deletes all parentheses instead of fixing structure + return str_replace(array('(', ')'), '', $input); + } + + public function remove_invalid_caret_usage($input) { + // Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + // Regular expression does not support ^1.5 + return str_replace('^', '', $input); + } + + /** + * If input matches the pattern: "phrase"*, + * return phrase* (quotes removed, wildcard preserved). + * Otherwise return null. + */ + public function unwrapQuotedWildcard(string $input): ?string { + // Match: optional whitespace + "..." + * + optional whitespace + // ^\s* --> leading whitespace + // " --> opening quote + // ([^"]+) --> capture group for any characters except quotes (the phrase) + // " --> closing quote + // \* --> literal asterisk + // \s*$ --> trailing whitespace + if (preg_match('/^\s*"([^"]+)"\*\s*$/u', $input, $matches)) { + return $matches[1] . '*'; + } + + return null; + } + /** * Input Validater * - * Cleanes the input based on the Lucene Syntax rules. + * Validate the input based on the Lucene Syntax rules. * * @param string $input User's input string - * @return string Fixed input + * @return string array{valid: bool, error?: string} * @access public */ + + // Lianet's notes: Verify if this function could be used to validate the Solr query public function validateInput($input) { + + print_r('*****************validateInput input****************'); + print_r($input); + // 1. Normalize + trim + $trimmed = trim($input); + + // 2. Empty input + if ($trimmed === '') { + return ['valid' => false, 'error' => 'Empty query']; + } + + // 3. Strip garbage-only input ~~//^&$ + if ($trimmed !== '' && !preg_match('/[\p{L}\p{N}]/u', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid garbage-only query']; + } + + // 4. Reject meaningless single-character input + if (mb_strlen($trimmed) === 1 && preg_match('/^[~\\\\]$/', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid single-character query']; + } + + // 5. No leading wildcard + // Ensure wildcards are not at beginning of input + // Performance guard, not a security guard. Prevent expensive queries (*table, ?table) + if ($trimmed[0] === '*' || $trimmed[0] === '?') { + return ['valid' => false, 'error' => 'Leading wildcard not allowed']; + } + + // 6. Balanced parentheses + // Ensure all parens match - parentheses balancing + // Prevents Solr parser errors. Deletes all parentheses instead of fixing structure + if (substr_count($trimmed, '(') !== substr_count($trimmed, ')')) { + return ['valid' => false, 'error' => 'Unbalanced parentheses']; + } + // 7. Valid boost syntax (^number or ^number.number) + // Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + // Regular expression does not support ^1.5 + if (preg_match('/\^/', $trimmed)) { + // Any caret must be followed by a valid numeric boost + if (!preg_match('/\^[0-9]+(\.[0-9]+)?/', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid boost syntax']; + } + } + + return ['valid' => true]; + + + // Ensure wildcards are not at beginning of input + // Performance guard, not a security guard. Prevent expensive queries (*table, ?table) + if ((substr($input, 0, 1) == '*') || + (substr($input, 0, 1) == '?')) { + return substr($input, 1); + } + + // Ensure all parens match - parentheses balancing + // Prevents Solr parser errors. Deletes all parentheses instead of fixing structure + $start = preg_match_all('/\(/', $input, $tmp); + $end = preg_match_all('/\)/', $input, $tmp); + if ($start != $end) { + return str_replace(array('(', ')'), '', $input); + } + + // Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + // Regular expression does not support ^1.5 + $cnt = preg_match_all('/\^/', $input, $tmp); + $matches = preg_match_all('/.+\^[0-9]/', $input, $tmp); + + if (($cnt) && ($cnt !== $matches)) { + return str_replace('^', '', $input); + } + + return $input; + } + + /** + * Input Validater + * + * Validate the input based on the Lucene Syntax rules. + * + * @param string $input User's input string + * @return string Fixed input + * @access public + */ + + // Lianet's notes: Verify if this function could be used to validate the Solr query + public function validateInputOLD($input) { // Ensure wildcards are not at beginning of input + // Performance guard, not a security guard. Prevent expensive queries (*table, ?table) if ((substr($input, 0, 1) == '*') || (substr($input, 0, 1) == '?')) { return substr($input, 1); } - // Ensure all parens match + // Ensure all parens match - parentheses balancing + // Prevents Solr parser errors. Deletes all parentheses instead of fixing structure $start = preg_match_all('/\(/', $input, $tmp); $end = preg_match_all('/\)/', $input, $tmp); if ($start != $end) { return str_replace(array('(', ')'), '', $input); } - // Ensure ^ is used properly + // Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + // Regular expression does not support ^1.5 $cnt = preg_match_all('/\^/', $input, $tmp); $matches = preg_match_all('/.+\^[0-9]/', $input, $tmp); @@ -863,41 +1021,63 @@ function exactmatcherify($str) { * * Given a lookfor string, clean it up, tokenize it, and * return a structure that includes AND, OR, and Phrase - * queries. + * queries. lookfor could be single or multi-word * * @param string $lookfor User's search string * @return array $values Includes 'and', 'or', and 'onephrase' elements * @access public */ - + // Lianet's notes: Check if is necessary to remove illegal characters + // TODO: Refactoring this function to avoid the different output public function build_and_or_onephrase($lookfor = null) { $values = array(); - $illegal = array('.', '{', '}', '/', '!', ':', ';', '[', ']', '(', ')', '+ ', '&', '- '); - $lookfor = trim(str_replace($illegal, '', $lookfor)); + $validation = $this->validateInput($lookfor); + if (!$validation['valid']) { + // Considering the logic of updating the user input query as the application is doing now + + switch ($validation['error']) { + case 'Empty query': + return false; + case 'Invalid garbage-only query': + return false; + case 'Invalid single-character query': + return false; + case 'Leading wildcard not allowed': + $lookfor = $this->remove_wildcards_add_beginning($lookfor); + break; + case 'Unbalanced parentheses': + $lookfor = $this->remove_unbalanced_parentheses($lookfor); + break; + case 'Invalid boost syntax': + $lookfor = $this->remove_invalid_caret_usage($lookfor); + break; + } + + } // Replace fancy quotes $lookfor = str_replace(array('“', '”'), '"', $lookfor); // If it looks like "..."*, pull out the quotes - - if (preg_match('/^\s*"(.*)"\*\s*$/', $lookfor, $match)) { - $em = $match[1]; - $lookfor = $em . '*'; - // $em = $this->exactmatcherify($em) . '*'; - // return array('exactmatcher' => $em, 'emstartswith' => $em, 'asis' => $lookfor); - } + $unwrapped = $this->unwrapQuotedWildcard($lookfor); + if ($unwrapped !== null) { + $lookfor = $unwrapped; +} // Validate input - $lookfor = $this->validateInput($lookfor); + //$lookfor = $this->validateInput($lookfor); - if (!preg_match('/\S/', $lookfor)) { - return false; - } + //if (!preg_match('/\S/', $lookfor)) { + // return false; + //} // Tokenize Input $tokenized = $this->tokenizeInput($lookfor); + //Lianet's note: Escape here!!!!!!! + + $values['onephrase'] = '"' . preg_replace('/"/', '', implode(' ', $tokenized)) . '"'; $values['and'] = implode(' AND ', $tokenized); $values['or'] = implode(' OR ', $tokenized); @@ -905,6 +1085,7 @@ public function build_and_or_onephrase($lookfor = null) { $values['compressed'] = preg_replace('/\s/', '', $lookfor); $values['exactmatcher'] = $this->exactmatcherify($lookfor); $values['emstartswith'] = $values['exactmatcher'] . '*'; + return $values; } @@ -918,7 +1099,7 @@ public function build_and_or_onephrase($lookfor = null) { **/ function solrSearch($args, $action = 'standard') { - $raw = $this->rawSolrSearch($args, $action); + $raw = $this->rawSolrSearch($args, $action); // This is the Solr output if (!PEAR::isError($raw)) { $processed = $this->_process($raw); @@ -956,6 +1137,8 @@ protected function set_proper_action($action, $args) { return $action; } } + // Ensure a non-NULL return from non-edismax cases + return $action; } // Do we just want the IDs? Spit 'em out! @@ -1006,12 +1189,18 @@ function rawSolrSearch($args, $action = 'standard') { $this->solr_connection->add($args); + print_r('======================'); + print_r($args); + # Just want a list of IDs? Produce it and die if (isset($_REQUEST['htid_list'])) { $this->print_out_list_of_ids($args); die(); } + error_log("Solr action used: " . $action); + echo "Solr action used: " . $action; + # Finally, we can deal with the normal case return $this->solr_connection->send(); } @@ -1112,13 +1301,71 @@ function mltesc($str) { return str_replace(array('(', ')','[', ']', '!', '&', ':', ';', '-', '/', '"'), '', $str); } - - function lucene_escape($str) { + // TODO: Check this function for correctness + function lucene_escape($str) { $pattern = '/(\+|-|&&|\|\||!|\(|\)|\{|}|\[|]|\^|"|~|\*|\?|:|\\\)/'; $replace = '\\\$1'; return preg_replace($pattern, $replace, $str); } + + /** + * Strict escape for filter query values and explicit field:value fragments. + + * Use for all fq values and any time you construct field:value with user data. + * This function: + * - Normalizes Unicode to NFC form + * - Removes control characters + * - Escapes backslash FIRST (critical ordering) + * - Escapes multi-char tokens (&&, ||) + * - Escapes all Lucene special characters: + - ! ( ) { } [ ] ^ " ~ * ? : / + * + * @param string $s Raw user input or value to escape + * @return string Safely escaped value for use in Solr fq or field:value + */ + public function lucene_escape_fq(string $s): string { + // Normalize Unicode to composed form (NFC) + if (function_exists('normalizer_normalize')) { + $s = normalizer_normalize($s, Normalizer::FORM_C) ?: $s; + } + + // Remove control characters (0x00-0x1F, 0x7F) + $s = preg_replace('/[\x00-\x1F\x7F]/u', '', $s); + + // Escape backslash FIRST to avoid double-escaping + $s = str_replace('\\', '\\\\', $s); + + // Escape Lucene special characters character-by-character + // $specials = ['+', '-', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '/', '&', '|']; + // foreach ($specials as $c) { + // $s = str_replace($c, '\\' . $c, $s); + // } + + // Use regex to catch all specials, including spaces, in one go + // The characters are: + - && || ! ( ) { } [ ] ^ " ~ * ? : / + // Note: && and || are handled as single chars & and | here + $pattern = '/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\/\&\|])/'; + + return preg_replace($pattern, '\\\\$1', $s); + } + // Less strict escape for general query string values + // '~', '*', '?', + function lucene_escape_literal(string $s): string { + // Escape backslash first + $s = str_replace('\\', '\\\\', $s); + + $specials = [ + '+', '-', '!', '(', ')', '{', '}', '[', ']', + '^', '"', ':', '/', '&', '|' + ]; + + foreach ($specials as $c) { + $s = str_replace($c, '\\' . $c, $s); + } + + return $s; + } + function getMoreLikeThis($record, $id, $max = 5) { global $configArray; @@ -1168,7 +1415,7 @@ function getMoreLikeThis($record, $id, $max = 5) { $query .= ') NOT id:(' . $id . ')'; $ss = new SearchStructure(true); // create a "blank" ss with just the filter queries - + $args = array_merge(array(array('q', $query)), $this->filterComponents($ss)); return $this->solrSearch($args); } diff --git a/test/SolrQueryTest/BuildAndOrOnePhraseTest.php b/test/SolrQueryTest/BuildAndOrOnePhraseTest.php new file mode 100644 index 00000000..25ab5d5c --- /dev/null +++ b/test/SolrQueryTest/BuildAndOrOnePhraseTest.php @@ -0,0 +1,103 @@ +solr = new Solr('', ''); + } + + /** + * @covers Solr::build_and_or_onephrase + * ~ --> reject, then return False + */ + public function testRejectsSingleTilde() + { + $this->assertFalse( + $this->solr->build_and_or_onephrase('~'), + 'Single tilde should be rejected' + ); + } + + /** + * @covers Solr::build_and_or_onephrase + * \\ --> reject, then return False + */ + public function testRejectsSingleBackslash() + { + $this->assertFalse( + $this->solr->build_and_or_onephrase('\\'), + 'Single backslash should be rejected' + ); + } + + /** + * @covers Solr::build_and_or_onephrase + * table~2 --> accepted fuzzy search, then create the query + */ + public function testAllowsFuzzyTerm() + { + $result = $this->solr->build_and_or_onephrase('table~2'); + + $this->assertIsArray($result); + $this->assertEquals('"table~2"', $result['onephrase']); + $this->assertEquals('table~2', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * "table"~2 --> accepted fuzzy search, then create the query + */ + public function testAllowsQuotedFuzzyPhrase() + { + $result = $this->solr->build_and_or_onephrase('"table"~2'); + + $this->assertIsArray($result); + $this->assertEquals('"table~2"', $result['onephrase']); + $this->assertEquals('"table"~2', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * "table" --> accepted one word query, then create the query + */ + public function testAllowsNormalTerm() + { + $result = $this->solr->build_and_or_onephrase('table'); + + $this->assertIsArray($result); + $this->assertEquals('"table"', $result['onephrase']); + $this->assertEquals('table', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * * --> accepted wildcard search, then create the query + */ + public function testAllowsWildcardTerm() + { + $result = $this->solr->build_and_or_onephrase('table*'); + + $this->assertIsArray($result); + $this->assertEquals('"table*"', $result['onephrase']); + $this->assertEquals('table*', $result['asis']); + } +} +?> \ No newline at end of file diff --git a/test/SolrQueryTest/SolrEscapingTest.php b/test/SolrQueryTest/SolrEscapingTest.php new file mode 100644 index 00000000..37de714f --- /dev/null +++ b/test/SolrQueryTest/SolrEscapingTest.php @@ -0,0 +1,111 @@ +lucene_escape_fq($input); + // Should be: \\ (escaped backslash) + \&\& (escaped &&) + $this->assertEquals('\\\\\\&\\&', $result); + } + + /** + * Test all 19 special characters individually + * @covers Solr::lucene_escape_fq + */ + public function test_all_special_chars(): void + { + $solr = new Solr('', ''); + + $specials = [ + ['\\', '\\\\'], + ['&&', '\\&\\&'], + ['||', '\\|\\|'], + ['+', '\\+'], + ['-', '\\-'], + ['!', '\\!'], + ['(', '\\('], + [')', '\\)'], + ['{', '\\{'], + ['}', '\\}'], + ['[', '\\['], + [']', '\\]'], + ['^', '\\^'], + ['"', '\\"'], + ['~', '\\~'], + ['*', '\\*'], + ['?', '\\?'], + [':', '\\:'], + ['/', '\\/'] + ]; + + foreach ($specials as [$input, $expected]) { + $this->assertEquals($expected, $solr->lucene_escape_fq($input), + "Failed to escape: $input"); + } + } + + /** + * Test production failure cases from error logs + * @covers Solr::lucene_escape_fq + */ + public function test_production_failure_cases(): void + { + $solr = new Solr('', ''); + + $cases = [ + // From actual error logs + ['\\', '\\\\'], + ['"\\', '\\"\\\\'], // Quote + backslash + ['C:\\Program Files', 'C\\:\\\\Program Files'], + ['foo~bar', 'foo\\~bar'], + ['{!term}', '\\{\\!term\\}'], + ]; + + foreach ($cases as [$input, $expected]) { + $this->assertEquals($expected, $solr->lucene_escape_fq($input), + "Failed production case: $input"); + } + } + + /** + * Test that empty string is preserved + * @covers Solr::lucene_escape_fq + */ + public function test_empty_string(): void + { + $solr = new Solr('', ''); + $this->assertEquals('', $solr->lucene_escape_fq('')); + } + + /** + * Test normal text is not modified + * @covers Solr::lucene_escape_fq + */ + public function test_normal_text_unchanged(): void + { + $solr = new Solr('', ''); + $this->assertEquals('hello world', $solr->lucene_escape_fq('hello world')); + $this->assertEquals('abc123', $solr->lucene_escape_fq('abc123')); + } + +} +?> \ No newline at end of file diff --git a/test/SolrQueryTest/SolrQueryFullPipeline.php b/test/SolrQueryTest/SolrQueryFullPipeline.php new file mode 100644 index 00000000..0aa96820 --- /dev/null +++ b/test/SolrQueryTest/SolrQueryFullPipeline.php @@ -0,0 +1,100 @@ +assertNotEmpty($ss->search); + $this->assertEquals('title', $ss->search[0][0]); + $this->assertEquals("\\", $ss->search[0][1]); + + // Check cleaned_up_original_search + $this->assertNotEmpty($ss->cleaned_up_original_search); + $this->assertEquals('title', $ss->cleaned_up_original_search[0][0]); + $this->assertEquals('\\', $ss->cleaned_up_original_search[0][1]); + + // Verify backslash is preserved (not removed or double-escaped) + $this->assertStringContainsString('\\', $ss->search[0][1]); + + // ---------------test_backslash_through_full_pipeline--------------- + + // 1. Start with raw input + $rawInput = '\\'; + fwrite(STDOUT, "1. Raw input: '$rawInput' (len=" . strlen($rawInput) . ")\n"); + + // 2. Through SearchStructure + $afterSS = $ss->search[0][1]; + fwrite(STDOUT, "2. After SearchStructure: '$afterSS' (len=" . strlen($afterSS) . ")\n"); + + // 3. Through Solr (dismax) + + $solr = new Solr('', ''); + // (titleProper:(*)^8000 OR titleProper:("\")^1200 OR titleProper:(\)^120 OR title_topProper:("\")^600 OR title_topProper:(\)^60 OR title_restProper:("\")^400 OR title_restProper:(\)^40 OR series:("\")^500 OR series:(\)^50 OR series2:("\")^500 OR series2:(\)^50 OR title:(\)^30 OR title_top:(\)^20 OR title_rest:(\)^1)) + $args = $solr->dismaxSearchArguments($ss); + // args[0][0] is q + fwrite(STDOUT, "Solr args field: " . print_r($args[0][0], true) . "\n"); + fwrite(STDOUT, "*******************************: \n"); + // args[0][1] is the query string + fwrite(STDOUT, "Solr args: " . print_r($args[0][1], true) . "\n"); + $afterSolr = $args[0][1]; + fwrite(STDOUT, "3. After Solr dismax: '$afterSolr' (len=" . strlen($afterSolr) . ")\n"); + + // 4. Check each stage + $this->assertEquals(1, strlen($rawInput), "Raw input should be 1 char"); + $this->assertEquals(1, strlen($afterSS), "After SS should still be 1 char"); + + // The Solr query might legitimately escape it - that's what we're testing + fwrite(STDOUT, "4. Hex dump of Solr query: " . bin2hex($afterSolr) . "\n"); + + } + // TODO Ask Moses how to add covers to Home.php????? + /** + * Testing that serialization to and from cookie preserves backslashes correctly + * @runInSeparateProcess + */ + public function test_cookie_serialization_escaping(): void + { + // $original contains one character, PHP source code uses escaping to represent it. + // After parsing the runtime value is \ + $original = "\\"; + + // Simulate what happens with cookie + // serialization shows 'a:1:{s:7:"lookfor";s:1:"\";}' + $serialized = serialize(['lookfor' => $original]); + $unserialized = unserialize($serialized); + + fwrite(STDOUT, "Original: '$original' (len=" . strlen($original) . ")\n"); + fwrite(STDOUT, "Serialized: '$serialized'\n"); + fwrite(STDOUT, "Unserialized: '{$unserialized['lookfor']}' (len=" . strlen($unserialized['lookfor']) . ")\n"); + + // Cookie serialization is not altering the value at all + $this->assertEquals($original, $unserialized['lookfor'], + "Cookie serialization should not change the value"); + } + + +} + +?> \ No newline at end of file diff --git a/test/SolrQueryTest/TokenizeInputTest.php b/test/SolrQueryTest/TokenizeInputTest.php new file mode 100644 index 00000000..5d0b0b53 --- /dev/null +++ b/test/SolrQueryTest/TokenizeInputTest.php @@ -0,0 +1,124 @@ +solr = new Solr('', ''); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesSimpleWords() + { + $tokens = $this->solr->tokenizeInput('table chair'); + + $this->assertSame( + ['table', 'chair'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesQuotedPhrase() + { + $tokens = $this->solr->tokenizeInput('"table chair"'); + + $this->assertSame( + ['"table chair"'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesFuzzyPhrase() + { + $tokens = $this->solr->tokenizeInput('"table chair"~2'); + + $this->assertSame( + ['"table chair"~2'], + $tokens, + 'Fuzzy phrase must be a single token' + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesUnquotedFuzzyTerm() + { + $tokens = $this->solr->tokenizeInput('table~2'); + + $this->assertSame( + ['table~2'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesMixedInput() + { + $tokens = $this->solr->tokenizeInput('table "chair leg"~3 desk'); + + $this->assertSame( + ['table', '"chair leg"~3', 'desk'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesBooleanOperators() + { + $tokens = $this->solr->tokenizeInput('table AND "chair leg"~2'); + + $this->assertSame( + ['table AND "chair leg"~2'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesMultipleFuzzyPhrases() + { + $tokens = $this->solr->tokenizeInput('"table chair"~2 "wood table"~1'); + + $this->assertSame( + ['"table chair"~2', '"wood table"~1'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testDoesNotSplitQuotedPhraseWithSpaces() + { + $tokens = $this->solr->tokenizeInput('"a b c"~4'); + + $this->assertSame( + ['"a b c"~4'], + $tokens + ); + } +} +?> \ No newline at end of file diff --git a/test/SolrTest.php b/test/SolrTest.php index 2cc70b1b..9938aa98 100644 --- a/test/SolrTest.php +++ b/test/SolrTest.php @@ -23,6 +23,44 @@ public function test_exactmatcherify(): void $this->assertEquals('*?', $solr->exactmatcherify('!@#$%^&*()-=_+,.<>/?')); $this->assertEquals('日本', $solr->exactmatcherify('日本')); } + + /** + * @covers Solr::quoteFilterValue + */ + public function test_quoteFilterValue_escapes_internal_quotes(): void + { + $solr = new Solr('', ''); + + // Test normal value + $result = $solr->quoteFilterValue('Smith, John'); + $this->assertEquals('"Smith, John"', $result); + + // Test value with quotes + $result = $solr->quoteFilterValue('"Kao gu yu wen wu" bian ji bu'); + $this->assertEquals('"\\"Kao gu yu wen wu\\" bian ji bu"', $result); + + // Test value with backslash + $result = $solr->quoteFilterValue('\Kao gu yu wen wu bian ji bu'); + $this->assertEquals('"\\\\Kao gu yu wen wu bian ji bu"', $result); + + // Test date range (should not be quoted) + $result = $solr->quoteFilterValue('[1900 TO 2000]'); + $this->assertEquals('[1900 TO 2000]', $result); + } + + /** + * @covers Solr::quoteFilterValue + */ + public function testUnwrapQuotedWildcard(): void + { + $solr = new Solr('', ''); + + $this->assertSame('table*', $solr->unwrapQuotedWildcard('"table"*')); + $this->assertSame('machine learning*', $solr->unwrapQuotedWildcard('"machine learning"*')); + $this->assertNull($solr->unwrapQuotedWildcard('"table"')); + $this->assertNull($solr->unwrapQuotedWildcard('table*')); + $this->assertNull($solr->unwrapQuotedWildcard('"*"')); + } } ?>