Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions services/Search/SearchStructure.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ static function fromHash($hash)
$c = __CLASS__;
$obj = new $c(true);
$obj->_fillFromHash($hash);

echo "\n";
print_r($obj);
echo "\n";
return $obj;
}

Expand Down Expand Up @@ -231,6 +235,8 @@ private function _fillFromHash($hash)
} elseif (count($ss) == 1) {

$index = $ss[0][0];
print_r('^^^^^^^^^^^');
print_r($index);
if (isset($this->force_standard[$index]) && $this->force_standard[$index]) {
$this->use_dismax = false;
}
Expand Down Expand Up @@ -909,6 +915,9 @@ static function displayStrip($v)
}
return "Between $start and $end";
} else {
// Remove square brackets and double quote from facet display.
// Dec 2025 Note: this is just a UI string, so this replacement should be unnecessary.
// return $v;
return preg_replace('/[\[\]\"]/', '', $v);
}

Expand Down
130 changes: 114 additions & 16 deletions sys/Solr.php
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) {
$limit = isset($_REQUEST['pagesize']) ? $_REQUEST['pagesize'] : $configArray['Site']['itemsPerPage'];
}


error_log("SearchStructure: " . print_r($ss) . "\n");
echo "\n";
print_r($ss);
echo "\n";

// The initial query
if ($ss->use_dismax) {
$ss->action = 'edismax';
Expand All @@ -129,12 +133,12 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) {
$args = array_merge($args, $this->spellcheckComponents($ss));
}


if ($raw) {
return $this->rawSolrSearch($args, $action);
}

// Otherwise...
print_r('22222222222222222222222222222222 ');
$rv = $this->solrSearch($args, $action);
return $rv;
}
Expand Down Expand Up @@ -230,7 +234,8 @@ function facetlist($ss, $fields, $sort = 'index', $skip = 0, $limit = 20) {
$values = $body['facet_counts']['facet_fields'][$field];
$rv['values'][$field] = array();

// skip the hidden ones
// Filter out facet values that match the hidden pattern defined on config.ini.
// e.g. skip hlbgeneral = "hlb_both:^U\.S\. National and" to hide "U.S. National..." facets
foreach ($values as $valcnt) {
if (isset($hide, $hide[$field])) {
foreach ($hide[$field] as $regexp) {
Expand Down Expand Up @@ -275,19 +280,25 @@ function dismaxSearchArguments($ss) {
// Should just be on "lookfor" and "type"
$tvb = isset($ss->search[0]) ? $ss->search[0] : array('all', '*:*');
$type = $tvb[0];
print_r('**************type****************');
print_r($type);
print_r('******************************');
// $value is the search string
$value = $tvb[1];

// If search is empty/whitespace-only, default to *:* (match-all)
if (!preg_match('/\S/', $value)) {
$value = '*:*';
}
// Get the yaml file
$allspecs = yaml_parse_file('conf/dismaxsearchspecs.yaml');

// If the type isn't set, back up to normal arguments
// Lianet's notes: $type comes from the first search clause and must be a key in conf/dismaxsearchspecs.yaml; when it is not, fall back to the args built by searchArguments().

if (!isset($allspecs[$type])) {
$args = $this->searchArguments($ss);
// print_r($args);
print_r('***************I am here***************');
return $args;
}

Expand All @@ -309,6 +320,12 @@ function dismaxSearchArguments($ss) {
$rv[] = array('qt', 'edismax');
$rv[] = array('mm', $this->mm($spec, $ss));

echo "\n";
print_r('******************************');
print_r($rv);
print_r('******************************');
echo "\n";

return array_merge($rv, $this->filterComponents($ss), $this->sortComponents($ss));
}

Expand All @@ -334,10 +351,11 @@ function standardSearchComponents($ss) {

$searchComponents = array();

// Lianet's notes: conf/searchspecs is the config used to build the Solr query.
$specs = yaml_parse_file('conf/searchspecs.yaml');
$query = '';

foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolen AND or OR
foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolean AND or OR
$type = $tvb[0];
$values = $this->build_and_or_onephrase($tvb[1]);
$bool = isset($tvb[2]) ? $tvb[2] : false;
Expand All @@ -356,8 +374,8 @@ function standardSearchComponents($ss) {
}
$query .= "id:(" . implode(' OR ', $ss->extraIDs()) . ')';
}

$ids = $this->tagIDs($ss);
// Check if the query has content, otherwise use *:* to match all
if (preg_match('/\S/', $query)) {
$searchComponents[] = array('q', $query);
}
Expand All @@ -370,15 +388,19 @@ function standardSearchComponents($ss) {


/**
 * Quote a filter value so it can be used as a Solr phrase.
 *
 * A value that starts with '[' is assumed to be a range expression (for
 * example a date range) and is returned untouched. Any other value is wrapped
 * in double quotes after escaping embedded backslashes and double quotes, so
 * the value cannot terminate the phrase early.
 *
 * Example: He said "hello  ->  "He said \"hello"
 *
 * @param string $v Raw filter value
 * @return string The range expression as-is, or the escaped, quoted phrase
 **/

function quoteFilterValue($v) {
    // Range expressions must not be quoted, or Solr treats them as literal text.
    if (preg_match('/^\[/', $v)) {
        return $v;
    }

    // Single-pass escape: backslash and double quote are the only characters
    // that are significant inside a quoted phrase. strtr() never re-scans its
    // own output, so the inserted backslashes are not escaped a second time.
    $escaped = strtr($v, array('\\' => '\\\\', '"' => '\\"'));

    return '"' . $escaped . '"';
}

Expand Down Expand Up @@ -664,6 +686,9 @@ private function __buildQueryString($structure, $values, $joiner = "OR") {
}

if ($val == 'stdnum') {
// Extract standard number from asis input
// Strips leading 0s. Captures digits, dashes, dots: 978-0-123-45678-9
// e.g. 0000978-0-12-345678-9 → 978-0-12-345678-9
if (preg_match('/^\s*0*([\d\-\.]+[xX]?).*$/', $values['asis'], $match)) {
$stdnum = $match[1];
// $stdnum = preg_replace('/[\.\-]/', '', $stdnum);
Expand All @@ -675,6 +700,7 @@ private function __buildQueryString($structure, $values, $joiner = "OR") {
if (!isset($values[$val]) || ($values[$val] == "")) {
continue;
}
// Lianet's notes: Escape the value for safe embedding in field:value syntax
$sstring = $field . ':(' . $values[$val] . ')';
if (isset($weight) && $weight > 0) {
$sstring .= '^' . $weight;
Expand All @@ -688,7 +714,7 @@ private function __buildQueryString($structure, $values, $joiner = "OR") {


/**
* Turn solr output into a record structure (which shouuld probably be its own class...)
* Turn solr output into a record structure (which should probably be its own class...)
*
* @param string $result The XML returned by solr
* @param string $xslfile The path of the XSL file to use to convert the data
Expand All @@ -700,6 +726,7 @@ function _process($result, $xslfile = 'xsl/solr-convert.xsl') {
global $configArray;

if (is_string($result) && preg_match('/^<html/', $result)) {
// Detect if Solr returns an error page
if (preg_match('/ParseException/', $result)) {
$errorMsg = "Error+in+search+syntax";
}
Expand Down Expand Up @@ -757,6 +784,7 @@ function _process($result, $xslfile = 'xsl/solr-convert.xsl') {
* @return array Tokenized array
* @access public
*/
// TODO: refactor tokenizer to a single-pass parser instead of regex
public function tokenizeInput($input) {
// Tokenize on spaces and quotes
//preg_match_all('/"[^"]*"|[^ ]+/', $input, $words);
Expand Down Expand Up @@ -803,6 +831,8 @@ public function tokenizeInput($input) {
* @return string Fixed input
* @access public
*/

// Lianet's notes: Verify if this function could be used to validate the Solr query
public function validateInput($input) {
// Ensure wildcards are not at beginning of input
if ((substr($input, 0, 1) == '*') ||
Expand Down Expand Up @@ -863,18 +893,43 @@ function exactmatcherify($str) {
*
* Given a lookfor string, clean it up, tokenize it, and
* return a structure that includes AND, OR, and Phrase
* queries.
* queries. lookfor could be single or multi-word
*
* @param string $lookfor User's search string
* @return array $values Includes 'and', 'or', and 'onephrase' elements
* @access public
*/

// Lianet's notes: Check whether it is necessary to remove illegal characters
// TODO: Refactor this function so that it does not return different kinds of output
public function build_and_or_onephrase($lookfor = null) {
$values = array();

// Remove these characters only if the string contains nothing but these characters (e.g. len == 1).
// If the input contains both words and these characters, escape them instead.
// Deleting all of these characters destroys the user's intent without notifying the user, so
// meaningless single-character input is blocked explicitly.
$illegal = array('.', '{', '}', '/', '!', ':', ';', '[', ']', '(', ')', '+ ', '&', '- ');
$lookfor = trim(str_replace($illegal, '', $lookfor));
// $lookfor = trim(str_replace($illegal, '', $lookfor));

// Reject the input if it is exactly one character and that character is ~ or \ (backslash)
// ~ alone is invalid Lucene syntax and causes a parsing error
// \ alone is syntactically incorrect and causes 500 errors because Lucene expects it to escape something
//if (mb_strlen(trim($lookfor)) === 1 && preg_match('/^[~\\\\]$/', $lookfor)) {
// return false;
//}

// 1. Normalize + trim
$lookfor = trim($lookfor);

// 2. Strip garbage-only input
if ($lookfor !== '' && !preg_match('/[\p{L}\p{N}]/u', $lookfor)) {
$lookfor = trim(str_replace($illegal, '', $lookfor));
}

// 3. Reject invalid single-char input
if (mb_strlen($lookfor) === 1 && preg_match('/^[~\\\\]$/', $lookfor)) {
return false;
}

// Replace fancy quotes
$lookfor = str_replace(array('“', '”'), '"', $lookfor);
Expand All @@ -897,14 +952,14 @@ public function build_and_or_onephrase($lookfor = null) {

// Tokenize Input
$tokenized = $this->tokenizeInput($lookfor);

$values['onephrase'] = '"' . preg_replace('/"/', '', implode(' ', $tokenized)) . '"';
$values['and'] = implode(' AND ', $tokenized);
$values['or'] = implode(' OR ', $tokenized);
$values['asis'] = $lookfor;
$values['compressed'] = preg_replace('/\s/', '', $lookfor);
$values['exactmatcher'] = $this->exactmatcherify($lookfor);
$values['emstartswith'] = $values['exactmatcher'] . '*';

return $values;
}

Expand All @@ -918,7 +973,7 @@ public function build_and_or_onephrase($lookfor = null) {
**/

function solrSearch($args, $action = 'standard') {
$raw = $this->rawSolrSearch($args, $action);
$raw = $this->rawSolrSearch($args, $action); // This is the Solr output
if (!PEAR::isError($raw)) {
$processed = $this->_process($raw);

Expand Down Expand Up @@ -956,6 +1011,8 @@ protected function set_proper_action($action, $args) {
return $action;
}
}
// Ensure a non-NULL return from non-edismax cases
return $action;
}

// Do we just want the IDs? Spit 'em out!
Expand Down Expand Up @@ -1006,12 +1063,18 @@ function rawSolrSearch($args, $action = 'standard') {

$this->solr_connection->add($args);

print_r('======================');
print_r($args);

# Just want a list of IDs? Produce it and die
if (isset($_REQUEST['htid_list'])) {
$this->print_out_list_of_ids($args);
die();
}

error_log("Solr action used: " . $action);
echo "Solr action used: " . $action;

# Finally, we can deal with the normal case
return $this->solr_connection->send();
}
Expand Down Expand Up @@ -1112,13 +1175,48 @@ function mltesc($str) {
return str_replace(array('(', ')','[', ']', '!', '&', ':', ';', '-', '/', '"'), '', $str);
}


/**
 * Backslash-escape the Lucene query-parser special characters in a string.
 *
 * Escaped tokens: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ /
 * "&&" and "||" are escaped as a unit (one backslash in front of the pair);
 * a lone "&" or "|" is left untouched. The forward slash is included because
 * the Lucene query parser treats it as a regular-expression delimiter.
 *
 * @param string $str Raw text
 * @return string Text with each special token prefixed by a backslash
 */
function lucene_escape($str) {
    // Two-character operators are listed first so they win over the character class.
    $pattern = '/&&|\|\||[+\-!(){}\[\]^"~*?:\\\\\/]/';
    // '\\\\$0' is the replacement "\\$0": a literal backslash followed by the match.
    return preg_replace($pattern, '\\\\$0', $str);
}

/**
 * Strict escape for filter query values and explicit field:value fragments.
 *
 * Use for all fq values and any time you construct field:value with user data.
 * This function:
 *   - Normalizes Unicode to NFC form (only when normalizer_normalize() exists)
 *   - Removes ASCII control characters (0x00-0x1F, 0x7F)
 *   - Prefixes each of the following characters with a backslash, in a single
 *     pass: \ + - ! ( ) { } [ ] ^ " ~ * ? : / & |
 *     (every "&" and "|" is escaped individually, so "&&" becomes "\&\&")
 *
 * Input that is not valid UTF-8 is still escaped byte-wise rather than being
 * turned into an empty string.
 *
 * @param string $s Raw user input or value to escape
 * @return string Safely escaped value for use in Solr fq or field:value
 */
function lucene_escape_fq(string $s): string {
    // Normalize Unicode to composed form (NFC); keep the input when normalization fails.
    if (function_exists('normalizer_normalize')) {
        $s = normalizer_normalize($s, Normalizer::FORM_C) ?: $s;
    }

    // Remove control characters. No /u modifier: these are all single ASCII
    // bytes, which never occur inside a multi-byte UTF-8 sequence, and with /u
    // preg_replace() returns null for malformed UTF-8.
    $s = preg_replace('/[\x00-\x1F\x7F]/', '', $s) ?? $s;

    // Single-pass escape. Because the string is scanned once, backslashes added
    // for other characters are never escaped again.
    return addcslashes($s, '\\+-!(){}[]^"~*?:/&|');
}

function getMoreLikeThis($record, $id, $max = 5) {
global $configArray;

Expand Down Expand Up @@ -1168,7 +1266,7 @@ function getMoreLikeThis($record, $id, $max = 5) {
$query .= ') NOT id:(' . $id . ')';

$ss = new SearchStructure(true); // create a "blank" ss with just the filter queries

$args = array_merge(array(array('q', $query)), $this->filterComponents($ss));
return $this->solrSearch($args);
}
Expand Down
Loading