Skip to content

Commit 62dbff0

Browse files
committed
Refactored concordance method
Separated out of the concordance method the code block that finds the needle positions and the code block that extracts an excerpt of the text. Also added an option to mark the needle in the returned text.
1 parent ec43479 commit 62dbff0

File tree

1 file changed

+22
-53
lines changed

1 file changed

+22
-53
lines changed

src/Corpus/TextCorpus.php

Lines changed: 22 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -77,71 +77,44 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
7777
* @param int $contextLength The amount of space left and right of the found needle
7878
* @param bool $ignorecase
7979
* @param string $position Available options: contain, begin, end, equal.
80+
* @param bool $mark Option to mark the needle
8081
* @return array
8182
*/
82-
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
83+
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
8384
{
8485
// temporary solution to handle unicode chars
85-
$this->text = utf8_decode($this->text);
86+
$text = utf8_decode($this->text);
87+
$text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $text));
8688
$needle = utf8_decode($needle);
87-
88-
$found = [];
89-
$text = ' ' . trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text)) . ' ';
9089
$needleLength = strlen($needle);
91-
$textLength = strlen($text);
92-
$bufferLength = $needleLength + 2 * $contextLength;
93-
94-
// \p{L} or \p{Letter}: any kind of letter from any language.
95-
96-
$special_chars = "\/\-_\'";
97-
$word_part = '\p{L}'.$special_chars;
98-
99-
switch ($position) {
100-
case 'equal':
101-
$pattern = "/[^$word_part]($needle)[^$word_part]/";
102-
break;
103-
case 'begin':
104-
$pattern = "/[^$word_part]($needle)[$special_chars]?[\p{L}]*|^($needle)/";
105-
break;
106-
case 'end':
107-
$pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)[^$word_part]/";
108-
break;
109-
case 'contain':
110-
$pattern = "/($needle)/";
111-
break;
112-
default:
113-
$pattern = "/($needle)/";
114-
break;
115-
}
90+
$found = [];
11691

117-
$case = $ignorecase ? 'i' : '';
118-
preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
92+
$positions = $this->concordancePositions($text, $needle, $contextLength, $ignorecase, $position);
11993

12094
// Getting excerpts
121-
foreach($matches[1] as $match) {
122-
123-
$needlePosition = $match[1];
124-
$left = max($needlePosition - $contextLength, 0);
95+
foreach($positions as $needlePosition) {
96+
//marking the term
97+
$text_marked = ($mark) ? Text::markString($text, $needlePosition, $needleLength, ['{{','}}']) : $text;
98+
$needleLength_marked = ($mark) ? $needleLength+4 : $needleLength;
12599

126-
if($needleLength + $contextLength + $needlePosition > $textLength) {
127-
$tmp = substr($text, $left);
128-
} else {
129-
$tmp = substr($text, $left, $bufferLength);
130-
}
131-
$found[] = utf8_encode($tmp);
100+
$found[] = utf8_encode(Text::getExcerpt($text_marked, $needlePosition, $needleLength_marked, $contextLength));
132101
}
133102

134103
return $found;
135104
}
136105

137-
public function occurrences(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
106+
/**
107+
* Return all positions of the needle in the text according to the position of the needle in a word.
108+
* @param string $text
109+
* @param string $needle
110+
* @param int $contextLength The amount of space left and right of the found needle
111+
* @param bool $ignorecase
112+
* @param string $position Available options: contain, begin, end, equal.
113+
* @return array
114+
*/
115+
public function concordancePositions(string $text, string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
138116
{
139-
// temporary solution to handle unicode chars
140-
$text = utf8_decode($this->text);
141-
$needle = utf8_decode($needle);
142-
143117
$found = [];
144-
$text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $text));
145118
$needleLength = strlen($needle);
146119
$textLength = strlen($text);
147120
$bufferLength = $needleLength + 2 * $contextLength;
@@ -173,11 +146,7 @@ public function occurrences(string $needle, int $contextLength = 20, bool $ignor
173146
preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
174147
$positions = array_column($matches[1], 1);
175148

176-
$excerpts = array_map(function($needlePos) use ($needleLength, $text, $contextLength, $mark) {
177-
return $this->extractExcerptTerm($needlePos, $needleLength, $text, $contextLength, $mark);
178-
}, $positions);
179-
180-
return $excerpts;
149+
return $positions;
181150
}
182151

183152
/**

0 commit comments

Comments
 (0)