Skip to content

Commit 638d8a4

Browse files
committed
add in document cosine similarity comparison
1 parent af647a6 commit 638d8a4

File tree

2 files changed

+100
-0
lines changed

2 files changed

+100
-0
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?php
2+
3+
namespace TextAnalysis\Comparisons;
4+
5+
use TextAnalysis\Interfaces\IDistance;
6+
use TextAnalysis\Interfaces\ISimilarity;
7+
8+
/**
 * Cosine similarity between two tokenized documents.
 *
 * Each document is reduced to a term-frequency vector (token => number of
 * occurrences). The similarity is the dot product of the two vectors divided
 * by the product of their Euclidean norms.
 *
 * @author yooper
 */
class CosineSimilarityComparison implements IDistance, ISimilarity
{
    /**
     * Computes the cosine similarity of two token lists.
     *
     * Term frequencies are never negative, so the result lies in [0.0, 1.0]:
     * 1.0 when both frequency vectors point in the same direction and 0.0
     * when the documents share no token.
     *
     * @param array $text1 an array of tokens
     * @param array $text2 an array of tokens
     * @return float similarity in the range [0.0, 1.0]; 0.0 when either
     *               token list is empty
     */
    public function similarity($text1, $text2)
    {
        $text1Freq = array_count_values($text1);
        $text2Freq = array_count_values($text2);

        // An empty document has a zero-length vector, for which the cosine is
        // undefined (0 / 0). Report "nothing in common" instead of dividing
        // by zero.
        if (empty($text1Freq) || empty($text2Freq)) {
            return 0.0;
        }

        // Walk the smaller vocabulary and probe the larger one: only terms
        // present in both documents contribute to the dot product.
        if (count($text1Freq) <= count($text2Freq)) {
            $smaller = $text1Freq;
            $larger = $text2Freq;
        } else {
            $smaller = $text2Freq;
            $larger = $text1Freq;
        }

        $product = 0.0;
        foreach ($smaller as $term => $freq) {
            if (isset($larger[$term])) {
                $product += $freq * $larger[$term];
            }
        }

        // A single sqrt over the product of the squared norms rounds once
        // instead of twice; sqrt(a) * sqrt(b) can make identical documents
        // score marginally above 1.0.
        $norms = sqrt($this->sumOfSquares($text1Freq) * $this->sumOfSquares($text2Freq));

        // The true value never exceeds 1, so anything above it is rounding
        // noise. Clamping keeps distance() from going negative.
        return min(1.0, $product / $norms);
    }

    /**
     * Computes the cosine distance, defined as 1 - similarity.
     *
     * @param array $text1 an array of tokens
     * @param array $text2 an array of tokens
     * @return float distance in the range [0.0, 1.0]
     */
    public function distance($text1, $text2)
    {
        return 1 - $this->similarity($text1, $text2);
    }

    /**
     * Sums the squares of the given term frequencies (the squared Euclidean
     * norm of the frequency vector).
     *
     * @param array $frequencies map of token => occurrence count
     * @return int|float
     */
    private function sumOfSquares(array $frequencies)
    {
        $total = 0;
        foreach ($frequencies as $freq) {
            $total += $freq * $freq;
        }

        return $total;
    }
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
namespace Tests\TextAnalysis\Comparisons;
4+
5+
use TextAnalysis\Comparisons\CosineSimilarityComparison;
6+
7+
/**
 * Exercises CosineSimilarityComparison::similarity() with token lists that
 * are identical, partially overlapping, and completely disjoint.
 *
 * @author yooper
 */
class CosineSimilarityComparisonTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Two documents made of exactly the same tokens must score 1.0.
     */
    public function testIdentical()
    {
        $first = ["hiking", "camping", "swimming"];
        $second = ["hiking", "camping", "swimming"];

        $this->assertEquals(1.0, $this->similarityOf($first, $second));
    }

    /**
     * Documents that share most, but not all, tokens score about 0.8
     * once rounded to one decimal place.
     */
    public function testDifferent()
    {
        $first = ["hiking", "hiking", "camping", "swimming"];
        $second = ["hiking", "biking", "camping", "swimming"];

        $score = $this->similarityOf($first, $second);

        $this->assertEquals(0.8, round($score, 1));
    }

    /**
     * Documents without a single shared token must score 0.
     */
    public function testNothingInCommon()
    {
        $first = ["hiking", "camping", "swimming"];
        $second = ["biking", "boating", "floating"];

        $this->assertEquals(0, $this->similarityOf($first, $second));
    }

    /**
     * Runs a fresh comparison object over the two token lists.
     *
     * @param array $first  tokens of the first document
     * @param array $second tokens of the second document
     * @return mixed whatever similarity() returns for the pair
     */
    private function similarityOf(array $first, array $second)
    {
        $comparison = new CosineSimilarityComparison();

        return $comparison->similarity($first, $second);
    }
}

0 commit comments

Comments
 (0)