Skip to content

Commit 7842ea9

Browse files
committed
Move nltk downloader to pta downloader. Added a morph stemmer; it depends on having WordNet installed.
1 parent baaadfd commit 7842ea9

File tree

8 files changed

+104
-12
lines changed

8 files changed

+104
-12
lines changed

composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
},
2626
"require" : {
2727
"php": ">=5.5",
28-
"yooper/stop-words": "dev-master",
28+
"yooper/stop-words": "^1.0",
2929
"symfony/console": ">=2.7"
3030
},
3131
"require-dev": {

src/Console/Commands/NltkPackageInstallAllCommand.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@
1111

1212

1313
/**
14-
* Install all the nltk data packages
14+
* Install all the pta data packages
1515
* @author yooper
1616
*/
1717
class NltkPackageInstallAllCommand extends Command
1818
{
1919
/**
 * Register this console command under the name "pta:install:all" and
 * attach its one-line description.
 */
protected function configure()
{
    $command = $this->setName('pta:install:all');
    $command->setDescription('Install all packages from pta data');
}
2424

src/Console/Commands/NltkPackageInstallCommand.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@ class NltkPackageInstallCommand extends Command
2020
{
2121
/**
 * Register this console command as "pta:install:package", describe it, and
 * declare its single required "package" argument.
 */
protected function configure()
{
    // Help text shown for the required "package" argument.
    $packageHelp = 'You must selected a valid package id, use pta:list to explore the available options.';

    $command = $this->setName('pta:install:package');
    $command = $command->setDescription('Install the selected corpus');
    $command->addArgument('package', InputArgument::REQUIRED, $packageHelp);
}
3131

@@ -47,7 +47,7 @@ protected function execute(InputInterface $input, OutputInterface $output)
4747
}
4848

4949
if(!$packageFound) {
50-
$output->writeln("Package {$packageId} was not found, try textconsole nltk:list, to see the available packages");
50+
$output->writeln("Package {$packageId} was not found, try textconsole pta:list, to see the available packages");
5151
} else {
5252
$output->writeln("Package {$package->getId()} - {$package->getName()} was installed into {$download->getInstallDir()}");
5353
}

src/Console/Commands/NltkPackageListCommand.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ class NltkPackageListCommand extends Command
1717
{
1818
/**
 * Register this console command as "pta:list", describe it, and declare
 * its optional "url" argument.
 */
protected function configure()
{
    // Help text shown for the optional "url" argument.
    $urlHelp = 'Use a different url to download the pta/nltk package list.';

    $command = $this->setName('pta:list');
    $command = $command->setDescription('List Corpora available in the pta data repo.');
    $command->addArgument('url', InputArgument::OPTIONAL, $urlHelp);
}
2828

src/Downloaders/DownloadPackageFactory.php

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,14 @@ static public function download(Package $package)
5252
}
5353

5454
/**
 * Verify the package's checksum against the downloaded file.
 *
 * A package that declares no checksum is treated as verified. Otherwise the
 * MD5 digest of the downloaded file's contents must equal the declared
 * checksum exactly.
 *
 * @return boolean true when no checksum is declared or the digests match;
 *                 false when the downloaded file is missing or the digests differ
 */
public function verifyChecksum()
{
    $expected = $this->getPackage()->getChecksum();
    if (empty($expected)) {
        return true;
    }

    $path = $this->getDownloadFullPath();
    if (!is_file($path)) {
        // Nothing was downloaded, so there is nothing to verify against.
        return false;
    }

    // Hash the file's contents: md5() on the path would only digest the
    // path string itself and could never match a real file checksum.
    return $expected === md5_file($path);
}
6265

@@ -169,7 +172,7 @@ protected function downloadRemoteFile()
169172
*/
170173
public function getDownloadFullPath()
{
    // <system temp dir>/pta-downloads/<package subdir>/<file name taken from the package url>
    $segments = [
        sys_get_temp_dir(),
        'pta-downloads',
        $this->getPackage()->getSubdir(),
        basename($this->getPackage()->getUrl()),
    ];

    return implode(DIRECTORY_SEPARATOR, $segments);
}

src/Downloaders/NltkCorporaIndexDownloader.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ public function getXmlContent()
8989
*/
9090
protected function getCacheFileName()
{
    // NOTE(review): presumably the file name the downloaded package list is cached under - confirm against the caller.
    $cacheFileName = 'pta-list.xml';

    return $cacheFileName;
}
9494

9595

src/Stemmers/MorphStemmer.php

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
<?php
2+
3+
namespace TextAnalysis\Stemmers;
4+
5+
use TextAnalysis\Indexes\WordnetIndex;
6+
use TextAnalysis\Corpus\WordnetCorpus;
7+
use TextAnalysis\Interfaces\IStemmer;
8+
9+
/**
 * Stemmer that maps a token to its base word by asking a WordnetIndex for
 * the token's morph. Results are memoised per instance, keyed by token.
 *
 * @author dcardin
 */
class MorphStemmer implements IStemmer
{
    /**
     * Stems already computed by this instance, keyed by the original token
     * @var array
     */
    protected $cache = [];

    /**
     * Index queried for the morph of a token
     * @var WordnetIndex
     */
    protected $wordnetIndex = null;

    /**
     * Build the index over the wordnet corpus found at the
     * 'corpora/wordnet' storage path.
     */
    public function __construct()
    {
        $corpus = new WordnetCorpus(get_storage_path('corpora/wordnet'));
        $this->wordnetIndex = new WordnetIndex($corpus);
    }

    /**
     * Accessor for the index created in the constructor
     * @return WordnetIndex
     */
    public function getWordnetIndex()
    {
        return $this->wordnetIndex;
    }

    /**
     * Return the stem for a token. Tokens shorter than three characters are
     * returned unchanged; anything else is looked up through the wordnet index.
     *
     * @param string $token
     * @return string
     */
    public function stem($token)
    {
        // Serve repeated tokens straight from the per-instance cache.
        if (isset($this->cache[$token])) {
            return $this->cache[$token];
        }

        $stem = mb_strlen($token) < 3
            ? $token
            : $this->getWordnetIndex()->getMorph($token);

        $this->cache[$token] = $stem;

        return $stem;
    }

    /**
     * Drop the cache and the index reference when the stemmer is destroyed.
     */
    public function __destruct()
    {
        unset($this->cache);
        unset($this->wordnetIndex);
    }
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?php
2+
3+
namespace Tests\TextAnalysis\Stemmers;
4+
5+
use TextAnalysis\Stemmers\MorphStemmer;
6+
7+
/**
 * Exercises MorphStemmer, which builds its index from the wordnet corpus at
 * the 'corpora/wordnet' storage path.
 *
 * @author yooper
 */
class MorphStemmerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * A plural noun should be stemmed to its singular base word.
     */
    public function testMorphStemmer()
    {
        if (getenv('SKIP_TEST')) {
            // Report the test as skipped instead of silently passing with no
            // assertions; markTestSkipped() aborts the test by throwing.
            $this->markTestSkipped('SKIP_TEST is set; skipping the MorphStemmer test.');
        }

        $stemmer = new MorphStemmer();
        $this->assertEquals('university', $stemmer->stem('universities'));
    }
}

0 commit comments

Comments
 (0)