Skip to content

Commit 4268ff9

Browse files
committed
Adding Stanford POS tagger
1 parent f885272 commit 4268ff9

File tree

13 files changed

+496
-222
lines changed

13 files changed

+496
-222
lines changed

composer.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "yooper/php-text-analysis",
3-
"description": "PHP Text Analysis is a library for performing Information Retrival (IR) and Natural Language Processing (NLP) tasks using the PHP language",
3+
"description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language",
44
"keywords": ["nlp","ir","text analysis","natural language processing"],
55
"license": "Apache 2",
66
"authors": [
@@ -13,7 +13,7 @@
1313
"psr-4": {
1414
"TextAnalysis\\": "src/"
1515
},
16-
"files": ["src/helpers/storage.php", "src/helpers/print.php"]
16+
"files": ["src/helpers/storage.php", "src/helpers/print.php", "simplified.php"]
1717
},
1818
"autoload-dev": {
1919
"files": ["tests/TestBaseCase.php"]
@@ -26,7 +26,7 @@
2626
"require" : {
2727
"php": ">=5.5",
2828
"yooper/stop-words": "dev-master",
29-
"symfony/console": "2.7.*"
29+
"symfony/console": ">=2.7"
3030
},
3131
"require-dev": {
3232
"phpunit/phpunit": "4.*",
Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
<?php
2+
3+
namespace TextAnalysis\Console\Commands;
4+
5+
use Symfony\Component\Console\Command\Command;
6+
use Symfony\Component\Console\Input\InputInterface;
7+
use Symfony\Component\Console\Output\OutputInterface;
8+
use Symfony\Component\Console\Helper\ProgressBar;
9+
use TextAnalysis\Downloaders\DownloadPackageFactory as DPF;
10+
use TextAnalysis\Downloaders\NltkCorporaIndexDownloader;
11+
12+
13+
/**
 * Console command that installs every package listed in the corpora index.
 *
 * The package list comes from NltkCorporaIndexDownloader::getPackages() and
 * each package is installed through DownloadPackageFactory::download() while
 * a progress bar reports how far along the run is.
 *
 * @author yooper
 */
class NltkPackageInstallAllCommand extends Command
{
    /**
     * Registers the command name and its description.
     */
    protected function configure()
    {
        $this->setName('nltk:install:all')
            ->setDescription('Install all packages from pta data');
    }

    /**
     * Downloads and installs every package found in the index.
     *
     * @param InputInterface $input not read by this command
     * @param OutputInterface $output where the progress bar is rendered
     * @return int exit code, always 0 when the loop completes
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $listPackages = (new NltkCorporaIndexDownloader())->getPackages();

        // one progress unit per package in the index
        $progress = new ProgressBar($output, count($listPackages));

        // the %message% placeholder is what makes the setMessage() calls
        // below visible; without it the messages are stored but never drawn
        $progress->setFormat(' %current%/%max% [%bar%] %percent:3s%% %elapsed:6s%/%estimated:-6s% %memory:6s% %message%');

        // give the placeholder a value before the first redraw
        $progress->setMessage('');
        $progress->start();

        foreach ($listPackages as $package) {
            $progress->setMessage("Downloading {$package->getName()}");

            $download = DPF::download($package);
            $progress->setMessage("Package {$package->getId()} - {$package->getName()} was installed into {$download->getInstallDir()}");

            // advance the progress bar 1 unit
            $progress->advance();
        }

        // ensure that the progress bar is at 100%
        $progress->finish();

        // newer symfony/console releases require execute() to return an
        // integer exit code; older releases accept it as well
        return 0;
    }
}

src/Console/Commands/NltkPackageInstallCommand.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ class NltkPackageInstallCommand extends Command
2020
{
2121
protected function configure()
2222
{
23-
$this->setName('nltk:install-package')
23+
$this->setName('nltk:install:package')
2424
->setDescription('Install the selected corpus')
2525
->addArgument(
2626
'package',

src/Downloaders/DownloadPackageFactory.php

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ protected function unpackPackage()
6868
{
6969
// it is zipped, we must unzip it
7070
if($this->getPackage()->getUnzip()) {
71-
$this->extractZip();
71+
$this->extractZip($this->getDownloadFullPath(), $this->getInstallDir());
7272
} else {
7373
$this->recursiveCopy($this->getDownloadFullPath(), $this->getInstallDir());
7474
}
@@ -83,6 +83,10 @@ protected function unpackPackage()
8383
*/
8484
protected function recursiveCopy($src,$dst)
8585
{
86+
if($this->isZip($src)) {
87+
$this->extractZip($src, $this->getInstallDir());
88+
return;
89+
}
8690
$dir = opendir($src);
8791
if(!is_dir($dst)) {
8892
mkdir($dst);
@@ -100,19 +104,32 @@ protected function recursiveCopy($src,$dst)
100104
closedir($dir);
101105
}
102106

107+
/**
 * Checks whether the given path can be opened as a zip archive.
 *
 * Uses ZipArchive, like extractZip() does, instead of the legacy
 * zip_open()/zip_close() functions.
 *
 * @param string $path
 * @return boolean true when the path opens as a zip archive
 */
protected function isZip($path)
{
    $zip = new ZipArchive();

    // open() returns true on success and an integer error code otherwise,
    // so only a strict comparison tells the two apart
    if ($zip->open($path) !== true) {
        return false;
    }

    $zip->close();
    return true;
}
121+
103122
/**
 * Use PHP's ZipArchive to extract an archive into a directory.
 *
 * @param string $srcPath path of the zip file to open
 * @param string $extractToDir directory the archive contents are written to
 * @return boolean true when the archive was opened and extracted, false otherwise
 */
protected function extractZip($srcPath, $extractToDir)
{
    $zip = new ZipArchive();

    // ZipArchive::open() returns true on success and a non-zero integer
    // error code on failure; a loose truthiness check treats both as
    // success, so the result has to be compared strictly
    if ($zip->open($srcPath) !== true) {
        return false;
    }

    $extracted = $zip->extractTo($extractToDir);
    $zip->close();

    return $extracted;
}
117134

118135
/**
@@ -164,7 +181,7 @@ public function getDownloadFullPath()
164181
*/
165182
public function getInstallDir()
{
    // storage/<package subdir>/ ; the empty last element yields the trailing separator
    $subdir = $this->getPackage()->getSubdir();

    return implode(DIRECTORY_SEPARATOR, ['storage', $subdir, '']);
}
169186

170187
/**

src/Downloaders/NltkCorporaIndexDownloader.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ class NltkCorporaIndexDownloader
3333
* @param string $url Default value is provided, but you can override
3434
* @param boolean $useCache use the cached copy if it is available, by default it is off
3535
*/
36-
public function __construct($url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml', $useCache = false)
36+
public function __construct($url = 'https://raw.githubusercontent.com/yooper/pta_data/gh-pages/index.xml', $useCache = false)
3737
{
3838
$this->url = $url;
3939
$this->useCache = $useCache;

src/Taggers/StanfordAbstract.php

Lines changed: 223 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,223 @@
1+
<?php
2+
3+
4+
namespace TextAnalysis\Taggers;
5+
6+
use RuntimeException;
7+
use TextAnalysis\Filters\PunctuationFilter;
8+
use TextAnalysis\Tokenizers\WhitespaceTokenizer;
9+
10+
/**
 * Abstract class for the Stanford NerTagger and PosTagger to extend.
 *
 * tag() writes the tokens to a temporary file, runs the command supplied by
 * the subclass through proc_open() and returns whatever the subclass parses
 * out of the captured output.
 *
 * @author yooper
 */
abstract class StanfordAbstract
{
    /**
     * @var array options passed to the java vm
     */
    protected $javaOptions = [];

    /**
     * @var string Path to the classifier you want to use
     */
    protected $classifierPath = null;

    /**
     * Path to the stanford jar
     * @var string
     */
    protected $jarPath = null;

    /**
     * File the joined tokens are written to; created on first use when a
     * subclass has not assigned it
     * @var string
     */
    protected $tmpFilePath = null;

    /**
     * The separator placed between tokens
     * @var string
     */
    protected $separator = null;

    /**
     * @var string Output (stdout) captured from proc_open
     */
    protected $output;

    /**
     * @var string Errors (stderr) captured from proc_open
     */
    protected $errors;

    /**
     * @param string $jarPath
     * @param string $classifierPath
     * @param array $javaOptions
     * @param string $separator separator placed between tokens, default is /
     */
    public function __construct($jarPath, $classifierPath, $javaOptions = ['-mx700m'], $separator = '/')
    {
        $this->jarPath = $jarPath;
        $this->classifierPath = $classifierPath;
        $this->javaOptions = $javaOptions;
        $this->separator = $separator;
    }

    /**
     * Returns the path of the temporary token file.
     *
     * Nothing in this class assigns the path up front, so one is created in
     * the system temp directory the first time it is requested. A path
     * assigned by a subclass is returned untouched.
     *
     * @throws \RuntimeException when no temporary file can be created
     * @return string
     */
    public function getTmpFilePath()
    {
        if ($this->tmpFilePath === null) {
            $path = tempnam(sys_get_temp_dir(), 'stanford_');
            if ($path === false) {
                throw new RuntimeException('Unable to create a temporary file for the tokens');
            }
            $this->tmpFilePath = $path;
        }

        return $this->tmpFilePath;
    }

    /**
     * Tags the given tokens.
     *
     * @throws \RuntimeException when a required file is missing, the tokens
     *         cannot be written or the process cannot be run
     * @param array $tokens Use a tokenizer
     * @return array the parsed output, see getParsedOutput()
     */
    public function tag(array $tokens)
    {
        $this->verify();

        // write the tokens, joined by the separator, to the temp file
        $tmpFilePath = $this->getTmpFilePath();
        $joined = implode($this->getSeparator(), $tokens);
        if (file_put_contents($tmpFilePath, $joined) === false) {
            throw new RuntimeException("Unable to write tokens to {$tmpFilePath}");
        }

        $this->exec();

        return $this->getParsedOutput();
    }

    /**
     * Requires that class output var be populated. Punctuation may cause issues
     * @return array
     */
    abstract protected function getParsedOutput();

    /**
     * Separator used between tokens
     * @return string default is /
     */
    public function getSeparator()
    {
        return $this->separator;
    }

    /**
     * Verifies that the jar and the classifier exist as files
     * @throws \RuntimeException
     */
    protected function verify()
    {
        if (!is_file($this->getJarPath())) {
            throw new RuntimeException("Jar not found {$this->getJarPath()}");
        }

        if (!is_file($this->getClassifierPath())) {
            throw new RuntimeException("Classifier not found {$this->getClassifierPath()}");
        }
    }

    /**
     * Returns the path to the classifier
     * @return string
     */
    public function getClassifierPath()
    {
        return $this->classifierPath;
    }

    /**
     * Returns the path to the jar
     * @return string
     */
    public function getJarPath()
    {
        return $this->jarPath;
    }

    /**
     * Returns the options passed to the java vm
     * @return array
     */
    public function getJavaOptions()
    {
        return $this->javaOptions;
    }

    /**
     * Returns the java executable to run.
     *
     * Without JAVA_HOME the bare name "java" is returned. When JAVA_HOME
     * names a directory, the bin/java executable inside it is returned; any
     * other value is returned as is, so a JAVA_HOME that already points at
     * an executable keeps working.
     *
     * @return string
     */
    public function getPathToJava()
    {
        $javaHome = getenv('JAVA_HOME');
        if (!$javaHome) {
            return 'java';
        }

        // JAVA_HOME normally names the installation directory, which cannot
        // be executed itself
        if (is_dir($javaHome)) {
            return rtrim($javaHome, '/\\') . DIRECTORY_SEPARATOR . 'bin' . DIRECTORY_SEPARATOR . 'java';
        }

        return $javaHome;
    }

    /**
     * @return string Returns the cli that is passed to proc_open
     */
    abstract public function getCommand();

    /**
     * @return string ; on Windows, : everywhere else
     */
    protected function getPathSeparator()
    {
        $isWindows = strtoupper(substr(PHP_OS, 0, 3)) === 'WIN';

        return $isWindows ? ';' : ':';
    }

    /**
     * Calls the stanford jar file and captures its stdout and stderr
     * @throws RuntimeException when the process cannot be started or
     *         proc_close reports -1
     */
    protected function exec()
    {
        $descriptors = [
            0 => ["pipe", "r"], // stdin is a pipe that the child will read from
            1 => ["pipe", "w"], // stdout is a pipe that the child will write to
            2 => ["pipe", "w"], // stderr is a pipe as well, captured into $this->errors
        ];

        // runs from the jar's directory; an empty array is passed as the environment
        $command = $this->getCommand();
        $process = proc_open($command, $descriptors, $pipes, dirname($this->getJarPath()), []);

        if (!is_resource($process)) {
            throw new RuntimeException("Unable to start process: {$command}");
        }

        fclose($pipes[0]); // nothing is sent to the child on stdin

        // NOTE(review): stdout is drained completely before stderr is read;
        // a child that writes a large amount to stderr first could stall here
        $this->output = stream_get_contents($pipes[1]);
        $this->errors = stream_get_contents($pipes[2]);
        fclose($pipes[2]);
        fclose($pipes[1]);

        if (proc_close($process) === -1) {
            throw new RuntimeException($this->errors);
        }
    }

    /**
     * Removes the temporary token file, if one exists
     */
    public function __destruct()
    {
        // the path stays null when no temp file was ever requested
        if (is_string($this->tmpFilePath) && is_file($this->tmpFilePath)) {
            unlink($this->tmpFilePath);
        }
        $this->tmpFilePath = null;
    }
}

0 commit comments

Comments
 (0)