Skip to content

pane-e-design/xcrawler

 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

52 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

xCrawler

xCrawler is a toolkit for developing spider (web crawler) applications based on XSL stylesheets.

The simple example below shows how to use this toolkit to parse a source document (for instance an RSS feed such as Yahoo News) through an XSL stylesheet in order to output XML forged in another format.

require_once('vendor/autoload.php');

use xcrawler\Bot;
use xcrawler\Processor\Factory;

// Path of the stylesheet handed to the processor below.
define('XSL_TEST', 'stylesheets/test.xsl');

$oBot = new Bot();

// Reuse the copy saved in buffer.xml when there is one; otherwise request
// the page with a POST and save the response for the next run.
if (file_exists('buffer.xml')) {
    $sPage = file_get_contents('buffer.xml');
} else {
    $oBot->setUrl("http://www.example.com");
    $sPage = $oBot->post(array('post_key' => 'post_value'));
    file_put_contents('buffer.xml', $sPage);
}

// Ask the factory for a processor for this page/stylesheet pair and print
// whatever process() returns.
$oProcessor = Factory::factory($sPage, XSL_TEST);
echo $oProcessor->process($sPage, XSL_TEST);

The toolkit parses XML or HTML sources transparently, so the single parameter of the setUrl method can be the URL of either an XML or an HTML document. The example above uses the post method to retrieve the source page; a get method is also available for GET requests.

A more complex example is:

require_once('vendor/autoload.php');

use xcrawler\Bot;
use xcrawler\Processor\Factory;
use xcrawler\Utils;


/**
 * Example spider.
 *
 * For every letter A-Z it downloads a list page, runs it through the list
 * stylesheet, and then, for each named entry, downloads the entry's details
 * page, runs it through the details stylesheet and prints a "name, email"
 * line.
 *
 * Every download goes through Utils::bufferize() with a per-page file name
 * under BUFFER_PATH (presumably a file cache around the callback; confirm
 * against the Utils implementation).
 */
class Spider
{

	const XSL_LIST = 'stylesheets/example.xsl';
	// The "/" between the host and "list.php" was missing, which turned the
	// host name into "www.example.comlist.php".
	const URL_LIST = "http://www.example.com/list.php?ltr=%s";
	const XSL_DETAILS = 'stylesheets/example-details.xsl';
	const URL_DETAILS_BASE = 'http://www.example.com/x/details';
	const BUFFER_PATH = 'buffer/';
	// Proxy address shared by every Bot this spider creates.
	const PROXY = '192.168.99.100:9050';

	/**
	 * Crawls all list pages and their details pages, echoing one
	 * "name, email" line per entry whose details page yields a result.
	 *
	 * @return void
	 */
	static public function run()
	{

		foreach (range('A', 'Z') as $nPage => $sLetter) {

			$oBot = self::createBot(sprintf(self::URL_LIST, $sLetter));
			// Pass the bot that was just configured; the previous code passed
			// an undefined variable, so getData() received null.
			$sPage = Utils::bufferize(
				self::BUFFER_PATH . sprintf('buffer-%d.bak', $nPage),
				array(__CLASS__, 'getData'),
				array($oBot)
			);

			$oProcessor = Factory::factory($sPage, self::XSL_LIST);
			$sXml = $oProcessor->process($sPage, self::XSL_LIST);
			$aXml = Utils::xmlToArray($sXml);

			// A list page without any entries must not break the loop.
			// NOTE(review): assumes 'item' is a list of entries; confirm how
			// xmlToArray() represents a page with exactly one entry.
			$aItems = array();
			if (isset($aXml['result']['item']) && is_array($aXml['result']['item'])) {
				$aItems = $aXml['result']['item'];
			}

			// Counts only the entries that are actually fetched; it is part
			// of the buffer file name, so skipped entries must not advance it.
			$i = 0;
			foreach ($aItems as $item) {

				if (empty($item['name']))
					continue;

				$oDetailsBot = self::createBot(self::URL_DETAILS_BASE . $item['link']);
				$sDetailsPage = Utils::bufferize(
					self::BUFFER_PATH . sprintf('buffer-details-%d-%d.bak', $nPage, $i),
					array(__CLASS__, 'getData'),
					array($oDetailsBot)
				);

				$oDetailsProcessor = Factory::factory($sDetailsPage, self::XSL_DETAILS);
				$sDetailsXml = $oDetailsProcessor->process($sDetailsPage, self::XSL_DETAILS);
				$aEmail = Utils::xmlToArray($sDetailsXml);

				if (!empty($aEmail['result'])) {
					// Same output as before, but a missing email no longer
					// raises an undefined-index notice.
					$sEmail = isset($aEmail['result']['item']['email'])
						? $aEmail['result']['item']['email']
						: '';
					echo $item['name'] . ", " . $sEmail . "\n";
				}

				$i++;
				$oDetailsBot->close();
			}

			$oBot->close();
		}

	}

	/**
	 * Callback handed to Utils::bufferize(): performs the GET request.
	 *
	 * @param object $oBot Object exposing a get() method (a Bot in this file).
	 * @return mixed Whatever $oBot->get() returns.
	 */
	static public function getData($oBot) {
		return $oBot->get();
	}

	/**
	 * Creates a Bot for the given URL with the proxy and SOCKS5 settings
	 * used by both the list and the details requests.
	 *
	 * @param string $sUrl URL the bot should request.
	 * @return Bot
	 */
	static private function createBot($sUrl)
	{
		$oBot = new Bot();
		$oBot->setUrl($sUrl);
		$oBot->setProxy(self::PROXY);
		$oBot->setSOCKS5();

		return $oBot;
	}

}

// Start the crawl when this script is executed.
Spider::run();

Install

xCrawler can be installed via Composer as follows:

php composer.phar install

About

XSL-based PHP toolkit for web crawlers

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages

  • PHP 100.0%