%PDF- %PDF-
| Direktori : /home/lightco1/www/administrator/components/com_jmap/framework/seostats/services/google/ |
| Current File : /home/lightco1/www/administrator/components/com_jmap/framework/seostats/services/google/search.php |
<?php
// namespace administrator\components\com_jmap\framework\seostats\services\google;
/**
*
* @package JMAP::SEOSTATS::administrator::components::com_jmap
* @subpackage seostats
* @subpackage services
* @subpackage google
* @author Joomla! Extensions Store
* @copyright (C) 2015 - Joomla! Extensions Store
* @license GNU/GPLv2 http://www.gnu.org/licenses/gpl-2.0.html
*/
defined ( '_JEXEC' ) or die ( 'Restricted access' );
/**
* Google stats service
*
* @package JMAP::SEOSTATS::administrator::components::com_jmap
* @subpackage seostats
* @subpackage services
* @subpackage google
* @since 3.3
*/
class JMapSeostatsServicesGoogleSearch extends JMapSeostats {
/**
 * Raw regex matches for the "resultStats" block of the last fetched SERP,
 * exactly as captured by preg_match() inside makeRequest()
 *
 * @access public
 * @static
 * @var array
 */
public static $numberIndexedPages;
/**
 * Page number that was passed to the most recent makeRequest() call
 *
 * @access public
 * @static
 * @var int|float
 */
public static $paginationNumber;
/**
 * Fetch a single Google SERP page and hand its entries over for parsing.
 *
 * Whatever the outcome, the raw "resultStats" regex matches and the requested
 * page number are stored in the static properties before anything else happens.
 *
 * @access protected
 * @param int $pageNumber SERP page index, 0 being the first page
 * @param string $query Search query, inserted as-is into the request URL
 * @param object $result Collector forwarded to parseResults()
 * @param array $customHeaders Header overrides forwarded to gCurl()
 * @param boolean $onlyIndexedCount When true, stop right after the "resultStats" lookup
 * @return boolean|array The "resultStats" matches array when $onlyIndexedCount is true,
 *                       otherwise true if entries were parsed and false on captcha or empty page
 */
protected static function makeRequest($pageNumber, $query, $result, $customHeaders, $onlyIndexedCount = false) {
    $referer = static::getReference ( $pageNumber, $query );
    $requestPath = static::getPageSerp ( $pageNumber, $query );
    $html = static::gCurl ( $requestPath, $referer, $customHeaders );

    // Grab the block holding the total number of indexed pages
    preg_match ( '#<div id="resultStats">(.*?)</div>#', $html, $statsMatches );
    static::$paginationNumber = $pageNumber;
    static::$numberIndexedPages = $statsMatches;

    if ($onlyIndexedCount) {
        return $statsMatches;
    }

    // The captcha page means Google has banned the requester
    if (preg_match ( "#answer[=|/]86640#i", $html )) {
        return false;
    }

    // Titles and links
    $headings = array ();
    preg_match_all ( '#<h3 class="?r"?>(.*?)</h3>#', $html, $headings );
    if (empty ( $headings [1] )) {
        // No [@id="rso"]/li/h3 on the current page
        return false;
    }

    // Descriptions: unwrap the inner "f" spans first, then collect the "st" spans
    $unwrappedHtml = preg_replace ( '/<span class="?f"?>(.*?)<\/span>/i', '$1', $html );
    preg_match_all ( '#<span class="?st"?>(.*?)</span>#', $unwrappedHtml, $snippets );

    static::parseResults ( $headings, $snippets, $pageNumber * 10, $result );

    return true;
}
/**
 * Build the referer value used for a SERP request.
 *
 * @access protected
 * @param int $pageNumber SERP page index, 0 being the first page
 * @param string $query Search query placed into the referer query string
 * @return string 'ncr' (no country redirect) for the first page, a search path otherwise
 */
protected static function getReference($pageNumber, $query) {
    if (0 == $pageNumber) {
        // First page: ncr stands for "no country redirect"
        return 'ncr';
    }

    // The trailing 0 after %s turns the page index into a result offset
    return sprintf ( 'search?q=%s&hl=en&prmd=imvns&start=%s0&sa=N', $query, $pageNumber );
}
/**
 * Build a case-insensitive regular expression matching URLs on a given domain.
 *
 * The domain is escaped with preg_quote() so that its dots are matched
 * literally and a '#' inside it cannot terminate the pattern early.
 *
 * @access protected
 * @param string $domain Domain (or host fragment) to look for
 * @return string|boolean The regular expression, or false when $domain is empty
 */
protected static function getDomainFilter($domain) {
    if (! $domain) {
        return false;
    }

    return '#^(https?://)?[^/]*' . preg_quote ( $domain, '#' ) . '#i';
}
/**
 * Build the request path of a SERP page, pagination included.
 *
 * @access protected
 * @param int $pageNumber SERP page index, 0 being the first page
 * @param string $query Search query placed into the query string
 * @return string Relative search path; the start offset is only added past the first page
 */
protected static function getPageSerp($pageNumber, $query) {
    if (0 == $pageNumber) {
        return sprintf ( 'search?q=%s&filter=0', $query );
    }

    // The trailing 0 after %s turns the page index into a result offset
    return sprintf ( 'search?q=%s&filter=0&start=%s0', $query, $pageNumber );
}
/**
 * Parse the matched SERP entries and push them into the result collector.
 *
 * Entries rejected by parseLink() are skipped; the description index is
 * shifted back by the number of skipped entries so far, which
 * presumably relies on skipped entries having no description span - TODO confirm.
 *
 * @access protected
 * @param array $matches preg_match_all() output for the result headings; index 1 holds the inner HTML
 * @param array $matchesDesc preg_match_all() output for the description spans; index 1 holds the text
 * @param int $pageNumber Result offset of the page (makeRequest() passes page index * 10)
 * @param object $result Collector exposing setElement($pageIndex, array $entry)
 * @return void
 */
protected static function parseResults($matches, $matchesDesc, $pageNumber, $result) {
    // The target page slot is the same for every entry of this call
    $arrayPageIndex = ($pageNumber / 10) + 1;
    $descriptions = isset ( $matchesDesc [1] ) ? $matchesDesc [1] : array ();
    $skipped = 0;

    foreach ( $matches [1] as $indexResult => $link ) {
        $match = static::parseLink ( $link );
        if (! $match) {
            $skipped ++;
            continue;
        }

        $descriptionIndex = $indexResult - $skipped;
        $description = array_key_exists ( $descriptionIndex, $descriptions ) ? $descriptions [$descriptionIndex] : null;

        // Format results
        $result->setElement ( $arrayPageIndex, array (
                'url' => $match [1],
                'headline' => trim ( strip_tags ( $match [2] ) ),
                'description' => $description
        ) );
    }
}
/**
 * Extract the anchor of a SERP heading.
 *
 * @access protected
 * @param string $link Inner HTML of a result heading
 * @return array|boolean preg_match() groups (1 = href, 2 = anchor content), or false when
 *                       no anchor is found or the href is a Google webmaster link
 */
protected static function parseLink($link) {
    $anchorFound = preg_match ( '#<a\s+[^>]*href=[\'"]?([^\'" ]+)[\'"]?[^>]*>(.*?)</a>#', $link, $anchorParts );
    if (! $anchorFound) {
        return false;
    }

    // Valid anchor, but webmaster links are not real results
    if (self::isAGoogleWebmasterLink ( $anchorParts [1] )) {
        return false;
    }

    return $anchorParts;
}
/**
 * Detect a Google webmaster link, which is not a real search result.
 *
 * The dots of the host name are escaped so that only www.google.com is
 * recognised, not any host that merely has arbitrary characters in their place.
 *
 * @access protected
 * @param string $url URL taken from a SERP anchor
 * @return int|boolean preg_match() result: 1 on match, 0 otherwise, false on regex failure
 */
protected static function isAGoogleWebmasterLink($url) {
    return preg_match ( '#^https?://www\.google\.com/(?:intl/.+/)?webmasters#', $url );
}
/**
 * Perform the remote query to Google through CURL.
 *
 * When the response is a 3XX that CURL did not follow on its own, the
 * Location header is read from the header section of the response, reduced
 * to its path and query, and requested again on the same Google host. At most
 * five such hops are performed so that a redirect loop cannot run forever.
 *
 * @access protected
 * @param string $path Relative path and query string appended to the Google base URL
 * @param string $ref Referer hint: '' sends the base URL, 'ncr' is sent as-is,
 *                    anything else is appended to the base URL
 * @param array $customHeaders Optional overrides: 'countrytld' and 'acceptlanguage'
 * @return string|boolean Raw response, headers included, or false when the final
 *                        HTTP status is not 200
 */
protected static function gCurl($path, $ref, $customHeaders) {
    $maxRedirects = 5;

    $tld = ! empty ( $customHeaders ['countrytld'] ) ? $customHeaders ['countrytld'] : JMapSeostatsServices::GOOGLE_TLD;
    $acceptLanguage = ! empty ( $customHeaders ['acceptlanguage'] ) ? $customHeaders ['acceptlanguage'] : JMapSeostatsServices::HTTP_HEADER_ACCEPT_LANGUAGE;

    $baseUrl = sprintf ( 'https://www.google.%s/', $tld );
    $referer = $ref == '' ? $baseUrl : ($ref != 'ncr' ? $baseUrl . $ref : $ref);
    $url = $baseUrl . $path;

    // Randomize the user agent to avoid Google ban
    $userAgents = array (
            "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/42.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/40.0",
            "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/45.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/44.0",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2227.1 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.1944.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
            "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
            "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)"
    );
    $ua = $userAgents [array_rand ( $userAgents )];

    // Format the request header array
    $header = array (
            'Host: www.google.' . $tld,
            'Connection: keep-alive',
            'Cache-Control: max-age=0',
            'User-Agent: ' . $ua,
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Referer: ' . $referer,
            'Accept-Language: ' . $acceptLanguage,
            'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7'
    );

    $result = false;
    $httpStatus = 0;

    for($hop = 0; $hop <= $maxRedirects; $hop ++) {
        $ch = curl_init ( $url );
        if ($ch === false) {
            return false;
        }
        // NOTE(review): peer certificate verification is disabled here; kept as in the
        // original because enabling it may break hosts without a CA bundle - confirm.
        curl_setopt ( $ch, CURLOPT_SSL_VERIFYPEER, 0 );
        curl_setopt ( $ch, CURLOPT_HEADER, true );
        // CURL refuses to follow redirects by itself when open_basedir is set
        if (! ini_get ( 'open_basedir' )) {
            curl_setopt ( $ch, CURLOPT_FOLLOWLOCATION, 1 );
        }
        curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, 1 );
        curl_setopt ( $ch, CURLOPT_HTTPHEADER, $header );
        curl_setopt ( $ch, CURLOPT_USERAGENT, $ua );
        curl_setopt ( $ch, CURLOPT_CONNECTTIMEOUT, 30 );
        $result = curl_exec ( $ch );
        $info = curl_getinfo ( $ch );
        curl_close ( $ch );

        $httpStatus = isset ( $info ['http_code'] ) ? ( int ) $info ['http_code'] : 0;

        // Anything but an unfollowed redirection (3XX) ends the loop
        if ($result === false || $httpStatus < 300 || $httpStatus >= 400) {
            break;
        }

        // Only look at the header section, so that body text cannot be mistaken for a header
        $headerSize = isset ( $info ['header_size'] ) ? ( int ) $info ['header_size'] : 0;
        $rawHeaders = $headerSize > 0 ? substr ( $result, 0, $headerSize ) : $result;

        $redirectionURI = null;
        foreach ( explode ( "\n", $rawHeaders ) as $headerLine ) {
            // Header names are case-insensitive and must start the line
            if (stripos ( $headerLine, 'Location:' ) !== 0) {
                continue;
            }
            $target = parse_url ( trim ( substr ( $headerLine, strlen ( 'Location:' ) ) ) );
            if (is_array ( $target )) {
                $queryArray = array ();
                if (isset ( $target ['query'] )) {
                    parse_str ( $target ['query'], $queryArray );
                }
                $safeQuery = http_build_query ( $queryArray );
                $redirectPath = isset ( $target ['path'] ) ? trim ( $target ['path'], '/' ) : '';
                $redirectionURI = $safeQuery !== '' ? $redirectPath . '?' . $safeQuery : $redirectPath;
            }
            break;
        }

        if ($redirectionURI === null) {
            break;
        }

        // Next hop: same Google host, path and query taken from the Location header
        $url = $baseUrl . $redirectionURI;
    }

    return ($httpStatus != 200) ? false : $result;
}
/**
 * Returns array, containing detailed results parsed and formatted for any Google search SERP
 *
 * @access public
 * @param string $query The search query, URL-encoded here before the request
 * @param int $pageNumber Divided by 10 to obtain the SERP page index, so presumably a
 *                        result offset (0, 10, 20, ...) - TODO confirm against callers
 * @param array $customHeaders The custom headers for country and language to get SERP for
 * @return array The content of the result collector after parsing
 */
public static function getSerps($query, $pageNumber = 0, $customHeaders = array()) {
    $q = rawurlencode ( $query );

    // Always hand over a whole, non-negative page index: a fractional value would
    // otherwise end up inside the "start" parameter and in the result array keys
    $pageIndex = max ( 0, ( int ) floor ( $pageNumber / 10 ) );

    $result = new JMapSeostatsHelperArrayhandle ();
    static::makeRequest ( $pageIndex, $q, $result, $customHeaders );

    return $result->toArray ();
}
/**
 * Request the first SERP page only to read its "resultStats" block.
 *
 * NOTE(review): unlike getSerps() and getRankedPageKeyword(), the query is not
 * URL-encoded here - presumably callers pass it already encoded; verify.
 *
 * @access public
 * @param string $query The search query, forwarded unchanged
 * @return array The regex matches returned by makeRequest() in count-only mode
 */
public static function getSerpsIndexedLinks($query) {
    $onlyIndexedCount = true;
    $indexedMatches = static::makeRequest ( 0, $query, array (), array (), $onlyIndexedCount );

    return $indexedMatches;
}
/**
 * Returns the SERP page on which the first result for a given domain shows up.
 *
 * A single request is made from offset 0 for $numResults entries; valid
 * entries are counted in order and the position is converted into a page
 * number of ten results each. The domain is looked up inside the URL of each
 * entry, not inside its title text.
 *
 * @access public
 * @param string $query The search query, URL-encoded here before the request
 * @param string $domain The domain to look for, matched case-insensitively
 * @param int $pageNumber Only used to build the referer of the request
 * @param int $numResults The number of results requested in one go
 * @param array $customHeaders The custom headers for country and language to get SERP for
 * @return int|boolean|null The 1-based page number, false on captcha or empty SERP,
 *                          null when the domain is not among the results
 */
public static function getRankedPageKeyword($query, $domain, $pageNumber = 0, $numResults = 100, $customHeaders = array()) {
    $query = rawurlencode ( $query );
    $ref = 0 == $pageNumber ? 'ncr' : sprintf ( 'search?q=%s&hl=en&prmd=imvns&start=%s&num=%s&sa=N', $query, $pageNumber, $numResults );
    $pageSerp = sprintf ( 'search?q=%s&filter=0&start=0&num=%s', $query, $numResults );
    $curledSerp = static::gCurl ( $pageSerp, $ref, $customHeaders );

    // Found the captcha Google ban, return false
    if (preg_match ( "#answer[=|/]86640#i", $curledSerp )) {
        return false;
    }

    // Get titles and links
    $matches = array ();
    preg_match_all ( '#<h3 class="?r"?>(.*?)</h3>#', $curledSerp, $matches );

    // Nothing found, return false
    if (empty ( $matches [1] )) {
        // No [@id="rso"]/li/h3 on the current page
        return false;
    }

    // An empty needle would match every entry (or raise a warning on older PHP versions)
    $needle = ( string ) $domain;
    $numSerpResult = 0;

    foreach ( $matches [1] as $link ) {
        $match = static::parseLink ( $link );
        if (! $match) {
            // Invalid and webmaster links do not count as a ranked position
            continue;
        }

        // Found a match in a SERP for this domain? Check the URL only
        if ($needle !== '' && stripos ( $match [1], $needle ) !== false) {
            return intval ( $numSerpResult / 10 ) + 1;
        }

        $numSerpResult ++;
    }

    return null;
}
}