%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /home/lightco1/www/administrator/components/com_jmap/framework/seostats/services/google/
Upload File :
Create Path :
Current File : /home/lightco1/www/administrator/components/com_jmap/framework/seostats/services/google/search.php

<?php
// namespace administrator\components\com_jmap\framework\seostats\services\google;
/**
 *
 * @package JMAP::SEOSTATS::administrator::components::com_jmap
 * @subpackage seostats
 * @subpackage services
 * @subpackage google
 * @author Joomla! Extensions Store
 * @copyright (C) 2015 - Joomla! Extensions Store
 * @license GNU/GPLv2 http://www.gnu.org/licenses/gpl-2.0.html
 */
defined ( '_JEXEC' ) or die ( 'Restricted access' );

/**
 * Google stats service
 *
 * @package JMAP::SEOSTATS::administrator::components::com_jmap
 * @subpackage seostats
 * @subpackage services
 * @subpackage google
 * @since 3.3
 */
class JMapSeostatsServicesGoogleSearch extends JMapSeostats {
	/**
	 * Store the regex matches of the "resultStats" block (total indexed pages) found in the last curled SERP
	 *
	 * @access public
	 * @static
	 * @var array
	 */
	public static $numberIndexedPages;
	
	/**
	 * Store the page number of the last curled SERP
	 *
	 * @access public
	 * @static
	 * @var int|float
	 */
	public static $paginationNumber;
	
	/**
	 * Fetch a single Google SERP and push its parsed results into the given collector
	 *
	 * The raw "resultStats" regex matches and the requested page number are always
	 * stored in the static class properties. When $onlyIndexedCount is true those
	 * matches are returned immediately afterwards, before the captcha detection
	 * and the parsing of the results take place.
	 * NOTE(review): the early return above also skips the captcha check - confirm this is intended.
	 *
	 * @access protected
	 * @param mixed $pageNumber The page index forwarded to getReference and getPageSerp
	 * @param string $query The search query, embedded as is in the request path
	 * @param object $result The collector handed over to parseResults
	 * @param array $customHeaders The custom headers forwarded to gCurl
	 * @param boolean $onlyIndexedCount True to return only the "resultStats" matches
	 * @return boolean|array The "resultStats" matches if $onlyIndexedCount is set, otherwise true on success and false on captcha or empty SERP
	 */
	protected static function makeRequest($pageNumber, $query, $result, $customHeaders, $onlyIndexedCount = false) {
		$referer = static::getReference ( $pageNumber, $query );
		$serpPath = static::getPageSerp ( $pageNumber, $query );
		$html = static::gCurl ( $serpPath, $referer, $customHeaders );

		// Keep track of the indexed pages block and of the requested page
		preg_match ( '#<div id="resultStats">(.*?)</div>#', $html, $indexedPagesMatch );
		static::$numberIndexedPages = $indexedPagesMatch;
		static::$paginationNumber = $pageNumber;
		if ($onlyIndexedCount) {
			return $indexedPagesMatch;
		}

		// The captcha page means that Google banned the request
		$isBanned = preg_match ( "#answer[=|/]86640#i", $html );
		if ($isBanned) {
			return false;
		}

		// Headlines carry both the title and the link of each result
		$headlines = array ();
		preg_match_all ( '#<h3 class="?r"?>(.*?)</h3>#', $html, $headlines );

		// Unwrap the nested "f" spans before extracting the descriptions
		$html = preg_replace ( '/<span class="?f"?>(.*?)<\/span>/i', '$1', $html );
		preg_match_all ( '#<span class="?st"?>(.*?)</span>#', $html, $snippets );

		// No headline at all on the current page
		if (empty ( $headlines [1] )) {
			return false;
		}

		static::parseResults ( $headlines, $snippets, $pageNumber * 10, $result );

		return true;
	}
	
	/**
	 * Build the referer value for a SERP request
	 *
	 * The first page (0) gets the plain 'ncr' marker (no country redirect), any
	 * other page gets the search path of that page.
	 *
	 * @access protected
	 * @param mixed $pageNumber The page index, a trailing 0 is appended to build the start offset
	 * @param string $query The search query, embedded as is
	 * @return string
	 */
	protected static function getReference($pageNumber, $query) {
		if (0 == $pageNumber) {
			return 'ncr';
		}

		return sprintf ( 'search?q=%s&hl=en&prmd=imvns&start=%s0&sa=N', $query, $pageNumber );
	}
	
	/**
	 * Build the case insensitive regular expression matching URLs hosted on a domain
	 *
	 * The domain is quoted before being embedded in the pattern, so that regex
	 * metacharacters such as the dot are matched literally and the '#' delimiter
	 * cannot be broken by the input.
	 *
	 * @access protected
	 * @param string $domain The domain to filter by
	 * @return string|boolean The regular expression, false when no domain is given
	 */
	protected static function getDomainFilter($domain) {
		if (! $domain) {
			return false;
		}

		return '#^(https?://)?[^/]*' . preg_quote ( $domain, '#' ) . '#i';
	}
	
	/**
	 * Build the request path of a SERP page for the Google query
	 *
	 * The first page (0) carries no start offset, any other page gets the page
	 * index followed by a trailing 0 as start offset.
	 *
	 * @access protected
	 * @param mixed $pageNumber The page index
	 * @param string $query The search query, embedded as is
	 * @return string
	 */
	protected static function getPageSerp($pageNumber, $query) {
		if (0 == $pageNumber) {
			$serpPath = sprintf ( 'search?q=%s&filter=0', $query );
		} else {
			$serpPath = sprintf ( 'search?q=%s&filter=0&start=%s0', $query, $pageNumber );
		}

		return $serpPath;
	}

	/**
	 * Parse the matched headlines and push one formatted element per valid result into the collector
	 *
	 * Headlines rejected by parseLink are skipped; the description index is shifted
	 * back by the number of skipped headlines so far. A missing description
	 * results in a null 'description'.
	 *
	 * @access protected
	 * @param array $matches The preg_match_all matches of the headlines, index 1 holds the inner HTML
	 * @param array $matchesDesc The preg_match_all matches of the descriptions, index 1 holds the inner HTML
	 * @param mixed $pageNumber The result offset of the page, it is divided by 10 to get the collector key
	 * @param object $result The collector, its setElement method receives the key and the formatted result
	 * @return void
	 */
	protected static function parseResults($matches, $matchesDesc, $pageNumber, $result) {
		// Nothing to parse
		if (empty ( $matches [1] )) {
			return;
		}

		// The collector key is the same for every result of the page
		$arrayPageIndex = ($pageNumber / 10) + 1;

		// Tolerate a missing descriptions group
		$descriptions = (isset ( $matchesDesc [1] ) && is_array ( $matchesDesc [1] )) ? $matchesDesc [1] : array ();

		$skipped = 0;
		foreach ( $matches [1] as $indexResult => $link ) {
			$match = static::parseLink ( $link );
			if (! $match) {
				$skipped ++;
				continue;
			}

			$descriptionIndex = $indexResult - $skipped;
			$description = array_key_exists ( $descriptionIndex, $descriptions ) ? $descriptions [$descriptionIndex] : null;

			// Format results
			$result->setElement ( $arrayPageIndex, array (
					'url' => $match [1],
					'headline' => trim ( strip_tags ( $match [2] ) ),
					'description' => $description
			) );
		}
	}
	
	/**
	 * Extract the anchor of a SERP headline
	 *
	 * The webmaster link check is resolved through late static binding, like
	 * every other internal call of this class.
	 *
	 * @access protected
	 * @param string $link The inner HTML of a result headline
	 * @return array|boolean The regex matches (1 => href, 2 => anchor inner HTML), false if there is no anchor or it is a Google webmaster link
	 */
	protected static function parseLink($link) {
		$match = array ();
		$isValidLink = preg_match ( '#<a\s+[^>]*href=[\'"]?([^\'" ]+)[\'"]?[^>]*>(.*?)</a>#', $link, $match );

		// No anchor found in the headline
		if (! $isValidLink) {
			return false;
		}

		// Discard the Google webmaster links
		if (static::isAGoogleWebmasterLink ( $match [1] )) {
			return false;
		}

		return $match;
	}
	
	/**
	 * Detect a link pointing to the Google webmasters pages
	 *
	 * The dots of the host name are escaped so that only www.google.com itself
	 * is matched, optionally followed by an intl/<something>/ path segment.
	 *
	 * @access protected
	 * @param string $url The URL to check
	 * @return boolean
	 */
	protected static function isAGoogleWebmasterLink($url) {
		return 1 === preg_match ( '#^https?://www\.google\.com/(?:intl/.+/)?webmasters#', $url );
	}
	
	/**
	 * Perform the remote query to Google through CURL
	 *
	 * The request is sent to https://www.google.<tld>/<path> with a randomly
	 * picked user agent. Because CURLOPT_HEADER is enabled, the returned string
	 * contains the response headers followed by the body.
	 * When the final status is a 3XX the Location header is looked up manually
	 * (case insensitive, at the start of a header line) and the request is
	 * repeated against its path and query; at most 5 nested hops are followed so
	 * that a redirect loop cannot recurse forever.
	 *
	 * @access protected
	 * @param string $path The path and query string appended to the Google base URL
	 * @param string $ref The referer path, '' for the base URL or 'ncr' to be sent as is
	 * @param array $customHeaders Optional 'countrytld' and 'acceptlanguage' overrides
	 * @return string|boolean The raw response (headers + body), false if the final HTTP status is not 200
	 */
	protected static function gCurl($path, $ref, $customHeaders) {
		// Number of manual redirect hops currently in progress
		static $redirectDepth = 0;
		$maxRedirects = 5;

		$countryTld = ! empty ( $customHeaders ['countrytld'] ) ? $customHeaders ['countrytld'] : JMapSeostatsServices::GOOGLE_TLD;
		$acceptLanguage = ! empty ( $customHeaders ['acceptlanguage'] ) ? $customHeaders ['acceptlanguage'] : JMapSeostatsServices::HTTP_HEADER_ACCEPT_LANGUAGE;

		$url = sprintf ( 'https://www.google.%s/', $countryTld );
		$referer = $ref == '' ? $url : ($ref != 'ncr' ? $url . $ref : $ref);
		$url .= $path;

		// Randomize the user agent to avoid Google ban
		$userAgents = array (
				"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/42.0",
				"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/40.0",
				"Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/45.0",
				"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/44.0",
				"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
				"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2227.1 Safari/537.36",
				"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.1944.0 Safari/537.36",
				"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
				"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
				"Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
				"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
				"Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
				"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
				"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
				"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
				"Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)"
		);
		$ua = $userAgents [array_rand ( $userAgents )];

		// Format the request header array
		$header = array (
				'Host: www.google.' . $countryTld,
				'Connection: keep-alive',
				'Cache-Control: max-age=0',
				'User-Agent: ' . $ua,
				'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
				'Referer: ' . $referer,
				'Accept-Language: ' . $acceptLanguage,
				'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7'
		);

		$ch = curl_init ( $url );
		// NOTE(review): peer verification is disabled, the TLS certificate of the remote host is not validated
		curl_setopt ( $ch, CURLOPT_SSL_VERIFYPEER, 0 );
		curl_setopt ( $ch, CURLOPT_HEADER, true );
		// Automatic redirects are enabled only when open_basedir is not in effect
		if (! ini_get ( 'open_basedir' )) {
			curl_setopt ( $ch, CURLOPT_FOLLOWLOCATION, 1 );
		}
		curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, 1 );
		curl_setopt ( $ch, CURLOPT_HTTPHEADER, $header );
		curl_setopt ( $ch, CURLOPT_USERAGENT, $ua );
		curl_setopt ( $ch, CURLOPT_CONNECTTIMEOUT, 30 );
		$result = curl_exec ( $ch );

		$httpStatus = ( int ) curl_getinfo ( $ch, CURLINFO_HTTP_CODE );
		curl_close ( $ch );

		// If it's a redirection (3XX) follow the redirect
		if ($httpStatus >= 300 && $httpStatus < 400 && is_string ( $result )) {
			$redirectionURI = null;
			foreach ( explode ( "\n", $result ) as $headerLine ) {
				// Header names are case insensitive and must start the line
				if (stripos ( $headerLine, 'Location:' ) !== 0) {
					continue;
				}

				$redirectURI = parse_url ( trim ( substr ( $headerLine, strlen ( 'Location:' ) ) ) );
				if (is_array ( $redirectURI )) {
					// Rebuild the query string to have it safely encoded
					$queryArray = array ();
					if (isset ( $redirectURI ['query'] )) {
						parse_str ( $redirectURI ['query'], $queryArray );
					}
					$redirectPath = isset ( $redirectURI ['path'] ) ? trim ( $redirectURI ['path'], '/' ) : '';
					$redirectionURI = $redirectPath . '?' . http_build_query ( $queryArray );
				}
				break;
			}

			if ($redirectionURI && $redirectDepth < $maxRedirects) {
				$redirectDepth ++;
				$redirectedResult = static::gCurl ( $redirectionURI, $ref, $customHeaders );
				$redirectDepth --;
				return $redirectedResult;
			}
		}

		return ($httpStatus != 200) ? false : $result;
	}
	
	/**
	 * Returns array, containing detailed results parsed and formatted for any Google search SERP
	 *
	 * The given offset is divided by 10 and rounded down to get the page index,
	 * so an offset that is not a multiple of 10 still resolves to a whole page
	 * instead of producing a fractional start parameter in the request.
	 *
	 * @access public
	 * @param string $query The search query, it is raw url encoded here
	 * @param int $pageNumber The offset of the requested SERP (0, 10, 20, ...)
	 * @param array $customHeaders The custom headers for country and language to get SERP for
	 * @return array The content of the results collector
	 */
	public static function getSerps($query, $pageNumber = 0, $customHeaders = array()) {
		$q = rawurlencode ( $query );
		$result = new JMapSeostatsHelperArrayhandle ();

		// Whole page index, never a fraction
		$pageIndex = ( int ) floor ( $pageNumber / 10 );

		static::makeRequest ( $pageIndex, $q, $result, $customHeaders );

		return $result->toArray ();
	}
	
	/**
	 * Returns the "resultStats" regex matches (estimated indexed links) of the first SERP for a query
	 *
	 * The request is delegated to makeRequest for page 0, without custom headers
	 * and asking for the indexed count only.
	 * NOTE(review): the query is forwarded as is, it is not url encoded here - confirm that callers encode it.
	 *
	 * @access public
	 * @param string $query The search query
	 * @return mixed The value returned by makeRequest when only the indexed count is requested
	 */
	public static function getSerpsIndexedLinks($query) {
		$firstPage = 0;
		$unusedResult = array ();
		$noCustomHeaders = array ();

		return static::makeRequest ( $firstPage, $query, $unusedResult, $noCustomHeaders, true );
	}
	
	/**
	 * Returns the number of the SERP page where a domain is ranked for a keyword
	 *
	 * A single SERP holding up to $numResults results is requested, then the
	 * valid result links are scanned in order. The domain is searched, case
	 * insensitive, in the URL of each result and not in its headline text, so
	 * that a title merely mentioning the domain is not counted as a hit.
	 * The page is computed assuming 10 results per page.
	 *
	 * @access public
	 * @param string $query The search query, it is raw url encoded here
	 * @param string $domain The domain to look for in the result URLs
	 * @param int $pageNumber Used only to build the referer of the request
	 * @param int $numResults The number of results requested in the SERP
	 * @param array $customHeaders The custom headers for country and language to get SERP for
	 * @return int|null|boolean The 1-based page number, null if the domain is not found, false on captcha or empty SERP
	 */
	public static function getRankedPageKeyword($query, $domain, $pageNumber = 0, $numResults = 100, $customHeaders = array()) {
		$query = rawurlencode ( $query );
		$ref = 0 == $pageNumber ? 'ncr' : sprintf ( 'search?q=%s&hl=en&prmd=imvns&start=%s&num=%s&sa=N', $query, $pageNumber, $numResults );
		$pageSerp = sprintf ( 'search?q=%s&filter=0&start=0&num=%s', $query, $numResults );

		$curledSerp = static::gCurl ( $pageSerp, $ref, $customHeaders );

		// Found the captcha Google ban, return false
		if (preg_match ( "#answer[=|/]86640#i", $curledSerp )) {
			return false;
		}

		$matches = array ();
		// Get titles and links
		preg_match_all ( '#<h3 class="?r"?>(.*?)</h3>#', $curledSerp, $matches );

		// Nothing found, return false
		if (empty ( $matches [1] )) {
			// No [@id="rso"]/li/h3 on current page
			return false;
		}

		// An empty needle can never identify a domain
		$needle = ( string ) $domain;
		if ($needle === '') {
			return null;
		}

		$numSerpResult = 0;
		$pageSerpIndex = null;
		foreach ( $matches [1] as $link ) {
			$match = static::parseLink ( $link );
			if (! $match) {
				continue;
			}

			// Found a match in a SERP for this domain?
			if (stripos ( $match [1], $needle ) !== false) {
				$pageSerpIndex = intval ( $numSerpResult / 10 ) + 1;
				break;
			}

			$numSerpResult ++;
		}

		return $pageSerpIndex;
	}
}

Zerion Mini Shell 1.0