You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pdd-order-api/app/libs/tool/class.PageSpider.php

137 lines
4.1 KiB
PHP

<?php
/**
 * Fetches a batch of URLs through a \MultiRequest\Handler and collects one
 * result entry per URL.
 *
 * Each URL ends up in the content map either as
 * CommonTool::successResult('content', $body) or as
 * CommonTool::failResult($message). A failed request (or a response rejected
 * by the optional check callback) is pushed back onto the handler queue up to
 * MAX_RETRY_TIMES times before it is recorded as a failure.
 */
class PageSpider {
    /** Number of times a failing URL is re-queued before it is recorded as failed. */
    const MAX_RETRY_TIMES = 5;

    /** @var array url => result built by CommonTool::successResult()/failResult() */
    private $contentMap;

    /** @var array list of URLs to fetch on the next crawl() */
    private $urls;

    /** @var array md5(url) => number of retries already scheduled for that URL */
    private $urlTryTimesMap = array();

    /** @var \MultiRequest\Handler */
    private $mrHandler;

    /** @var array raw HTTP header lines sent with every request */
    private $headers;

    /** @var array cURL options (CURLOPT_* => value) applied to every request */
    private $curlOptions;

    /** Logger obtained from Zc::getLog('tool/page_spider_error'). */
    private $errorLog;

    /** Logger obtained from Zc::getLog('tool/page_spider_success'). */
    private $successLog;

    /** @var callable|null optional validator, invoked as ($url, $content) */
    private $checkSuccess = null;

    /**
     * One-shot helper: build a spider for $urls and crawl them immediately.
     *
     * @param string|array $urls       a single URL or a list of URLs
     * @param string|null  $refererUrl value for the Referer header
     * @return array url => result entry, see crawl()
     */
    public static function simpleCrawl($urls, $refererUrl) {
        $spider = new PageSpider($urls, $refererUrl);
        return $spider->crawl();
    }

    /**
     * @param string|array $urls             a single URL or a list of URLs
     * @param string|null  $refererUrl       Referer to send; when empty a jd.com default is used
     * @param string|null  $httpXFordwardFor when non-empty, sent as an X-FORWARDED-FOR header
     */
    public function __construct($urls = array(), $refererUrl = null, $httpXFordwardFor = null) {
        $this->errorLog = Zc::getLog('tool/page_spider_error');
        $this->successLog = Zc::getLog('tool/page_spider_success');
        $this->mrHandler = new \MultiRequest\Handler();
        $this->reset($urls);

        $headers = array(
            'Connection: keep-alive',
            'Cache-Control: max-age=0',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.8,de;q=0.6,en;q=0.4,es;q=0.2,nl;q=0.2,pt;q=0.2,ru;q=0.2,zh-TW;q=0.2,fr;q=0.2,ja;q=0.2',
        );
        $headers[] = !empty($refererUrl)
            ? ('Referer:' . $refererUrl)
            : 'Referer:http://www.jd.com/?utm_source=jd.com';
        if (!empty($httpXFordwardFor)) {
            $headers[] = 'X-FORWARDED-FOR: ' . $httpXFordwardFor;
        }
        $this->headers = $headers;

        $this->curlOptions = array(
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 60
        );
    }

    /**
     * Replace the header lines sent with every request.
     *
     * @param array $headers raw header lines, e.g. 'Accept: text/html'
     */
    public function setHeaders($headers) {
        $this->headers = $headers;
    }

    /**
     * Replace the cURL options applied to every request.
     *
     * @param array $curlOptions CURLOPT_* => value
     */
    public function setCurlOptions($curlOptions) {
        $this->curlOptions = $curlOptions;
    }

    /**
     * Fetch all configured URLs and return the collected results.
     *
     * @param string|array|null $urls         when non-empty, replaces the URL list and
     *                                        clears previous results and retry counters
     * @param callable|null     $checkSuccess optional validator called as ($url, $content);
     *                                        a falsy return sends the response down the
     *                                        failure/retry path
     * @return array url => CommonTool::successResult('content', $body) or
     *               CommonTool::failResult($message)
     */
    public function crawl($urls = null, $checkSuccess = null) {
        if (!empty($urls)) {
            $this->reset($urls);
        }
        $this->checkSuccess = $checkSuccess;

        $this->mrHandler->setConnectionsLimit(1000);
        // NOTE(review): headers and cURL options are registered both as handler
        // defaults and on every request below; kept as in the original, but this
        // may be redundant depending on how MultiRequest merges them - confirm.
        $this->mrHandler->requestsDefaults()->addHeaders($this->headers);
        $this->mrHandler->requestsDefaults()->addCurlOptions($this->curlOptions);

        foreach ($this->urls as $url) {
            $request = new \MultiRequest\Request($url);
            $request->addCurlOptions($this->curlOptions);
            $request->addHeaders($this->headers);
            $request->onSuccess(array($this, 'onRequestSuccess'));
            $request->onFailed(array($this, 'onRequestFailed'));
            $this->mrHandler->pushRequestToQueue($request);
        }

        try {
            $this->mrHandler->start();
        } catch (\Exception $ex) {
            // The handler aborted: every URL without a result yet is recorded as failed.
            $this->markRemainUrlsFail($ex);
        }
        return $this->contentMap;
    }

    /**
     * Record a failure result for every URL that has no entry in the content map.
     *
     * @param \Exception $ex exception whose message becomes the failure message
     */
    private function markRemainUrlsFail($ex) {
        foreach ($this->urls as $url) {
            if (!isset($this->contentMap[$url])) {
                $this->contentMap[$url] = CommonTool::failResult($ex->getMessage());
            }
        }
    }

    /**
     * Success callback registered on every request.
     *
     * Stores the response body for the URL, unless the check callback passed to
     * crawl() rejects it, in which case the request is handed to
     * onRequestFailed() so it goes through the same retry logic.
     *
     * @param \MultiRequest\Request $request
     * @param \MultiRequest\Handler $handler
     */
    public function onRequestSuccess(\MultiRequest\Request $request, \MultiRequest\Handler $handler) {
        $url = $request->getUrl();
        $content = $request->getContent();

        if ($this->checkSuccess && !call_user_func($this->checkSuccess, $url, $content)) {
            $this->errorLog->error('url[' . $url . '] check false, goto onRequestFailed');
            $ex = new \MultiRequest\Exception('checkSuccessContent fail');
            return $this->onRequestFailed($request, $ex, $handler);
        }

        $this->successLog->info("url[{$url}] request success");
        $this->contentMap[$url] = CommonTool::successResult('content', $content);
    }

    /**
     * Failure callback registered on every request.
     *
     * Re-queues the request while its retry counter is below MAX_RETRY_TIMES;
     * afterwards the failure message is stored as the result for the URL.
     *
     * @param \MultiRequest\Request   $request
     * @param \MultiRequest\Exception $ex
     * @param \MultiRequest\Handler   $handler
     */
    public function onRequestFailed(\MultiRequest\Request $request, \MultiRequest\Exception $ex, \MultiRequest\Handler $handler) {
        $url = $request->getUrl();
        $md5 = md5($url);

        // reset() empties the counter map, so the first failure of a URL finds no
        // entry. Seed it explicitly instead of reading an undefined array key,
        // which raises a notice/warning and logs an empty tryTimes value.
        if (!isset($this->urlTryTimesMap[$md5])) {
            $this->urlTryTimesMap[$md5] = 0;
        }

        if ($this->urlTryTimesMap[$md5] < self::MAX_RETRY_TIMES) {
            $this->errorLog->error('url[' . $url . '], msg: ' . $ex->getMessage() . ' tryTimes: ' . $this->urlTryTimesMap[$md5]);
            $this->mrHandler->pushRequestToQueue($request);
            $this->urlTryTimesMap[$md5]++;
            return;
        }

        $this->contentMap[$url] = CommonTool::failResult($ex->getMessage());
    }

    /**
     * Replace the URL list and clear collected results and retry counters.
     *
     * @param string|array|null $urls a single URL, a list of URLs, or empty for none
     */
    private function reset($urls) {
        if (empty($urls)) {
            $this->urls = array();
        } else {
            $this->urls = is_array($urls) ? $urls : array($urls);
        }
        $this->contentMap = array();
        $this->urlTryTimesMap = array();
    }
}