You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
137 lines
4.1 KiB
PHP
137 lines
4.1 KiB
PHP
<?php
|
|
/**
 * Fetches a batch of URLs through the MultiRequest library and collects one
 * result per URL.
 *
 * A failed request (transport failure, or a response rejected by the optional
 * $checkSuccess callback) is put back on the queue up to MAX_TRY_TIMES times
 * before the URL is recorded as failed.
 *
 * Results are built with CommonTool::successResult() / CommonTool::failResult();
 * their exact shape is defined by CommonTool, not by this class.
 */
class PageSpider {

    /** How many times a failing request is re-queued before the URL is given up on. */
    const MAX_TRY_TIMES = 5;

    /** Connection limit handed to the MultiRequest handler on every crawl. */
    const CONNECTIONS_LIMIT = 1000;

    /** @var array Results of the current crawl, keyed by URL. */
    private $contentMap;

    /** @var array URLs of the current crawl. */
    private $urls;

    /** @var array Retry counters of the current crawl, keyed by md5(url). */
    private $urlTryTimesMap = array();

    /** @var \MultiRequest\Handler */
    private $mrHandler;

    /** @var array Raw HTTP header lines sent with every request. */
    private $headers;

    /** @var array cURL options (CURLOPT_* => value) applied to every request. */
    private $curlOptions;

    /** Logger obtained from Zc::getLog('tool/page_spider_error'). */
    private $errorLog;

    /** Logger obtained from Zc::getLog('tool/page_spider_success'). */
    private $successLog;

    /** @var callable|null Optional content validator, see crawl(). */
    private $checkSuccess = null;

    /**
     * Convenience wrapper: build a spider with default settings and crawl once.
     *
     * @param string|array $urls       A single URL or a list of URLs.
     * @param string|null  $refererUrl Referer to send; a default one is used when empty.
     * @return array Results keyed by URL, see crawl().
     */
    public static function simpleCrawl($urls, $refererUrl) {
        $spider = new PageSpider($urls, $refererUrl);

        return $spider->crawl();
    }

    /**
     * @param string|array $urls             A single URL or a list of URLs (may be empty).
     * @param string|null  $refererUrl       Referer to send; a default one is used when empty.
     * @param string|null  $httpXFordwardFor Value for an X-FORWARDED-FOR header; omitted when empty.
     */
    public function __construct($urls = array(), $refererUrl = null, $httpXFordwardFor = null) {
        $this->errorLog = Zc::getLog('tool/page_spider_error');
        $this->successLog = Zc::getLog('tool/page_spider_success');
        $this->mrHandler = new \MultiRequest\Handler();

        $this->reset($urls);

        $headers = array(
            'Connection: keep-alive',
            'Cache-Control: max-age=0',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.8,de;q=0.6,en;q=0.4,es;q=0.2,nl;q=0.2,pt;q=0.2,ru;q=0.2,zh-TW;q=0.2,fr;q=0.2,ja;q=0.2',
        );
        if (!empty($refererUrl)) {
            $headers[] = 'Referer:' . $refererUrl;
        } else {
            $headers[] = 'Referer:http://www.jd.com/?utm_source=jd.com';
        }
        if (!empty($httpXFordwardFor)) {
            $headers[] = 'X-FORWARDED-FOR: ' . $httpXFordwardFor;
        }
        $this->headers = $headers;

        $this->curlOptions = array(
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 60,
        );
    }

    /**
     * Replace the header lines sent with every request.
     *
     * @param array $headers Raw header lines, e.g. 'Accept: text/html'.
     */
    public function setHeaders($headers) {
        $this->headers = $headers;
    }

    /**
     * Replace the cURL options applied to every request.
     *
     * @param array $curlOptions CURLOPT_* => value.
     */
    public function setCurlOptions($curlOptions) {
        $this->curlOptions = $curlOptions;
    }

    /**
     * Fetch all URLs and return one result per URL.
     *
     * @param string|array|null $urls         URLs to fetch; when empty, the URLs given
     *                                        earlier (constructor or previous crawl) are used.
     * @param callable|null     $checkSuccess Optional callback invoked as ($url, $content)
     *                                        for every successful response; a falsy return
     *                                        value makes the response count as a failure
     *                                        (and therefore eligible for retry).
     * @return array Map of URL => CommonTool::successResult('content', $content) or
     *               CommonTool::failResult($message).
     */
    public function crawl($urls = null, $checkSuccess = null) {
        // Always start from clean per-run state. Without this, a second crawl()
        // over the same URLs would inherit exhausted retry counters and stale
        // results (which markRemainUrlsFail() would then refuse to overwrite).
        $this->reset(empty($urls) ? $this->urls : $urls);
        $this->checkSuccess = $checkSuccess;

        $this->mrHandler->setConnectionsLimit(self::CONNECTIONS_LIMIT);
        // NOTE(review): headers and cURL options are registered both as handler
        // defaults and on each request below, exactly as before. Whether the
        // library de-duplicates them is not visible from here -- confirm.
        $this->mrHandler->requestsDefaults()->addHeaders($this->headers);
        $this->mrHandler->requestsDefaults()->addCurlOptions($this->curlOptions);

        foreach ($this->urls as $url) {
            $request = new \MultiRequest\Request($url);
            $request->addCurlOptions($this->curlOptions);
            $request->addHeaders($this->headers);
            $request->onSuccess(array($this, 'onRequestSuccess'));
            $request->onFailed(array($this, 'onRequestFailed'));
            $this->mrHandler->pushRequestToQueue($request);
        }

        try {
            $this->mrHandler->start();
        } catch (\Exception $ex) {
            // The handler aborted; every URL that has no result yet fails with
            // the handler's error message.
            $this->markRemainUrlsFail($ex);
        }

        return $this->contentMap;
    }

    /**
     * Record a failure for every URL of the current crawl that has no result yet.
     *
     * @param \Exception $ex Its message becomes the failure message.
     */
    private function markRemainUrlsFail($ex) {
        foreach ($this->urls as $url) {
            if (!isset($this->contentMap[$url])) {
                $this->contentMap[$url] = CommonTool::failResult($ex->getMessage());
            }
        }
    }

    /**
     * MultiRequest success callback (public only because the library calls it).
     *
     * Stores the response body as the URL's result, unless the $checkSuccess
     * callback rejects it, in which case the request is handed to
     * onRequestFailed() and goes through the normal retry path.
     */
    public function onRequestSuccess(\MultiRequest\Request $request, \MultiRequest\Handler $handler) {
        $url = $request->getUrl();
        $content = $request->getContent();

        if ($this->checkSuccess) {
            $accepted = call_user_func_array($this->checkSuccess, array($url, $content));
            if (!$accepted) {
                $this->errorLog->error('url[' . $url . '] check false, goto onRequestFailed');

                $ex = new \MultiRequest\Exception('checkSuccessContent fail');
                return $this->onRequestFailed($request, $ex, $handler);
            }
        }

        $this->successLog->info("url[{$url}] request success");
        $this->contentMap[$url] = CommonTool::successResult('content', $content);
    }

    /**
     * MultiRequest failure callback (public only because the library calls it).
     *
     * Re-queues the request while the URL has been retried fewer than
     * MAX_TRY_TIMES times; after that the URL's result becomes a failure
     * carrying the exception message.
     */
    public function onRequestFailed(\MultiRequest\Request $request, \MultiRequest\Exception $ex, \MultiRequest\Handler $handler) {
        $url = $request->getUrl();
        $key = md5($url);

        // The counter does not exist before a URL's first failure; start it at
        // zero explicitly instead of relying on an undefined-index read.
        $tryTimes = isset($this->urlTryTimesMap[$key]) ? $this->urlTryTimesMap[$key] : 0;

        if ($tryTimes < self::MAX_TRY_TIMES) {
            $this->errorLog->error('url[' . $url . '], msg: ' . $ex->getMessage() . ' tryTimes: ' . $tryTimes);

            $this->mrHandler->pushRequestToQueue($request);
            $this->urlTryTimesMap[$key] = $tryTimes + 1;
            return;
        }

        $this->contentMap[$url] = CommonTool::failResult($ex->getMessage());
    }

    /**
     * Set the URL list for the next crawl and clear all per-run state.
     *
     * @param string|array|null $urls A single URL, a list of URLs, or an empty value.
     */
    private function reset($urls) {
        if (empty($urls)) {
            $this->urls = array();
        } else {
            $this->urls = is_array($urls) ? $urls : array($urls);
        }
        $this->contentMap = array();
        $this->urlTryTimesMap = array();
    }
}