|
|
@ -29,8 +29,10 @@ $timeStart = microtime(true); |
|
|
|
|
|
|
|
|
|
|
|
$hostPagesProcessed = 0; |
|
|
|
$hostPagesProcessed = 0; |
|
|
|
$hostImagesProcessed = 0; |
|
|
|
$hostImagesProcessed = 0; |
|
|
|
|
|
|
|
$manifestsProcessed = 0; |
|
|
|
$hostPagesIndexed = 0; |
|
|
|
$hostPagesIndexed = 0; |
|
|
|
$hostImagesIndexed = 0; |
|
|
|
$hostImagesIndexed = 0; |
|
|
|
|
|
|
|
$manifestsIndexed = 0; |
|
|
|
$hostPagesAdded = 0; |
|
|
|
$hostPagesAdded = 0; |
|
|
|
$hostImagesAdded = 0; |
|
|
|
$hostImagesAdded = 0; |
|
|
|
$hostsAdded = 0; |
|
|
|
$hostsAdded = 0; |
|
|
@ -38,8 +40,187 @@ $hostsAdded = 0; |
|
|
|
// Connect database |
|
|
|
// Connect database |
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
|
|
|
|
|
|
|
|
|
// Process images crawl queue |
|
|
|
$db->beginTransaction(); |
|
|
|
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { |
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Process manifests crawl queue
//
// For every queued manifest: download it, validate it against the local
// crawler configuration, download the hosts list it points to and, for each
// listed host, register the host when it is not known yet and make sure its
// home page ("/") is queued for crawling.
//
// The manifest and the hosts list are remote, untrusted JSON documents, so
// every field is type-checked before it is concatenated or passed to PCRE.
//
// NOTE(review): $manifestsIndexed is never updated inside this loop - confirm
// whether it should be incremented once a manifest passes validation.
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

    $curl = new Curl($queueManifest->url);

    // Update manifest index anyway, with the current time and http code
    $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

        continue;
    }

    // Skip processing without returned data
    if (!$remoteManifest = $curl->getContent()) {

        continue;
    }

    // Skip processing on json decoding error
    // (json_decode reports invalid input by returning null, no "@" needed)
    if (!$remoteManifest = json_decode($remoteManifest)) {

        continue;
    }

    // Skip processing on required fields missed
    // (empty() also covers a missing or falsy api->hosts value,
    //  so no separate "host API not available" check is required)
    if (empty($remoteManifest->status) ||
        empty($remoteManifest->result->config->crawlUrlRegexp) ||
        empty($remoteManifest->result->api->version) ||
        empty($remoteManifest->result->api->hosts)) {

        continue;
    }

    // Skip processing on API version not compatible
    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

        continue;
    }

    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

        continue;
    }

    // Skip processing on host link does not match condition
    // preg_match() returns 0 for "no match" and false only on a PCRE error,
    // so anything except 1 is treated as a mismatch
    $remoteManifestHostsURL = $remoteManifest->result->api->hosts;

    if (!is_string($remoteManifestHostsURL) ||
        1 !== preg_match(CRAWL_URL_REGEXP, $remoteManifestHostsURL)) {

        continue;
    }

    // Begin hosts collection
    $curl = new Curl($remoteManifestHostsURL);

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

        continue;
    }

    // Skip processing without returned data
    if (!$remoteManifestHosts = $curl->getContent()) {

        continue;
    }

    // Skip processing on json decoding error
    if (!$remoteManifestHosts = json_decode($remoteManifestHosts)) {

        continue;
    }

    // Skip processing on required fields missed or hosts list not iterable
    if (empty($remoteManifestHosts->status) ||
        empty($remoteManifestHosts->result) ||
        (!is_array($remoteManifestHosts->result) && !is_object($remoteManifestHosts->result))) {

        continue;
    }

    // Begin hosts processing
    foreach ($remoteManifestHosts->result as $remoteManifestHost) {

        // Skip processing on required fields missed or having unexpected types
        // (a non-string value would raise an Error on concatenation below)
        if (empty($remoteManifestHost->scheme) ||
            empty($remoteManifestHost->name) ||
            !is_string($remoteManifestHost->scheme) ||
            !is_string($remoteManifestHost->name) ||
            (isset($remoteManifestHost->port) && !is_scalar($remoteManifestHost->port))) {

            continue;
        }

        // Port is optional, null means "not provided"
        $hostPort = !empty($remoteManifestHost->port) ? $remoteManifestHost->port : null;

        $hostURL = $remoteManifestHost->scheme . '://' .
                   $remoteManifestHost->name .
                   (null !== $hostPort ? ':' . $hostPort : '');

        // Validate formatted link
        if (!filter_var($hostURL, FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $hostURL)) {

            continue;
        }

        // Host exists
        if ($host = $db->getHost(crc32($hostURL))) {

            $hostStatus        = $host->status;
            $hostPageLimit     = $host->crawlPageLimit;
            $hostImageLimit    = $host->crawlImageLimit;
            $hostId            = $host->hostId;
            $hostRobots        = $host->robots;
            $hostRobotsPostfix = $host->robotsPostfix;

        // Register new host
        } else {

            // Get robots.txt if exists
            $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

            $hostRobotsContent = $curl->getContent();

            // Accept the response only when it looks like a robots.txt file
            if (200 == $curl->getCode() && false !== stripos((string) $hostRobotsContent, 'user-agent:')) {
                $hostRobots = $hostRobotsContent;
            } else {
                $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
            }

            $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

            $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
            $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
            $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

            // Register the host currently iterated: scheme, name and port belong
            // to $remoteManifestHost, not to the $remoteManifestHosts->result list
            $hostId = $db->addHost($remoteManifestHost->scheme,
                                   $remoteManifestHost->name,
                                   $hostPort,
                                   crc32($hostURL),
                                   time(),
                                   null,
                                   $hostPageLimit,
                                   $hostImageLimit,
                                   (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                   (string) $hostStatus,
                                   $hostRobots,
                                   $hostRobotsPostfix);

            if ($hostId) {

                $hostsAdded++;

            } else {

                // Host was not registered, nothing to attach the home page to
                continue;
            }
        }

        // Init robots parser
        // Falls back to the default / postfix rules when the host has none stored
        $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

        // Save home page info
        // Until page API not implemented, save at least home page to have ability to crawl
        // @TODO
        if ($hostStatus && // host enabled
            $robots->uriAllowed('/') && // page allowed by robots.txt rules
            $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
            !$db->getHostPage($hostId, crc32('/'))) { // page not exists

            if ($db->addHostPage($hostId, crc32('/'), '/', time())) {

                $hostPagesAdded++;
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Process images crawl queue |
|
|
|
|
|
|
|
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { |
|
|
|
|
|
|
|
|
|
|
|
// Build URL from the DB |
|
|
|
// Build URL from the DB |
|
|
|
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; |
|
|
|
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; |
|
|
@ -77,10 +258,10 @@ foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SEC |
|
|
|
|
|
|
|
|
|
|
|
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); |
|
|
|
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Process pages crawl queue |
|
|
|
// Process pages crawl queue |
|
|
|
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { |
|
|
|
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { |
|
|
|
|
|
|
|
|
|
|
|
// Build URL from the DB |
|
|
|
// Build URL from the DB |
|
|
|
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; |
|
|
|
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; |
|
|
@ -204,10 +385,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
// Validate formatted src link |
|
|
|
// Validate formatted src link |
|
|
|
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { |
|
|
|
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { |
|
|
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Parse formatted src link |
|
|
|
// Parse formatted src link |
|
|
|
$hostImageURL = Parser::hostURL($imageSrc); |
|
|
|
$hostImageURL = Parser::hostURL($imageSrc); |
|
|
|
$hostImageURI = Parser::uri($imageSrc); |
|
|
|
$hostImageURI = Parser::uri($imageSrc); |
|
|
@ -310,15 +487,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
|
|
|
|
|
|
|
|
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); |
|
|
|
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} catch(Exception $e) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
$db->rollBack(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
@ -370,10 +538,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
// Validate formatted link |
|
|
|
// Validate formatted link |
|
|
|
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { |
|
|
|
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { |
|
|
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Parse formatted link |
|
|
|
// Parse formatted link |
|
|
|
$hostURL = Parser::hostURL($href); |
|
|
|
$hostURL = Parser::hostURL($href); |
|
|
|
$hostPageURI = Parser::uri($href); |
|
|
|
$hostPageURI = Parser::uri($href); |
|
|
@ -454,17 +618,17 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND |
|
|
|
|
|
|
|
|
|
|
|
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1); |
|
|
|
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
|
|
} catch(Exception $e){ |
|
|
|
} catch(Exception $e) { |
|
|
|
|
|
|
|
|
|
|
|
var_dump($e); |
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
|
|
$db->rollBack(); |
|
|
|
$db->rollBack(); |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Debug |
|
|
|
// Debug |
|
|
@ -474,5 +638,7 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; |
|
|
|
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; |
|
|
|
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; |
|
|
|
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; |
|
|
|
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; |
|
|
|
echo 'Images added: ' . $hostImagesAdded . PHP_EOL; |
|
|
|
echo 'Images added: ' . $hostImagesAdded . PHP_EOL; |
|
|
|
|
|
|
|
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; |
|
|
|
|
|
|
|
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL; |
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; |
|
|
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; |
|
|
|
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; |
|
|
|
// Print elapsed wall-clock time in seconds.
// The subtraction is parenthesised explicitly: before PHP 8 "." and "-" share
// the same precedence, so the string would be concatenated first and then
// subtracted from, which prints a wrong value.
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL . PHP_EOL;
|
|
|