disk_free_space('/') / 1000000) {
    // NOTE(review): the opening of this condition lies before the first visible line;
    // the fragment is kept token-for-token.
    echo 'Disk quota reached.' . PHP_EOL;
    exit;
}

// Debug: run statistics, written to the crawler log and echoed at the end of the script
$timeStart = microtime(true);

$httpRequestsTotal     = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;

$hostPagesProcessed  = 0;
$hostImagesProcessed = 0;
$manifestsProcessed  = 0;
$hostPagesIndexed    = 0;
$hostImagesIndexed   = 0;
$manifestsAdded      = 0;
$hostPagesAdded      = 0;
$hostImagesAdded     = 0;
$hostsAdded          = 0;
$hostPagesBanned     = 0;
$hostImagesBanned    = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// The three queues below are processed inside a single transaction:
// committed at the end of the try block, rolled back when an Exception is thrown
$db->beginTransaction();

try {

    // Process manifests crawl queue
    foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

        $curl = new Curl($queueManifest->url);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Update manifest index anyway, with the current time and http code
        $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

        // Skip processing non 200 code
        if (200 != $curl->getCode()) {
            continue;
        }

        // Skip processing without returned data
        if (!$remoteManifest = $curl->getContent()) {
            continue;
        }

        // Skip processing on json decoding error
        if (!$remoteManifest = @json_decode($remoteManifest)) {
            continue;
        }

        // Skip processing on required fields missed
        if (empty($remoteManifest->status) ||
            empty($remoteManifest->result->config->crawlUrlRegexp) ||
            empty($remoteManifest->result->api->version) ||
            empty($remoteManifest->result->api->hosts)) {
            continue;
        }

        // Skip processing on API version not compatible
        if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
            continue;
        }

        // Skip processing on host API not available
        if (!$remoteManifest->result->api->hosts) {
            continue;
        }

        // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
        if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
            continue;
        }

        // Skip processing on host link does not match condition.
        // preg_match() returns 0 on "no match" and false only on error, so both are rejected here.
        if (!preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
            continue;
        }

        // Begin hosts collection
        $curl = new Curl($remoteManifest->result->api->hosts);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Skip processing non 200 code
        if (200 != $curl->getCode()) {
            continue;
        }

        // Skip processing without returned data
        if (!$remoteManifestHosts = $curl->getContent()) {
            continue;
        }

        // Skip processing on json decoding error
        if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
            continue;
        }

        // Skip processing on required fields missed
        if (empty($remoteManifestHosts->status) ||
            empty($remoteManifestHosts->result)) {
            continue;
        }

        // Begin hosts processing
        foreach ($remoteManifestHosts->result as $remoteManifestHost) {

            // Skip processing on required fields missed
            if (empty($remoteManifestHost->scheme) ||
                empty($remoteManifestHost->name)) {
                continue;
            }

            // Port is optional in the remote record
            $hostURL = $remoteManifestHost->scheme . '://' .
                       $remoteManifestHost->name .
                       (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : '');

            // Validate formatted link
            if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

                // Host exists
                if ($host = $db->getHost(crc32($hostURL))) {

                    $hostStatus        = $host->status;
                    $hostPageLimit     = $host->crawlPageLimit;
                    $hostImageLimit    = $host->crawlImageLimit;
                    $hostId            = $host->hostId;
                    $hostRobots        = $host->robots;
                    $hostRobotsPostfix = $host->robotsPostfix;

                // Register new host
                } else {

                    // Get robots.txt if exists
                    $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

                    // Update curl stats
                    $httpRequestsTotal++;
                    $httpRequestsSizeTotal += $curl->getSizeRequest();
                    $httpDownloadSizeTotal += $curl->getSizeDownload();
                    $httpRequestsTimeTotal += $curl->getTotalTime();

                    // Accept the response only when it looks like a robots file
                    if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
                        $hostRobots = $curl->getContent();
                    } else {
                        $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
                    }

                    $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

                    $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
                    $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
                    $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

                    // Register the CURRENT host record (not the whole result collection).
                    // NOTE(review): null is passed for a missing port, presumably matching
                    // what Parser::hostURL() provides elsewhere in this script — confirm.
                    $hostId = $db->addHost($remoteManifestHost->scheme,
                                           $remoteManifestHost->name,
                                           !empty($remoteManifestHost->port) ? $remoteManifestHost->port : null,
                                           crc32($hostURL),
                                           time(),
                                           null,
                                           $hostPageLimit,
                                           $hostImageLimit,
                                           (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                           (string) $hostStatus,
                                           $hostRobots,
                                           $hostRobotsPostfix);

                    if ($hostId) {
                        $hostsAdded++;
                    } else {
                        continue;
                    }
                }

                // Init robots parser
                $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) .
                                     PHP_EOL .
                                     ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

                // Save home page info
                // Until page API not implemented, save at least home page to have ability to crawl
                // @TODO
                if ($hostStatus && // host enabled
                    $robots->uriAllowed('/') && // page allowed by robots.txt rules
                    $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
                    !$db->getHostPage($hostId, crc32('/'))) { // page not exists

                    if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
                        $hostPagesAdded++;
                    }
                }
            }
        }
    }

    // Process images crawl queue
    foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {

        // Build URL from the DB
        $queueHostImageURL = $queueHostImage->scheme . '://' .
                             $queueHostImage->name .
                             ($queueHostImage->port ? ':' . $queueHostImage->port : '') .
                             $queueHostImage->uri;

        // Init image request
        $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Update image index anyway, with the current time and http code
        $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());

        // Skip image processing non 200 code
        if (200 != $curl->getCode()) {

            $db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());

            $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

            continue;
        }

        // Skip image processing on MIME type not provided
        if (!$hostImageContentType = $curl->getContentType()) {

            $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

            continue;
        }

        // Skip image processing on MIME type not allowed in settings
        // (CRAWL_IMAGE_MIME is a comma-separated list; a substring match is enough)
        $hostImageBanned = true;

        foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {

            if (false !== strpos($hostImageContentType, trim($mime))) {

                $hostImageBanned = false;
                break;
            }
        }

        if ($hostImageBanned) {

            $db->updateHostImageMime($queueHostImage->hostImageId, $hostImageContentType, time());

            $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

            continue;
        }

        // Convert remote image data to base64 string
        if (!CRAWL_HOST_DEFAULT_META_ONLY) {

            // Skip image processing without returned content
            if (!$hostImageContent = $curl->getContent()) {

                $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

                continue;
            }

            // Skip image processing when the URL has no file extension
            if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {

                $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

                continue;
            }

            // Skip image processing on empty encoding result
            if (!$hostImageBase64 = @base64_encode($hostImageContent)) {

                $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

                continue;
            }

            // Data URI subtype is derived from the file extension
            $hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;

        } else {

            $hostImageData = null;
        }

        $hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
                                                   Filter::mime($hostImageContentType),
                                                   $hostImageData,
                                                   time());
    }

    // Process pages crawl queue
    foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

        // Build URL from the DB
        $queueHostPageURL = $queueHostPage->scheme . '://' .
                            $queueHostPage->name .
                            ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                            $queueHostPage->uri;

        // Init page request
        $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Update page index anyway, with the current time and http code
        $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

        // Skip page processing non 200 code
        if (200 != $curl->getCode()) {

            $db->updateHostPageHttpCode($queueHostPage->hostPageId, $curl->getCode(), time());

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Skip page processing on MIME type not provided
        if (!$contentType = $curl->getContentType()) {

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Skip page processing on MIME type not allowed in settings
        // (CRAWL_PAGE_MIME is a comma-separated list; a substring match is enough)
        $hostPageBanned = true;

        foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {

            if (false !== strpos($contentType, trim($mime))) {

                $hostPageBanned = false;
                break;
            }
        }

        if ($hostPageBanned) {

            $db->updateHostPageMime($queueHostPage->hostPageId, $contentType, time());

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Skip page processing without returned data
        if (!$content = $curl->getContent()) {

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Grab page content (parser warnings are suppressed)
        $dom = new DomDocument();

        @$dom->loadHTML($content);

        // Skip index page links without titles
        $title = @$dom->getElementsByTagName('title');

        if ($title->length == 0) {

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Get optional page meta data
        $metaDescription  = '';
        $metaKeywords     = '';
        $metaRobots       = '';
        $metaYggoManifest = '';

        foreach (@$dom->getElementsByTagName('meta') as $meta) {

            if (@$meta->getAttribute('name') == 'description') {
                $metaDescription = @$meta->getAttribute('content');
            }

            if (@$meta->getAttribute('name') == 'keywords') {
                $metaKeywords = @$meta->getAttribute('content');
            }

            if (@$meta->getAttribute('name') == 'robots') {
                $metaRobots = @$meta->getAttribute('content');
            }

            if (@$meta->getAttribute('name') == 'yggo:manifest') {
                $metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
            }
        }

        // Ban page with meta robots:noindex value and skip its processing
        if (false !== stripos($metaRobots, 'noindex')) {

            $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

            continue;
        }

        // Skip page links following by robots:nofollow attribute detected
        // NOTE(review): this also skips the index update, manifest registration and image
        // collection below, not only link following — confirm this is intended.
        if (false !== stripos($metaRobots, 'nofollow')) {

            continue;
        }

        // Update queued page data
        $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
                                                 Filter::pageTitle($title->item(0)->nodeValue),
                                                 Filter::pageDescription($metaDescription),
                                                 Filter::pageKeywords($metaKeywords),
                                                 Filter::mime($contentType),
                                                 CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content),
                                                 time());

        // Update manifest registry
        if (CRAWL_MANIFEST &&
            !empty($metaYggoManifest) &&
            filter_var($metaYggoManifest, FILTER_VALIDATE_URL) &&
            preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

            $metaYggoManifestCRC32 = crc32($metaYggoManifest);

            if (!$db->getManifest($metaYggoManifestCRC32)) {

                $db->addManifest($metaYggoManifestCRC32,
                                 $metaYggoManifest,
                                 (string) CRAWL_MANIFEST_DEFAULT_STATUS,
                                 time());

                $manifestsAdded++;
            }
        }

        // Collect page images
        if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

            foreach (@$dom->getElementsByTagName('img') as $img) {

                // Skip images without src attribute
                if (!$imageSrc = @$img->getAttribute('src')) {
                    continue;
                }

                // Skip images without alt attribute
                if (!$imageAlt = @$img->getAttribute('alt')) {
                    continue;
                }

                // Title is optional
                if (!$imageTitle = @$img->getAttribute('title')) {
                    $imageTitle = null;
                }

                // Add domain to the relative src links
                if (!parse_url($imageSrc, PHP_URL_HOST)) {

                    $imageSrc = $queueHostPage->scheme . '://' .
                                $queueHostPage->name .
                                ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                                '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
                }

                // Validate formatted src link
                if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

                    // Parse formatted src link
                    $hostImageURL = Parser::hostURL($imageSrc);
                    $hostImageURI = Parser::uri($imageSrc);

                    // Host exists
                    if ($host = $db->getHost(crc32($hostImageURL->string))) {

                        $hostStatus        = $host->status;
                        $hostPageLimit     = $host->crawlPageLimit;
                        $hostImageLimit    = $host->crawlImageLimit;
                        $hostId            = $host->hostId;
                        $hostRobots        = $host->robots;
                        $hostRobotsPostfix = $host->robotsPostfix;

                    // Register new host
                    } else {

                        // Get robots.txt if exists
                        $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

                        // Update curl stats
                        $httpRequestsTotal++;
                        $httpRequestsSizeTotal += $curl->getSizeRequest();
                        $httpDownloadSizeTotal += $curl->getSizeDownload();
                        $httpRequestsTimeTotal += $curl->getTotalTime();

                        // Accept the response only when it looks like a robots file
                        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
                            $hostRobots = $curl->getContent();
                        } else {
                            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
                        }

                        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

                        $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
                        $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
                        $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

                        // The checksum must be built from the image host URL: it is the same
                        // key the getHost() lookup above uses
                        $hostId = $db->addHost($hostImageURL->scheme,
                                               $hostImageURL->name,
                                               $hostImageURL->port,
                                               crc32($hostImageURL->string),
                                               time(),
                                               null,
                                               $hostPageLimit,
                                               $hostImageLimit,
                                               (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                               (string) $hostStatus,
                                               $hostRobots,
                                               $hostRobotsPostfix);

                        if ($hostId) {
                            $hostsAdded++;
                        } else {
                            continue;
                        }
                    }

                    // Init robots parser
                    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) .
                                         PHP_EOL .
                                         ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

                    // Save new image info
                    $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));

                    if (!$hostImageId && // image not exists
                        $hostStatus && // host enabled
                        $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
                        $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit

                        // Add host image
                        if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time())) {
                            $hostImagesAdded++;
                        } else {
                            continue;
                        }
                    }

                    // Host image exists or created new one
                    if ($hostImageId) {

                        // Add/update host image description
                        $db->setHostImageDescription($hostImageId,
                                                     crc32(md5((string) $imageAlt . (string) $imageTitle)),
                                                     Filter::imageAlt($imageAlt),
                                                     Filter::imageTitle($imageTitle),
                                                     time(),
                                                     time());

                        // Relate host image with host page was found
                        $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1);
                    }

                    // Increase image rank when link does not match the current host
                    if ($hostImageURL->scheme . '://' .
                        $hostImageURL->name .
                        ($hostImageURL->port ? ':' . $hostImageURL->port : '')
                        !=
                        $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

                        $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
                    }
                }
            }
        }

        // Collect internal links from page content
        foreach (@$dom->getElementsByTagName('a') as $a) {

            // Skip links without required attribute
            if (!$href = @$a->getAttribute('href')) {
                continue;
            }

            // Skip anchor links
            if (false !== strpos($href, '#')) {
                continue;
            }

            // Skip javascript links
            if (false !== strpos($href, 'javascript:')) {
                continue;
            }

            // Skip mailto links
            if (false !== strpos($href, 'mailto:')) {
                continue;
            }

            // Skip x-raw-image links
            if (false !== strpos($href, 'x-raw-image:')) {
                continue;
            }

            // @TODO skip other apps

            // Add absolute URL prefixes to the relative links found
            if (!parse_url($href, PHP_URL_HOST)) {

                $href = $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                        '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
            }

            // Validate formatted link
            if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {

                // Parse formatted link
                $hostURL     = Parser::hostURL($href);
                $hostPageURI = Parser::uri($href);

                // Host exists
                if ($host = $db->getHost(crc32($hostURL->string))) {

                    $hostStatus        = $host->status;
                    $hostPageLimit     = $host->crawlPageLimit;
                    $hostImageLimit    = $host->crawlImageLimit;
                    $hostId            = $host->hostId;
                    $hostRobots        = $host->robots;
                    $hostRobotsPostfix = $host->robotsPostfix;

                // Register new host
                } else {

                    // Get robots.txt if exists
                    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

                    // Update curl stats
                    $httpRequestsTotal++;
                    $httpRequestsSizeTotal += $curl->getSizeRequest();
                    $httpDownloadSizeTotal += $curl->getSizeDownload();
                    $httpRequestsTimeTotal += $curl->getTotalTime();

                    // Accept the response only when it looks like a robots file
                    if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
                        $hostRobots = $curl->getContent();
                    } else {
                        $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
                    }

                    $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

                    $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
                    $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
                    $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

                    $hostId = $db->addHost($hostURL->scheme,
                                           $hostURL->name,
                                           $hostURL->port,
                                           crc32($hostURL->string),
                                           time(),
                                           null,
                                           $hostPageLimit,
                                           $hostImageLimit,
                                           (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                           (string) $hostStatus,
                                           $hostRobots,
                                           $hostRobotsPostfix);

                    if ($hostId) {
                        $hostsAdded++;
                    } else {
                        continue;
                    }
                }

                // Init robots parser
                $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) .
                                     PHP_EOL .
                                     ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

                // Save page info
                if ($hostStatus && // host enabled
                    $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
                    $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
                    !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists

                    if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
                        $hostPagesAdded++;
                    }
                }

                // Increase page rank when link does not match the current host
                if ($hostURL->scheme . '://' .
                    $hostURL->name .
                    ($hostURL->port ? ':' . $hostURL->port : '')
                    !=
                    $queueHostPage->scheme . '://' .
                    $queueHostPage->name .
                    ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

                    $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
                }
            }
        }
    }

    $db->commit();

} catch (Exception $e) {

    var_dump($e);

    $db->rollBack();
}

// Debug
$executionTimeTotal    = microtime(true) - $timeStart;
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;

if (CRAWL_LOG_ENABLED) {

    $db->addCrawlerLog(time(),
                       $hostsAdded,
                       $hostPagesProcessed,
                       $hostPagesIndexed,
                       $hostPagesAdded,
                       $hostPagesBanned,
                       $hostImagesIndexed,
                       $hostImagesProcessed,
                       $hostImagesAdded,
                       $hostImagesBanned,
                       $manifestsProcessed,
                       $manifestsAdded,
                       $httpRequestsTotal,
                       $httpRequestsSizeTotal,
                       $httpDownloadSizeTotal,
                       $httpRequestsTimeTotal,
                       $executionTimeTotal);
}

// Debug output
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;

echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;

echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;

echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;

echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;

echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;