disk_free_space('/') / 1000000) {

  echo 'Disk quota reached.' . PHP_EOL;
  exit;
}

// Debug: execution timer and per-run counters, reported at the end of the script
$timeStart = microtime(true);

$httpRequestsTotal     = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;

$hostPagesProcessed    = 0;
$manifestsProcessed    = 0;
$hostPagesIndexed      = 0;
$manifestsAdded        = 0;
$hostPagesAdded        = 0;
$hostsAdded            = 0;
$hostPagesBanned       = 0;
$hostPagesSnapUrlAdded = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Everything written during this run is committed (or rolled back) as one transaction
$db->beginTransaction();

try {

  // Process manifests crawl queue
  foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

    $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update manifest index anyway, with the current time and http code
    $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

      continue;
    }

    // Skip processing without returned data
    $remoteManifest = $curl->getContent();

    if (!$remoteManifest) {

      continue;
    }

    // Skip processing on json encoding error
    $remoteManifest = json_decode($remoteManifest);

    if (!$remoteManifest) {

      continue;
    }

    // Skip processing on required fields missed
    // (empty() also covers the "hosts API not available" case)
    if (empty($remoteManifest->status) ||
        empty($remoteManifest->result->config->crawlUrlRegexp) ||
        empty($remoteManifest->result->api->version) ||
        empty($remoteManifest->result->api->hosts)) {

      continue;
    }

    // Skip processing on API version not compatible
    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

      continue;
    }

    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

      continue;
    }

    // Skip processing on host link does not match condition
    // preg_match returns 0 on "no match" and false on error only, so both must be rejected;
    // the value comes from remote JSON, so make sure it is a string first
    if (!is_string($remoteManifest->result->api->hosts) ||
        !preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

      continue;
    }

    // Begin hosts collection
    $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

      continue;
    }

    // Skip processing without returned data
    $remoteManifestHosts = $curl->getContent();

    if (!$remoteManifestHosts) {

      continue;
    }

    // Skip processing on json encoding error
    $remoteManifestHosts = json_decode($remoteManifestHosts);

    if (!$remoteManifestHosts) {

      continue;
    }

    // Skip processing on required fields missed or result is not a collection
    if (empty($remoteManifestHosts->status) ||
        empty($remoteManifestHosts->result) ||
        !(is_array($remoteManifestHosts->result) || is_object($remoteManifestHosts->result))) {

      continue;
    }

    // Begin hosts processing
    foreach ($remoteManifestHosts->result as $remoteManifestHost) {

      // Skip processing on required fields missed or malformed
      if (empty($remoteManifestHost->scheme) || !is_string($remoteManifestHost->scheme) ||
          empty($remoteManifestHost->name)   || !is_string($remoteManifestHost->name)) {

        continue;
      }

      // Port is optional
      $remoteManifestHostPort = !empty($remoteManifestHost->port) && is_scalar($remoteManifestHost->port)
                              ? $remoteManifestHost->port
                              : null;

      $hostURL = $remoteManifestHost->scheme . '://' .
                 $remoteManifestHost->name .
                 (null !== $remoteManifestHostPort ? ':' . $remoteManifestHostPort : '');

      // Validate formatted link
      if (!filter_var($hostURL, FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $hostURL)) {

        continue;
      }

      // Host already registered
      if ($db->getHost(crc32($hostURL))) {

        continue;
      }

      // Get robots.txt if exists
      $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

      // Update curl stats
      $httpRequestsTotal++;
      $httpRequestsSizeTotal += $curl->getSizeRequest();
      $httpDownloadSizeTotal += $curl->getSizeDownload();
      $httpRequestsTimeTotal += $curl->getTotalTime();

      // Accept the response only when it looks like a real robots.txt
      $hostRobotsContent = (string) $curl->getContent();

      if (200 == $curl->getCode() && false !== stripos($hostRobotsContent, 'user-agent:')) {
        $hostRobots = $hostRobotsContent;
      } else {
        $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
      }

      $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

      $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
      $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
      $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
      $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

      // Register the host using the fields of the current collection item
      $hostId = $db->addHost($remoteManifestHost->scheme,
                             $remoteManifestHost->name,
                             $remoteManifestHostPort,
                             crc32($hostURL),
                             time(),
                             null,
                             $hostPageLimit,
                             (string) $hostMetaOnly,
                             (string) $hostStatus,
                             (string) $hostNsfw,
                             $hostRobots,
                             $hostRobotsPostfix);

      // Add web root host page to make host visible in the crawl queue
      $db->addHostPage($hostId, crc32('/'), '/', time());

      // Increase counters
      $hostPagesAdded++;
      $hostsAdded++;
    }
  }

  // Process pages crawl queue
  foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

    // Build URL from the DB
    $queueHostPageURL = $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                        $queueHostPage->uri;

    // Init page request
    $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update page index anyway, with the current time and http code
    $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

    // Skip page processing non 200 code
    if (200 != $curl->getCode()) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Validate MIME content type, ban page if not available
    $contentType = $curl->getContentType();

    if (!$contentType) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    $hostPageMime = Filter::mime($contentType);

    $db->updateHostPageMime($queueHostPage->hostPageId, $hostPageMime, time());

    // Parse index MIME
    $hostPageIsDom  = false;
    $hostPageInMime = false;

    foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {

      // The response MIME must be allowed in settings
      if (false !== stripos($hostPageMime, Filter::mime($mime))) {

        $hostPageInMime = true;

        // Only a real HTML response is parsed as DOM
        $hostPageIsDom = false !== stripos($hostPageMime, 'text/html');

        break;
      }
    }

    // Ban page not in MIME list
    if (!$hostPageInMime) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Skip page processing without returned data
    $content = $curl->getContent();

    if (!$content) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Define variables
    $title        = null;
    $description  = null;
    $keywords     = null;
    $robots       = null;
    $yggoManifest = null;

    $dom                = null;
    $hostPageIsNoFollow = false;

    // Is DOM content
    if ($hostPageIsDom) {

      // Parse content (real-world HTML is rarely valid, parser warnings are expected)
      $dom = new DomDocument();

      @$dom->loadHTML($content);

      // Skip index page links without titles
      $titleNodes = $dom->getElementsByTagName('title');

      if ($titleNodes->length == 0) {

        $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

        continue;
      }

      $title = $titleNodes->item(0)->nodeValue;

      // Get optional page meta data
      foreach ($dom->getElementsByTagName('meta') as $meta) {

        $metaName    = $meta->getAttribute('name');
        $metaContent = $meta->getAttribute('content');

        if ($metaName === 'description') {

          $description = $metaContent;

        } else if ($metaName === 'keywords') {

          $keywords = $metaContent;

        } else if ($metaName === 'robots') {

          $robots = $metaContent;

        // Grab meta yggo:manifest link when available
        } else if ($metaName === 'yggo:manifest') {

          $yggoManifest = Filter::url($metaContent);
        }
      }

      // Robots directives are applied after the meta loop, so `continue` targets the page loop
      if (null !== $robots) {

        // Ban page with meta robots:noindex value
        if (false !== stripos($robots, 'noindex')) {

          $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

          continue;
        }

        // Page with meta robots:nofollow attribute is indexed, but its links are not followed
        $hostPageIsNoFollow = false !== stripos($robots, 'nofollow');
      }
    }

    // Add queued page description if not exists
    if ($title || $description || $keywords) {

      $db->addHostPageDescription($queueHostPage->hostPageId,
                                  $title ? Filter::pageTitle($title) : null,
                                  $description ? Filter::pageDescription($description) : null,
                                  $keywords ? Filter::pageKeywords($keywords) : null,
                                  $queueHostPage->crawlMetaOnly ? null : base64_encode($content),
                                  time());

      // Count the page as indexed once its description is stored
      $hostPagesIndexed++;
    }

    // Update manifest registry
    if (CRAWL_MANIFEST &&
        !empty($yggoManifest) &&
        filter_var($yggoManifest, FILTER_VALIDATE_URL) &&
        preg_match(CRAWL_URL_REGEXP, $yggoManifest)) {

      $yggoManifestCRC32 = crc32($yggoManifest);

      if (!$db->getManifest($yggoManifestCRC32)) {

        $db->addManifest($yggoManifestCRC32,
                         $yggoManifest,
                         (string) CRAWL_MANIFEST_DEFAULT_STATUS,
                         time());

        $manifestsAdded++;
      }
    }

    // Save local snap
    if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {

      foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {

        // MIME type allowed in settings
        if (false === stripos($hostPageMime, Filter::mime($mime))) {

          continue;
        }

        $crc32data = crc32($content);
        $crc32host = crc32(''); // WEBSITE_DOMAIN, use empty for this host

        // Create not duplicated data snaps only for each storage host
        if ($db->getHostPageSnapURL($queueHostPage->hostPageId, $crc32data, $crc32host)) {

          continue;
        }

        $time = time();

        // One directory level per digit of the page id
        $dir = chunk_split($queueHostPage->hostPageId, 1, '/');

        // Directory may already exist; permissions must be given as an octal literal
        @mkdir('../public/snap/hp/' . $dir, 0755, true);

        $zip = new ZipArchive();

        // Create new container
        if (true !== $zip->open('../public/snap/hp/' . $dir . $time . '.zip', ZipArchive::CREATE)) {

          continue;
        }

        // Insert compressed snap data, the file extension is taken from the MIME subtype
        $snapAdded = $zip->addFromString($queueHostPage->hostPageId . '.' . $time . '.' . preg_replace('|^[A-z-]+/([A-z-]+).*|ui', '$1', $hostPageMime),
                                         $content);

        // Always release the archive handle; the archive is written on close
        $snapSaved = $zip->close();

        if (true === $snapAdded && true === $snapSaved) {

          // Update DB registry
          $hostPagesSnapUrlAdded += $db->addHostPageSnapURL($queueHostPage->hostPageId,
                                                            $crc32data, // do not create duplicated content snaps
                                                            $crc32host, // multi host storage with same timestamp / crc32data
                                                            '/snap/hp/' . $dir . $time . '.zip', // public url
                                                            $time);

          break;
        }
      }
    }

    // Links are collected from parsed HTML only, and never from pages marked as nofollow
    if (!$hostPageIsDom || $hostPageIsNoFollow) {

      continue;
    }

    // Begin page links collection
    $links = [];

    // Collect image links
    foreach ($dom->getElementsByTagName('img') as $img) {

      $src      = $img->getAttribute('src');
      $alt      = $img->getAttribute('alt');
      $imgTitle = $img->getAttribute('title');

      // Skip images without src or alt attribute
      if (!$src || !$alt) {

        continue;
      }

      // Skip encoded content
      if (false !== stripos($src, 'data:')) {

        continue;
      }

      // Add link to queue
      $links[] = [
        'title'       => null,
        'description' => null,
        'keywords'    => Filter::pageKeywords($alt . ($imgTitle ? ',' . $imgTitle : '')),
        'data'        => null,
        'mime'        => null,
        'ref'         => $src,
      ];
    }

    // Collect media links
    // tag => MIME used when the type attribute is missing (null: the attribute is required)
    $mediaTags = [
      'source' => null,
      'video'  => 'video/*',
      'audio'  => 'audio/*',
    ];

    foreach ($mediaTags as $mediaTag => $mediaTypeDefault) {

      foreach ($dom->getElementsByTagName($mediaTag) as $media) {

        // Skip media without src attribute
        $src = $media->getAttribute('src');

        if (!$src) {

          continue;
        }

        // Skip media without type attribute when there is no default for the tag
        $type = $media->getAttribute('type');

        if (!$type) {

          if (null === $mediaTypeDefault) {

            continue;
          }

          $type = $mediaTypeDefault;
        }

        // Skip encoded content
        if (false !== stripos($src, 'data:')) {

          continue;
        }

        // Add link to queue
        $links[] = [
          'title'       => null,
          'description' => null,
          'keywords'    => null,
          'data'        => null,
          'mime'        => Filter::mime($type),
          'ref'         => $src,
        ];
      }
    }

    // Collect internal links from page content
    foreach ($dom->getElementsByTagName('a') as $a) {

      // Skip links without required attribute
      $href = $a->getAttribute('href');

      if (!$href) {

        continue;
      }

      // Get title attribute if available
      $linkTitle = $a->getAttribute('title');

      if (!$linkTitle) {

        $linkTitle = null;
      }

      // Skip anchor, javascript, mailto and x-raw-image links
      if (false !== stripos($href, '#') ||
          false !== stripos($href, 'javascript:') ||
          false !== stripos($href, 'mailto:') ||
          false !== stripos($href, 'x-raw-image:')) {

        continue;
      }

      // Add link to queue
      $links[] = [
        'title'       => null,
        'description' => null,
        'keywords'    => Filter::pageKeywords($linkTitle),
        'data'        => null,
        'mime'        => null,
        'ref'         => $href,
      ];
    }

    // Process links collected
    foreach ($links as $link) {

      // Make relative links absolute
      if (!parse_url($link['ref'], PHP_URL_HOST)) {

        $link['ref'] = $queueHostPage->scheme . '://' .
                       $queueHostPage->name .
                      ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                      '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
      }

      // Validate formatted link
      if (!filter_var($link['ref'], FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $link['ref'])) {

        continue;
      }

      // Parse formatted link
      $hostURL     = Parser::hostURL($link['ref']);
      $hostPageURI = Parser::uri($link['ref']);

      // Host exists
      if ($host = $db->getHost(crc32($hostURL->string))) {

        $hostStatus        = $host->status;
        $hostNsfw          = $host->nsfw;
        $hostPageLimit     = $host->crawlPageLimit;
        $hostMetaOnly      = $host->crawlMetaOnly;
        $hostId            = $host->hostId;
        $hostRobots        = $host->robots;
        $hostRobotsPostfix = $host->robotsPostfix;

      // Register new host
      } else {

        // Get robots.txt if exists
        $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Accept the response only when it looks like a real robots.txt
        $hostRobotsContent = (string) $curl->getContent();

        if (200 == $curl->getCode() && false !== stripos($hostRobotsContent, 'user-agent:')) {
          $hostRobots = $hostRobotsContent;
        } else {
          $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
        }

        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

        $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
        $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
        $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
        $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

        $hostId = $db->addHost($hostURL->scheme,
                               $hostURL->name,
                               $hostURL->port,
                               crc32($hostURL->string),
                               time(),
                               null,
                               $hostPageLimit,
                               (string) $hostMetaOnly,
                               (string) $hostStatus,
                               (string) $hostNsfw,
                               $hostRobots,
                               $hostRobotsPostfix);

        // Add web root host page to make host visible in the crawl queue
        $db->addHostPage($hostId, crc32('/'), '/', time());

        // Increase counters
        $hostPagesAdded++;
        $hostsAdded++;

        // When page is root, skip next operations (the root page has just been added)
        if ($hostPageURI->string == '/') {

          continue;
        }
      }

      // Init robots parser
      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

      // Save page info
      if ($hostStatus && // host enabled
          $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
          $hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit

        if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {

          $hostPageId = $hostPage->hostPageId;

        } else {

          $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());

          $db->addHostPageDescription($hostPageId,
                                      $link['title'],
                                      $link['description'],
                                      $link['keywords'],
                                      $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
                                      time(),
                                      null,
                                      null,
                                      null,
                                      $link['mime']);

          $hostPagesAdded++;
        }

        // Remember which queued page refers to the target page
        $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
      }
    }
  }

  $db->commit();

} catch(Exception $e) {

  var_dump($e);

  $db->rollBack();
}

// Debug
$executionTimeTotal = microtime(true) - $timeStart;

// NOTE(review): presumably Curl::getTotalTime() reports microseconds, converted to seconds here — confirm
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;

if (CRAWL_LOG_ENABLED) {

  $db->addCrawlerLog(time(),
                     $hostsAdded,
                     $hostPagesProcessed,
                     $hostPagesIndexed,
                     $hostPagesAdded,
                     $hostPagesSnapUrlAdded,
                     $hostPagesBanned,
                     $manifestsProcessed,
                     $manifestsAdded,
                     $httpRequestsTotal,
                     $httpRequestsSizeTotal,
                     $httpDownloadSizeTotal,
                     $httpRequestsTimeTotal,
                     $executionTimeTotal);
}

// Debug output
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages snaps added: ' . $hostPagesSnapUrlAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;
echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;