<?php
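
/*
 * Crawler queue worker, intended to be launched from crontab.
 *
 * Workflow:
 *  1. collect hosts from remote manifests in the manifest crawl queue;
 *  2. download queued host images and store/update their metadata;
 *  3. download queued host pages, index them and collect new links,
 *     images and manifest references.
 */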

// Prevent multiple crawler instances from running at once
$semaphore = sem_get(crc32('crontab.crawler'), 1);

if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another instance.' . PHP_EOL;
  exit;
}
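
// No explicit sem_release() is needed: sem_get() defaults to auto_release = 1,
// so the semaphore is released automatically when the process shuts down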

// Load system dependencies
// (__DIR__-based paths keep the requires working regardless of the cron working directory)
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');

// Check disk quota (disk_free_space() returns bytes; compare in megabytes)
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {

  echo 'Disk quota reached.' . PHP_EOL;
  exit;
}

// Runtime statistics
$timeStart = microtime(true);

$httpRequestsTotal     = 0;
$httpRequestsSizeTotal = 0;
$httpDownloadSizeTotal = 0;
$httpRequestsTimeTotal = 0;

$hostPagesProcessed    = 0;
$hostImagesProcessed   = 0;
$manifestsProcessed    = 0;
$hostPagesIndexed      = 0;
$hostImagesIndexed     = 0;
$manifestsAdded        = 0;
$hostPagesAdded        = 0;
$hostImagesAdded       = 0;
$hostsAdded            = 0;
$hostPagesBanned       = 0;
$hostImagesBanned      = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
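
// The whole crawl batch below runs in a single transaction,
// so an exception in any step rolls back every queue update at once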

$db->beginTransaction();

try {

  // Process manifests crawl queue
  foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

    $curl = new Curl($queueManifest->url);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update the manifest queue entry regardless of the result, with current time and HTTP code
    $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

    // Skip processing on non-200 response code
    if (200 != $curl->getCode()) {

      continue;
    }

    // Skip processing without returned data
    if (!$remoteManifest = $curl->getContent()) {

      continue;
    }

    // Skip processing on JSON decoding error
    if (!$remoteManifest = @json_decode($remoteManifest)) {

      continue;
    }

    // Skip processing when required fields are missing
    if (empty($remoteManifest->status) ||
        empty($remoteManifest->result->config->crawlUrlRegexp) ||
        empty($remoteManifest->result->api->version) ||
        empty($remoteManifest->result->api->hosts)) {

      continue;
    }
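
    // Expected manifest shape for the fields checked above (values illustrative):
    // {
    //   "status": true,
    //   "result": {
    //     "config": {"crawlUrlRegexp": "..."},
    //     "api":    {"version": "...", "hosts": "..."}
    //   }
    // }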

    // Skip processing on incompatible API version
    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

      continue;
    }

    // Skip processing on host API not available
    // (redundant safety net: emptiness is already checked above)
    if (!$remoteManifest->result->api->hosts) {

      continue;
    }

    // Skip processing when the remote crawlUrlRegexp differs from the local CRAWL_URL_REGEXP
    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

      continue;
    }

    // Skip processing when the hosts API link does not match the URL pattern
    // (preg_match() returns 0 on no match and false only on error, so test both)
    if (!preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

      continue;
    }

    // Begin hosts collection
    $curl = new Curl($remoteManifest->result->api->hosts);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Skip processing on non-200 response code
    if (200 != $curl->getCode()) {

      continue;
    }

    // Skip processing without returned data
    if (!$remoteManifestHosts = $curl->getContent()) {

      continue;
    }

    // Skip processing on JSON decoding error
    if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {

      continue;
    }

    // Skip processing when required fields are missing
    if (empty($remoteManifestHosts->status) ||
        empty($remoteManifestHosts->result)) {

      continue;
    }
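
    // Expected hosts API response shape (values illustrative):
    // {"status": true, "result": [{"scheme": "...", "name": "...", "port": ...}, ...]}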

    // Begin hosts processing
    foreach ($remoteManifestHosts->result as $remoteManifestHost) {

      // Skip processing when required fields are missing
      if (empty($remoteManifestHost->scheme) ||
          empty($remoteManifestHost->name)) {

        continue;
      }

      $hostURL = $remoteManifestHost->scheme . '://' .
                 $remoteManifestHost->name .
                (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : '');

      // Validate formatted link
      if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

        // Host exists
        if ($host = $db->getHost(crc32($hostURL))) {

          $hostStatus        = $host->status;
          $hostPageLimit     = $host->crawlPageLimit;
          $hostImageLimit    = $host->crawlImageLimit;
          $hostId            = $host->hostId;
          $hostRobots        = $host->robots;
          $hostRobotsPostfix = $host->robotsPostfix;

        // Register new host
        } else {

          // Get robots.txt if exists
          $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

          // Update curl stats
          $httpRequestsTotal++;
          $httpRequestsSizeTotal += $curl->getSizeRequest();
          $httpDownloadSizeTotal += $curl->getSizeDownload();
          $httpRequestsTimeTotal += $curl->getTotalTime();

          // Accept the response as robots.txt only when it looks like one
          // (contains a 'user-agent:' line); otherwise fall back to the default rules
          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
          } else {
            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
          }

          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

          $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
          $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
          $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

          $hostId        = $db->addHost($remoteManifestHost->scheme,
                                        $remoteManifestHost->name,
                                        $remoteManifestHost->port ?? null,
                                        crc32($hostURL),
                                        time(),
                                        null,
                                        $hostPageLimit,
                                        $hostImageLimit,
                                        (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                        (string) $hostStatus,
                                        $hostRobots,
                                        $hostRobotsPostfix);

          if ($hostId) {

            $hostsAdded++;

          } else {

            continue;
          }
        }

        // Init robots parser: stored host rules plus local postfix overrides, defaults as fallback
        $robots = new Robots(
          ($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL .
          ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)
        );

        // Save home page info
        // Until the page API is implemented, save at least the home page so the host can be crawled
        // @TODO
        if ($hostStatus && // host enabled
            $robots->uriAllowed('/') && // page allowed by robots.txt rules
            $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
            !$db->getHostPage($hostId, crc32('/'))) {  // page not exists

            if ($db->addHostPage($hostId, crc32('/'), '/', time())) {

              $hostPagesAdded++;
            }
        }
      }
    }
  }

  // Process images crawl queue
  foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {

    // Build URL from the DB
    $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : '') . $queueHostImage->uri;

    // Init image request
    $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update the image queue entry regardless of the result, with current time and HTTP code
    $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());

    // Skip image processing on non-200 response code
    if (200 != $curl->getCode()) {

      $db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    // Skip image processing when no MIME type is provided
    if (!$hostImageContentType = $curl->getContentType()) {

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    // Skip image processing when the MIME type is not in the configured whitelist
    // (CRAWL_IMAGE_MIME: comma-separated list, matched as substrings of the Content-Type header)
    $hostImageBanned = true;
    foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {

      if (false !== strpos($hostImageContentType, trim($mime))) {

        $hostImageBanned = false;
        break;
      }
    }

    if ($hostImageBanned) {

      $db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    // Convert remote image data to base64 string
    if (!$queueHostImage->crawlMetaOnly) {

      // Skip image processing without returned content
      if (!$hostImageContent = $curl->getContent()) {

        $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

        continue;
      }

      if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {

        $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

        continue;
      }

      if (!$hostImageBase64 = @base64_encode($hostImageContent)) {

        $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

        continue;
      }

      // Build a data: URI (the file extension is used as the MIME subtype; svg maps to svg+xml)
      $hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;

      // Set host image description:
      // at link-collection time only the meta (alt/title) was known, not the data;
      // re-save the latest description entry together with the data received by the cURL request
      if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {

        $db->setHostImageDescription($queueHostImage->hostImageId,
                                     crc32($hostImageData),
                                     $lastHostImageDescription->alt,
                                     $lastHostImageDescription->title,
                                     $hostImageData,
                                     time());
      }
    }

    $hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
                                               Filter::mime($hostImageContentType),
                                               time());
  }

  // Process pages crawl queue
  foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

    // Build URL from the DB
    $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : '') . $queueHostPage->uri;

    // Init page request
    $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update the page queue entry regardless of the result, with current time and HTTP code
    $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

    // Skip page processing on non-200 response code
    if (200 != $curl->getCode()) {

      $db->updateHostPageHttpCode($queueHostPage->hostPageId, $curl->getCode(), time());

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Skip page processing when no MIME type is provided
    if (!$contentType = $curl->getContentType()) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Skip page processing when the MIME type is not in the configured whitelist
    // (CRAWL_PAGE_MIME: comma-separated list, matched as substrings of the Content-Type header)
    $hostPageBanned = true;
    foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {

      if (false !== strpos($contentType, trim($mime))) {

        $hostPageBanned = false;
        break;
      }
    }

    if ($hostPageBanned) {

      $db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Skip page processing without returned data
    if (!$content = $curl->getContent()) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Parse page content (@ suppresses libxml warnings on malformed HTML)
    $dom = new DomDocument();

    @$dom->loadHTML($content);

    // Skip indexing pages without a <title> element
    $title = $dom->getElementsByTagName('title');

    if ($title->length == 0) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Get optional page meta data
    $metaDescription  = '';
    $metaKeywords     = '';
    $metaRobots       = '';
    $metaYggoManifest = '';

    foreach (@$dom->getElementsByTagName('meta') as $meta) {

      if (@$meta->getAttribute('name') == 'description') {
        $metaDescription = @$meta->getAttribute('content');
      }

      if (@$meta->getAttribute('name') == 'keywords') {
        $metaKeywords = @$meta->getAttribute('content');
      }

      if (@$meta->getAttribute('name') == 'robots') {
        $metaRobots = @$meta->getAttribute('content');
      }

      if (@$meta->getAttribute('name') == 'yggo:manifest') {
        $metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
      }
    }

    // Skip page indexing when meta robots contains a noindex directive
    if (false !== stripos($metaRobots, 'noindex')) {

      $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

      continue;
    }

    // Skip following page links when meta robots contains a nofollow directive
    if (false !== stripos($metaRobots, 'nofollow')) {

      continue;
    }

    // Update queued page
    $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
                                             Filter::mime($contentType),
                                             time());

    // Format page content
    $content = Filter::pageData($content);

    // Add queued page description if not exists
    $db->setHostPageDescription($queueHostPage->hostPageId,
                                crc32($content),
                                Filter::pageTitle($title->item(0)->nodeValue),
                                Filter::pageDescription($metaDescription),
                                Filter::pageKeywords($metaKeywords),
                                $queueHostPage->crawlMetaOnly ? null : $content,
                                time());

    // Update manifest registry
    if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

      $metaYggoManifestCRC32 = crc32($metaYggoManifest);

      if (!$db->getManifest($metaYggoManifestCRC32)) {

        $db->addManifest($metaYggoManifestCRC32,
                         $metaYggoManifest,
                         (string) CRAWL_MANIFEST_DEFAULT_STATUS,
                         time());

        $manifestsAdded++;
      }
    }

    // Collect page images (gated by the global default images limit, not the per-host limit)
    if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

      foreach (@$dom->getElementsByTagName('img') as $img) {

        // Skip images without src attribute
        if (!$imageSrc = @$img->getAttribute('src')) {

          continue;
        }

        // Skip images without alt attribute
        if (!$imageAlt = @$img->getAttribute('alt')) {

          continue;
        }

        if (!$imageTitle = @$img->getAttribute('title')) {
            $imageTitle = null;
        }

        // Prepend scheme://host to relative src links
        // (naive resolution: ./ and ../ segments are stripped, not resolved)
        if (!parse_url($imageSrc, PHP_URL_HOST)) {

          $imageSrc = $queueHostPage->scheme . '://' .
                      $queueHostPage->name .
                     ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                      '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
        }

        // Validate formatted src link
        if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

          // Parse formatted src link
          $hostImageURL = Parser::hostURL($imageSrc);
          $hostImageURI = Parser::uri($imageSrc);

          // Host exists
          if ($host = $db->getHost(crc32($hostImageURL->string))) {

            $hostStatus        = $host->status;
            $hostPageLimit     = $host->crawlPageLimit;
            $hostImageLimit    = $host->crawlImageLimit;
            $hostId            = $host->hostId;
            $hostRobots        = $host->robots;
            $hostRobotsPostfix = $host->robotsPostfix;

          // Register new host
          } else {

            // Get robots.txt if exists
            $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

            // Update curl stats
            $httpRequestsTotal++;
            $httpRequestsSizeTotal += $curl->getSizeRequest();
            $httpDownloadSizeTotal += $curl->getSizeDownload();
            $httpRequestsTimeTotal += $curl->getTotalTime();

            if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
              $hostRobots = $curl->getContent();
            } else {
              $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
            }

            $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

            $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
            $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
            $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
            $hostId        = $db->addHost($hostImageURL->scheme,
                                          $hostImageURL->name,
                                          $hostImageURL->port,
                                          crc32($hostImageURL->string),
                                          time(),
                                          null,
                                          $hostPageLimit,
                                          $hostImageLimit,
                                          (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                          (string) $hostStatus,
                                          $hostRobots,
                                          $hostRobotsPostfix);

            if ($hostId) {

              $hostsAdded++;

            } else {

              continue;
            }
          }

          // Init robots parser: stored host rules plus local postfix overrides, defaults as fallback
          $robots = new Robots(
            ($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL .
            ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)
          );

          // Save new image info
          $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));

          if (!$hostImageId && // image not exists
               $hostStatus && // host enabled
               $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
               $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit

            // Add host image
            if ($hostImageId = $db->addHostImage($hostId,
                                                 crc32($hostImageURI->string),
                                                 $hostImageURI->string,
                                                 time())) {

              $hostImagesAdded++;

            } else {

              continue;
            }
          }

          // Skip when the image neither exists nor could be added
          // (host disabled, URI disallowed by robots.txt or image limit reached)
          if (!$hostImageId) {

            continue;
          }

          // Add/update host image description
          $db->setHostImageDescription($hostImageId,
                                       null, // no data, download it in the crawler queue
                                       Filter::imageAlt($imageAlt),
                                       Filter::imageTitle($imageTitle),
                                       null,
                                       time());

          // Relate host image with host page was found
          $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);

          // Increase image rank when the image is hosted outside the current page's host
          if ($hostImageURL->scheme . '://' .
              $hostImageURL->name .
              ($hostImageURL->port ? ':' . $hostImageURL->port : '')
              !=
              $queueHostPage->scheme . '://' .
              $queueHostPage->name .
              ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

              $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
          }
        }
      }
    }

    // Collect links from page content
    foreach (@$dom->getElementsByTagName('a') as $a) {

      // Skip links without required attribute
      if (!$href = @$a->getAttribute('href')) {

        continue;
      }

      // Skip links that contain a fragment anchor
      if (false !== strpos($href, '#')) {

        continue;
      }

      // Skip javascript links
      if (false !== strpos($href, 'javascript:')) {

        continue;
      }

      // Skip mailto links
      if (false !== strpos($href, 'mailto:')) {

        continue;
      }

      // Skip x-raw-image links
      if (false !== strpos($href, 'x-raw-image:')) {

        continue;
      }

      // @TODO skip other apps

      // Prepend scheme://host to relative links found
      // (naive resolution: ./ and ../ segments are stripped, not resolved)
      if (!parse_url($href, PHP_URL_HOST)) {

        $href = $queueHostPage->scheme . '://' .
                $queueHostPage->name .
              ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
              '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
      }

      // Validate formatted link
      if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {

        // Parse formatted link
        $hostURL     = Parser::hostURL($href);
        $hostPageURI = Parser::uri($href);

        // Host exists
        if ($host = $db->getHost(crc32($hostURL->string))) {

          $hostStatus        = $host->status;
          $hostPageLimit     = $host->crawlPageLimit;
          $hostImageLimit    = $host->crawlImageLimit;
          $hostId            = $host->hostId;
          $hostRobots        = $host->robots;
          $hostRobotsPostfix = $host->robotsPostfix;

        // Register new host
        } else {

          // Get robots.txt if exists
          $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

          // Update curl stats
          $httpRequestsTotal++;
          $httpRequestsSizeTotal += $curl->getSizeRequest();
          $httpDownloadSizeTotal += $curl->getSizeDownload();
          $httpRequestsTimeTotal += $curl->getTotalTime();

          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
          } else {
            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
          }

          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

          $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
          $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
          $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
          $hostId        = $db->addHost($hostURL->scheme,
                                        $hostURL->name,
                                        $hostURL->port,
                                        crc32($hostURL->string),
                                        time(),
                                        null,
                                        $hostPageLimit,
                                        $hostImageLimit,
                                        (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                        (string) $hostStatus,
                                        $hostRobots,
                                        $hostRobotsPostfix);

          if ($hostId) {

            $hostsAdded++;

          } else {

            continue;
          }
        }

        // Init robots parser: stored host rules plus local postfix overrides, defaults as fallback
        $robots = new Robots(
          ($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL .
          ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)
        );

        // Save page info
        if ($hostStatus && // host enabled
            $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
            $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
            !$db->getHostPage($hostId, crc32($hostPageURI->string))) {  // page not exists

            if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {

              $hostPagesAdded++;
            }
        }

        // Increase page rank when the link points outside the current page's host
        if ($hostURL->scheme . '://' .
            $hostURL->name .
            ($hostURL->port ? ':' . $hostURL->port : '')
            !=
            $queueHostPage->scheme . '://' .
            $queueHostPage->name .
            ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

            $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
        }
      }
    }
  }

  $db->commit();

} catch (Exception $e) {

  var_dump($e);

  $db->rollBack();
}

// Totals
$executionTimeTotal    = microtime(true) - $timeStart;

// Convert the accumulated request time to seconds
// (assuming Curl::getTotalTime() reports microseconds, as with CURLINFO_TOTAL_TIME_T)
$httpRequestsTimeTotal = $httpRequestsTimeTotal / 1000000;

if (CRAWL_LOG_ENABLED) {

  $db->addCrawlerLog(time(),
                     $hostsAdded,
                     $hostPagesProcessed,
                     $hostPagesIndexed,
                     $hostPagesAdded,
                     $hostPagesBanned,
                     $hostImagesIndexed,
                     $hostImagesProcessed,
                     $hostImagesAdded,
                     $hostImagesBanned,
                     $manifestsProcessed,
                     $manifestsAdded,
                     $httpRequestsTotal,
                     $httpRequestsSizeTotal,
                     $httpDownloadSizeTotal,
                     $httpRequestsTimeTotal,
                     $executionTimeTotal);
}

// Debug output
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;

echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;

echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;

echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;

echo 'HTTP Requests total: ' . $httpRequestsTotal . PHP_EOL;
echo 'HTTP Requests total size: ' . $httpRequestsSizeTotal . PHP_EOL;
echo 'HTTP Download total size: ' . $httpDownloadSizeTotal . PHP_EOL;
echo 'HTTP Requests total time: ' . $httpRequestsTimeTotal . PHP_EOL;

echo 'Total time: ' . $executionTimeTotal . PHP_EOL . PHP_EOL;
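
// Example crontab entry to run this worker every 5 minutes (paths illustrative):
// */5 * * * * php /path/to/crontab/crawler.php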