From 5999fb3a73cadab646106d6ae5f6d5ec0d4c28f8 Mon Sep 17 00:00:00 2001 From: ghost Date: Fri, 5 May 2023 05:26:53 +0300 Subject: [PATCH] add distributed hosts crawling using yggo nodes manifest --- README.md | 2 +- config/app.php.txt | 44 +++- crontab/cleaner.php | 93 +++++++-- crontab/crawler.php | 498 +++++++++++++++++++++++++++++--------------- database/yggo.mwb | Bin 13925 -> 13968 bytes library/mysql.php | 51 +++++ 6 files changed, 500 insertions(+), 188 deletions(-) diff --git a/README.md b/README.md index 5163a3c..0d69957 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ GET m=SphinxQL * [ ] Implement database auto backup on crawl process completing * [x] Add transactions to prevent data loss on DB crashes * [x] JSON API -* [ ] Distributed index data sharing between the nodes trough service API +* [x] Distributed index data sharing between the nodes trough service API * [x] An idea to make unique gravatars for sites without favicons, because simpler to ident, comparing to ipv6 * [ ] An idea to make some visitors counters, like in good old times? diff --git a/config/app.php.txt b/config/app.php.txt index aba24d5..102caf6 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -121,8 +121,10 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500); * Usually up to 20 pages per minute, * to prevent websites overload by sending GET crawling requests * + * Set 0 to disable + * */ -define('CRAWL_PAGE_LIMIT', 10); +define('CRAWL_PAGE_LIMIT', 20); /* * Images (URI) processing limit in the crawler.php queue @@ -133,8 +135,27 @@ define('CRAWL_PAGE_LIMIT', 10); * Usually up to 20 pages per minute, * to prevent websites overload by sending GET crawling requests * + * Set 0 to disable + * + */ +define('CRAWL_IMAGE_LIMIT', 10); + +/* + * Manifest (URI) processing limit in the crawler.php queue + * + * Used to collect distributed data index + * that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION + * + * This option related to CRAWL_MANIFEST_SECONDS_OFFSET value + * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab) + * + * Usually up to 20 pages per minute, + * to prevent websites overload by sending GET crawling requests + * + * Set 0 to disable + * */ -define('CRAWL_IMAGE_LIMIT', 20); +define('CRAWL_MANIFEST_LIMIT', 10); /* * Renew page index by timing offset provided @@ -162,6 +183,19 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12); */ define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12); +/* + * Renew manifests index by timing offset provided + * + * This option works with CRAWL_MANIFEST_LIMIT step queue + * + * Pay attention, that CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair + * must have enough value to crawl all manifests collected in the DB index + * + * or the crawler can stuck in queue + * + */ +define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30); + /* * Only URL addresses match this rule will be auto-crawled * @@ -242,6 +276,12 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null */ define('CRAWL_MANIFEST', true); +/* + * Manifest API version compatibility + * + */ +define('CRAWL_MANIFEST_API_VERSION', 0.4); + /* * Set default auto-crawl status for new manifest added * diff --git a/crontab/cleaner.php b/crontab/cleaner.php index 3ba16fa..7b14b2a 100644 --- a/crontab/cleaner.php +++ b/crontab/cleaner.php @@ -22,29 +22,31 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $timeStart = microtime(true); $hostsTotal = $db->getTotalHosts(); +$manifestsTotal = $db->getTotalManifests(); $hostsUpdated = 0; $hostsPagesDeleted = 0; $hostsImagesDeleted = 0; +$manifestsDeleted = 0; -// Get host queue -foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { +// Begin update +$db->beginTransaction(); - // Parse host info - $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false); +try { - // Get robots.txt if exists - $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + // Get cleaner queue + foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { - if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { - $hostRobots = $curl->getContent(); - } else { - $hostRobots = null; - } + // Parse host info + $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false); - // Begin update - $db->beginTransaction(); + // Get robots.txt if exists + $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); - try { + if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { + $hostRobots = $curl->getContent(); + } else { + $hostRobots = null; + } // Update host data $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); @@ -118,15 +120,66 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS // Delete host image $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); } + } + + // Clean up deprecated manifests + foreach ($db->getManifests() as $manifest) { + + $delete = false; + + $curl = new Curl($manifest->url); + + // Skip processing non 200 code + if (200 != $curl->getCode()) { + + continue; // Wait for reconnect + } - $db->commit(); + // Skip processing without returned data + if (!$remoteManifest = $curl->getContent()) { - } catch(Exception $e){ + $delete = true; + } - var_dump($e); + // Skip processing on json encoding error + if (!$remoteManifest = @json_decode($remoteManifest)) { - $db->rollBack(); + $delete = true; + } + + // Skip processing on required fields missed + if (empty($remoteManifest->status) || + empty($remoteManifest->result->config->crawlUrlRegexp) || + empty($remoteManifest->result->api->version)) { + + $delete = true; + } + + // Skip processing on API version not compatible + if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { + + $delete = true; + } + + // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition + if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { + + $delete = true; + } + + if ($delete) { + + $manifestsDeleted += $db->deleteManifest($manifest->manifestId); + } } + + $db->commit(); + +} catch(Exception $e){ + + var_dump($e); + + $db->rollBack(); } // Debug @@ -134,4 +187,6 @@ echo 'Hosts total: ' . $hostsTotal . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL; echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL; -echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL; \ No newline at end of file +echo 'Manifests total: ' . $manifestsTotal . PHP_EOL; +echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL; +echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL; \ No newline at end of file diff --git a/crontab/crawler.php b/crontab/crawler.php index 791b9ca..aca05ff 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -29,8 +29,10 @@ $timeStart = microtime(true); $hostPagesProcessed = 0; $hostImagesProcessed = 0; +$manifestsProcessed = 0; $hostPagesIndexed = 0; $hostImagesIndexed = 0; +$manifestsIndexed = 0; $hostPagesAdded = 0; $hostImagesAdded = 0; $hostsAdded = 0; @@ -38,175 +40,350 @@ $hostsAdded = 0; // Connect database $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); -// Process images crawl queue -foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { +$db->beginTransaction(); - // Build URL from the DB - $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; +try { - $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT); + // Process manifests crawl queue + foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { - // Update image index anyway, with the current time and http code - $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode()); + $curl = new Curl($queueManifest->url); - // Skip next image processing non 200 code - if (200 != $curl->getCode()) { + // Update manifest index anyway, with the current time and http code + $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode()); - continue; - } + // Skip processing non 200 code + if (200 != $curl->getCode()) { - // Save image content on data settings enabled - if (!CRAWL_HOST_DEFAULT_META_ONLY) { + continue; + } - // Skip next image processing images without returned data - if (!$content = $curl->getContent()) { + // Skip processing without returned data + if (!$remoteManifest = $curl->getContent()) { continue; } - // Convert remote image data to base64 string to prevent direct URL call - if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { + // Skip processing on json encoding error + if (!$remoteManifest = @json_decode($remoteManifest)) { continue; } - if (!$hostImageBase64 = @base64_encode($curl->getContent())) { + // Skip processing on required fields missed + if (empty($remoteManifest->status) || + empty($remoteManifest->result->config->crawlUrlRegexp) || + empty($remoteManifest->result->api->version) || + empty($remoteManifest->result->api->hosts)) { continue; } - $hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); - } -} + // Skip processing on API version not compatible + if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { -// Process pages crawl queue -foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { + continue; + } - // Build URL from the DB - $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; + // Skip processing on host API not available + if (!$remoteManifest->result->api->hosts) { - $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT); + continue; + } - // Update page index anyway, with the current time and http code - $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); + // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition + if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { - // Skip next page processing non 200 code - if (200 != $curl->getCode()) { + continue; + } - continue; - } + // Skip processing on host link does not match condition + if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) { - // Skip next page processing pages without returned data - if (!$content = $curl->getContent()) { + continue; + } - continue; - } + // Begin hosts collection + $curl = new Curl($remoteManifest->result->api->hosts); - // Grab page content - $dom = new DomDocument(); + // Skip processing non 200 code + if (200 != $curl->getCode()) { - @$dom->loadHTML($content); + continue; + } - // Skip index page links without titles - $title = @$dom->getElementsByTagName('title'); + // Skip processing without returned data + if (!$remoteManifestHosts = $curl->getContent()) { - if ($title->length == 0) { - continue; - } + continue; + } - // Get optional page meta data - $metaDescription = ''; - $metaKeywords = ''; - $metaRobots = ''; - $metaYggoManifest = ''; + // Skip processing on json encoding error + if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) { + + continue; + } - foreach (@$dom->getElementsByTagName('meta') as $meta) { + // Skip processing on required fields missed + if (empty($remoteManifestHosts->status) || + empty($remoteManifestHosts->result)) { - if (@$meta->getAttribute('name') == 'description') { - $metaDescription = @$meta->getAttribute('content'); + continue; } - if (@$meta->getAttribute('name') == 'keywords') { - $metaKeywords = @$meta->getAttribute('content'); + // Begin hosts processing + foreach ($remoteManifestHosts->result as $remoteManifestHost) { + + // Skip processing on required fields missed + if (empty($remoteManifestHost->scheme) || + empty($remoteManifestHost->name)) { + + continue; + } + + $hostURL = $remoteManifestHost->scheme . '://' . + $remoteManifestHost->name . + (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); + + // Validate formatted link + if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) { + + // Host exists + if ($host = $db->getHost(crc32($hostURL))) { + + $hostStatus = $host->status; + $hostPageLimit = $host->crawlPageLimit; + $hostImageLimit = $host->crawlImageLimit; + $hostId = $host->hostId; + $hostRobots = $host->robots; + $hostRobotsPostfix = $host->robotsPostfix; + + // Register new host + } else { + + // Get robots.txt if exists + $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); + + if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { + $hostRobots = $curl->getContent(); + } else { + $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES; + } + + $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; + + $hostStatus = CRAWL_HOST_DEFAULT_STATUS; + $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT; + + $hostId = $db->addHost($remoteManifestHosts->result->scheme, + $remoteManifestHosts->result->name, + $remoteManifestHosts->result->port, + crc32($hostURL), + time(), + null, + $hostPageLimit, + $hostImageLimit, + (string) CRAWL_HOST_DEFAULT_META_ONLY, + (string) $hostStatus, + $hostRobots, + $hostRobotsPostfix); + + if ($hostId) { + + $hostsAdded++; + + } else { + + continue; + } + } + + // Init robots parser + $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); + + // Save home page info + // Until page API not implemented, save at least home page to have ability to crawl + // @TODO + if ($hostStatus && // host enabled + $robots->uriAllowed('/') && // page allowed by robots.txt rules + $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit + !$db->getHostPage($hostId, crc32('/'))) { // page not exists + + if ($db->addHostPage($hostId, crc32('/'), '/', time())) { + + $hostPagesAdded++; + } + } + } } + } + + // Process images crawl queue + foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) { + + // Build URL from the DB + $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri; + + $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT); - if (@$meta->getAttribute('name') == 'robots') { - $metaRobots = @$meta->getAttribute('content'); + // Update image index anyway, with the current time and http code + $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode()); + + // Skip next image processing non 200 code + if (200 != $curl->getCode()) { + + continue; } - if (@$meta->getAttribute('name') == 'yggo:manifest') { - $metaYggoManifest = Filter::url(@$meta->getAttribute('content')); + // Save image content on data settings enabled + if (!CRAWL_HOST_DEFAULT_META_ONLY) { + + // Skip next image processing images without returned data + if (!$content = $curl->getContent()) { + + continue; + } + + // Convert remote image data to base64 string to prevent direct URL call + if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { + + continue; + } + + if (!$hostImageBase64 = @base64_encode($curl->getContent())) { + + continue; + } + + $hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); } } - // Update queued page data - $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, - Filter::pageTitle($title->item(0)->nodeValue), - Filter::pageDescription($metaDescription), - Filter::pageKeywords($metaKeywords), - CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); + // Process pages crawl queue + foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { + + // Build URL from the DB + $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; + + $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT); - // Update manifest registry - if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { + // Update page index anyway, with the current time and http code + $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); - $metaYggoManifestCRC32 = crc32($metaYggoManifest); + // Skip next page processing non 200 code + if (200 != $curl->getCode()) { - if (!$db->getManifest($metaYggoManifestCRC32)) { - $db->addManifest($metaYggoManifestCRC32, - $metaYggoManifest, - (string) CRAWL_MANIFEST_DEFAULT_STATUS, - time()); + continue; } - } - // Append page with meta robots:noindex value to the robotsPostfix disallow list - if (false !== stripos($metaRobots, 'noindex')) { + // Skip next page processing pages without returned data + if (!$content = $curl->getContent()) { - continue; - } + continue; + } - // Skip page links following by robots:nofollow attribute detected - if (false !== stripos($metaRobots, 'nofollow')) { + // Grab page content + $dom = new DomDocument(); - continue; - } + @$dom->loadHTML($content); + + // Skip index page links without titles + $title = @$dom->getElementsByTagName('title'); - // Collect page images - if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { + if ($title->length == 0) { + continue; + } - foreach (@$dom->getElementsByTagName('img') as $img) { + // Get optional page meta data + $metaDescription = ''; + $metaKeywords = ''; + $metaRobots = ''; + $metaYggoManifest = ''; - // Skip images without src attribute - if (!$imageSrc = @$img->getAttribute('src')) { + foreach (@$dom->getElementsByTagName('meta') as $meta) { - continue; + if (@$meta->getAttribute('name') == 'description') { + $metaDescription = @$meta->getAttribute('content'); } - // Skip images without alt attribute - if (!$imageAlt = @$img->getAttribute('alt')) { + if (@$meta->getAttribute('name') == 'keywords') { + $metaKeywords = @$meta->getAttribute('content'); + } - continue; + if (@$meta->getAttribute('name') == 'robots') { + $metaRobots = @$meta->getAttribute('content'); } - if (!$imageTitle = @$img->getAttribute('title')) { - $imageTitle = null; + if (@$meta->getAttribute('name') == 'yggo:manifest') { + $metaYggoManifest = Filter::url(@$meta->getAttribute('content')); } + } - // Add domain to the relative src links - if (!parse_url($imageSrc, PHP_URL_HOST)) { + // Update queued page data + $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, + Filter::pageTitle($title->item(0)->nodeValue), + Filter::pageDescription($metaDescription), + Filter::pageKeywords($metaKeywords), + CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); - $imageSrc = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.'); + // Update manifest registry + if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { + + $metaYggoManifestCRC32 = crc32($metaYggoManifest); + + if (!$db->getManifest($metaYggoManifestCRC32)) { + $db->addManifest($metaYggoManifestCRC32, + $metaYggoManifest, + (string) CRAWL_MANIFEST_DEFAULT_STATUS, + time()); } + } - // Validate formatted src link - if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { + // Append page with meta robots:noindex value to the robotsPostfix disallow list + if (false !== stripos($metaRobots, 'noindex')) { - $db->beginTransaction(); + continue; + } + + // Skip page links following by robots:nofollow attribute detected + if (false !== stripos($metaRobots, 'nofollow')) { + + continue; + } + + // Collect page images + if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { + + foreach (@$dom->getElementsByTagName('img') as $img) { + + // Skip images without src attribute + if (!$imageSrc = @$img->getAttribute('src')) { + + continue; + } - try { + // Skip images without alt attribute + if (!$imageAlt = @$img->getAttribute('alt')) { + + continue; + } + + if (!$imageTitle = @$img->getAttribute('title')) { + $imageTitle = null; + } + + // Add domain to the relative src links + if (!parse_url($imageSrc, PHP_URL_HOST)) { + + $imageSrc = $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '') . + '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.'); + } + + // Validate formatted src link + if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { // Parse formatted src link $hostImageURL = Parser::hostURL($imageSrc); @@ -269,9 +446,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string)); if (!$hostImageId && // image not exists - $hostStatus && // host enabled - $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules - $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit + $hostStatus && // host enabled + $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules + $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit // Add host image if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) { @@ -289,11 +466,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Add/update host image description $db->setHostImageDescription($hostImageId, - crc32(md5((string) $imageAlt . (string) $imageTitle)), - Filter::imageAlt($imageAlt), - Filter::imageTitle($imageTitle), - time(), - time()); + crc32(md5((string) $imageAlt . (string) $imageTitle)), + Filter::imageAlt($imageAlt), + Filter::imageTitle($imageTitle), + time(), + time()); // Relate host image with host page was found $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1); @@ -302,77 +479,64 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND // Increase image rank when link does not match the current host if ($hostImageURL->scheme . '://' . $hostImageURL->name . - ($hostImageURL->port ? ':' . $hostImageURL->port : '') + ($hostImageURL->port ? ':' . $hostImageURL->port : '') != $queueHostPage->scheme . '://' . $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '')) { + ($queueHostPage->port ? ':' . $queueHostPage->port : '')) { $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); } - - $db->commit(); - - } catch(Exception $e) { - - var_dump($e); - - $db->rollBack(); } } } - } - - // Collect internal links from page content - foreach(@$dom->getElementsByTagName('a') as $a) { - // Skip links without required attribute - if (!$href = @$a->getAttribute('href')) { - - continue; - } + // Collect internal links from page content + foreach(@$dom->getElementsByTagName('a') as $a) { - // Skip anchor links - if (false !== strpos($href, '#')) { + // Skip links without required attribute + if (!$href = @$a->getAttribute('href')) { - continue; - } + continue; + } - // Skip javascript links - if (false !== strpos($href, 'javascript:')) { + // Skip anchor links + if (false !== strpos($href, '#')) { - continue; - } + continue; + } - // Skip mailto links - if (false !== strpos($href, 'mailto:')) { + // Skip javascript links + if (false !== strpos($href, 'javascript:')) { - continue; - } + continue; + } - // Skip x-raw-image links - if (false !== strpos($href, 'x-raw-image:')) { + // Skip mailto links + if (false !== strpos($href, 'mailto:')) { - continue; - } + continue; + } - // @TODO skip other apps + // Skip x-raw-image links + if (false !== strpos($href, 'x-raw-image:')) { - // Add absolute URL prefixes to the relative links found - if (!parse_url($href, PHP_URL_HOST)) { + continue; + } - $href = $queueHostPage->scheme . '://' . - $queueHostPage->name . - ($queueHostPage->port ? ':' . $queueHostPage->port : '') . - '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.'); - } + // @TODO skip other apps - // Validate formatted link - if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { + // Add absolute URL prefixes to the relative links found + if (!parse_url($href, PHP_URL_HOST)) { - $db->beginTransaction(); + $href = $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '') . + '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.'); + } - try { + // Validate formatted link + if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { // Parse formatted link $hostURL = Parser::hostURL($href); @@ -435,7 +599,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND if ($hostStatus && // host enabled $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit - !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists + !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { @@ -454,17 +618,17 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1); } + } + } + } - $db->commit(); + $db->commit(); - } catch(Exception $e){ +} catch(Exception $e) { - var_dump($e); + var_dump($e); - $db->rollBack(); - } - } - } + $db->rollBack(); } // Debug @@ -474,5 +638,7 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; echo 'Images added: ' . $hostImagesAdded . PHP_EOL; +echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL; +echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL; -echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; +echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb index 15ed8f42e9cd78b908e3a58f7705431e85e2d293..9fc68cb51ed13439555a21e324695257a1b6a2a6 100644 GIT binary patch literal 13968 zcmZ|01yCkIvn2|AxVt+HGPt|D+u-gn@Zs+64ucNv?(XjH?(Q%!ILqAs-q?K`Z@Z(T zIx4cFDmtq3oXk9>C<6dT2LS_t1_{9|R3nBFA8nuq0U2EdfFOf_fSA}CI|EH^oalkB zM)Yn#>r3xzl>_#s$G`71R9i@`uIIug&ndjWCOW|-g-t_9daPXp3QeSJI-J%C*7r z*U@!*(^QibEl>7DclsBzNfQ*b)E~5jMHj>G(@5&wzH6^!lQtsBaT%S{)TaCIrp8mb z^`E!4o}L0;{rfoU8T!c~pJoLs+vAbe;t_%13QsAna5SNm4|0!xjb8S5h& zKFNGWsF(YPDxId6rgzG9^+hw={7d0xq5Gxuv=xW2zXaW%71a2qnH#&ixKB2I-lO^u zg>iG>O~JY!aZ%I0dnAk|p7&3s`wl;$oGsPZ*js9Xcl_8@22Jh^eu$Rg>f|GJEA5)< zTsrj)7Brn^(rW&jkp?f{QbMg%&okonz=<>65%^{2M%)?)in0z05>Auv9 z^mHHYl5+=!LP&Sdl%;B-xLWqn{kMH2V$Y5rKkp~8n)-ol1mT^EJJ#h<{0VK z2R2^(0%>7w@S@nHArTJ~iezOio~?9KD=>!{)%a!N)u83IzSVu#XJ3Tu4ne zq)9@L4ZPlaG~>l>{XPdDo2p}+l}5_J*}{#^BT~N@k4V{s+y)47XACmclSocqE)op&|i%9rX0jpupI6c}$)up84~$H7_Qi zJPmhDqsvD9kc2Vs7+&%A+3?5H!`sD?P?)Kupq6>^Y=nW;9nz3@RfHBs<wz?kJ7q*V zWJW!WDLc4I<9_9!%LD-R1QfifI(t_)j7C;szrq>uB=NL^O~mVvKmszL z`oFp!&XbZ+FVi6zlz>yChy2I zAX(AtS;%?eLE(kX=6Z#8N!U&e1a@OPO}QkP5jT!s12rHiP&`h$3?O{hGqcMeOF3drb@?scAUevoE+a3pR2P;SDY} z9sr3xDI;tu`-ZNtCrm~RyNZX5%e~-FD0<)rL7Uo4E!yR302L+{e{P7y6d2+y(vOVI zdhrc6WF;q8NaPzUZ-zyPh<#vtXlP#Jy1tV{!j1S{yt``lsVcJyRFH9+Mb}ok9W)SL z#rW+8&+~~a60S%sCrgyuM8HR$1v3_PD@hFB^DU1goh=!@-U)p|K!&PaAAC4yD9ExT zMIX=mdrEAMQ>&Mcsvnr}-$P1f!dT_4X$~|~SWqCa>z5+A;_aQ4B^Wsa;(^dKWUidn z+G53vBAY=jZY|==1F`VA=roLs#d5ObjFnP>H#;&jW^F~1WA4K;k(krBwRkmf8fRV^ zKvwv~A7`YTT91MH(Ae?PJ8hj9IJU9D&u7cu3G3xg0u?e{Kh~ar3q@t2 zNaA=1@!ap?QaX#Ws!D5`&D{YC`2BA9SV+3~S`Qb$OZ9ZJy!qSSQhtWVN782PT;xU? zA3Ij4@R&)xEk{vbpO<(B&ob1@kHe80^Ohf5f}Z>3_Ay7H-a{RKYm^mN26~Gvf~DLYYZci}t-eCu|1P^g==i)i3Xi$WwXW_OxXdZ zY1}ogp&Ef}gP#jsX=laKfr@z^fs6IPZYMG%LKsFwhs!BoW~Sd~Mr2m0+!O2kzNO5> zDBzl5zFp_xXL9t4W}mU)>|o!riLRtA6EYn+lA#=-1B~ZD8@I6u>eEG_lZ%N2%ncUw)IHjw+nFl(lZx(y8M|2EEtW_t zkq?@|)Vdr(!RzZQrra)?k`=i$YM48^U^?0E>t*%EX=7k7G;nS!EWaRLG&Fm(RZ#N; zUWSW)`p1+(81k<9_d$GbPR-kR|p$qUxY9;#7bx>jIWN(A+A3 zJnnBXAei7yvB)MAo&?A}c|9i8@8B9yQP%3yf_6~IjQyjPkhS!nGyq|K8c}}7yzh-p z2zLm8XRzUdSvrl)HVv^i7ln0A6=J9EK2GF_E<)2MQ|-eu9iaC?rWBfB*oPP=M^?vJ5i0 z|7zz?K}7^`1r7PSmNJq`DA;fCh%m@}*t0XCkjPNrVGHbwDIeHR z_LuHb-1}oTQorMxdJ)zI=uVAw34GSMI(d0N9_j#o(v?^K;I@d}LgI1j&K2jA=cFwt zZ1E%2bA%(HUn*H12BD!>o#YG(OFH*IvXt ziYg0T>uanoZK#fJx_N927F(cm(bCp1DZ&F9=F@dB0e2(E>!&4BKF=!Xt({G8gLK>^XzkDXg20zC|mtM1}&5HNv zxSdEp0*_sP{e9xRuZvlT;Q87-PN(lRK2u-JZrJ-yn1it^zL+9Oj`(_0~ra}b;&{7eG%8>qjJQYS! z59n~y6GMTM+?lvV@0SmEQB`K9-oGiQx>X)*zyGk`DM!QyZ^r_9*VTp)HaW$`d z7J70Lvwi4^>(5+X&tQQ6f(w%`xX>lZE&^&1-muPGM2gACYoJpuDln`+{mBRkpa$}n z0e?p*^*LYdeeipXDfk(6FAg2`6kCu@j~l69rodtGY4BfjWg1&*Cgtv*RvsOha~2JA z)ECISxi!SQGS1|})&03TX3a2Id`iS>85xJR8-*rJq*tTYNWO<@iJ4*hwNEXlvs-5( z-x|-5g1sywZk8)7tN9=(6VfKPW-7ydbXnJlxL-s?gPnwaMn2V+$|)yquZ((3E~Vh3 z+^jPB9s_LM(;z>H#W-Gc{+-b}wKp4<*wiL#cPzib-43p`oyoS!!s(2Q zfaCx$-6DT=jTA#gy&!9?^s~dO(a&>#1#oW+kB5ZlzIeB^#Y$`^6V#m4&2D$|#avB{{vmrbT1p}$OV$pJOY;}Pfq}UReLF-_e zDnKs)@j29Y0N zHxhD1mN^eFTCT*Cw9}#lN@FEi`>Mi4kU`b_wn^t`(>v#WH;aEK&*${^%40+B+7>L7 z5ux=l(5`^|+Wpu-wgKz4tS zWbfG*k?%$y7~hoXIVmx#D7+h|wRCNn@QX=+A$|-7TTS_=q)|RG=6qQsU5l zOKia7E$_YMIN~vg4_P`Z+p6wBKghNb1TRkw39e?btEtc%2wMZ}2FVx!?a8F?p+-S^+v4RY<l}JQ^>fyN9yDV1pf36S~0&>xIi4g1~V>!hz1E zRzJU?_~k)uELwJZBJanDim^$V;KP9bo3?^a1dNat#`zsTgkTx%J=#y&U0JEGEr(#) zWMf4vu&iFPiTJ14xaIB!UyI&5ziqMUr*NEN|7@9GP_!7a@x%8(2FU^1^_%s&B;!KoB5j>N_&2wM#Ypa~mp21VsNo5o#|n~1i`b-klxCRDjug@M3a*Hg zpIqtjLmY`w1el32`K47qva9gD^Ceetrdb#%%O;~}KXUZZ`OaxBfHvT5vfyt?El+u;`_8!A3nA0@WRFJKkV_!}*iw z2E9!_5P=puE3Im{O~(9(vV_Nqhb*&7bxaNVA{+OT)E8!k8_&`QX}WeSt?GD)Z~|3P z)hhLN4J31%X^)Jw97IyUA`NHc^98yGWZk_%5TbuirngKTU& z0^}C)nS+{6@MY+9{3XBh80pgCHAK9jYidsf5(=u4wIKK&L~w)fQN34~ngu4XGd@7? zrs8(EbIElF{5klY!h0mm)Qw<&mmC~O@IzTk>#RvSukAMfK9Z$Nh5DcIf3;aE@{96PZH~ar)=JwRTvoai<^D` z`Zz9g8DN72di4wPuQR)wGQ>|eg$Z?Kk!*7wUak0F3YiAB^=F6u+6)vJ}1;wTgCo0)Iw|eD1nJ3+XxQ1qzJZu%xZjFQX$d^tsSdxK^0tU z?S1;Yin4Qe912@3=$JrPc(gv9ngRCPboYB$M-Sve|4%kcty;32O+o#-xxZGvUg%uk zI*)eI?a_F&${NoYZpM#EXjTp@E?(~$JSB#g&Go0)a+Vx+`^v&@^8eaHU2`OP_Q8Y( zH&o0QhAnRhheon3wI60_XJw_wl>!Y=y}$Ali_Xudr*kuR|2GkOxKsZhB1He)I7z_C z+=v;HM=5`~2Z>Ji4n+awJB2J@i$Th9;a4^O>6rRDx-FT<7=KqItHh<1{@Sv5y7B8A zKIykT7p-)WuV2X$k!-S-$uj1Dh!E|Rq4U8%B7}X)z9;y$)yUOgN{hF2gp5;*7Y9U> z3U-afYm566hqu&)_~lSqb{|^}|6;V$Ird*g>V@Fp+N}8FU4>Y$M-Jw(czkY2<-rk% zpuP&BeuKJ(d?Fb!E6`Og&!la_aBe)gPG8r(1V%bE(L(hM{4_j zNKK)i%l|=YjPi_HlS!&bUoe^f)DZnqbXu?pAW`0Ni=TWMM2=6sC$ODDCW~~)f`S*h zAQM3^F=||gSNe*pLoBfOdT{cjYja8FL>ioCwr2V>i$aA()c&IEAET&_4uEg#7rC~c zEz;uE)(1)S=0yfqkG2_X84=5K^9uG|MdrJYB8$PP$l4r-Mjt9DDcGwGe2XI}u6JIP z##K=dk2Q) zjsr@M(`!68I0iB9zz@2N(pHpCRv1pBvjoxFlYhrZrpU+)hha2U2BpQk0Gy-Oo@262 zxpc8$wSUt17GH*C};rcf6T2EPvtk@zNFTO#dV&l9M z2Kw7N?bv9)oV14pW}SP0iu?+oVddmR#wVx1x}Zc3 zt8VfWCpD{^N7-eQEjlVGQM~QrM2+?-8clerX;Z;$!04DD zHazmN9!U{6<_v>cue(mFo8XTx3;M~2Zz0D(BFPuphe^qrV@>_6^w&eg9^=Ja^!~21 zM_zO|nQ0Ul#`l}V$5L5`nH&gai6c)#?=?c&uS4%4=pawK{D2HW*pf#^ z^77kygt*OOufVqC{beFL6rV13S0bUaPN}EaG=tkFN!Tc~fE3Rg>rOAy93cgZvr%c3@Frlg;3=CcO_s2H2uy>Xb#sj8g=6XSQmJ>BWITV$@?Yies|fTf zUA4|p1cj9nx5l>e6A}^u1oXrShX4(T?Ie*;v?%5;zJa$P<3L~FfaAincNyng3&t8n zAabIj2Kobf1&!u?!^>JkFH^lkZBe(@xr`l746<&kvLLde9AI<{Zda^JL7MwfonU0QzACLAD88FiWYWnGEnlEcVI z+_+VhK=%SN`$)4&U)h?j-1I#yEQu1&XZFv-m?5m!`eLo~4iQtkr%sNcD-CH%S7l9l zQ!hI=xC@P(&r39)Jfqr2#}rPTQpJ3o*uF}jfn0-@3d^&m2>-33^C(Xsy5;jF+fka) zW+_s>QW>j5Az9wH1Hn>@7JAesKApcMP<+oN%OrcH5gO>Fvws&fmM7rMa zMGXn{d9?n=Rk*Ep$y8*LvEP`*=x2=2%*smTnk}Q4`HG~iWoe_EatO7b3GN>>`nbZ4 z`HIXwV;tpTiznw0;83LT76_JukT{Kzfd9&{_l$^7nbBgM9ZT;>yiOZM`A|^U&wm>N)0kxnGWWs$$d6KLI~I zn|{4g0foJmA?)<67e|Y*k#(o#Z1$KsD+oV4ch|LTj@R`~ZkydE#4l>4#ooEL)qmi4 z8akIsfISn!K)cq-d~+TeBgG(@TJ@s)$shjl@wdOb9D~x$%=%;R_UvhSep&+7R<&k& zfF`ryQA{OYfYAG;z_WRJZL+v5Ltd=t=lppbVtEZujfpNKlEfo~_Hm|jhBkK<_d*2$ z%`%qmdm-x0E(Zwz5fI}^V}WJ|4hrD4EFs{{t=iK$BbG7LSw62&JiP23$p1#48(Rk| zr2n9H9_JZYv=P(WquV1`=ME1KvqAwGLVkh)Ip-SdKMHB~LofKk=n#rt}g zGC$tsPwyU^^F^)fa`4ol{NN9+zz*srMd)E}*hx|h+STLmIz2}UtuRlZU|Qvv@3AN) zmW%I}GjE`*J8dXQvN|QgC_(Y@2OLH0P3q2c{oL@Z3HM5`tD2yu|1%}y)A&|F5YaMCa)c5>wLlUAuPUo!o2 z#cWJ(3tudgD3y{q-7Vtwf7B!^+_+Ut;}iMZTIWq?@yi(U&Y9EV7Hg!?5kd^7%g#Ds z#-!03M-89>6~tPn0j%)Cj# zy;AFOpFQpw?zb1&NxtG)gHx{PqmRtu@T!xcQoT~D%GHhRc=5osp>@$>YGCYBPlMQ4 zovyn-rOM`G1K&khA)1;+WY_!;qO@K08m=$D*mB!1xh|{gR`p};clp|BgiF0W%DIz? zsbV|Fx!IYMCAO9Eq9wPPhI+dObH>S-mB#Gl4DzYV)t=Q<9ah?o^(oEl8g!78&EVYT zeR86_oDA$I)%jN8Xnjgd*Pn=jSSA;2-IhIVV1sGzi)HY)--EoT#~lSaBNj7az$Eim z*K|61F=h3k(fMD3M`Ka9IPvv^jcUY>_L`J-$#9$YgTl(2>gCm!k5zIGhmvQaI_p1Y z+9ax`JEU}1n?e?5<*K4G^fvWG=N@?JZaO%*M(5gjrRVcTr^@Y&c4J=5@}=zE-d2*- zfu5N9xOgZ|VHK{HE_ii88~F0Jgm$($^J;7cucsX z*S`MjkWshQZy0Zdp@AVZ1>X?R=J{DDb9}^m_l7BZA1iD3)tZi0wyxZ;{cI`a5$NTy8|(PWd}^hDuc)2@ z;4E0$Ru02P_u16`egstM0OFgJ;FOuo2R?$818IPsaG-+pzUXac%Lt8->P$F3k@B{A zp(0~Ky#5(Hx4kCw1NXbLKi{G`fFIIp3(V`xFm^4Z6Z57PNf#ghyy*uUA6zK>E`6A} z8yWw@VE zCa7`3sO%#6RP=OTf@_Gm34hjsbGER*X@z$U=?vL*@(4+tB9ysQF>IVQj%+!vvB`Kw zYl*? zTZWY0r0|1V2JU__1|3ws@x{*0;ubwF=(UePe(u&f;h^}h%I_Hm&xmh4o!Nc62^gou zD3q)e6*4?6a1@-8pZ)TCUfe@S(XTLK0N+Kr@iFyS6D+IW_HebTmfM{>i2F$rL7!MT z**^SS3C;wG4=<>~8#hzkySbaUomIW>Xg+QmbKF%vdO_Nwl}9E62{rpr=DVEK2N#^Z_BRnVkfHFWchCSnij=fG5lVw@s!^jzW0Guyvd1fR@c_;uyb| zF)4%YVdAa^U#07%G>rc3IuuTLoz_a04>Al3i`vf zh+75vRJn1F5rM{ewZ$u&wky3yailt|H%d9mw{#KmJZ%ysI=dJRgX`1ZG97OJ2ksv7Nvn*w90Jw(joxQt@``N>8^om4e;ho0U&alW@uWYRi zRL4B!0=(Af{Mz605=q}>DIyKi*E(F z@7)>fRNGH=pf@(y4;RLqKjrXYJHaH$?>In-T8VeQtk^Re^*Hv1^reXqob*21pX5Lk z-)y#My}qI=7(LG%32*OO2x}?ePFy?W`k-YObH|VGQ-9BJK@t8p+N#=THI z^&K*rPF)l%J}~Z5qBdO)it&%q;uYpgPCzu(jv*S5`6?%j7}ao&KXYN12814gF1yn8 zHLc7Zd_B6HSw7x_&(tthAM1M1M-_+nLHF&korLw7Rim0&co>*`N#;>s!3i>N0OkBAR^-CB$#LxhA#D2M6oB5PgHu5dtuMPV6U;3s% zKaq%K!kwgj{-Gv3;uc6})r}ZUbPrIy#1%PCtX4H?d=YFEVHgct{hdtrd)*7>E>$>l z`TLcLQG^UC-T){v+!wodvbQ0^tGTEn*k)+=5Ujn&$CUEWyP|(}?FxUrZ&O4eAdn>5 z^JkXnqr-3iVpEW%O4E`6n~@R>(2E8R-A#UfdWV{Ay;db}F~61=F(vRN4t=&YG>$)M z=c)vLctnDh&q^i&bqgUQM?CRS#px=6BAbK>LwNy$#a5or8dH z!me3sAeHI3m>lLiaB&qRW{+CEmAdhSUl{CQI|2js{2q%e_zKbj)w2pX354L+NHz%{ z1T!a7o3MnDiQtSF_qH|45KXdiR;5^Tt6E)aiK?oBlXr_d6oDv(TR^6_~tJzUJL-|IJ2w)KQpYQ%qQ>8^c%wMx?i2 zti+SBzTfndhdRv^r+ao!rqH`E8YxmO1ebX{(nK!fFsCggZ+W1!sQk;?=8e0qx#`qtt&z_o=JQ|xm+vMCcNdEKos<$5mp|^D8{B4^QNOVXW-rpI!!cw7^C;lj7N-;e;qB6&Rx-r5EY+Dy0O z38Pwg7)^Ra2G``s43Bxe`3#9j)8JH=!khiTthxs}q`I?qmAje5>7-h|cPtqed=8oAk+5 zmDU)}l7<|eS03uA>w~Lw-9GN7<>d@pC6LVGl1qO^`SmAPb7q4ESA4kr5^ zuAyWu#=yyH*UzKHDP7L@t#CN?NLQ?COkbZJ|JaWAg|~|dul82gtASkcrn;-Q%F0-^ z{ff2iGP7H!jm{2^E4{L#y{^)hb#1vrzRA@uh4i+$y^OT92Z>Y5EO%;VlskNIw;!%yq)Y*NZ|HVf7t zbw_F+x-KS^Wo481l#}zS-X8{Ji^mKO-h}R<;ZF2+R0`kv!tUDmRLefIGhVvuo?paW z^L;i>In0VLwoBt59`fuPwE$OF)vx|i!4->~kL?9Ke>`K|?TBqv z{q?UDA|`)ZT5yVUy4yz}}5AVy**Hmu16&BABBPeMY*&M!_(!TTk0 zLS+gRL&lhuPs>#DVL-m9=W|CMF@f?3WQM zLC0O=U&6hXYFHEGU&_c$8Vc0!Mn{wp`A)d^36USgsgDE;x>NWNB55u3pXKp=n(Lb` zKH{JHtf(9(GQ<>m@q{C3a38QHrNH9NPKP4vp%uzXai;$5wJYq+-rg~&EU#|vb&CmP z0Sk1uOm%?v>XJku`Sv*Tb`c;Xi3@Sa_sFXOGj*)tfXNpXtk>-%v`&m{zKv(DphU)oBcA(XnhYY78` zla@?n*Gpcv9|&0#y*S%!2!9xEYHt9|sDKg4hlZSlbz6~EgFj{^mzsb*G`-DN)2ydJ z^yds~B0u*>C!29G6P%%02oEAE^PGLHQsY9D$gu)ze-IR1R(#6M^C{;q5<&M1>fr^A z6E6yM_h^*g%W9{7oJqxC)RCSKVX_|r)YR0vp9)`lb@$~{XX&?<55jInGY#Ie<@Beg zD)XhrVQc2fZZRbC(Jw z_uw-zA;IIhJB@DOjmQud6qv zNqleaJJ%G8V5)j34=yV6S$9c~JT$}ukB~)(G+|B$yxLlhl?wE8XJwxdOd(@Zv#BM# zc_Yyu3C4Z)+)n>4PfUtwexvMD2606%fSQ(kx s7+Vm_*_xPI6B{}hTUff75;M^=!UF&G&^a1gm;wzQxryD)&23@-7ch$B7ytkO literal 13925 zcmaL8b8sfX+wLFR$;RfBjkB?x4L7!Jn@?z?ZB zp803${#ki9R<;ORzf9m+FW8gY+o7L?>!%Ry{MQ8+F@`; zyBevpa0zAG5|OM5ErInp?pzIHH0SSCB+Gm_zz}+kHuP}7l)R4C>!nAb`40|`-_o-5 z#@UFS3BXuoa$ZLBee72;X=_$!KFl@n7Z`s)cMdokVhL|S^yx3o%MemQS~3pL#5W%t z?Hi7DYTt4c!V>Pg&D4>X@P7;8Z(!?%V}gAvG!cRA)Heo>g)^t6*Z9>8KtK?RG`#$l zZmNy$+X7;mSV4-wiMa8qBcH6Io`T0K4GH*)b2b*#Or z_rl81$+o%3-u6pBw-2e>8R2)O)rQZnqjw6~=c3BA_NVpV ztx4>Cjdb`&WwK(T^>tcjZdan@w?r{LEW&J9XPbQlVUFDJXp4R z!Qaa0&`8!Tw}vG{v~D)ZNZBg*CVKBDTG0P0bw9PX<_Nvq-~=s*0)^yvrTjysx#!*+ z3UgiCK5=}6+Wp2RnCEGX|AZvnY56^@4=eTYQ;JlUT%OvW06=jnw@ zhG`+Nr+;jI)|9a;0MB49f#=w@DXzNwZkifBvdWHNkRL|wO{yk?j>PYOkJWjCGys;p-wVsBgbG~-ZLv5L(;WQ zTz3d)Am}mMldKzmCS$P9!P9-98(tg)AuV1EQ{fx)ulhdOIgzv4kYf%K3HPv4jFJFB z+}-7-H@wgG6M#mYy^hw{fZ_IbUEc0Wz*EG)pdrt971_@HnD^_|{`==jEYreM+@ihk zyr!X%74W3gAba$R7>r6&5Ecmtc% zH?8GJC_2gA{0y5I9|i@g`tW7QD78JHvt?Jhz&z_7N5hqPoc1^DJBFxIFLOmzG3Yjp zJpqjn6XO#Xz87SJ;Z=Ri(+HOh;0X?)0|{EUQ-+Li`;NFQtD0oLN(Xz%Clql23EGej z24Ua<)lhx#-V7G2r2HRU zUH17@`eW08xRT{GgQb#XoJJ-{HT8`=4T=S$_6Io+0w{uz#Y{KoHVLW8trf#|jPsgj zPo_F1!BEW+%joFE5|x4RPX@eP5R|m5KT+9&9fHJ}=%{9~l_5ie+!MsQo3WL7$eJM? zJn*#&y*esP+~EHF2)$ikZA*VlJ)u6Jt_i1Hq<0oaAAx1N2yZw~ib0*DWq{p1K`c19 zR6G=H502>U^Q!=YJ|;bm$VUG@CMtZ-53bncMZW{*tQmV>Fa8cA`y+U|>qV!xqTRL5 zykXGpTzGG2Pyx3f0`Qn#p!jC`=n)m*r2$m*Hogj+(npq3q8Jq==nN4C@$Zlyp~zk+ z6id-62I(eu4Y-J*m_~I(kzj8|cD6d_IW=y_g z1lIVdvC{+T%J>vY;`AYr+xvw;;iC~o)`1w`{21TK-|;1wMUSi)MV}O-=FF1niU;;* z#}VJAt(-lJf*;ZgQN5F)2U#Rx=H(bVnO?N$i3E=wp3h1^Aj!EBGKbuFM%4H>WtDV0 zg#Hj~cO=S@%U$1SnuC!uAngfGLS@Tntki4VXIpe>u^JHEoeBmngz2E9ifL<-3Ra57 zJ@g9#mCTiqdt>{R;KckMFbzafe4^e) z#i?#d+Qe+iChCtve@^uD@30e|6!+c>u12)_iRycaCD)7i+K(t&(s0Sbj*)U4jE8H} z#AM^5d zGaIr2;IV>;toCRw^0Z>=kr-urH7&RI$s!keB7nygY^4YM$TUqn&kp6YTN!|g;lLKuKq4$&ABO` zC1A&^uP4^9-et!1i`FB)h>ztT=idXS`HruNH_ z$)?El4&DHCt(P_0tSo>Ae33lu``=a6UwnON$S_&n5b0!F1i-C;-Ji$^leZ{J5<-3G zu`N*-s8z^W@1WnE{^~K;(BHO=hu6S|H|{g}9BF17ueN|{0YZ)&U`S%$>pX1k8T$TM z*t*$Sy0~x=N4IyyRUGa77m0~Rc*#Hb43_)7yjAoi*=6LmAFM9)vq)&A%Cc!xF!i*- zv@$(bOKJ^MM!;4qYek&CtyqMV5*@!i?v{+-ppOV z(+mGnM1-aXG%ahru6f{Tg`Z_DEYEbqU!G+$Af(?p8AS204m4O5wwin(Le~S5-f8rY z8f|azZUTWW{6bgUz7~iw1eoVCl;^py(zzMdA$FL(+@%EVONyK7=pV)Tg7bfclzlw* z6fvfMP6}d>D1!H>!WCnoE7cFPKXE~L1w|XP5#6Kex(^~k>dGu|5ql|fD6X9s9ab@7 zt_q@PG?+RpZT}2`Tq@gmxV{Hma@2ZQ+R6Di%Bjo#stL7NA|y967k~)`!+t6HeqxPc zM+}3F3vosT7JL~2G|5+n21~lrKSe1rew^^JUh8H-Y6nSM{38t30mR)prMQBe%kzPH z+Vis&lJfeUV2C)J`GreAJs3kFQZNJ_Vlx3bM2KJJf;V|3 zB<0X>6bMMLD7@I>*~o~*NQZ5Cwkfyq?K}*_ToWKUpdzdSZ#3R}6p`?}W&jK8pK_r> z?4dyS9~^nN1SX&6zb0J)XZ);kO~wFx(=h^WkdQXPb$z+Mk&Et31JG?2<0vEW>z3v8 za-)KjPb%ha&vw@B&}QcEQA~@vPo5bGO7%XKmDxU*4C=DKQ5iD$ku`tmnicJ~nt5lB z5y#WdZGlG`>;QY|v-d>14^=g6?;Br4{B%0CZr>`h7^qdA7Kn&M!+9#)Ir6b0&X&FD zEEHc@N0xZ;-gQ?#>_kv-AAirSO^rIAVn^2x96>x8S~; zi9z-IGup##;Y`dHHh zqkln8xe8-i8MWX;I`)Okb@3dUyd3}v2fl^F*+bIodsDH!ewB4JDuzUBxZJ?%{Zwyk z-t3T^To-f%R5Cc8MpXpVf$06mIFgl$LxnT!;3eT;0I5RtDuTRT0bZ~f_D&a0PPlA0 z$}vfa3T@q#`EdC_QFJ{;9uvgdh3&6$rIA zfPa4834>0WskwA=dlU;hd8bX9BlfRRt#go<3Lz_kAtKAG!$!O!E|vNH>?s@Hr-^8H z0vi!4+e9B=o6D8S*Vkj>7to-Sb--dvu}&VNZAtlczef}qbJA1Rl6GGdhbv=4WBm=E zYI^vb_8g5r!wQ1{!Rg4Kr}aw`QK_+hU{KJ*6M|L%x(#RgM<~YO*+3UjTa@?Rn!+r{ zqz~`ZsHk-}iu_JdOc>Rbv%6vdm2dCI-?$0wg*ySgmWtiL2_r1k{ zLxejv!2x+4ZX8|PBndW{r&5HoNUOkL613!K=} zl_%Uxz(tDKa&vk zQ}@euXGo5Hf6Enb5HAA$2~aq7f^ro~_`}4xKyY31U-2^qt2G&Bs0XxT6gU#n>!+MO zBYLZj4)7ahL9F8wJVnY&d6S+i1mwhJ>?#WtKm}IvStgvKPi~#csW{LvRE17Qoo!0m z%-g#WKUuI)LkCvEQSG-9=n-NL7+AG}`a>Cp+%{mW#6=(uJX~5+pK?qxt&H*Zc56== zFWh{u{bS1%^}W_a`M`qzzNa`Cv6%T2u$Ykr?GSf=VI%e7+{HFJ1q9y7qZM7()16x? z2AjMZn~6jVo;l&*#--SdOeHh3h|$-S69g0s_cA(N<^x+4-uy;I2Wv7!RpUB>SykW*b!)DJvNJArKo5VF~d{NPsSs zwr7;uQ-|M;1i}aVPJS)#x}N5r0&Sdx7nOCsd`Qjil4ft4y6WMT@4&mPzagjsAjV|} zzh?yt3A=#rO$C4~VRR85r>bRy2(1Y_9C0p1fa@h(zWhWIMG8M-;ru6s@o^o@5-uMM z_NQ*ZgusQ7Uj>m_0h>6^=(#pI4T`Gp%WNl<++{X1*0tI4>`W(=V(WcHX>|cGj3<=f z%Iw7n!obJw>pbXS%k!9);WmW^d3O0>;DX1m+7?(uS}bvTegU*+=7{=tMfPf*pXWYB zX?>XFE?TI{%{pH%Yt@hhav(A9dw8fx0$uUh_OGN<{0V5v3!Zdj z1I^l8`!hgLaJl9Vt>GF$Y6}cvT#`knDEqpsKJ_rK<9VS>5XUY^$4-Qnyunn#wDe41O%rz4zhvPgj6>`odyYI2PX~HZ zpRtqGphA9v9T_B>9EKN5e&>Q;nq>Ree`ylbyal?S<5GGYcvyKC(#DwO4C%A?dF)=# zAi2pLTUAm3cljUt%FBB%t1(0HRw(Xj<&-j`rTmlOIW}1g>W{DG{Cwf*@jAD5l9jx? z5nLK@jVBC`qX#5($Mcbo@BNDbzo^&`bs1Q)7HG7cA8yxPn*ttH6FhsU1A?SAS|5Uf zkgzOiy+?T(Ie1w~6j!^*UnuuaanEjMXQPBh3keKj6rlMUUn6#YdNAdi%#Z30Q`g(S zeA|3ytBmaSJ(}naHw`6&bz?5s*^sxM;4cevvU7iEOT#DH{p(-2=54sg!K+Xf8T?$O$6poA@jJY&MZ)_b%`QFw|+Yin9kKbD9zLZ9U&c9-E$q(-$ICQnN zKj-#Hzf;2C%n%+gehE1q!0E=`g#`a4$%A*4UnV}*s0pgWR_)GCbaN<+Jl0Zf&torT-q{|ksZ)cC~%PmxvucTLLOkdAK}4dp6?*@l?2)`&t%%5C%BrM@H`TB{g~ zMGx-5FdE~s+|<+ZgZKE*8c%W;691&aj|0m(xZnSz!YK06XIxTy6WZxfNw`}yGTA0_ zB*vNEcLHvmGxdUv+}Q5FB~D}9^eo|S1xb6+!}mP$L5_$Hyl4YTjd$=UQLb1S_to&U z3~Y?t7|HcKf5eu$tS3BZOFbcPp! zR-Xc2<9x)=!N3E-<0)0%+~u@VNL1{oTHkb{drXhOQ_W^>v;w zW_f2UQp?J%wLGn^3NGWabkFaOmlVd0)bq5PuB|&huGHLd-VS)NHHMQ&*>N?NZNIqO ze1GMiwUB;gp1xKx<3;V+p`XiP80ekDK53omI<@yYvB}gc3-v}xPoI}$^SfdBEm(~PEkS@d4AV#R+jBLc3o|_?1My`-X%UuZDNPbplpo@h%Axnk= zDsV=JpX(?Sa4Cz06q=?RO~@KGV9x^;iBA;7b2TN`7aBG7m1O3+X<#LTQ9 zHVRS=wqupZRl_7m{ zd3Hmx=)zxWDKwiTSgg#QKWWt)1eJBKifB(RK&=*a?}xvaMM(Gi6wJ+e9dRaV-w%H` zF_cM|oq8zyQJ5V>EtE-@J?LO5gU@$}>n`XcYh9E}_ulZh0YwQhBm81@a)z5w%h$eL zWB78xh&}zSKtkX<%J6N`p~9ZjV>mTsb7?=h_lmP^Dm5i_<622^GWjKk)tskvwj^2n z;=XkX%(R&Fhq1jMu3@_01CEP ztN|W23_ctLRJD}QqJZ8dan%XOq=4Qg@zl?ydjfE8Xb;rrDO6!014cU#^DJA)DK_k- zPL`?0Hn;H$0GYliqunf;izkE+F*!^S2@~v=-kqi)e19KM_p&R-Sqp>*+=myuGoH_i z;QgxUEZPH1EEvLz36=yR1GJ+Xgs4V))X19`f^WfoimA&K_>1J6W>9IcrVyC24$hxD zU!q=Gzd??F_v*HT>KX0fUhX1oY%1b3R}9Uu zF~&_>th3`ZD{92@6oq@+P>HbQ~4WH6o#;6gos=*M>Ylm?~4cOwBMs9Pww z6)==nq`Y}p&{nD(2flj`YKWPqm9OQ1?cdy8RFJa2FIY1F3DaM=e4I}C;6b~A0l&FG zhJve2)eQO7a)&}$6Rl(@H2)@AF@ab;fh!I#DAoL@OL4d%CeiA2F}0)Ei2F#gVF8*u zmH->`z9;CfGG3)o1fM|NyJQ2rWko?kxg;g&{_wX>wfp;<9Gs5JlcK)&#=s4Jx~ZvG zia4oF6*ZYvJyov0lL7|&hrfI>j3d4}hX~7n1@mQteZ>H0Pm;wsX`kuXNm?cC>Iq`w zfzI}BT16|adwZ6V1hEPyk-9*9z&FH2(Vm8oQ^NZ!Dx!NX5uST9DwJat4~yYHqa7Mt zk?mcS(ZOaeGPjp3pJQvBrW3z(+fv79R#BU|)JO^oL`7w=nk?>v%((3qONJ)}-G|L< z7j18n&WW^r5P~(S&4#76nn%Sd+VVru@&?gTNU@HXoVn-c3R-Cen+X;1@f4+U>52wb z?0I0q zoi(vY=`=#4r_Eo?H~sdr7cO=R*4IIb>M4K!fV`N7L2`TdEK4t-fk zeYNPj$3iJ$(fM-HMQ$DJwKi6laGWv~LKcC`nrX|`+8!H@xaxfGC&1@5>-L=CcF&W! zfb2ynKj&Es|9eSfdriv}nV(k-1bI(a6uI+&TU-rTctf}O{`m`wwnxF@X+91?3q3vUX<m!zHeuJJwa#iN;u*ZV@!2^bY5If^F7_P&<_!KCPUcu@j{vT{d;s_4~EH>3_XW z31Au~7yanzYhl2kA^hFhXnrRU&O%F?P_S|op+o1(9yoEUERr@de=5AQjMM_8;`u6K zca^GCq_5n{Ch_rnhOKfYGx((RnP*IC2~E}h&|yLjrb>%0)8Z>?4*(=6S5gNW9W9s> zE$Z$G4!JZ77Sp47$0B}}%o8FDm)>P7jUE3`9wd%}8dT?+c`h1r+~$CHjzaS;u$VMw z8R9LRUh52}o&UCuJ--`1lREy6YROV$-6rFdI`a@3qokR)cm7?HC*`D}@LXSvOhuh&O!{O*h{E&Ry9EtZ7_3 z)=a`0;V)aFh`_DuYjb~b&Wtf#?zv2zirR8yCuKL?GG&p)wN&!gjdr<5c}ive3EtBE2#lvh~QxI_Gel&~%vZ zDPOIldFM^R`JlWuz`auSV^)IOF|Vp)w88nUuY|!z`{VG60g0}`L};XDal~@CP*6TB zT`@7*de0LN+ol=dI1tP*cl0nyQC=}N_kW0IqWN7;)Y|q%#H8E&6lnQ{O#4X0l=hvt5V1 zPp&KTn5^W?h%IfSrs^tLU>n|ov;0%&oRx9 zwGmb-PHm~p!0WTDJxc^vcSbloztuf+pISrMY-tdJ0Xti6)&|I`Cl!W=LZ*5M8}#+S z9x=gOXPJu14IfzSggbTpR>j}LS5-o~5W5K>Z4S=K3s&{lNF#hG?QBWx*a?IaR18B1 znyiWLBJ4eFI!PN>2htWrZA>QtuJ;vAc-fjpt;w+KnlS4!gHHUii~>o&X;3p)VBqBpIo+o|7sJk{H0lb(o<)>wt6Wji0T8p_I#e} zkp097!*iv_sEqpqS3^2VvIKjWSYA_BHa?>CQjyc_^!M_9wMiu$l7r7DGfHa|e%5^B ztm1WOK|D@Xo%w;qwsg*0vaEMPYFQu4CED+wd1@uH1@*I8jm(G`r(=-`miT?hZ=y=Ty{l$_`xOzKHa*-D>GNht9rz`8LpU?|HmtQt%C7H(zEffwVd?FJ*MxQ!$2 zCVrH^5#4YewEm<8dKr&*MCU4#Hm%AsIxdtYSVXV~7dS=8@9vEYSR z#eivG6vp~GIewHU;x|)bc?`$n)O~GOs%Ef`5gGfI^EhMNc2f3a8~T-tiuG&H`-?B; zT@2`J=6!dA-=nXeqZQ}b?|ppX!55J25agI#c^3dit<{Ly1R8@~b7p4&txD#{SSb(_ z%?J~?Uj^}<$A_zbzB@4I9cjr;Lf~_a6fw@$g z)I7ei7#h2&ZhKz3)rJUFi^&iVXRi`?MWnBs==5rNDfD+n6M|`T>HH(XnRKZbp`If< z=G+z6>cDbyd`BIQLHS#hF`5o$l{a^-7R9~76jJ$idgOrp0Nwny0iA=mjGf1}hsR!> z@oJHyu)>ATdd;3}(AjH>5TOM^lFBXxe1|nx$NP)}TYjrqPk4JQ4F9P6rO$cFU-8LD ztDOGJU=!wL=9qoS$U;&cf^*WURkGPk`Gl`u5bTGoYC0lJz+R4KD;T)aS=%aH9YyKQGNLdF5T&#QI)%m?7gN@ZJFNrUVjgE#|wb9X~`8@Ah?e0~IIgvhZ zSWDdKlye0`7@0d2a!Ryad3Vh>nk%%8NiupRw6@{X;Q_G44YEP;W1TAPd+tT)BF?ka z;2n2(PA2+S-vXU2aZ0lAW`@E+T~^X6Y%&F~IG1c0kcvA<1+)gkZ{))#eQI-%H}Ri#UqIjmizou=R`fN%p_nV*WeFQl*E z_SvM%I}(8^qv?9uB?^59l!C7^7}N%aJPGb#--3UV+^bc{my^EWrOc5I_CZV(m-)n) z3mOd0zchq;2X&ZTY}IWFvSx<(#u6&_wDx>C z#xhc78kF`ut_xw-X!0%dtiS5)>EK%(aJ2O^|6UQ6s;*4ZyD%QrF`V@Kw4%kxsx3y= z@WgGr*6N+AgRcb7v<{PPiG~87dcUGErZe?B zR8%*>gwHuOLElZ&E)!blkd3F5yLADiG<8T7s4kZok>rE57~fpf1cxJ0+NA#YT`5C* zT9$+wP&m6g8#(62d3DP&fho*dmoMy$Z9wFQ!u!`Os&@bV!$IFiW#guoPa>JO?%B}2 z{R7^^SB@YRts|ilkiNF#)W>esxrc8n8&+8zuh_xG)VPBWdIgKaqhrE^F%lS}W%!SML)Bz)`XW$*5Z zr_Cugccq?H;tfdmn_46d=ZULLS!3h%@!*#!`m&AhVARdFvx zyoUOE{?{PBnw71gB@;U{Co{;uasP412Yxx^Wd`ArDEx9t^y!TZ z&eNo9@su$J2VGY3f>DlgEEA^up5XQtiF@8X?F$FH~o+;iVE`%4^(1(th`?O@#s3>n0c5U=J>vn7*#RZJ8_Y8 zaX_9nWKng6d9Dlq>5PZ zOzS+{>ti(Qt5=!zjrU0m$SQwq$}{)@&AG1kDxN=dJ~JsED|K7riOjP4o~J6#f+bxp zoRF!jxivL?T3;DdHFcWbrh7&-KDK8xJ{}Pt*KfLun~x8)N`1Uswx#J-ms~!44rj-g zh^+|<7siy<+BtDtZ4(@?`$Sktb}!VtThfQ3h-#ZCC@9)7xir&w{?ZK~YqreZw-R(7 zTeI{330ePJ?&Yh0^o_Oj`O_1#F)uGKH_K$#shs`YGXHauQsFl_1%apxdxq(~2UV#n zF2B(sQtrrF+jW~y^VnqOS>D^5^TZ_CC1czKB+`Dd2)4XVeWB#@XKCW+=J|!v;Q9N2 z71k@!2s8+~@<*n3e7`kcE2mF;QF)?eS8qxQQL@sIMwDTvl*iA`$7HHP+~4`z>_5z} ze?|=Y>hh3IR)tayMdLbwzeiLHX(cWG$$Dww&7hbdo19ydQ z0rye@q2AAHj9r){>dU2#f+Qs@Nr?6dFNR~^fr|mwC3T7zx8(nX@kvEN;QX2CotC&t zlQ+1nw*RZ6Blhi_gHP``c2-X3yH#HYf8bRrz!68AfUd&V*_fJ6KMhl z;UHgsgv8D}j#NnUEc;^_G9Z=(Zp8D9}xNM7ArDOjazu5}0)+o21eReaYnqcyAw>c~m zPHGasuA96jH|nJzYM#la7kdPKbf*hNFXKC=CnYg}!Js^)3V*~>COIB^V3L=|x$UHh zs{IY}7F(`lBh`K>If!w*J31g4!CA{niH>!5>KQ#izb62bVOZ+v%_)5ZC5z`F!O()% zi4Q%zZ2)G%veZcce?-wQp}*Zrq~M94nwpyT^OvL~ZrtZ$XTa4xv@!OxD8t8NiQY;z z^MT7wY$mQiawAmlkl77~^!?%I=O$JZE6&6xyp;rC$}G2|4+77auEGs=tD0#6H9VW= z8k2JrkWeT(1^9`0z=@`Tg0!F5y&`Gc%~P`>#~`@@;uevTr=bQQYKzCjA{-&JYNzb5 z_yuoHgo_cED(sHo_~T48Wt|JTnBhmy51nYDN>Cnja?FHX5)+)`M}7$j39@(J=eJ5e zJXgbdtkmD$pShP!#(9rpBUmA{tSAS2S(?Gg2+hU}FgqOZsp&(!+QTo8~3zn8C5 zT>lvM|NY+oj$i-p3{r;wAI;>XApcR>b-ahGyn2#>7nYjBwWfdgvSt&5W(}9l40zO-*g#{s+W!?*#w= diff --git a/library/mysql.php b/library/mysql.php index e799656..d5c5a8d 100644 --- a/library/mysql.php +++ b/library/mysql.php @@ -29,6 +29,24 @@ class MySQL { } // Manifest + public function getTotalManifests() { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`'); + + $query->execute(); + + return $query->fetch()->total; + } + + public function getManifests() { + + $query = $this->_db->prepare('SELECT * FROM `manifest`'); + + $query->execute(); + + return $query->fetchAll(); + } + public function getManifest(int $crc32url) { $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1'); @@ -47,6 +65,15 @@ class MySQL { return $this->_db->lastInsertId(); } + public function deleteManifest(int $manifestId) { + + $query = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1'); + + $query->execute([$manifestId]); + + return $query->rowCount(); + } + // Host public function getAPIHosts(string $apiHostFields) { @@ -570,4 +597,28 @@ class MySQL { return $query->rowCount(); } + + public function getManifestCrawlQueue(int $limit, int $timeFrom) { + + $query = $this->_db->prepare('SELECT * FROM `manifest` + + WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> 0 + + ORDER BY RAND() + + LIMIT ' . (int) $limit); + + $query->execute([$timeFrom]); + + return $query->fetchAll(); + } + + public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) { + + $query = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1'); + + $query->execute([$timeUpdated, $httpCode, $manifestId]); + + return $query->rowCount(); + } }