Browse Source

add distributed hosts crawling using yggo nodes manifest

main
ghost 2 years ago
parent
commit
5999fb3a73
  1. 2
      README.md
  2. 44
      config/app.php.txt
  3. 93
      crontab/cleaner.php
  4. 498
      crontab/crawler.php
  5. BIN
      database/yggo.mwb
  6. 51
      library/mysql.php

2
README.md

@ -150,7 +150,7 @@ GET m=SphinxQL
* [ ] Implement database auto backup on crawl process completing * [ ] Implement database auto backup on crawl process completing
* [x] Add transactions to prevent data loss on DB crashes * [x] Add transactions to prevent data loss on DB crashes
* [x] JSON API * [x] JSON API
* [ ] Distributed index data sharing between the nodes through service API * [x] Distributed index data sharing between the nodes through service API
* [x] An idea to make unique gravatars for sites without favicons, because simpler to ident, comparing to ipv6 * [x] An idea to make unique gravatars for sites without favicons, because simpler to ident, comparing to ipv6
* [ ] An idea to make some visitors counters, like in good old times? * [ ] An idea to make some visitors counters, like in good old times?

44
config/app.php.txt

@ -121,8 +121,10 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
* Usually up to 20 pages per minute, * Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests * to prevent websites overload by sending GET crawling requests
* *
* Set 0 to disable
*
*/ */
define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_LIMIT', 20);
/* /*
* Images (URI) processing limit in the crawler.php queue * Images (URI) processing limit in the crawler.php queue
@ -133,8 +135,27 @@ define('CRAWL_PAGE_LIMIT', 10);
* Usually up to 20 pages per minute, * Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests * to prevent websites overload by sending GET crawling requests
* *
* Set 0 to disable
*
*/
define('CRAWL_IMAGE_LIMIT', 10);
/*
* Manifest (URI) processing limit in the crawler.php queue
*
* Used to collect distributed data index
* that match CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
*
* This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
*/ */
define('CRAWL_IMAGE_LIMIT', 20); define('CRAWL_MANIFEST_LIMIT', 10);
/* /*
* Renew page index by timing offset provided * Renew page index by timing offset provided
@ -162,6 +183,19 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
*/ */
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12); define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
/*
* Renew manifests index by timing offset provided
*
* This option works with CRAWL_MANIFEST_LIMIT step queue
*
* Note that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
* must be large enough to crawl all manifests collected in the DB index,
*
* or the crawler can get stuck in the queue
*
*/
define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
/* /*
* Only URL addresses match this rule will be auto-crawled * Only URL addresses match this rule will be auto-crawled
* *
@ -242,6 +276,12 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
*/ */
define('CRAWL_MANIFEST', true); define('CRAWL_MANIFEST', true);
/*
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.4);
/* /*
* Set default auto-crawl status for new manifest added * Set default auto-crawl status for new manifest added
* *

93
crontab/cleaner.php

@ -22,29 +22,31 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
$timeStart = microtime(true); $timeStart = microtime(true);
$hostsTotal = $db->getTotalHosts(); $hostsTotal = $db->getTotalHosts();
$manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostsPagesDeleted = 0; $hostsPagesDeleted = 0;
$hostsImagesDeleted = 0; $hostsImagesDeleted = 0;
$manifestsDeleted = 0;
// Get host queue // Begin update
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { $db->beginTransaction();
// Parse host info try {
$hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
// Get robots.txt if exists // Get cleaner queue
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT); foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { // Parse host info
$hostRobots = $curl->getContent(); $hostURL = $host->scheme . '://' . $host->name . ($host->port ? ':' . $host->port : false);
} else {
$hostRobots = null;
}
// Begin update // Get robots.txt if exists
$db->beginTransaction(); $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
try { if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
$hostRobots = $curl->getContent();
} else {
$hostRobots = null;
}
// Update host data // Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
@ -118,15 +120,66 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
// Delete host image // Delete host image
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId); $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
} }
}
// Clean up deprecated manifests
foreach ($db->getManifests() as $manifest) {
$delete = false;
$curl = new Curl($manifest->url);
// Skip processing non 200 code
if (200 != $curl->getCode()) {
continue; // Wait for reconnect
}
$db->commit(); // Skip processing without returned data
if (!$remoteManifest = $curl->getContent()) {
} catch(Exception $e){ $delete = true;
}
var_dump($e); // Skip processing on json encoding error
if (!$remoteManifest = @json_decode($remoteManifest)) {
$db->rollBack(); $delete = true;
}
// Decide whether the stored manifest is still acceptable.
//
// The checks are chained with elseif on purpose: the nested fields are
// compared only after the first branch has confirmed they exist, so a
// missing / non-object $remoteManifest is never dereferenced directly
// (empty() tolerates absent properties, plain property reads do not).
if (empty($remoteManifest->status) ||
    empty($remoteManifest->result->config->crawlUrlRegexp) ||
    empty($remoteManifest->result->api->version)) {

    // Required fields missed
    $delete = true;

} elseif ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

    // API version not compatible
    $delete = true;

} elseif ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

    // crawlUrlRegexp does not match the local CRAWL_URL_REGEXP condition
    $delete = true;
}

// Remove the manifest record and count the affected rows
if ($delete) {
    $manifestsDeleted += $db->deleteManifest($manifest->manifestId);
}
} }
$db->commit();
} catch(Exception $e){
var_dump($e);
$db->rollBack();
} }
// Debug // Debug
@ -134,4 +187,6 @@ echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL; echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL; echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL; echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;

498
crontab/crawler.php

@ -29,8 +29,10 @@ $timeStart = microtime(true);
$hostPagesProcessed = 0; $hostPagesProcessed = 0;
$hostImagesProcessed = 0; $hostImagesProcessed = 0;
$manifestsProcessed = 0;
$hostPagesIndexed = 0; $hostPagesIndexed = 0;
$hostImagesIndexed = 0; $hostImagesIndexed = 0;
$manifestsIndexed = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostImagesAdded = 0; $hostImagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
@ -38,175 +40,350 @@ $hostsAdded = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process images crawl queue $db->beginTransaction();
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB try {
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT); // Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
// Update image index anyway, with the current time and http code $curl = new Curl($queueManifest->url);
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip next image processing non 200 code // Update manifest index anyway, with the current time and http code
if (200 != $curl->getCode()) { $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
continue; // Skip processing non 200 code
} if (200 != $curl->getCode()) {
// Save image content on data settings enabled continue;
if (!CRAWL_HOST_DEFAULT_META_ONLY) { }
// Skip next image processing images without returned data // Skip processing without returned data
if (!$content = $curl->getContent()) { if (!$remoteManifest = $curl->getContent()) {
continue; continue;
} }
// Convert remote image data to base64 string to prevent direct URL call // Skip processing on json encoding error
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { if (!$remoteManifest = @json_decode($remoteManifest)) {
continue; continue;
} }
if (!$hostImageBase64 = @base64_encode($curl->getContent())) { // Skip processing on required fields missed
if (empty($remoteManifest->status) ||
empty($remoteManifest->result->config->crawlUrlRegexp) ||
empty($remoteManifest->result->api->version) ||
empty($remoteManifest->result->api->hosts)) {
continue; continue;
} }
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); // Skip processing on API version not compatible
} if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
}
// Process pages crawl queue continue;
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { }
// Build URL from the DB // Skip processing on host API not available
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; if (!$remoteManifest->result->api->hosts) {
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT); continue;
}
// Update page index anyway, with the current time and http code // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
// Skip next page processing non 200 code continue;
if (200 != $curl->getCode()) { }
continue; // Skip processing on host link does not match condition
} if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
// Skip next page processing pages without returned data continue;
if (!$content = $curl->getContent()) { }
continue; // Begin hosts collection
} $curl = new Curl($remoteManifest->result->api->hosts);
// Grab page content // Skip processing non 200 code
$dom = new DomDocument(); if (200 != $curl->getCode()) {
@$dom->loadHTML($content); continue;
}
// Skip index page links without titles // Skip processing without returned data
$title = @$dom->getElementsByTagName('title'); if (!$remoteManifestHosts = $curl->getContent()) {
if ($title->length == 0) { continue;
continue; }
}
// Get optional page meta data // Skip processing on json encoding error
$metaDescription = ''; if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
$metaKeywords = '';
$metaRobots = ''; continue;
$metaYggoManifest = ''; }
foreach (@$dom->getElementsByTagName('meta') as $meta) { // Skip processing on required fields missed
if (empty($remoteManifestHosts->status) ||
empty($remoteManifestHosts->result)) {
if (@$meta->getAttribute('name') == 'description') { continue;
$metaDescription = @$meta->getAttribute('content');
} }
if (@$meta->getAttribute('name') == 'keywords') { // Begin hosts processing
$metaKeywords = @$meta->getAttribute('content'); foreach ($remoteManifestHosts->result as $remoteManifestHost) {
// Skip remote host records that miss the fields required to build a URL
if (empty($remoteManifestHost->scheme) ||
    empty($remoteManifestHost->name)) {
    continue;
}

// Port is optional in the remote record: null when absent or empty
$remoteManifestHostPort = !empty($remoteManifestHost->port) ? $remoteManifestHost->port : null;

$hostURL = $remoteManifestHost->scheme . '://' .
           $remoteManifestHost->name .
           ($remoteManifestHostPort ? ':' . $remoteManifestHostPort : '');

// Validate formatted link
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

    // Host exists: reuse the settings stored in the DB
    if ($host = $db->getHost(crc32($hostURL))) {

        $hostStatus        = $host->status;
        $hostPageLimit     = $host->crawlPageLimit;
        $hostImageLimit    = $host->crawlImageLimit;
        $hostId            = $host->hostId;
        $hostRobots        = $host->robots;
        $hostRobotsPostfix = $host->robotsPostfix;

    // Register new host
    } else {

        // Get robots.txt if exists
        $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
        } else {
            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
        }

        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

        $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
        $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
        $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

        // Use the fields of the current list item ($remoteManifestHost);
        // $remoteManifestHosts->result is the whole list and has no
        // scheme / name / port properties of its own.
        $hostId = $db->addHost($remoteManifestHost->scheme,
                               $remoteManifestHost->name,
                               $remoteManifestHostPort,
                               crc32($hostURL),
                               time(),
                               null,
                               $hostPageLimit,
                               $hostImageLimit,
                               (string) CRAWL_HOST_DEFAULT_META_ONLY,
                               (string) $hostStatus,
                               $hostRobots,
                               $hostRobotsPostfix);

        if ($hostId) {
            $hostsAdded++;
        } else {
            continue;
        }
    }

    // Init robots parser
    $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

    // Save home page info
    // Until page API not implemented, save at least home page to have ability to crawl
    // @TODO
    if ($hostStatus &&                                      // host enabled
        $robots->uriAllowed('/') &&                         // page allowed by robots.txt rules
        $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
        !$db->getHostPage($hostId, crc32('/'))) {           // page not exists

        if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
            $hostPagesAdded++;
        }
    }
}
} }
}
// Process images crawl queue
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
// Build URL from the DB
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
if (@$meta->getAttribute('name') == 'robots') { // Update image index anyway, with the current time and http code
$metaRobots = @$meta->getAttribute('content'); $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
// Skip next image processing non 200 code
if (200 != $curl->getCode()) {
continue;
} }
if (@$meta->getAttribute('name') == 'yggo:manifest') { // Save image content on data settings enabled
$metaYggoManifest = Filter::url(@$meta->getAttribute('content')); if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip images that returned no data
if (!$content = $curl->getContent()) {
    continue;
}

// The extension found in the image URL becomes the subtype of the data URI;
// skip images whose URL has no extension
if (!$hostImageType = pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
    continue;
}

// Convert remote image data to base64 string to prevent direct URL call.
// $content is known to be non-empty here, so the encoded value is non-empty too.
$hostImageBase64 = base64_encode($content);

// Store the data URI for the image taken from the queue
// ($queueHostImage is the loop variable; $hostImage is not defined in this loop)
$hostImagesIndexed += $db->updateHostImageData($queueHostImage->hostImageId,
                                               'data:image/' . $hostImageType . ';base64,' . $hostImageBase64,
                                               time());
} }
} }
// Update queued page data // Process pages crawl queue
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), // Build URL from the DB
Filter::pageKeywords($metaKeywords), $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
// Update manifest registry // Update page index anyway, with the current time and http code
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) { $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());
$metaYggoManifestCRC32 = crc32($metaYggoManifest); // Skip next page processing non 200 code
if (200 != $curl->getCode()) {
if (!$db->getManifest($metaYggoManifestCRC32)) { continue;
$db->addManifest($metaYggoManifestCRC32,
$metaYggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
} }
}
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Skip next page processing pages without returned data
if (false !== stripos($metaRobots, 'noindex')) { if (!$content = $curl->getContent()) {
continue; continue;
} }
// Skip page links following by robots:nofollow attribute detected // Grab page content
if (false !== stripos($metaRobots, 'nofollow')) { $dom = new DomDocument();
continue; @$dom->loadHTML($content);
}
// Skip index page links without titles
$title = @$dom->getElementsByTagName('title');
// Collect page images if ($title->length == 0) {
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { continue;
}
foreach (@$dom->getElementsByTagName('img') as $img) { // Get optional page meta data
$metaDescription = '';
$metaKeywords = '';
$metaRobots = '';
$metaYggoManifest = '';
// Skip images without src attribute foreach (@$dom->getElementsByTagName('meta') as $meta) {
if (!$imageSrc = @$img->getAttribute('src')) {
continue; if (@$meta->getAttribute('name') == 'description') {
$metaDescription = @$meta->getAttribute('content');
} }
// Skip images without alt attribute if (@$meta->getAttribute('name') == 'keywords') {
if (!$imageAlt = @$img->getAttribute('alt')) { $metaKeywords = @$meta->getAttribute('content');
}
continue; if (@$meta->getAttribute('name') == 'robots') {
$metaRobots = @$meta->getAttribute('content');
} }
if (!$imageTitle = @$img->getAttribute('title')) { if (@$meta->getAttribute('name') == 'yggo:manifest') {
$imageTitle = null; $metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
} }
}
// Add domain to the relative src links // Update queued page data
if (!parse_url($imageSrc, PHP_URL_HOST)) { $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
$imageSrc = $queueHostPage->scheme . '://' . // Update manifest registry
$queueHostPage->name . if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.'); $metaYggoManifestCRC32 = crc32($metaYggoManifest);
if (!$db->getManifest($metaYggoManifestCRC32)) {
$db->addManifest($metaYggoManifestCRC32,
$metaYggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
} }
}
// Validate formatted src link // Append page with meta robots:noindex value to the robotsPostfix disallow list
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) { if (false !== stripos($metaRobots, 'noindex')) {
$db->beginTransaction(); continue;
}
// Skip page links following by robots:nofollow attribute detected
if (false !== stripos($metaRobots, 'nofollow')) {
continue;
}
// Collect page images
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
foreach (@$dom->getElementsByTagName('img') as $img) {
// Skip images without src attribute
if (!$imageSrc = @$img->getAttribute('src')) {
continue;
}
try { // Skip images without alt attribute
if (!$imageAlt = @$img->getAttribute('alt')) {
continue;
}
if (!$imageTitle = @$img->getAttribute('title')) {
$imageTitle = null;
}
// Add domain to the relative src links
if (!parse_url($imageSrc, PHP_URL_HOST)) {
$imageSrc = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
}
// Validate formatted src link
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {
// Parse formatted src link // Parse formatted src link
$hostImageURL = Parser::hostURL($imageSrc); $hostImageURL = Parser::hostURL($imageSrc);
@ -269,9 +446,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string)); $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
if (!$hostImageId && // image not exists if (!$hostImageId && // image not exists
$hostStatus && // host enabled $hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
// Add host image // Add host image
if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) { if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) {
@ -289,11 +466,11 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Add/update host image description // Add/update host image description
$db->setHostImageDescription($hostImageId, $db->setHostImageDescription($hostImageId,
crc32(md5((string) $imageAlt . (string) $imageTitle)), crc32(md5((string) $imageAlt . (string) $imageTitle)),
Filter::imageAlt($imageAlt), Filter::imageAlt($imageAlt),
Filter::imageTitle($imageTitle), Filter::imageTitle($imageTitle),
time(), time(),
time()); time());
// Relate host image with host page was found // Relate host image with host page was found
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1); $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1);
@ -302,77 +479,64 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
// Increase image rank when link does not match the current host // Increase image rank when link does not match the current host
if ($hostImageURL->scheme . '://' . if ($hostImageURL->scheme . '://' .
$hostImageURL->name . $hostImageURL->name .
($hostImageURL->port ? ':' . $hostImageURL->port : '') ($hostImageURL->port ? ':' . $hostImageURL->port : '')
!= !=
$queueHostPage->scheme . '://' . $queueHostPage->scheme . '://' .
$queueHostPage->name . $queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '')) { ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1); $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
} }
$db->commit();
} catch(Exception $e) {
var_dump($e);
$db->rollBack();
} }
} }
} }
}
// Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) {
// Skip links without required attribute // Collect internal links from page content
if (!$href = @$a->getAttribute('href')) { foreach(@$dom->getElementsByTagName('a') as $a) {
continue;
}
// Skip anchor links // Skip links without required attribute
if (false !== strpos($href, '#')) { if (!$href = @$a->getAttribute('href')) {
continue; continue;
} }
// Skip javascript links // Skip anchor links
if (false !== strpos($href, 'javascript:')) { if (false !== strpos($href, '#')) {
continue; continue;
} }
// Skip mailto links // Skip javascript links
if (false !== strpos($href, 'mailto:')) { if (false !== strpos($href, 'javascript:')) {
continue; continue;
} }
// Skip x-raw-image links // Skip mailto links
if (false !== strpos($href, 'x-raw-image:')) { if (false !== strpos($href, 'mailto:')) {
continue; continue;
} }
// @TODO skip other apps // Skip x-raw-image links
if (false !== strpos($href, 'x-raw-image:')) {
// Add absolute URL prefixes to the relative links found continue;
if (!parse_url($href, PHP_URL_HOST)) { }
$href = $queueHostPage->scheme . '://' . // @TODO skip other apps
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
}
// Validate formatted link // Add absolute URL prefixes to the relative links found
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { if (!parse_url($href, PHP_URL_HOST)) {
$db->beginTransaction(); $href = $queueHostPage->scheme . '://' .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
}
try { // Validate formatted link
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
// Parse formatted link // Parse formatted link
$hostURL = Parser::hostURL($href); $hostURL = Parser::hostURL($href);
@ -435,7 +599,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if ($hostStatus && // host enabled if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
@ -454,17 +618,17 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1); $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
} }
}
}
}
$db->commit(); $db->commit();
} catch(Exception $e){ } catch(Exception $e) {
var_dump($e); var_dump($e);
$db->rollBack(); $db->rollBack();
}
}
}
} }
// Debug // Debug
@ -474,5 +638,7 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL; echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL; echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL; echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

51
library/mysql.php

@ -29,6 +29,24 @@ class MySQL {
} }
// Manifest // Manifest
/**
 * Count the rows stored in the `manifest` table.
 *
 * @return mixed value of the `total` column of the fetched row
 */
public function getTotalManifests() {

    $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');
    $statement->execute();

    // NOTE(review): property access assumes the connection fetches rows as objects - confirm
    $row = $statement->fetch();

    return $row->total;
}
/**
 * Fetch every row of the `manifest` table.
 *
 * @return mixed result of PDOStatement::fetchAll() for the whole table
 */
public function getManifests() {

    $statement = $this->_db->prepare('SELECT * FROM `manifest`');
    $statement->execute();

    $manifests = $statement->fetchAll();

    return $manifests;
}
public function getManifest(int $crc32url) { public function getManifest(int $crc32url) {
$query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');
@ -47,6 +65,15 @@ class MySQL {
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
/**
 * Delete a single manifest record by its primary identifier.
 *
 * @param int $manifestId value matched against the `manifestId` column
 *
 * @return mixed number of rows reported as affected by the statement
 */
public function deleteManifest(int $manifestId) {

    $statement  = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');
    $parameters = [$manifestId];

    $statement->execute($parameters);

    return $statement->rowCount();
}
// Host // Host
public function getAPIHosts(string $apiHostFields) { public function getAPIHosts(string $apiHostFields) {
@ -570,4 +597,28 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
/**
 * Pick manifests that are due for crawling.
 *
 * Selects, in random order, up to $limit rows whose `timeUpdated` is NULL
 * or lower than $timeFrom and whose `status` is not 0.
 *
 * @param int $limit    maximum number of rows to return (cast to int and appended to the SQL)
 * @param int $timeFrom upper bound for `timeUpdated`, bound as a statement parameter
 *
 * @return mixed result of PDOStatement::fetchAll() for the selection
 */
public function getManifestCrawlQueue(int $limit, int $timeFrom) {

    // The literal is kept exactly as written; only the LIMIT value is concatenated
    $sql = 'SELECT * FROM `manifest`
WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> 0
ORDER BY RAND()
LIMIT ' . (int) $limit;

    $statement = $this->_db->prepare($sql);
    $statement->execute([$timeFrom]);

    return $statement->fetchAll();
}
/**
 * Record the outcome of a manifest crawl attempt.
 *
 * @param int $manifestId  value matched against the `manifestId` column
 * @param int $timeUpdated value written to `timeUpdated`
 * @param int $httpCode    value written to `httpCode`
 *
 * @return mixed number of rows reported as affected by the statement
 */
public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {

    $statement = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');

    // Placeholder order: timeUpdated, httpCode, manifestId
    $parameters = [
        $timeUpdated,
        $httpCode,
        $manifestId,
    ];

    $statement->execute($parameters);

    return $statement->rowCount();
}
} }

Loading…
Cancel
Save