YGGo/crontab/crawler.php

833 lines
27 KiB
PHP
Raw Normal View History

2023-04-01 19:29:39 +03:00
<?php
2023-04-02 00:27:33 +03:00
// Lock multi-thread execution
//
// A System V semaphore keyed by the script name guarantees that only one
// crawler instance runs at a time. The lock is requested in non-blocking mode,
// so a concurrent run exits immediately instead of queueing behind this one.
$semaphore = sem_get(crc32('crontab.crawler'), 1);

// sem_get() returns false on failure; passing that to sem_acquire() would raise
// a TypeError on PHP 8, so stop here with an explicit message instead
if (false === $semaphore) {

  echo 'Could not create the process lock.' . PHP_EOL;
  exit;
}

if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another thread.' . PHP_EOL;
  exit;
}
2023-04-01 19:29:39 +03:00
// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
2023-04-01 19:29:39 +03:00
require_once('../library/filter.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');
2023-04-23 04:31:32 +03:00
// Check disk quota
// Stop before doing any work when the free space on the root partition
// (in units of 1,000,000 bytes) is below the configured reserve
$diskFreeSpaceMb = disk_free_space('/') / 1000000;

if ($diskFreeSpaceMb < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {

  echo 'Disk quota reached.' . PHP_EOL;
  exit;
}
// Debug
// Start the execution timer; it is read again when the run statistics are reported
$timeStart = microtime(true);

// HTTP statistics accumulated from every Curl request made during this run
$httpRequestsTotal = $httpRequestsSizeTotal = $httpDownloadSizeTotal = $httpRequestsTimeTotal = 0;

// Queue items taken from the crawl queues
$hostPagesProcessed = $hostImagesProcessed = $manifestsProcessed = 0;

// Queue items successfully indexed
$hostPagesIndexed = $hostImagesIndexed = 0;

// New records registered while crawling
$manifestsAdded = $hostPagesAdded = $hostImagesAdded = $hostsAdded = 0;

// Queue items rejected (banned) during validation
$hostPagesBanned = $hostImagesBanned = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// All changes of this run are made inside a single transaction
$db->beginTransaction();
2023-05-04 06:45:04 +03:00
try {
2023-05-04 06:45:04 +03:00
// Process manifests crawl queue
//
// For every queued manifest: download it, validate its structure and its
// compatibility with the local configuration, then download the hosts list the
// manifest points to and register every unknown host together with its home page.
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

  $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Update manifest index anyway, with the current time and http code
  $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

  // Skip processing non 200 code
  if (200 != $curl->getCode()) {

    continue;
  }

  // Skip processing without returned data
  if (!$remoteManifest = $curl->getContent()) {

    continue;
  }

  // Skip processing on json encoding error
  if (!$remoteManifest = @json_decode($remoteManifest)) {

    continue;
  }

  // Skip processing on required fields missed
  // (an empty api->hosts value is rejected here, so no separate availability check is needed)
  if (empty($remoteManifest->status) ||
      empty($remoteManifest->result->config->crawlUrlRegexp) ||
      empty($remoteManifest->result->api->version) ||
      empty($remoteManifest->result->api->hosts)) {

    continue;
  }

  // Skip processing on API version not compatible
  if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

    continue;
  }

  // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
  if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

    continue;
  }

  // Skip processing on host link does not match condition
  // preg_match() returns 0 when the subject does not match and false only on
  // error, so both results have to be treated as "no match"
  if (!is_string($remoteManifest->result->api->hosts) ||
      !preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

    continue;
  }

  // Begin hosts collection
  $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Skip processing non 200 code
  if (200 != $curl->getCode()) {

    continue;
  }

  // Skip processing without returned data
  if (!$remoteManifestHosts = $curl->getContent()) {

    continue;
  }

  // Skip processing on json encoding error
  if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {

    continue;
  }

  // Skip processing on required fields missed
  if (empty($remoteManifestHosts->status) ||
      empty($remoteManifestHosts->result)) {

    continue;
  }

  // Begin hosts processing
  foreach ($remoteManifestHosts->result as $remoteManifestHost) {

    // Skip processing on required fields missed
    if (empty($remoteManifestHost->scheme) ||
        empty($remoteManifestHost->name)) {

      continue;
    }

    // The port is optional in the remote record
    $remoteManifestHostPort = !empty($remoteManifestHost->port) ? $remoteManifestHost->port : null;

    $hostURL = $remoteManifestHost->scheme . '://' .
               $remoteManifestHost->name .
               ($remoteManifestHostPort ? ':' . $remoteManifestHostPort : '');

    // Validate formatted link
    if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {

      // Host exists
      if ($host = $db->getHost(crc32($hostURL))) {

        $hostStatus        = $host->status;
        $hostPageLimit     = $host->crawlPageLimit;
        $hostImageLimit    = $host->crawlImageLimit;
        $hostId            = $host->hostId;
        $hostRobots        = $host->robots;
        $hostRobotsPostfix = $host->robotsPostfix;

      // Register new host
      } else {

        // Get robots.txt if exists
        $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Accept the response as robots.txt only when it contains a user-agent directive
        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
          $hostRobots = $curl->getContent();
        } else {
          $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
        }

        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

        $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
        $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
        $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

        // Register the host being iterated: scheme, name and port belong to the
        // current list item, not to the hosts list itself
        $hostId = $db->addHost($remoteManifestHost->scheme,
                               $remoteManifestHost->name,
                               $remoteManifestHostPort,
                               crc32($hostURL),
                               time(),
                               null,
                               $hostPageLimit,
                               $hostImageLimit,
                               (string) CRAWL_HOST_DEFAULT_META_ONLY,
                               (string) $hostStatus,
                               $hostRobots,
                               $hostRobotsPostfix);

        if ($hostId) {

          $hostsAdded++;

        } else {

          continue;
        }
      }

      // Init robots parser
      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

      // Save home page info
      // Until page API not implemented, save at least home page to have ability to crawl
      // @TODO
      if ($hostStatus && // host enabled
          $robots->uriAllowed('/') && // page allowed by robots.txt rules
          $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
          !$db->getHostPage($hostId, crc32('/'))) { // page not exists

        if ($db->addHostPage($hostId, crc32('/'), '/', time())) {

          $hostPagesAdded++;
        }
      }
    }
  }
}
// Process images crawl queue
//
// Each queued image is downloaded and validated (HTTP code, MIME type). Unless
// the host is in meta-only mode, the image body is stored as a base64 data URI
// in a new description record. Rejected images get their ban time updated.
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {

  // Build URL from the DB
  $queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : '') . $queueHostImage->uri;

  // Init image request
  $curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Update image index anyway, with the current time and http code
  $hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());

  // Skip image processing non 200 code
  if (200 != $curl->getCode()) {

    $db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());

    $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

    continue;
  }

  // Skip image processing on MIME type not provided
  if (!$hostImageContentType = $curl->getContentType()) {

    $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

    continue;
  }

  // Skip image processing on MIME type not allowed in settings
  $hostImageBanned = true;

  foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {

    if (false !== strpos($hostImageContentType, trim($mime))) {

      $hostImageBanned = false;
      break;
    }
  }

  if ($hostImageBanned) {

    $db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());

    $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

    continue;
  }

  // Convert remote image data to base64 string
  if (!$queueHostImage->crawlMetaOnly) {

    // Skip image processing without returned content
    if (!$hostImageContent = $curl->getContent()) {

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    // Take the extension from the URL path only, so a query string
    // (e.g. "/logo.png?v=2") does not leak into the data URI MIME type
    if (!$hostImageExtension = @pathinfo((string) parse_url($queueHostImageURL, PHP_URL_PATH), PATHINFO_EXTENSION)) {

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    if (!$hostImageBase64 = @base64_encode($hostImageContent)) {

      $hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());

      continue;
    }

    // The data URI MIME subtype is derived from the file extension ("svg" becomes "svg+xml")
    $hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;

    // Set host image description
    // On link collection we knew meta but data,
    // this step use latest description slice and insert the data received by curl request
    if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {

      $db->setHostImageDescription($queueHostImage->hostImageId,
                                   crc32($hostImageData),
                                   $lastHostImageDescription->alt,
                                   $lastHostImageDescription->title,
                                   $hostImageData,
                                   time());
    }
  }

  // Mark the image as indexed with its detected MIME type
  $hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
                                             Filter::mime($hostImageContentType),
                                             time());
}
// Process pages crawl queue
//
// Each queued page is downloaded and validated (HTTP code, MIME type, content,
// title, meta robots), then indexed. After that the page is scanned for a
// manifest link, images and hyperlinks; unknown hosts found on the way are
// registered together with their robots.txt rules.
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

  // Base URL (scheme://name[:port]) of the page host, reused to resolve relative links
  // and to detect links pointing to another host
  $queueHostBaseURL = $queueHostPage->scheme . '://' .
                      $queueHostPage->name .
                      ($queueHostPage->port ? ':' . $queueHostPage->port : '');

  // Build URL from the DB
  $queueHostPageURL = $queueHostBaseURL . $queueHostPage->uri;

  // Init page request
  $curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);

  // Update curl stats
  $httpRequestsTotal++;
  $httpRequestsSizeTotal += $curl->getSizeRequest();
  $httpDownloadSizeTotal += $curl->getSizeDownload();
  $httpRequestsTimeTotal += $curl->getTotalTime();

  // Update page index anyway, with the current time and http code
  $hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

  // Skip page processing non 200 code
  if (200 != $curl->getCode()) {

    $db->updateHostPageHttpCode($queueHostPage->hostPageId, $curl->getCode(), time());

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Skip page processing on MIME type not provided
  if (!$contentType = $curl->getContentType()) {

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Skip page processing on MIME type not allowed in settings
  $hostPageBanned = true;

  foreach ((array) explode(',', CRAWL_PAGE_MIME) as $mime) {

    if (false !== strpos($contentType, trim($mime))) {

      $hostPageBanned = false;
      break;
    }
  }

  if ($hostPageBanned) {

    $db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Skip page processing without returned data
  if (!$content = $curl->getContent()) {

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Grab page content
  $dom = new DOMDocument();

  @$dom->loadHTML($content);

  // Skip index page links without titles
  $title = @$dom->getElementsByTagName('title');

  if ($title->length == 0) {

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Get optional page meta data
  $metaDescription  = '';
  $metaKeywords     = '';
  $metaRobots       = '';
  $metaYggoManifest = '';

  foreach (@$dom->getElementsByTagName('meta') as $meta) {

    if (@$meta->getAttribute('name') == 'description') {
      $metaDescription = @$meta->getAttribute('content');
    }

    if (@$meta->getAttribute('name') == 'keywords') {
      $metaKeywords = @$meta->getAttribute('content');
    }

    if (@$meta->getAttribute('name') == 'robots') {
      $metaRobots = @$meta->getAttribute('content');
    }

    if (@$meta->getAttribute('name') == 'yggo:manifest') {
      $metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
    }
  }

  // Ban pages that declare meta robots:noindex
  if (false !== stripos($metaRobots, 'noindex')) {

    $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());

    continue;
  }

  // Skip page links following by robots:nofollow attribute detected
  // NOTE(review): this skips the whole page (it is neither indexed nor banned),
  // not only its links - confirm that this is the intended behaviour
  if (false !== stripos($metaRobots, 'nofollow')) {

    continue;
  }

  // Update queued page
  $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
                                           Filter::mime($contentType),
                                           time());

  // Add queued page description if not exists
  // The page body is stored only when the host is not in meta-only mode
  $db->setHostPageDescription($queueHostPage->hostPageId,
                              crc32($content),
                              Filter::pageTitle($title->item(0)->nodeValue),
                              Filter::pageDescription($metaDescription),
                              Filter::pageKeywords($metaKeywords),
                              $queueHostPage->crawlMetaOnly ? null : $content,
                              time());

  // Update manifest registry
  if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {

    $metaYggoManifestCRC32 = crc32($metaYggoManifest);

    if (!$db->getManifest($metaYggoManifestCRC32)) {

      $db->addManifest($metaYggoManifestCRC32,
                       $metaYggoManifest,
                       (string) CRAWL_MANIFEST_DEFAULT_STATUS,
                       time());

      $manifestsAdded++;
    }
  }

  // Collect page images
  if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

    foreach (@$dom->getElementsByTagName('img') as $img) {

      // Skip images without src attribute
      if (!$imageSrc = @$img->getAttribute('src')) {

        continue;
      }

      // Skip images without alt attribute
      if (!$imageAlt = @$img->getAttribute('alt')) {

        continue;
      }

      if (!$imageTitle = @$img->getAttribute('title')) {
        $imageTitle = null;
      }

      // Add domain to the relative src links
      if (!parse_url($imageSrc, PHP_URL_HOST)) {

        $imageSrc = $queueHostBaseURL .
                    '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
      }

      // Validate formatted src link
      if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

        // Parse formatted src link
        $hostImageURL = Parser::hostURL($imageSrc);
        $hostImageURI = Parser::uri($imageSrc);

        // Host exists
        if ($host = $db->getHost(crc32($hostImageURL->string))) {

          $hostStatus        = $host->status;
          $hostPageLimit     = $host->crawlPageLimit;
          $hostImageLimit    = $host->crawlImageLimit;
          $hostId            = $host->hostId;
          $hostRobots        = $host->robots;
          $hostRobotsPostfix = $host->robotsPostfix;

        // Register new host
        } else {

          // Get robots.txt if exists
          $curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

          // Update curl stats
          $httpRequestsTotal++;
          $httpRequestsSizeTotal += $curl->getSizeRequest();
          $httpDownloadSizeTotal += $curl->getSizeDownload();
          $httpRequestsTimeTotal += $curl->getTotalTime();

          // Accept the response as robots.txt only when it contains a user-agent directive
          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
          } else {
            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
          }

          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

          $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
          $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
          $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

          // The host hash must be built from the image host URL, the same value
          // the getHost() lookup above uses
          $hostId = $db->addHost($hostImageURL->scheme,
                                 $hostImageURL->name,
                                 $hostImageURL->port,
                                 crc32($hostImageURL->string),
                                 time(),
                                 null,
                                 $hostPageLimit,
                                 $hostImageLimit,
                                 (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                 (string) $hostStatus,
                                 $hostRobots,
                                 $hostRobotsPostfix);

          if ($hostId) {

            $hostsAdded++;

          } else {

            continue;
          }
        }

        // Init robots parser
        $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

        // Save new image info
        $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));

        if (!$hostImageId) { // image not exists

          // Without an image id there is nothing to describe or relate, so skip
          // images that may not be registered
          if (!$hostStatus || // host disabled
              !$robots->uriAllowed($hostImageURI->string) || // src disallowed by robots.txt rules
              $hostImageLimit <= $db->getTotalHostImages($hostId)) { // images quantity reached host limit

            continue;
          }

          // Add host image
          if (!$hostImageId = $db->addHostImage($hostId,
                                                crc32($hostImageURI->string),
                                                $hostImageURI->string,
                                                time())) {

            continue;
          }

          $hostImagesAdded++;
        }

        // Add/update host image description
        $db->setHostImageDescription($hostImageId,
                                     null, // no data, download it in the crawler queue
                                     Filter::imageAlt($imageAlt),
                                     Filter::imageTitle($imageTitle),
                                     null,
                                     time());

        // Relate host image with host page was found
        $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);

        // Increase image rank when link does not match the current host
        if ($hostImageURL->scheme . '://' .
            $hostImageURL->name .
           ($hostImageURL->port ? ':' . $hostImageURL->port : '')
            !=
            $queueHostBaseURL) {

          $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
        }
      }
    }
  }

  // Collect internal links from page content
  foreach (@$dom->getElementsByTagName('a') as $a) {

    // Skip links without required attribute
    if (!$href = @$a->getAttribute('href')) {

      continue;
    }

    // Skip anchor links
    if (false !== strpos($href, '#')) {

      continue;
    }

    // Skip javascript links
    if (false !== strpos($href, 'javascript:')) {

      continue;
    }

    // Skip mailto links
    if (false !== strpos($href, 'mailto:')) {

      continue;
    }

    // Skip x-raw-image links
    if (false !== strpos($href, 'x-raw-image:')) {

      continue;
    }

    // @TODO skip other apps

    // Add absolute URL prefixes to the relative links found
    if (!parse_url($href, PHP_URL_HOST)) {

      $href = $queueHostBaseURL .
              '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
    }

    // Validate formatted link
    if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {

      // Parse formatted link
      $hostURL     = Parser::hostURL($href);
      $hostPageURI = Parser::uri($href);

      // Host exists
      if ($host = $db->getHost(crc32($hostURL->string))) {

        $hostStatus        = $host->status;
        $hostPageLimit     = $host->crawlPageLimit;
        $hostImageLimit    = $host->crawlImageLimit;
        $hostId            = $host->hostId;
        $hostRobots        = $host->robots;
        $hostRobotsPostfix = $host->robotsPostfix;

      // Register new host
      } else {

        // Get robots.txt if exists
        $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

        // Update curl stats
        $httpRequestsTotal++;
        $httpRequestsSizeTotal += $curl->getSizeRequest();
        $httpDownloadSizeTotal += $curl->getSizeDownload();
        $httpRequestsTimeTotal += $curl->getTotalTime();

        // Accept the response as robots.txt only when it contains a user-agent directive
        if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
          $hostRobots = $curl->getContent();
        } else {
          $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
        }

        $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

        $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
        $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
        $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

        $hostId = $db->addHost($hostURL->scheme,
                               $hostURL->name,
                               $hostURL->port,
                               crc32($hostURL->string),
                               time(),
                               null,
                               $hostPageLimit,
                               $hostImageLimit,
                               (string) CRAWL_HOST_DEFAULT_META_ONLY,
                               (string) $hostStatus,
                               $hostRobots,
                               $hostRobotsPostfix);

        if ($hostId) {

          $hostsAdded++;

        } else {

          continue;
        }
      }

      // Init robots parser
      $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

      // Save page info
      if ($hostStatus && // host enabled
          $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
          $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
          !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists

        if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {

          $hostPagesAdded++;
        }
      }

      // Increase page rank when link does not match the current host
      if ($hostURL->scheme . '://' .
          $hostURL->name .
         ($hostURL->port ? ':' . $hostURL->port : '')
          !=
          $queueHostBaseURL) {

        $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
      }
    }
  }
}
2023-04-25 18:19:22 +03:00
// All crawl queues were processed without an exception: commit the transaction
// opened by beginTransaction() before the try block
$db->commit();
// An exception was thrown while processing the queues: dump it for debugging
// and roll the transaction back. Only Exception subclasses are caught here;
// the script then continues with the statistics report below.
} catch(Exception $e) {
var_dump($e);
$db->rollBack();
}
// Debug
// Finalize the timers: total wall-clock time of the run and the accumulated
// HTTP request time scaled down by 1,000,000
$executionTimeTotal = microtime(true) - $timeStart;

$httpRequestsTimeTotal /= 1000000;

// Store the run statistics in the crawler log when logging is enabled
if (CRAWL_LOG_ENABLED) {

  $db->addCrawlerLog(
    time(),
    $hostsAdded,
    $hostPagesProcessed,
    $hostPagesIndexed,
    $hostPagesAdded,
    $hostPagesBanned,
    $hostImagesIndexed,
    $hostImagesProcessed,
    $hostImagesAdded,
    $hostImagesBanned,
    $manifestsProcessed,
    $manifestsAdded,
    $httpRequestsTotal,
    $httpRequestsSizeTotal,
    $httpDownloadSizeTotal,
    $httpRequestsTimeTotal,
    $executionTimeTotal
  );
}

// Debug output
// Print one "label: value" line per statistic, in a fixed order
$debugReport = [
  'Hosts added'              => $hostsAdded,
  'Pages processed'          => $hostPagesProcessed,
  'Pages indexed'            => $hostPagesIndexed,
  'Pages added'              => $hostPagesAdded,
  'Pages banned'             => $hostPagesBanned,
  'Images processed'         => $hostImagesProcessed,
  'Images indexed'           => $hostImagesIndexed,
  'Images added'             => $hostImagesAdded,
  'Images banned'            => $hostImagesBanned,
  'Manifests processed'      => $manifestsProcessed,
  'Manifests added'          => $manifestsAdded,
  'HTTP Requests total'      => $httpRequestsTotal,
  'HTTP Requests total size' => $httpRequestsSizeTotal,
  'HTTP Download total size' => $httpDownloadSizeTotal,
  'HTTP Requests total time' => $httpRequestsTimeTotal,
  'Total time'               => $executionTimeTotal,
];

foreach ($debugReport as $debugLabel => $debugValue) {

  echo $debugLabel . ': ' . $debugValue . PHP_EOL;
}

// The report ends with an empty line
echo PHP_EOL;