YGGo/crontab/crawler.php

275 lines
8.0 KiB
PHP
Raw Normal View History

2023-04-01 19:29:39 +03:00
<?php
// Lock multi-thread execution:
// take a single-slot semaphore keyed by the job name without waiting,
// and quit right away when the slot is already held
$semaphore = sem_get(crc32('crontab.crawler'), 1);

if (!sem_acquire($semaphore, true)) {
    exit('Process locked by another thread.' . PHP_EOL);
}
// Load system dependencies
// Paths are anchored to this script's own directory, so the includes resolve
// regardless of the working directory the job is launched from
require_once(__DIR__ . '/../config/app.php');
require_once(__DIR__ . '/../library/curl.php');
require_once(__DIR__ . '/../library/robots.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/parser.php');
require_once(__DIR__ . '/../library/mysql.php');
// Check disk quota:
// free space on "/" is converted from bytes to MB (10^6 bytes) and compared
// with the configured reserve; the job stops when less than that is left
$diskFreeMb = disk_free_space('/') / 1000000;

if ($diskFreeMb < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {
    exit('Disk quota reached.' . PHP_EOL);
}

// Debug: start time and counters reported at the end of the run
$timeStart = microtime(true);

$hostPagesProcessed = $hostPagesIndexed = $hostPagesAdded = $hostsAdded = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Process crawl queue
//
// For every queued page: download it, store the response code and page meta
// data, honour the robots meta tag, then walk the page links and register
// new hosts / pages found there.
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

    // Host part of the queued page (scheme://name[:port]),
    // reused below for relative links and for the external link check
    $queueHostURL = $queueHostPage->scheme . '://' .
                    $queueHostPage->name .
                    ($queueHostPage->port ? ':' . $queueHostPage->port : '');

    // Build URL from the DB
    $queueHostPageURL = $queueHostURL . $queueHostPage->uri;

    $curl = new Curl($queueHostPageURL);

    // Update page index anyway, with the current time and http code
    $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

    // Skip next page processing non 200 code
    if (200 != $curl->getCode()) {
        continue;
    }

    // Skip next page processing pages without returned data
    if (!$content = $curl->getContent()) {
        continue;
    }

    // Grab page content
    // Warnings are suppressed here only: loadHTML reports every markup error of the page
    $dom = new DomDocument();
    @$dom->loadHTML($content);

    // Skip index page links without titles
    $title = $dom->getElementsByTagName('title');

    if ($title->length == 0) {
        continue;
    }

    // Get optional page meta data
    $metaDescription = '';
    $metaKeywords    = '';
    $metaRobots      = '';
    $metaYggo        = '';

    foreach ($dom->getElementsByTagName('meta') as $meta) {

        // When a name is repeated on the page, the last occurrence wins
        switch ($meta->getAttribute('name')) {
            case 'description':
                $metaDescription = $meta->getAttribute('content');
                break;
            case 'keywords':
                $metaKeywords = $meta->getAttribute('content');
                break;
            case 'robots':
                $metaRobots = $meta->getAttribute('content');
                break;
            case 'yggo':
                $metaYggo = $meta->getAttribute('content');
                break;
        }
    }

    // Update queued page data
    $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
                                             Filter::pageTitle($title->item(0)->nodeValue),
                                             Filter::pageDescription($metaDescription),
                                             Filter::pageKeywords($metaKeywords),
                                             Filter::url($metaYggo),
                                             CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));

    // Append page with meta robots:noindex value to the robotsPostfix disallow list
    if (false !== stripos($metaRobots, 'noindex')) {

        $robots        = new Robots($queueHostPage->robots);
        $robotsPostfix = new Robots($queueHostPage->robotsPostfix);

        // Ignore URI if does not match existing rules yet
        if ($robotsPostfix->uriAllowed($queueHostPage->uri) &&
            $robots->uriAllowed($queueHostPage->uri)) {

            $robotsPostfix->append('Disallow:', $queueHostPage->uri);

            $db->updateHostRobotsPostfix($queueHostPage->hostId, $robotsPostfix->getData(), time());
        }
    }

    // Skip page links following by robots:nofollow attribute detected
    if (false !== stripos($metaRobots, 'nofollow')) {
        continue;
    }

    // Collect internal links from page content
    foreach ($dom->getElementsByTagName('a') as $a) {

        // Skip links without required attribute
        if (!$href = $a->getAttribute('href')) {
            continue;
        }

        // Skip anchor, javascript, mailto and x-raw-image links
        // (the marker is searched anywhere in the href, not only at its start)
        // @TODO skip other apps
        foreach (['#', 'javascript:', 'mailto:', 'x-raw-image:'] as $skipMarker) {
            if (false !== strpos($href, $skipMarker)) {
                continue 2;
            }
        }

        // Add absolute URL prefixes to the relative links found
        if (!parse_url($href, PHP_URL_HOST)) {
            $href = $queueHostURL . '/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
        }

        // Validate formatted link
        if (!filter_var($href, FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $href)) {
            continue;
        }

        $db->beginTransaction();

        try {

            // Parse formatted link
            $hostURL     = Parser::hostURL($href);
            $hostPageURI = Parser::uri($href);

            // Host exists
            if ($host = $db->getHost(crc32($hostURL->string))) {

                $hostStatus        = $host->status;
                $hostPageLimit     = $host->crawlPageLimit;
                $hostId            = $host->hostId;
                $hostRobots        = $host->robots;
                $hostRobotsPostfix = $host->robotsPostfix;

            // Register new host
            } else {

                // Get robots.txt if exists
                // A separate variable keeps the page request in $curl untouched
                $robotsCurl    = new Curl($hostURL->string . '/robots.txt');
                $robotsContent = $robotsCurl->getContent();

                if (200 == $robotsCurl->getCode() && false !== stripos((string) $robotsContent, 'user-agent:')) {
                    $hostRobots = $robotsContent;
                } else {
                    $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
                }

                $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
                $hostStatus        = CRAWL_HOST_DEFAULT_STATUS;
                $hostPageLimit     = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

                $hostId = $db->addHost($hostURL->scheme,
                                       $hostURL->name,
                                       $hostURL->port,
                                       crc32($hostURL->string),
                                       time(),
                                       null,
                                       $hostPageLimit,
                                       (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                       (string) $hostStatus,
                                       $hostRobots,
                                       $hostRobotsPostfix);

                if (!$hostId) {

                    // Close the transaction opened above before skipping this link,
                    // otherwise it is still open when the next link begins its own
                    $db->rollBack();

                    continue;
                }

                $hostsAdded++;
            }

            // Init robots parser: host rules when present, default rules otherwise,
            // followed by the postfix rules
            $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix);

            // Save page info
            if ($hostStatus && // host enabled
                $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
                $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
                !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists

                if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
                    $hostPagesAdded++;
                }
            }

            // Increase page rank when link does not match the current host
            $linkHostURL = $hostURL->scheme . '://' .
                           $hostURL->name .
                           ($hostURL->port ? ':' . $hostURL->port : '');

            if ($linkHostURL !== $queueHostURL) {
                $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
            }

            $db->commit();

        } catch (Exception $e) {

            var_dump($e);

            $db->rollBack();
        }
    }
}
// Debug: print the counters collected during the run
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;

// Parentheses make the subtraction run before the concatenation on every
// PHP version ("." and "-" had equal precedence before PHP 8)
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;