<?php

/**
 * Crawler cron task.
 *
 * Takes a batch of queued pages from the database, requests each of them,
 * stores the page title, meta description, meta keywords and (unless
 * CRAWL_HOST_DEFAULT_META_ONLY is set) the page data, then walks the links
 * found on the page and registers the ones that pass validation as new hosts
 * and new host pages. Counters and the total run time are printed at the end.
 *
 * Relies on the constants from ../config/app.php and on the Curl, Robots,
 * Filter, Parser and MySQL classes loaded below.
 */

// Lock multi-thread execution: bail out when the semaphore is already held
$semaphore = sem_get(crc32('crontab.crawler'), 1);

if (false === sem_acquire($semaphore, true)) {

    echo 'Process locked by another thread.' . PHP_EOL;
    exit;
}

// Load system dependencies
require_once('../config/app.php');
require_once('../library/curl.php');
require_once('../library/robots.php');
require_once('../library/filter.php');
require_once('../library/parser.php');
require_once('../library/mysql.php');

// Debug
$timeStart = microtime(true);

$hostPagesProcessed = 0;
$hostPagesIndexed   = 0;
$hostPagesAdded     = 0;
$hostsAdded         = 0;

// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

// Process crawl queue
foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {

    // Build URL from the DB (the port part is optional)
    $queueHostPageURL = $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                        $queueHostPage->uri;

    $curl = new Curl($queueHostPageURL);

    // Update page index anyway, with the current time and http code
    $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode());

    // Skip next page processing non 200 code
    if (200 != $curl->getCode()) {

        continue;
    }

    // Skip next page processing pages without returned data
    $content = $curl->getContent();

    if (!$content) {

        continue;
    }

    // Grab page content; "@" is kept on purpose to silence the warnings
    // loadHTML() emits for malformed markup
    $dom = new DomDocument();

    @$dom->loadHTML($content);

    // Skip index page links without titles
    $title = $dom->getElementsByTagName('title');

    if ($title->length === 0) {

        continue;
    }

    // Get optional page meta data
    $description = '';
    $keywords    = '';

    foreach ($dom->getElementsByTagName('meta') as $meta) {

        $metaName = $meta->getAttribute('name');

        if ($metaName == 'description') {

            $description = $meta->getAttribute('content');
        }

        if ($metaName == 'keywords') {

            $keywords = $meta->getAttribute('content');
        }
    }

    // Update queued page data
    $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
                                             Filter::pageTitle($title->item(0)->nodeValue),
                                             Filter::pageDescription($description),
                                             Filter::pageKeywords($keywords),
                                             CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));

    // Collect internal links from page content
    foreach ($dom->getElementsByTagName('a') as $a) {

        // Skip links without required attribute
        $href = $a->getAttribute('href');

        if (!$href) {

            continue;
        }

        // Skip anchor links
        if (false !== strpos($href, '#')) {

            continue;
        }

        // Skip javascript links
        if (false !== strpos($href, 'javascript:')) {

            continue;
        }

        // Skip mailto links
        if (false !== strpos($href, 'mailto:')) {

            continue;
        }

        // Skip x-raw-image links
        if (false !== strpos($href, 'x-raw-image:')) {

            continue;
        }

        // @TODO skip other apps

        // Add absolute URL prefixes to the relative links found.
        // "../" must be stripped before "./": str_replace() applies the
        // needles in order, and removing "./" first would turn "../foo"
        // into ".foo".
        // NOTE(review): the link is always attached to the host root, the
        // directory of the current page is not taken into account.
        if (!parse_url($href, PHP_URL_HOST)) {

            $href = $queueHostPage->scheme . '://' .
                    $queueHostPage->name .
                    ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                    '/' . ltrim(str_replace(['../', './'], '', $href), '/');
        }

        // Validate formatted link
        if (!filter_var($href, FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $href)) {

            continue;
        }

        $db->beginTransaction();

        try {

            // Parse formatted link
            $hostURL     = Parser::hostURL($href);
            $hostPageURI = Parser::uri($href);

            // Host exists
            if ($host = $db->getHost(crc32($hostURL->string))) {

                $hostStatus    = $host->status;
                $hostPageLimit = $host->crawlPageLimit;
                $hostId        = $host->hostId;
                $hostRobots    = $host->robots;

            // Register new host
            } else {

                // Get robots.txt if exists; a separate variable is used so the
                // page request held in $curl is not overwritten
                $robotsCurl = new Curl($hostURL->string . '/robots.txt');

                if (200 == $robotsCurl->getCode() && false !== stripos($robotsCurl->getContent(), 'user-agent:')) {
                    $hostRobots = $robotsCurl->getContent();
                } else {
                    $hostRobots = null;
                }

                $hostStatus    = CRAWL_HOST_DEFAULT_STATUS;
                $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
                $hostId        = $db->addHost($hostURL->scheme,
                                              $hostURL->name,
                                              $hostURL->port,
                                              crc32($hostURL->string),
                                              time(),
                                              null,
                                              $hostPageLimit,
                                              (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                              (string) $hostStatus,
                                              $hostRobots);

                if ($hostId) {

                    $hostsAdded++;

                } else {

                    // The transaction opened above is still active here: close
                    // it before moving on, otherwise the next link would call
                    // beginTransaction() on top of it
                    $db->rollBack();

                    continue;
                }
            }

            // Init robots parser: host rules when available, configured
            // default rules otherwise
            $robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES);

            // Save page info
            if ($hostStatus && // host enabled
                $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
                $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
                !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists

                if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {

                    $hostPagesAdded++;
                }
            }

            $db->commit();

        } catch (Exception $e) {

            var_dump($e);

            $db->rollBack();
        }
    }
}

// Debug
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;

// Parentheses are required: before PHP 8 "." and "-" shared the same
// precedence, so the subtraction was applied to the concatenated string
echo 'Total time: ' . (microtime(true) - $timeStart) . PHP_EOL;