2023-04-01 19:29:39 +03:00
|
|
|
<?php
|
|
|
|
|
2023-04-02 00:27:33 +03:00
|
|
|
// Lock multi-thread execution: only one crawler process may run at a time.
// The System V semaphore key is derived from a fixed name and allows a single holder.
$semaphore = sem_get(crc32('crontab.crawler'), 1);

// sem_get() returns false on failure; handing that to sem_acquire() raises a
// TypeError on PHP 8, so stop here with an explicit message instead.
if (false === $semaphore) {

  echo 'Could not create the process semaphore.' . PHP_EOL;

  exit;
}

// Non-blocking acquire (second argument): returns false immediately when
// the semaphore cannot be acquired instead of waiting for it.
if (false === sem_acquire($semaphore, true)) {

  echo 'Process locked by another thread.' . PHP_EOL;

  exit;
}
|
|
|
|
|
2023-04-01 19:29:39 +03:00
|
|
|
// Load system dependencies
|
2023-06-30 14:38:29 +03:00
|
|
|
require_once(__DIR__ . '/../config/app.php');
|
|
|
|
require_once(__DIR__ . '/../library/ftp.php');
|
|
|
|
require_once(__DIR__ . '/../library/curl.php');
|
|
|
|
require_once(__DIR__ . '/../library/robots.php');
|
|
|
|
require_once(__DIR__ . '/../library/filter.php');
|
|
|
|
require_once(__DIR__ . '/../library/parser.php');
|
|
|
|
require_once(__DIR__ . '/../library/mysql.php');
|
|
|
|
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
2023-04-07 04:04:24 +03:00
|
|
|
|
2023-04-23 04:31:32 +03:00
|
|
|
// Check disk quota: stop as soon as the free space on the root partition,
// expressed in units of 1000000 bytes, drops below the configured reserve.
if (disk_free_space('/') / 1000000 < CRAWL_STOP_DISK_QUOTA_MB_LEFT) {

  // exit() with a string argument prints it before terminating
  exit('Disk quota reached.' . PHP_EOL);
}
|
|
|
|
|
2023-04-07 04:04:24 +03:00
|
|
|
// Debug: remember when the run started (float unix timestamp with microseconds)
$timeStart = microtime(true);

// HTTP request statistics, accumulated after every Curl call
$httpRequestsTotal     =
$httpRequestsSizeTotal =
$httpDownloadSizeTotal =
$httpRequestsTimeTotal = 0;

// Crawl queue statistics
$hostPagesProcessed =
$manifestsProcessed = 0;

// Entities registered, banned or snapped during this run
$manifestsAdded     =
$hostPagesAdded     =
$hostsAdded         =
$hostPagesBanned    =
$hostPagesSnapAdded = 0;
|
2023-04-01 19:29:39 +03:00
|
|
|
|
|
|
|
// Connect database; the whole run depends on it, so any failure ends the script
try {

  $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

} catch (Exception $exception) {

  // Debug std: dump the exception to standard output before terminating
  var_dump($exception);

  exit;
}
|
|
|
|
|
|
|
|
// Process manifests crawl queue
//
// For every queued manifest:
//   1. download it and record the crawl attempt (time + HTTP code);
//   2. validate the manifest body against the local crawler settings;
//   3. download the hosts API it points to;
//   4. register every host that is not known yet, together with its web root page.
//
// Each manifest is handled in its own transaction. Validation failures commit
// (so the recorded crawl attempt is kept) and move on to the next manifest;
// an Exception rolls the transaction back.
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {

  $db->beginTransaction();

  try {

    $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Update manifest index anyway, with the current time and http code
    $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

      $db->commit();

      continue;
    }

    // Skip processing without returned data
    if (!$remoteManifest = $curl->getContent()) {

      $db->commit();

      continue;
    }

    // Skip processing on json encoding error
    if (!$remoteManifest = json_decode($remoteManifest)) {

      $db->commit();

      continue;
    }

    // Skip processing on required fields missed
    if (empty($remoteManifest->status) ||
        empty($remoteManifest->result->config->crawlUrlRegexp) ||
        empty($remoteManifest->result->api->version) ||
        empty($remoteManifest->result->api->hosts)) {

      $db->commit();

      continue;
    }

    // Skip processing on API version not compatible
    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {

      $db->commit();

      continue;
    }

    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {

      $db->commit();

      continue;
    }

    // Skip processing on host link does not match condition.
    // preg_match() returns 0 for "no match" and false only on error, so the
    // result is tested for truthiness; a non-string value cannot be a link.
    if (!is_string($remoteManifest->result->api->hosts) ||
        !preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

      $db->commit();

      continue;
    }

    // Begin hosts collection
    $curl = new Curl($remoteManifest->result->api->hosts, CRAWL_CURLOPT_USERAGENT);

    // Update curl stats
    $httpRequestsTotal++;
    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // Skip processing non 200 code
    if (200 != $curl->getCode()) {

      $db->commit();

      continue;
    }

    // Skip processing without returned data
    if (!$remoteManifestHosts = $curl->getContent()) {

      $db->commit();

      continue;
    }

    // Skip processing on json encoding error
    if (!$remoteManifestHosts = json_decode($remoteManifestHosts)) {

      $db->commit();

      continue;
    }

    // Skip processing on required fields missed
    if (empty($remoteManifestHosts->status) ||
        empty($remoteManifestHosts->result)) {

      $db->commit();

      continue;
    }

    // Begin hosts processing
    foreach ($remoteManifestHosts->result as $remoteManifestHost) {

      // Skip processing on required fields missed
      if (empty($remoteManifestHost->scheme) ||
          empty($remoteManifestHost->name)) {

        continue;
      }

      // Port is optional in the remote record
      $hostPort = !empty($remoteManifestHost->port) ? $remoteManifestHost->port : null;

      $hostURL = $remoteManifestHost->scheme . '://' .
                 $remoteManifestHost->name .
                 (null !== $hostPort ? ':' . $hostPort : '');

      // Validate formatted link
      if (!filter_var($hostURL, FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $hostURL)) {

        continue;
      }

      // Skip hosts that are already registered
      if ($db->getHost(crc32($hostURL))) {

        continue;
      }

      // Get robots.txt if exists
      $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

      // Update curl stats
      $httpRequestsTotal++;
      $httpRequestsSizeTotal += $curl->getSizeRequest();
      $httpDownloadSizeTotal += $curl->getSizeDownload();
      $httpRequestsTimeTotal += $curl->getTotalTime();

      // Accept the response as robots rules only when it contains a user-agent directive
      if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
        $hostRobots = $curl->getContent();
      } else {
        $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
      }

      $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

      $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
      $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
      $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
      $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

      // Register the host using the fields of the CURRENT remote host record
      // (not of the whole hosts collection)
      // NOTE(review): argument meaning follows the call order used here; confirm against MySQL::addHost
      $hostId = $db->addHost($remoteManifestHost->scheme,
                             $remoteManifestHost->name,
                             $hostPort,
                             crc32($hostURL),
                             time(),
                             null,
                             $hostPageLimit,
                             (string) $hostMetaOnly,
                             (string) $hostStatus,
                             (string) $hostNsfw,
                             $hostRobots,
                             $hostRobotsPostfix);

      // Add web root host page to make host visible in the crawl queue
      $db->addHostPage($hostId, crc32('/'), '/', time());

      // Increase counters
      $hostPagesAdded++;
      $hostsAdded++;
    }

    // Apply changes
    $db->commit();

  // Process update errors
  } catch (Exception $e) {

    // Debug std
    var_dump($e);

    // Skip item
    $db->rollBack();
  }
}
|
2023-04-09 03:28:31 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
// Process pages crawl queue
|
2023-06-30 13:28:22 +03:00
|
|
|
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET, time() - CRAWL_PAGE_HOME_SECONDS_OFFSET) as $queueHostPage) {
|
2023-06-05 22:01:22 +03:00
|
|
|
|
|
|
|
$db->beginTransaction();
|
|
|
|
|
|
|
|
try {
|
2023-05-05 05:26:53 +03:00
|
|
|
|
|
|
|
// Build URL from the DB
|
|
|
|
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
|
|
|
|
2023-05-06 08:45:37 +03:00
|
|
|
// Init page request
|
2023-05-05 05:26:53 +03:00
|
|
|
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT);
|
2023-04-09 15:25:15 +03:00
|
|
|
|
2023-05-08 08:27:21 +03:00
|
|
|
// Update curl stats
|
2023-05-08 11:04:59 +03:00
|
|
|
$httpRequestsTotal++;
|
2023-05-08 12:10:57 +03:00
|
|
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
2023-05-08 11:04:59 +03:00
|
|
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
|
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
2023-05-08 08:27:21 +03:00
|
|
|
|
2023-05-05 05:26:53 +03:00
|
|
|
// Update page index anyway, with the current time and http code
|
2023-06-13 12:45:12 +03:00
|
|
|
$hostPagesProcessed += $db->updateHostPageCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode(), $curl->getSizeDownload());
|
2023-05-03 09:22:14 +03:00
|
|
|
|
2023-06-04 14:58:33 +03:00
|
|
|
// This page has a non-200 response code
|
2023-05-05 05:26:53 +03:00
|
|
|
if (200 != $curl->getCode()) {
|
2023-05-03 09:22:14 +03:00
|
|
|
|
2023-06-04 14:58:33 +03:00
|
|
|
// Ban this page
|
2023-05-06 10:11:25 +03:00
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-06-04 14:58:33 +03:00
|
|
|
// Try to receive target page location on page redirect available
|
2023-06-04 17:02:32 +03:00
|
|
|
$curl = new Curl($queueHostPageURL, CRAWL_CURLOPT_USERAGENT, 10, true, true);
|
2023-06-04 14:58:33 +03:00
|
|
|
|
|
|
|
// Update curl stats
|
|
|
|
$httpRequestsTotal++;
|
|
|
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
|
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
|
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
|
|
|
|
|
|
if (200 == $curl->getCode()) {
|
|
|
|
|
|
|
|
if (preg_match('~Location: (.*)~i', $curl->getContent(), $match)) {
|
|
|
|
|
|
|
|
if (empty($match[1])) {
|
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-06-04 14:58:33 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$url = trim($match[1]);
|
|
|
|
|
|
|
|
//Make relative links absolute
|
|
|
|
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
|
|
|
|
|
|
|
|
$url = $queueHostPage->scheme . '://' .
|
2023-06-05 22:01:22 +03:00
|
|
|
$queueHostPage->name .
|
2023-06-04 14:58:33 +03:00
|
|
|
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
|
|
|
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
|
|
|
|
}
|
|
|
|
|
|
|
|
// Validate formatted link
|
|
|
|
if (filter_var($url, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $url)) {
|
|
|
|
|
|
|
|
// Parse formatted link
|
|
|
|
$hostURL = Parser::hostURL($url);
|
|
|
|
$hostPageURI = Parser::uri($url);
|
|
|
|
|
|
|
|
// Host exists
|
|
|
|
if ($host = $db->getHost(crc32($hostURL->string))) {
|
|
|
|
|
|
|
|
$hostStatus = $host->status;
|
|
|
|
$hostNsfw = $host->nsfw;
|
|
|
|
$hostPageLimit = $host->crawlPageLimit;
|
|
|
|
$hostMetaOnly = $host->crawlMetaOnly;
|
|
|
|
$hostId = $host->hostId;
|
|
|
|
$hostRobots = $host->robots;
|
|
|
|
$hostRobotsPostfix = $host->robotsPostfix;
|
|
|
|
|
|
|
|
// Register new host
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// Get robots.txt if exists
|
|
|
|
$curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
|
|
|
|
|
|
|
// Update curl stats
|
|
|
|
$httpRequestsTotal++;
|
|
|
|
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
|
|
|
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
|
|
|
$httpRequestsTimeTotal += $curl->getTotalTime();
|
|
|
|
|
|
|
|
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
|
|
|
$hostRobots = $curl->getContent();
|
|
|
|
} else {
|
|
|
|
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
|
|
|
}
|
|
|
|
|
|
|
|
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
|
|
|
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
|
|
|
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
|
|
|
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
|
|
|
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
|
|
|
|
|
|
|
$hostId = $db->addHost( $hostURL->scheme,
|
|
|
|
$hostURL->name,
|
|
|
|
$hostURL->port,
|
|
|
|
crc32($hostURL->string),
|
|
|
|
time(),
|
|
|
|
null,
|
|
|
|
$hostPageLimit,
|
|
|
|
(string) $hostMetaOnly,
|
|
|
|
(string) $hostStatus,
|
|
|
|
(string) $hostNsfw,
|
|
|
|
$hostRobots,
|
|
|
|
$hostRobotsPostfix);
|
|
|
|
|
|
|
|
// Add web root host page to make host visible in the crawl queue
|
|
|
|
$db->addHostPage($hostId, crc32('/'), '/', time());
|
|
|
|
|
|
|
|
// Increase counters
|
|
|
|
$hostPagesAdded++;
|
|
|
|
$hostsAdded++;
|
|
|
|
|
|
|
|
// When page is root, skip next operations
|
|
|
|
if ($hostPageURI->string == '/') {
|
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-06-04 14:58:33 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Init robots parser
|
|
|
|
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
|
|
|
|
|
|
|
// Save page info
|
|
|
|
if ($hostStatus && // host enabled
|
|
|
|
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
|
|
|
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
|
|
|
|
|
|
|
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
|
|
|
|
|
|
|
$hostPageId = $hostPage->hostPageId;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
|
|
|
|
|
|
|
// Apply referer meta description to the target page before indexing it
|
|
|
|
if ($lastHostPageDescription = $db->getLastPageDescription($queueHostPage->hostPageId)) {
|
|
|
|
|
|
|
|
$db->addHostPageDescription($hostPageId,
|
|
|
|
$lastHostPageDescription->title,
|
|
|
|
$lastHostPageDescription->description,
|
|
|
|
$lastHostPageDescription->keywords,
|
|
|
|
$hostMetaOnly ? null : ($lastHostPageDescription->data ? base64_encode($lastHostPageDescription->data) : null),
|
|
|
|
time());
|
|
|
|
}
|
|
|
|
|
|
|
|
$hostPagesAdded++;
|
|
|
|
}
|
|
|
|
|
|
|
|
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip other this page actions
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-05-05 05:26:53 +03:00
|
|
|
continue;
|
2023-05-03 09:22:14 +03:00
|
|
|
}
|
|
|
|
|
2023-05-10 14:47:33 +03:00
|
|
|
// Validate MIME content type
|
|
|
|
if ($contentType = $curl->getContentType()) {
|
|
|
|
|
|
|
|
$db->updateHostPageMime($queueHostPage->hostPageId, Filter::mime($contentType), time());
|
|
|
|
|
2023-05-10 19:35:01 +03:00
|
|
|
// Ban page if not available
|
2023-05-10 14:47:33 +03:00
|
|
|
} else {
|
2023-05-05 21:25:57 +03:00
|
|
|
|
2023-05-06 10:11:25 +03:00
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-05-05 21:25:57 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-06-13 23:09:44 +03:00
|
|
|
// Check for MIME
|
2023-05-10 19:35:01 +03:00
|
|
|
$hostPageInMime = false;
|
2023-05-13 10:15:07 +03:00
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_INDEX) as $mime) {
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-06-13 22:29:28 +03:00
|
|
|
// Ban page on MIME type not allowed in settings
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
2023-05-10 18:35:18 +03:00
|
|
|
|
2023-05-10 19:35:01 +03:00
|
|
|
$hostPageInMime = true;
|
2023-05-08 12:10:57 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-10 19:35:01 +03:00
|
|
|
// Ban page not in MIME list
|
|
|
|
if (!$hostPageInMime) {
|
2023-05-08 12:10:57 +03:00
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
2023-05-08 13:12:16 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-05-08 13:12:16 +03:00
|
|
|
continue;
|
2023-05-05 21:25:57 +03:00
|
|
|
}
|
|
|
|
|
2023-05-05 21:39:48 +03:00
|
|
|
// Skip page processing without returned data
|
2023-05-05 05:26:53 +03:00
|
|
|
if (!$content = $curl->getContent()) {
|
2023-04-09 03:28:31 +03:00
|
|
|
|
2023-06-04 17:44:09 +03:00
|
|
|
// Prevent page ban when it MIME in the whitelist, skip steps below only
|
|
|
|
// This case possible for multimedia/streaming resources index
|
|
|
|
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-05-05 05:26:53 +03:00
|
|
|
continue;
|
|
|
|
}
|
2023-04-01 19:29:39 +03:00
|
|
|
|
2023-06-13 23:09:44 +03:00
|
|
|
// Is HTML document
|
|
|
|
if (false !== stripos(Filter::mime($contentType), 'text/html')) {
|
2023-05-05 05:26:53 +03:00
|
|
|
|
2023-06-14 02:53:14 +03:00
|
|
|
// Define variables
|
|
|
|
$metaDescription = null;
|
|
|
|
$metaKeywords = null;
|
|
|
|
$metaYggoManifest = null;
|
|
|
|
|
2023-06-25 22:10:47 +03:00
|
|
|
// Parse page content
|
2023-05-10 18:35:18 +03:00
|
|
|
$dom = new DomDocument();
|
2023-04-01 19:29:39 +03:00
|
|
|
|
2023-06-16 13:23:52 +03:00
|
|
|
if ($encoding = mb_detect_encoding($content)) {
|
|
|
|
|
|
|
|
@$dom->loadHTML(sprintf('<?xml encoding="%s" ?>', $encoding) . $content);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
|
|
|
|
|
|
|
$db->commit();
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-07-12 12:27:30 +03:00
|
|
|
// Skip index page links without title tag
|
2023-05-10 18:35:18 +03:00
|
|
|
$title = @$dom->getElementsByTagName('title');
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
if ($title->length == 0) {
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
$db->commit();
|
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
continue;
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
} else {
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaTitle = $title->item(0)->nodeValue;
|
2023-05-05 05:26:53 +03:00
|
|
|
}
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
// Get optional page meta data
|
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
if (@$meta->getAttribute('name') == 'description') {
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaDescription = @$meta->getAttribute('content');
|
2023-05-10 18:35:18 +03:00
|
|
|
}
|
2023-05-04 01:04:39 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
if (@$meta->getAttribute('name') == 'keywords') {
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaKeywords = @$meta->getAttribute('content');
|
2023-05-10 18:35:18 +03:00
|
|
|
}
|
2023-05-06 07:25:54 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
if (@$meta->getAttribute('name') == 'robots') {
|
2023-05-06 08:45:37 +03:00
|
|
|
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaRobots = @$meta->getAttribute('content');
|
2023-05-06 07:25:54 +03:00
|
|
|
|
2023-07-12 12:27:30 +03:00
|
|
|
// Ban page with meta robots:noindex attribute
|
2023-06-14 02:53:14 +03:00
|
|
|
if (false !== stripos($metaRobots, 'noindex')) {
|
2023-05-06 07:25:54 +03:00
|
|
|
|
2023-05-10 18:35:18 +03:00
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-10 19:35:01 +03:00
|
|
|
// Grab meta yggo:manifest link when available
|
2023-05-10 18:35:18 +03:00
|
|
|
if (@$meta->getAttribute('name') == 'yggo:manifest') {
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaYggoManifest = Filter::url(@$meta->getAttribute('content'));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add queued page description if not exists
|
|
|
|
$db->addHostPageDescription($queueHostPage->hostPageId,
|
|
|
|
$metaTitle,
|
|
|
|
$metaDescription ? Filter::pageDescription($metaDescription) : null,
|
|
|
|
$metaKeywords ? Filter::pageKeywords($metaKeywords) : null,
|
|
|
|
$content ? ($queueHostPage->crawlMetaOnly ? null : base64_encode($content)) : null,
|
|
|
|
time());
|
|
|
|
|
2023-07-12 12:16:26 +03:00
|
|
|
// Collect page DOM elements data on enabled
|
|
|
|
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {
|
|
|
|
|
|
|
|
// Begin selectors extraction
|
|
|
|
$html = str_get_html($content);
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {
|
|
|
|
|
|
|
|
foreach($html->find($selector) as $element) {
|
|
|
|
|
|
|
|
if (!empty($element->innertext)) {
|
|
|
|
|
|
|
|
$db->addHostPageDom($queueHostPage->hostPageId,
|
|
|
|
time(),
|
|
|
|
$selector,
|
|
|
|
trim(CRAWL_HOST_PAGE_DOM_STRIP_TAGS ? strip_tags(
|
|
|
|
preg_replace('/[\s]+/',
|
|
|
|
' ',
|
|
|
|
str_replace(['<br />', '<br/>', '<br>', '</'],
|
|
|
|
[' ', ' ', ' ', ' </'],
|
|
|
|
$element->innertext))) : $element->innertext));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-12 12:27:30 +03:00
|
|
|
// Begin snaps
|
|
|
|
$snapLocal = false;
|
|
|
|
$snapMega = false;
|
|
|
|
|
|
|
|
// Snap local enabled and MIME in white list
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_LOCAL) {
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_LOCAL) as $mime) {
|
|
|
|
|
|
|
|
// MIME type allowed in settings
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
|
|
|
|
|
|
|
$snapLocal = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Snap MEGA enabled and MIME in white list
|
|
|
|
if (false !== CRAWL_PAGE_MIME_SNAP_MEGA) {
|
|
|
|
|
|
|
|
foreach ((array) explode(',', CRAWL_PAGE_MIME_SNAP_MEGA) as $mime) {
|
|
|
|
|
|
|
|
// MIME type allowed in settings
|
|
|
|
if (false !== stripos(Filter::mime($contentType), Filter::mime($mime))) {
|
|
|
|
|
|
|
|
$snapMega = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// At least one snap storage match settings condition
|
|
|
|
if ($snapLocal || $snapMega) {
|
|
|
|
|
|
|
|
$crc32data = crc32($content);
|
|
|
|
|
|
|
|
// Create not duplicated data snaps only, even new time
|
|
|
|
if (!$db->findHostPageSnap($queueHostPage->hostPageId, $crc32data)) {
|
|
|
|
|
|
|
|
$snapTime = time();
|
|
|
|
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
|
|
|
|
|
|
|
|
$snapTmp = __DIR__ . '/../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
|
|
|
|
@mkdir(__DIR__ . '/../storage/tmp/snap/hp/' . $snapPath, 0755, true);
|
|
|
|
|
|
|
|
// Create new ZIP container
|
|
|
|
$zip = new ZipArchive();
|
|
|
|
|
|
|
|
if (true === $zip->open($snapTmp, ZipArchive::CREATE)) {
|
|
|
|
|
|
|
|
// Insert compressed snap data into the tmp storage
|
|
|
|
if (true === $zip->addFromString('DATA', $content) &&
|
|
|
|
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
|
|
|
|
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
|
|
|
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
|
|
|
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
|
|
|
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
|
|
|
|
|
|
|
// Done
|
|
|
|
$zip->close();
|
|
|
|
|
|
|
|
// Temporarily snap file exists
|
|
|
|
if (file_exists($snapTmp)) {
|
|
|
|
|
|
|
|
// Register snap in DB
|
|
|
|
if ($hostPageSnapId = $db->addHostPageSnap($queueHostPage->hostPageId, $crc32data, $snapTime)) {
|
|
|
|
|
|
|
|
$hostPagesSnapAdded++;
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent local storage
|
|
|
|
if ($snapLocal) {
|
|
|
|
|
|
|
|
@mkdir(__DIR__ . '/../storage/snap/hp/' . $snapPath, 0755, true);
|
|
|
|
|
|
|
|
if (copy($snapTmp, __DIR__ . '/../storage/snap/hp/' . $snapPath . $snapTime . '.zip')) {
|
|
|
|
|
|
|
|
// Update snap location info
|
|
|
|
$db->updateHostPageSnapStorageLocal($hostPageSnapId, true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copy tmp snap to the permanent MEGA storage
|
|
|
|
if ($snapMega) {
|
|
|
|
|
|
|
|
$ftp = new Ftp();
|
|
|
|
|
|
|
|
if ($ftp->connect(MEGA_FTP_HOST, MEGA_FTP_PORT, null, null, MEGA_FTP_DIRECTORY)) {
|
|
|
|
|
|
|
|
$ftp->mkdir('hp/' . $snapPath, true);
|
|
|
|
|
|
|
|
if ($ftp->copy($snapTmp, 'hp/' . $snapPath . $snapTime . '.zip')) {
|
|
|
|
|
|
|
|
// Update snap location info
|
|
|
|
$db->updateHostPageSnapStorageMega($hostPageSnapId, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
$ftp->close();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove tmp
|
|
|
|
@unlink($snapTmp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip page links following with meta robots:nofollow attribute
|
|
|
|
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
|
|
|
|
|
|
|
if (@$meta->getAttribute('name') == 'robots') {
|
|
|
|
|
|
|
|
if (false !== stripos($metaRobots, 'nofollow')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-06-14 02:53:14 +03:00
|
|
|
// Update manifest registry
|
|
|
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
|
|
|
|
|
|
|
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
|
|
|
|
|
|
|
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
2023-07-12 12:16:26 +03:00
|
|
|
$db->addManifest($metaYggoManifestCRC32,
|
2023-06-14 02:53:14 +03:00
|
|
|
$metaYggoManifest,
|
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
|
|
|
time());
|
|
|
|
|
|
|
|
$manifestsAdded++;
|
2023-05-10 18:35:18 +03:00
|
|
|
}
|
|
|
|
}
|
2023-06-13 23:09:44 +03:00
|
|
|
|
|
|
|
// Begin page links collection
|
|
|
|
$links = [];
|
|
|
|
|
|
|
|
// Collect image links
|
|
|
|
foreach (@$dom->getElementsByTagName('img') as $img) {
|
|
|
|
|
|
|
|
// Skip images without src attribute
|
|
|
|
if (!$src = @$img->getAttribute('src')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip images without alt attribute
|
|
|
|
if (!$alt = @$img->getAttribute('alt')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!$title = @$img->getAttribute('title')) {
|
2023-06-14 02:53:14 +03:00
|
|
|
$title = null;
|
2023-06-13 23:09:44 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Skip encoded content
|
|
|
|
if (false !== stripos($src, 'data:')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add link to queue
|
|
|
|
$links[] = [
|
|
|
|
'title' => null,
|
|
|
|
'description' => null,
|
|
|
|
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
|
|
|
|
'data' => null,
|
|
|
|
'mime' => null,
|
|
|
|
'ref' => $src,
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Collect media links
|
|
|
|
foreach (@$dom->getElementsByTagName('source') as $source) {
|
|
|
|
|
|
|
|
// Skip media sources without src attribute
|
|
|
|
if (!$src = @$source->getAttribute('src')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip media without type attribute
|
|
|
|
if (!$type = @$source->getAttribute('type')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip encoded content
|
|
|
|
if (false !== stripos($src, 'data:')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add link to queue
|
|
|
|
$links[] = [
|
|
|
|
'title' => null,
|
|
|
|
'description' => null,
|
|
|
|
'keywords' => null,
|
|
|
|
'data' => null,
|
|
|
|
'mime' => Filter::mime($type),
|
|
|
|
'ref' => $src,
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
foreach (@$dom->getElementsByTagName('video') as $video) {
|
|
|
|
|
|
|
|
// Skip videos without src attribute
|
|
|
|
if (!$src = @$video->getAttribute('src')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fall back to a generic video MIME type when the type attribute is missing
|
|
|
|
if (!$type = @$video->getAttribute('type')) {
|
|
|
|
$type = 'video/*';
|
|
|
|
}
|
|
|
|
|
|
|
|
// Skip encoded content
|
|
|
|
if (false !== stripos($src, 'data:')) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add link to queue
|
|
|
|
$links[] = [
|
|
|
|
'title' => null,
|
|
|
|
'description' => null,
|
|
|
|
'keywords' => null,
|
|
|
|
'data' => null,
|
|
|
|
'mime' => Filter::mime($type),
|
|
|
|
'ref' => $src,
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Collect media links declared directly on <audio> elements
foreach (@$dom->getElementsByTagName('audio') as $audio) {

  // Skip elements without src attribute
  if (!$src = @$audio->getAttribute('src')) {

    continue;
  }

  // Elements without type attribute are not skipped: fall back to the generic audio type
  if (!$type = @$audio->getAttribute('type')) {

    $type = 'audio/*';
  }

  // Skip inline encoded content.
  // The check is anchored to the beginning of the value: only real "data:" URIs are dropped,
  // regular URLs that merely contain this substring (e.g. "?tag=metadata:hd") stay in the queue.
  if (0 === stripos(ltrim($src), 'data:')) {

    continue;
  }

  // Add link to queue
  $links[] = [
    'title'       => null,
    'description' => null,
    'keywords'    => null,
    'data'        => null,
    'mime'        => Filter::mime($type),
    'ref'         => $src,
  ];
}
|
|
|
|
|
|
|
|
// Collect links from the <a> elements of the page content
foreach (@$dom->getElementsByTagName('a') as $a) {

  // Skip links without required attribute
  if (!$href = @$a->getAttribute('href')) {

    continue;
  }

  // Get title attribute if available, an empty value is normalized to null
  $title = @$a->getAttribute('title') ?: null;

  // Skip anchor links
  // NOTE(review): any href containing "#" is dropped, including "/page#section" - confirm this is intended
  if (false !== strpos($href, '#')) {

    continue;
  }

  // Skip javascript, mailto and magnet links.
  // The scheme is matched at the beginning of the value only, so regular URLs
  // that merely contain one of these substrings in a path or query are still queued.
  // x-raw-image: links are currently not filtered here.
  if (preg_match('/^\s*(?:javascript|mailto|magnet):/i', $href)) {

    continue;
  }

  // Add link to queue, the title attribute is used as the keywords source
  $links[] = [
    'title'       => null,
    'description' => null,
    'keywords'    => Filter::pageKeywords($title),
    'data'        => null,
    'mime'        => null,
    'ref'         => $href,
  ];
}
|
|
|
|
|
|
|
|
// Walk through every collected link: normalize it, resolve (or register) its host
// and store the target page together with the relation to the page being crawled
foreach ($links as $link) {

  // References without a host part are attached to the root of the queue page host;
  // "./" and "../" segments are removed from the reference, not resolved
  if (!parse_url($link['ref'], PHP_URL_HOST)) {

    $link['ref'] = sprintf(
      '%s://%s%s/%s',
      $queueHostPage->scheme,
      $queueHostPage->name,
      $queueHostPage->port ? ':' . $queueHostPage->port : '',
      trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.')
    );
  }

  // Nothing to do with links that are not valid URLs or do not match the crawler URL pattern
  if (!filter_var($link['ref'], FILTER_VALIDATE_URL) || !preg_match(CRAWL_URL_REGEXP, $link['ref'])) {

    continue;
  }

  // Split the link into the host and the page parts
  $hostURL     = Parser::hostURL($link['ref']);
  $hostPageURI = Parser::uri($link['ref']);

  if ($host = $db->getHost(crc32($hostURL->string))) {

    // Known host: take the settings from its database record
    $hostId            = $host->hostId;
    $hostStatus        = $host->status;
    $hostNsfw          = $host->nsfw;
    $hostMetaOnly      = $host->crawlMetaOnly;
    $hostPageLimit     = $host->crawlPageLimit;
    $hostRobots        = $host->robots;
    $hostRobotsPostfix = $host->robotsPostfix;

  } else {

    // Unknown host: request its robots.txt before the registration
    $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);

    // Account the request in the crawler statistics
    ++$httpRequestsTotal;

    $httpRequestsSizeTotal += $curl->getSizeRequest();
    $httpDownloadSizeTotal += $curl->getSizeDownload();
    $httpRequestsTimeTotal += $curl->getTotalTime();

    // The response is accepted only with code 200 and a "user-agent:" directive inside,
    // otherwise the default rules are applied
    $hostRobots = (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:'))
                ? $curl->getContent()
                : CRAWL_ROBOTS_DEFAULT_RULES;

    // Remaining settings come from the application defaults
    $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

    $hostStatus    = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
    $hostNsfw      = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
    $hostMetaOnly  = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
    $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;

    $hostId = $db->addHost(
      $hostURL->scheme,
      $hostURL->name,
      $hostURL->port,
      crc32($hostURL->string),
      time(),
      null,
      $hostPageLimit,
      (string) $hostMetaOnly,
      (string) $hostStatus,
      (string) $hostNsfw,
      $hostRobots,
      $hostRobotsPostfix
    );

    // The web root page is registered along with the host to make it visible in the crawl queue
    $db->addHostPage($hostId, crc32('/'), '/', time());

    ++$hostPagesAdded;
    ++$hostsAdded;

    // The link points to the web root that was just added, so there is nothing left to store
    if ($hostPageURI->string == '/') {

      continue;
    }
  }

  // Build the robots parser from the host rules followed by the postfix rules,
  // empty values are replaced with the application defaults
  $robots = new Robots(
    ($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) .
    PHP_EOL .
    ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)
  );

  // The page is stored only when the host is enabled, the URI is allowed by the robots rules
  // and the host pages limit is not reached yet (conditions are evaluated in this order)
  if (!$hostStatus ||
      !$robots->uriAllowed($hostPageURI->string) ||
      !($hostPageLimit > $db->getTotalHostPages($hostId))) {

    continue;
  }

  if (!$hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {

    // New page: register it with the description collected from the link
    $hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());

    $db->addHostPageDescription(
      $hostPageId,
      $link['title'],
      $link['description'],
      $link['keywords'],
      $hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null), // data is not passed for meta-only hosts
      time()
    );

    ++$hostPagesAdded;

  } else {

    // Page already registered: reuse its identifier
    $hostPageId = $hostPage->hostPageId;
  }

  // Relate the queue page to the page it refers to
  $db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
}
|
2023-05-06 07:25:54 +03:00
|
|
|
}
|
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
// Apply changes
|
|
|
|
$db->commit();
|
2023-04-07 04:04:24 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
// Process update errors
|
|
|
|
} catch (Exception $e) {
|
2023-04-07 04:04:24 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
// Debug std
|
|
|
|
var_dump($e);
|
2023-06-04 12:04:41 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
// Ban page that throws the data type error and stuck the crawl queue
|
|
|
|
if (!empty($queueHostPage->hostPageId) &&
|
|
|
|
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
|
2023-06-17 11:33:32 +03:00
|
|
|
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO change DB
|
2023-06-05 22:01:22 +03:00
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
|
|
|
|
|
|
|
$hostPagesProcessed++;
|
|
|
|
|
2023-06-17 11:33:32 +03:00
|
|
|
// Apply changes
|
|
|
|
$db->commit();
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// Skip item
|
|
|
|
$db->rollBack();
|
|
|
|
|
|
|
|
}
|
2023-04-07 04:04:24 +03:00
|
|
|
|
2023-06-05 22:01:22 +03:00
|
|
|
continue;
|
2023-06-04 12:04:41 +03:00
|
|
|
}
|
2023-04-07 04:04:24 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Finalize the run statistics
$executionTimeTotal = microtime(true) - $timeStart;

// Scale the accumulated HTTP time down by 10^6
// NOTE(review): presumably microseconds to seconds - confirm against Curl::getTotalTime()
$httpRequestsTimeTotal /= 1000000;

// Store the statistics in the crawler log when logging is enabled
if (CRAWL_LOG_ENABLED) {

  $db->addCrawlerLog(
    time(),
    $hostsAdded,
    $hostPagesProcessed,
    $hostPagesAdded,
    $hostPagesSnapAdded,
    $hostPagesBanned,
    $manifestsProcessed,
    $manifestsAdded,
    $httpRequestsTotal,
    $httpRequestsSizeTotal,
    $httpDownloadSizeTotal,
    $httpRequestsTimeTotal,
    $executionTimeTotal
  );
}

// Print the same statistics to the standard output, one counter per line
echo 'Hosts added: ', $hostsAdded, PHP_EOL,

     'Pages processed: ', $hostPagesProcessed, PHP_EOL,
     'Pages added: ', $hostPagesAdded, PHP_EOL,
     'Pages snaps added: ', $hostPagesSnapAdded, PHP_EOL,
     'Pages banned: ', $hostPagesBanned, PHP_EOL,

     'Manifests processed: ', $manifestsProcessed, PHP_EOL,
     'Manifests added: ', $manifestsAdded, PHP_EOL,

     'HTTP Requests total: ', $httpRequestsTotal, PHP_EOL,
     'HTTP Requests total size: ', $httpRequestsSizeTotal, PHP_EOL,
     'HTTP Download total size: ', $httpDownloadSizeTotal, PHP_EOL,
     'HTTP Requests total time: ', $httpRequestsTimeTotal, PHP_EOL,

     'Total time: ', $executionTimeTotal, PHP_EOL, PHP_EOL;
|