searchHostPagesTotal(Filter::searchQuery($q, $m), $t); $results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $t, $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal); } else { $resultsTotal = 0; $results = []; } // Mime list $hostPagesMime = $sphinx->getHostPagesMime(); // Define page basics $totalPages = $sphinx->getHostPagesTotal(); $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages), sprintf(_('Over %s pages or enter the new one...'), $totalPages), sprintf(_('Over %s pages or enter the new one...'), $totalPages), ]); // Crawl request if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { $db->beginTransaction(); try { // Parse host info if ($hostURL = Parser::hostURL($q)) { // Host exists if ($host = $db->getHost(crc32($hostURL->string))) { $hostStatus = $host->status; $hostNsfw = $host->nsfw; $hostPageLimit = $host->crawlPageLimit; $hostMetaOnly = $host->crawlMetaOnly; $hostId = $host->hostId; $hostRobots = $host->robots; $hostRobotsPostfix = $host->robotsPostfix; // Register new host } else { // Disk quota not reached if (CRAWL_STOP_DISK_QUOTA_MB_LEFT < disk_free_space('/') / 1000000) { // Get robots.txt if exists $curl = new Curl($hostURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT); if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { $hostRobots = $curl->getContent(); } else { $hostRobots = null; } $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES; $hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0; $hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0; $hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; $hostId = $db->addHost( $hostURL->scheme, $hostURL->name, $hostURL->port, crc32($hostURL->string), time(), null, $hostPageLimit, (string) $hostMetaOnly, (string) $hostStatus, (string) $hostNsfw, $hostRobots, $hostRobotsPostfix); // Add web root host page to make host visible in the crawl queue $db->addHostPage($hostId, crc32('/'), '/', time()); } } // Parse page URI $hostPageURI = Parser::uri($q); // Init robots parser $robots = new Robots((!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . (string) $hostRobotsPostfix); // Save page info if ($hostStatus && // host enabled $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); } } $db->commit(); } catch(Exception $e){ var_dump($e); $db->rollBack(); } } ?>