add meta:robots tag support #2

This commit is contained in:
ghost 2023-04-09 03:28:31 +03:00
parent 6550eb310f
commit 5c8d299a4a
3 changed files with 63 additions and 10 deletions

View File

@ -64,25 +64,46 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
}
// Get optional page meta data
$description = '';
$keywords = '';
$metaDescription = '';
$metaKeywords = '';
$metaRobots = '';
foreach (@$dom->getElementsByTagName('meta') as $meta) {
if (@$meta->getAttribute('name') == 'description') {
$description = @$meta->getAttribute('content');
$metaDescription = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'keywords') {
$keywords = @$meta->getAttribute('content');
$metaKeywords = @$meta->getAttribute('content');
}
if (@$meta->getAttribute('name') == 'robots') {
$metaRobots = @$meta->getAttribute('content');
}
}
// Append page with meta robots:noindex directive to the robotsPostfix disallow list.
// The content attribute is a case-insensitive, comma-separated directive list
// (e.g. "NOINDEX, nofollow"); "none" is shorthand for "noindex, nofollow".
$metaRobotsDirectives = array_map('trim', explode(',', strtolower((string) $metaRobots)));

if (in_array('noindex', $metaRobotsDirectives, true) || in_array('none', $metaRobotsDirectives, true)) {

  // Rows in this queue carry the robotsPostfix value read when the queue was selected.
  // Prefer the value already written during this run, otherwise a second noindex page
  // of the same host would overwrite the rule stored for the first one.
  $currentRobotsPostfix = $robotsPostfixByHostId[$queueHostPage->hostId] ?? $queueHostPage->robotsPostfix;

  $robots        = new Robots($queueHostPage->robots);
  $robotsPostfix = new Robots($currentRobotsPostfix);

  // Skip the URI when it is already disallowed by the existing rules
  if ($robotsPostfix->uriAllowed($queueHostPage->uri) &&
      $robots->uriAllowed($queueHostPage->uri)) {

    $robotsPostfix->append('Disallow:', $queueHostPage->uri);

    $robotsPostfixByHostId[$queueHostPage->hostId] = $robotsPostfix->getData();

    $db->updateHostRobotsPostfix($queueHostPage->hostId, $robotsPostfixByHostId[$queueHostPage->hostId], time());
  }
}
// Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($description),
Filter::pageKeywords($keywords),
Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Collect internal links from page content

View File

@ -65,6 +65,15 @@ class MySQL {
return $query->rowCount();
}
/**
 * Store a new robotsPostfix value for a host and stamp its update time.
 *
 * @param int   $hostId        Value matched against `host`.`hostId`
 * @param mixed $robotsPostfix Value written to `host`.`robotsPostfix`
 * @param int   $timeUpdated   Value written to `host`.`timeUpdated`
 *
 * @return mixed Result of rowCount() on the executed statement
 */
public function updateHostRobotsPostfix(int $hostId, mixed $robotsPostfix, int $timeUpdated) {

  $statement = $this->_db->prepare(
    'UPDATE `host` SET `robotsPostfix` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1'
  );

  // Positional placeholders: the order follows the SET ... WHERE clause above
  $parameters = [$robotsPostfix, $timeUpdated, $hostId];

  $statement->execute($parameters);

  return $statement->rowCount();
}
// Pages
public function getTotalHostPages(int $hostId) {
@ -223,14 +232,16 @@ class MySQL {
// Crawl tools
public function getCrawlQueue(int $limit, int $timeFrom) {
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
`hostPage`.`hostPageId`,
`hostPage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlPageMetaOnly`,
`host`.`robots`
`host`.`robots`,
`host`.`robotsPostfix`
FROM `hostPage`
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)

View File

@ -3,12 +3,15 @@
class Robots {
private $_rule = [];
private $_data = null;
public function __construct(string $data) {
public function __construct(mixed $data) {
$this->_data = $data;
$read = false;
foreach ((array) explode(PHP_EOL, $data) as $row) {
foreach ((array) explode(PHP_EOL, (string) $data) as $row) {
$row = strtolower(trim($row));
@ -60,6 +63,24 @@ class Robots {
return $result;
}
/**
 * Append a "<key> <value>" line to the raw robots data unless the same line already exists.
 *
 * When the data does not start with a "User-agent: *" line, that header is added first.
 * Existing lines are compared as whole lines (trimmed, case-insensitive), so a longer
 * rule such as "Disallow: /foo/bar" no longer hides a shorter one such as "Disallow: /foo".
 *
 * NOTE(review): only the raw data string is modified here; nothing in this method
 * refreshes $this->_rule, so an uriAllowed() call on the same instance presumably
 * does not see the new line - confirm against the constructor.
 *
 * @param string $key   Directive name including the colon, e.g. "Disallow:"
 * @param string $value Directive value, e.g. an URI
 */
public function append(string $key, string $value) {

  // The constructor accepts mixed, so the stored data may be null
  $data = (string) $this->_data;

  if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($data)))) {
    $data .= PHP_EOL . 'User-agent: *' . PHP_EOL;
  }

  $rule   = $key . ' ' . $value;
  $needle = strtolower(trim($rule));

  $exists = false;

  // Whole-line comparison instead of a substring search
  foreach ((preg_split('/\R/', $data) ?: []) as $row) {
    if (strtolower(trim($row)) === $needle) {
      $exists = true;
      break;
    }
  }

  if (!$exists) {
    $data .= PHP_EOL . $rule;
  }

  $this->_data = $data;
}
/**
 * Get the raw robots data held by this instance, including any lines added through append().
 *
 * @return mixed The stored data exactly as kept in $this->_data
 */
public function getData() {

  $data = $this->_data;

  return $data;
}
private function _regex(string $string) {
return str_replace(