mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
add meta:robots tag support #2
This commit is contained in:
parent
6550eb310f
commit
5c8d299a4a
@ -64,25 +64,46 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
||||
}
|
||||
|
||||
// Get optional page meta data
|
||||
$description = '';
|
||||
$keywords = '';
|
||||
$metaDescription = '';
|
||||
$metaKeywords = '';
|
||||
$metaRobots = '';
|
||||
|
||||
foreach (@$dom->getElementsByTagName('meta') as $meta) {
|
||||
|
||||
if (@$meta->getAttribute('name') == 'description') {
|
||||
$description = @$meta->getAttribute('content');
|
||||
$metaDescription = @$meta->getAttribute('content');
|
||||
}
|
||||
|
||||
if (@$meta->getAttribute('name') == 'keywords') {
|
||||
$keywords = @$meta->getAttribute('content');
|
||||
$metaKeywords = @$meta->getAttribute('content');
|
||||
}
|
||||
|
||||
if (@$meta->getAttribute('name') == 'robots') {
|
||||
$metaRobots = @$meta->getAttribute('content');
|
||||
}
|
||||
}
|
||||
|
||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list.
// The content attribute is a comma-separated, case-insensitive directive list
// (e.g. "NOINDEX, nofollow"), and "none" is shorthand for "noindex, nofollow",
// so the value is split into directives instead of being compared as a whole.
$metaRobotsDirectives = array_map('trim', explode(',', strtolower((string) $metaRobots)));

if (in_array('noindex', $metaRobotsDirectives, true) ||
    in_array('none', $metaRobotsDirectives, true)) {

    $robots        = new Robots($queueHostPage->robots);
    $robotsPostfix = new Robots($queueHostPage->robotsPostfix);

    // Only add a rule while both rule sets still allow the URI,
    // otherwise the postfix list would collect redundant entries
    if ($robotsPostfix->uriAllowed($queueHostPage->uri) &&
        $robots->uriAllowed($queueHostPage->uri)) {

        $robotsPostfix->append('Disallow:', $queueHostPage->uri);

        // Persist the extended postfix rules for the host of this page
        $db->updateHostRobotsPostfix($queueHostPage->hostId, $robotsPostfix->getData(), time());
    }
}
|
||||
|
||||
// Update queued page data
|
||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($description),
|
||||
Filter::pageKeywords($keywords),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||
|
||||
// Collect internal links from page content
|
||||
|
@ -65,6 +65,15 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
/**
 * Store new robotsPostfix rules for a host and refresh its update time.
 *
 * @param int   $hostId        Value matched against the `hostId` column
 * @param mixed $robotsPostfix Value written to the `robotsPostfix` column
 * @param int   $timeUpdated   Value written to the `timeUpdated` column
 *
 * @return int Row count reported by the executed statement
 */
public function updateHostRobotsPostfix(int $hostId, mixed $robotsPostfix, int $timeUpdated) {

    $statement = $this->_db->prepare('UPDATE `host` SET `robotsPostfix` = ?, `timeUpdated` = ? WHERE `hostId` = ? LIMIT 1');

    // Positional parameters follow the placeholder order of the statement above
    $parameters = [$robotsPostfix, $timeUpdated, $hostId];

    $statement->execute($parameters);

    return $statement->rowCount();
}
|
||||
|
||||
// Pages
|
||||
public function getTotalHostPages(int $hostId) {
|
||||
|
||||
@ -223,14 +232,16 @@ class MySQL {
|
||||
// Crawl tools
|
||||
public function getCrawlQueue(int $limit, int $timeFrom) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`,
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`hostId`,
|
||||
`hostPage`.`hostPageId`,
|
||||
`hostPage`.`uri`,
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
`host`.`port`,
|
||||
`host`.`crawlPageLimit`,
|
||||
`host`.`crawlPageMetaOnly`,
|
||||
`host`.`robots`
|
||||
`host`.`robots`,
|
||||
`host`.`robotsPostfix`
|
||||
|
||||
FROM `hostPage`
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`)
|
||||
|
@ -3,12 +3,15 @@
|
||||
class Robots {
|
||||
|
||||
private $_rule = [];
|
||||
private $_data = null;
|
||||
|
||||
public function __construct(string $data) {
|
||||
public function __construct(mixed $data) {
|
||||
|
||||
$this->_data = $data;
|
||||
|
||||
$read = false;
|
||||
|
||||
foreach ((array) explode(PHP_EOL, $data) as $row) {
|
||||
foreach ((array) explode(PHP_EOL, (string) $data) as $row) {
|
||||
|
||||
$row = strtolower(trim($row));
|
||||
|
||||
@ -60,6 +63,24 @@ class Robots {
|
||||
return $result;
|
||||
}
|
||||
|
||||
/**
 * Append a rule line to the raw robots data unless that exact line already exists.
 *
 * A "User-agent: *" line is added first when the data does not start with one.
 *
 * NOTE(review): only the raw text in _data is changed here; anything parsed
 * from it at construction time is presumably not refreshed - confirm before
 * calling uriAllowed() on the same instance after append().
 *
 * @param string $key   Rule name including the colon, e.g. "Disallow:"
 * @param string $value Rule value, e.g. the URI to disallow
 */
public function append(string $key, string $value) {

    // _data may be null (the constructor accepts mixed), so normalise it once
    // instead of passing null to trim() / string search functions
    $data = (string) $this->_data;

    if (!preg_match('!^user-agent:\s?\*!', strtolower(trim($data)))) {

        $data .= PHP_EOL . 'User-agent: *' . PHP_EOL;
    }

    $rule = trim($key . ' ' . $value);

    // Compare whole lines: a substring search would treat "Disallow: /foo"
    // as present when only "Disallow: /foobar" exists, and would miss a rule
    // sitting on the very first line. The comparison stays case-insensitive.
    $exists = false;

    foreach (explode(PHP_EOL, $data) as $row) {

        if (0 === strcasecmp(trim($row), $rule)) {

            $exists = true;

            break;
        }
    }

    if (!$exists) {

        $data .= PHP_EOL . $key . ' ' . $value;
    }

    $this->_data = $data;
}
|
||||
|
||||
/**
 * Get the raw robots data held by this instance.
 *
 * @return mixed The current value of the _data property
 */
public function getData() {

    return $this->_data;
}
|
||||
|
||||
private function _regex(string $string) {
|
||||
|
||||
return str_replace(
|
||||
|
Loading…
x
Reference in New Issue
Block a user