Browse Source

create manifests registry

main
ghost 2 years ago
parent
commit
6d8f4f4882
  1. 18
      config/app.php.txt
  2. 14
      crontab/crawler.php
  3. BIN
      database/yggo.mwb
  4. 23
      library/mysql.php

18
config/app.php.txt

@ -165,6 +165,24 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
*/ */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
/*
 * Look for third-party manifests to build a distributed index
 *
 * The API address provided in the yggo meta tag
 * will be stored in the `manifest` DB table
 *
 */
define('CRAWL_MANIFEST', true);
/*
 * Set the default auto-crawl status for newly added manifests
 *
 * true  - the crawler starts indexing the manifest automatically
 * false - requires manual validation by the moderator in the DB `manifest`.`status` field
 *
 */
define('CRAWL_MANIFEST_DEFAULT_STATUS', true);
// Cleaner settings // Cleaner settings
/* /*

14
crontab/crawler.php

@ -100,9 +100,21 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
Filter::pageTitle($title->item(0)->nodeValue), Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords), Filter::pageKeywords($metaKeywords),
Filter::url($metaYggo),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Update manifest registry
//
// When manifest collection is enabled and the page declares a yggo meta value that is
// a valid URL matching CRAWL_URL_REGEXP, register it once, keyed by the CRC32 of the URL.
if (CRAWL_MANIFEST && !empty($metaYggo) && filter_var($metaYggo, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggo)) {

    $metaYggoCRC32url = crc32($metaYggo);

    // Skip URLs that are already present in the registry
    if (!$db->getManifest($metaYggoCRC32url)) {

        $db->addManifest($metaYggoCRC32url,
                         $metaYggo,
                         // (string) false is '' in PHP, so map the flag explicitly to '1' / '0'
                         // NOTE(review): assumes `manifest`.`status` stores '0' / '1' - confirm against the schema
                         CRAWL_MANIFEST_DEFAULT_STATUS ? '1' : '0',
                         time());
    }
}
// Append page with meta robots:noindex value to the robotsPostfix disallow list // Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) { if (false !== stripos($metaRobots, 'noindex')) {

BIN
database/yggo.mwb

Binary file not shown.

23
library/mysql.php

@ -28,6 +28,25 @@ class MySQL {
$this->_db->rollBack(); $this->_db->rollBack();
} }
// Manifest
/**
 * Find a manifest record by the CRC32 checksum of its URL.
 *
 * @param int $crc32url value matched against the `manifest`.`crc32url` column
 *
 * @return mixed the first matching row as returned by fetch(); presumably false
 *               when no row matches (PDO semantics - confirm `_db` is a PDO handle)
 */
public function getManifest(int $crc32url) {

    $statement = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');

    $statement->execute([$crc32url]);

    return $statement->fetch();
}
/**
 * Insert a new record into the `manifest` table.
 *
 * @param int    $crc32url    CRC32 checksum of the manifest URL
 * @param string $url         manifest URL
 * @param string $status      value stored in the `status` column
 * @param int    $timeAdded   value stored in the `timeAdded` column
 * @param mixed  $timeUpdated value stored in the `timeUpdated` column, null by default
 *
 * @return mixed the last insert id reported by the DB connection
 */
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {

    // The number of placeholders must match the five listed columns and the five bound values
    $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');

    $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);

    return $this->_db->lastInsertId();
}
// Host // Host
public function getAPIHosts(string $apiHostFields) { public function getAPIHosts(string $apiHostFields) {
@ -184,16 +203,14 @@ class MySQL {
mixed $metaTitle, mixed $metaTitle,
mixed $metaDescription, mixed $metaDescription,
mixed $metaKeywords, mixed $metaKeywords,
mixed $metaYggo,
mixed $data) { mixed $data) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?, $query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?, `metaDescription` = ?,
`metaKeywords` = ?, `metaKeywords` = ?,
`metaYggo` = ?,
`data` = ? WHERE `hostPageId` = ? LIMIT 1'); `data` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $metaYggo, $data, $hostPageId]); $query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]);
return $query->rowCount(); return $query->rowCount();
} }

Loading…
Cancel
Save