From 034a683df7e2e179cc028f3ee82dfe6a0036d72d Mon Sep 17 00:00:00 2001 From: ghost Date: Mon, 7 Aug 2023 00:13:04 +0300 Subject: [PATCH] add YGGstate DB crawl integration --- README.md | 8 ++++- config/app.php.example | 33 +++++++++++++++++ crontab/crawler.php | 44 +++++++++++++++++++++++ library/yggstate.php | 80 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 library/yggstate.php diff --git a/README.md b/README.md index e486d0e..94977f8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ _Проект присвячується захисникам міста Бахмут_ -Written by inspiration to explore [Yggdrasil](https://yggdrasil-network.github.io) ecosystem, because of last [YaCy](https://yacy.net/) node there was discontinued. +Written by inspiration to explore [Yggdrasil](https://github.com/yggdrasil-network) ecosystem, because of last [YaCy](https://yacy.net/) node there was discontinued. This engine also could be useful for crawling regular websites, small business resources, local networks. The project goal - simple interface, clear architecture and lightweight server requirement. 
@@ -207,6 +207,11 @@ GET m=SphinxQL + [ ] Atom * [ ] Palette image index / filter * [ ] Crawl queue balancer, that depends of CPU available +* [x] Networks integration + + [x] [yggdrasil](https://github.com/yggdrasil-network) + + [x] [YGGstate](https://github.com/YGGverse/YGGstate) (unlimited nodes) + + [x] DB + + [ ] API ##### Cleaner @@ -272,6 +277,7 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway) #### See also * [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave) +* [YGGstate - Yggdrasil Network Analytics](https://github.com/YGGverse/YGGstate) #### Feedback diff --git a/config/app.php.example b/config/app.php.example index 37c1ad3..bb6e1d8 100644 --- a/config/app.php.example +++ b/config/app.php.example @@ -320,6 +320,39 @@ define('CRAWL_MANIFEST', true); */ define('CRAWL_MANIFEST_API_VERSION', 0.13); + +// Integrations + +/* + * Crawl YGGstate for peers to discover new hosts + * + * Yggdrasil networks only + * + * Read more: + * https://github.com/YGGverse/YGGstate + * + */ +define('CRAWL_YGGSTATE', json_encode((object) + [ + 'db' => + [ + [ + // Conditions + 'peer_min_last_uptime' => 60*60*24, // skip short-term connections, seconds + 'timeout' => 60*60*24, // these calls run in the crontab/crawler queue, prevents remote server abuse + + // Connection + 'port' => 3306, + 'host' => '', + 'database' => '', + 'username' => '', + 'password' => '', + ], + // ... + ], + ]) +); + /* * Remove host ban after following time * diff --git a/crontab/crawler.php b/crontab/crawler.php index 574624e..c552ebb 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -31,6 +31,7 @@ require_once(__DIR__ . '/../library/url.php'); require_once(__DIR__ . '/../library/filter.php'); require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . '/../library/helper.php'); +require_once(__DIR__ . '/../library/yggstate.php'); require_once(__DIR__ . 
'/../library/vendor/simple_html_dom.php'); // Check disk quota @@ -84,6 +85,49 @@ try { exit; } +// Check YGGstate connections to discover new hosts +// CRAWL_YGGSTATE holds a JSON string; it is decoded below into server type => list of nodes +if (CRAWL_YGGSTATE) { + + foreach (json_decode(CRAWL_YGGSTATE) as $server => $nodes) { + + foreach ($nodes as $i => $node) { + + switch ($server) { + + // Only the 'db' server type has a case here; any other key matches no case and is skipped + case 'db': + + try { + + // Query the node only while no throttle flag is cached for this server type and node index + if (!$memcached->get(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i))) { + + $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password); + + foreach ($yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) { + + // Register new host + // The peer address is wrapped in square brackets to build the host URL + if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) { + + $hostsAdded += count($linkToDBresult->new->hostId); + $hostPagesAdded += count($linkToDBresult->new->hostPageId); + } + } + + // Cache the throttle flag; the expiration argument is the current time plus $node->timeout + $memcached->set(sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i), true, time() + $node->timeout); + } + + } catch(Exception $e) { + + // When an exception is thrown before the set call above, no throttle flag is cached, so the node is tried again on the next run + var_dump($e); + + // switch counts as a loop structure for continue in PHP, so level 2 targets the foreach over $nodes + continue 2; + } + + break; + } + } + } +} + // Process hosts crawl queue foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) { diff --git a/library/yggstate.php b/library/yggstate.php new file mode 100644 index 0000000..4669cd6 --- /dev/null +++ b/library/yggstate.php @@ -0,0 +1,80 @@ +_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . 
';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); + $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); + $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); /* NOTE(review): set after the PDO object is created; confirm the MySQL driver honours the timeout at this point */ + + $this->_debug = (object) /* query counters per statement type, each starting at 0 */ + [ + 'query' => (object) + [ + 'select' => (object) + [ + 'total' => 0 + ], + 'insert' => (object) + [ + 'total' => 0 + ], + 'update' => (object) + [ + 'total' => 0 + ], + 'delete' => (object) + [ + 'total' => 0 + ], + ] + ]; + } + + // Tools + // Thin wrappers that forward to the same-named PDO methods; the PDO return values are not passed back + public function beginTransaction() { + + $this->_db->beginTransaction(); + } + + public function commit() { + + $this->_db->commit(); + } + + public function rollBack() { + + $this->_db->rollBack(); + } + + // Return the $_debug query counters object + public function getDebug() { + + return $this->_debug; + } + + // Peer + // Return every peer row whose latest peerRemote record (first by timeAdded descending) has an uptime of at least $time + // Rows come back as objects, since PDO::FETCH_OBJ is set as the default fetch mode + // NOTE(review): $time is presumably in seconds, matching the peer_min_last_uptime config comment; confirm against the YGGstate schema + // NOTE(review): HAVING is used here without GROUP BY as a row filter; confirm this is intended over WHERE + public function getPeersByMinLastUptime(int $time) { + + // Count the select before running it + $this->_debug->query->select->total++; + + $query = $this->_db->prepare('SELECT * FROM `peer` + + HAVING ( + SELECT `peerRemote`.`uptime` + FROM `peerRemote` + WHERE `peerRemote`.`peerId` = `peer`.`peerId` + ORDER BY `timeAdded` DESC + LIMIT 1 + ) >= ?'); + + $query->execute([$time]); + + return $query->fetchAll(); + } +}