add YGGstate DB crawl integration

This commit is contained in:
ghost 2023-08-07 00:13:04 +03:00
parent 3d9db381e8
commit 034a683df7
4 changed files with 164 additions and 1 deletions

View File

@ -2,7 +2,7 @@
_Проект присвячується захисникам міста Бахмут_ _Проект присвячується захисникам міста Бахмут_
Written out of inspiration to explore the [Yggdrasil](https://yggdrasil-network.github.io) ecosystem, because the last [YaCy](https://yacy.net/) node there was discontinued. Written out of inspiration to explore the [Yggdrasil](https://github.com/yggdrasil-network) ecosystem, because the last [YaCy](https://yacy.net/) node there was discontinued.
This engine also could be useful for crawling regular websites, small business resources, local networks. This engine also could be useful for crawling regular websites, small business resources, local networks.
The project goal - simple interface, clear architecture and lightweight server requirement. The project goal - simple interface, clear architecture and lightweight server requirement.
@ -207,6 +207,11 @@ GET m=SphinxQL
+ [ ] Atom + [ ] Atom
* [ ] Palette image index / filter * [ ] Palette image index / filter
* [ ] Crawl queue balancer that depends on available CPU * [ ] Crawl queue balancer that depends on available CPU
* [x] Networks integration
+ [x] [yggdrasil](https://github.com/yggdrasil-network)
+ [x] [YGGstate](https://github.com/YGGverse/YGGstate) (unlimited nodes)
+ [x] DB
+ [ ] API
##### Cleaner ##### Cleaner
@ -272,6 +277,7 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### See also #### See also
* [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave) * [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave)
* [YGGstate - Yggdrasil Network Analytics](https://github.com/YGGverse/YGGstate)
#### Feedback #### Feedback

View File

@ -320,6 +320,39 @@ define('CRAWL_MANIFEST', true);
*/ */
define('CRAWL_MANIFEST_API_VERSION', 0.13); define('CRAWL_MANIFEST_API_VERSION', 0.13);
// Integrations
/*
* Crawl YGGstate databases for peers to discover new hosts
*
* Yggdrasil networks only
*
* The value is a JSON-encoded object: the crawler decodes it and walks every
* node listed under each server type key (only `db` is handled so far).
* Append more arrays to `db` to poll several YGGstate databases.
*
* Read more:
* https://github.com/YGGverse/YGGstate
*
*/
define('CRAWL_YGGSTATE', json_encode((object)
[
'db' =>
[
[
// Conditions
'peer_min_last_uptime' => 60*60*24, // seconds; skip short-term connections (peers whose latest recorded uptime is below this value)
'timeout' => 60*60*24, // seconds between polls of this node; these calls run in the crontab/crawler queue, so this prevents remote server abuse
// Connection
'port' => 3306,
'host' => '',
'database' => '',
'username' => '',
'password' => '',
],
// ...
],
])
);
/* /*
* Remove host ban after following time * Remove host ban after following time
* *

View File

@ -31,6 +31,7 @@ require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/filter.php'); require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php'); require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/yggstate.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php'); require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// Check disk quota // Check disk quota
@ -84,6 +85,49 @@ try {
exit; exit;
} }
// Check YGGstate connections to discover new hosts.
//
// CRAWL_YGGSTATE is a JSON string shaped as {"<server type>": [<node>, ...]}.
// Only the "db" server type is handled; other types are silently ignored.
if (CRAWL_YGGSTATE) {

  foreach (json_decode(CRAWL_YGGSTATE) as $server => $nodes) {

    foreach ($nodes as $i => $node) {

      // Per-node throttle flag: while it exists in the cache the node is not polled again
      $yggStateTimeoutKey = sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i);

      switch ($server) {

        case 'db':

          try {

            if (!$memcached->get($yggStateTimeoutKey)) {

              $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);

              foreach ($yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) {

                // Register new host (the peer address is wrapped in brackets as an IPv6 literal)
                if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) {

                  $hostsAdded     += count($linkToDBresult->new->hostId);
                  $hostPagesAdded += count($linkToDBresult->new->hostPageId);
                }
              }

              // The flag is stored only after a successful poll, so a failed node is retried on the next run
              $memcached->set($yggStateTimeoutKey, true, time() + $node->timeout);
            }

          } catch (Exception $e) {

            // Report the class and message only: dumping the whole exception would also print its
            // stack trace, which may contain the YGGstate constructor arguments (database credentials)
            var_dump(sprintf('%s: %s', get_class($e), $e->getMessage()));

            // `switch` counts as a loop structure for `continue`, so level 2 moves on to the next node
            continue 2;
          }

        break;
      }
    }
  }
}
// Process hosts crawl queue // Process hosts crawl queue
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) { foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {

80
library/yggstate.php Normal file
View File

@ -0,0 +1,80 @@
<?php
/**
 * Thin PDO/MySQL client for a YGGstate database.
 *
 * Opens one connection per instance, exposes transaction pass-throughs and
 * counts the queries it issues (see getDebug()).
 */
class YGGstate {

  /** Connection to the YGGstate database */
  private PDO $_db;

  /** Query counters, shaped as query->{select,insert,update,delete}->total */
  private object $_debug;

  /**
   * Connect to the YGGstate MySQL database.
   *
   * @param string $host     MySQL server host
   * @param int    $port     MySQL server port
   * @param string $database database (schema) name
   * @param string $username MySQL user
   * @param string $password MySQL password
   *
   * @throws PDOException when the connection cannot be established
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password) {

    $this->_db = new PDO(
      'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8',
      $username,
      $password,
      [
        PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8',
        PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION, // failed queries throw instead of returning false
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,         // rows are returned as objects
      ]
    );

    // NOTE(review): pdo_mysql appears to read PDO::ATTR_TIMEOUT (connect timeout) only from the
    // constructor options, so this call is probably a no-op. It is kept as-is because actually
    // applying a 600 second connect timeout would change runtime behavior - confirm the intent.
    $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600);

    $this->_debug = (object)
    [
      'query' => (object)
      [
        'select' => (object) ['total' => 0],
        'insert' => (object) ['total' => 0],
        'update' => (object) ['total' => 0],
        'delete' => (object) ['total' => 0],
      ]
    ];
  }

  // Tools

  /** Start a transaction on the underlying connection. */
  public function beginTransaction(): void {

    $this->_db->beginTransaction();
  }

  /** Commit the current transaction. */
  public function commit(): void {

    $this->_db->commit();
  }

  /** Roll back the current transaction. */
  public function rollBack(): void {

    $this->_db->rollBack();
  }

  /**
   * Get the query counters collected by this instance.
   *
   * @return object the live counter object (not a copy)
   */
  public function getDebug(): object {

    return $this->_debug;
  }

  // Peer

  /**
   * Get the peers whose most recent `peerRemote` record reports an uptime of at least $time.
   *
   * Peers without any `peerRemote` record are excluded: the subquery yields NULL for them
   * and the comparison is not satisfied.
   *
   * @param int $time minimal uptime of the latest record (presumably seconds - confirm against the YGGstate schema)
   *
   * @return array rows of the `peer` table as objects
   *
   * @throws PDOException when the query fails
   */
  public function getPeersByMinLastUptime(int $time): array {

    $this->_debug->query->select->total++;

    // The filter lives in WHERE: there is no GROUP BY or aggregate here, and
    // HAVING without them is a MySQL-specific extension
    $query = $this->_db->prepare('SELECT * FROM `peer`

                                           WHERE (

                                             SELECT `peerRemote`.`uptime`

                                                     FROM `peerRemote`
                                                     WHERE `peerRemote`.`peerId` = `peer`.`peerId`
                                                     ORDER BY `timeAdded` DESC

                                                     LIMIT 1

                                           ) >= ?');

    // Bind explicitly as an integer so the comparison is numeric
    $query->bindValue(1, $time, PDO::PARAM_INT);
    $query->execute();

    return $query->fetchAll();
  }
}