add YGGstate DB crawl integration

This commit is contained in:
ghost 2023-08-07 00:13:04 +03:00
parent 3d9db381e8
commit 034a683df7
4 changed files with 164 additions and 1 deletions

View File

@ -2,7 +2,7 @@
_Проект присвячується захисникам міста Бахмут_
Written out of a desire to explore the [Yggdrasil](https://yggdrasil-network.github.io) ecosystem, because the last [YaCy](https://yacy.net/) node there was discontinued.
Written out of a desire to explore the [Yggdrasil](https://github.com/yggdrasil-network) ecosystem, because the last [YaCy](https://yacy.net/) node there was discontinued.
This engine also could be useful for crawling regular websites, small business resources, local networks.
The project goal - simple interface, clear architecture and lightweight server requirement.
@ -207,6 +207,11 @@ GET m=SphinxQL
+ [ ] Atom
* [ ] Palette image index / filter
* [ ] Crawl queue balancer that depends on available CPU
* [x] Networks integration
+ [x] [yggdrasil](https://github.com/yggdrasil-network)
+ [x] [YGGstate](https://github.com/YGGverse/YGGstate) (unlimited nodes)
+ [x] DB
+ [ ] API
##### Cleaner
@ -272,6 +277,7 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### See also
* [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave)
* [YGGstate - Yggdrasil Network Analytics](https://github.com/YGGverse/YGGstate)
#### Feedback

View File

@ -320,6 +320,39 @@ define('CRAWL_MANIFEST', true);
*/
define('CRAWL_MANIFEST_API_VERSION', 0.13);
// Integrations
/*
* Crawl YGGstate for peers to discover new hosts
*
* Yggdrasil networks only
*
* Read more:
* https://github.com/YGGverse/YGGstate
*
*/
// JSON-encoded list of YGGstate sources, grouped by source type.
// A string-keyed PHP array is already serialized as a JSON object,
// so no explicit object cast is required here.
define(
  'CRAWL_YGGSTATE',
  json_encode(
    [
      // Direct database connections, one entry per YGGstate node
      'db' =>
      [
        [
          // Conditions

          // skip short-term connections, seconds
          'peer_min_last_uptime' => 24 * 60 * 60,

          // these calls run in the crontab/crawler queue, prevent remote server abuse
          'timeout'              => 24 * 60 * 60,

          // Connection
          'port'                 => 3306,
          'host'                 => '',
          'database'             => '',
          'username'             => '',
          'password'             => '',
        ],
        // ...
      ],
    ]
  )
);
/*
* Remove host ban after following time
*

View File

@ -31,6 +31,7 @@ require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/yggstate.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// Check disk quota
@ -84,6 +85,49 @@ try {
exit;
}
// Check YGGstate connections to discover new hosts
// Walks every configured YGGstate source and registers the peer addresses
// it reports as new hosts. Each source is polled at most once per its own
// "timeout" period; the "already polled" marker is kept in memcached.
if (CRAWL_YGGSTATE) {

  foreach (json_decode(CRAWL_YGGSTATE) as $server => $nodes) {

    foreach ($nodes as $i => $node) {

      switch ($server) {

        case 'db':

          // One throttle key per source type and node index
          $timeoutKey = sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i);

          try {

            if (!$memcached->get($timeoutKey)) {

              $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);

              foreach ($yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) {

                // Register new host, peer address is used as the bracketed host part of the URL
                if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) {

                  $hostsAdded     += count($linkToDBresult->new->hostId);
                  $hostPagesAdded += count($linkToDBresult->new->hostPageId);
                }
              }

              // Mark this node as polled only after a complete pass,
              // so a failed attempt is retried on the next run
              $memcached->set($timeoutKey, true, time() + $node->timeout);
            }

          } catch(Exception $e) {

            // Report class and message only: dumping the whole exception would
            // also print its stack trace, which may include the connection
            // credentials passed to the YGGstate constructor
            var_dump(sprintf('%s: %s', get_class($e), $e->getMessage()));

            // Proceed with the next node (the switch counts as one nesting level)
            continue 2;
          }

        break;
      }
    }
  }
}
// Process hosts crawl queue
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {

80
library/yggstate.php Normal file
View File

@ -0,0 +1,80 @@
<?php
/**
 * Minimal read client for a YGGstate MySQL database.
 *
 * Wraps a PDO connection, exposes transaction helpers and keeps simple
 * per-instance query counters available through getDebug().
 */
class YGGstate {

  // Open database connection
  private PDO $_db;

  // Query counters: query->{select|insert|update|delete}->total
  private object $_debug;

  /**
   * Connect to the YGGstate database.
   *
   * @param string $host     MySQL server host
   * @param int    $port     MySQL server port
   * @param string $database Database name
   * @param string $username Account name
   * @param string $password Account password
   *
   * @throws PDOException when the connection cannot be established
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password) {

    // All attributes are passed as constructor options: pdo_mysql uses
    // ATTR_TIMEOUT as the connect timeout, so applying it with setAttribute()
    // on an already opened handle has no effect
    $this->_db = new PDO(
      'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8',
      $username,
      $password,
      [
        PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8',
        PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
        PDO::ATTR_TIMEOUT            => 600,
      ]
    );

    $this->_debug = (object)
    [
      'query' => (object)
      [
        'select' => (object)
        [
          'total' => 0
        ],
        'insert' => (object)
        [
          'total' => 0
        ],
        'update' => (object)
        [
          'total' => 0
        ],
        'delete' => (object)
        [
          'total' => 0
        ],
      ]
    ];
  }

  // Tools

  /**
   * Start a transaction on the underlying connection.
   */
  public function beginTransaction(): void {

    $this->_db->beginTransaction();
  }

  /**
   * Commit the current transaction.
   */
  public function commit(): void {

    $this->_db->commit();
  }

  /**
   * Roll back the current transaction.
   */
  public function rollBack(): void {

    $this->_db->rollBack();
  }

  /**
   * Get the query counters collected by this instance.
   *
   * @return object Counter tree, see the $_debug property
   */
  public function getDebug(): object {

    return $this->_debug;
  }

  // Peer

  /**
   * Get peers whose latest `peerRemote` record (newest `timeAdded`) reports
   * an uptime greater than or equal to the given value.
   *
   * Peers that have no `peerRemote` record are not returned, because the
   * subquery yields NULL for them and the comparison is never true.
   *
   * @param int $time Minimal uptime; presumably seconds, as the crawler config suggests
   *
   * @return array List of `peer` rows fetched as objects
   *
   * @throws PDOException on query failure
   */
  public function getPeersByMinLastUptime(int $time): array {

    $this->_debug->query->select->total++;

    // Row filter belongs to WHERE: HAVING without GROUP BY is a MySQL-only
    // extension and is meant for filtering aggregated results
    $query = $this->_db->prepare('SELECT * FROM `peer`

                                  WHERE (

                                    SELECT `peerRemote`.`uptime`

                                    FROM `peerRemote`
                                    WHERE `peerRemote`.`peerId` = `peer`.`peerId`
                                    ORDER BY `timeAdded` DESC
                                    LIMIT 1

                                  ) >= ?');

    $query->execute([$time]);

    return $query->fetchAll();
  }
}