Browse Source

add YGGstate DB crawl integration

main
ghost 1 year ago
parent
commit
034a683df7
  1. 8
      README.md
  2. 33
      config/app.php.example
  3. 44
      crontab/crawler.php
  4. 80
      library/yggstate.php

8
README.md

@ -2,7 +2,7 @@
_Проект присвячується захисникам міста Бахмут_ _Проект присвячується захисникам міста Бахмут_
Written by inspiration to explore [Yggdrasil](https://yggdrasil-network.github.io) ecosystem, because of last [YaCy](https://yacy.net/) node there was discontinued. Written by inspiration to explore [Yggdrasil](https://github.com/yggdrasil-network) ecosystem, because of last [YaCy](https://yacy.net/) node there was discontinued.
This engine also could be useful for crawling regular websites, small business resources, local networks. This engine also could be useful for crawling regular websites, small business resources, local networks.
The project goal - simple interface, clear architecture and lightweight server requirement. The project goal - simple interface, clear architecture and lightweight server requirement.
@ -207,6 +207,11 @@ GET m=SphinxQL
+ [ ] Atom + [ ] Atom
* [ ] Palette image index / filter * [ ] Palette image index / filter
* [ ] Crawl queue balancer, that depends of CPU available * [ ] Crawl queue balancer, that depends of CPU available
* [x] Networks integration
+ [x] [yggdrasil](https://github.com/yggdrasil-network)
+ [x] [YGGstate](https://github.com/YGGverse/YGGstate) (unlimited nodes)
+ [x] DB
+ [ ] API
##### Cleaner ##### Cleaner
@ -272,6 +277,7 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
#### See also #### See also
* [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave) * [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave)
* [YGGstate - Yggdrasil Network Analytics](https://github.com/YGGverse/YGGstate)
#### Feedback #### Feedback

33
config/app.php.example

@ -320,6 +320,39 @@ define('CRAWL_MANIFEST', true);
*/ */
define('CRAWL_MANIFEST_API_VERSION', 0.13); define('CRAWL_MANIFEST_API_VERSION', 0.13);
// Integrations
/*
* Crawl YGGstate for peers to discover new hosts
*
* Yggdrasil networks only
*
* The value is a JSON-encoded object; the `db` key holds a list of YGGstate
* MySQL databases. crontab/crawler.php polls every listed node and registers
* `http://[address]/` for each peer it returns.
*
* Read more:
* https://github.com/YGGverse/YGGstate
*
*/
define('CRAWL_YGGSTATE', json_encode((object)
[
'db' =>
[
[
// Conditions
'peer_min_last_uptime' => 60*60*24, // skip short-term connections: minimum uptime of the peer's latest record, seconds
'timeout' => 60*60*24, // seconds to wait before polling this node again; these calls run in the crontab/crawler queue, prevents remote server abuse
// Connection
'port' => 3306,
'host' => '',
'database' => '',
'username' => '',
'password' => '',
],
// ... add more nodes here, one array per YGGstate database
],
])
);
/* /*
* Remove host ban after following time * Remove host ban after following time
* *

44
crontab/crawler.php

@ -31,6 +31,7 @@ require_once(__DIR__ . '/../library/url.php');
require_once(__DIR__ . '/../library/filter.php'); require_once(__DIR__ . '/../library/filter.php');
require_once(__DIR__ . '/../library/mysql.php'); require_once(__DIR__ . '/../library/mysql.php');
require_once(__DIR__ . '/../library/helper.php'); require_once(__DIR__ . '/../library/helper.php');
require_once(__DIR__ . '/../library/yggstate.php');
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php'); require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
// Check disk quota // Check disk quota
@ -84,6 +85,49 @@ try {
exit; exit;
} }
// Check YGGstate connections to discover new hosts
//
// CRAWL_YGGSTATE is a JSON string: { "<server type>": [ <node>, ... ] }.
// Every node is polled at most once per its own `timeout` (tracked in memcached),
// and each returned peer address is registered as a new http://[address]/ link.
if (CRAWL_YGGSTATE) {

  foreach (json_decode(CRAWL_YGGSTATE) as $server => $nodes) {

    foreach ($nodes as $i => $node) {

      switch ($server) {

        case 'db':

          // Memcached flag that marks this node as recently polled
          $yggStateTimeoutKey = sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i);

          try {

            if (!$memcached->get($yggStateTimeoutKey)) {

              $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);

              foreach ($yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime) as $yggStatePeer) {

                // Register new host
                if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) {

                  $hostsAdded     += count($linkToDBresult->new->hostId);
                  $hostPagesAdded += count($linkToDBresult->new->hostPageId);
                }
              }

              // Expiration is an absolute unix timestamp; set only after a successful poll
              $memcached->set($yggStateTimeoutKey, true, time() + $node->timeout);
            }

          } catch (Exception $e) {

            // Report class and message only: dumping the whole exception would print
            // its stack trace, whose call arguments include the node's DB password.
            var_dump(sprintf('YGGstate %s #%s: %s: %s', $server, $i, get_class($e), $e->getMessage()));

            // `switch` counts as a loop level, so this moves on to the next node
            continue 2;
          }

        break;
      }
    }
  }
}
// Process hosts crawl queue // Process hosts crawl queue
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) { foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {

80
library/yggstate.php

@ -0,0 +1,80 @@
<?php
/**
 * Thin PDO wrapper around a YGGstate MySQL database.
 *
 * Exposes transaction helpers, simple query counters and the peer lookup
 * used by the crawler to discover new hosts.
 */
class YGGstate {

  private PDO $_db;

  // Query counters, grouped by statement type: query->{select,insert,update,delete}->total
  private object $_debug;

  /**
   * Open the connection.
   *
   * @param string $host     MySQL host
   * @param int    $port     MySQL port
   * @param string $database database name
   * @param string $username user name
   * @param string $password password
   * @param int    $timeout  PDO::ATTR_TIMEOUT value, seconds
   *
   * @throws PDOException when the connection cannot be established
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password, int $timeout = 600) {

    // Attributes are passed as constructor options: the timeout is only read by
    // the MySQL driver while connecting, so setting it afterwards has no effect.
    $this->_db = new PDO(
      'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8',
      $username,
      $password,
      [
        PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
        PDO::ATTR_TIMEOUT            => $timeout,
        PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8',
      ]
    );

    $this->_debug = (object)
    [
      'query' => (object)
      [
        'select' => (object)
        [
          'total' => 0
        ],
        'insert' => (object)
        [
          'total' => 0
        ],
        'update' => (object)
        [
          'total' => 0
        ],
        'delete' => (object)
        [
          'total' => 0
        ],
      ]
    ];
  }

  // Tools

  /** Start a transaction on the underlying connection. */
  public function beginTransaction(): void {

    $this->_db->beginTransaction();
  }

  /** Commit the current transaction. */
  public function commit(): void {

    $this->_db->commit();
  }

  /** Roll back the current transaction. */
  public function rollBack(): void {

    $this->_db->rollBack();
  }

  /** Return the query counters collected by this instance. */
  public function getDebug(): object {

    return $this->_debug;
  }

  // Peer

  /**
   * Get peers whose most recent `peerRemote` record (latest `timeAdded`)
   * has `uptime` greater than or equal to $time.
   *
   * Peers without any `peerRemote` record are not returned (NULL never matches).
   *
   * @param int $time minimum uptime; presumably seconds, as in the config template - confirm against YGGstate schema
   *
   * @return object[] rows of the `peer` table
   *
   * @throws PDOException on query failure
   */
  public function getPeersByMinLastUptime(int $time): array {

    $this->_debug->query->select->total++;

    // Plain row filter, so WHERE is used instead of a HAVING without GROUP BY
    $query = $this->_db->prepare('SELECT * FROM `peer`
                                           WHERE (
                                             SELECT `peerRemote`.`uptime`
                                             FROM `peerRemote`
                                             WHERE `peerRemote`.`peerId` = `peer`.`peerId`
                                             ORDER BY `peerRemote`.`timeAdded` DESC
                                             LIMIT 1
                                           ) >= ?');

    $query->execute([$time]);

    return $query->fetchAll();
  }
}
Loading…
Cancel
Save