mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
add YGGstate DB crawl integration
This commit is contained in:
parent
3d9db381e8
commit
034a683df7
@ -2,7 +2,7 @@
|
||||
|
||||
_Проект присвячується захисникам міста Бахмут_
|
||||
|
||||
Written by inspiration to explore [Yggdrasil](https://yggdrasil-network.github.io) ecosystem, because of last [YaCy](https://yacy.net/) node there was discontinued.
|
||||
Written out of a desire to explore the [Yggdrasil](https://github.com/yggdrasil-network) ecosystem, because the last [YaCy](https://yacy.net/) node there was discontinued.
|
||||
This engine also could be useful for crawling regular websites, small business resources, local networks.
|
||||
|
||||
The project goal - simple interface, clear architecture and lightweight server requirement.
|
||||
@ -207,6 +207,11 @@ GET m=SphinxQL
|
||||
+ [ ] Atom
|
||||
* [ ] Palette image index / filter
|
||||
* [ ] Crawl queue balancer that depends on available CPU
|
||||
* [x] Networks integration
|
||||
+ [x] [yggdrasil](https://github.com/yggdrasil-network)
|
||||
+ [x] [YGGstate](https://github.com/YGGverse/YGGstate) (unlimited nodes)
|
||||
+ [x] DB
|
||||
+ [ ] API
|
||||
|
||||
##### Cleaner
|
||||
|
||||
@ -272,6 +277,7 @@ See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway)
|
||||
#### See also
|
||||
|
||||
* [YGGwave ~ The Radio Catalog](https://github.com/YGGverse/YGGwave)
|
||||
* [YGGstate - Yggdrasil Network Analytics](https://github.com/YGGverse/YGGstate)
|
||||
|
||||
#### Feedback
|
||||
|
||||
|
@ -320,6 +320,39 @@ define('CRAWL_MANIFEST', true);
|
||||
*/
|
||||
define('CRAWL_MANIFEST_API_VERSION', 0.13);
|
||||
|
||||
|
||||
// Integrations
|
||||
|
||||
/*
 * Crawl YGGstate for peers to discover new hosts
 *
 * Yggdrasil networks only
 *
 * The value is stored as a JSON string; consumers json_decode() it and
 * iterate it as: server type => list of nodes.
 *
 * Read more:
 * https://github.com/YGGverse/YGGstate
 *
 */
define(
  'CRAWL_YGGSTATE',
  json_encode(
    [
      // Server type "db": nodes reached through a direct database connection
      'db' => [
        [
          // Conditions
          'peer_min_last_uptime' => 24 * 60 * 60, // skip short-term connections, seconds
          'timeout'              => 24 * 60 * 60, // these calls run in the crontab/crawler queue, prevent remote server abuse

          // Connection
          'port'     => 3306,
          'host'     => '',
          'database' => '',
          'username' => '',
          'password' => '',
        ],
        // ...
      ],
    ]
  )
);
|
||||
|
||||
/*
|
||||
* Remove host ban after following time
|
||||
*
|
||||
|
@ -31,6 +31,7 @@ require_once(__DIR__ . '/../library/url.php');
|
||||
require_once(__DIR__ . '/../library/filter.php');
|
||||
require_once(__DIR__ . '/../library/mysql.php');
|
||||
require_once(__DIR__ . '/../library/helper.php');
|
||||
require_once(__DIR__ . '/../library/yggstate.php');
|
||||
require_once(__DIR__ . '/../library/vendor/simple_html_dom.php');
|
||||
|
||||
// Check disk quota
|
||||
@ -84,6 +85,49 @@ try {
|
||||
exit;
|
||||
}
|
||||
|
||||
// Check YGGstate connections to discover new hosts
//
// CRAWL_YGGSTATE is a JSON string shaped as: server type => list of nodes.
// Each node is polled at most once per its own "timeout" (seconds); the
// marker for that is kept in memcached under a per-server, per-node key.
if (CRAWL_YGGSTATE) {

  foreach (json_decode(CRAWL_YGGSTATE) as $server => $nodes) {

    foreach ($nodes as $i => $node) {

      switch ($server) {

        case 'db':

          // Same key is used to read and to write the poll marker
          $yggStateTimeoutKey = sprintf('Crontab.crawler.YGGstate.%s.%s.timeout', $server, $i);

          try {

            // Node was not polled within its timeout window
            if (!$memcached->get($yggStateTimeoutKey)) {

              $yggStateDB = new YGGstate($node->host, $node->port, $node->database, $node->username, $node->password);

              $yggStatePeers = $yggStateDB->getPeersByMinLastUptime($node->peer_min_last_uptime);

              foreach ($yggStatePeers as $yggStatePeer) {

                // Register new host
                if ($linkToDBresult = Helper::addLinkToDB($db, $memcached, sprintf('http://[%s]/', $yggStatePeer->address))) {

                  $hostsAdded     += count($linkToDBresult->new->hostId);
                  $hostPagesAdded += count($linkToDBresult->new->hostPageId);
                }
              }

              // Mark the node as polled only after a complete, successful pass
              $memcached->set($yggStateTimeoutKey, true, time() + $node->timeout);
            }

          } catch (Exception $e) {

            // Report the message only: dumping the whole exception also prints
            // its stack trace, whose frames can carry the YGGstate constructor
            // arguments (database host, username and password).
            var_dump(
              sprintf('YGGstate %s #%s: %s', $server, $i, $e->getMessage())
            );

            // "switch" counts as a loop structure for "continue",
            // so level 2 moves on to the next node
            continue 2;
          }

        break;
      }
    }
  }
}
|
||||
|
||||
// Process hosts crawl queue
|
||||
foreach ($db->getHostCrawlQueue(CRAWL_HOST_LIMIT, time() - CRAWL_HOST_SECONDS_OFFSET) as $queueHost) {
|
||||
|
||||
|
80
library/yggstate.php
Normal file
80
library/yggstate.php
Normal file
@ -0,0 +1,80 @@
|
||||
<?php
|
||||
|
||||
/**
 * Thin PDO/MySQL client for a YGGstate database.
 *
 * Exposes transaction helpers, per-instance query counters and the peer
 * lookup used by the crawler to discover hosts.
 */
class YGGstate {

  private PDO $_db;

  // Query counters: ->query->{select,insert,update,delete}->total
  private object $_debug;

  /**
   * Open the connection and reset the query counters.
   *
   * @param string $host     MySQL server host
   * @param int    $port     MySQL server port
   * @param string $database Schema name
   * @param string $username Account name
   * @param string $password Account password
   *
   * @throws PDOException when the connection cannot be established
   */
  public function __construct(string $host, int $port, string $database, string $username, string $password) {

    $this->_db = new PDO(
      'mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . ';charset=utf8',
      $username,
      $password,
      [
        PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8',
        PDO::ATTR_ERRMODE            => PDO::ERRMODE_EXCEPTION,
        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,

        // pdo_mysql reads the timeout while connecting; calling
        // setAttribute() with it on an open handle has no effect,
        // so it has to be passed here.
        PDO::ATTR_TIMEOUT            => 600,
      ]
    );

    $this->_debug = (object)
    [
      'query' => (object)
      [
        'select' => (object) ['total' => 0],
        'insert' => (object) ['total' => 0],
        'update' => (object) ['total' => 0],
        'delete' => (object) ['total' => 0],
      ]
    ];
  }

  // Tools

  /**
   * Start a transaction on the underlying connection.
   */
  public function beginTransaction(): void {

    $this->_db->beginTransaction();
  }

  /**
   * Commit the current transaction.
   */
  public function commit(): void {

    $this->_db->commit();
  }

  /**
   * Roll back the current transaction.
   */
  public function rollBack(): void {

    $this->_db->rollBack();
  }

  /**
   * Get the query counters collected by this instance.
   *
   * @return object the live counters object (not a copy)
   */
  public function getDebug(): object {

    return $this->_debug;
  }

  // Peer

  /**
   * Get peers whose latest `peerRemote` record (newest `timeAdded`)
   * reports an uptime of at least $time.
   *
   * Peers without any `peerRemote` record are not returned: the subquery
   * yields NULL for them and the comparison is not satisfied.
   *
   * @param int $time Minimal uptime, compared against `peerRemote`.`uptime`
   *
   * @return array rows of `peer`, each fetched as an object
   *
   * @throws PDOException on query failure
   */
  public function getPeersByMinLastUptime(int $time): array {

    $this->_debug->query->select->total++;

    // Row-level filter belongs to WHERE; HAVING without GROUP BY is a
    // MySQL extension that yields the same rows here.
    $query = $this->_db->prepare('SELECT * FROM `peer`

                                  WHERE (
                                    SELECT `peerRemote`.`uptime`
                                    FROM `peerRemote`
                                    WHERE `peerRemote`.`peerId` = `peer`.`peerId`
                                    ORDER BY `timeAdded` DESC
                                    LIMIT 1
                                  ) >= ?');

    $query->execute([$time]);

    return $query->fetchAll();
  }
}
|
Loading…
x
Reference in New Issue
Block a user