mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 21:44:59 +00:00
implement custom hostPageDom elements index
This commit is contained in:
parent
5df598a1d4
commit
5346b13602
@ -306,6 +306,22 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generates hostPageDom index based on hostPage.data field
|
||||||
|
*
|
||||||
|
* Can be useful for building semantic index queries (see config/sphinx.conf.txt)
|
||||||
|
*
|
||||||
|
* At the moment this feature is available in the CLI only (cli/yggo.php)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6');
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Strip HTML tags from the content matched by CRAWL_HOST_PAGE_DOM_SELECTORS
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Look for third-party manifests to collect distributed index
|
* Look for third-party manifests to collect distributed index
|
||||||
*
|
*
|
||||||
@ -386,6 +402,12 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
|||||||
*/
|
*/
|
||||||
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove page DOM history older than the following offset (in seconds)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CLEAN_PAGE_DOM_OFFSET', 60*60*24*30*12*10);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Database tables optimization
|
* Database tables optimization
|
||||||
*
|
*
|
||||||
|
@ -32,6 +32,7 @@ $manifestsTotal = $db->getTotalManifests();
|
|||||||
$hostsUpdated = 0;
|
$hostsUpdated = 0;
|
||||||
$hostPagesDeleted = 0;
|
$hostPagesDeleted = 0;
|
||||||
$hostPagesDescriptionsDeleted = 0;
|
$hostPagesDescriptionsDeleted = 0;
|
||||||
|
$hostPagesDomsDeleted = 0;
|
||||||
$hostPagesSnapDeleted = 0;
|
$hostPagesSnapDeleted = 0;
|
||||||
$hostPagesToHostPageDeleted = 0;
|
$hostPagesToHostPageDeleted = 0;
|
||||||
$manifestsDeleted = 0;
|
$manifestsDeleted = 0;
|
||||||
@ -81,6 +82,9 @@ try {
|
|||||||
// Delete host page descriptions
|
// Delete host page descriptions
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||||
|
|
||||||
|
// Delete host page DOMs
|
||||||
|
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page refs data
|
// Delete host page refs data
|
||||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||||
|
|
||||||
@ -124,6 +128,9 @@ try {
|
|||||||
// Delete host page descriptions
|
// Delete host page descriptions
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||||
|
|
||||||
|
// Delete host page DOMs
|
||||||
|
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
|
||||||
|
|
||||||
// Delete host page refs data
|
// Delete host page refs data
|
||||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||||
|
|
||||||
@ -223,6 +230,9 @@ try {
|
|||||||
// Delete host page descriptions
|
// Delete host page descriptions
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId);
|
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId);
|
||||||
|
|
||||||
|
// Delete host page DOMs
// Must target the same banned page as the neighbouring description/refs
// deletions ($hostPageBanned), not $hostPage.
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPageBanned->hostPageId);
|
||||||
|
|
||||||
// Delete host page refs data
|
// Delete host page refs data
|
||||||
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId);
|
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId);
|
||||||
|
|
||||||
@ -254,6 +264,9 @@ try {
|
|||||||
// Delete page description history
|
// Delete page description history
|
||||||
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
||||||
|
|
||||||
|
// Delete page dom history
|
||||||
|
$hostPagesDomsDeleted += $db->deleteHostPageDomsByTimeAdded(time() - CLEAN_PAGE_DOM_OFFSET);
|
||||||
|
|
||||||
// Delete deprecated logs
|
// Delete deprecated logs
|
||||||
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
||||||
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
||||||
@ -292,6 +305,7 @@ if (CLEAN_LOG_ENABLED) {
|
|||||||
$hostsUpdated,
|
$hostsUpdated,
|
||||||
$hostPagesDeleted,
|
$hostPagesDeleted,
|
||||||
$hostPagesDescriptionsDeleted,
|
$hostPagesDescriptionsDeleted,
|
||||||
|
$hostPagesDomsDeleted,
|
||||||
$hostPagesSnapDeleted,
|
$hostPagesSnapDeleted,
|
||||||
$hostPagesToHostPageDeleted,
|
$hostPagesToHostPageDeleted,
|
||||||
$hostPagesBansRemoved,
|
$hostPagesBansRemoved,
|
||||||
@ -316,6 +330,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
|||||||
|
|
||||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||||
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
|
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
|
||||||
|
echo 'Host page doms deleted: ' . $hostPagesDomsDeleted . PHP_EOL;
|
||||||
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
|
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
|
||||||
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
|
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ require_once('../library/robots.php');
|
|||||||
require_once('../library/filter.php');
|
require_once('../library/filter.php');
|
||||||
require_once('../library/parser.php');
|
require_once('../library/parser.php');
|
||||||
require_once('../library/mysql.php');
|
require_once('../library/mysql.php');
|
||||||
|
require_once('../library/vendor/simple_html_dom.php');
|
||||||
|
|
||||||
// Check disk quota
|
// Check disk quota
|
||||||
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
|
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
|
||||||
@ -491,7 +492,33 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
|
|||||||
$metaKeywords = null;
|
$metaKeywords = null;
|
||||||
$metaYggoManifest = null;
|
$metaYggoManifest = null;
|
||||||
|
|
||||||
// Parse content
|
// Collect page DOM elements data
|
||||||
|
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  $html = str_get_html($content);

  // str_get_html() can return false instead of a DOM object (e.g. for empty
  // or oversized markup); skip extraction rather than calling find() on false.
  // NOTE(review): simple_html_dom recommends $html->clear() to release memory
  // in long-running loops - confirm $html is not reused below before adding it.
  if ($html) {

    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      // Tolerate whitespace and stray commas in the configured selector list
      $selector = trim($selector);

      if ($selector === '') {

        continue;
      }

      foreach ($html->find($selector) as $element) {

        if (!empty($element->innertext)) {

          $value = $element->innertext;

          if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

            // Put a space in place of line breaks and before closing tags so
            // adjacent words do not get glued together once tags are removed,
            // then collapse whitespace runs and strip the remaining markup
            $value = strip_tags(
              preg_replace('/[\s]+/',
                           ' ',
                           str_replace(['<br />', '<br/>', '<br>', '</'],
                                       [' ', ' ', ' ', ' </'],
                                       $value)));
          }

          $db->addHostPageDom($queueHostPage->hostPageId,
                              time(),
                              $selector,
                              trim($value));
        }
      }
    }
  }
}
|
||||||
|
|
||||||
|
// Parse page content
|
||||||
$dom = new DomDocument();
|
$dom = new DomDocument();
|
||||||
|
|
||||||
if ($encoding = mb_detect_encoding($content)) {
|
if ($encoding = mb_detect_encoding($content)) {
|
||||||
|
Binary file not shown.
@ -198,6 +198,13 @@ class MySQL {
|
|||||||
return $query->fetchAll();
|
return $query->fetchAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Select hostPage rows that have timeUpdated set and timeBanned unset.
 *
 * NOTE(review): the LIMIT 100,1 clause returns at most one row starting at
 * offset 100; it looks like a placeholder (see @TODO) - confirm intent.
 *
 * Returns the result of fetchAll() on the executed statement.
 */
public function getHostPagesByIndexed() {

  // @TODO
  $sql = 'SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL LIMIT 100,1';

  return $this->_db->query($sql)->fetchAll();
}
|
||||||
|
|
||||||
public function getHostPagesByLimit(int $hostId, int $limit) {
|
public function getHostPagesByLimit(int $hostId, int $limit) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
|
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
|
||||||
@ -486,6 +493,34 @@ class MySQL {
|
|||||||
return $query->fetch()->size;
|
return $query->fetch()->size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Insert one row into hostPageDom.
 *
 * $hostPageId - value for the hostPageId column
 * $timeAdded  - value for the timeAdded column
 * $selector   - value for the selector column
 * $value      - value for the value column
 *
 * Returns nothing.
 */
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {

  $statement = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?');

  // Positional parameters, in the same order as the placeholders above
  $params = [
    $hostPageId,
    $timeAdded,
    $selector,
    $value
  ];

  $statement->execute($params);
}
|
||||||
|
|
||||||
|
/*
 * Delete all hostPageDom rows that belong to the given host page.
 *
 * $hostPageId - hostPageId value of the rows to remove
 *
 * Returns the number of affected rows as reported by rowCount().
 */
public function deleteHostPageDoms(int $hostPageId) {

  // The statement contains a placeholder, so it has to be prepared and
  // executed with the bound id; query() would send the literal "?" to the
  // server and never use $hostPageId
  $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');

  $query->execute([$hostPageId]);

  return $query->rowCount();
}
|
||||||
|
|
||||||
|
/*
 * Delete hostPageDom rows whose timeAdded is lower than the given value.
 *
 * $timeOffset - rows with timeAdded below this value are removed
 *
 * Returns the number of affected rows as reported by rowCount().
 */
public function deleteHostPageDomsByTimeAdded(int $timeOffset) {

  // The threshold is cast to int before being appended to the SQL text
  $sql = 'DELETE FROM `hostPageDom` WHERE `timeAdded` < ' . (int) $timeOffset;

  $statement = $this->_db->prepare($sql);
  $statement->execute();

  return $statement->rowCount();
}
|
||||||
|
|
||||||
|
/*
 * Run TRUNCATE on the hostPageDom table.
 *
 * Returns nothing.
 */
public function truncateHostPageDom() {

  $this->_db->query('TRUNCATE `hostPageDom`');
}
|
||||||
|
|
||||||
// Cleaner tools
|
// Cleaner tools
|
||||||
public function getCleanerQueue(int $limit, int $timeFrom) {
|
public function getCleanerQueue(int $limit, int $timeFrom) {
|
||||||
|
|
||||||
@ -532,6 +567,7 @@ class MySQL {
|
|||||||
int $hostsUpdated,
|
int $hostsUpdated,
|
||||||
int $hostPagesDeleted,
|
int $hostPagesDeleted,
|
||||||
int $hostPagesDescriptionsDeleted,
|
int $hostPagesDescriptionsDeleted,
|
||||||
|
int $hostPagesDomsDeleted,
|
||||||
int $hostPagesSnapDeleted,
|
int $hostPagesSnapDeleted,
|
||||||
int $hostPagesToHostPageDeleted,
|
int $hostPagesToHostPageDeleted,
|
||||||
int $hostPagesBansRemoved,
|
int $hostPagesBansRemoved,
|
||||||
@ -550,6 +586,7 @@ class MySQL {
|
|||||||
`hostsUpdated`,
|
`hostsUpdated`,
|
||||||
`hostPagesDeleted`,
|
`hostPagesDeleted`,
|
||||||
`hostPagesDescriptionsDeleted`,
|
`hostPagesDescriptionsDeleted`,
|
||||||
|
`hostPagesDomsDeleted`,
|
||||||
`hostPagesSnapDeleted`,
|
`hostPagesSnapDeleted`,
|
||||||
`hostPagesToHostPageDeleted`,
|
`hostPagesToHostPageDeleted`,
|
||||||
`hostPagesBansRemoved`,
|
`hostPagesBansRemoved`,
|
||||||
@ -561,7 +598,7 @@ class MySQL {
|
|||||||
`httpRequestsSizeTotal`,
|
`httpRequestsSizeTotal`,
|
||||||
`httpDownloadSizeTotal`,
|
`httpDownloadSizeTotal`,
|
||||||
`httpRequestsTimeTotal`,
|
`httpRequestsTimeTotal`,
|
||||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([
|
$query->execute([
|
||||||
$timeAdded,
|
$timeAdded,
|
||||||
@ -569,6 +606,7 @@ class MySQL {
|
|||||||
$hostsUpdated,
|
$hostsUpdated,
|
||||||
$hostPagesDeleted,
|
$hostPagesDeleted,
|
||||||
$hostPagesDescriptionsDeleted,
|
$hostPagesDescriptionsDeleted,
|
||||||
|
$hostPagesDomsDeleted,
|
||||||
$hostPagesSnapDeleted,
|
$hostPagesSnapDeleted,
|
||||||
$hostPagesToHostPageDeleted,
|
$hostPagesToHostPageDeleted,
|
||||||
$hostPagesBansRemoved,
|
$hostPagesBansRemoved,
|
||||||
@ -718,6 +756,7 @@ class MySQL {
|
|||||||
$this->_db->query('OPTIMIZE TABLE `host`');
|
$this->_db->query('OPTIMIZE TABLE `host`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPage`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
|
||||||
|
$this->_db->query('OPTIMIZE TABLE `hostPageDom`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageSnap`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageSnap`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
|
||||||
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');
|
||||||
|
2353
library/vendor/simple_html_dom.php
vendored
Normal file
2353
library/vendor/simple_html_dom.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Before Width: | Height: | Size: 181 KiB After Width: | Height: | Size: 211 KiB |
Loading…
x
Reference in New Issue
Block a user