implement custom hostPageDom elements index

This commit is contained in:
ghost 2023-06-25 22:10:47 +03:00
parent 5df598a1d4
commit 5346b13602
7 changed files with 2458 additions and 2 deletions

View File

@ -306,6 +306,22 @@ define('CRAWL_ROBOTS_DEFAULT_RULES', null); // string|null
*/ */
define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
/*
* Generates the hostPageDom index from the hostPage.data field
*
* Useful for building semantic index queries (config/sphinx.conf.txt)
*
* At the moment this feature is available in the CLI only (cli/yggo.php)
*
*/
define('CRAWL_HOST_PAGE_DOM_SELECTORS', 'h1,h2,h3,h4,h5,h6');
/*
* Strip HTML in the CRAWL_HOST_PAGE_DOM_SELECTORS content
*
*/
define('CRAWL_HOST_PAGE_DOM_STRIP_TAGS', true);
/* /*
* Look for third-party manifests to collect distributed index * Look for third-party manifests to collect distributed index
* *
@ -386,6 +402,12 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/ */
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10); define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
/*
* Remove page DOM history older than the following offset, in seconds
*
*/
define('CLEAN_PAGE_DOM_OFFSET', 60*60*24*30*12*10);
/* /*
* Database tables optimization * Database tables optimization
* *

View File

@ -32,6 +32,7 @@ $manifestsTotal = $db->getTotalManifests();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostPagesDeleted = 0; $hostPagesDeleted = 0;
$hostPagesDescriptionsDeleted = 0; $hostPagesDescriptionsDeleted = 0;
$hostPagesDomsDeleted = 0;
$hostPagesSnapDeleted = 0; $hostPagesSnapDeleted = 0;
$hostPagesToHostPageDeleted = 0; $hostPagesToHostPageDeleted = 0;
$manifestsDeleted = 0; $manifestsDeleted = 0;
@ -81,6 +82,9 @@ try {
// Delete host page descriptions // Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page DOMs
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
// Delete host page refs data // Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
@ -124,6 +128,9 @@ try {
// Delete host page descriptions // Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPage->hostPageId);
// Delete host page DOMs
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPage->hostPageId);
// Delete host page refs data // Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPage->hostPageId);
@ -223,6 +230,9 @@ try {
// Delete host page descriptions // Delete host page descriptions
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptions($hostPageBanned->hostPageId);
// Delete host page DOMs (this loop iterates banned pages, so use $hostPageBanned like the neighbouring statements)
$hostPagesDomsDeleted += $db->deleteHostPageDoms($hostPageBanned->hostPageId);
// Delete host page refs data // Delete host page refs data
$hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId); $hostPagesToHostPageDeleted += $db->deleteHostPageToHostPage($hostPageBanned->hostPageId);
@ -254,6 +264,9 @@ try {
// Delete page description history // Delete page description history
$hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET); $hostPagesDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
// Delete page dom history
$hostPagesDomsDeleted += $db->deleteHostPageDomsByTimeAdded(time() - CLEAN_PAGE_DOM_OFFSET);
// Delete deprecated logs // Delete deprecated logs
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET); $logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET); $logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
@ -292,6 +305,7 @@ if (CLEAN_LOG_ENABLED) {
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPagesDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesDomsDeleted,
$hostPagesSnapDeleted, $hostPagesSnapDeleted,
$hostPagesToHostPageDeleted, $hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
@ -316,6 +330,7 @@ echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL; echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL; echo 'Host page descriptions deleted: ' . $hostPagesDescriptionsDeleted . PHP_EOL;
echo 'Host page doms deleted: ' . $hostPagesDomsDeleted . PHP_EOL;
echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL; echo 'Host page snaps deleted: ' . $hostPagesSnapDeleted . PHP_EOL;
echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL; echo 'Host page to host page deleted: ' . $hostPagesToHostPageDeleted . PHP_EOL;

View File

@ -17,6 +17,7 @@ require_once('../library/robots.php');
require_once('../library/filter.php'); require_once('../library/filter.php');
require_once('../library/parser.php'); require_once('../library/parser.php');
require_once('../library/mysql.php'); require_once('../library/mysql.php');
require_once('../library/vendor/simple_html_dom.php');
// Check disk quota // Check disk quota
if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) { if (CRAWL_STOP_DISK_QUOTA_MB_LEFT > disk_free_space('/') / 1000000) {
@ -491,7 +492,33 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$metaKeywords = null; $metaKeywords = null;
$metaYggoManifest = null; $metaYggoManifest = null;
// Parse content // Collect page DOM elements data
if (CRAWL_HOST_PAGE_DOM_SELECTORS) {

  // Begin selectors extraction
  // str_get_html() returns false for empty or oversized content, so guard before calling find()
  $html = str_get_html($content);

  if ($html) {

    foreach (explode(',', CRAWL_HOST_PAGE_DOM_SELECTORS) as $selector) {

      // Tolerate whitespace and empty items in the selectors list, e.g. "h1, h2,"
      $selector = trim($selector);

      if ($selector === '') {

        continue;
      }

      foreach ($html->find($selector) as $element) {

        if (empty($element->innertext)) {

          continue;
        }

        $value = $element->innertext;

        if (CRAWL_HOST_PAGE_DOM_STRIP_TAGS) {

          // Put a space before line breaks and closing tags so adjacent words do not merge once tags are stripped
          $value = str_replace(['<br />', '<br/>', '<br>', '</'],
                               [' ', ' ', ' ', ' </'],
                               $value);

          // Collapse whitespace runs, then drop the markup
          $value = strip_tags(preg_replace('/[\s]+/', ' ', $value));
        }

        $db->addHostPageDom($queueHostPage->hostPageId,
                            time(),
                            $selector,
                            trim($value));
      }
    }

    // Release the parser tree: simple_html_dom keeps circular references that accumulate across the crawl loop
    // NOTE(review): assumes $html is not reused further down this iteration - confirm
    $html->clear();
  }

  unset($html);
}
// Parse page content
$dom = new DomDocument(); $dom = new DomDocument();
if ($encoding = mb_detect_encoding($content)) { if ($encoding = mb_detect_encoding($content)) {

Binary file not shown.

View File

@ -198,6 +198,13 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
/*
 * Select crawled (timeUpdated is set) and not banned host pages
 *
 * Returns the fetchAll() result of the query below
 */
public function getHostPagesByIndexed() {

  // @TODO the hard-coded "LIMIT 100,1" skips the first 100 rows and returns at most one
  return $this->_db->query('SELECT * FROM `hostPage` WHERE `timeUpdated` IS NOT NULL AND `timeBanned` IS NULL LIMIT 100,1')->fetchAll();
}
public function getHostPagesByLimit(int $hostId, int $limit) { public function getHostPagesByLimit(int $hostId, int $limit) {
$query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit); $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY `hostPageId` DESC LIMIT ' . (int) $limit);
@ -486,6 +493,34 @@ class MySQL {
return $query->fetch()->size; return $query->fetch()->size;
} }
/*
 * Store a single DOM element value extracted from a host page
 *
 * Inserts one row into `hostPageDom` using bound parameters; nothing is returned
 */
public function addHostPageDom(int $hostPageId, int $timeAdded, string $selector, string $value) {

  $statement = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?');

  $statement->execute(
    [
      $hostPageId,
      $timeAdded,
      $selector,
      $value
    ]
  );
}
/*
 * Delete all `hostPageDom` rows that belong to the given host page
 *
 * Returns the number of deleted rows
 */
public function deleteHostPageDoms(int $hostPageId) {

  // The statement has a placeholder, so it must be prepared and executed with the bound id;
  // running it through query() left the "?" unbound and $hostPageId unused
  $query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ?');

  $query->execute([$hostPageId]);

  return $query->rowCount();
}
/*
 * Delete `hostPageDom` rows whose `timeAdded` is lower than the given timestamp
 *
 * Returns the number of deleted rows
 */
public function deleteHostPageDomsByTimeAdded(int $timeOffset) {

  // The value is cast to int before being appended to the statement
  $sql = 'DELETE FROM `hostPageDom` WHERE `timeAdded` < ' . (int) $timeOffset;

  $statement = $this->_db->prepare($sql);

  $statement->execute();

  return $statement->rowCount();
}
/*
 * Remove every row from the `hostPageDom` table
 *
 * Nothing is returned
 */
public function truncateHostPageDom() {

  $this->_db->query('TRUNCATE `hostPageDom`');
}
// Cleaner tools // Cleaner tools
public function getCleanerQueue(int $limit, int $timeFrom) { public function getCleanerQueue(int $limit, int $timeFrom) {
@ -532,6 +567,7 @@ class MySQL {
int $hostsUpdated, int $hostsUpdated,
int $hostPagesDeleted, int $hostPagesDeleted,
int $hostPagesDescriptionsDeleted, int $hostPagesDescriptionsDeleted,
int $hostPagesDomsDeleted,
int $hostPagesSnapDeleted, int $hostPagesSnapDeleted,
int $hostPagesToHostPageDeleted, int $hostPagesToHostPageDeleted,
int $hostPagesBansRemoved, int $hostPagesBansRemoved,
@ -550,6 +586,7 @@ class MySQL {
`hostsUpdated`, `hostsUpdated`,
`hostPagesDeleted`, `hostPagesDeleted`,
`hostPagesDescriptionsDeleted`, `hostPagesDescriptionsDeleted`,
`hostPagesDomsDeleted`,
`hostPagesSnapDeleted`, `hostPagesSnapDeleted`,
`hostPagesToHostPageDeleted`, `hostPagesToHostPageDeleted`,
`hostPagesBansRemoved`, `hostPagesBansRemoved`,
@ -561,7 +598,7 @@ class MySQL {
`httpRequestsSizeTotal`, `httpRequestsSizeTotal`,
`httpDownloadSizeTotal`, `httpDownloadSizeTotal`,
`httpRequestsTimeTotal`, `httpRequestsTimeTotal`,
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([ $query->execute([
$timeAdded, $timeAdded,
@ -569,6 +606,7 @@ class MySQL {
$hostsUpdated, $hostsUpdated,
$hostPagesDeleted, $hostPagesDeleted,
$hostPagesDescriptionsDeleted, $hostPagesDescriptionsDeleted,
$hostPagesDomsDeleted,
$hostPagesSnapDeleted, $hostPagesSnapDeleted,
$hostPagesToHostPageDeleted, $hostPagesToHostPageDeleted,
$hostPagesBansRemoved, $hostPagesBansRemoved,
@ -718,6 +756,7 @@ class MySQL {
$this->_db->query('OPTIMIZE TABLE `host`'); $this->_db->query('OPTIMIZE TABLE `host`');
$this->_db->query('OPTIMIZE TABLE `hostPage`'); $this->_db->query('OPTIMIZE TABLE `hostPage`');
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`'); $this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
$this->_db->query('OPTIMIZE TABLE `hostPageDom`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnap`'); $this->_db->query('OPTIMIZE TABLE `hostPageSnap`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`'); $this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');
$this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`'); $this->_db->query('OPTIMIZE TABLE `hostPageToHostPage`');

2353
library/vendor/simple_html_dom.php vendored Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 181 KiB

After

Width:  |  Height:  |  Size: 211 KiB