Browse Source

Refactor hostPageDom tables; add support for multiple selectors and child-node values

main
ghost 1 year ago
parent
commit
eccb7ea241
  1. BIN
      database/yggo.mwb
  2. BIN
      media/db-prototype.png
  3. 62
      src/crontab/crawler.php
  4. 35
      src/library/mysql.php

BIN
database/yggo.mwb

Binary file not shown.

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 138 KiB

After

Width:  |  Height:  |  Size: 168 KiB

62
src/crontab/crawler.php

@@ -997,55 +997,63 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
// Process selectors configuration // Process selectors configuration
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS))) { if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS))) {
$hostPageDomId = $db->addHostPageDom(
$queueHostPage->hostPageId,
time()
);
foreach ($hostPageDomSelectors as $selector => $settings) { foreach ($hostPageDomSelectors as $selector => $settings) {
// Extract target selector data $hostPageDomSelectorId = $db->addHostPageDomSelector(
foreach ($crawler->filter($selector) as $data) { $hostPageDomId,
$selector
);
foreach ($data->childNodes as $node) { // Extract selectors data
foreach ($crawler->filter($selector)->each(function($node) {
$value = trim($node->ownerDocument->saveHtml()); return $node->html();
// Apply selector settings }) as $value) {
foreach ($settings as $key => $setting) {
switch ($key) { foreach ($settings as $name => $setting) {
case 'strip_tags': // Apply value settings
switch ($name) {
if (!isset($setting->enabled)) { case 'strip_tags':
continue 2; if (!isset($setting->enabled)) {
}
if (false === $setting->enabled) { break;
}
continue 2; if (false === $setting->enabled) {
}
if (!isset($setting->allowed_tags)) { break;
}
continue 2; if (!isset($setting->allowed_tags)) {
}
$value = strip_tags($value, $setting->allowed_tags); break;
}
break; $value = strip_tags($value, $setting->allowed_tags);
}
break;
} }
// Skip empty selector values save $value = trim($value);
if (empty($value)) { if (empty($value)) {
continue; continue;
} }
// Save selector value // Save selector data
$db->addHostPageDom( $db->addHostPageDomSelectorData(
$queueHostPage->hostPageId, $hostPageDomSelectorId,
$selector, $value
$value,
time()
); );
} }
} }

35
src/library/mysql.php

@@ -620,40 +620,37 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function addHostPageDom(int $hostPageId, string $selector, string $value, int $timeAdded) { public function addHostPageDom(int $hostPageId, int $timeAdded) {
$this->_debug->query->insert->total++; $this->_debug->query->insert->total++;
$query = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?'); $query = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?');
$query->execute([$hostPageId, $timeAdded, $selector, $value]); $query->execute([$hostPageId, $timeAdded]);
return $this->_db->lastInsertId();
} }
public function deleteHostPageDomBySelector(int $hostPageId, string $selector) { public function addHostPageDomSelector(int $hostPageDomId, string $name) {
$this->_debug->query->delete->total++; $this->_debug->query->insert->total++;
$query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ? AND `selector` = ?'); $query = $this->_db->prepare('INSERT INTO `hostPageDomSelector` SET `hostPageDomId` = ?, `name` = ?');
$query->execute([$hostPageId, $selector]); $query->execute([$hostPageDomId, $name]);
return $query->rowCount(); return $this->_db->lastInsertId();
} }
public function findLastHostPageDomBySelector(int $hostPageId, string $selector) { public function addHostPageDomSelectorData(int $hostPageDomSelectorId, string $value) {
$this->_debug->query->select->total++; $this->_debug->query->insert->total++;
$query = $this->_db->prepare('SELECT * FROM `hostPageDom` WHERE `hostPageId` = ? AND `selector` = ? ORDER BY `timeAdded` DESC LIMIT 1'); $query = $this->_db->prepare('INSERT INTO `hostPageDomSelectorData` SET `hostPageDomSelectorId` = ?, `value` = ?');
$query->execute([$hostPageId, $selector]); $query->execute([$hostPageDomSelectorId, $value]);
return $query->fetch(); return $this->_db->lastInsertId();
}
public function truncateHostPageDom() {
$query = $this->_db->query('TRUNCATE `hostPageDom`');
} }
// Cleaner tools // Cleaner tools
@@ -806,6 +803,8 @@ class MySQL {
$this->_db->query('OPTIMIZE TABLE `hostPage`'); $this->_db->query('OPTIMIZE TABLE `hostPage`');
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`'); $this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
$this->_db->query('OPTIMIZE TABLE `hostPageDom`'); $this->_db->query('OPTIMIZE TABLE `hostPageDom`');
$this->_db->query('OPTIMIZE TABLE `hostPageDomSelector`');
$this->_db->query('OPTIMIZE TABLE `hostPageDomSelectorData`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnap`'); $this->_db->query('OPTIMIZE TABLE `hostPageSnap`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`'); $this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`'); $this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');

Loading…
Cancel
Save