Browse Source

refactor hostPageDom tables, add multiple selectors and children values support

main
ghost 9 months ago
parent
commit
eccb7ea241
  1. BIN
      database/yggo.mwb
  2. BIN
      media/db-prototype.png
  3. 62
      src/crontab/crawler.php
  4. 35
      src/library/mysql.php

BIN
database/yggo.mwb

Binary file not shown.

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 138 KiB

After

Width:  |  Height:  |  Size: 168 KiB

62
src/crontab/crawler.php

@@ -997,55 +997,63 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_HOST_PAGE_QUEUE_LIMIT, time() - CRAWL_
// Process selectors configuration
if ($hostPageDomSelectors = Helper::getHostSettingValue($db, $memory, $queueHostPage->hostId, 'PAGES_DOM_SELECTORS', json_decode(DEFAULT_HOST_PAGES_DOM_SELECTORS))) {
$hostPageDomId = $db->addHostPageDom(
$queueHostPage->hostPageId,
time()
);
foreach ($hostPageDomSelectors as $selector => $settings) {
// Extract target selector data
foreach ($crawler->filter($selector) as $data) {
$hostPageDomSelectorId = $db->addHostPageDomSelector(
$hostPageDomId,
$selector
);
foreach ($data->childNodes as $node) {
// Extract selectors data
foreach ($crawler->filter($selector)->each(function($node) {
$value = trim($node->ownerDocument->saveHtml());
return $node->html();
// Apply selector settings
foreach ($settings as $key => $setting) {
}) as $value) {
switch ($key) {
foreach ($settings as $name => $setting) {
case 'strip_tags':
// Apply value settings
switch ($name) {
if (!isset($setting->enabled)) {
case 'strip_tags':
continue 2;
}
if (!isset($setting->enabled)) {
if (false === $setting->enabled) {
break;
}
continue 2;
}
if (false === $setting->enabled) {
if (!isset($setting->allowed_tags)) {
break;
}
continue 2;
}
if (!isset($setting->allowed_tags)) {
$value = strip_tags($value, $setting->allowed_tags);
break;
}
break;
}
$value = strip_tags($value, $setting->allowed_tags);
break;
}
// Skip empty selector values save
$value = trim($value);
if (empty($value)) {
continue;
}
// Save selector value
$db->addHostPageDom(
$queueHostPage->hostPageId,
$selector,
$value,
time()
// Save selector data
$db->addHostPageDomSelectorData(
$hostPageDomSelectorId,
$value
);
}
}

35
src/library/mysql.php

@@ -620,40 +620,37 @@ class MySQL {
return $query->rowCount();
}
public function addHostPageDom(int $hostPageId, string $selector, string $value, int $timeAdded) {
public function addHostPageDom(int $hostPageId, int $timeAdded) {
$this->_debug->query->insert->total++;
$query = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?, `selector` = ?, `value` = ?');
$query = $this->_db->prepare('INSERT INTO `hostPageDom` SET `hostPageId` = ?, `timeAdded` = ?');
$query->execute([$hostPageId, $timeAdded, $selector, $value]);
$query->execute([$hostPageId, $timeAdded]);
return $this->_db->lastInsertId();
}
public function deleteHostPageDomBySelector(int $hostPageId, string $selector) {
public function addHostPageDomSelector(int $hostPageDomId, string $name) {
$this->_debug->query->delete->total++;
$this->_debug->query->insert->total++;
$query = $this->_db->prepare('DELETE FROM `hostPageDom` WHERE `hostPageId` = ? AND `selector` = ?');
$query = $this->_db->prepare('INSERT INTO `hostPageDomSelector` SET `hostPageDomId` = ?, `name` = ?');
$query->execute([$hostPageId, $selector]);
$query->execute([$hostPageDomId, $name]);
return $query->rowCount();
return $this->_db->lastInsertId();
}
public function findLastHostPageDomBySelector(int $hostPageId, string $selector) {
public function addHostPageDomSelectorData(int $hostPageDomSelectorId, string $value) {
$this->_debug->query->select->total++;
$this->_debug->query->insert->total++;
$query = $this->_db->prepare('SELECT * FROM `hostPageDom` WHERE `hostPageId` = ? AND `selector` = ? ORDER BY `timeAdded` DESC LIMIT 1');
$query = $this->_db->prepare('INSERT INTO `hostPageDomSelectorData` SET `hostPageDomSelectorId` = ?, `value` = ?');
$query->execute([$hostPageId, $selector]);
$query->execute([$hostPageDomSelectorId, $value]);
return $query->fetch();
}
public function truncateHostPageDom() {
$query = $this->_db->query('TRUNCATE `hostPageDom`');
return $this->_db->lastInsertId();
}
// Cleaner tools
@@ -806,6 +803,8 @@ class MySQL {
$this->_db->query('OPTIMIZE TABLE `hostPage`');
$this->_db->query('OPTIMIZE TABLE `hostPageDescription`');
$this->_db->query('OPTIMIZE TABLE `hostPageDom`');
$this->_db->query('OPTIMIZE TABLE `hostPageDomSelector`');
$this->_db->query('OPTIMIZE TABLE `hostPageDomSelectorData`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnap`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapStorage`');
$this->_db->query('OPTIMIZE TABLE `hostPageSnapDownload`');

Loading…
Cancel
Save