Browse Source

implement image crawler

main
ghost 2 years ago
parent
commit
0741a3e9ef
  1. 28
      config/app.php.txt
  2. 61
      crontab/cleaner.php
  3. 156
      crontab/crawler.php
  4. BIN
      database/yggo.mwb
  5. 191
      library/mysql.php
  6. BIN
      media/db-prototype.png
  7. 21
      public/api.php

28
config/app.php.txt

@ -145,6 +145,16 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
*/ */
define('CRAWL_HOST_DEFAULT_META_ONLY', false); define('CRAWL_HOST_DEFAULT_META_ONLY', false);
/*
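* Default image limit for each newly registered host
*
* The crawler stops indexing a host's images once this limit is reached, to prevent disk overuse
* A value of 0 (or less) disables image collection in the crawler
*
* A custom limit for a specific host can be set in the DB `host`.`crawlImageLimit` field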
*
*/
define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);
/* /*
* Default robots.txt rules on remote file not exists * Default robots.txt rules on remote file not exists
* The crawler able to overwrite these rules * The crawler able to overwrite these rules
@ -250,14 +260,16 @@ define('API_HOSTS_ENABLED', true);
*/ */
define('API_HOSTS_FIELDS', define('API_HOSTS_FIELDS',
'`host`.`scheme`, '`host`.`scheme`,
`host`.`name`, `host`.`name`,
`host`.`port`, `host`.`port`,
`host`.`crawlPageLimit`, `host`.`crawlPageLimit`,
`host`.`robots`, `host`.`crawlImageLimit`,
`host`.`robotsPostfix`, `host`.`robots`,
`host`.`timeAdded`, `host`.`robotsPostfix`,
`host`.`timeUpdated`, `host`.`timeAdded`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); // string: *|field names comma separated `host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
/* /*
* Manifest API * Manifest API

61
crontab/cleaner.php

@ -21,9 +21,10 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
// Debug // Debug
$timeStart = microtime(true); $timeStart = microtime(true);
$hostsTotal = $db->getTotalHosts(); $hostsTotal = $db->getTotalHosts();
$hostsUpdated = 0; $hostsUpdated = 0;
$hostsPagesDeleted = 0; $hostsPagesDeleted = 0;
$hostsImagesDeleted = 0;
// Get host queue // Get host queue
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) { foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
@ -48,25 +49,76 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
// Update host data // Update host data
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time()); $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
// Apply host images limits
// Trims the images stored for this host down to `crawlImageLimit` when the current total exceeds it
$totalHostImages = $db->getTotalHostImages($host->hostId);
if ($totalHostImages > $host->crawlImageLimit) {
// Request exactly the surplus (total - limit) rows; getHostImagesByLimit() orders by hostImageId DESC,
// so the rows with the highest ids are removed. The (array) cast keeps foreach safe on a non-array result
foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
// Delete foreign key relations
// (child rows first: descriptions, then image-to-page links)
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
// deleteHostImage() returns the affected row count, which feeds the debug counter
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
// Apply host pages limits // Apply host pages limits
$totalHostPages = $db->getTotalHostPages($host->hostId); $totalHostPages = $db->getTotalHostPages($host->hostId);
if ($totalHostPages > $host->crawlPageLimit) { if ($totalHostPages > $host->crawlPageLimit) {
$hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit); foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
}
} }
// Apply new robots.txt rules // Apply new robots.txt rules
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Re-check every stored image of this host against the robots rules held in $robots
// and remove the ones whose URI is not allowed
foreach ($db->getHostImages($host->hostId) as $hostImage) {
if (!$robots->uriAllowed($hostImage->uri)) {
// Delete foreign key relations
// (child rows first: descriptions, then image-to-page links)
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
// deleteHostImage() returns the affected row count, which feeds the debug counter
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
}
foreach ($db->getHostPages($host->hostId) as $hostPage) { foreach ($db->getHostPages($host->hostId) as $hostPage) {
if (!$robots->uriAllowed($hostPage->uri)) { if (!$robots->uriAllowed($hostPage->uri)) {
// Delete foreign key relations
$db->deleteHostPageToHostImage($hostPage->hostPageId);
// Delete host page
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId); $hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
} }
} }
// Clean up host images unrelated to host pages
// NOTE(review): getUnrelatedHostImages() takes no host id and its query has no host filter,
// yet this loop runs once per host from the cleaner queue - confirm that the repeated global scan is intended
foreach ($db->getUnrelatedHostImages() as $hostImage) {
// Delete foreign key relations
// (the image-to-page delete is kept for symmetry with the other cleanup loops)
$db->deleteHostImageDescription($hostImage->hostImageId);
$db->deleteHostImageToHostPage($hostImage->hostImageId);
// Delete host image
// deleteHostImage() returns the affected row count, which feeds the debug counter
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
}
$db->commit(); $db->commit();
} catch(Exception $e){ } catch(Exception $e){
@ -81,4 +133,5 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
echo 'Hosts total: ' . $hostsTotal . PHP_EOL; echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL; echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL; echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL; echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL;

156
crontab/crawler.php

@ -30,6 +30,7 @@ $timeStart = microtime(true);
$hostPagesProcessed = 0; $hostPagesProcessed = 0;
$hostPagesIndexed = 0; $hostPagesIndexed = 0;
$hostPagesAdded = 0; $hostPagesAdded = 0;
$hostImagesAdded = 0;
$hostsAdded = 0; $hostsAdded = 0;
// Connect database // Connect database
@ -127,6 +128,157 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
continue; continue;
} }
// Collect page images
//
// For every <img> of the crawled page that carries both `src` and `alt`:
//   1. resolve the src to an absolute URL and validate it
//   2. find the image host in the DB or register it (fetching its robots.txt)
//   3. index the image when the host is enabled, robots rules allow it and the host limit is not reached
//   4. store the alt/title description and relate the image to the current page
// Each image is processed in its own DB transaction.
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

    foreach (@$dom->getElementsByTagName('img') as $img) {

        // Skip images without src attribute
        if (!$imageSrc = @$img->getAttribute('src')) {
            continue;
        }

        // Skip images without alt attribute
        if (!$imageAlt = @$img->getAttribute('alt')) {
            continue;
        }

        // Title is optional
        if (!$imageTitle = @$img->getAttribute('title')) {
            $imageTitle = null;
        }

        // Add domain to the relative src links
        if (!parse_url($imageSrc, PHP_URL_HOST)) {

            $imageSrc = $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                        '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
        }

        // Validate formatted src link
        if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

            $db->beginTransaction();

            try {

                // Parse formatted src link
                $hostImageURL = Parser::hostURL($imageSrc);
                $hostImageURI = Parser::uri($imageSrc);

                // Host exists
                if ($host = $db->getHost(crc32($hostImageURL->string))) {

                    $hostStatus        = $host->status;
                    $hostPageLimit     = $host->crawlPageLimit;
                    $hostImageLimit    = $host->crawlImageLimit;
                    $hostId            = $host->hostId;
                    $hostRobots        = $host->robots;
                    $hostRobotsPostfix = $host->robotsPostfix;

                // Register new host
                } else {

                    // Get robots.txt if exists
                    $curl = new Curl($hostImageURL->string . '/robots.txt');

                    if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
                        $hostRobots = $curl->getContent();
                    } else {
                        $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
                    }

                    $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

                    $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
                    $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
                    $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

                    // The CRC32 must be taken from the image host URL: it is the key used by the
                    // getHost() lookup above ($hostURL is not assigned anywhere in this image loop)
                    $hostId = $db->addHost($hostImageURL->scheme,
                                           $hostImageURL->name,
                                           $hostImageURL->port,
                                           crc32($hostImageURL->string),
                                           time(),
                                           null,
                                           $hostPageLimit,
                                           $hostImageLimit,
                                           (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                           (string) $hostStatus,
                                           $hostRobots,
                                           $hostRobotsPostfix);

                    if (!$hostId) {

                        // Close the transaction before skipping this image, otherwise
                        // it stays open when the next iteration calls beginTransaction()
                        $db->rollBack();

                        continue;
                    }

                    $hostsAdded++;
                }

                // Init robots parser
                $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

                // Resolve the image id: getHostImage() returns the stored row (falsy when missing), not the id
                $existingHostImage = $db->getHostImage($hostId, crc32($hostImageURI->string));

                $hostImageId = $existingHostImage ? (int) $existingHostImage->hostImageId : 0;

                // Save image info
                if (!$hostImageId &&                                         // image not exists
                    $hostStatus &&                                           // host enabled
                    $robots->uriAllowed($hostImageURI->string) &&            // src allowed by robots.txt rules
                    $hostImageLimit > $db->getTotalHostImages($hostId)) {    // images quantity not reached host limit

                    // Add host image
                    if ($hostImageId = (int) $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time())) {
                        $hostImagesAdded++;
                    }
                }

                // Only images that are stored in the index get a description, a page relation and a rank
                if ($hostImageId) {

                    // Add host image description
                    $hostImageDescriptionCRC32id = crc32(md5((string) $imageAlt . (string) $imageTitle));

                    if (!$db->getHostImageDescription($hostImageId, $hostImageDescriptionCRC32id)) {
                        $db->addHostImageDescription($hostImageId, $hostImageDescriptionCRC32id, (string) $imageAlt, (string) $imageTitle, time());
                    }

                    // Relate host image with host page was found
                    if (!$db->getHostImageToHostPage($hostImageId, $queueHostPage->hostPageId)) {
                        $db->addHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), null, 1);
                    } else {
                        $db->updateHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
                    }

                    // Increase image rank when the image host does not match the current page host
                    if ($hostImageURL->scheme . '://' .
                        $hostImageURL->name .
                        ($hostImageURL->port ? ':' . $hostImageURL->port : '')
                        !=
                        $queueHostPage->scheme . '://' .
                        $queueHostPage->name .
                        ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

                        $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
                    }
                }

                $db->commit();

            } catch(Exception $e) {

                var_dump($e);

                $db->rollBack();
            }
        }
    }
}
// Collect internal links from page content // Collect internal links from page content
foreach(@$dom->getElementsByTagName('a') as $a) { foreach(@$dom->getElementsByTagName('a') as $a) {
@ -187,6 +339,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$hostStatus = $host->status; $hostStatus = $host->status;
$hostPageLimit = $host->crawlPageLimit; $hostPageLimit = $host->crawlPageLimit;
$hostImageLimit = $host->crawlImageLimit;
$hostId = $host->hostId; $hostId = $host->hostId;
$hostRobots = $host->robots; $hostRobots = $host->robots;
$hostRobotsPostfix = $host->robotsPostfix; $hostRobotsPostfix = $host->robotsPostfix;
@ -207,6 +360,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
$hostStatus = CRAWL_HOST_DEFAULT_STATUS; $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
$hostId = $db->addHost($hostURL->scheme, $hostId = $db->addHost($hostURL->scheme,
$hostURL->name, $hostURL->name,
$hostURL->port, $hostURL->port,
@ -214,6 +368,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
time(), time(),
null, null,
$hostPageLimit, $hostPageLimit,
$hostImageLimit,
(string) CRAWL_HOST_DEFAULT_META_ONLY, (string) CRAWL_HOST_DEFAULT_META_ONLY,
(string) $hostStatus, (string) $hostStatus,
$hostRobots, $hostRobots,
@ -272,5 +427,6 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
echo 'Hosts added: ' . $hostsAdded . PHP_EOL; echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL; echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;

BIN
database/yggo.mwb

Binary file not shown.

191
library/mysql.php

@ -40,7 +40,7 @@ class MySQL {
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) { public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {
$query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?, ?)'); $query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');
$query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]); $query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);
@ -75,11 +75,11 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) { public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]); $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -93,6 +93,173 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
// Images
/**
 * Count the `hostImage` rows stored for a host.
 *
 * @param int $hostId value matched against `hostImage`.`hostId`
 *
 * @return mixed the `total` column of the COUNT(*) result row
 */
public function getTotalHostImages(int $hostId) {

    $sql = 'SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    // The row is read as an object, so the connection is expected to fetch objects
    $row = $statement->fetch();

    return $row->total;
}
/**
 * Look up a single image of a host by the CRC32 of its URI.
 *
 * @param int $hostId   value matched against `hostImage`.`hostId`
 * @param int $crc32uri value matched against `hostImage`.`crc32uri`
 *
 * @return mixed the whole matching row as produced by fetch() (not just its id)
 */
public function getHostImage(int $hostId, int $crc32uri) {

    $sql = 'SELECT * FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId, $crc32uri]);

    return $statement->fetch();
}
/**
 * Fetch every image row stored for a host.
 *
 * @param int $hostId value matched against `hostImage`.`hostId`
 *
 * @return mixed all matching rows as produced by fetchAll()
 */
public function getHostImages(int $hostId) {

    $sql = 'SELECT * FROM `hostImage` WHERE `hostId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    return $statement->fetchAll();
}
/**
 * Fetch the images that are not linked to any page.
 *
 * Selects every `hostImage` row whose `hostImageId` has no row in `hostImageToHostPage`.
 * The query carries no host filter, so images of all hosts are returned.
 *
 * @return mixed all matching rows as produced by fetchAll()
 */
public function getUnrelatedHostImages() {
// The subquery is correlated on hostImageId, so it acts as a "no relation exists" check per image
$query = $this->_db->prepare('SELECT * FROM `hostImage`
WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
FROM `hostImageToHostPage`
WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)');
$query->execute();
return $query->fetchAll();
}
/**
 * Fetch up to $limit images of a host, highest `hostImageId` first.
 *
 * @param int $hostId value matched against `hostImage`.`hostId`
 * @param int $limit  maximum number of rows to return
 *
 * @return mixed the selected rows as produced by fetchAll()
 */
public function getHostImagesByLimit(int $hostId, int $limit) {

    // LIMIT is concatenated into the statement after an explicit integer cast
    $sql = 'SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit;

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    return $statement->fetchAll();
}
/**
 * Insert a new row into `hostImage`.
 *
 * @param int    $hostId      value for `hostId`
 * @param int    $crc32uri    value for `crc32uri`
 * @param string $uri         value for `uri`
 * @param int    $timeAdded   value for `timeAdded`
 * @param mixed  $timeUpdated value for `timeUpdated`, null by default
 * @param mixed  $httpCode    value for `httpCode`, null by default
 * @param mixed  $rank        value for `rank`, null by default
 *
 * @return mixed the connection's lastInsertId() after the insert
 */
public function addHostImage(int $hostId,
int $crc32uri,
string $uri,
int $timeAdded,
mixed $timeUpdated = null,
mixed $httpCode = null,
mixed $rank = null) {
$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
`crc32uri`,
`uri`,
`timeAdded`,
`timeUpdated`,
`httpCode`,
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?)');
// Values are bound positionally, in the same order as the column list above
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank]);
return $this->_db->lastInsertId();
}
/**
 * Increase the `rank` of one image, identified by its host and URI checksum.
 *
 * A NULL rank is treated as 0: addHostImage() stores NULL unless a rank is passed,
 * and in SQL `NULL + n` stays NULL, so without COALESCE the rank could never grow.
 *
 * @param int $hostId    value matched against `hostImage`.`hostId`
 * @param int $crc32uri  value matched against `hostImage`.`crc32uri`
 * @param int $increment amount added to the current rank
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function updateHostImageRank(int $hostId,
                                    int $crc32uri,
                                    int $increment) {

    // The increment is concatenated into the statement after an explicit integer cast
    $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = COALESCE(`rank`, 0) + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

    $query->execute([$hostId, $crc32uri]);

    return $query->rowCount();
}
/**
 * Delete one `hostImage` row by its id.
 *
 * @param int $hostImageId value matched against `hostImage`.`hostImageId`
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function deleteHostImage(int $hostImageId) {

    $sql = 'DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostImageId]);

    return $statement->rowCount();
}
/**
 * Look up one description of an image by its checksum id.
 *
 * @param int $hostImageId value matched against `hostImageDescription`.`hostImageId`
 * @param int $crc32id     value matched against `hostImageDescription`.`crc32id`
 *
 * @return mixed the matching row as produced by fetch()
 */
public function getHostImageDescription(int $hostImageId, int $crc32id) {

    $sql = 'SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? AND `crc32id` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostImageId, $crc32id]);

    return $statement->fetch();
}
/**
 * Insert a new row into `hostImageDescription`.
 *
 * @param int    $hostImageId value for `hostImageId`
 * @param int    $crc32id     value for `crc32id`
 * @param string $alt         value for `alt`
 * @param string $title       value for `title`
 * @param int    $timeAdded   value for `timeAdded`
 *
 * @return mixed the connection's lastInsertId() after the insert
 */
public function addHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`alt`,
`title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)');
// Values are bound positionally, in the same order as the column list above
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded]);
return $this->_db->lastInsertId();
}
/**
 * Delete every description stored for an image.
 *
 * @param int $hostImageId value matched against `hostImageDescription`.`hostImageId`
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function deleteHostImageDescription(int $hostImageId) {

    $sql = 'DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostImageId]);

    return $statement->rowCount();
}
/**
 * Look up the relation between one image and one page.
 *
 * @param int $hostImageId value matched against `hostImageToHostPage`.`hostImageId`
 * @param int $hostPageId  value matched against `hostImageToHostPage`.`hostPageId`
 *
 * @return mixed the matching row as produced by fetch()
 */
public function getHostImageToHostPage(int $hostImageId, int $hostPageId) {

    $sql = 'SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ? AND `hostPageId` = ? LIMIT 1';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostImageId, $hostPageId]);

    return $statement->fetch();
}
/**
 * Insert a new image-to-page relation into `hostImageToHostPage`.
 *
 * @param int   $hostImageId value for `hostImageId`
 * @param int   $hostPageId  value for `hostPageId`
 * @param int   $timeAdded   value for `timeAdded`
 * @param mixed $timeUpdated value for `timeUpdated`
 * @param int   $quantity    value for `quantity`
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`,
`timeAdded`,
`timeUpdated`,
`quantity`) VALUES (?, ?, ?, ?, ?)');
// Values are bound positionally, in the same order as the column list above
$query->execute([$hostImageId, $hostPageId, $timeAdded, $timeUpdated, $quantity]);
return $query->rowCount(); // no primary key
}
/**
 * Increase the `quantity` of an existing image-to-page relation and refresh its `timeUpdated`.
 *
 * @param int $hostImageId value matched against `hostImageToHostPage`.`hostImageId`
 * @param int $hostPageId  value matched against `hostImageToHostPage`.`hostPageId`
 * @param int $timeAdded   despite its name, this value is written to the `timeUpdated` column
 * @param int $quantity    amount added to the current `quantity`
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function updateHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, int $quantity) {
// The quantity is concatenated into the statement after an explicit integer cast
$query = $this->_db->prepare('UPDATE `hostImageToHostPage` SET `quantity` = `quantity` + ' . (int) $quantity . ', `timeUpdated` = ?
WHERE `hostImageId` = ?
AND `hostPageId` = ?
LIMIT 1');
// Bound in placeholder order: timeUpdated, hostImageId, hostPageId
$query->execute([$timeAdded, $hostImageId, $hostPageId]);
return $query->rowCount();
}
/**
 * Delete every page relation of an image.
 *
 * @param int $hostImageId value matched against `hostImageToHostPage`.`hostImageId`
 *
 * @return mixed number of affected rows as reported by rowCount()
 */
public function deleteHostImageToHostPage(int $hostImageId) {

    $sql = 'DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?';

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostImageId]);

    return $statement->rowCount();
}
// Pages // Pages
public function getTotalHostPages(int $hostId) { public function getTotalHostPages(int $hostId) {
@ -147,6 +314,15 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
/**
 * Fetch up to $limit pages of a host, highest `hostPageId` first.
 *
 * @param int $hostId value matched against `hostPage`.`hostId`
 * @param int $limit  maximum number of rows to return
 *
 * @return mixed the selected rows as produced by fetchAll()
 */
public function getHostPagesByLimit(int $hostId, int $limit) {

    // LIMIT is concatenated into the statement after an explicit integer cast
    $sql = 'SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit;

    $statement = $this->_db->prepare($sql);
    $statement->execute([$hostId]);

    return $statement->fetchAll();
}
public function getFoundHostPage(int $hostPageId) { public function getFoundHostPage(int $hostPageId) {
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
@ -240,11 +416,11 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function deleteHostPages(int $hostId, int $limit) { public function deleteHostPageToHostImage(int $hostPageId) {
$query = $this->_db->prepare('DELETE FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit); $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
$query->execute([$hostId]); $query->execute([$hostPageId]);
return $query->rowCount(); return $query->rowCount();
} }
@ -275,6 +451,7 @@ class MySQL {
`host`.`name`, `host`.`name`,
`host`.`port`, `host`.`port`,
`host`.`crawlPageLimit`, `host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`crawlPageMetaOnly`, `host`.`crawlPageMetaOnly`,
`host`.`robots`, `host`.`robots`,
`host`.`robotsPostfix` `host`.`robotsPostfix`

BIN
media/db-prototype.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 91 KiB

After

Width:  |  Height:  |  Size: 98 KiB

21
public/api.php

@ -1,7 +1,7 @@
<?php <?php
// Current version // Current version
define('API_VERSION', 0.2); define('API_VERSION', 0.3);
// Load system dependencies // Load system dependencies
require_once('../config/app.php'); require_once('../config/app.php');
@ -101,15 +101,16 @@ if (API_ENABLED) {
'status' => true, 'status' => true,
'result' => [ 'result' => [
'config' => [ 'config' => [
'websiteDomain' => WEBSITE_DOMAIN, 'websiteDomain' => WEBSITE_DOMAIN,
'crawlUrlRegexp' => CRAWL_URL_REGEXP, 'crawlUrlRegexp' => CRAWL_URL_REGEXP,
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT, 'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS, 'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY, 'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET, 'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET, 'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES, 'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES, 'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
], ],
'api' => [ 'api' => [
'version' => API_VERSION, 'version' => API_VERSION,

Loading…
Cancel
Save