mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-02-03 10:25:52 +00:00
implement image crawler
This commit is contained in:
parent
78931ebc74
commit
0741a3e9ef
@ -145,6 +145,16 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
|
|||||||
*/
|
*/
|
||||||
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Images limit per new host by default
|
||||||
|
*
|
||||||
|
* The crawler stops indexing new images for a host once this limit is reached, to prevent disk overuse
|
||||||
|
*
|
||||||
|
* A custom limit for a specific host can be set in the DB field `host`.`crawlImageLimit`
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
define('CRAWL_HOST_DEFAULT_IMAGES_LIMIT', 1000);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Default robots.txt rules on remote file not exists
|
* Default robots.txt rules on remote file not exists
|
||||||
* The crawler able to overwrite these rules
|
* The crawler able to overwrite these rules
|
||||||
@ -250,14 +260,16 @@ define('API_HOSTS_ENABLED', true);
|
|||||||
*/
|
*/
|
||||||
define('API_HOSTS_FIELDS',
|
define('API_HOSTS_FIELDS',
|
||||||
'`host`.`scheme`,
|
'`host`.`scheme`,
|
||||||
`host`.`name`,
|
`host`.`name`,
|
||||||
`host`.`port`,
|
`host`.`port`,
|
||||||
`host`.`crawlPageLimit`,
|
`host`.`crawlPageLimit`,
|
||||||
`host`.`robots`,
|
`host`.`crawlImageLimit`,
|
||||||
`host`.`robotsPostfix`,
|
`host`.`robots`,
|
||||||
`host`.`timeAdded`,
|
`host`.`robotsPostfix`,
|
||||||
`host`.`timeUpdated`,
|
`host`.`timeAdded`,
|
||||||
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`'); // string: *|field names comma separated
|
`host`.`timeUpdated`,
|
||||||
|
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
|
||||||
|
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Manifest API
|
* Manifest API
|
||||||
|
@ -21,9 +21,10 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
|||||||
// Debug
|
// Debug
|
||||||
$timeStart = microtime(true);
|
$timeStart = microtime(true);
|
||||||
|
|
||||||
$hostsTotal = $db->getTotalHosts();
|
$hostsTotal = $db->getTotalHosts();
|
||||||
$hostsUpdated = 0;
|
$hostsUpdated = 0;
|
||||||
$hostsPagesDeleted = 0;
|
$hostsPagesDeleted = 0;
|
||||||
|
$hostsImagesDeleted = 0;
|
||||||
|
|
||||||
// Get host queue
|
// Get host queue
|
||||||
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
|
foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
|
||||||
@ -48,25 +49,76 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
|
|||||||
// Update host data
|
// Update host data
|
||||||
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
|
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
|
||||||
|
|
||||||
|
// Apply host images limits
|
||||||
|
$totalHostImages = $db->getTotalHostImages($host->hostId);
|
||||||
|
|
||||||
|
if ($totalHostImages > $host->crawlImageLimit) {
|
||||||
|
|
||||||
|
foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
|
||||||
|
|
||||||
|
// Delete foreign key relations
|
||||||
|
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||||
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
|
// Delete host image
|
||||||
|
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Apply host pages limits
|
// Apply host pages limits
|
||||||
$totalHostPages = $db->getTotalHostPages($host->hostId);
|
$totalHostPages = $db->getTotalHostPages($host->hostId);
|
||||||
|
|
||||||
if ($totalHostPages > $host->crawlPageLimit) {
|
if ($totalHostPages > $host->crawlPageLimit) {
|
||||||
|
|
||||||
$hostsPagesDeleted += $db->deleteHostPages($host->hostId, $totalHostPages - $host->crawlPageLimit);
|
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
|
||||||
|
|
||||||
|
// Delete foreign key relations
|
||||||
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
|
// Delete host page
|
||||||
|
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply new robots.txt rules
|
// Apply new robots.txt rules
|
||||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||||
|
|
||||||
|
foreach ($db->getHostImages($host->hostId) as $hostImage) {
|
||||||
|
|
||||||
|
if (!$robots->uriAllowed($hostImage->uri)) {
|
||||||
|
|
||||||
|
// Delete foreign key relations
|
||||||
|
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||||
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
|
// Delete host image
|
||||||
|
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||||
|
|
||||||
if (!$robots->uriAllowed($hostPage->uri)) {
|
if (!$robots->uriAllowed($hostPage->uri)) {
|
||||||
|
|
||||||
|
// Delete foreign key relations
|
||||||
|
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||||
|
|
||||||
|
// Delete host page
|
||||||
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
$hostsPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up host images unrelated to host pages
|
||||||
|
foreach ($db->getUnrelatedHostImages() as $hostImage) {
|
||||||
|
|
||||||
|
// Delete foreign key relations
|
||||||
|
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||||
|
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||||
|
|
||||||
|
// Delete host image
|
||||||
|
$hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||||
|
}
|
||||||
|
|
||||||
$db->commit();
|
$db->commit();
|
||||||
|
|
||||||
} catch(Exception $e){
|
} catch(Exception $e){
|
||||||
@ -81,4 +133,5 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
|
|||||||
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
||||||
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
||||||
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
|
echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
|
||||||
|
echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
|
||||||
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL;
|
echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL;
|
@ -30,6 +30,7 @@ $timeStart = microtime(true);
|
|||||||
$hostPagesProcessed = 0;
|
$hostPagesProcessed = 0;
|
||||||
$hostPagesIndexed = 0;
|
$hostPagesIndexed = 0;
|
||||||
$hostPagesAdded = 0;
|
$hostPagesAdded = 0;
|
||||||
|
$hostImagesAdded = 0;
|
||||||
$hostsAdded = 0;
|
$hostsAdded = 0;
|
||||||
|
|
||||||
// Connect database
|
// Connect database
|
||||||
@ -127,6 +128,157 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Collect page images
//
// Every <img> having both a non-empty src and alt attribute is resolved to an
// absolute URL. Its host is registered when unknown, the image is stored when the
// host is enabled, robots.txt rules allow it and the host image limit is not
// reached, then its description and its relation to the crawled page are saved.
//
// Each image is processed in its own transaction. The try block never leaves the
// loop iteration early, so every beginTransaction() is paired with exactly one
// commit() or rollBack().
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {

  foreach (@$dom->getElementsByTagName('img') as $img) {

    // Skip images without src attribute
    if (!$imageSrc = @$img->getAttribute('src')) {

      continue;
    }

    // Skip images without alt attribute
    if (!$imageAlt = @$img->getAttribute('alt')) {

      continue;
    }

    // Title is optional
    if (!$imageTitle = @$img->getAttribute('title')) {
      $imageTitle = null;
    }

    // Add domain to the relative src links
    if (!parse_url($imageSrc, PHP_URL_HOST)) {

      $imageSrc = $queueHostPage->scheme . '://' .
                  $queueHostPage->name .
                  ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
                  '/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
    }

    // Validate formatted src link
    if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

      $db->beginTransaction();

      try {

        // Parse formatted src link
        $hostImageURL = Parser::hostURL($imageSrc);
        $hostImageURI = Parser::uri($imageSrc);

        // Host exists
        if ($host = $db->getHost(crc32($hostImageURL->string))) {

          $hostStatus        = $host->status;
          $hostPageLimit     = $host->crawlPageLimit;
          $hostImageLimit    = $host->crawlImageLimit;
          $hostId            = $host->hostId;
          $hostRobots        = $host->robots;
          $hostRobotsPostfix = $host->robotsPostfix;

        // Register new host
        } else {

          // Get robots.txt if exists
          $curl = new Curl($hostImageURL->string . '/robots.txt');

          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
            $hostRobots = $curl->getContent();
          } else {
            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
          }

          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;

          $hostStatus     = CRAWL_HOST_DEFAULT_STATUS;
          $hostPageLimit  = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
          $hostImageLimit = CRAWL_HOST_DEFAULT_IMAGES_LIMIT;

          // The checksum must be built from the image host URL: it is the same
          // value the getHost() lookup above searches by
          $hostId = $db->addHost($hostImageURL->scheme,
                                 $hostImageURL->name,
                                 $hostImageURL->port,
                                 crc32($hostImageURL->string),
                                 time(),
                                 null,
                                 $hostPageLimit,
                                 $hostImageLimit,
                                 (string) CRAWL_HOST_DEFAULT_META_ONLY,
                                 (string) $hostStatus,
                                 $hostRobots,
                                 $hostRobotsPostfix);

          if ($hostId) {

            $hostsAdded++;
          }
        }

        // Nothing can be stored without a host record: fall through to the commit
        if ($hostId) {

          // Init robots parser
          $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

          // getHostImage() returns the stored row (or false), not the bare id
          $hostImage   = $db->getHostImage($hostId, crc32($hostImageURI->string));
          $hostImageId = $hostImage ? (int) $hostImage->hostImageId : 0;

          // Save image info
          if (!$hostImageId && // image not exists
              $hostStatus && // host enabled
              $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
              $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit

            // Add host image
            if ($hostImageId = (int) $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time())) {

              $hostImagesAdded++;
            }
          }

          // Description, relation and rank require a stored image
          if ($hostImageId) {

            // Add host image description
            $hostImageDescriptionCRC32id = crc32(md5((string) $imageAlt . (string) $imageTitle));

            if (!$db->getHostImageDescription($hostImageId, $hostImageDescriptionCRC32id)) {
              $db->addHostImageDescription($hostImageId, $hostImageDescriptionCRC32id, (string) $imageAlt, (string) $imageTitle, time());
            }

            // Relate host image with host page was found
            if (!$db->getHostImageToHostPage($hostImageId, $queueHostPage->hostPageId)) {
              $db->addHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), null, 1);
            } else {
              $db->updateHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
            }

            // Increase image rank when link does not match the current host
            if ($hostImageURL->scheme . '://' .
                $hostImageURL->name .
                ($hostImageURL->port ? ':' . $hostImageURL->port : '')
                !=
                $queueHostPage->scheme . '://' .
                $queueHostPage->name .
                ($queueHostPage->port ? ':' . $queueHostPage->port : '')) {

              $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
            }
          }
        }

        $db->commit();

      } catch(Exception $e) {

        var_dump($e);

        $db->rollBack();
      }
    }
  }
}
|
||||||
|
|
||||||
// Collect internal links from page content
|
// Collect internal links from page content
|
||||||
foreach(@$dom->getElementsByTagName('a') as $a) {
|
foreach(@$dom->getElementsByTagName('a') as $a) {
|
||||||
|
|
||||||
@ -187,6 +339,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
|
|
||||||
$hostStatus = $host->status;
|
$hostStatus = $host->status;
|
||||||
$hostPageLimit = $host->crawlPageLimit;
|
$hostPageLimit = $host->crawlPageLimit;
|
||||||
|
$hostImageLimit = $host->crawlImageLimit;
|
||||||
$hostId = $host->hostId;
|
$hostId = $host->hostId;
|
||||||
$hostRobots = $host->robots;
|
$hostRobots = $host->robots;
|
||||||
$hostRobotsPostfix = $host->robotsPostfix;
|
$hostRobotsPostfix = $host->robotsPostfix;
|
||||||
@ -207,6 +360,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
|
|
||||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||||
|
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
|
||||||
$hostId = $db->addHost($hostURL->scheme,
|
$hostId = $db->addHost($hostURL->scheme,
|
||||||
$hostURL->name,
|
$hostURL->name,
|
||||||
$hostURL->port,
|
$hostURL->port,
|
||||||
@ -214,6 +368,7 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
time(),
|
time(),
|
||||||
null,
|
null,
|
||||||
$hostPageLimit,
|
$hostPageLimit,
|
||||||
|
$hostImageLimit,
|
||||||
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||||
(string) $hostStatus,
|
(string) $hostStatus,
|
||||||
$hostRobots,
|
$hostRobots,
|
||||||
@ -272,5 +427,6 @@ foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET
|
|||||||
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
|
||||||
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||||
|
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
|
||||||
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
|
||||||
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;
|
echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;
|
||||||
|
Binary file not shown.
@ -40,7 +40,7 @@ class MySQL {
|
|||||||
|
|
||||||
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {
|
public function addManifest(int $crc32url, string $url, string $status, int $timeAdded, mixed $timeUpdated = null) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?, ?)');
|
$query = $this->_db->prepare('INSERT INTO `manifest` (`crc32url`, `url`, `status`, `timeAdded`, `timeUpdated`) VALUES (?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);
|
$query->execute([$crc32url, $url, $status, $timeAdded, $timeUpdated]);
|
||||||
|
|
||||||
@ -75,11 +75,11 @@ class MySQL {
|
|||||||
return $query->fetch()->total;
|
return $query->fetch()->total;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Insert a new row into the `host` table.
 *
 * The arguments are bound positionally, in the same order as the column
 * list of the INSERT statement.
 *
 * @param string $scheme            Value for `host`.`scheme`
 * @param string $name              Value for `host`.`name`
 * @param mixed  $port              Value for `host`.`port`
 * @param int    $crc32url          Value for `host`.`crc32url`
 * @param int    $timeAdded         Value for `host`.`timeAdded`
 * @param mixed  $timeUpdated       Value for `host`.`timeUpdated`
 * @param int    $crawlPageLimit    Value for `host`.`crawlPageLimit`
 * @param int    $crawlImageLimit   Value for `host`.`crawlImageLimit`
 * @param string $crawlPageMetaOnly Value for `host`.`crawlPageMetaOnly`
 * @param string $status            Value for `host`.`status`
 * @param mixed  $robots            Value for `host`.`robots`
 * @param mixed  $robotsPostfix     Value for `host`.`robotsPostfix`
 *
 * @return mixed Result of lastInsertId() on the connection
 */
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlPageMetaOnly, string $status, mixed $robots, mixed $robotsPostfix) {

  $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlPageMetaOnly`, `status`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');

  $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlPageMetaOnly, $status, $robots, $robotsPostfix]);

  return $this->_db->lastInsertId();
}
|
||||||
@ -93,6 +93,173 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Images
|
||||||
|
/**
 * Count the `hostImage` rows that belong to a host.
 *
 * @param int $hostId Value matched against `hostImage`.`hostId`
 *
 * @return mixed The `total` column of the COUNT(*) result row
 */
public function getTotalHostImages(int $hostId) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?');

  $statement->execute([$hostId]);

  $row = $statement->fetch();

  return $row->total;
}
|
||||||
|
|
||||||
|
/**
 * Find a single image of a host by the checksum of its URI.
 *
 * @param int $hostId   Value matched against `hostImage`.`hostId`
 * @param int $crc32uri Value matched against `hostImage`.`crc32uri`
 *
 * @return mixed Result of fetch(): the matching row, or the fetch failure value when nothing matches
 */
public function getHostImage(int $hostId, int $crc32uri) {

  $statement = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

  $statement->execute([$hostId, $crc32uri]);

  $row = $statement->fetch();

  return $row;
}
|
||||||
|
|
||||||
|
/**
 * Get every image stored for a host.
 *
 * @param int $hostId Value matched against `hostImage`.`hostId`
 *
 * @return mixed Result of fetchAll(): all matching `hostImage` rows
 */
public function getHostImages(int $hostId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?');

  $statement->execute([$hostId]);

  $rows = $statement->fetchAll();

  return $rows;
}
|
||||||
|
|
||||||
|
/**
 * Get the images that are not referenced by any `hostImageToHostPage` row.
 *
 * The selection is not limited to one host: it covers the whole `hostImage` table.
 *
 * @return mixed Result of fetchAll(): all unrelated `hostImage` rows
 */
public function getUnrelatedHostImages() {

  $statement = $this->_db->prepare('SELECT * FROM `hostImage`
                                    WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
                                                                            FROM `hostImageToHostPage`

                                                                            WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)');

  $statement->execute();

  $rows = $statement->fetchAll();

  return $rows;
}
|
||||||
|
|
||||||
|
/**
 * Get up to $limit images of a host, highest `hostImageId` first.
 *
 * @param int $hostId Value matched against `hostImage`.`hostId`
 * @param int $limit  Maximum number of rows; cast to int and appended to the SQL text
 *
 * @return mixed Result of fetchAll(): the selected `hostImage` rows
 */
public function getHostImagesByLimit(int $hostId, int $limit) {

  // LIMIT is concatenated rather than bound, the int cast keeps the SQL text safe
  $sql = 'SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostId]);

  return $statement->fetchAll();
}
|
||||||
|
|
||||||
|
/**
 * Insert a new row into the `hostImage` table.
 *
 * @param int    $hostId      Value for `hostImage`.`hostId`
 * @param int    $crc32uri    Value for `hostImage`.`crc32uri`
 * @param string $uri         Value for `hostImage`.`uri`
 * @param int    $timeAdded   Value for `hostImage`.`timeAdded`
 * @param mixed  $timeUpdated Value for `hostImage`.`timeUpdated`, null by default
 * @param mixed  $httpCode    Value for `hostImage`.`httpCode`, null by default
 * @param mixed  $rank        Value for `hostImage`.`rank`, null by default
 *
 * @return mixed Result of lastInsertId() on the connection
 */
public function addHostImage(int $hostId,
                             int $crc32uri,
                             string $uri,
                             int $timeAdded,
                             mixed $timeUpdated = null,
                             mixed $httpCode = null,
                             mixed $rank = null) {

  $statement = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
                                                              `crc32uri`,
                                                              `uri`,
                                                              `timeAdded`,
                                                              `timeUpdated`,
                                                              `httpCode`,
                                                              `rank`) VALUES (?, ?, ?, ?, ?, ?, ?)');

  // Values are bound in the same order as the column list above
  $values = [$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank];

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
||||||
|
|
||||||
|
/**
 * Increase the rank of a host image.
 *
 * addHostImage() stores NULL in `rank` unless a value is passed, and in MySQL
 * NULL + n evaluates to NULL, so the current rank is read through COALESCE to
 * make the first increment start from zero.
 *
 * @param int $hostId    Value matched against `hostImage`.`hostId`
 * @param int $crc32uri  Value matched against `hostImage`.`crc32uri`
 * @param int $increment Amount added to the rank; cast to int and appended to the SQL text
 *
 * @return mixed Result of rowCount() for the UPDATE statement
 */
public function updateHostImageRank(int $hostId,
                                    int $crc32uri,
                                    int $increment) {

  $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = COALESCE(`rank`, 0) + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

  $query->execute([$hostId, $crc32uri]);

  return $query->rowCount();
}
|
||||||
|
|
||||||
|
/**
 * Delete one image by its id.
 *
 * Rows of other tables that reference the image are not touched here.
 *
 * @param int $hostImageId Value matched against `hostImage`.`hostImageId`
 *
 * @return mixed Result of rowCount() for the DELETE statement
 */
public function deleteHostImage(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||||
|
|
||||||
|
/**
 * Find a description of an image by the description checksum.
 *
 * @param int $hostImageId Value matched against `hostImageDescription`.`hostImageId`
 * @param int $crc32id     Value matched against `hostImageDescription`.`crc32id`
 *
 * @return mixed Result of fetch(): the matching row, or the fetch failure value when nothing matches
 */
public function getHostImageDescription(int $hostImageId, int $crc32id) {

  $statement = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? AND `crc32id` = ? LIMIT 1');

  $statement->execute([$hostImageId, $crc32id]);

  $row = $statement->fetch();

  return $row;
}
|
||||||
|
|
||||||
|
/**
 * Insert a new row into the `hostImageDescription` table.
 *
 * @param int    $hostImageId Value for `hostImageDescription`.`hostImageId`
 * @param int    $crc32id     Value for `hostImageDescription`.`crc32id`
 * @param string $alt         Value for `hostImageDescription`.`alt`
 * @param string $title       Value for `hostImageDescription`.`title`
 * @param int    $timeAdded   Value for `hostImageDescription`.`timeAdded`
 *
 * @return mixed Result of lastInsertId() on the connection
 */
public function addHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded) {

  $statement = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
                                                                        `crc32id`,
                                                                        `alt`,
                                                                        `title`,
                                                                        `timeAdded`) VALUES (?, ?, ?, ?, ?)');

  // Values are bound in the same order as the column list above
  $values = [$hostImageId, $crc32id, $alt, $title, $timeAdded];

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
||||||
|
|
||||||
|
/**
 * Delete every description that belongs to an image.
 *
 * @param int $hostImageId Value matched against `hostImageDescription`.`hostImageId`
 *
 * @return mixed Result of rowCount() for the DELETE statement
 */
public function deleteHostImageDescription(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||||
|
|
||||||
|
/**
 * Find the relation between an image and a page.
 *
 * @param int $hostImageId Value matched against `hostImageToHostPage`.`hostImageId`
 * @param int $hostPageId  Value matched against `hostImageToHostPage`.`hostPageId`
 *
 * @return mixed Result of fetch(): the matching row, or the fetch failure value when nothing matches
 */
public function getHostImageToHostPage(int $hostImageId, int $hostPageId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ? AND `hostPageId` = ? LIMIT 1');

  $statement->execute([$hostImageId, $hostPageId]);

  $row = $statement->fetch();

  return $row;
}
|
||||||
|
|
||||||
|
/**
 * Insert a new relation between an image and a page.
 *
 * @param int   $hostImageId Value for `hostImageToHostPage`.`hostImageId`
 * @param int   $hostPageId  Value for `hostImageToHostPage`.`hostPageId`
 * @param int   $timeAdded   Value for `hostImageToHostPage`.`timeAdded`
 * @param mixed $timeUpdated Value for `hostImageToHostPage`.`timeUpdated`
 * @param int   $quantity    Value for `hostImageToHostPage`.`quantity`
 *
 * @return mixed Result of rowCount() for the INSERT statement
 */
public function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {

  $statement = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
                                                                       `hostPageId`,
                                                                       `timeAdded`,
                                                                       `timeUpdated`,
                                                                       `quantity`) VALUES (?, ?, ?, ?, ?)');

  // Values are bound in the same order as the column list above
  $values = [$hostImageId, $hostPageId, $timeAdded, $timeUpdated, $quantity];

  $statement->execute($values);

  // no primary key, so the affected rows count is returned instead of an insert id
  return $statement->rowCount();
}
|
||||||
|
|
||||||
|
/**
 * Increase the quantity of an existing image to page relation.
 *
 * Note that despite its name, $timeAdded is written to the `timeUpdated` column.
 *
 * @param int $hostImageId Value matched against `hostImageToHostPage`.`hostImageId`
 * @param int $hostPageId  Value matched against `hostImageToHostPage`.`hostPageId`
 * @param int $timeAdded   Value stored in `hostImageToHostPage`.`timeUpdated`
 * @param int $quantity    Amount added to `quantity`; cast to int and appended to the SQL text
 *
 * @return mixed Result of rowCount() for the UPDATE statement
 */
public function updateHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, int $quantity) {

  $statement = $this->_db->prepare('UPDATE `hostImageToHostPage` SET `quantity` = `quantity` + ' . (int) $quantity . ', `timeUpdated` = ?

                                                                 WHERE `hostImageId` = ?
                                                                   AND `hostPageId` = ?

                                                                 LIMIT 1');

  // Placeholder order: timeUpdated, hostImageId, hostPageId
  $values = [$timeAdded, $hostImageId, $hostPageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||||
|
|
||||||
|
/**
 * Delete every page relation of an image.
 *
 * @param int $hostImageId Value matched against `hostImageToHostPage`.`hostImageId`
 *
 * @return mixed Result of rowCount() for the DELETE statement
 */
public function deleteHostImageToHostPage(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||||
|
|
||||||
// Pages
|
// Pages
|
||||||
public function getTotalHostPages(int $hostId) {
|
public function getTotalHostPages(int $hostId) {
|
||||||
|
|
||||||
@ -147,6 +314,15 @@ class MySQL {
|
|||||||
return $query->fetchAll();
|
return $query->fetchAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get up to $limit pages of a host, highest `hostPageId` first.
 *
 * @param int $hostId Value matched against `hostPage`.`hostId`
 * @param int $limit  Maximum number of rows; cast to int and appended to the SQL text
 *
 * @return mixed Result of fetchAll(): the selected `hostPage` rows
 */
public function getHostPagesByLimit(int $hostId, int $limit) {

  // LIMIT is concatenated rather than bound, the int cast keeps the SQL text safe
  $sql = 'SELECT * FROM `hostPage` WHERE `hostId` = ? ORDER BY hostPageId DESC LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostId]);

  return $statement->fetchAll();
}
|
||||||
|
|
||||||
public function getFoundHostPage(int $hostPageId) {
|
public function getFoundHostPage(int $hostPageId) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
$query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`,
|
||||||
@ -240,11 +416,11 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Delete every image relation of a page.
 *
 * This replaces the former bulk deleteHostPages(): pages are now selected with
 * getHostPagesByLimit() and removed one by one, after their relations are
 * deleted with this method.
 *
 * @param int $hostPageId Value matched against `hostImageToHostPage`.`hostPageId`
 *
 * @return mixed Result of rowCount() for the DELETE statement
 */
public function deleteHostPageToHostImage(int $hostPageId) {

  $query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');

  $query->execute([$hostPageId]);

  return $query->rowCount();
}
|
||||||
@ -275,6 +451,7 @@ class MySQL {
|
|||||||
`host`.`name`,
|
`host`.`name`,
|
||||||
`host`.`port`,
|
`host`.`port`,
|
||||||
`host`.`crawlPageLimit`,
|
`host`.`crawlPageLimit`,
|
||||||
|
`host`.`crawlImageLimit`,
|
||||||
`host`.`crawlPageMetaOnly`,
|
`host`.`crawlPageMetaOnly`,
|
||||||
`host`.`robots`,
|
`host`.`robots`,
|
||||||
`host`.`robotsPostfix`
|
`host`.`robotsPostfix`
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 91 KiB After Width: | Height: | Size: 98 KiB |
@ -1,7 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
// Current version
|
// Current version
|
||||||
define('API_VERSION', 0.2);
|
define('API_VERSION', 0.3);
|
||||||
|
|
||||||
// Load system dependencies
|
// Load system dependencies
|
||||||
require_once('../config/app.php');
|
require_once('../config/app.php');
|
||||||
@ -101,15 +101,16 @@ if (API_ENABLED) {
|
|||||||
'status' => true,
|
'status' => true,
|
||||||
'result' => [
|
'result' => [
|
||||||
'config' => [
|
'config' => [
|
||||||
'websiteDomain' => WEBSITE_DOMAIN,
|
'websiteDomain' => WEBSITE_DOMAIN,
|
||||||
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
|
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
|
||||||
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
|
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
|
||||||
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
|
'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
|
||||||
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
|
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
|
||||||
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
|
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
|
||||||
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
|
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
|
||||||
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
|
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
|
||||||
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
|
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
|
||||||
|
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
|
||||||
],
|
],
|
||||||
'api' => [
|
'api' => [
|
||||||
'version' => API_VERSION,
|
'version' => API_VERSION,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user