Browse Source

fix image crawling errors

main
ghost 2 years ago
parent
commit
d4f66c83e7
  1. 24
      crontab/crawler.php
  2. BIN
      database/yggo.mwb
  3. 57
      library/mysql.php

24
crontab/crawler.php

@ -266,12 +266,10 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save image info // Save image info
$hostImageId = $db->getHostImage($hostId, crc32($hostImageURI->string));
if ($hostStatus && // host enabled if ($hostStatus && // host enabled
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules $robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
$hostImageLimit > $db->getTotalHostImages($hostId) && // images quantity not reached host limit $hostImageLimit > $db->getTotalHostImages($hostId) && // images quantity not reached host limit
!$hostImageId) { // image not exists !$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string))) { // image not exists
// Add host image // Add host image
if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) { if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) {
@ -284,19 +282,17 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
} }
} }
// Add host image description // Add/update host image description
$hostImageDescriptionCRC32id = crc32(md5((string) $imageAlt . (string) $imageTitle)); $db->setHostImageDescription($hostImageId,
crc32(md5((string) $imageAlt . (string) $imageTitle)),
Filter::imageAlt($imageAlt),
Filter::imageTitle($imageTitle),
time(),
time());
if (!$db->getHostImageDescription($hostImageId, $hostImageDescriptionCRC32id)) {
$db->addHostImageDescription($hostImageId, $hostImageDescriptionCRC32id, Filter::imageAlt($imageAlt), Filter::imageTitle($imageTitle), time());
}
// Relate host image with host page was found // Relate host image with host page was found
if (!$db->getHostImageToHostPage($hostImageId, $queueHostPage->hostPageId)) { $db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), time(), 1);
$db->addHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), null, 1);
} else {
$db->updateHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
}
// Increase page rank when link does not match the current host // Increase page rank when link does not match the current host
if ($hostImageURL->scheme . '://' . if ($hostImageURL->scheme . '://' .
@ -434,7 +430,7 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
if ($hostStatus && // host enabled if ($hostStatus && // host enabled
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {

BIN
database/yggo.mwb

Binary file not shown.

57
library/mysql.php

@ -103,13 +103,13 @@ class MySQL {
return $query->fetch()->total; return $query->fetch()->total;
} }
public function getHostImage(int $hostId, int $crc32uri) { public function getHostImageId(int $hostId, int $crc32uri) {
$query = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); $query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');
$query->execute([$hostId, $crc32uri]); $query->execute([$hostId, $crc32uri]);
return $query->fetch(); return $query->rowCount() ? $query->fetch()->hostImageId : 0;
} }
public function getHostImages(int $hostId) { public function getHostImages(int $hostId) {
@ -208,24 +208,19 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getHostImageDescription(int $hostImageId, int $crc32id) { public function setHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded, int $timeUpdated) {
$query = $this->_db->prepare('SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? AND `crc32id` = ? LIMIT 1');
$query->execute([$hostImageId, $crc32id]);
return $query->fetch();
}
public function addHostImageDescription(int $hostImageId, int $crc32id, string $alt, string $title, int $timeAdded) {
$query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`, $query = $this->_db->prepare('INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`, `crc32id`,
`alt`, `alt`,
`title`, `title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)'); `timeAdded`) VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE `alt` = ?,
`title` = ?,
`timeUpdated` = ?');
$query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded]); $query->execute([$hostImageId, $crc32id, $alt, $title, $timeAdded, $alt, $title, $timeUpdated]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -239,15 +234,6 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function getHostImageToHostPage(int $hostImageId, int $hostPageId) {
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ? AND `hostPageId` = ? LIMIT 1');
$query->execute([$hostImageId, $hostPageId]);
return $query->fetch();
}
public function getHostImageHostPages(int $hostImageId) { public function getHostImageHostPages(int $hostImageId) {
$query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ?'); $query = $this->_db->prepare('SELECT * FROM `hostImageToHostPage` WHERE `hostImageId` = ?');
@ -257,31 +243,20 @@ class MySQL {
return $query->fetchAll(); return $query->fetchAll();
} }
public function addHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) { public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, mixed $timeUpdated, int $quantity) {
$query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`, $query = $this->_db->prepare('INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`, `hostPageId`,
`timeAdded`, `timeAdded`,
`timeUpdated`, `timeUpdated`,
`quantity`) VALUES (?, ?, ?, ?, ?)'); `quantity`) VALUES (?, ?, ?, ?, ?)
$query->execute([$hostImageId, $hostPageId, $timeAdded, $timeUpdated, $quantity]);
return $query->rowCount(); // no primary key
}
public function updateHostImageToHostPage(int $hostImageId, int $hostPageId, int $timeAdded, int $quantity) { ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
`quantity` = `quantity` + ' . (int) $quantity);
$query = $this->_db->prepare('UPDATE `hostImageToHostPage` SET `quantity` = `quantity` + ' . (int) $quantity . ', `timeUpdated` = ? $query->execute([$hostImageId, $hostPageId, $timeAdded, $timeUpdated, $quantity, $timeUpdated]);
WHERE `hostImageId` = ? return $query->rowCount(); // no primary key
AND `hostPageId` = ?
LIMIT 1');
$query->execute([$timeAdded, $hostImageId, $hostPageId]);
return $query->rowCount();
} }
public function deleteHostImageToHostPage(int $hostImageId) { public function deleteHostImageToHostPage(int $hostImageId) {

Loading…
Cancel
Save