mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 21:44:59 +00:00
add mime content type crawling #1
This commit is contained in:
parent
0bd95d7f4d
commit
702a14b634
@ -175,7 +175,7 @@ GET m=SphinxQL
|
|||||||
* [x] Auto stop crawling on disk quota reached
|
* [x] Auto stop crawling on disk quota reached
|
||||||
* [x] Transactions support to prevent data loss on queue failures
|
* [x] Transactions support to prevent data loss on queue failures
|
||||||
* [x] Distributed index crawling between YGGo nodes through manifest API
|
* [x] Distributed index crawling between YGGo nodes through manifest API
|
||||||
* [ ] MIME Content-type crawler settings
|
* [x] MIME Content-type crawler settings
|
||||||
* [ ] Indexing new sites homepage in higher priority
|
* [ ] Indexing new sites homepage in higher priority
|
||||||
* [ ] Redirect codes extended processing
|
* [ ] Redirect codes extended processing
|
||||||
* [ ] Palette image index / filter
|
* [ ] Palette image index / filter
|
||||||
|
@ -236,40 +236,39 @@ try {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save image content on data settings enabled
|
// Skip image processing on MIME type not provided
|
||||||
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
|
if (!$hostImageContentType = $curl->getContentType()) {
|
||||||
|
|
||||||
// Skip image processing on MIME type not provided
|
continue;
|
||||||
if (!$contentType = $curl->getContentType()) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip image processing on MIME type not allowed in settings
|
|
||||||
if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip image processing without returned content
|
|
||||||
if (!$content = $curl->getContent()) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert remote image data to base64 string to prevent direct URL call
|
|
||||||
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Skip image processing on MIME type not allowed in settings
|
||||||
|
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip image processing without returned content
|
||||||
|
if (!$content = $curl->getContent()) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert remote image data to base64 string to prevent direct URL call
|
||||||
|
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!$hostImageBase64 = @base64_encode($curl->getContent())) {
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
|
||||||
|
Filter::mime($hostImageContentType),
|
||||||
|
(!CRAWL_HOST_DEFAULT_META_ONLY ? 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64 : null),
|
||||||
|
time());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process pages crawl queue
|
// Process pages crawl queue
|
||||||
@ -344,26 +343,6 @@ try {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update queued page data
|
|
||||||
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
|
||||||
Filter::pageTitle($title->item(0)->nodeValue),
|
|
||||||
Filter::pageDescription($metaDescription),
|
|
||||||
Filter::pageKeywords($metaKeywords),
|
|
||||||
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
|
||||||
|
|
||||||
// Update manifest registry
|
|
||||||
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
|
||||||
|
|
||||||
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
|
||||||
|
|
||||||
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
|
||||||
$db->addManifest($metaYggoManifestCRC32,
|
|
||||||
$metaYggoManifest,
|
|
||||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
|
||||||
time());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
// Append page with meta robots:noindex value to the robotsPostfix disallow list
|
||||||
if (false !== stripos($metaRobots, 'noindex')) {
|
if (false !== stripos($metaRobots, 'noindex')) {
|
||||||
|
|
||||||
@ -376,6 +355,27 @@ try {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update queued page data
|
||||||
|
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
|
||||||
|
Filter::pageTitle($title->item(0)->nodeValue),
|
||||||
|
Filter::pageDescription($metaDescription),
|
||||||
|
Filter::pageKeywords($metaKeywords),
|
||||||
|
Filter::mime($contentType),
|
||||||
|
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
|
||||||
|
|
||||||
|
// Update manifest registry
|
||||||
|
if (CRAWL_MANIFEST && !empty($metaYggoManifest) && filter_var($metaYggoManifest, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $metaYggoManifest)) {
|
||||||
|
|
||||||
|
$metaYggoManifestCRC32 = crc32($metaYggoManifest);
|
||||||
|
|
||||||
|
if (!$db->getManifest($metaYggoManifestCRC32)) {
|
||||||
|
$db->addManifest($metaYggoManifestCRC32,
|
||||||
|
$metaYggoManifest,
|
||||||
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
||||||
|
time());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Collect page images
|
// Collect page images
|
||||||
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
|
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
|
||||||
|
|
||||||
@ -402,7 +402,7 @@ try {
|
|||||||
|
|
||||||
$imageSrc = $queueHostPage->scheme . '://' .
|
$imageSrc = $queueHostPage->scheme . '://' .
|
||||||
$queueHostPage->name .
|
$queueHostPage->name .
|
||||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
|
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -466,16 +466,19 @@ try {
|
|||||||
// Init robots parser
|
// Init robots parser
|
||||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||||
|
|
||||||
// Save image info
|
// Save new image info
|
||||||
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
|
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
|
||||||
|
|
||||||
if (!$hostImageId && // image not exists
|
if (!$hostImageId && // image not exists
|
||||||
$hostStatus && // host enabled
|
$hostStatus && // host enabled
|
||||||
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
|
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
|
||||||
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
|
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
|
||||||
|
|
||||||
// Add host image
|
// Add host image
|
||||||
if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) {
|
if ($hostImageId = $db->addHostImage($hostId,
|
||||||
|
crc32($hostImageURI->string),
|
||||||
|
$hostImageURI->string,
|
||||||
|
time())) {
|
||||||
|
|
||||||
$hostImagesAdded++;
|
$hostImagesAdded++;
|
||||||
|
|
||||||
|
Binary file not shown.
@ -9,6 +9,13 @@ class Filter {
|
|||||||
return trim(urldecode($url));
|
return trim(urldecode($url));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static public function mime(mixed $mime) {
|
||||||
|
|
||||||
|
$mime = (string) $mime;
|
||||||
|
|
||||||
|
return trim($mime);
|
||||||
|
}
|
||||||
|
|
||||||
static public function pageTitle(mixed $title) {
|
static public function pageTitle(mixed $title) {
|
||||||
|
|
||||||
$title = (string) $title;
|
$title = (string) $title;
|
||||||
|
@ -180,13 +180,14 @@ class MySQL {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public function addHostImage(int $hostId,
|
public function addHostImage(int $hostId,
|
||||||
int $crc32uri,
|
int $crc32uri,
|
||||||
string $uri,
|
string $uri,
|
||||||
int $timeAdded,
|
int $timeAdded,
|
||||||
mixed $timeUpdated = null,
|
mixed $timeUpdated = null,
|
||||||
mixed $httpCode = null,
|
mixed $httpCode = null,
|
||||||
mixed $rank = null,
|
mixed $mime = null,
|
||||||
mixed $data = null) {
|
mixed $rank = null,
|
||||||
|
mixed $data = null) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
|
$query = $this->_db->prepare('INSERT INTO `hostImage` ( `hostId`,
|
||||||
`crc32uri`,
|
`crc32uri`,
|
||||||
@ -194,10 +195,11 @@ class MySQL {
|
|||||||
`timeAdded`,
|
`timeAdded`,
|
||||||
`timeUpdated`,
|
`timeUpdated`,
|
||||||
`httpCode`,
|
`httpCode`,
|
||||||
|
`mime`,
|
||||||
`rank`,
|
`rank`,
|
||||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
|
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $data]);
|
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $data]);
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
@ -224,13 +226,14 @@ class MySQL {
|
|||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function updateHostImageData(int $hostImageId,
|
public function updateHostImage(int $hostImageId,
|
||||||
string $data,
|
string $mime,
|
||||||
int $timeUpdated) {
|
mixed $data,
|
||||||
|
int $timeUpdated) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostImage` SET `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
|
$query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
|
||||||
|
|
||||||
$query->execute([$data, $timeUpdated, $hostImageId]);
|
$query->execute([$mime, $data, $timeUpdated, $hostImageId]);
|
||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
@ -439,6 +442,7 @@ class MySQL {
|
|||||||
int $timeAdded,
|
int $timeAdded,
|
||||||
mixed $timeUpdated = null,
|
mixed $timeUpdated = null,
|
||||||
mixed $httpCode = null,
|
mixed $httpCode = null,
|
||||||
|
mixed $mime = null,
|
||||||
mixed $rank = null,
|
mixed $rank = null,
|
||||||
mixed $metaTitle = null,
|
mixed $metaTitle = null,
|
||||||
mixed $metaDescription = null,
|
mixed $metaDescription = null,
|
||||||
@ -451,13 +455,14 @@ class MySQL {
|
|||||||
`timeAdded`,
|
`timeAdded`,
|
||||||
`timeUpdated`,
|
`timeUpdated`,
|
||||||
`httpCode`,
|
`httpCode`,
|
||||||
|
`mime`,
|
||||||
`rank`,
|
`rank`,
|
||||||
`metaTitle`,
|
`metaTitle`,
|
||||||
`metaDescription`,
|
`metaDescription`,
|
||||||
`metaKeywords`,
|
`metaKeywords`,
|
||||||
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||||
|
|
||||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
|
||||||
|
|
||||||
return $this->_db->lastInsertId();
|
return $this->_db->lastInsertId();
|
||||||
}
|
}
|
||||||
@ -466,14 +471,16 @@ class MySQL {
|
|||||||
mixed $metaTitle,
|
mixed $metaTitle,
|
||||||
mixed $metaDescription,
|
mixed $metaDescription,
|
||||||
mixed $metaKeywords,
|
mixed $metaKeywords,
|
||||||
|
string $mime,
|
||||||
mixed $data) {
|
mixed $data) {
|
||||||
|
|
||||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
|
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
|
||||||
`metaDescription` = ?,
|
`metaDescription` = ?,
|
||||||
`metaKeywords` = ?,
|
`metaKeywords` = ?,
|
||||||
|
`mime` = ?,
|
||||||
`data` = ? WHERE `hostPageId` = ? LIMIT 1');
|
`data` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||||
|
|
||||||
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]);
|
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $hostPageId]);
|
||||||
|
|
||||||
return $query->rowCount();
|
return $query->rowCount();
|
||||||
}
|
}
|
||||||
|
@ -353,17 +353,17 @@ if (!empty($q)) {
|
|||||||
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());
|
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());
|
||||||
|
|
||||||
if (200 != $hostImageHttpCode) continue;
|
if (200 != $hostImageHttpCode) continue;
|
||||||
|
if (!$hostImageContentType = $hostImageCurl->getContentType()) continue;
|
||||||
|
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) continue;
|
||||||
|
|
||||||
// Convert remote image data to base64 string to prevent direct URL call
|
// Convert remote image data to base64 string to prevent direct URL call
|
||||||
if (!$hostImageType = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
|
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
|
||||||
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;
|
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;
|
||||||
|
|
||||||
$hostImageURLencoded = 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64;
|
$hostImageURLencoded = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
|
||||||
|
|
||||||
// Save image content on data settings enabled
|
// Save image content on data settings enabled
|
||||||
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
|
$db->updateHostImage($hostImage->hostImageId, Filter::mime($hostImageContentType), (!CRAWL_HOST_DEFAULT_META_ONLY ? $hostImageURLencoded : null), time());
|
||||||
$db->updateHostImageData($hostImage->hostImageId, (string) $hostImageURLencoded, time());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Local image data exists
|
// Local image data exists
|
||||||
} else {
|
} else {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user