Browse Source

add mime content type crawling #1

main
ghost 2 years ago
parent
commit
702a14b634
  1. 2
      README.md
  2. 47
      crontab/crawler.php
  3. BIN
      database/yggo.mwb
  4. 7
      library/filter.php
  5. 25
      library/mysql.php
  6. 10
      public/search.php

2
README.md

@ -175,7 +175,7 @@ GET m=SphinxQL
* [x] Auto stop crawling on disk quota reached * [x] Auto stop crawling on disk quota reached
* [x] Transactions support to prevent data loss on queue failures * [x] Transactions support to prevent data loss on queue failures
* [x] Distributed index crawling between YGGo nodes through manifest API * [x] Distributed index crawling between YGGo nodes through manifest API
* [ ] MIME Content-type crawler settings * [x] MIME Content-type crawler settings
* [ ] Indexing new sites homepage in higher priority * [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing * [ ] Redirect codes extended processing
* [ ] Palette image index / filter * [ ] Palette image index / filter

47
crontab/crawler.php

@ -236,17 +236,14 @@ try {
continue; continue;
} }
// Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) {
// Skip image processing on MIME type not provided // Skip image processing on MIME type not provided
if (!$contentType = $curl->getContentType()) { if (!$hostImageContentType = $curl->getContentType()) {
continue; continue;
} }
// Skip image processing on MIME type not allowed in settings // Skip image processing on MIME type not allowed in settings
if (false === strpos($contentType, CRAWL_IMAGE_MIME_TYPE)) { if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) {
continue; continue;
} }
@ -258,7 +255,7 @@ try {
} }
// Convert remote image data to base64 string to prevent direct URL call // Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) { if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
continue; continue;
} }
@ -268,8 +265,10 @@ try {
continue; continue;
} }
$hostImagesIndexed += $db->updateHostImageData($hostImage->hostImageId, (string) 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64, time()); $hostImagesIndexed += $db->updateHostImage($hostImage->hostImageId,
} Filter::mime($hostImageContentType),
(!CRAWL_HOST_DEFAULT_META_ONLY ? 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64 : null),
time());
} }
// Process pages crawl queue // Process pages crawl queue
@ -344,11 +343,24 @@ try {
} }
} }
// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {
continue;
}
// Skip page links following by robots:nofollow attribute detected
if (false !== stripos($metaRobots, 'nofollow')) {
continue;
}
// Update queued page data // Update queued page data
$hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId,
Filter::pageTitle($title->item(0)->nodeValue), Filter::pageTitle($title->item(0)->nodeValue),
Filter::pageDescription($metaDescription), Filter::pageDescription($metaDescription),
Filter::pageKeywords($metaKeywords), Filter::pageKeywords($metaKeywords),
Filter::mime($contentType),
CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content));
// Update manifest registry // Update manifest registry
@ -364,18 +376,6 @@ try {
} }
} }
// Append page with meta robots:noindex value to the robotsPostfix disallow list
if (false !== stripos($metaRobots, 'noindex')) {
continue;
}
// Skip page links following by robots:nofollow attribute detected
if (false !== stripos($metaRobots, 'nofollow')) {
continue;
}
// Collect page images // Collect page images
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) { if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
@ -466,7 +466,7 @@ try {
// Init robots parser // Init robots parser
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES)); $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
// Save image info // Save new image info
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string)); $hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
if (!$hostImageId && // image not exists if (!$hostImageId && // image not exists
@ -475,7 +475,10 @@ try {
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit $hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
// Add host image // Add host image
if ($hostImageId = $db->addHostImage($hostId, crc32($hostImageURI->string), $hostImageURI->string, time(), null, 200)) { if ($hostImageId = $db->addHostImage($hostId,
crc32($hostImageURI->string),
$hostImageURI->string,
time())) {
$hostImagesAdded++; $hostImagesAdded++;

BIN
database/yggo.mwb

Binary file not shown.

7
library/filter.php

@ -9,6 +9,13 @@ class Filter {
return trim(urldecode($url)); return trim(urldecode($url));
} }
static public function mime(mixed $mime) {
$mime = (string) $mime;
return trim($mime);
}
static public function pageTitle(mixed $title) { static public function pageTitle(mixed $title) {
$title = (string) $title; $title = (string) $title;

25
library/mysql.php

@ -185,6 +185,7 @@ class MySQL {
int $timeAdded, int $timeAdded,
mixed $timeUpdated = null, mixed $timeUpdated = null,
mixed $httpCode = null, mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null, mixed $rank = null,
mixed $data = null) { mixed $data = null) {
@ -194,10 +195,11 @@ class MySQL {
`timeAdded`, `timeAdded`,
`timeUpdated`, `timeUpdated`,
`httpCode`, `httpCode`,
`mime`,
`rank`, `rank`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'); `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $data]); $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $data]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -224,13 +226,14 @@ class MySQL {
return $query->rowCount(); return $query->rowCount();
} }
public function updateHostImageData(int $hostImageId, public function updateHostImage(int $hostImageId,
string $data, string $mime,
mixed $data,
int $timeUpdated) { int $timeUpdated) {
$query = $this->_db->prepare('UPDATE `hostImage` SET `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1'); $query = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `data` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');
$query->execute([$data, $timeUpdated, $hostImageId]); $query->execute([$mime, $data, $timeUpdated, $hostImageId]);
return $query->rowCount(); return $query->rowCount();
} }
@ -439,6 +442,7 @@ class MySQL {
int $timeAdded, int $timeAdded,
mixed $timeUpdated = null, mixed $timeUpdated = null,
mixed $httpCode = null, mixed $httpCode = null,
mixed $mime = null,
mixed $rank = null, mixed $rank = null,
mixed $metaTitle = null, mixed $metaTitle = null,
mixed $metaDescription = null, mixed $metaDescription = null,
@ -451,13 +455,14 @@ class MySQL {
`timeAdded`, `timeAdded`,
`timeUpdated`, `timeUpdated`,
`httpCode`, `httpCode`,
`mime`,
`rank`, `rank`,
`metaTitle`, `metaTitle`,
`metaDescription`, `metaDescription`,
`metaKeywords`, `metaKeywords`,
`data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]); $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $mime, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]);
return $this->_db->lastInsertId(); return $this->_db->lastInsertId();
} }
@ -466,14 +471,16 @@ class MySQL {
mixed $metaTitle, mixed $metaTitle,
mixed $metaDescription, mixed $metaDescription,
mixed $metaKeywords, mixed $metaKeywords,
string $mime,
mixed $data) { mixed $data) {
$query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?, $query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?,
`metaDescription` = ?, `metaDescription` = ?,
`metaKeywords` = ?, `metaKeywords` = ?,
`mime` = ?,
`data` = ? WHERE `hostPageId` = ? LIMIT 1'); `data` = ? WHERE `hostPageId` = ? LIMIT 1');
$query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]); $query->execute([$metaTitle, $metaDescription, $metaKeywords, $mime, $data, $hostPageId]);
return $query->rowCount(); return $query->rowCount();
} }

10
public/search.php

@ -353,17 +353,17 @@ if (!empty($q)) {
$db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time()); $db->updateHostImageHttpCode($hostImage->hostImageId, (int) $hostImageHttpCode, time());
if (200 != $hostImageHttpCode) continue; if (200 != $hostImageHttpCode) continue;
if (!$hostImageContentType = $hostImageCurl->getContentType()) continue;
if (false === strpos($hostImageContentType, CRAWL_IMAGE_MIME_TYPE)) continue;
// Convert remote image data to base64 string to prevent direct URL call // Convert remote image data to base64 string to prevent direct URL call
if (!$hostImageType = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue; if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) continue;
if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue; if (!$hostImageBase64 = @base64_encode($hostImageCurl->getContent())) continue;
$hostImageURLencoded = 'data:image/' . $hostImageType . ';base64,' . $hostImageBase64; $hostImageURLencoded = 'data:image/' . $hostImageExtension . ';base64,' . $hostImageBase64;
// Save image content on data settings enabled // Save image content on data settings enabled
if (!CRAWL_HOST_DEFAULT_META_ONLY) { $db->updateHostImage($hostImage->hostImageId, Filter::mime($hostImageContentType), (!CRAWL_HOST_DEFAULT_META_ONLY ? $hostImageURLencoded : null), time());
$db->updateHostImageData($hostImage->hostImageId, (string) $hostImageURLencoded, time());
}
// Local image data exists // Local image data exists
} else { } else {

Loading…
Cancel
Save