mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-03-13 05:41:02 +00:00
refactor to mime-based content index #1
This commit is contained in:
parent
272a885039
commit
db0e66c846
17
README.md
17
README.md
@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
|
||||
```
|
||||
GET action=search - required
|
||||
GET query={string} - optional, search request, empty if not provided
|
||||
GET type={string} - optional, search type, image|default or empty
|
||||
GET type={string} - optional, filter by one of the available MIME types, or empty
|
||||
GET page={int} - optional, search results page, 1 if not provided
|
||||
GET mode=SphinxQL - optional, enable extended SphinxQL syntax
|
||||
```
|
||||
@ -141,7 +141,7 @@ GET m=SphinxQL
|
||||
##### Basic features
|
||||
|
||||
* [x] Web pages full text ranking search
|
||||
* [x] Images search with safe proxy preview support
|
||||
* [x] MIME filtering search with safe proxy images preview
|
||||
* [x] Extended syntax support
|
||||
* [x] Flexible settings compatible with IPv4/IPv6 networks
|
||||
|
||||
@ -159,18 +159,14 @@ GET m=SphinxQL
|
||||
* [ ] Index API
|
||||
+ [x] Manifest
|
||||
+ [x] Search
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
+ [x] Hosts
|
||||
+ [ ] Pages
|
||||
+ [ ] Images
|
||||
+ [ ] MIME list
|
||||
* [ ] Context advertising API
|
||||
|
||||
##### Crawler
|
||||
|
||||
* [x] Auto crawl links by regular expression rules
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
+ [x] Manifests
|
||||
* [x] Robots.txt / robots meta tags support (#2)
|
||||
* [x] Specific rules configuration for every host
|
||||
@ -181,8 +177,6 @@ GET m=SphinxQL
|
||||
* [x] Ban non-condition links to prevent extra requests
|
||||
* [x] Debug log
|
||||
* [x] History snaps
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
* [ ] Indexing new sites homepage in higher priority
|
||||
* [ ] Redirect codes extended processing
|
||||
* [ ] Palette image index / filter
|
||||
@ -191,17 +185,12 @@ GET m=SphinxQL
|
||||
##### Cleaner
|
||||
* [x] Deprecated DB items auto deletion / host settings update
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
+ [x] Manifests
|
||||
+ [x] Logs
|
||||
+ [x] Crawler
|
||||
+ [x] Cleaner
|
||||
* [x] Deprecated history snaps removing
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
* [x] Banned resources reset by timeout
|
||||
+ [x] Pages
|
||||
+ [x] Images
|
||||
* [x] Debug log
|
||||
|
||||
##### Other
|
||||
|
@ -47,7 +47,7 @@ error_reporting(E_ALL);
|
||||
* Project domain, without a trailing slash
|
||||
*
|
||||
*/
|
||||
define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));
|
||||
define('WEBSITE_DOMAIN', '');
|
||||
|
||||
/*
|
||||
* Number of page search results shown before the read more link
|
||||
@ -55,18 +55,6 @@ define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on'
|
||||
*/
|
||||
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);
|
||||
|
||||
/*
|
||||
* Image search results before show the read more link
|
||||
*
|
||||
*/
|
||||
define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);
|
||||
|
||||
/*
|
||||
* Quantity of related pages for each image in the search results
|
||||
*
|
||||
*/
|
||||
define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
|
||||
|
||||
/*
|
||||
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
|
||||
*
|
||||
@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
|
||||
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);
|
||||
|
||||
// Database
|
||||
define('DB_HOST', 'localhost');
|
||||
define('DB_HOST', '127.0.0.1');
|
||||
define('DB_PORT', 3306);
|
||||
define('DB_NAME', '');
|
||||
define('DB_USERNAME', '');
|
||||
@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
|
||||
*/
|
||||
define('CRAWL_PAGE_LIMIT', 20);
|
||||
|
||||
/*
|
||||
* Images (URI) processing limit in the crawler.php queue
|
||||
*
|
||||
* This option related to CRAWL_IMAGE_SECONDS_OFFSET value
|
||||
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
|
||||
*
|
||||
* Usually up to 20 pages per minute,
|
||||
* to prevent websites overload by sending GET crawling requests
|
||||
*
|
||||
* Set 0 to disable
|
||||
*
|
||||
*/
|
||||
define('CRAWL_IMAGE_LIMIT', 10);
|
||||
|
||||
/*
|
||||
* Manifest (URI) processing limit in the crawler.php queue
|
||||
*
|
||||
@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
|
||||
* comma separated
|
||||
*
|
||||
*/
|
||||
define('CRAWL_PAGE_MIME', 'text/html');
|
||||
|
||||
/*
|
||||
* Index images match MIME types
|
||||
*
|
||||
* comma separated
|
||||
*
|
||||
*/
|
||||
define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
|
||||
|
||||
/*
|
||||
* Renew image index by timing offset provided
|
||||
*
|
||||
* This option works with CRAWL_IMAGE_LIMIT step queue
|
||||
*
|
||||
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
|
||||
* must have enough value to crawl all images collected in the DB index
|
||||
*
|
||||
* or the crawler can stuck in queue
|
||||
*
|
||||
*/
|
||||
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
|
||||
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
|
||||
|
||||
/*
|
||||
* Renew manifests index by timing offset provided
|
||||
@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
|
||||
* Only URL addresses matching this rule will be auto-crawled
|
||||
*
|
||||
*/
|
||||
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
|
||||
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');
|
||||
|
||||
/*
|
||||
* Pages limit per new host by default
|
||||
@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u
|
||||
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
|
||||
*
|
||||
*/
|
||||
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
|
||||
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);
|
||||
|
||||
/*
|
||||
* Set default auto-crawl status for new host added
|
||||
@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
|
||||
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
|
||||
*
|
||||
* This option is able to change search results relevance
|
||||
* This option enables image data caching in base64
|
||||
*
|
||||
*/
|
||||
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
||||
@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false);
|
||||
*/
|
||||
define('CRAWL_HOST_DEFAULT_NSFW', false);
|
||||
|
||||
/*
|
||||
* Not suitable/safe for work status for new host by default
|
||||
*
|
||||
* Could be filtered in crawl conditions or search results
|
||||
*
|
||||
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
|
||||
*
|
||||
*/
|
||||
define('CRAWL_HOST_DEFAULT_NSFW', false);
|
||||
|
||||
/*
|
||||
* Default robots.txt rules used when the remote file does not exist
|
||||
* The crawler is able to overwrite these rules
|
||||
@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true);
|
||||
* Manifest API version compatibility
|
||||
*
|
||||
*/
|
||||
define('CRAWL_MANIFEST_API_VERSION', 0.7);
|
||||
define('CRAWL_MANIFEST_API_VERSION', 0.8);
|
||||
|
||||
/*
|
||||
* Set default auto-crawl status for new manifest added
|
||||
@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||
*/
|
||||
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
||||
|
||||
/*
|
||||
* Remove image ban after following time
|
||||
*
|
||||
* This option used in crawler and search page
|
||||
* to prevent extra http requests to unavailable or not condition resources
|
||||
*
|
||||
*/
|
||||
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
|
||||
|
||||
/*
|
||||
* Remove image description history after following time
|
||||
*
|
||||
*/
|
||||
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);
|
||||
|
||||
// API settings
|
||||
|
||||
@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS',
|
||||
`host`.`name`,
|
||||
`host`.`port`,
|
||||
`host`.`crawlPageLimit`,
|
||||
`host`.`crawlImageLimit`,
|
||||
`host`.`robots`,
|
||||
`host`.`robotsPostfix`,
|
||||
`host`.`nsfw`,
|
||||
`host`.`timeAdded`,
|
||||
`host`.`timeUpdated`,
|
||||
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
|
||||
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
|
||||
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');
|
||||
|
||||
/*
|
||||
* Manifest API
|
||||
|
@ -12,36 +12,24 @@ source common
|
||||
source hostPage : common
|
||||
{
|
||||
sql_query = \
|
||||
SELECT hostPage.hostPageId, \
|
||||
hostPage.rank, \
|
||||
hostPage.uri, \
|
||||
host.name, \
|
||||
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
|
||||
hostPageDescription.metaDescription, \
|
||||
hostPageDescription.metaKeywords) \
|
||||
FROM hostPageDescription \
|
||||
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
|
||||
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
|
||||
LIMIT 1) AS pageDescription \
|
||||
FROM hostPage \
|
||||
JOIN host ON (host.hostId = hostPage.hostId) \
|
||||
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
|
||||
SELECT `hostPage`.`hostPageId`, \
|
||||
`hostPage`.`uri`, \
|
||||
`host`.`name`, \
|
||||
REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \
|
||||
(SELECT COUNT(*) FROM `hostPageToHostPage` \
|
||||
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
|
||||
AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \
|
||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
|
||||
`hostPageDescription`.`description`, \
|
||||
`hostPageDescription`.`keywords`)) \
|
||||
FROM `hostPageDescription` \
|
||||
WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
|
||||
FROM `hostPage` \
|
||||
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
|
||||
WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL
|
||||
|
||||
sql_attr_uint = rank
|
||||
}
|
||||
|
||||
source hostImage : common
|
||||
{
|
||||
sql_query = \
|
||||
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
|
||||
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
|
||||
FROM hostImageDescription \
|
||||
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \
|
||||
FROM hostImage \
|
||||
JOIN host ON (host.hostId = hostImage.hostId) \
|
||||
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \
|
||||
|
||||
sql_attr_uint = rank
|
||||
sql_attr_uint = rank
|
||||
sql_attr_string = mime
|
||||
}
|
||||
|
||||
index hostPage
|
||||
@ -49,11 +37,4 @@ index hostPage
|
||||
source = hostPage
|
||||
morphology = stem_enru, stem_cz, stem_ar
|
||||
path = /var/lib/sphinxsearch/data/hostPage
|
||||
}
|
||||
|
||||
index hostImage
|
||||
{
|
||||
source = hostImage
|
||||
morphology = stem_enru, stem_cz, stem_ar
|
||||
path = /var/lib/sphinxsearch/data/hostImage
|
||||
}
|
@ -31,11 +31,8 @@ $manifestsTotal = $db->getTotalManifests();
|
||||
$hostsUpdated = 0;
|
||||
$hostPagesDeleted = 0;
|
||||
$hostPageDescriptionsDeleted = 0;
|
||||
$hostImagesDeleted = 0;
|
||||
$hostImageDescriptionsDeleted = 0;
|
||||
$manifestsDeleted = 0;
|
||||
$hostPagesBansRemoved = 0;
|
||||
$hostImagesBansRemoved = 0;
|
||||
|
||||
$logsCleanerDeleted = 0;
|
||||
$logsCrawlerDeleted = 0;
|
||||
@ -56,7 +53,7 @@ try {
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
@ -69,22 +66,6 @@ try {
|
||||
// Update host data
|
||||
$hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());
|
||||
|
||||
// Apply host images limits
|
||||
$totalHostImages = $db->getTotalHostImages($host->hostId);
|
||||
|
||||
if ($totalHostImages > $host->crawlImageLimit) {
|
||||
|
||||
foreach ((array) $db->getHostImagesByLimit($host->hostId, $totalHostImages - $host->crawlImageLimit) as $hostImage) {
|
||||
|
||||
// Delete foreign key relations
|
||||
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||
|
||||
// Delete host image
|
||||
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply host pages limits
|
||||
$totalHostPages = $db->getTotalHostPages($host->hostId);
|
||||
|
||||
@ -92,56 +73,32 @@ try {
|
||||
|
||||
foreach ((array) $db->getHostPagesByLimit($host->hostId, $totalHostPages - $host->crawlPageLimit) as $hostPage) {
|
||||
|
||||
// Delete foreign key relations
|
||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply new robots.txt rules
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($host->robotsPostfix ? (string) $host->robotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
|
||||
foreach ($db->getHostImages($host->hostId) as $hostImage) {
|
||||
|
||||
if (!$robots->uriAllowed($hostImage->uri)) {
|
||||
|
||||
// Delete foreign key relations
|
||||
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||
|
||||
// Delete host image
|
||||
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||
}
|
||||
}
|
||||
|
||||
foreach ($db->getHostPages($host->hostId) as $hostPage) {
|
||||
|
||||
if (!$robots->uriAllowed($hostPage->uri)) {
|
||||
|
||||
// Delete foreign key relations
|
||||
$db->deleteHostPageToHostImage($hostPage->hostPageId);
|
||||
|
||||
// Delete host page
|
||||
$db->deleteHostPageDescriptions($hostPage->hostPageId);
|
||||
$db->deleteHostPageToHostPage($hostPage->hostPageId);
|
||||
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
if ($hostPage->uri != '/') {
|
||||
$hostPagesDeleted += $db->deleteHostPage($hostPage->hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up host images unrelated to host pages
|
||||
foreach ($db->getUnrelatedHostImages() as $hostImage) {
|
||||
|
||||
// Delete foreign key relations
|
||||
$db->deleteHostImageDescription($hostImage->hostImageId);
|
||||
$db->deleteHostImageToHostPage($hostImage->hostImageId);
|
||||
|
||||
// Delete host image
|
||||
$hostImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up deprecated manifests
|
||||
@ -207,12 +164,6 @@ try {
|
||||
// Delete page description history
|
||||
$hostPageDescriptionsDeleted += $db->deleteHostPageDescriptionsByTimeAdded(time() - CLEAN_PAGE_DESCRIPTION_OFFSET);
|
||||
|
||||
// Reset banned images
|
||||
$hostImagesBansRemoved += $db->resetBannedHostImages(time() - CLEAN_IMAGE_BAN_SECONDS_OFFSET);
|
||||
|
||||
// Delete image description history
|
||||
$hostImageDescriptionsDeleted += $db->deleteHostImageDescriptionsByTimeAdded(time() - CLEAN_IMAGE_DESCRIPTION_OFFSET);
|
||||
|
||||
// Delete deprecated logs
|
||||
$logsCleanerDeleted += $db->deleteLogCleaner(time() - CLEAN_LOG_SECONDS_OFFSET);
|
||||
$logsCrawlerDeleted += $db->deleteLogCrawler(time() - CRAWL_LOG_SECONDS_OFFSET);
|
||||
@ -238,9 +189,6 @@ if (CLEAN_LOG_ENABLED) {
|
||||
$hostPagesDeleted,
|
||||
$hostPageDescriptionsDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$hostImagesDeleted,
|
||||
$hostImageDescriptionsDeleted,
|
||||
$hostImagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
$manifestsDeleted,
|
||||
$logsCleanerDeleted,
|
||||
@ -256,15 +204,12 @@ if (CLEAN_LOG_ENABLED) {
|
||||
echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
|
||||
echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
|
||||
echo 'Hosts pages deleted: ' . $hostPagesDeleted . PHP_EOL;
|
||||
echo 'Hosts images deleted: ' . $hostImagesDeleted . PHP_EOL;
|
||||
|
||||
echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
|
||||
echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
|
||||
|
||||
echo 'Host page bans removed: ' . $hostPagesBansRemoved . PHP_EOL;
|
||||
echo 'Host page descriptions deleted: ' . $hostPageDescriptionsDeleted . PHP_EOL;
|
||||
echo 'Host images bans removed: ' . $hostImagesBansRemoved . PHP_EOL;
|
||||
echo 'Host image descriptions deleted: ' . $hostImageDescriptionsDeleted . PHP_EOL;
|
||||
|
||||
echo 'Cleaner logs deleted: ' . $logsCleanerDeleted . PHP_EOL;
|
||||
echo 'Crawler logs deleted: ' . $logsCrawlerDeleted . PHP_EOL;
|
||||
|
@ -33,16 +33,12 @@ $httpDownloadSizeTotal = 0;
|
||||
$httpRequestsTimeTotal = 0;
|
||||
|
||||
$hostPagesProcessed = 0;
|
||||
$hostImagesProcessed = 0;
|
||||
$manifestsProcessed = 0;
|
||||
$hostPagesIndexed = 0;
|
||||
$hostImagesIndexed = 0;
|
||||
$manifestsAdded = 0;
|
||||
$hostPagesAdded = 0;
|
||||
$hostImagesAdded = 0;
|
||||
$hostsAdded = 0;
|
||||
$hostPagesBanned = 0;
|
||||
$hostImagesBanned = 0;
|
||||
|
||||
// Connect database
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
@ -121,7 +117,7 @@ try {
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
@ -167,26 +163,15 @@ try {
|
||||
// Validate formatted link
|
||||
if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
|
||||
|
||||
// Host exists
|
||||
if ($host = $db->getHost(crc32($hostURL))) {
|
||||
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostImageLimit = $host->crawlImageLimit;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
|
||||
// Register new host
|
||||
} else {
|
||||
// Host not exists
|
||||
if (!$db->getHost(crc32($hostURL))) {
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
@ -198,160 +183,35 @@ try {
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
|
||||
|
||||
$hostId = $db->addHost($remoteManifestHosts->result->scheme,
|
||||
$remoteManifestHosts->result->name,
|
||||
$remoteManifestHosts->result->port,
|
||||
crc32($hostURL),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
$hostImageLimit,
|
||||
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
$hostId = $db->addHost( $remoteManifestHosts->result->scheme,
|
||||
$remoteManifestHosts->result->name,
|
||||
$remoteManifestHosts->result->port,
|
||||
crc32($hostURL),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
if ($hostId) {
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
|
||||
$hostsAdded++;
|
||||
|
||||
} else {
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Init robots parser
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
|
||||
// Save home page info
|
||||
// Until page API not implemented, save at least home page to have ability to crawl
|
||||
// @TODO
|
||||
if ($hostStatus && // host enabled
|
||||
$robots->uriAllowed('/') && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||
!$db->getHostPage($hostId, crc32('/'))) { // page not exists
|
||||
|
||||
if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
|
||||
|
||||
$hostPagesAdded++;
|
||||
}
|
||||
// Increase counters
|
||||
$hostPagesAdded++;
|
||||
$hostsAdded++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process images crawl queue
|
||||
foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostImageURL = $queueHostImage->scheme . '://' . $queueHostImage->name . ($queueHostImage->port ? ':' . $queueHostImage->port : false) . $queueHostImage->uri;
|
||||
|
||||
// Init image request
|
||||
$curl = new Curl($queueHostImageURL, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
// Update image index anyway, with the current time and http code
|
||||
$hostImagesProcessed += $db->updateHostImageCrawlQueue($queueHostImage->hostImageId, time(), $curl->getCode());
|
||||
|
||||
// Skip image processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->updateHostImageHttpCode($queueHostImage->hostImageId, $curl->getCode(), time());
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not provided
|
||||
if (!$hostImageContentType = $curl->getContentType()) {
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not allowed in settings
|
||||
$hostImageBanned = true;
|
||||
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
|
||||
|
||||
if (false !== strpos($hostImageContentType, trim($mime))) {
|
||||
|
||||
$hostImageBanned = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ($hostImageBanned) {
|
||||
|
||||
$db->updateHostImageMime($queueHostImage->hostImageId, Filter::mime($hostImageContentType), time());
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Convert remote image data to base64 string
|
||||
if (!$queueHostImage->crawlMetaOnly) {
|
||||
|
||||
// Skip image processing without returned content
|
||||
if (!$hostImageContent = $curl->getContent()) {
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageExtension = @pathinfo($queueHostImageURL, PATHINFO_EXTENSION)) {
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($queueHostImage->hostImageId, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$hostImageData = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
|
||||
|
||||
// Set host image description
|
||||
// On link collection we knew meta but data,
|
||||
// this step use latest description slice and insert the data received by curl request
|
||||
if ($lastHostImageDescription = $db->getLastHostImageDescription($queueHostImage->hostImageId)) {
|
||||
|
||||
$db->setHostImageDescription($queueHostImage->hostImageId,
|
||||
crc32($lastHostImageDescription->alt .
|
||||
$lastHostImageDescription->title .
|
||||
$hostImageData),
|
||||
$lastHostImageDescription->alt,
|
||||
$lastHostImageDescription->title,
|
||||
$hostImageData,
|
||||
time(),
|
||||
time());
|
||||
}
|
||||
}
|
||||
|
||||
$hostImagesIndexed += $db->updateHostImage($queueHostImage->hostImageId,
|
||||
Filter::mime($hostImageContentType),
|
||||
time());
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
|
||||
@ -476,12 +336,11 @@ try {
|
||||
time());
|
||||
|
||||
// Add queued page description if not exists
|
||||
$db->setHostPageDescription($queueHostPage->hostPageId,
|
||||
crc32($content),
|
||||
$db->addHostPageDescription($queueHostPage->hostPageId,
|
||||
Filter::pageTitle($title->item(0)->nodeValue),
|
||||
Filter::pageDescription($metaDescription),
|
||||
Filter::pageKeywords($metaKeywords),
|
||||
$queueHostPage->crawlMetaOnly ? null : Filter::string($content),
|
||||
$queueHostPage->crawlMetaOnly ? null : base64_encode($content),
|
||||
time());
|
||||
|
||||
// Update manifest registry
|
||||
@ -499,155 +358,42 @@ try {
|
||||
}
|
||||
}
|
||||
|
||||
// Collect page images
|
||||
if (CRAWL_HOST_DEFAULT_IMAGES_LIMIT > 0) {
|
||||
// Init links registry
|
||||
$links = [];
|
||||
|
||||
foreach (@$dom->getElementsByTagName('img') as $img) {
|
||||
// Collect image links
|
||||
foreach (@$dom->getElementsByTagName('img') as $img) {
|
||||
|
||||
// Skip images without src attribute
|
||||
if (!$imageSrc = @$img->getAttribute('src')) {
|
||||
// Skip images without src attribute
|
||||
if (!$src = @$img->getAttribute('src')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip images without alt attribute
|
||||
if (!$imageAlt = @$img->getAttribute('alt')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$imageTitle = @$img->getAttribute('title')) {
|
||||
$imageTitle = null;
|
||||
}
|
||||
|
||||
// Add domain to the relative src links
|
||||
if (!parse_url($imageSrc, PHP_URL_HOST)) {
|
||||
|
||||
$imageSrc = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $imageSrc), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted src link
|
||||
if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {
|
||||
|
||||
// Parse formatted src link
|
||||
$hostImageURL = Parser::hostURL($imageSrc);
|
||||
$hostImageURI = Parser::uri($imageSrc);
|
||||
|
||||
// Host exists
|
||||
if ($host = $db->getHost(crc32($hostImageURL->string))) {
|
||||
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostImageLimit = $host->crawlImageLimit;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
|
||||
// Register new host
|
||||
} else {
|
||||
|
||||
// Get robots.txt if exists
|
||||
$curl = new Curl($hostImageURL->string . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
$httpRequestsTotal++;
|
||||
$httpRequestsSizeTotal += $curl->getSizeRequest();
|
||||
$httpDownloadSizeTotal += $curl->getSizeDownload();
|
||||
$httpRequestsTimeTotal += $curl->getTotalTime();
|
||||
|
||||
if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
|
||||
$hostRobots = $curl->getContent();
|
||||
} else {
|
||||
$hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
|
||||
}
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
|
||||
$hostId = $db->addHost($hostImageURL->scheme,
|
||||
$hostImageURL->name,
|
||||
$hostImageURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
$hostImageLimit,
|
||||
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
if ($hostId) {
|
||||
|
||||
$hostsAdded++;
|
||||
|
||||
} else {
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Init robots parser
|
||||
$robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));
|
||||
|
||||
// Save new image info
|
||||
$hostImageId = $db->getHostImageId($hostId, crc32($hostImageURI->string));
|
||||
|
||||
if (!$hostImageId && // image not exists
|
||||
$hostStatus && // host enabled
|
||||
$robots->uriAllowed($hostImageURI->string) && // src allowed by robots.txt rules
|
||||
$hostImageLimit > $db->getTotalHostImages($hostId)) { // images quantity not reached host limit
|
||||
|
||||
// Add host image
|
||||
if ($hostImageId = $db->addHostImage($hostId,
|
||||
crc32($hostImageURI->string),
|
||||
$hostImageURI->string,
|
||||
time())) {
|
||||
|
||||
$hostImagesAdded++;
|
||||
|
||||
} else {
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Add/update host image description
|
||||
$imageAlt = Filter::imageAlt($imageAlt);
|
||||
$imageTitle = Filter::imageTitle($imageTitle);
|
||||
|
||||
$db->setHostImageDescription($hostImageId,
|
||||
crc32($imageAlt . $imageTitle),
|
||||
$imageAlt,
|
||||
$imageTitle,
|
||||
null,
|
||||
time(),
|
||||
null);
|
||||
|
||||
// Relate host image with host page was found
|
||||
$db->setHostImageToHostPage($hostImageId, $queueHostPage->hostPageId, time(), 1);
|
||||
|
||||
// Increase image rank when link does not match the current host
|
||||
if ($hostImageURL->scheme . '://' .
|
||||
$hostImageURL->name .
|
||||
($hostImageURL->port ? ':' . $hostImageURL->port : '')
|
||||
!=
|
||||
$queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
|
||||
|
||||
$db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip images without alt attribute
|
||||
if (!$alt = @$img->getAttribute('alt')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$title = @$img->getAttribute('title')) {
|
||||
$title = null;
|
||||
}
|
||||
|
||||
// Skip encoded content
|
||||
if (false !== strpos($src, 'data:')) {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add link to queue
|
||||
$links[] = [
|
||||
'title' => null,
|
||||
'description' => null,
|
||||
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
|
||||
'data' => null,
|
||||
'ref' => $src,
|
||||
];
|
||||
}
|
||||
|
||||
// Collect internal links from page content
|
||||
@ -659,6 +405,11 @@ try {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get title attribute if available
|
||||
if (!$title = @$a->getAttribute('title')) {
|
||||
$title = null;
|
||||
}
|
||||
|
||||
// Skip anchor links
|
||||
if (false !== strpos($href, '#')) {
|
||||
|
||||
@ -683,23 +434,34 @@ try {
|
||||
continue;
|
||||
}
|
||||
|
||||
// @TODO skip other apps
|
||||
// Add link to queue
|
||||
$links[] = [
|
||||
'title' => null,
|
||||
'description' => null,
|
||||
'keywords' => Filter::pageKeywords($title),
|
||||
'data' => null,
|
||||
'ref' => $href,
|
||||
];
|
||||
}
|
||||
|
||||
// Add absolute URL prefixes to the relative links found
|
||||
if (!parse_url($href, PHP_URL_HOST)) {
|
||||
// Process links collected
|
||||
foreach ($links as $link) {
|
||||
|
||||
$href = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $href), '/'), '.');
|
||||
//Make relative links absolute
|
||||
if (!parse_url($link['ref'], PHP_URL_HOST)) {
|
||||
|
||||
$link['ref'] = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
|
||||
}
|
||||
|
||||
// Validate formatted link
|
||||
if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {
|
||||
if (filter_var($link['ref'], FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $link['ref'])) {
|
||||
|
||||
// Parse formatted link
|
||||
$hostURL = Parser::hostURL($href);
|
||||
$hostPageURI = Parser::uri($href);
|
||||
$hostURL = Parser::hostURL($link['ref']);
|
||||
$hostPageURI = Parser::uri($link['ref']);
|
||||
|
||||
// Host exists
|
||||
if ($host = $db->getHost(crc32($hostURL->string))) {
|
||||
@ -707,7 +469,7 @@ try {
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostImageLimit = $host->crawlImageLimit;
|
||||
$hostMetaOnly = $host->crawlMetaOnly;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
@ -731,30 +493,33 @@ try {
|
||||
}
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
$hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
|
||||
$hostId = $db->addHost($hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
$hostImageLimit,
|
||||
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
$hostId = $db->addHost( $hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
if ($hostId) {
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
|
||||
$hostsAdded++;
|
||||
// Increase counters
|
||||
$hostPagesAdded++;
|
||||
$hostsAdded++;
|
||||
|
||||
} else {
|
||||
// When page is root, skip next operations
|
||||
if ($hostPageURI->string == '/') {
|
||||
|
||||
continue;
|
||||
}
|
||||
@ -766,25 +531,27 @@ try {
|
||||
// Save page info
|
||||
if ($hostStatus && // host enabled
|
||||
$robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
|
||||
!$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists
|
||||
$hostPageLimit > $db->getTotalHostPages($hostId)) { // pages quantity not reached host limit
|
||||
|
||||
if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) {
|
||||
if ($hostPage = $db->getHostPage($hostId, crc32($hostPageURI->string))) {
|
||||
|
||||
$hostPageId = $hostPage->hostPageId;
|
||||
|
||||
} else {
|
||||
|
||||
$hostPageId = $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time());
|
||||
|
||||
$db->addHostPageDescription($hostPageId,
|
||||
$link['title'],
|
||||
$link['description'],
|
||||
$link['keywords'],
|
||||
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
|
||||
time());
|
||||
|
||||
$hostPagesAdded++;
|
||||
}
|
||||
}
|
||||
|
||||
// Increase page rank when link does not match the current host
|
||||
if ($hostURL->scheme . '://' .
|
||||
$hostURL->name .
|
||||
($hostURL->port ? ':' . $hostURL->port : '')
|
||||
!=
|
||||
$queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '')) {
|
||||
|
||||
$db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
|
||||
$db->addHostPageToHostPage($queueHostPage->hostPageId, $hostPageId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -811,10 +578,6 @@ if (CRAWL_LOG_ENABLED) {
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesBanned,
|
||||
$hostImagesIndexed,
|
||||
$hostImagesProcessed,
|
||||
$hostImagesAdded,
|
||||
$hostImagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
$httpRequestsTotal,
|
||||
@ -832,11 +595,6 @@ echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL;
|
||||
echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
|
||||
echo 'Pages banned: ' . $hostPagesBanned . PHP_EOL;
|
||||
|
||||
echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
|
||||
echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
|
||||
echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
|
||||
echo 'Images banned: ' . $hostImagesBanned . PHP_EOL;
|
||||
|
||||
echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
|
||||
echo 'Manifests added: ' . $manifestsAdded . PHP_EOL;
|
||||
|
||||
|
Binary file not shown.
@ -54,24 +54,6 @@ class Filter {
|
||||
return $keywords;
|
||||
}
|
||||
|
||||
// Normalize an image alt attribute: cast the value to string and trim
// surrounding whitespace. Returns the cleaned string.
static public function imageAlt(mixed $alt) {

  return trim((string) $alt);
}
|
||||
|
||||
// Normalize an image title attribute: cast the value to string and trim
// surrounding whitespace. Returns the cleaned string.
static public function imageTitle(mixed $title) {

  return trim((string) $title);
}
|
||||
|
||||
static public function pageData(mixed $data) {
|
||||
|
||||
$data = (string) $data;
|
||||
|
@ -102,11 +102,44 @@ class MySQL {
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, int $crawlImageLimit, string $crawlMetaOnly, string $status, string $nsfw, mixed $robots, mixed $robotsPostfix) {
|
||||
public function addHost(string $scheme,
|
||||
string $name,
|
||||
mixed $port,
|
||||
int $crc32url,
|
||||
int $timeAdded,
|
||||
mixed $timeUpdated,
|
||||
int $crawlPageLimit,
|
||||
string $crawlMetaOnly,
|
||||
string $status,
|
||||
string $nsfw,
|
||||
mixed $robots,
|
||||
mixed $robotsPostfix) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlImageLimit`, `crawlMetaOnly`, `status`, `nsfw`, `robots`, `robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
$query = $this->_db->prepare('INSERT INTO `host` (`scheme`,
|
||||
`name`,
|
||||
`port`,
|
||||
`crc32url`,
|
||||
`timeAdded`,
|
||||
`timeUpdated`,
|
||||
`crawlPageLimit`,
|
||||
`crawlMetaOnly`,
|
||||
`status`,
|
||||
`nsfw`,
|
||||
`robots`,
|
||||
`robotsPostfix`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlImageLimit, $crawlMetaOnly, $status, $nsfw, $robots, $robotsPostfix]);
|
||||
$query->execute([ $scheme,
|
||||
$name,
|
||||
$port,
|
||||
$crc32url,
|
||||
$timeAdded,
|
||||
$timeUpdated,
|
||||
$crawlPageLimit,
|
||||
$crawlMetaOnly,
|
||||
$status,
|
||||
$nsfw,
|
||||
$robots,
|
||||
$robotsPostfix]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
@ -120,253 +153,6 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
// Images
|
||||
// Count the `hostImage` rows registered for the given host.
// Returns the `total` column of the COUNT(*) result row.
public function getTotalHostImages(int $hostId) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE `hostId` = ?');

  $statement->execute([$hostId]);

  $row = $statement->fetch();

  return $row->total;
}
|
||||
|
||||
// Look up the primary key of a host image by host and URI checksum.
//
// $hostId   - value matched against `hostImage`.`hostId`
// $crc32uri - value matched against `hostImage`.`crc32uri`
//
// Returns the `hostImageId` of the matching row, or 0 when no row matches.
public function getHostImageId(int $hostId, int $crc32uri) {

  $query = $this->_db->prepare('SELECT `hostImageId` FROM `hostImage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

  $query->execute([$hostId, $crc32uri]);

  // Decide on the fetched row itself: PDOStatement::rowCount() is not
  // guaranteed to report the number of rows for SELECT statements
  $row = $query->fetch();

  return $row ? $row->hostImageId : 0;
}
|
||||
|
||||
// Fetch every `hostImage` row that belongs to the given host.
// Returns the complete result set from fetchAll().
public function getHostImages(int $hostId) {

  $statement = $this->_db->prepare('SELECT * FROM `hostImage` WHERE `hostId` = ?');

  $statement->execute([$hostId]);

  $rows = $statement->fetchAll();

  return $rows;
}
|
||||
|
||||
// Fetch the `hostImage` rows that have no matching entry in
// `hostImageToHostPage` (images not linked to any host page).
// Returns the complete result set from fetchAll().
public function getUnrelatedHostImages() {

  $sql = 'SELECT * FROM `hostImage`
WHERE `hostImage`.`hostImageId` NOT IN (SELECT `hostImageToHostPage`.`hostImageId`
FROM `hostImageToHostPage`

WHERE `hostImageToHostPage`.`hostImageId` = `hostImage`.`hostImageId`)';

  $statement = $this->_db->prepare($sql);

  $statement->execute();

  return $statement->fetchAll();
}
|
||||
|
||||
// Fetch up to $limit `hostImage` rows of a host, highest `hostImageId` first.
// The limit is cast to int before being appended to the SQL text.
// Returns the complete result set from fetchAll().
public function getHostImagesByLimit(int $hostId, int $limit) {

  $sql = 'SELECT * FROM `hostImage` WHERE `hostId` = ? ORDER BY hostImageId DESC LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostId]);

  return $statement->fetchAll();
}
|
||||
|
||||
// Insert a new `hostImage` row.
//
// Only $hostId, $crc32uri, $uri and $timeAdded are required; the remaining
// columns are written as NULL unless a value is passed.
//
// Returns the connection's lastInsertId() after the insert.
public function addHostImage(int $hostId,
                             int $crc32uri,
                             string $uri,
                             int $timeAdded,
                             mixed $timeUpdated = null,
                             mixed $timeBanned = null,
                             mixed $httpCode = null,
                             mixed $mime = null,
                             mixed $rank = null) {

  $sql = 'INSERT INTO `hostImage` ( `hostId`,
`crc32uri`,
`uri`,
`timeAdded`,
`timeUpdated`,
`timeBanned`,
`httpCode`,
`mime`,
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)';

  // Values in the same order as the column list above
  $values = [
    $hostId,
    $crc32uri,
    $uri,
    $timeAdded,
    $timeUpdated,
    $timeBanned,
    $httpCode,
    $mime,
    $rank,
  ];

  $statement = $this->_db->prepare($sql);

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
||||
|
||||
// Increase the `rank` of one host image, addressed by host and URI checksum.
//
// $hostId    - value matched against `hostImage`.`hostId`
// $crc32uri  - value matched against `hostImage`.`crc32uri`
// $increment - amount added to the current rank (cast to int before being
//              placed into the SQL text)
//
// Returns the number of affected rows.
public function updateHostImageRank(int $hostId,
                                    int $crc32uri,
                                    int $increment) {

  // addHostImage() stores NULL in `rank` when no rank is passed, and in SQL
  // NULL + n evaluates to NULL, so a missing rank is treated as 0 here
  $query = $this->_db->prepare('UPDATE `hostImage` SET `rank` = COALESCE(`rank`, 0) + ' . (int) $increment . ' WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1');

  $query->execute([$hostId, $crc32uri]);

  return $query->rowCount();
}
|
||||
|
||||
// Write a new `timeBanned` value for a single host image.
// Returns the number of affected rows.
public function updateHostImageTimeBanned(int $hostImageId, int $timeBanned) {

  $statement = $this->_db->prepare('UPDATE `hostImage` SET `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');

  $values = [$timeBanned, $hostImageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
|
||||
// Write the `httpCode` and `timeUpdated` columns of a single host image.
// Returns the number of affected rows.
public function updateHostImageHttpCode(int $hostImageId,
                                        int $httpCode,
                                        int $timeUpdated) {

  $statement = $this->_db->prepare('UPDATE `hostImage` SET `httpCode` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');

  $values = [$httpCode, $timeUpdated, $hostImageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
|
||||
// Write the `mime` and `timeUpdated` columns of a single host image.
// Returns the number of affected rows.
public function updateHostImageMime(int $hostImageId,
                                    string $mime,
                                    int $timeUpdated) {

  $statement = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ? WHERE `hostImageId` = ? LIMIT 1');

  $values = [$mime, $timeUpdated, $hostImageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
|
||||
// Write `mime`, `timeUpdated` and `timeBanned` of a single host image in one
// statement. $timeBanned defaults to null, which stores NULL in the column.
// Returns the number of affected rows.
public function updateHostImage(int $hostImageId,
                                string $mime,
                                int $timeUpdated,
                                mixed $timeBanned = null) {

  $statement = $this->_db->prepare('UPDATE `hostImage` SET `mime` = ?, `timeUpdated` = ?, `timeBanned` = ? WHERE `hostImageId` = ? LIMIT 1');

  $values = [$mime, $timeUpdated, $timeBanned, $hostImageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
|
||||
// Delete one `hostImage` row by its primary key.
// Returns the number of affected rows.
public function deleteHostImage(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImage` WHERE `hostImageId` = ? LIMIT 1');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||
|
||||
// Insert a `hostImageDescription` row, or on a duplicate key refresh its
// `alt`, `title` and `timeUpdated` columns.
//
// NOTE: $data is part of the signature but is not written by this statement.
//
// Returns the connection's lastInsertId() after the statement ran.
public function setHostImageDescription(int $hostImageId,
                                        int $crc32id,
                                        string $alt,
                                        string $title,
                                        mixed $data,
                                        int $timeAdded,
                                        mixed $timeUpdated) {

  $sql = 'INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`alt`,
`title`,
`timeAdded`) VALUES (?, ?, ?, ?, ?)

ON DUPLICATE KEY UPDATE `alt` = ?,
`title` = ?,
`timeUpdated` = ?';

  // First five values feed the INSERT, the last three the UPDATE branch
  $values = [
    $hostImageId,
    $crc32id,
    $alt,
    $title,
    $timeAdded,
    $alt,
    $title,
    $timeUpdated,
  ];

  $statement = $this->_db->prepare($sql);

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
||||
|
||||
// Insert a `hostImageDescription` row carrying the `data` column.
//
// On a duplicate key only `timeUpdated` is changed; the stored `data` value
// is left as it is.
//
// Returns the connection's lastInsertId() after the statement ran.
public function setHostImageDescriptionData(int $hostImageId,
                                            int $crc32id,
                                            mixed $data,
                                            int $timeAdded,
                                            mixed $timeUpdated) {

  $sql = 'INSERT INTO `hostImageDescription` (`hostImageId`,
`crc32id`,
`data`,
`timeAdded`) VALUES (?, ?, ?, ?)

ON DUPLICATE KEY UPDATE `timeUpdated` = ?';

  $values = [$hostImageId, $crc32id, $data, $timeAdded, $timeUpdated];

  $statement = $this->_db->prepare($sql);

  $statement->execute($values);

  return $this->_db->lastInsertId();
}
|
||||
|
||||
// Delete every `hostImageDescription` row of the given host image.
// Returns the number of affected rows.
public function deleteHostImageDescription(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImageDescription` WHERE `hostImageId` = ?');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||
|
||||
// Fetch the most recent `hostImageDescription` row of a host image, ordered
// by `timeUpdated` and then `timeAdded`, both descending.
// Returns the result of fetch() for that single row.
public function getLastHostImageDescription(int $hostImageId) {

  $sql = 'SELECT * FROM `hostImageDescription` WHERE `hostImageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1';

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostImageId]);

  return $statement->fetch();
}
|
||||
|
||||
// Fetch up to $limit host pages related to the given host image, joined with
// their `hostPage` rows and ordered by page rank (descending).
// The limit is cast to int before being appended to the SQL text.
// Returns the complete result set from fetchAll().
public function getHostImageHostPages(int $hostImageId, int $limit = 5) {

  $sql = 'SELECT * FROM `hostImageToHostPage`
JOIN `hostPage` ON (`hostPage`.`hostPageId` = `hostImageToHostPage`.`hostPageId`)

WHERE `hostImageId` = ?

ORDER BY `hostPage`.`rank` DESC, RAND(`hostPage`.`hostId`)

LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostImageId]);

  return $statement->fetchAll();
}
|
||||
|
||||
// Count the `hostImageToHostPage` relations of the given host image.
// Returns the `total` column of the COUNT(*) result row.
public function getHostImageHostPagesTotal(int $hostImageId) {

  $statement = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostImageToHostPage` WHERE `hostImageId` = ?');

  $statement->execute([$hostImageId]);

  $row = $statement->fetch();

  return $row->total;
}
|
||||
|
||||
// Relate a host image to a host page.
//
// A new relation is inserted with `timeAdded` = $time, `timeUpdated` = NULL
// and `quantity` = $quantity. On a duplicate key, `timeUpdated` is set to
// $time and `quantity` is increased by $quantity (cast to int in the SQL).
//
// Returns the number of affected rows.
public function setHostImageToHostPage(int $hostImageId, int $hostPageId, int $time, int $quantity) {

  $sql = 'INSERT INTO `hostImageToHostPage` (`hostImageId`,
`hostPageId`,
`timeAdded`,
`timeUpdated`,
`quantity`) VALUES (?, ?, ?, ?, ?)

ON DUPLICATE KEY UPDATE `timeUpdated` = ?,
`quantity` = `quantity` + ' . (int) $quantity;

  // Five INSERT values followed by the UPDATE branch timestamp
  $values = [$hostImageId, $hostPageId, $time, null, $quantity, $time];

  $statement = $this->_db->prepare($sql);

  $statement->execute($values);

  return $statement->rowCount(); // no primary key
}
|
||||
|
||||
// Delete every `hostImageToHostPage` relation of the given host image.
// Returns the number of affected rows.
public function deleteHostImageToHostPage(int $hostImageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostImageId` = ?');

  $statement->execute([$hostImageId]);

  $deleted = $statement->rowCount();

  return $deleted;
}
|
||||
|
||||
// Pages
|
||||
public function getTotalHostPages(int $hostId) {
|
||||
|
||||
@ -421,18 +207,9 @@ class MySQL {
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
// Fetch one `hostPageDescription` row matching both the host page and the
// `crc32data` value.
// Returns the result of fetch() for that single row.
public function getHostPageDescription(int $hostPageId, int $crc32data) {

  $statement = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? AND `crc32data` = ? LIMIT 1');

  $values = [$hostPageId, $crc32data];

  $statement->execute($values);

  return $statement->fetch();
}
|
||||
|
||||
public function getLastPageDescription(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeUpdated` DESC, `timeAdded` DESC LIMIT 1');
|
||||
$query = $this->_db->prepare('SELECT * FROM `hostPageDescription` WHERE `hostPageId` = ? ORDER BY `timeAdded` DESC LIMIT 1');
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
|
||||
@ -442,7 +219,6 @@ class MySQL {
|
||||
public function getFoundHostPage(int $hostPageId) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT `hostPage`.`uri`,
|
||||
`hostPage`.`rank`,
|
||||
`host`.`scheme`,
|
||||
`host`.`name`,
|
||||
`host`.`port`
|
||||
@ -459,28 +235,6 @@ class MySQL {
|
||||
return $query->fetch();
|
||||
}
|
||||
|
||||
// Fetch a single host image joined with its host: the image id, uri and rank
// plus the host scheme, name, port and crawlMetaOnly columns.
// Returns the result of fetch() for that single row.
public function getFoundHostImage(int $hostImageId) {

  $sql = 'SELECT `hostImage`.`hostImageId`,
`hostImage`.`uri`,
`hostImage`.`rank`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`

FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)

WHERE `hostImage`.`hostImageId` = ?

LIMIT 1';

  $statement = $this->_db->prepare($sql);

  $statement->execute([$hostImageId]);

  return $statement->fetch();
}
|
||||
|
||||
public function addHostPage(int $hostId,
|
||||
int $crc32uri,
|
||||
string $uri,
|
||||
@ -488,8 +242,7 @@ class MySQL {
|
||||
mixed $timeUpdated = null,
|
||||
mixed $timeBanned = null,
|
||||
mixed $httpCode = null,
|
||||
mixed $mime = null,
|
||||
mixed $rank = null) {
|
||||
mixed $mime = null) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`,
|
||||
`crc32uri`,
|
||||
@ -498,10 +251,9 @@ class MySQL {
|
||||
`timeUpdated`,
|
||||
`timeBanned`,
|
||||
`httpCode`,
|
||||
`mime`,
|
||||
`rank`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`mime`) VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime, $rank]);
|
||||
$query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $timeBanned, $httpCode, $mime]);
|
||||
|
||||
return $this->_db->lastInsertId();
|
||||
}
|
||||
@ -515,22 +267,6 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
// Increase the `rank` of one host page, addressed by host and URI checksum.
//
// $hostId    - value matched against `hostPage`.`hostId`
// $crc32uri  - value matched against `hostPage`.`crc32uri`
// $increment - amount added to the current rank (cast to int before being
//              placed into the SQL text)
//
// Returns the number of affected rows.
public function updateHostPageRank(int $hostId,
                                   int $crc32uri,
                                   int $increment) {

  // In SQL NULL + n evaluates to NULL, so a page whose `rank` is still NULL
  // would never gain rank; a missing rank is treated as 0 here
  $query = $this->_db->prepare('UPDATE `hostPage` SET `rank` = COALESCE(`rank`, 0) + ' . (int) $increment . '

WHERE `hostId` = ?
AND `crc32uri` = ?

LIMIT 1');

  $query->execute([$hostId, $crc32uri]);

  return $query->rowCount();
}
|
||||
|
||||
public function updateHostPageTimeBanned(int $hostPageId, int $timeBanned) {
|
||||
|
||||
$query = $this->_db->prepare('UPDATE `hostPage` SET `timeBanned` = ? WHERE `hostPageId` = ? LIMIT 1');
|
||||
@ -576,44 +312,48 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
public function deleteHostPageToHostImage(int $hostPageId) {
|
||||
public function addHostPageDescription(int $hostPageId,
|
||||
mixed $title,
|
||||
mixed $description,
|
||||
mixed $keywords,
|
||||
mixed $data,
|
||||
int $timeAdded) {
|
||||
|
||||
$query = $this->_db->prepare('DELETE FROM `hostImageToHostPage` WHERE `hostPageId` = ?');
|
||||
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
|
||||
`title`,
|
||||
`description`,
|
||||
`keywords`,
|
||||
`data`,
|
||||
`timeAdded`
|
||||
) VALUES (?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([$hostPageId]);
|
||||
$query->execute([
|
||||
$hostPageId,
|
||||
$title,
|
||||
$description,
|
||||
$keywords,
|
||||
$data,
|
||||
$timeAdded,
|
||||
]);
|
||||
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
public function setHostPageDescription(int $hostPageId,
|
||||
int $crc32data,
|
||||
mixed $metaTitle,
|
||||
mixed $metaDescription,
|
||||
mixed $metaKeywords,
|
||||
mixed $data,
|
||||
int $time) {
|
||||
public function addHostPageToHostPage(int $hostPageIdSource, int $hostPageIdTarget) {
|
||||
|
||||
$query = $this->_db->prepare('INSERT INTO `hostPageDescription` ( `hostPageId`,
|
||||
`crc32data`,
|
||||
`metaTitle`,
|
||||
`metaDescription`,
|
||||
`metaKeywords`,
|
||||
`data`,
|
||||
`timeAdded`
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
$query = $this->_db->prepare('INSERT INTO `hostPageToHostPage` (`hostPageIdSource`, `hostPageIdTarget`, `quantity`) VALUES (?, ?, 0)
|
||||
|
||||
ON DUPLICATE KEY UPDATE `timeUpdated` = ?');
|
||||
ON DUPLICATE KEY UPDATE `quantity` = `quantity` + 1');
|
||||
|
||||
$query->execute([
|
||||
$hostPageId,
|
||||
$crc32data,
|
||||
$metaTitle,
|
||||
$metaDescription,
|
||||
$metaKeywords,
|
||||
$data,
|
||||
$time,
|
||||
$time
|
||||
]);
|
||||
$query->execute([$hostPageIdSource, $hostPageIdTarget]);
|
||||
|
||||
}
|
||||
|
||||
// Delete every `hostPageToHostPage` relation in which the given page appears
// as either the source or the target.
// Returns the number of affected rows.
public function deleteHostPageToHostPage(int $hostPageId) {

  $statement = $this->_db->prepare('DELETE FROM `hostPageToHostPage` WHERE `hostPageIdSource` = ? OR `hostPageIdTarget` = ?');

  // The same id is bound to both placeholders
  $values = [$hostPageId, $hostPageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
@ -652,33 +392,12 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
// Clear `timeBanned` on every host image whose ban timestamp is set and is
// lower than $timeOffset (cast to int before being appended to the SQL).
// Returns the number of affected rows.
public function resetBannedHostImages(int $timeOffset) {

  $sql = 'UPDATE `hostImage` SET `timeBanned` = NULL WHERE `timeBanned` IS NOT NULL AND `timeBanned` < ' . (int) $timeOffset;

  $statement = $this->_db->prepare($sql);

  $statement->execute();

  return $statement->rowCount();
}
|
||||
|
||||
// Delete `hostImageDescription` rows whose `timeAdded` is lower than
// $timeOffset (cast to int before being appended to the SQL).
// Returns the number of affected rows.
public function deleteHostImageDescriptionsByTimeAdded(int $timeOffset) {

  $sql = 'DELETE FROM `hostImageDescription` WHERE `timeAdded` < ' . (int) $timeOffset;

  $statement = $this->_db->prepare($sql);

  $statement->execute();

  return $statement->rowCount();
}
|
||||
|
||||
public function addCleanerLog(int $timeAdded,
|
||||
int $hostsTotal,
|
||||
int $hostsUpdated,
|
||||
int $hostPagesDeleted,
|
||||
int $hostPageDescriptionsDeleted,
|
||||
int $hostPagesBansRemoved,
|
||||
int $hostImagesDeleted,
|
||||
int $hostImageDescriptionsDeleted,
|
||||
int $hostImagesBansRemoved,
|
||||
int $manifestsTotal,
|
||||
int $manifestsDeleted,
|
||||
int $logsCleanerDeleted,
|
||||
@ -695,9 +414,6 @@ class MySQL {
|
||||
`hostPagesDeleted`,
|
||||
`hostPageDescriptionsDeleted`,
|
||||
`hostPagesBansRemoved`,
|
||||
`hostImagesDeleted`,
|
||||
`hostImageDescriptionsDeleted`,
|
||||
`hostImagesBansRemoved`,
|
||||
`manifestsTotal`,
|
||||
`manifestsDeleted`,
|
||||
`logsCleanerDeleted`,
|
||||
@ -706,7 +422,7 @@ class MySQL {
|
||||
`httpRequestsSizeTotal`,
|
||||
`httpDownloadSizeTotal`,
|
||||
`httpRequestsTimeTotal`,
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([
|
||||
$timeAdded,
|
||||
@ -715,9 +431,6 @@ class MySQL {
|
||||
$hostPagesDeleted,
|
||||
$hostPageDescriptionsDeleted,
|
||||
$hostPagesBansRemoved,
|
||||
$hostImagesDeleted,
|
||||
$hostImageDescriptionsDeleted,
|
||||
$hostImagesBansRemoved,
|
||||
$manifestsTotal,
|
||||
$manifestsDeleted,
|
||||
$logsCleanerDeleted,
|
||||
@ -751,7 +464,6 @@ class MySQL {
|
||||
`host`.`name`,
|
||||
`host`.`port`,
|
||||
`host`.`crawlPageLimit`,
|
||||
`host`.`crawlImageLimit`,
|
||||
`host`.`crawlMetaOnly`,
|
||||
`host`.`robots`,
|
||||
`host`.`robotsPostfix`
|
||||
@ -762,7 +474,7 @@ class MySQL {
|
||||
WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
|
||||
AND `hostPage`.`timeBanned` IS NULL
|
||||
|
||||
ORDER BY `hostPage`.`rank` DESC, RAND()
|
||||
ORDER BY RAND()
|
||||
|
||||
LIMIT ' . (int) $limit);
|
||||
|
||||
@ -780,40 +492,6 @@ class MySQL {
|
||||
return $query->rowCount();
|
||||
}
|
||||
|
||||
// Select up to $limit host images for crawling.
//
// An image qualifies when its `timeUpdated` is NULL or lower than $timeFrom,
// its host has a non-zero `status`, and the image is not banned. Rows are
// ordered by image rank (descending), then randomly.
//
// Returns the complete result set from fetchAll().
public function getHostImageCrawlQueue(int $limit, int $timeFrom) {

  $sql = 'SELECT `hostImage`.`hostId`,
`hostImage`.`hostImageId`,
`hostImage`.`uri`,
`host`.`scheme`,
`host`.`name`,
`host`.`port`,
`host`.`crawlMetaOnly`

FROM `hostImage`
JOIN `host` ON (`host`.`hostId` = `hostImage`.`hostId`)

WHERE (`hostImage`.`timeUpdated` IS NULL OR `hostImage`.`timeUpdated` < ? ) AND `host`.`status` <> 0
AND `hostImage`.`timeBanned` IS NULL

ORDER BY `hostImage`.`rank` DESC, RAND()

LIMIT ' . (int) $limit;

  $statement = $this->_db->prepare($sql);

  $statement->execute([$timeFrom]);

  return $statement->fetchAll();
}
|
||||
|
||||
// Record a crawl attempt on a host image by writing its `timeUpdated` and
// `httpCode` columns.
// Returns the number of affected rows.
public function updateHostImageCrawlQueue(int $hostImageId, int $timeUpdated, int $httpCode) {

  $statement = $this->_db->prepare('UPDATE `hostImage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostImageId` = ? LIMIT 1');

  $values = [$timeUpdated, $httpCode, $hostImageId];

  $statement->execute($values);

  return $statement->rowCount();
}
|
||||
|
||||
public function getManifestCrawlQueue(int $limit, int $timeFrom) {
|
||||
|
||||
$query = $this->_db->prepare('SELECT * FROM `manifest`
|
||||
@ -844,10 +522,6 @@ class MySQL {
|
||||
int $hostPagesIndexed,
|
||||
int $hostPagesAdded,
|
||||
int $hostPagesBanned,
|
||||
int $hostImagesIndexed,
|
||||
int $hostImagesProcessed,
|
||||
int $hostImagesAdded,
|
||||
int $hostImagesBanned,
|
||||
int $manifestsProcessed,
|
||||
int $manifestsAdded,
|
||||
int $httpRequestsTotal,
|
||||
@ -862,17 +536,13 @@ class MySQL {
|
||||
`hostPagesIndexed`,
|
||||
`hostPagesAdded`,
|
||||
`hostPagesBanned`,
|
||||
`hostImagesIndexed`,
|
||||
`hostImagesProcessed`,
|
||||
`hostImagesAdded`,
|
||||
`hostImagesBanned`,
|
||||
`manifestsProcessed`,
|
||||
`manifestsAdded`,
|
||||
`httpRequestsTotal`,
|
||||
`httpRequestsSizeTotal`,
|
||||
`httpDownloadSizeTotal`,
|
||||
`httpRequestsTimeTotal`,
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
`executionTimeTotal`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)');
|
||||
|
||||
$query->execute([
|
||||
$timeAdded,
|
||||
@ -881,10 +551,6 @@ class MySQL {
|
||||
$hostPagesIndexed,
|
||||
$hostPagesAdded,
|
||||
$hostPagesBanned,
|
||||
$hostImagesIndexed,
|
||||
$hostImagesProcessed,
|
||||
$hostImagesAdded,
|
||||
$hostImagesBanned,
|
||||
$manifestsProcessed,
|
||||
$manifestsAdded,
|
||||
$httpRequestsTotal,
|
||||
|
@ -11,13 +11,13 @@ class SphinxQL {
|
||||
$this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ);
|
||||
}
|
||||
|
||||
public function searchHostPages(string $keyword, int $start, int $limit, int $maxMatches) {
|
||||
public function searchHostPages(string $keyword, string $mime, int $start, int $limit, int $maxMatches) {
|
||||
|
||||
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
|
||||
|
||||
FROM `hostPage`
|
||||
|
||||
WHERE MATCH(?)
|
||||
WHERE MATCH(?) AND `mime` = ?
|
||||
|
||||
ORDER BY `rank` DESC, WEIGHT() DESC
|
||||
|
||||
@ -25,26 +25,7 @@ class SphinxQL {
|
||||
|
||||
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
|
||||
|
||||
$query->execute([$keyword]);
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
public function searchHostImages(string $keyword, int $start, int $limit, int $maxMatches) {
|
||||
|
||||
$query = $this->_sphinx->prepare('SELECT *, WEIGHT() AS `weight`
|
||||
|
||||
FROM `hostImage`
|
||||
|
||||
WHERE MATCH(?)
|
||||
|
||||
ORDER BY `rank` DESC, WEIGHT() DESC
|
||||
|
||||
LIMIT ' . (int) ($start >= $maxMatches ? ($maxMatches > 0 ? $maxMatches - 1 : 0) : $start) . ',' . (int) $limit . '
|
||||
|
||||
OPTION `max_matches`=' . (int) ($maxMatches >= 1 ? $maxMatches : 1));
|
||||
|
||||
$query->execute([$keyword]);
|
||||
$query->execute([$keyword, $mime]);
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
@ -58,30 +39,21 @@ class SphinxQL {
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
public function searchHostPagesTotal(string $keyword) {
|
||||
public function getHostPagesMime() {
|
||||
|
||||
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)');
|
||||
|
||||
$query->execute([$keyword]);
|
||||
|
||||
return $query->fetch()->total;
|
||||
}
|
||||
|
||||
// Count the `hostImage` index documents matching the given keyword.
// Returns the `total` column of the COUNT(*) result row.
public function searchHostImagesTotal(string $keyword) {

  $statement = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage` WHERE MATCH(?)');

  $statement->execute([$keyword]);

  $row = $statement->fetch();

  return $row->total;
}
|
||||
|
||||
public function getHostImagesTotal() {
|
||||
|
||||
$query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostImage`');
|
||||
$query = $this->_sphinx->prepare('SELECT `mime` FROM `hostPage` GROUP BY `mime` ORDER BY `mime` ASC');
|
||||
|
||||
$query->execute();
|
||||
|
||||
return $query->fetchAll();
|
||||
}
|
||||
|
||||
// Count the `hostPage` index documents that match the keyword and carry
// exactly the given `mime` value.
// Returns the `total` column of the COUNT(*) result row.
public function searchHostPagesTotal(string $keyword, string $mime) {

  $statement = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?) AND `mime` = ?');

  $values = [$keyword, $mime];

  $statement->execute($values);

  $row = $statement->fetch();

  return $row->total;
}
|
||||
}
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 134 KiB After Width: | Height: | Size: 147 KiB |
@ -1,7 +1,7 @@
|
||||
<?php
|
||||
|
||||
// Current version
|
||||
define('API_VERSION', 0.7);
|
||||
define('API_VERSION', 0.8);
|
||||
|
||||
// Load system dependencies
|
||||
require_once('../config/app.php');
|
||||
@ -30,48 +30,25 @@ if (API_ENABLED) {
|
||||
|
||||
|
||||
// Filter request data
|
||||
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'page';
|
||||
$type = !empty($_GET['type']) ? Filter::url($_GET['type']) : 'html';
|
||||
$mode = !empty($_GET['mode']) ? Filter::url($_GET['mode']) : 'default';
|
||||
$query = !empty($_GET['query']) ? Filter::url($_GET['query']) : '';
|
||||
$page = !empty($_GET['page']) ? (int) $_GET['page'] : 1;
|
||||
|
||||
// Make image search request
|
||||
if (!empty($type) && $type == 'image') {
|
||||
|
||||
$sphinxResultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($query, $mode));
|
||||
$sphinxResults = $sphinx->searchHostImages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
|
||||
|
||||
// Make default search request
|
||||
} else {
|
||||
|
||||
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode));
|
||||
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
|
||||
}
|
||||
// Make search request
|
||||
$sphinxResultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($query, $mode), $type);
|
||||
$sphinxResults = $sphinx->searchHostPages(Filter::searchQuery($query, $mode), $type, $page * API_SEARCH_PAGINATION_RESULTS_LIMIT - API_SEARCH_PAGINATION_RESULTS_LIMIT, API_SEARCH_PAGINATION_RESULTS_LIMIT, $sphinxResultsTotal);
|
||||
|
||||
// Generate results
|
||||
$dbResults = [];
|
||||
|
||||
foreach ($sphinxResults as $i => $sphinxResult) {
|
||||
|
||||
// Image
|
||||
if (!empty($type) && $type == 'image') {
|
||||
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
|
||||
|
||||
if ($hostImage = $db->getFoundHostImage($sphinxResult->id)) {
|
||||
$dbResults[$i] = $hostPage;
|
||||
|
||||
$dbResults[$i] = $hostImage;
|
||||
|
||||
$dbResults[$i]->weight = $sphinxResult->weight;
|
||||
}
|
||||
|
||||
// Default
|
||||
} else {
|
||||
|
||||
if ($hostPage = $db->getFoundHostPage($sphinxResult->id)) {
|
||||
|
||||
$dbResults[$i] = $hostPage;
|
||||
|
||||
$dbResults[$i]->weight = $sphinxResult->weight;
|
||||
}
|
||||
$dbResults[$i]->weight = $sphinxResult->weight;
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,13 +106,10 @@ if (API_ENABLED) {
|
||||
'crawlUrlRegexp' => CRAWL_URL_REGEXP,
|
||||
'crawlHostDefaultNsfw' => CRAWL_HOST_DEFAULT_NSFW,
|
||||
'crawlHostDefaultPagesLimit' => CRAWL_HOST_DEFAULT_PAGES_LIMIT,
|
||||
'crawlHostDefaultImagesLimit' => CRAWL_HOST_DEFAULT_IMAGES_LIMIT,
|
||||
'crawlHostDefaultStatus' => CRAWL_HOST_DEFAULT_STATUS,
|
||||
'crawlHostDefaultMetaOnly' => CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
'crawlHostPageSecondsOffset' => CRAWL_PAGE_SECONDS_OFFSET,
|
||||
'crawlHostPageMime' => CRAWL_PAGE_MIME,
|
||||
'crawlHostImageSecondsOffset' => CRAWL_IMAGE_SECONDS_OFFSET,
|
||||
'crawlHostImageMime' => CRAWL_IMAGE_MIME,
|
||||
'cleanHostSecondsOffset' => CLEAN_HOST_SECONDS_OFFSET,
|
||||
'crawlRobotsDefaultRules' => CRAWL_ROBOTS_DEFAULT_RULES,
|
||||
'crawlRobotsPostfixRules' => CRAWL_ROBOTS_POSTFIX_RULES,
|
||||
|
@ -24,7 +24,6 @@ $placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the
|
||||
<?php if (API_MANIFEST_ENABLED) { ?>
|
||||
<meta name="yggo:manifest" content="<?php echo sprintf('%s/api.php?action=manifest', WEBSITE_DOMAIN) ?>" />
|
||||
<?php } ?>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
|
||||
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
|
||||
<style>
|
||||
|
@ -16,34 +16,34 @@ $sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT);
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
|
||||
// Filter request data
|
||||
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'page';
|
||||
$t = !empty($_GET['t']) ? Filter::url($_GET['t']) : 'html';
|
||||
$m = !empty($_GET['m']) ? Filter::url($_GET['m']) : 'default';
|
||||
$q = !empty($_GET['q']) ? Filter::url($_GET['q']) : '';
|
||||
// Clamp to 1: a non-numeric or negative `p` value casts to <= 0,
// which would turn the `$p * LIMIT - LIMIT` result offset negative
$p = !empty($_GET['p']) ? max(1, (int) $_GET['p']) : 1;
|
||||
|
||||
// Define page basics
|
||||
switch ($t) {
|
||||
// Search request
|
||||
if (!empty($q)) {
|
||||
|
||||
case 'image':
|
||||
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m), $t);
|
||||
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $t, $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
|
||||
|
||||
$totalPages = $sphinx->getHostImagesTotal();
|
||||
} else {
|
||||
|
||||
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s image or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s images or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s images or enter the new one...'), $totalPages),
|
||||
]);
|
||||
|
||||
break;
|
||||
default:
|
||||
|
||||
$totalPages = $sphinx->getHostPagesTotal();
|
||||
|
||||
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
|
||||
]);
|
||||
$resultsTotal = 0;
|
||||
$results = [];
|
||||
}
|
||||
|
||||
// Mime list
|
||||
$hostPagesMime = $sphinx->getHostPagesMime();
|
||||
|
||||
// Define page basics
|
||||
$totalPages = $sphinx->getHostPagesTotal();
|
||||
|
||||
|
||||
$placeholder = Filter::plural($totalPages, [sprintf(_('Over %s page or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
|
||||
sprintf(_('Over %s pages or enter the new one...'), $totalPages),
|
||||
]);
|
||||
|
||||
// Crawl request
|
||||
if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
@ -61,6 +61,7 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
$hostStatus = $host->status;
|
||||
$hostNsfw = $host->nsfw;
|
||||
$hostPageLimit = $host->crawlPageLimit;
|
||||
$hostMetaOnly = $host->crawlMetaOnly;
|
||||
$hostId = $host->hostId;
|
||||
$hostRobots = $host->robots;
|
||||
$hostRobotsPostfix = $host->robotsPostfix;
|
||||
@ -82,21 +83,26 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
|
||||
$hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
|
||||
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW;
|
||||
$hostStatus = CRAWL_HOST_DEFAULT_STATUS ? 1 : 0;
|
||||
$hostNsfw = CRAWL_HOST_DEFAULT_NSFW ? 1 : 0;
|
||||
$hostMetaOnly = CRAWL_HOST_DEFAULT_META_ONLY ? 1 : 0;
|
||||
$hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
|
||||
$hostId = $db->addHost($hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) CRAWL_HOST_DEFAULT_META_ONLY,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
$hostId = $db->addHost( $hostURL->scheme,
|
||||
$hostURL->name,
|
||||
$hostURL->port,
|
||||
crc32($hostURL->string),
|
||||
time(),
|
||||
null,
|
||||
$hostPageLimit,
|
||||
(string) $hostMetaOnly,
|
||||
(string) $hostStatus,
|
||||
(string) $hostNsfw,
|
||||
$hostRobots,
|
||||
$hostRobotsPostfix);
|
||||
|
||||
// Add web root host page to make host visible in the crawl queue
|
||||
$db->addHostPage($hostId, crc32('/'), '/', time());
|
||||
}
|
||||
}
|
||||
|
||||
@ -120,30 +126,12 @@ if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) {
|
||||
|
||||
} catch(Exception $e){
|
||||
|
||||
var_dump($e);
|
||||
|
||||
$db->rollBack();
|
||||
}
|
||||
}
|
||||
|
||||
// Search request
|
||||
if (!empty($q)) {
|
||||
|
||||
if ($t == 'image') {
|
||||
|
||||
$resultsTotal = $sphinx->searchHostImagesTotal(Filter::searchQuery($q, $m));
|
||||
$results = $sphinx->searchHostImages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT, $resultsTotal);
|
||||
|
||||
} else {
|
||||
|
||||
$resultsTotal = $sphinx->searchHostPagesTotal(Filter::searchQuery($q, $m));
|
||||
$results = $sphinx->searchHostPages(Filter::searchQuery($q, $m), $p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT, $resultsTotal);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
$resultsTotal = 0;
|
||||
$results = [];
|
||||
}
|
||||
|
||||
?>
|
||||
|
||||
<!DOCTYPE html>
|
||||
@ -151,7 +139,6 @@ if (!empty($q)) {
|
||||
<head>
|
||||
<title><?php echo (empty($q) ? _('Empty request - YGGo!') : ($p > 1 ? sprintf(_('%s - #%s - YGGo!'), htmlentities($q), $p) : sprintf(_('%s - YGGo!'), htmlentities($q)))) ?></title>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="description" content="<?php echo _('Javascript-less Open Source Web Search Engine') ?>" />
|
||||
<meta name="keywords" content="<?php echo _('web, search, engine, crawler, php, pdo, mysql, sphinx, yggdrasil, js-less, open source') ?>" />
|
||||
<style>
|
||||
@ -322,8 +309,9 @@ if (!empty($q)) {
|
||||
<form name="search" method="GET" action="<?php echo WEBSITE_DOMAIN; ?>/search.php">
|
||||
<h1><a href="<?php echo WEBSITE_DOMAIN; ?>"><?php echo _('YGGo!') ?></a></h1>
|
||||
<input type="text" name="q" placeholder="<?php echo $placeholder ?>" value="<?php echo htmlentities($q) ?>" />
|
||||
<label><input type="radio" name="t" value="page" <?php echo ($t == 'page' ? 'checked="checked"' : false) ?>/> <?php echo _('Pages') ?></label>
|
||||
<label><input type="radio" name="t" value="image" <?php echo ($t == 'image' ? 'checked="checked"' : false) ?>/> <?php echo _('Images') ?></label>
|
||||
<?php foreach ($hostPagesMime as $hostPageMime) { ?>
<?php // MIME values are read back from the search index, so escape them before writing into the attribute and the label text ?>
<?php $hostPageMimeEscaped = htmlentities((string) $hostPageMime->mime) ?>
<label><input type="radio" name="t" value="<?php echo $hostPageMimeEscaped ?>" <?php echo ($t == $hostPageMime->mime ? 'checked="checked"' : '') ?>/> <?php echo $hostPageMimeEscaped ?></label>
<?php } ?>
|
||||
<button type="submit"><?php echo _('Search'); ?></button>
|
||||
</form>
|
||||
</header>
|
||||
@ -336,156 +324,13 @@ if (!empty($q)) {
|
||||
<?php } ?>
|
||||
</div>
|
||||
<?php foreach ($results as $result) { ?>
|
||||
<?php if ($t == 'image' && $hostImage = $db->getFoundHostImage($result->id)) { ?>
|
||||
<?php
|
||||
|
||||
// Built image url
|
||||
$hostImageURL = $hostImage->scheme . '://' .
|
||||
$hostImage->name .
|
||||
($hostImage->port ? ':' . $hostImage->port : false) .
|
||||
$hostImage->uri;
|
||||
|
||||
// Get local image data
|
||||
$lastHostImageDescription = $db->getLastHostImageDescription($result->id);
|
||||
|
||||
if (!empty($lastHostImageDescription->data)) {
|
||||
|
||||
$hostImageURLencoded = $lastHostImageDescription->data;
|
||||
|
||||
// Get remote if local index not found or CRAWL_HOST_DEFAULT_META_ONLY enabled
|
||||
} else {
|
||||
|
||||
// Init image request
|
||||
$hostImageCurl = new Curl($hostImageURL, PROXY_CURLOPT_USERAGENT);
|
||||
|
||||
// Skip item render on timeout
|
||||
$hostImageHttpCode = $hostImageCurl->getCode();
|
||||
|
||||
$db->updateHostImageHttpCode($result->id, (int) $hostImageHttpCode, time());
|
||||
|
||||
if (200 != $hostImageHttpCode) {
|
||||
|
||||
$db->updateHostImageHttpCode($result->id, $hostImageHttpCode, time());
|
||||
|
||||
$db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not provided
|
||||
if (!$hostImageContentType = $hostImageCurl->getContentType()) {
|
||||
|
||||
$db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing on MIME type not allowed in settings
|
||||
$hostImageBanned = true;
|
||||
foreach ((array) explode(',', CRAWL_IMAGE_MIME) as $mime) {
|
||||
|
||||
if (false !== strpos($hostImageContentType, trim($mime))) {
|
||||
|
||||
$hostImageBanned = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ($hostImageBanned) {
|
||||
|
||||
$db->updateHostImageMime($result->id, $hostImageContentType, time());
|
||||
|
||||
$hostImagesBanned += $db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip image processing without returned content
|
||||
if (!$hostImageContent = $hostImageCurl->getContent()) {
|
||||
|
||||
$db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Convert remote image data to base64 string to prevent direct URL call
|
||||
if (!$hostImageExtension = @pathinfo($hostImageURL, PATHINFO_EXTENSION)) {
|
||||
|
||||
$db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$hostImageBase64 = @base64_encode($hostImageContent)) {
|
||||
|
||||
$db->updateHostImageTimeBanned($result->id, time());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$hostImageURLencoded = 'data:image/' . str_replace(['svg'], ['svg+xml'], $hostImageExtension) . ';base64,' . $hostImageBase64;
|
||||
|
||||
// Save image content on data settings enabled
|
||||
$db->updateHostImage($result->id,
|
||||
Filter::mime($hostImageContentType),
|
||||
time());
|
||||
|
||||
$db->setHostImageDescriptionData($result->id,
|
||||
crc32($hostImageURLencoded),
|
||||
$hostImage->crawlMetaOnly ? null : $hostImageURLencoded,
|
||||
time(),
|
||||
null);
|
||||
}
|
||||
?>
|
||||
<div>
|
||||
<a href="<?php echo $hostImageURL ?>">
|
||||
<img src="<?php echo $hostImageURLencoded ?>" alt="<?php echo htmlentities($hostImageURL) ?>" title="<?php echo htmlentities($hostImageURL) ?>" class="image" />
|
||||
</a>
|
||||
<br />
|
||||
<?php $hostImageHostPagesTotal = $db->getHostImageHostPagesTotal($result->id) ?>
|
||||
<?php foreach ((array) $db->getHostImageHostPages($result->id, WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT) as $hostPage) { ?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($hostPage->hostPageId)) { ?>
|
||||
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
||||
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||
<h3><?php echo $hostPageDescription->metaTitle ?></h3>
|
||||
<?php } ?>
|
||||
<?php if ($lastHostImageDescription) { ?>
|
||||
<span><?php echo $lastHostImageDescription->title ?> <?php echo $lastHostImageDescription->alt ?></span>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $hostPageURL ?>">
|
||||
<img src="<?php echo WEBSITE_DOMAIN ?>/image.php?q=<?php echo urlencode($hostPage->name) ?>" alt="favicon" width="16" height="16" class="icon" />
|
||||
<?php echo htmlentities(urldecode($hostPageURL)) ?>
|
||||
</a>
|
||||
<br />
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<?php if ($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT > 0) { ?>
|
||||
<p>
|
||||
<small>
|
||||
<?php echo Filter::plural($hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT,
|
||||
[
|
||||
sprintf(_('+%s other page'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
|
||||
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
|
||||
sprintf(_('+%s other pages'), $hostImageHostPagesTotal - WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT),
|
||||
]); ?>
|
||||
</small>
|
||||
</p>
|
||||
<?php } ?>
|
||||
</div>
|
||||
<?php } else if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
|
||||
<?php
|
||||
|
||||
$hostPageURL = $hostPage->scheme . '://' .
|
||||
$hostPage->name .
|
||||
($hostPage->port ? ':' . $hostPage->port : false) .
|
||||
$hostPage->uri;
|
||||
|
||||
?>
|
||||
<?php if ($hostPage = $db->getFoundHostPage($result->id)) { ?>
|
||||
<?php $hostPageURL = $hostPage->scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?>
|
||||
<div>
|
||||
<?php if ($hostPageDescription = $db->getLastPageDescription($result->id)) { ?>
|
||||
<h2><?php echo $hostPageDescription->metaTitle ?></h2>
|
||||
<?php if (!empty($hostPageDescription->metaDescription)) { ?>
|
||||
<span><?php echo $hostPageDescription->metaDescription ?></span>
|
||||
<h2><?php echo $hostPageDescription->title ?></h2>
|
||||
<?php if (!empty($hostPageDescription->description)) { ?>
|
||||
<span><?php echo $hostPageDescription->description ?></span>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<a href="<?php echo $hostPageURL ?>">
|
||||
@ -495,7 +340,7 @@ if (!empty($q)) {
|
||||
</div>
|
||||
<?php } ?>
|
||||
<?php } ?>
|
||||
<?php if ($p * ($t == 'image' ? WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT : WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT) <= $resultsTotal) { ?>
|
||||
<?php if ($p * WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT <= $resultsTotal) { ?>
|
||||
<div>
|
||||
<?php // URL-encode the type filter: MIME values may contain "+" (e.g. svg+xml), which a raw query string decodes as a space ?>
<a href="<?php echo WEBSITE_DOMAIN; ?>/search.php?q=<?php echo urlencode(htmlentities($q)) ?>&t=<?php echo urlencode($t) ?>&p=<?php echo $p + 1 ?>"><?php echo _('Next page') ?></a>
|
||||
</div>
|
||||
|
Loading…
x
Reference in New Issue
Block a user