mirror of https://github.com/YGGverse/YGGo.git
synced 2025-08-26 13:51:55 +00:00

add distributed hosts crawling using yggo nodes manifest

This commit is contained in:
parent f0b2eb1613
commit 5999fb3a73
@@ -150,7 +150,7 @@ GET m=SphinxQL
 * [ ] Implement database auto backup on crawl process completing
 * [x] Add transactions to prevent data loss on DB crashes
 * [x] JSON API
-* [ ] Distributed index data sharing between the nodes through service API
+* [x] Distributed index data sharing between the nodes through service API
 * [x] An idea to make unique gravatars for sites without favicons, because simpler to ident, comparing to ipv6
 * [ ] An idea to make some visitors counters, like in good old times?
@@ -121,8 +121,10 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
  * Usually up to 20 pages per minute,
  * to prevent websites overload by sending GET crawling requests
  *
+ * Set 0 to disable
+ *
  */
-define('CRAWL_PAGE_LIMIT', 10);
+define('CRAWL_PAGE_LIMIT', 20);

 /*
  * Images (URI) processing limit in the crawler.php queue
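As a rough sanity check of the "up to 20 pages per minute" figure above, assuming crawler.php is launched by cron once per minute (a frequency this diff does not state), the per-run limit maps directly to a per-minute rate:

    <?php
    // Back-of-the-envelope throughput sketch, not part of the project code.
    // Assumes one crawler.php run per minute via crontab; the actual schedule
    // is documented outside this diff, so this is only an assumption.
    define('CRAWL_PAGE_LIMIT', 20);   // value introduced by this commit

    $runsPerMinute = 1;               // assumed crontab frequency
    echo CRAWL_PAGE_LIMIT * $runsPerMinute . ' pages per minute' . PHP_EOL;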
@@ -133,8 +135,27 @@ define('CRAWL_PAGE_LIMIT', 10);
  * Usually up to 20 pages per minute,
  * to prevent websites overload by sending GET crawling requests
  *
+ * Set 0 to disable
+ *
  */
-define('CRAWL_IMAGE_LIMIT', 20);
+define('CRAWL_IMAGE_LIMIT', 10);

+/*
+ * Manifest (URI) processing limit in the crawler.php queue
+ *
+ * Used to collect the distributed data index
+ * that matches CRAWL_URL_REGEXP & CRAWL_MANIFEST_API_VERSION
+ *
+ * This option is related to the CRAWL_MANIFEST_SECONDS_OFFSET value
+ * and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
+ *
+ * Usually up to 20 pages per minute,
+ * to prevent websites overload by sending GET crawling requests
+ *
+ * Set 0 to disable
+ *
+ */
+define('CRAWL_MANIFEST_LIMIT', 10);
+
 /*
  * Renew page index by timing offset provided
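CRAWL_MANIFEST_LIMIT ends up as the SQL LIMIT of the new getManifestCrawlQueue() method added to the MySQL class at the end of this diff, which also explains the "Set 0 to disable" note. A simplified sketch of that call path:

    <?php
    // Sketch of how CRAWL_MANIFEST_LIMIT reaches the manifest queue query
    // (mirrors MySQL::getManifestCrawlQueue() further down in this commit).
    define('CRAWL_MANIFEST_LIMIT', 10);
    define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);

    $limit    = CRAWL_MANIFEST_LIMIT;                    // becomes "... LIMIT 10"
    $timeFrom = time() - CRAWL_MANIFEST_SECONDS_OFFSET;  // manifests not re-crawled within the offset

    // With CRAWL_MANIFEST_LIMIT = 0 the statement ends in "LIMIT 0",
    // so no manifests are ever selected, hence "Set 0 to disable".
    $sql = 'SELECT * FROM `manifest`
            WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ?) AND `status` <> 0
            ORDER BY RAND()
            LIMIT ' . (int) $limit;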
@@ -162,6 +183,19 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
  */
 define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);

+/*
+ * Renew manifests index by timing offset provided
+ *
+ * This option works with the CRAWL_MANIFEST_LIMIT step queue
+ *
+ * Pay attention that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
+ * must provide enough throughput to crawl all manifests collected in the DB index,
+ * or the crawler can get stuck in the queue
+ *
+ */
+define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
+
 /*
  * Only URL addresses that match this rule will be auto-crawled
  *
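A quick capacity check for the warning above, assuming one crawler.php run per minute (the crontab frequency is documented outside this diff, so it is only an assumption here):

    <?php
    // Rough check that the CRAWL_MANIFEST_LIMIT + CRAWL_MANIFEST_SECONDS_OFFSET pair
    // can cover the whole manifest index before entries become due again.
    define('CRAWL_MANIFEST_LIMIT', 10);
    define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30); // 30 days

    $runsPerOffset = CRAWL_MANIFEST_SECONDS_OFFSET / 60;    // 43200 runs, assuming one run per minute
    $capacity      = CRAWL_MANIFEST_LIMIT * $runsPerOffset; // 432000 manifests per 30-day window

    // If getTotalManifests() ever exceeds $capacity, manifests become due
    // faster than they can be re-crawled and the queue never drains.
    echo $capacity . PHP_EOL;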
@@ -242,6 +276,12 @@ define('CRAWL_ROBOTS_POSTFIX_RULES', null); // string|null
  */
 define('CRAWL_MANIFEST', true);

+/*
+ * Manifest API version compatibility
+ *
+ */
+define('CRAWL_MANIFEST_API_VERSION', 0.4);
+
 /*
  * Set default auto-crawl status for new manifest added
  *
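Both cleaner.php and crawler.php below compare the remote manifest's version against this constant with a strict !== check, so the remote side has to publish the version as a bare JSON number that decodes to the same PHP float. A small illustration (the sample JSON is hypothetical):

    <?php
    define('CRAWL_MANIFEST_API_VERSION', 0.4);

    // A bare JSON number decodes to float(0.4) and passes the strict check ...
    $ok = json_decode('{"api":{"version":0.4}}');
    var_dump($ok->api->version !== CRAWL_MANIFEST_API_VERSION);  // bool(false), i.e. compatible

    // ... while a quoted value decodes to string("0.4") and would be skipped.
    $bad = json_decode('{"api":{"version":"0.4"}}');
    var_dump($bad->api->version !== CRAWL_MANIFEST_API_VERSION); // bool(true), i.e. rejected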
@@ -22,11 +22,18 @@ $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
 $timeStart = microtime(true);

 $hostsTotal = $db->getTotalHosts();
+$manifestsTotal = $db->getTotalManifests();
 $hostsUpdated = 0;
 $hostsPagesDeleted = 0;
 $hostsImagesDeleted = 0;
+$manifestsDeleted = 0;

-// Get host queue
+// Begin update
+$db->beginTransaction();
+
+try {
+
+  // Get cleaner queue
 foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {

   // Parse host info
@@ -41,11 +48,6 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
     $hostRobots = null;
   }

-  // Begin update
-  $db->beginTransaction();
-
-  try {
-
   // Update host data
   $hostsUpdated += $db->updateHostRobots($host->hostId, $hostRobots, time());

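Taken together, the hunks above and below change cleaner.php from one transaction per host to a single transaction around the whole run, with host cleanup first and manifest cleanup after it. A simplified outline of the resulting control flow (not verbatim project code; it assumes the project's config and classes are already loaded):

    <?php
    // Simplified outline of cleaner.php after this commit.
    $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

    $db->beginTransaction();

    try {

      foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFSET) as $host) {
        // ... update robots rules, delete expired host pages and images ...
      }

      // Clean up deprecated manifests
      foreach ($db->getManifests() as $manifest) {
        // ... re-fetch the remote manifest, delete it when invalid or incompatible ...
      }

      $db->commit();

    } catch (Exception $e) {

      // the existing catch block rolls the whole run back
      $db->rollBack();
    }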
@@ -118,6 +120,58 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS
       // Delete host image
       $hostsImagesDeleted += $db->deleteHostImage($hostImage->hostImageId);
     }
+  }
+
+  // Clean up deprecated manifests
+  foreach ($db->getManifests() as $manifest) {
+
+    $delete = false;
+
+    $curl = new Curl($manifest->url);
+
+    // Skip processing non 200 code
+    if (200 != $curl->getCode()) {
+
+      continue; // Wait for reconnect
+    }
+
+    // Skip processing without returned data
+    if (!$remoteManifest = $curl->getContent()) {
+
+      $delete = true;
+    }
+
+    // Skip processing on json encoding error
+    if (!$remoteManifest = @json_decode($remoteManifest)) {
+
+      $delete = true;
+    }
+
+    // Skip processing on required fields missed
+    if (empty($remoteManifest->status) ||
+        empty($remoteManifest->result->config->crawlUrlRegexp) ||
+        empty($remoteManifest->result->api->version)) {
+
+      $delete = true;
+    }
+
+    // Skip processing on API version not compatible
+    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
+
+      $delete = true;
+    }
+
+    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
+    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
+
+      $delete = true;
+    }
+
+    if ($delete) {
+
+      $manifestsDeleted += $db->deleteManifest($manifest->manifestId);
+    }
+  }

   $db->commit();

@@ -127,11 +181,12 @@ foreach ($db->getCleanerQueue(CLEAN_HOST_LIMIT, time() - CLEAN_HOST_SECONDS_OFFS

   $db->rollBack();
 }
-}

 // Debug
 echo 'Hosts total: ' . $hostsTotal . PHP_EOL;
 echo 'Hosts updated: ' . $hostsUpdated . PHP_EOL;
 echo 'Hosts pages deleted: ' . $hostsPagesDeleted . PHP_EOL;
 echo 'Hosts images deleted: ' . $hostsImagesDeleted . PHP_EOL;
-echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL;
+echo 'Manifests total: ' . $manifestsTotal . PHP_EOL;
+echo 'Manifests deleted: ' . $manifestsDeleted . PHP_EOL;
+echo 'Execution time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
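For reference, the manifest document shape these guards imply, reconstructed only from the empty() checks in cleaner.php above and in crawler.php below; every value is an invented placeholder:

    <?php
    // Minimal manifest payload that would pass the validation in this commit.
    // Only the field names come from the diff; the values are placeholders.
    $remoteManifest = (object) [
        'status' => true,
        'result' => (object) [
            'config' => (object) [
                'crawlUrlRegexp' => '/^placeholder$/',    // must equal the local CRAWL_URL_REGEXP
            ],
            'api' => (object) [
                'version' => 0.4,                         // must equal CRAWL_MANIFEST_API_VERSION
                'hosts'   => 'http://example/api/hosts',  // hosts endpoint URL (placeholder)
            ],
        ],
    ];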
@@ -29,8 +29,10 @@ $timeStart = microtime(true);

 $hostPagesProcessed = 0;
 $hostImagesProcessed = 0;
+$manifestsProcessed = 0;
 $hostPagesIndexed = 0;
 $hostImagesIndexed = 0;
+$manifestsIndexed = 0;
 $hostPagesAdded = 0;
 $hostImagesAdded = 0;
 $hostsAdded = 0;
@@ -38,6 +40,185 @@ $hostsAdded = 0;
 // Connect database
 $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);

+$db->beginTransaction();
+
+try {
+
+  // Process manifests crawl queue
+  foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
+
+    $curl = new Curl($queueManifest->url);
+
+    // Update manifest index anyway, with the current time and http code
+    $manifestsProcessed += $db->updateManifestCrawlQueue($queueManifest->manifestId, time(), $curl->getCode());
+
+    // Skip processing non 200 code
+    if (200 != $curl->getCode()) {
+
+      continue;
+    }
+
+    // Skip processing without returned data
+    if (!$remoteManifest = $curl->getContent()) {
+
+      continue;
+    }
+
+    // Skip processing on json encoding error
+    if (!$remoteManifest = @json_decode($remoteManifest)) {
+
+      continue;
+    }
+
+    // Skip processing on required fields missed
+    if (empty($remoteManifest->status) ||
+        empty($remoteManifest->result->config->crawlUrlRegexp) ||
+        empty($remoteManifest->result->api->version) ||
+        empty($remoteManifest->result->api->hosts)) {
+
+      continue;
+    }
+
+    // Skip processing on API version not compatible
+    if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
+
+      continue;
+    }
+
+    // Skip processing on host API not available
+    if (!$remoteManifest->result->api->hosts) {
+
+      continue;
+    }
+
+    // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
+    if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
+
+      continue;
+    }
+
+    // Skip processing on host link does not match condition
+    if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
+
+      continue;
+    }
+
+    // Begin hosts collection
+    $curl = new Curl($remoteManifest->result->api->hosts);
+
+    // Skip processing non 200 code
+    if (200 != $curl->getCode()) {
+
+      continue;
+    }
+
+    // Skip processing without returned data
+    if (!$remoteManifestHosts = $curl->getContent()) {
+
+      continue;
+    }
+
+    // Skip processing on json encoding error
+    if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
+
+      continue;
+    }
+
+    // Skip processing on required fields missed
+    if (empty($remoteManifestHosts->status) ||
+        empty($remoteManifestHosts->result)) {
+
+      continue;
+    }
+
+    // Begin hosts processing
+    foreach ($remoteManifestHosts->result as $remoteManifestHost) {
+
+      // Skip processing on required fields missed
+      if (empty($remoteManifestHost->scheme) ||
+          empty($remoteManifestHost->name)) {
+
+        continue;
+      }
+
+      $hostURL = $remoteManifestHost->scheme . '://' .
+                 $remoteManifestHost->name .
+                 (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
+
+      // Validate formatted link
+      if (filter_var($hostURL, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $hostURL)) {
+
+        // Host exists
+        if ($host = $db->getHost(crc32($hostURL))) {
+
+          $hostStatus = $host->status;
+          $hostPageLimit = $host->crawlPageLimit;
+          $hostImageLimit = $host->crawlImageLimit;
+          $hostId = $host->hostId;
+          $hostRobots = $host->robots;
+          $hostRobotsPostfix = $host->robotsPostfix;
+
+        // Register new host
+        } else {
+
+          // Get robots.txt if exists
+          $curl = new Curl($hostURL . '/robots.txt', CRAWL_CURLOPT_USERAGENT);
+
+          if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) {
+            $hostRobots = $curl->getContent();
+          } else {
+            $hostRobots = CRAWL_ROBOTS_DEFAULT_RULES;
+          }
+
+          $hostRobotsPostfix = CRAWL_ROBOTS_POSTFIX_RULES;
+
+          $hostStatus = CRAWL_HOST_DEFAULT_STATUS;
+          $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT;
+          $hostImageLimit= CRAWL_HOST_DEFAULT_IMAGES_LIMIT;
+
+          $hostId = $db->addHost($remoteManifestHosts->result->scheme,
+                                 $remoteManifestHosts->result->name,
+                                 $remoteManifestHosts->result->port,
+                                 crc32($hostURL),
+                                 time(),
+                                 null,
+                                 $hostPageLimit,
+                                 $hostImageLimit,
+                                 (string) CRAWL_HOST_DEFAULT_META_ONLY,
+                                 (string) $hostStatus,
+                                 $hostRobots,
+                                 $hostRobotsPostfix);
+
+          if ($hostId) {
+
+            $hostsAdded++;
+
+          } else {
+
+            continue;
+          }
+        }
+
+        // Init robots parser
+        $robots = new Robots(($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES) . PHP_EOL . ($hostRobotsPostfix ? (string) $hostRobotsPostfix : (string) CRAWL_ROBOTS_POSTFIX_RULES));

+        // Save home page info
+        // Until page API not implemented, save at least home page to have ability to crawl
+        // @TODO
+        if ($hostStatus && // host enabled
+            $robots->uriAllowed('/') && // page allowed by robots.txt rules
+            $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit
+            !$db->getHostPage($hostId, crc32('/'))) { // page not exists
+
+          if ($db->addHostPage($hostId, crc32('/'), '/', time())) {
+
+            $hostPagesAdded++;
+          }
+        }
+      }
+    }
+  }
+
 // Process images crawl queue
 foreach ($db->getHostImageCrawlQueue(CRAWL_IMAGE_LIMIT, time() - CRAWL_IMAGE_SECONDS_OFFSET) as $queueHostImage) {

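The hosts endpoint response the loop above expects, reconstructed only from the fields it reads (a status flag plus a result list of objects carrying scheme, name and an optional port); the addresses are invented placeholders:

    <?php
    // Hypothetical payload of a remote node's hosts API, shaped after the checks above.
    $remoteManifestHosts = (object) [
        'status' => true,
        'result' => [
            (object) ['scheme' => 'http', 'name' => '[201:2345:6789:abcd::1]', 'port' => 80],
            (object) ['scheme' => 'http', 'name' => 'node.example',            'port' => null],
        ],
    ];

    foreach ($remoteManifestHosts->result as $remoteManifestHost) {

        // Same URL assembly rule as in the diff above: the port is appended only when present.
        $hostURL = $remoteManifestHost->scheme . '://' .
                   $remoteManifestHost->name .
                   (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);

        echo $hostURL . PHP_EOL;
    }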
@@ -204,10 +385,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
       // Validate formatted src link
       if (filter_var($imageSrc, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $imageSrc)) {

-        $db->beginTransaction();
-
-        try {
-
         // Parse formatted src link
         $hostImageURL = Parser::hostURL($imageSrc);
         $hostImageURI = Parser::uri($imageSrc);
@@ -310,15 +487,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND

           $db->updateHostImageRank($hostId, crc32($hostImageURI->string), 1);
         }
-
-        $db->commit();
-
-        } catch(Exception $e) {
-
-          var_dump($e);
-
-          $db->rollBack();
-        }
       }
     }
 }
@@ -370,10 +538,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND
       // Validate formatted link
       if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) {

-        $db->beginTransaction();
-
-        try {
-
         // Parse formatted link
         $hostURL = Parser::hostURL($href);
         $hostPageURI = Parser::uri($href);
@@ -454,6 +618,9 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND

           $db->updateHostPageRank($hostId, crc32($hostPageURI->string), 1);
         }
+      }
+    }
+  }

   $db->commit();

@@ -463,9 +630,6 @@ foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECOND

   $db->rollBack();
 }
-}
-}
-}

 // Debug
 echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL;
@@ -474,5 +638,7 @@ echo 'Pages added: ' . $hostPagesAdded . PHP_EOL;
 echo 'Images processed: ' . $hostImagesProcessed . PHP_EOL;
 echo 'Images indexed: ' . $hostImagesIndexed . PHP_EOL;
 echo 'Images added: ' . $hostImagesAdded . PHP_EOL;
+echo 'Manifests processed: ' . $manifestsProcessed . PHP_EOL;
+echo 'Manifests indexed: ' . $manifestsIndexed . PHP_EOL;
 echo 'Hosts added: ' . $hostsAdded . PHP_EOL;
-echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL;
+echo 'Total time: ' . microtime(true) - $timeStart . PHP_EOL . PHP_EOL;
Binary file not shown.
@@ -29,6 +29,24 @@ class MySQL {
   }

   // Manifest
+  public function getTotalManifests() {
+
+    $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `manifest`');
+
+    $query->execute();
+
+    return $query->fetch()->total;
+  }
+
+  public function getManifests() {
+
+    $query = $this->_db->prepare('SELECT * FROM `manifest`');
+
+    $query->execute();
+
+    return $query->fetchAll();
+  }
+
   public function getManifest(int $crc32url) {

     $query = $this->_db->prepare('SELECT * FROM `manifest` WHERE `crc32url` = ? LIMIT 1');
@@ -47,6 +65,15 @@ class MySQL {
     return $this->_db->lastInsertId();
   }

+  public function deleteManifest(int $manifestId) {
+
+    $query = $this->_db->prepare('DELETE FROM `manifest` WHERE `manifestId` = ? LIMIT 1');
+
+    $query->execute([$manifestId]);
+
+    return $query->rowCount();
+  }
+
   // Host
   public function getAPIHosts(string $apiHostFields) {

@@ -570,4 +597,28 @@ class MySQL {

     return $query->rowCount();
   }
+
+  public function getManifestCrawlQueue(int $limit, int $timeFrom) {
+
+    $query = $this->_db->prepare('SELECT * FROM `manifest`
+
+                                  WHERE (`timeUpdated` IS NULL OR `timeUpdated` < ? ) AND `status` <> 0
+
+                                  ORDER BY RAND()
+
+                                  LIMIT ' . (int) $limit);
+
+    $query->execute([$timeFrom]);
+
+    return $query->fetchAll();
+  }
+
+  public function updateManifestCrawlQueue(int $manifestId, int $timeUpdated, int $httpCode) {
+
+    $query = $this->_db->prepare('UPDATE `manifest` SET `timeUpdated` = ?, `httpCode` = ? WHERE `manifestId` = ? LIMIT 1');
+
+    $query->execute([$timeUpdated, $httpCode, $manifestId]);
+
+    return $query->rowCount();
+  }
 }
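The `manifest` table itself lives in the binary schema file the diff viewer cannot show; from the queries above one can at least infer the columns they touch. A hedged sketch with guessed types, not the authoritative definition:

    <?php
    // Columns implied by the manifest queries in this class; the names come from the SQL above,
    // while types, sizes and defaults are assumptions (the real schema is in the binary file not shown).
    $sql = 'CREATE TABLE IF NOT EXISTS `manifest` (
              `manifestId`  INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
              `crc32url`    INT UNSIGNED NOT NULL,
              `url`         VARCHAR(256) NOT NULL,
              `status`      TINYINT(1)   NOT NULL DEFAULT 1,
              `timeUpdated` INT UNSIGNED NULL,
              `httpCode`    INT UNSIGNED NULL
            )';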