Wrap each crawl queue item in its own transaction

This commit is contained in:
ghost 2023-06-05 22:01:22 +03:00
parent b585b16d31
commit 4b16b41440

View File

@ -43,15 +43,25 @@ $hostPagesBanned = 0;
$hostPagesSnapAdded = 0;
// Connect database
try {
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
} catch(Exception $e) {
// Debug std
var_dump($e);
exit;
}
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$db->beginTransaction();
try {
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
// Update curl stats
@ -66,18 +76,24 @@ try {
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifest = $curl->getContent()) {
$db->commit();
continue;
}
// Skip processing on JSON decoding error
if (!$remoteManifest = @json_decode($remoteManifest)) {
$db->commit();
continue;
}
@ -87,30 +103,40 @@ try {
empty($remoteManifest->result->api->version) ||
empty($remoteManifest->result->api->hosts)) {
$db->commit();
continue;
}
// Skip processing when the API version is not compatible
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
$db->commit();
continue;
}
// Skip processing on host API not available
if (!$remoteManifest->result->api->hosts) {
$db->commit();
continue;
}
// Skip processing when crawlUrlRegexp does not match CRAWL_URL_REGEXP
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
$db->commit();
continue;
}
// Skip processing when the host link does not match the condition
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
$db->commit();
continue;
}
@ -126,18 +152,24 @@ try {
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifestHosts = $curl->getContent()) {
$db->commit();
continue;
}
// Skip processing on JSON decoding error
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
$db->commit();
continue;
}
@ -145,6 +177,8 @@ try {
if (empty($remoteManifestHosts->status) ||
empty($remoteManifestHosts->result)) {
$db->commit();
continue;
}
@ -212,11 +246,30 @@ try {
}
}
}
// Apply changes
$db->commit();
// Process update errors
} catch (Exception $e) {
// Debug std
var_dump($e);
// Skip item
$db->rollBack();
continue;
}
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction();
try {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@ -253,6 +306,8 @@ try {
if (empty($match[1])) {
$db->commit();
continue;
}
@ -332,6 +387,8 @@ try {
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
$db->commit();
continue;
}
}
@ -373,6 +430,8 @@ try {
}
// Skip the remaining actions for this page
$db->commit();
continue;
}
@ -386,6 +445,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -418,6 +479,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -428,6 +491,8 @@ try {
// This case is possible for a multimedia/streaming resources index
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -453,6 +518,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
} else {
@ -928,7 +995,6 @@ try {
}
}
}
}
// Apply changes
$db->commit();
@ -936,9 +1002,6 @@ try {
// Process update errors
} catch (Exception $e) {
// Decline DB changes
$db->rollBack();
// Debug std
var_dump($e);
@ -947,16 +1010,15 @@ try {
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Reset counters
$hostPagesProcessed = $hostPagesBanned;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesSnapAdded = 0;
$hostPagesProcessed++;
}
// Skip item
$db->rollBack();
continue;
}
}