mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-08-26 05:42:05 +00:00
make transaction for each item in crawl queue
This commit is contained in:
parent
b585b16d31
commit
4b16b41440
@ -43,15 +43,25 @@ $hostPagesBanned = 0;
|
||||
$hostPagesSnapAdded = 0;
|
||||
|
||||
// Connect database
|
||||
try {
|
||||
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Process manifests crawl queue
|
||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
try {
|
||||
|
||||
// Process manifests crawl queue
|
||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
||||
|
||||
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
// Update curl stats
|
||||
@ -66,18 +76,24 @@ try {
|
||||
// Skip processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifest = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on json encoding error
|
||||
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -87,30 +103,40 @@ try {
|
||||
empty($remoteManifest->result->api->version) ||
|
||||
empty($remoteManifest->result->api->hosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on API version not compatible
|
||||
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on host API not available
|
||||
if (!$remoteManifest->result->api->hosts) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
|
||||
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on host link does not match condition
|
||||
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -126,18 +152,24 @@ try {
|
||||
// Skip processing non 200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifestHosts = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on json encoding error
|
||||
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -145,6 +177,8 @@ try {
|
||||
if (empty($remoteManifestHosts->status) ||
|
||||
empty($remoteManifestHosts->result)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -212,11 +246,30 @@ try {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
|
||||
// Process update errors
|
||||
} catch (Exception $e) {
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
// Skip item
|
||||
$db->rollBack();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
try {
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
||||
|
||||
@ -253,6 +306,8 @@ try {
|
||||
|
||||
if (empty($match[1])) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -332,6 +387,8 @@ try {
|
||||
// When page is root, skip next operations
|
||||
if ($hostPageURI->string == '/') {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -373,6 +430,8 @@ try {
|
||||
}
|
||||
|
||||
// Skip other this page actions
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -386,6 +445,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -418,6 +479,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -428,6 +491,8 @@ try {
|
||||
// This case possible for multimedia/streaming resources index
|
||||
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -453,6 +518,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
|
||||
} else {
|
||||
@ -928,7 +995,6 @@ try {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
@ -936,9 +1002,6 @@ try {
|
||||
// Process update errors
|
||||
} catch (Exception $e) {
|
||||
|
||||
// Decline DB changes
|
||||
$db->rollBack();
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
@ -947,16 +1010,15 @@ try {
|
||||
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
|
||||
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
|
||||
|
||||
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
// Reset counters
|
||||
$hostPagesProcessed = $hostPagesBanned;
|
||||
$manifestsProcessed = 0;
|
||||
$hostPagesIndexed = 0;
|
||||
$manifestsAdded = 0;
|
||||
$hostPagesAdded = 0;
|
||||
$hostsAdded = 0;
|
||||
$hostPagesSnapAdded = 0;
|
||||
$hostPagesProcessed++;
|
||||
}
|
||||
|
||||
// Skip item
|
||||
$db->rollBack();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user