Browse Source

make transaction for each item in crawl queue

main
ghost 2 years ago
parent
commit
4b16b41440
  1. 154
      crontab/crawler.php

154
crontab/crawler.php

@ -43,14 +43,24 @@ $hostPagesBanned = 0;
$hostPagesSnapAdded = 0; $hostPagesSnapAdded = 0;
// Connect database // Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); try {
$db->beginTransaction(); $db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try { } catch(Exception $e) {
// Process manifests crawl queue // Debug std
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { var_dump($e);
exit;
}
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$db->beginTransaction();
try {
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT); $curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
@ -66,18 +76,24 @@ try {
// Skip processing non 200 code // Skip processing non 200 code
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
$db->commit();
continue; continue;
} }
// Skip processing without returned data // Skip processing without returned data
if (!$remoteManifest = $curl->getContent()) { if (!$remoteManifest = $curl->getContent()) {
$db->commit();
continue; continue;
} }
// Skip processing on json encoding error // Skip processing on json encoding error
if (!$remoteManifest = @json_decode($remoteManifest)) { if (!$remoteManifest = @json_decode($remoteManifest)) {
$db->commit();
continue; continue;
} }
@ -87,30 +103,40 @@ try {
empty($remoteManifest->result->api->version) || empty($remoteManifest->result->api->version) ||
empty($remoteManifest->result->api->hosts)) { empty($remoteManifest->result->api->hosts)) {
continue; $db->commit();
continue;
} }
// Skip processing on API version not compatible // Skip processing on API version not compatible
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
$db->commit();
continue; continue;
} }
// Skip processing on host API not available // Skip processing on host API not available
if (!$remoteManifest->result->api->hosts) { if (!$remoteManifest->result->api->hosts) {
$db->commit();
continue; continue;
} }
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition // Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
$db->commit();
continue; continue;
} }
// Skip processing on host link does not match condition // Skip processing on host link does not match condition
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) { if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
$db->commit();
continue; continue;
} }
@ -126,18 +152,24 @@ try {
// Skip processing non 200 code // Skip processing non 200 code
if (200 != $curl->getCode()) { if (200 != $curl->getCode()) {
$db->commit();
continue; continue;
} }
// Skip processing without returned data // Skip processing without returned data
if (!$remoteManifestHosts = $curl->getContent()) { if (!$remoteManifestHosts = $curl->getContent()) {
$db->commit();
continue; continue;
} }
// Skip processing on json encoding error // Skip processing on json encoding error
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) { if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
$db->commit();
continue; continue;
} }
@ -145,6 +177,8 @@ try {
if (empty($remoteManifestHosts->status) || if (empty($remoteManifestHosts->status) ||
empty($remoteManifestHosts->result)) { empty($remoteManifestHosts->result)) {
$db->commit();
continue; continue;
} }
@ -159,7 +193,7 @@ try {
} }
$hostURL = $remoteManifestHost->scheme . '://' . $hostURL = $remoteManifestHost->scheme . '://' .
$remoteManifestHost->name . $remoteManifestHost->name .
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); (!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
// Validate formatted link // Validate formatted link
@ -212,10 +246,29 @@ try {
} }
} }
} }
// Apply changes
$db->commit();
// Process update errors
} catch (Exception $e) {
// Debug std
var_dump($e);
// Skip item
$db->rollBack();
continue;
} }
}
// Process pages crawl queue // Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction();
try {
// Build URL from the DB // Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@ -253,6 +306,8 @@ try {
if (empty($match[1])) { if (empty($match[1])) {
$db->commit();
continue; continue;
} }
@ -262,7 +317,7 @@ try {
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
$url = $queueHostPage->scheme . '://' . $url = $queueHostPage->scheme . '://' .
$queueHostPage->name . $queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') . ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.'); '/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
} }
@ -332,6 +387,8 @@ try {
// When page is root, skip next operations // When page is root, skip next operations
if ($hostPageURI->string == '/') { if ($hostPageURI->string == '/') {
$db->commit();
continue; continue;
} }
} }
@ -373,6 +430,8 @@ try {
} }
// Skip other this page actions // Skip other this page actions
$db->commit();
continue; continue;
} }
@ -386,6 +445,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue; continue;
} }
@ -418,6 +479,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue; continue;
} }
@ -428,6 +491,8 @@ try {
// This case possible for multimedia/streaming resources index // This case possible for multimedia/streaming resources index
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); // $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue; continue;
} }
@ -453,6 +518,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue; continue;
} else { } else {
@ -514,12 +581,12 @@ try {
$yggoManifestCRC32 = crc32($yggoManifest); $yggoManifestCRC32 = crc32($yggoManifest);
if (!$db->getManifest($yggoManifestCRC32)) { if (!$db->getManifest($yggoManifestCRC32)) {
$db->addManifest($yggoManifestCRC32, $db->addManifest($yggoManifestCRC32,
$yggoManifest, $yggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS, (string) CRAWL_MANIFEST_DEFAULT_STATUS,
time()); time());
$manifestsAdded++; $manifestsAdded++;
} }
} }
@ -571,7 +638,7 @@ try {
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); $snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; $snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); @mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container // Create new ZIP container
$zip = new ZipArchive(); $zip = new ZipArchive();
@ -581,10 +648,10 @@ try {
// Insert compressed snap data into the tmp storage // Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) && if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL . sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done // Done
$zip->close(); $zip->close();
@ -656,7 +723,7 @@ try {
} }
if (!$title = @$img->getAttribute('title')) { if (!$title = @$img->getAttribute('title')) {
$title = null; $title = null;
} }
// Skip encoded content // Skip encoded content
@ -718,7 +785,7 @@ try {
// Skip media without type attribute // Skip media without type attribute
if (!$type = @$video->getAttribute('type')) { if (!$type = @$video->getAttribute('type')) {
$type = 'video/*'; $type = 'video/*';
} }
// Skip encoded content // Skip encoded content
@ -748,7 +815,7 @@ try {
// Skip media without type attribute // Skip media without type attribute
if (!$type = @$audio->getAttribute('type')) { if (!$type = @$audio->getAttribute('type')) {
$type = 'audio/*'; $type = 'audio/*';
} }
// Skip encoded content // Skip encoded content
@ -779,7 +846,7 @@ try {
// Get title attribute if available // Get title attribute if available
if (!$title = @$a->getAttribute('title')) { if (!$title = @$a->getAttribute('title')) {
$title = null; $title = null;
} }
// Skip anchor links // Skip anchor links
@ -824,7 +891,7 @@ try {
if (!parse_url($link['ref'], PHP_URL_HOST)) { if (!parse_url($link['ref'], PHP_URL_HOST)) {
$link['ref'] = $queueHostPage->scheme . '://' . $link['ref'] = $queueHostPage->scheme . '://' .
$queueHostPage->name . $queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') . ($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); '/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
} }
@ -928,35 +995,30 @@ try {
} }
} }
} }
}
// Apply changes // Apply changes
$db->commit(); $db->commit();
// Process update errors // Process update errors
} catch(Exception $e) { } catch (Exception $e) {
// Decline DB changes // Debug std
$db->rollBack(); var_dump($e);
// Debug std // Ban page that throws the data type error and stuck the crawl queue
var_dump($e); if (!empty($queueHostPage->hostPageId) &&
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPagesProcessed++;
}
// Skip item
$db->rollBack();
// Ban page that throws the data type error and stuck the crawl queue continue;
if (!empty($queueHostPage->hostPageId) &&
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Reset counters
$hostPagesProcessed = $hostPagesBanned;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesSnapAdded = 0;
} }
} }

Loading…
Cancel
Save