Browse Source

make transaction for each item in crawl queue

main
ghost 2 years ago
parent
commit
4b16b41440
  1. 154
      crontab/crawler.php

154
crontab/crawler.php

@ -43,14 +43,24 @@ $hostPagesBanned = 0; @@ -43,14 +43,24 @@ $hostPagesBanned = 0;
$hostPagesSnapAdded = 0;
// Connect database
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try {
$db->beginTransaction();
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
try {
} catch(Exception $e) {
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
// Debug std
var_dump($e);
exit;
}
// Process manifests crawl queue
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
$db->beginTransaction();
try {
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
@ -66,18 +76,24 @@ try { @@ -66,18 +76,24 @@ try {
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifest = $curl->getContent()) {
$db->commit();
continue;
}
// Skip processing on json encoding error
if (!$remoteManifest = @json_decode($remoteManifest)) {
$db->commit();
continue;
}
@ -87,30 +103,40 @@ try { @@ -87,30 +103,40 @@ try {
empty($remoteManifest->result->api->version) ||
empty($remoteManifest->result->api->hosts)) {
continue;
$db->commit();
continue;
}
// Skip processing on API version not compatible
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
$db->commit();
continue;
}
// Skip processing on host API not available
if (!$remoteManifest->result->api->hosts) {
$db->commit();
continue;
}
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
$db->commit();
continue;
}
// Skip processing when the manifest's hosts API link does not match CRAWL_URL_REGEXP.
//
// preg_match() returns 1 on a match, 0 on no match and false on a PCRE error.
// The previous check (`false === preg_match(...)`) was true only on a PCRE
// error, so non-matching links were never skipped. Anything other than a
// positive match (no match OR error) now skips the item.
if (1 !== preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {

    // Close the per-item transaction opened at the top of the loop before moving on
    $db->commit();

    continue;
}
@ -126,18 +152,24 @@ try { @@ -126,18 +152,24 @@ try {
// Skip processing non 200 code
if (200 != $curl->getCode()) {
$db->commit();
continue;
}
// Skip processing without returned data
if (!$remoteManifestHosts = $curl->getContent()) {
$db->commit();
continue;
}
// Skip processing on json encoding error
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
$db->commit();
continue;
}
@ -145,6 +177,8 @@ try { @@ -145,6 +177,8 @@ try {
if (empty($remoteManifestHosts->status) ||
empty($remoteManifestHosts->result)) {
$db->commit();
continue;
}
@ -159,7 +193,7 @@ try { @@ -159,7 +193,7 @@ try {
}
$hostURL = $remoteManifestHost->scheme . '://' .
$remoteManifestHost->name .
$remoteManifestHost->name .
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
// Validate formatted link
@ -212,10 +246,29 @@ try { @@ -212,10 +246,29 @@ try {
}
}
}
// Apply changes
$db->commit();
// Process update errors
} catch (Exception $e) {
// Debug std
var_dump($e);
// Skip item
$db->rollBack();
continue;
}
}
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
// Process pages crawl queue
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
$db->beginTransaction();
try {
// Build URL from the DB
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
@ -253,6 +306,8 @@ try { @@ -253,6 +306,8 @@ try {
if (empty($match[1])) {
$db->commit();
continue;
}
@ -262,7 +317,7 @@ try { @@ -262,7 +317,7 @@ try {
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
$url = $queueHostPage->scheme . '://' .
$queueHostPage->name .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
}
@ -332,6 +387,8 @@ try { @@ -332,6 +387,8 @@ try {
// When page is root, skip next operations
if ($hostPageURI->string == '/') {
$db->commit();
continue;
}
}
@ -373,6 +430,8 @@ try { @@ -373,6 +430,8 @@ try {
}
// Skip other this page actions
$db->commit();
continue;
}
@ -386,6 +445,8 @@ try { @@ -386,6 +445,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -418,6 +479,8 @@ try { @@ -418,6 +479,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -428,6 +491,8 @@ try { @@ -428,6 +491,8 @@ try {
// This case possible for multimedia/streaming resources index
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
}
@ -453,6 +518,8 @@ try { @@ -453,6 +518,8 @@ try {
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$db->commit();
continue;
} else {
@ -514,12 +581,12 @@ try { @@ -514,12 +581,12 @@ try {
$yggoManifestCRC32 = crc32($yggoManifest);
if (!$db->getManifest($yggoManifestCRC32)) {
$db->addManifest($yggoManifestCRC32,
$db->addManifest($yggoManifestCRC32,
$yggoManifest,
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
time());
$manifestsAdded++;
$manifestsAdded++;
}
}
@ -571,7 +638,7 @@ try { @@ -571,7 +638,7 @@ try {
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
// Create new ZIP container
$zip = new ZipArchive();
@ -581,10 +648,10 @@ try { @@ -581,10 +648,10 @@ try {
// Insert compressed snap data into the tmp storage
if (true === $zip->addFromString('DATA', $content) &&
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
sprintf('CRC32: %s', $crc32data . PHP_EOL .
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
// Done
$zip->close();
@ -656,7 +723,7 @@ try { @@ -656,7 +723,7 @@ try {
}
if (!$title = @$img->getAttribute('title')) {
$title = null;
$title = null;
}
// Skip encoded content
@ -718,7 +785,7 @@ try { @@ -718,7 +785,7 @@ try {
// Skip media without type attribute
if (!$type = @$video->getAttribute('type')) {
$type = 'video/*';
$type = 'video/*';
}
// Skip encoded content
@ -748,7 +815,7 @@ try { @@ -748,7 +815,7 @@ try {
// Skip media without type attribute
if (!$type = @$audio->getAttribute('type')) {
$type = 'audio/*';
$type = 'audio/*';
}
// Skip encoded content
@ -779,7 +846,7 @@ try { @@ -779,7 +846,7 @@ try {
// Get title attribute if available
if (!$title = @$a->getAttribute('title')) {
$title = null;
$title = null;
}
// Skip anchor links
@ -824,7 +891,7 @@ try { @@ -824,7 +891,7 @@ try {
if (!parse_url($link['ref'], PHP_URL_HOST)) {
$link['ref'] = $queueHostPage->scheme . '://' .
$queueHostPage->name .
$queueHostPage->name .
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
}
@ -928,35 +995,30 @@ try { @@ -928,35 +995,30 @@ try {
}
}
}
}
// Apply changes
$db->commit();
// Apply changes
$db->commit();
// Process update errors
} catch(Exception $e) {
// Process update errors
} catch (Exception $e) {
// Decline DB changes
$db->rollBack();
// Debug std
var_dump($e);
// Debug std
var_dump($e);
// Ban page that throws the data type error and stuck the crawl queue
if (!empty($queueHostPage->hostPageId) &&
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
$hostPagesProcessed++;
}
// Skip item
$db->rollBack();
// Ban page that throws the data type error and stuck the crawl queue
if (!empty($queueHostPage->hostPageId) &&
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
// Reset counters
$hostPagesProcessed = $hostPagesBanned;
$manifestsProcessed = 0;
$hostPagesIndexed = 0;
$manifestsAdded = 0;
$hostPagesAdded = 0;
$hostsAdded = 0;
$hostPagesSnapAdded = 0;
continue;
}
}

Loading…
Cancel
Save