|
|
|
@ -43,14 +43,24 @@ $hostPagesBanned = 0;
@@ -43,14 +43,24 @@ $hostPagesBanned = 0;
|
|
|
|
|
$hostPagesSnapAdded = 0; |
|
|
|
|
|
|
|
|
|
// Connect database |
|
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); |
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
} catch(Exception $e) { |
|
|
|
|
|
|
|
|
|
// Process manifests crawl queue |
|
|
|
|
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { |
|
|
|
|
// Debug std |
|
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
exit; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Process manifests crawl queue |
|
|
|
|
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) { |
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT); |
|
|
|
|
|
|
|
|
@ -66,18 +76,24 @@ try {
@@ -66,18 +76,24 @@ try {
|
|
|
|
|
// Skip processing non 200 code |
|
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing without returned data |
|
|
|
|
if (!$remoteManifest = $curl->getContent()) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on json encoding error |
|
|
|
|
if (!$remoteManifest = @json_decode($remoteManifest)) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -87,30 +103,40 @@ try {
@@ -87,30 +103,40 @@ try {
|
|
|
|
|
empty($remoteManifest->result->api->version) || |
|
|
|
|
empty($remoteManifest->result->api->hosts)) { |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on API version not compatible |
|
|
|
|
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on host API not available |
|
|
|
|
if (!$remoteManifest->result->api->hosts) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on crawlUrlRegexp does not match CRAWL_URL_REGEXP condition |
|
|
|
|
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on host link does not match condition |
|
|
|
|
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -126,18 +152,24 @@ try {
@@ -126,18 +152,24 @@ try {
|
|
|
|
|
// Skip processing non 200 code |
|
|
|
|
if (200 != $curl->getCode()) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing without returned data |
|
|
|
|
if (!$remoteManifestHosts = $curl->getContent()) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip processing on json encoding error |
|
|
|
|
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -145,6 +177,8 @@ try {
@@ -145,6 +177,8 @@ try {
|
|
|
|
|
if (empty($remoteManifestHosts->status) || |
|
|
|
|
empty($remoteManifestHosts->result)) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -159,7 +193,7 @@ try {
@@ -159,7 +193,7 @@ try {
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
$hostURL = $remoteManifestHost->scheme . '://' . |
|
|
|
|
$remoteManifestHost->name . |
|
|
|
|
$remoteManifestHost->name . |
|
|
|
|
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false); |
|
|
|
|
|
|
|
|
|
// Validate formatted link |
|
|
|
@ -212,10 +246,29 @@ try {
@@ -212,10 +246,29 @@ try {
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Apply changes |
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
// Process update errors |
|
|
|
|
} catch (Exception $e) { |
|
|
|
|
|
|
|
|
|
// Debug std |
|
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
// Skip item |
|
|
|
|
$db->rollBack(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Process pages crawl queue |
|
|
|
|
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { |
|
|
|
|
// Process pages crawl queue |
|
|
|
|
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { |
|
|
|
|
|
|
|
|
|
$db->beginTransaction(); |
|
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
|
|
// Build URL from the DB |
|
|
|
|
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri; |
|
|
|
@ -253,6 +306,8 @@ try {
@@ -253,6 +306,8 @@ try {
|
|
|
|
|
|
|
|
|
|
if (empty($match[1])) { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -262,7 +317,7 @@ try {
@@ -262,7 +317,7 @@ try {
|
|
|
|
|
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use |
|
|
|
|
|
|
|
|
|
$url = $queueHostPage->scheme . '://' . |
|
|
|
|
$queueHostPage->name . |
|
|
|
|
$queueHostPage->name . |
|
|
|
|
($queueHostPage->port ? ':' . $queueHostPage->port : '') . |
|
|
|
|
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.'); |
|
|
|
|
} |
|
|
|
@ -332,6 +387,8 @@ try {
@@ -332,6 +387,8 @@ try {
|
|
|
|
|
// When page is root, skip next operations |
|
|
|
|
if ($hostPageURI->string == '/') { |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -373,6 +430,8 @@ try {
@@ -373,6 +430,8 @@ try {
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip other this page actions |
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -386,6 +445,8 @@ try {
@@ -386,6 +445,8 @@ try {
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -418,6 +479,8 @@ try {
@@ -418,6 +479,8 @@ try {
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -428,6 +491,8 @@ try {
@@ -428,6 +491,8 @@ try {
|
|
|
|
|
// This case possible for multimedia/streaming resources index |
|
|
|
|
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -453,6 +518,8 @@ try {
@@ -453,6 +518,8 @@ try {
|
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
continue; |
|
|
|
|
|
|
|
|
|
} else { |
|
|
|
@ -514,12 +581,12 @@ try {
@@ -514,12 +581,12 @@ try {
|
|
|
|
|
$yggoManifestCRC32 = crc32($yggoManifest); |
|
|
|
|
|
|
|
|
|
if (!$db->getManifest($yggoManifestCRC32)) { |
|
|
|
|
$db->addManifest($yggoManifestCRC32, |
|
|
|
|
$db->addManifest($yggoManifestCRC32, |
|
|
|
|
$yggoManifest, |
|
|
|
|
(string) CRAWL_MANIFEST_DEFAULT_STATUS, |
|
|
|
|
time()); |
|
|
|
|
|
|
|
|
|
$manifestsAdded++; |
|
|
|
|
$manifestsAdded++; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -571,7 +638,7 @@ try {
@@ -571,7 +638,7 @@ try {
|
|
|
|
|
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/'); |
|
|
|
|
|
|
|
|
|
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip'; |
|
|
|
|
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true); |
|
|
|
|
|
|
|
|
|
// Create new ZIP container |
|
|
|
|
$zip = new ZipArchive(); |
|
|
|
@ -581,10 +648,10 @@ try {
@@ -581,10 +648,10 @@ try {
|
|
|
|
|
// Insert compressed snap data into the tmp storage |
|
|
|
|
if (true === $zip->addFromString('DATA', $content) && |
|
|
|
|
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL . |
|
|
|
|
sprintf('CRC32: %s', $crc32data . PHP_EOL . |
|
|
|
|
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . |
|
|
|
|
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . |
|
|
|
|
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { |
|
|
|
|
sprintf('CRC32: %s', $crc32data . PHP_EOL . |
|
|
|
|
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL . |
|
|
|
|
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL . |
|
|
|
|
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) { |
|
|
|
|
|
|
|
|
|
// Done |
|
|
|
|
$zip->close(); |
|
|
|
@ -656,7 +723,7 @@ try {
@@ -656,7 +723,7 @@ try {
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (!$title = @$img->getAttribute('title')) { |
|
|
|
|
$title = null; |
|
|
|
|
$title = null; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip encoded content |
|
|
|
@ -718,7 +785,7 @@ try {
@@ -718,7 +785,7 @@ try {
|
|
|
|
|
|
|
|
|
|
// Skip media without type attribute |
|
|
|
|
if (!$type = @$video->getAttribute('type')) { |
|
|
|
|
$type = 'video/*'; |
|
|
|
|
$type = 'video/*'; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip encoded content |
|
|
|
@ -748,7 +815,7 @@ try {
@@ -748,7 +815,7 @@ try {
|
|
|
|
|
|
|
|
|
|
// Skip media without type attribute |
|
|
|
|
if (!$type = @$audio->getAttribute('type')) { |
|
|
|
|
$type = 'audio/*'; |
|
|
|
|
$type = 'audio/*'; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip encoded content |
|
|
|
@ -779,7 +846,7 @@ try {
@@ -779,7 +846,7 @@ try {
|
|
|
|
|
|
|
|
|
|
// Get title attribute if available |
|
|
|
|
if (!$title = @$a->getAttribute('title')) { |
|
|
|
|
$title = null; |
|
|
|
|
$title = null; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip anchor links |
|
|
|
@ -824,7 +891,7 @@ try {
@@ -824,7 +891,7 @@ try {
|
|
|
|
|
if (!parse_url($link['ref'], PHP_URL_HOST)) { |
|
|
|
|
|
|
|
|
|
$link['ref'] = $queueHostPage->scheme . '://' . |
|
|
|
|
$queueHostPage->name . |
|
|
|
|
$queueHostPage->name . |
|
|
|
|
($queueHostPage->port ? ':' . $queueHostPage->port : '') . |
|
|
|
|
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.'); |
|
|
|
|
} |
|
|
|
@ -928,35 +995,30 @@ try {
@@ -928,35 +995,30 @@ try {
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Apply changes |
|
|
|
|
$db->commit(); |
|
|
|
|
// Apply changes |
|
|
|
|
$db->commit(); |
|
|
|
|
|
|
|
|
|
// Process update errors |
|
|
|
|
} catch(Exception $e) { |
|
|
|
|
// Process update errors |
|
|
|
|
} catch (Exception $e) { |
|
|
|
|
|
|
|
|
|
// Decline DB changes |
|
|
|
|
$db->rollBack(); |
|
|
|
|
// Debug std |
|
|
|
|
var_dump($e); |
|
|
|
|
|
|
|
|
|
// Debug std |
|
|
|
|
var_dump($e); |
|
|
|
|
// Ban page that throws the data type error and stuck the crawl queue |
|
|
|
|
if (!empty($queueHostPage->hostPageId) && |
|
|
|
|
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) && |
|
|
|
|
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO |
|
|
|
|
|
|
|
|
|
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
$hostPagesProcessed++; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Skip item |
|
|
|
|
$db->rollBack(); |
|
|
|
|
|
|
|
|
|
// Ban page that throws the data type error and stuck the crawl queue |
|
|
|
|
if (!empty($queueHostPage->hostPageId) && |
|
|
|
|
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) && |
|
|
|
|
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO |
|
|
|
|
|
|
|
|
|
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time()); |
|
|
|
|
|
|
|
|
|
// Reset counters |
|
|
|
|
$hostPagesProcessed = $hostPagesBanned; |
|
|
|
|
$manifestsProcessed = 0; |
|
|
|
|
$hostPagesIndexed = 0; |
|
|
|
|
$manifestsAdded = 0; |
|
|
|
|
$hostPagesAdded = 0; |
|
|
|
|
$hostsAdded = 0; |
|
|
|
|
$hostPagesSnapAdded = 0; |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|