mirror of
https://github.com/YGGverse/YGGo.git
synced 2025-01-24 13:34:25 +00:00
make transaction for each item in crawl queue
This commit is contained in:
parent
b585b16d31
commit
4b16b41440
@ -43,14 +43,24 @@ $hostPagesBanned = 0;
|
||||
$hostPagesSnapAdded = 0;
|
||||
|
||||
// Connect database
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
try {
|
||||
|
||||
// Process manifests crawl queue
|
||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
||||
$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD);
|
||||
|
||||
} catch(Exception $e) {
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Process manifests crawl queue
|
||||
foreach ($db->getManifestCrawlQueue(CRAWL_MANIFEST_LIMIT, time() - CRAWL_MANIFEST_SECONDS_OFFSET) as $queueManifest) {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
try {
|
||||
|
||||
$curl = new Curl($queueManifest->url, CRAWL_CURLOPT_USERAGENT);
|
||||
|
||||
@ -66,18 +76,24 @@ try {
|
||||
// Skip processing on a non-200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifest = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on JSON decoding error
|
||||
if (!$remoteManifest = @json_decode($remoteManifest)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -87,30 +103,40 @@ try {
|
||||
empty($remoteManifest->result->api->version) ||
|
||||
empty($remoteManifest->result->api->hosts)) {
|
||||
|
||||
continue;
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing when the API version is not compatible
|
||||
if ($remoteManifest->result->api->version !== CRAWL_MANIFEST_API_VERSION) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing when the host API is not available
|
||||
if (!$remoteManifest->result->api->hosts) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing when crawlUrlRegexp does not match CRAWL_URL_REGEXP
|
||||
if ($remoteManifest->result->config->crawlUrlRegexp !== CRAWL_URL_REGEXP) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on host link does not match condition
|
||||
if (false === preg_match(CRAWL_URL_REGEXP, $remoteManifest->result->api->hosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -126,18 +152,24 @@ try {
|
||||
// Skip processing on a non-200 code
|
||||
if (200 != $curl->getCode()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing without returned data
|
||||
if (!$remoteManifestHosts = $curl->getContent()) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip processing on JSON decoding error
|
||||
if (!$remoteManifestHosts = @json_decode($remoteManifestHosts)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -145,6 +177,8 @@ try {
|
||||
if (empty($remoteManifestHosts->status) ||
|
||||
empty($remoteManifestHosts->result)) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -159,7 +193,7 @@ try {
|
||||
}
|
||||
|
||||
$hostURL = $remoteManifestHost->scheme . '://' .
|
||||
$remoteManifestHost->name .
|
||||
$remoteManifestHost->name .
|
||||
(!empty($remoteManifestHost->port) ? ':' . $remoteManifestHost->port : false);
|
||||
|
||||
// Validate formatted link
|
||||
@ -212,10 +246,29 @@ try {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
|
||||
// Process update errors
|
||||
} catch (Exception $e) {
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
// Skip item
|
||||
$db->rollBack();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Process pages crawl queue
|
||||
foreach ($db->getHostPageCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) {
|
||||
|
||||
$db->beginTransaction();
|
||||
|
||||
try {
|
||||
|
||||
// Build URL from the DB
|
||||
$queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . $queueHostPage->uri;
|
||||
@ -253,6 +306,8 @@ try {
|
||||
|
||||
if (empty($match[1])) {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -262,7 +317,7 @@ try {
|
||||
if (!parse_url($url, PHP_URL_HOST)) { // @TODO probably, case not in use
|
||||
|
||||
$url = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $url), '/'), '.');
|
||||
}
|
||||
@ -332,6 +387,8 @@ try {
|
||||
// When page is root, skip next operations
|
||||
if ($hostPageURI->string == '/') {
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -373,6 +430,8 @@ try {
|
||||
}
|
||||
|
||||
// Skip the remaining actions for this page
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -386,6 +445,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -418,6 +479,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -428,6 +491,8 @@ try {
|
||||
// This case possible for multimedia/streaming resources index
|
||||
// $hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -453,6 +518,8 @@ try {
|
||||
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$db->commit();
|
||||
|
||||
continue;
|
||||
|
||||
} else {
|
||||
@ -514,12 +581,12 @@ try {
|
||||
$yggoManifestCRC32 = crc32($yggoManifest);
|
||||
|
||||
if (!$db->getManifest($yggoManifestCRC32)) {
|
||||
$db->addManifest($yggoManifestCRC32,
|
||||
$db->addManifest($yggoManifestCRC32,
|
||||
$yggoManifest,
|
||||
(string) CRAWL_MANIFEST_DEFAULT_STATUS,
|
||||
time());
|
||||
|
||||
$manifestsAdded++;
|
||||
$manifestsAdded++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -571,7 +638,7 @@ try {
|
||||
$snapPath = chunk_split($queueHostPage->hostPageId, 1, '/');
|
||||
|
||||
$snapTmp = '../storage/tmp/snap/hp/' . $snapPath . $snapTime . '.zip';
|
||||
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
|
||||
@mkdir('../storage/tmp/snap/hp/' . $snapPath, 0755, true);
|
||||
|
||||
// Create new ZIP container
|
||||
$zip = new ZipArchive();
|
||||
@ -581,10 +648,10 @@ try {
|
||||
// Insert compressed snap data into the tmp storage
|
||||
if (true === $zip->addFromString('DATA', $content) &&
|
||||
true === $zip->addFromString('META', sprintf('TIMESTAMP: %s', $snapTime) . PHP_EOL .
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
sprintf('CRC32: %s', $crc32data . PHP_EOL .
|
||||
sprintf('MIME: %s', Filter::mime($contentType)) . PHP_EOL .
|
||||
sprintf('SOURCE: %s', Filter::url(WEBSITE_DOMAIN . '/explore.php?hp=' . $queueHostPage->hostPageId)) . PHP_EOL .
|
||||
sprintf('TARGET: %s', Filter::url($queueHostPageURL))))) {
|
||||
|
||||
// Done
|
||||
$zip->close();
|
||||
@ -656,7 +723,7 @@ try {
|
||||
}
|
||||
|
||||
if (!$title = @$img->getAttribute('title')) {
|
||||
$title = null;
|
||||
$title = null;
|
||||
}
|
||||
|
||||
// Skip encoded content
|
||||
@ -718,7 +785,7 @@ try {
|
||||
|
||||
// Skip media without type attribute
|
||||
if (!$type = @$video->getAttribute('type')) {
|
||||
$type = 'video/*';
|
||||
$type = 'video/*';
|
||||
}
|
||||
|
||||
// Skip encoded content
|
||||
@ -748,7 +815,7 @@ try {
|
||||
|
||||
// Skip media without type attribute
|
||||
if (!$type = @$audio->getAttribute('type')) {
|
||||
$type = 'audio/*';
|
||||
$type = 'audio/*';
|
||||
}
|
||||
|
||||
// Skip encoded content
|
||||
@ -779,7 +846,7 @@ try {
|
||||
|
||||
// Get title attribute if available
|
||||
if (!$title = @$a->getAttribute('title')) {
|
||||
$title = null;
|
||||
$title = null;
|
||||
}
|
||||
|
||||
// Skip anchor links
|
||||
@ -824,7 +891,7 @@ try {
|
||||
if (!parse_url($link['ref'], PHP_URL_HOST)) {
|
||||
|
||||
$link['ref'] = $queueHostPage->scheme . '://' .
|
||||
$queueHostPage->name .
|
||||
$queueHostPage->name .
|
||||
($queueHostPage->port ? ':' . $queueHostPage->port : '') .
|
||||
'/' . trim(ltrim(str_replace(['./', '../'], '', $link['ref']), '/'), '.');
|
||||
}
|
||||
@ -928,35 +995,30 @@ try {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
// Apply changes
|
||||
$db->commit();
|
||||
|
||||
// Process update errors
|
||||
} catch(Exception $e) {
|
||||
// Process update errors
|
||||
} catch (Exception $e) {
|
||||
|
||||
// Decline DB changes
|
||||
$db->rollBack();
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
|
||||
// Debug std
|
||||
var_dump($e);
|
||||
// Ban the page that throws a data type error and blocks the crawl queue
|
||||
if (!empty($queueHostPage->hostPageId) &&
|
||||
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
|
||||
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
|
||||
|
||||
// Ban the page that throws a data type error and blocks the crawl queue
|
||||
if (!empty($queueHostPage->hostPageId) &&
|
||||
!empty($e->errorInfo[0]) && in_array($e->errorInfo[0], ['HY000']) &&
|
||||
!empty($e->errorInfo[1]) && in_array($e->errorInfo[1], [1366])) { // @TODO
|
||||
$hostPagesBanned += $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
|
||||
$hostPagesBanned = $db->updateHostPageTimeBanned($queueHostPage->hostPageId, time());
|
||||
$hostPagesProcessed++;
|
||||
}
|
||||
|
||||
// Reset counters
|
||||
$hostPagesProcessed = $hostPagesBanned;
|
||||
$manifestsProcessed = 0;
|
||||
$hostPagesIndexed = 0;
|
||||
$manifestsAdded = 0;
|
||||
$hostPagesAdded = 0;
|
||||
$hostsAdded = 0;
|
||||
$hostPagesSnapAdded = 0;
|
||||
// Skip item
|
||||
$db->rollBack();
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user