add audio/video media crawl support

This commit is contained in:
ghost 2023-05-13 01:23:09 +03:00
parent 89d1b2230b
commit 28e8bcf8d7
2 changed files with 100 additions and 2 deletions

View File

@ -168,7 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated
*
*/
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');
// Document and image types first, followed by the video and audio types.
// Every entry is a bare "type/subtype" pair: a leading slash (as in the former
// "/video/webm") produces a value that never equals the real media type.
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml,video/mp4,video/ogg,video/webm,audio/mpeg,audio/ogg,audio/wav,audio/mp4,audio/aac,audio/aacp,audio/webm,audio/x-caf,audio/flac');
/*
* Renew manifests index by timing offset provided

View File

@ -417,6 +417,99 @@ try {
'description' => null,
'keywords' => Filter::pageKeywords($alt . ($title ? ',' . $title : '')),
'data' => null,
'mime' => null,
'ref' => $src,
];
}
// Collect media links
//
// <source>, <video> and <audio> elements are processed by one routine.
// The map value is the MIME type used when the element carries no "type"
// attribute; null means there is no fallback and the element is skipped.
// Tags are visited in the listed order, so queue order is: source, video, audio.
$mediaTags = [
    'source' => null,
    'video'  => 'video/*',
    'audio'  => 'audio/*',
];

foreach ($mediaTags as $mediaTag => $mediaTypeFallback) {

    foreach (@$dom->getElementsByTagName($mediaTag) as $media) {

        // Skip media without src attribute
        if (!$src = @$media->getAttribute('src')) {
            continue;
        }

        // No type attribute: use the per-tag fallback, or skip when there is none
        if (!$type = @$media->getAttribute('type')) {

            if (null === $mediaTypeFallback) {
                continue;
            }

            $type = $mediaTypeFallback;
        }

        // Skip encoded content
        //
        // Only a leading "data:" scheme marks an inline payload; a plain
        // substring search would also drop ordinary URLs that merely contain
        // "data:" somewhere in their path or query string.
        if (0 === strncasecmp(ltrim($src), 'data:', 5)) {
            continue;
        }

        // Add link to queue
        $links[] = [
            'title'       => null,
            'description' => null,
            'keywords'    => null,
            'data'        => null,
            'mime'        => Filter::mime($type),
            'ref'         => $src,
        ];
    }
}
@ -465,6 +558,7 @@ try {
'description' => null,
'keywords' => Filter::pageKeywords($title),
'data' => null,
'mime' => null,
'ref' => $href,
];
}
@ -571,7 +665,11 @@ try {
$link['description'],
$link['keywords'],
$hostMetaOnly ? null : ($link['data'] ? base64_encode($link['data']) : null),
time());
time(),
null,
null,
null,
$link['mime']);
$hostPagesAdded++;
}