initial commit

This commit is contained in:
ghost 2023-11-19 23:00:51 +02:00
parent a5fabf2381
commit 7dfc800a67
9 changed files with 583 additions and 2 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/vendor/
composer.lock

View File

@ -1,2 +1,39 @@
# Yo # Yo!
Tiny web crawler in PHP & Manticore
Micro Web Crawler in PHP & Manticore
## CLI
### Index
#### Init
Create initial index
```
php src/cli/index/init.php [reset]
```
* `reset` - optional, reset existing index
### Document
#### Add
```
php src/cli/document/add.php URL
```
* `URL` - required, URL to add to the crawl queue
#### Crawl
```
php src/cli/document/crawl.php
```
#### Search
```
php src/cli/document/search.php '@title "*"' [limit]
```
* `query` - required, full-text search query passed as the first argument (e.g. `'@title "*"'`)
* `limit` - optional search results limit

21
composer.json Normal file
View File

@ -0,0 +1,21 @@
{
"name": "yggverse/yo",
"description": "Yo! Micro Web Crawler in PHP & Manticore",
"type": "project",
"license": "MIT",
"autoload": {
"psr-4": {
"Yggverse\\Yo\\": "src/"
}
},
"authors": [
{
"name": "YGGverse"
}
],
"require": {
"manticoresoftware/manticoresearch-php": "^3.1",
"symfony/css-selector": "^6.3",
"symfony/dom-crawler": "^6.3"
}
}

57
src/cli/document/add.php Normal file
View File

@ -0,0 +1,57 @@
<?php

/*
 * CLI: add a URL to the crawl queue.
 *
 * Usage: php src/cli/document/add.php URL
 *
 * Looks the URL up in the configured Manticore index and, when no match is
 * found, adds a new document that carries only the `url` field.
 */

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// The URL argument is mandatory: without this guard PHP raises
// "Undefined array key 1" and a document with an empty URL gets queued
if (!isset($argv[1]) || '' === trim($argv[1]))
{
  fwrite(
    STDERR,
    'Usage: php src/cli/document/add.php URL' . PHP_EOL
  );

  exit(1);
}

$url = trim($argv[1]);

// Init config
$config = json_decode(
  file_get_contents(
    __DIR__ . '/../../config.json'
  )
);

// Init client
$client = new \Manticoresearch\Client(
  [
    'host' => $config->manticore->server->host,
    'port' => $config->manticore->server->port,
  ]
);

// Init index
$index = $client->index(
  $config->manticore->index->document
);

// Check URL for exist
// NOTE(review): the URL is placed into the query string unescaped, so query
// operators contained in it are interpreted by the search engine - confirm
// whether escaping is wanted here
$result = $index->search('@url "' . $url . '"')
                ->limit(1)
                ->get();

if ($result->getTotal())
{
  echo sprintf(
    'URL "%s" already exists in "%s" index!' . PHP_EOL,
    $url,
    $config->manticore->index->document
  );

  exit;
}

// Add
$result = $index->addDocument(
  [
    'url' => $url
  ]
);

echo sprintf(
  'URL "%s" added to "%s" index: %s' . PHP_EOL,
  $url,
  $config->manticore->index->document,
  print_r(
    $result,
    true
  )
);

269
src/cli/document/crawl.php Normal file
View File

@ -0,0 +1,269 @@
<?php

/*
 * CLI: process the crawl queue.
 *
 * Usage: php src/cli/document/crawl.php
 *
 * Takes the documents with the oldest `time` value (up to the configured
 * queue limit), downloads each URL with cURL, stores HTTP code, size and MIME
 * type, and for HTML responses also stores title / description / keywords and
 * queues the links matched by the configured selectors.
 */

// Prevent multi-thread execution
$semaphore = sem_get(crc32('yo.cli.document.crawl'), 1);

if (false === sem_acquire($semaphore, true))
{
  exit ('process execution locked by another thread!' . PHP_EOL);
}

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// Init config
$config = json_decode(
  file_get_contents(
    __DIR__ . '/../../config.json'
  )
);

// Init client
$client = new \Manticoresearch\Client(
  [
    'host' => $config->manticore->server->host,
    'port' => $config->manticore->server->port,
  ]
);

// Init search
$search = new \Manticoresearch\Search(
  $client
);

$search->setIndex(
  $config->manticore->index->document
);

$search->match(
  '*',
  'url'
);

$search->sort(
  'time',
  'asc'
);

$search->limit(
  $config->cli->document->crawl->queue->limit
);

// Init index
$index = $client->index(
  $config->manticore->index->document
);

/*
 * Resolve a link found in a document against the URL of that document.
 *
 * - links that carry their own scheme are returned unchanged when they have a
 *   host, and rejected otherwise (mailto:, javascript:, tel:, data: ...)
 * - protocol-relative links (//host/path) inherit the scheme of the document
 * - fragment-only and empty links are rejected, fragments are dropped
 * - root-relative and relative paths are resolved against the document URL,
 *   "." and ".." segments are collapsed instead of being deleted as text
 *
 * Returns the absolute URL, or null when the link is not crawlable.
 */
$resolve = static function (string $base, string $link): ?string
{
  // The fragment never changes which document is requested
  $link = explode('#', trim($link), 2)[0];

  if ('' === $link)
  {
    return null;
  }

  $scheme = parse_url($base, PHP_URL_SCHEME);
  $host   = parse_url($base, PHP_URL_HOST);
  $port   = parse_url($base, PHP_URL_PORT);

  if (!$scheme || !$host)
  {
    return null;
  }

  // Protocol-relative link
  if (str_starts_with($link, '//'))
  {
    return $scheme . ':' . $link;
  }

  // Link with its own scheme
  if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link))
  {
    return parse_url($link, PHP_URL_HOST) ? $link : null;
  }

  $authority = $scheme . '://' . $host . ($port ? ':' . $port : '');
  $basePath  = parse_url($base, PHP_URL_PATH) ?: '/';

  // Query-only link keeps the path of the document
  if ('?' === $link[0])
  {
    return $authority . $basePath . $link;
  }

  // Keep the query apart so dot segments are resolved in the path only
  $query    = '';
  $position = strpos($link, '?');

  if (false !== $position)
  {
    $query = substr($link, $position);
    $link  = substr($link, 0, $position);
  }

  if ('/' === $link[0])
  {
    $path = $link;
  }

  else
  {
    // Relative path: resolve against the directory of the document
    $slash = strrpos($basePath, '/');
    $path  = (false === $slash ? '/' : substr($basePath, 0, $slash + 1)) . $link;
  }

  // Collapse "." and ".." segments
  $parts    = explode('/', ltrim($path, '/'));
  $last     = count($parts) - 1;
  $segments = [];

  foreach ($parts as $i => $part)
  {
    if ('.' === $part || '..' === $part)
    {
      if ('..' === $part)
      {
        array_pop($segments);
      }

      // A trailing dot segment refers to a directory: keep the closing slash
      if ($i === $last)
      {
        $segments[] = '';
      }

      continue;
    }

    $segments[] = $part;
  }

  return $authority . '/' . implode('/', $segments) . $query;
};

// Begin queue
foreach ($search->get() as $document)
{
  $documentUrl = $document->get('url');

  // Update index time first, so the document moves to the end of the queue
  // even if the request below fails
  $index->updateDocument(
    [
      'time' => time()
    ],
    $document->getId()
  );

  // Request remote URL
  $request = curl_init(
    $documentUrl
  );

  curl_setopt(
    $request,
    CURLOPT_RETURNTRANSFER,
    true
  );

  if ($response = curl_exec($request))
  {
    // Update HTTP code
    if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE))
    {
      $index->updateDocument(
        [
          'code' => $code
        ],
        $document->getId()
      );

    } else continue;

    // Update size
    if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD))
    {
      $index->updateDocument(
        [
          'size' => $size
        ],
        $document->getId()
      );

    } else continue;

    // Update MIME type
    if ($mime = curl_getinfo($request, CURLINFO_CONTENT_TYPE))
    {
      $index->updateDocument(
        [
          'mime' => $mime
        ],
        $document->getId()
      );

    } else continue;

    // DOM crawler
    if (false !== stripos($mime, 'text/html'))
    {
      $crawler = new Symfony\Component\DomCrawler\Crawler();

      $crawler->addHtmlContent(
        $response
      );

      // Get title (the last matching node wins)
      $title = '';

      foreach ($crawler->filter('head > title')->each(function($node) {

        return $node->text();

      }) as $value) {

        $title = html_entity_decode(
          (string) $value
        );
      }

      // Get description
      // attr() gives null when the meta tag has no content attribute,
      // so cast before decoding
      $description = '';

      foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) {

        return $node->attr('content');

      }) as $value) {

        $description = html_entity_decode(
          (string) $value
        );
      }

      // Get keywords
      $keywords = '';

      foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) {

        return $node->attr('content');

      }) as $value) {

        $keywords = html_entity_decode(
          (string) $value
        );
      }

      // Replace document
      // https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916
      $index->replaceDocument(
        [
          'url'         => $documentUrl,
          'title'       => $title,
          'description' => $description,
          'keywords'    => $keywords,
          'code'        => $code,
          'size'        => $size,
          'mime'        => $mime,
          'time'        => time(),
        ],
        $document->getId()
      );

      // Crawl documents
      $documents = [];

      $host = (string) parse_url($documentUrl, PHP_URL_HOST);

      foreach ($config->cli->document->crawl->selector as $selector => $settings)
      {
        // Collect the configured attribute of every node matched by the selector
        $links = $crawler->filter($selector)->each(function($node) use ($settings) {

          return $node->attr($settings->attribute);
        });

        foreach ($links as $link)
        {
          if (!$link)
          {
            continue;
          }

          // Make relative links absolute, skip links that are not crawlable
          $url = $resolve($documentUrl, $link);

          if (null === $url)
          {
            continue;
          }

          // Regex rules
          if (!preg_match($settings->regex, $url))
          {
            continue;
          }

          // External host rules (host names are case-insensitive)
          if (!$settings->external && 0 !== strcasecmp((string) parse_url($url, PHP_URL_HOST), $host))
          {
            continue;
          }

          $documents[] = $url;
        }
      }

      // Queue the links that are not in the index yet
      // NOTE(review): the URL is placed into the query string unescaped -
      // confirm whether escaping is wanted here
      foreach (array_unique($documents) as $url)
      {
        if (!$index->search('@url "' . $url . '"')
                   ->limit(1)
                   ->get()
                   ->getTotal())
        {
          $index->addDocument(
            [
              'url' => $url
            ]
          );

          echo sprintf(
            'add "%s" to "%s"' . PHP_EOL,
            $url,
            $config->manticore->index->document
          );
        }
      }
    }
  }

  echo sprintf(
    'index "%s" updated: %s' . PHP_EOL,
    $config->manticore->index->document,
    print_r(
      $document,
      true
    )
  );
}

View File

@ -0,0 +1,34 @@
<?php

/*
 * CLI: search the document index.
 *
 * Usage: php src/cli/document/search.php QUERY [LIMIT]
 *
 * QUERY is passed to the index search as is; LIMIT is optional and falls back
 * to 10 when it is omitted or is not a positive number. Every result is
 * printed with var_dump().
 */

// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';

// The query argument is mandatory: without this guard PHP raises
// "Undefined array key 1" and an empty query is sent to the server
if (!isset($argv[1]) || '' === $argv[1])
{
  fwrite(
    STDERR,
    'Usage: php src/cli/document/search.php QUERY [LIMIT]' . PHP_EOL
  );

  exit(1);
}

// The limit argument is optional, so it must not be read unconditionally
$limit = isset($argv[2]) ? (int) $argv[2] : 10;

if ($limit < 1)
{
  $limit = 10;
}

// Init config
$config = json_decode(
  file_get_contents(
    __DIR__ . '/../../config.json'
  )
);

// Init client
$client = new \Manticoresearch\Client(
  [
    'host' => $config->manticore->server->host,
    'port' => $config->manticore->server->port,
  ]
);

// Init index
$index = $client->index(
  $config->manticore->index->document
);

// Search
foreach ($index->search($argv[1])
               ->limit($limit)
               ->get() as $result)
{
  var_dump(
    $result
  );
}

93
src/cli/index/init.php Normal file
View File

@ -0,0 +1,93 @@
<?php

/*
 * CLI: create the document index.
 *
 * Usage: php src/cli/index/init.php [reset]
 *
 * When the first argument is "reset" the index is dropped before it is
 * created. The index gets five text fields and three integer fields.
 */

// Bootstrap the composer autoloader
require_once __DIR__ . '/../../../vendor/autoload.php';

// Read the settings file located next to the cli directory
$json   = file_get_contents(__DIR__ . '/../../config.json');
$config = json_decode($json);

// Connect to the search server
$client = new \Manticoresearch\Client(
  [
    'host' => $config->manticore->server->host,
    'port' => $config->manticore->server->port,
  ]
);

// Handle of the configured index
$index = $client->index($config->manticore->index->document);

// Optional "reset" request: drop the index before creating it again
// NOTE(review): drop(true) presumably makes the drop silent when the index
// does not exist - confirm against the client documentation
if (isset($argv[1]) && 'reset' === $argv[1])
{
  $dropped = $index->drop(true);

  printf(
    'index "%s" deleted: %s' . PHP_EOL,
    $config->manticore->index->document,
    print_r($dropped, true)
  );
}

// Schema: text fields first, integer attributes after them
$schema = array_fill_keys(
  ['url', 'title', 'description', 'keywords', 'mime'],
  ['type' => 'text']
) + array_fill_keys(
  ['code', 'size', 'time'],
  ['type' => 'integer']
);

// Create the index and report the server response
$created = $index->create($schema);

printf(
  'index "%s" created: %s' . PHP_EOL,
  $config->manticore->index->document,
  print_r($created, true)
);

68
src/config.json Normal file
View File

@ -0,0 +1,68 @@
{
"manticore":
{
"server":
{
"host":"127.0.0.1",
"port":9308
},
"index":
{
"document":"yo_document"
}
},
"cli":
{
"document":
{
"crawl":
{
"queue":
{
"limit":1
},
"selector":
{
"a:not([rel=nofollow])":
{
"attribute":"href",
"external":false,
"regex":"/.*/ui"
},
"image":
{
"attribute":"src",
"external":true,
"regex":"/.*/ui"
},
"audio":
{
"attribute":"src",
"external":false,
"regex":"/.*/ui"
},
"video":
{
"attribute":"src",
"external":false,
"regex":"/.*/ui"
},
"script":
{
"attribute":"href",
"external":false,
"regex":"/.*/ui"
}
}
},
"snap":
{
"mime":
[
"text/html",
"image/webp"
]
}
}
}
}

0
src/nodes.json Normal file
View File