From 7dfc800a679f8e48a005a3769401c17c649d3807 Mon Sep 17 00:00:00 2001 From: ghost Date: Sun, 19 Nov 2023 23:00:51 +0200 Subject: [PATCH] initial commit --- .gitignore | 2 + README.md | 41 +++++- composer.json | 21 +++ src/cli/document/add.php | 57 ++++++++ src/cli/document/crawl.php | 269 ++++++++++++++++++++++++++++++++++++ src/cli/document/search.php | 34 +++++ src/cli/index/init.php | 93 +++++++++++++ src/config.json | 68 +++++++++ src/nodes.json | 0 9 files changed, 583 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 composer.json create mode 100644 src/cli/document/add.php create mode 100644 src/cli/document/crawl.php create mode 100644 src/cli/document/search.php create mode 100644 src/cli/index/init.php create mode 100644 src/config.json create mode 100644 src/nodes.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..55940e5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/vendor/ +composer.lock \ No newline at end of file diff --git a/README.md b/README.md index 2bd1d2e..1b1e60e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,39 @@ -# Yo -Tiny web crawler in PHP & Manticore +# Yo! + +Micro Web Crawler in PHP & Manticore + +## CLI + +### Index + +#### Init + +Create initial index + +``` +php src/cli/index/init.php [reset] +``` +* `reset` - optional, reset existing index + +### Document + +#### Add + +``` +php src/cli/document/add.php URL +``` +* `URL` - add new URL to the crawl queue + +#### Crawl + +``` +php src/cli/document/crawl.php +``` + +#### Search + +``` +php src/cli/document/search.php '@title "*"' [limit] +``` +* `query` - required +* `limit` - optional search results limit \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..8f72c1c --- /dev/null +++ b/composer.json @@ -0,0 +1,21 @@ +{ + "name": "yggverse/yo", + "description": "Yo! 
Micro Web Crawler in PHP & Manticore", + "type": "project", + "license": "MIT", + "autoload": { + "psr-4": { + "Yggverse\\Yo\\": "src/" + } + }, + "authors": [ + { + "name": "YGGverse" + } + ], + "require": { + "manticoresoftware/manticoresearch-php": "^3.1", + "symfony/css-selector": "^6.3", + "symfony/dom-crawler": "^6.3" + } +} diff --git a/src/cli/document/add.php b/src/cli/document/add.php new file mode 100644 index 0000000..58170ec --- /dev/null +++ b/src/cli/document/add.php @@ -0,0 +1,57 @@ + $config->manticore->server->host, + 'port' => $config->manticore->server->port, + ] +); + +// Init index +$index = $client->index( + $config->manticore->index->document +); + +// Check URL for exist +$result = $index->search('@url "' . $argv[1] . '"') + ->limit(1) + ->get(); + +if ($result->getTotal()) +{ + echo sprintf( + 'URL "%s" already exists in "%s" index!' . PHP_EOL, + $argv[1], + $config->manticore->index->document + ); + + exit; +} + +// Add +$result = $index->addDocument( + [ + 'url' => $argv[1] + ] +); + +echo sprintf( + 'URL "%s" added to "%s" index: %s' . 
PHP_EOL, + $argv[1], + $config->manticore->index->document, + print_r( + $result, + true + ) +); diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php new file mode 100644 index 0000000..7de5856 --- /dev/null +++ b/src/cli/document/crawl.php @@ -0,0 +1,269 @@ + $config->manticore->server->host, + 'port' => $config->manticore->server->port, + ] +); + +// Init search +$search = new \Manticoresearch\Search( + $client +); + +$search->setIndex( + $config->manticore->index->document +); + +$search->match( + '*', + 'url' +); + +$search->sort( + 'time', + 'asc' +); + +$search->limit( + $config->cli->document->crawl->queue->limit +); + +// Init index +$index = $client->index( + $config->manticore->index->document +); + +// Begin queue +foreach($search->get() as $document) +{ + // Update index time + $index->updateDocument( + [ + 'time' => time() + ], + $document->getId() + ); + + // Request remote URL + $request = curl_init( + $document->get('url') + ); + + curl_setopt( + $request, + CURLOPT_RETURNTRANSFER, + true + ); + + if ($response = curl_exec($request)) + { + // Update HTTP code + if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE)) + { + $index->updateDocument( + [ + 'code' => $code + ], + $document->getId() + ); + + } else continue; + + // Update size + if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD)) + { + $index->updateDocument( + [ + 'size' => $size + ], + $document->getId() + ); + + } else continue; + + // Update MIME type + if ($mime = curl_getinfo($request, CURLINFO_CONTENT_TYPE)) + { + $index->updateDocument( + [ + 'mime' => $mime + ], + $document->getId() + ); + + } else continue; + + // DOM crawler + if (false !== stripos($mime, 'text/html')) + { + $crawler = new Symfony\Component\DomCrawler\Crawler(); + $crawler->addHtmlContent( + $response + ); + + // Get title + $title = ''; + foreach ($crawler->filter('head > title')->each(function($node) { + + return $node->text(); + + }) as $value) { + + $title = html_entity_decode( + 
$value + ); + } + + // Get description + $description = ''; + foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) { + + return $node->attr('content'); + + }) as $value) { + + $description = html_entity_decode( + $value + ); + } + + // Get keywords + $keywords = ''; + foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) { + + return $node->attr('content'); + + }) as $value) { + + $keywords = html_entity_decode( + $value + ); + } + + // Replace document + // https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916 + $index->replaceDocument( + [ + 'url' => $document->get('url'), + 'title' => $title, + 'description' => $description, + 'keywords' => $keywords, + 'code' => $code, + 'size' => $size, + 'mime' => $mime, + 'time' => time(), + ], + $document->getId() + ); + + // Crawl documents + $documents = []; + + $scheme = parse_url($document->get('url'), PHP_URL_SCHEME); + $host = parse_url($document->get('url'), PHP_URL_HOST); + $port = parse_url($document->get('url'), PHP_URL_PORT); + + foreach ($config->cli->document->crawl->selector as $selector => $settings) + { + foreach ($crawler->filter($selector)->each(function($node) { + + return $node; + + }) as $value) { + + if ($url = $value->attr($settings->attribute)) + { + //Make relative links absolute + if (!parse_url($url, PHP_URL_HOST)) + { + $url = $scheme . '://' . $host . ($port ? ':' . $port : null) . + '/' . + trim( + ltrim( + str_replace( + [ + './', + '../' + ], + '', + $url + ), + '/' + ), + '.' + ); + } + + // Regex rules + if (!preg_match($settings->regex, $url)) + { + continue; + } + + // External host rules + if (!$settings->external && parse_url($url, PHP_URL_HOST) != $host) + { + continue; + } + + $documents[] = $url; + } + } + } + + if ($documents) + { + foreach (array_unique($documents) as $url) + { + if (!$index->search('@url "' . $url . 
'"') + ->limit(1) + ->get() + ->getTotal()) + { + $index->addDocument( + [ + 'url' => $url + ] + ); + + echo sprintf( + 'add "%s" to "%s"' . PHP_EOL, + $url, + $config->manticore->index->document + ); + } + } + } + } + } + + echo sprintf( + 'index "%s" updated: %s' . PHP_EOL, + $config->manticore->index->document, + print_r( + $document, + true + ) + ); +} \ No newline at end of file diff --git a/src/cli/document/search.php b/src/cli/document/search.php new file mode 100644 index 0000000..f990cdb --- /dev/null +++ b/src/cli/document/search.php @@ -0,0 +1,34 @@ + $config->manticore->server->host, + 'port' => $config->manticore->server->port, + ] +); + +// Init index +$index = $client->index( + $config->manticore->index->document +); + +// Search (the limit argument is optional and falls back to 10 when omitted or empty) +foreach($index->search($argv[1]) + ->limit(empty($argv[2]) ? 10 : (int) $argv[2]) + ->get() as $result) +{ + var_dump( + $result + ); +} diff --git a/src/cli/index/init.php b/src/cli/index/init.php new file mode 100644 index 0000000..fe992c3 --- /dev/null +++ b/src/cli/index/init.php @@ -0,0 +1,93 @@ + $config->manticore->server->host, + 'port' => $config->manticore->server->port, + ] +); + +// Init index +$index = $client->index( + $config->manticore->index->document +); + +// Request options +if (isset($argv[1])) +{ + switch ($argv[1]) + { + case 'reset': + + $result = $index->drop(true); + + echo sprintf( + 'index "%s" deleted: %s' . PHP_EOL, + $config->manticore->index->document, + print_r( + $result, + true + ) + ); + + break; + } +} + +// Init index +$result = $index->create( + [ + 'url' => + [ + 'type' => 'text' + ], + 'title' => + [ + 'type' => 'text' + ], + 'description' => + [ + 'type' => 'text' + ], + 'keywords' => + [ + 'type' => 'text' + ], + 'mime' => + [ + 'type' => 'text' + ], + 'code' => + [ + 'type' => 'integer' + ], + 'size' => + [ + 'type' => 'integer' + ], + 'time' => + [ + 'type' => 'integer' + ] + ] +); + +echo sprintf( + 'index "%s" created: %s' . 
PHP_EOL, + $config->manticore->index->document, + print_r( + $result, + true + ) +); \ No newline at end of file diff --git a/src/config.json b/src/config.json new file mode 100644 index 0000000..4dbc2a7 --- /dev/null +++ b/src/config.json @@ -0,0 +1,68 @@ +{ + "manticore": + { + "server": + { + "host":"127.0.0.1", + "port":9308 + }, + "index": + { + "document":"yo_document" + } + }, + "cli": + { + "document": + { + "crawl": + { + "queue": + { + "limit":1 + }, + "selector": + { + "a:not([rel=nofollow])": + { + "attribute":"href", + "external":false, + "regex":"/.*/ui" + }, + "img": + { + "attribute":"src", + "external":true, + "regex":"/.*/ui" + }, + "audio": + { + "attribute":"src", + "external":false, + "regex":"/.*/ui" + }, + "video": + { + "attribute":"src", + "external":false, + "regex":"/.*/ui" + }, + "script": + { + "attribute":"src", + "external":false, + "regex":"/.*/ui" + } + } + }, + "snap": + { + "mime": + [ + "text/html", + "image/webp" + ] + } + } + } +} \ No newline at end of file diff --git a/src/nodes.json b/src/nodes.json new file mode 100644 index 0000000..e69de29