Browse Source

init sqlite implementation

main
yggverse 6 months ago
parent
commit
13ebc7bcbc
  1. 5
      .gitignore
  2. 30
      README.md
  3. 4
      composer.json
  4. 88
      config/example.json
  5. 23
      example/config.json
  6. 171
      src/Model/Database.php
  7. 69
      src/Model/Filter.php
  8. 422
      src/crawler.php

5
.gitignore vendored

@ -1,4 +1,7 @@ @@ -1,4 +1,7 @@
/composer.lock
/config.json
/config/*
!/config/example.json
/server/
/vendor/

30
README.md

@ -2,32 +2,18 @@ @@ -2,32 +2,18 @@
RSS Aggregator for [Gemini Protocol](https://geminiprotocol.net)
Simple RSS feed converter to static Gemtext format, useful for news portals or localhost reading
## Components
* [x] `src/crawler.php` - scan configured RSS feeds and dump results to SQLite (see also [FS branch](https://github.com/YGGverse/Pulsar/tree/fs))
* [ ] `src/nex.php` - Built-in server for [NEX Protocol](https://nightfall.city/nps/info/specification.txt)
* [ ] `src/gemini.php` - Built-in server for [Gemini Protocol](https://geminiprotocol.net)
## Example
* `nex://[301:23b4:991a:634d::feed]/index.gmi` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse
* `nex://[301:23b4:991a:634d::feed]` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse
## Usage
1. `git clone https://github.com/YGGverse/Pulsar.git`
2. `cp example/config.json config.json` - set up your feed locations
3. `php src/crawler.php` - grab feeds manually or using crontab
## Config
Configuration file supports multiple feed channels with custom settings:
* `source` - string, filepath or URL to the valid RSS feed
* `target` - string, relative or absolute path to Gemtext dumps
* `item`
* `limit` - integer, how many items to display on the generated page
* `template` - string, custom pattern for a feed item; supports the following macros:
* `{nl}` - new line separator
* `{link}` - item link
* `{guid}` - item guid
* `{pubDate}` - item pubDate, soon with custom time format e.g. `{pubDate:Y-m-d H:i}`
* `{title}` - item title
* `{description}` - item description
Resulting files can be placed in any local folder (for personal reading) or shared with others (using [gmid](https://github.com/omar-polo/gmid), [twins](https://code.rocket9labs.com/tslocum/twins) or any other [server](https://github.com/kr1sp1n/awesome-gemini#servers) for `gemtext` statics)
2. `cp config/example.json name.json` - set up your feed
3. `php src/crawler.php name.json` - grab feeds manually or using crontab

4
composer.json

@ -15,7 +15,5 @@ @@ -15,7 +15,5 @@
"name": "YGGverse"
}
],
"require": {
"yggverse/titan-ii": "^1.0"
}
"require": {}
}

88
config/example.json

@ -0,0 +1,88 @@ @@ -0,0 +1,88 @@
{
"database":
{
"location":"example.sqlite",
"username":null,
"password":null
},
"crawler":
{
"channel":
[
{
"source":"https://www.omglinux.com/feed",
"enabled":true,
"item":
{
"link":
{
"enabled":true,
"required":false
},
"pubDate":
{
"enabled":true,
"required":false
},
"title":
{
"enabled":true,
"required":false
},
"description":
{
"enabled":true,
"required":false
},
"content":{
"enabled":false,
"required":false
}
},
"debug":
{
"info":true,
"warning":true,
"error":true
}
},
{
"source":"https://omgubuntu.co.uk/feed",
"enabled":false,
"item":
{
"link":
{
"enabled":true,
"required":false
},
"pubDate":
{
"enabled":true,
"required":false
},
"title":
{
"enabled":true,
"required":false
},
"description":
{
"enabled":true,
"required":false
},
"content":{
"enabled":false,
"required":false
}
},
"debug":
{
"info":true,
"warning":true,
"error":true
}
}
]
}
}

23
example/config.json

@ -1,23 +0,0 @@ @@ -1,23 +0,0 @@
{
"feed":
[
{
"source":"https://www.omglinux.com/feed",
"target":"server/127.0.0.1/public/omglinux/feed.gmi",
"item":
{
"template":"=> {link} {title}{nl}{nl}{description}",
"limit":20
}
},
{
"source":"https://omgubuntu.co.uk/feed",
"target":"server/127.0.0.1/public/omgubuntu/feed.gmi",
"item":
{
"template":"=> {link} {title}{nl}{nl}{description}",
"limit":20
}
}
]
}

171
src/Model/Database.php

@ -0,0 +1,171 @@ @@ -0,0 +1,171 @@
<?php
declare(strict_types=1);
namespace Yggverse\Pulsar\Model;
/**
 * SQLite storage for crawled channels and their items.
 *
 * The constructor opens the database through PDO and makes sure the
 * `channel` and `channelItem` tables exist before any other query runs.
 */
class Database
{
    /**
     * Underlying connection; configured to throw on errors and to fetch
     * rows as objects.
     */
    public \PDO $_database;

    /**
     * @param string      $database Location of the SQLite database (appended to the `sqlite:` DSN)
     * @param string|null $username Passed through to the PDO constructor
     * @param string|null $password Passed through to the PDO constructor
     *
     * @throws \PDOException when the connection or the schema setup fails
     */
    public function __construct(
        string $database,
        ?string $username = null,
        ?string $password = null
    ) {
        $this->_database = new \PDO(
            sprintf(
                'sqlite:%s',
                $database
            ),
            $username,
            $password
        );

        $this->_database->setAttribute(
            \PDO::ATTR_ERRMODE,
            \PDO::ERRMODE_EXCEPTION
        );

        $this->_database->setAttribute(
            \PDO::ATTR_DEFAULT_FETCH_MODE,
            \PDO::FETCH_OBJ
        );

        // DDL returns no rows, so exec() is enough here
        $this->_database->exec('
            CREATE TABLE IF NOT EXISTS "channel"
            (
                "id"          INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                "time"        INTEGER NOT NULL,
                "source"      TEXT NOT NULL,
                "link"        TEXT,
                "title"       TEXT,
                "description" TEXT
            )
        ');

        $this->_database->exec('
            CREATE TABLE IF NOT EXISTS "channelItem"
            (
                "id"          INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                "channelId"   INTEGER NOT NULL,
                "time"        INTEGER NOT NULL,
                "pubTime"     INTEGER,
                "guid"        TEXT NOT NULL,
                "link"        TEXT,
                "title"       TEXT,
                "description" TEXT,
                "content"     TEXT
            )
        ');
    }

    /**
     * Find a channel by its exact source location.
     *
     * The comparison uses `=` rather than `LIKE`: sources are URLs or paths
     * that routinely contain `_` and `%`, which `LIKE` would interpret as
     * wildcards and so match a different channel.
     *
     * @return int|null Channel id, or null when no channel has this source
     */
    public function getChannelIdBySource(
        string $source
    ): ?int
    {
        $query = $this->_database->prepare(
            'SELECT `id` FROM `channel` WHERE `source` = :source LIMIT 1'
        );

        $query->execute(
            [
                ':source' => $source
            ]
        );

        $row = $query->fetch();

        if (false === $row)
        {
            return null;
        }

        // Explicit cast: the driver may hand the column back as a string,
        // which would violate the ?int return type under strict_types
        return (int) $row->id;
    }

    /**
     * Register a new channel.
     *
     * @param int|null $time Unix timestamp of the record; current time when null
     *
     * @return int|null Id of the inserted row, or null when none was reported
     */
    public function addChannel(
        string $source,
        ?string $link,
        ?string $title,
        ?string $description,
        ?int $time = null
    ): ?int
    {
        $query = $this->_database->prepare(
            'INSERT INTO `channel` (`source`, `link`, `title`, `description`, `time`)
                            VALUES (:source, :link, :title, :description, :time)'
        );

        $query->execute(
            [
                ':source'      => $source,
                ':link'        => $link,
                ':title'       => $title,
                ':description' => $description,
                // Only a missing value falls back to "now"; an explicit 0 is kept
                ':time'        => $time ?? time()
            ]
        );

        $id = (int) $this->_database->lastInsertId();

        return $id > 0 ? $id : null;
    }

    /**
     * Check whether the channel already holds an item with exactly this guid.
     *
     * Uses `=` instead of `LIKE` so that `_` / `%` inside a guid are compared
     * literally and do not make unrelated items look like duplicates.
     */
    public function isChannelItemExist(
        int $channelId,
        string $guid
    ): bool
    {
        $query = $this->_database->prepare(
            'SELECT 1 FROM `channelItem` WHERE `channelId` = :channelId AND `guid` = :guid LIMIT 1'
        );

        $query->execute(
            [
                ':channelId' => $channelId,
                ':guid'      => $guid
            ]
        );

        return false !== $query->fetch();
    }

    /**
     * Register a new item for the given channel.
     *
     * @param int|null $pubTime Unix timestamp taken from the feed, if any
     * @param int|null $time    Unix timestamp of the record; current time when null
     *
     * @return int|null Id of the inserted row, or null when none was reported
     */
    public function addChannelItem(
        int $channelId,
        string $guid,
        ?string $link,
        ?string $title,
        ?string $description,
        ?string $content,
        ?int $pubTime,
        ?int $time = null
    ): ?int
    {
        $query = $this->_database->prepare(
            'INSERT INTO `channelItem` (`channelId`, `guid`, `link`, `title`, `description`, `content`, `pubTime`, `time`)
                                VALUES (:channelId, :guid, :link, :title, :description, :content, :pubTime, :time)'
        );

        $query->execute(
            [
                ':channelId'   => $channelId,
                ':guid'        => $guid,
                ':link'        => $link,
                ':title'       => $title,
                ':description' => $description,
                ':content'     => $content,
                ':pubTime'     => $pubTime,
                // Only a missing value falls back to "now"; an explicit 0 is kept
                ':time'        => $time ?? time()
            ]
        );

        $id = (int) $this->_database->lastInsertId();

        return $id > 0 ? $id : null;
    }
}

69
src/Model/Filter.php

@ -0,0 +1,69 @@ @@ -0,0 +1,69 @@
<?php
declare(strict_types=1);
namespace Yggverse\Pulsar\Model;
/**
 * Stateless text normalizers for values taken from remote feeds.
 *
 * All methods are static and return a trimmed string.
 */
class Filter
{
    /**
     * URL-decode the value and trim surrounding whitespace.
     */
    public static function url(
        string $value
    ): string
    {
        return trim(
            urldecode(
                $value
            )
        );
    }

    /**
     * Produce a single-line title: apply text(), then turn every run of
     * line breaks into one space and collapse repeated whitespace.
     */
    public static function title(
        string $value
    ): string
    {
        return trim(
            preg_replace(
                [
                    // `+`, not `*`: a pattern that can match the empty string
                    // would insert a space between every character
                    '/[\n\r]+/',
                    '/[\s]{2,}/',
                ],
                ' ',
                // static context: there is no $this, call through self::
                self::text(
                    $value
                )
            )
        );
    }

    /**
     * Normalize a description; currently identical to text().
     */
    public static function description(
        string $value
    ): string
    {
        return self::text(
            $value
        );
    }

    /**
     * Decode HTML entities, strip tags, replace runs of two or more line
     * breaks with a single PHP_EOL and runs of two or more whitespace
     * characters with a single space, then trim the result.
     */
    public static function text(
        string $value
    ): string
    {
        return trim(
            preg_replace(
                [
                    '/[\n\r]{2,}/',
                    '/[\s]{2,}/',
                ],
                [
                    PHP_EOL,
                    ' '
                ],
                strip_tags(
                    html_entity_decode(
                        $value
                    )
                )
            )
        );
    }
}

422
src/crawler.php

@ -7,126 +7,354 @@ $semaphore = sem_get( @@ -7,126 +7,354 @@ $semaphore = sem_get(
), 1
);
if (false === sem_acquire($semaphore, true))
{
exit;
}
if (false === sem_acquire($semaphore, true)) exit;
// Load dependencies
require_once __DIR__ .
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'vendor' .
DIRECTORY_SEPARATOR . 'autoload.php';
// Init profile argument
if (empty($argv[1])) throw new \Exception();
// Init config
$config = json_decode(
file_get_contents(
__DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'config.json'
str_starts_with(
$argv[1],
DIRECTORY_SEPARATOR
) ? $argv[1] // absolute
: __DIR__ . // relative
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'config'.
DIRECTORY_SEPARATOR . $argv[1]
)
); if (!$config) throw new \Exception();
// Init database
$database = new \Yggverse\Pulsar\Model\Database(
str_starts_with(
$config->database->location,
DIRECTORY_SEPARATOR
) ? $config->database->location
: __DIR__ .
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'config'.
DIRECTORY_SEPARATOR . $config->database->location,
$config->database->username,
$config->database->password
);
// Update feeds
foreach ($config->feed as $feed)
// Begin channels crawl
foreach ($config->crawler->channel as $channel)
{
// Init feed location
$filename = str_starts_with(
$feed->target,
DIRECTORY_SEPARATOR
) ? $feed->target : __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $feed->target;
// Init destination storage
@mkdir(
dirname(
$filename
),
0755,
true
);
// Get feed data
if (!$channel = simplexml_load_file($feed->source)->channel)
// Check channel enabled
if (!$channel->enabled)
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] skip disabled channel "%s"') . PHP_EOL,
date('c'),
$channel->source
) . PHP_EOL;
}
continue;
}
// Update title
if (!empty($channel->title))
// Get channel data
if (!$remoteChannel = simplexml_load_file($channel->source)->channel)
{
$title = trim(
strip_tags(
html_entity_decode(
$channel->title
)
)
);
}
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] channel "%s" not accessible') . PHP_EOL,
date('c'),
$channel->source
) . PHP_EOL;
}
else
{
$title = parse_url(
$feed->source,
PHP_URL_HOST
);
continue;
}
file_put_contents(
$filename,
sprintf(
'# %s',
$title
) . PHP_EOL
);
// Append description
if (!empty($channel->description))
// Init channel
if (!$channelId = $database->getChannelIdBySource($channel->source))
{
file_put_contents(
$filename,
PHP_EOL . trim(
strip_tags(
html_entity_decode(
$channel->description
)
)
) . PHP_EOL,
FILE_APPEND | LOCK_EX
// Create new one if not exists
$channelId = $database->addChannel(
$channel->source,
isset($remoteChannel->link) ? (string) $remoteChannel->link : null,
isset($remoteChannel->title) ? (string) $remoteChannel->title : null,
isset($remoteChannel->description) ? (string) $remoteChannel->description : null
);
if ($channel->debug->info)
{
printf(
_('[%s] [info] channel "%s" registered as #%d') . PHP_EOL,
date('c'),
$channel->source,
$channelId
) . PHP_EOL;
}
}
// Append items
$i = 1; foreach ($channel->item as $item)
// Process items
if (!empty($remoteChannel->item))
{
// Apply items limit
if ($i > $feed->item->limit)
foreach ($remoteChannel->item as $remoteChannelItem)
{
break;
}
// Prepare link
$link = null;
// Format item
file_put_contents(
$filename,
PHP_EOL . trim(
preg_replace(
'/[\s]{3,}/ui',
PHP_EOL . PHP_EOL,
str_replace(
[
'{nl}',
'{link}',
'{guid}',
'{pubDate}',
'{title}',
'{description}'
],
[
PHP_EOL,
!empty($item->link) ? trim($item->link) : '',
!empty($item->guid) ? trim($item->guid) : '',
!empty($item->pubDate) ? trim($item->pubDate) : '',
!empty($item->title) ? trim(strip_tags(html_entity_decode($item->title))) : '',
!empty($item->description) ? trim(strip_tags(html_entity_decode($item->description))) : ''
],
$feed->item->template
) . PHP_EOL
)
) . PHP_EOL,
FILE_APPEND | LOCK_EX
);
if ($channel->item->link->enabled)
{
if (isset($remoteChannelItem->link))
{
$link = (string) $remoteChannelItem->link;
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item link enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->link->required && !$link)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item link for channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare guid or define it from link
$guid = null;
$i++;
if (isset($remoteChannelItem->guid))
{
$guid = (string) $remoteChannelItem->guid;
}
else
{
$guid = $link;
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] item guid defined as link in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
// Prepare title
$title = null;
if ($channel->item->title->enabled)
{
if (isset($remoteChannelItem->title))
{
$title = (string) $remoteChannelItem->title;
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item title enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->title->required && !$title)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item title in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare description
$description = null;
if ($channel->item->description->enabled)
{
if (isset($remoteChannelItem->description))
{
$description = (string) $remoteChannelItem->description;
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item description enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->description->required && !$description)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item description in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare content
$content = null;
if ($channel->item->content->enabled)
{
if ($_content = $remoteChannelItem->children('content', true))
{
if (isset($_content->encoded))
{
$content = (string) $_content->encoded;
}
}
if (!$content && $channel->debug->info)
{
printf(
_('[%s] [info] item content enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
if ($channel->item->content->required && !$content)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item content in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare pubDate
$pubTime = null;
if ($channel->item->pubDate->enabled)
{
if (isset($remoteChannelItem->pubDate))
{
if ($_pubTime = strtotime((string) $remoteChannelItem->pubDate))
{
$pubTime = $_pubTime;
}
else
{
if ($channel->debug->warning)
{
printf(
_('[%s] [info] could not convert item pubDate to pubTime in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item pubDate enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->pubDate->required && !$pubTime)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item pubDate in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// An item is identified by its guid, with the link as fallback. When the feed
// item has no <guid> and no link was collected, $guid is still null here;
// passing it on would fail the string type of isChannelItemExist() and abort
// the whole crawl, so such items are reported and skipped instead.
// An empty guid is skipped as well: it cannot tell items apart.
if (null === $guid || '' === $guid)
{
    if ($channel->debug->warning)
    {
        printf(
            _('[%s] [warning] could not get item guid or link in channel #%d') . PHP_EOL,
            date('c'),
            $channelId
        );
    }
}

// Check item not registered yet
else if (!$database->isChannelItemExist($channelId, $guid))
{
    // Create new one if not exists
    $channelItemId = $database->addChannelItem(
        $channelId,
        $guid,
        $link,
        $title,
        $description,
        $content,
        $pubTime
    );

    if ($channelItemId && $channel->debug->info)
    {
        // The format string already ends with PHP_EOL; nothing is appended
        // to printf()'s return value
        printf(
            _('[%s] [info] registered new item #%d for channel #%d') . PHP_EOL,
            date('c'),
            $channelItemId,
            $channelId
        );
    }
}
}
}
}
Loading…
Cancel
Save