Browse Source

init sqlite implementation

main
yggverse 6 months ago
parent
commit
13ebc7bcbc
  1. 5
      .gitignore
  2. 30
      README.md
  3. 4
      composer.json
  4. 88
      config/example.json
  5. 23
      example/config.json
  6. 171
      src/Model/Database.php
  7. 69
      src/Model/Filter.php
  8. 422
      src/crawler.php

5
.gitignore vendored

@ -1,4 +1,7 @@
/composer.lock /composer.lock
/config.json
/config/*
!/config/example.json
/server/ /server/
/vendor/ /vendor/

30
README.md

@ -2,32 +2,18 @@
RSS Aggregator for [Gemini Protocol](https://geminiprotocol.net) RSS Aggregator for [Gemini Protocol](https://geminiprotocol.net)
Simple RSS feed converter to static Gemtext format, useful for news portals or localhost reading ## Components
* [x] `src/crawler.php` - scan configured RSS feeds and dump results to SQLite (see also [FS branch](https://github.com/YGGverse/Pulsar/tree/fs))
* [ ] `src/nex.php` - Built-in server for [NEX Protocol](https://nightfall.city/nps/info/specification.txt)
* [ ] `src/gemini.php` - Built-in server for [Gemini Protocol](https://geminiprotocol.net)
## Example ## Example
* `nex://[301:23b4:991a:634d::feed]/index.gmi` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse * `nex://[301:23b4:991a:634d::feed]` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse
## Usage ## Usage
1. `git clone https://github.com/YGGverse/Pulsar.git` 1. `git clone https://github.com/YGGverse/Pulsar.git`
2. `cp example/config.json config.json` - setup your feed locations 2. `cp config/example.json name.json` - setup your feed
3. `php src/crawler.php` - grab feeds manually or using crontab 3. `php src/crawler.php name.json` - grab feeds manually or using crontab
## Config
Configuration file supports multiple feed channels with custom settings:
* `source` - string, filepath or URL to the valid RSS feed
* `target` - string, relative or absolute path to Gemtext dumps
* `item`
* `limit` - integer, how many items to display on page generated
* `template` - string, custom pattern for feed item, that supports following macros
* `{nl}` - new line separator
* `{link}` - item link
* `{guid}` - item guid
* `{pubDate}` - item pubDate, soon with custom time format e.g. `{pubDate:Y-m-d H:s}`
* `{title}` - item title
* `{description}` - item description
Resulting files could be placed to any local folder (for personal reading) or shared with others (using [gmid](https://github.com/omar-polo/gmid), [twins](https://code.rocket9labs.com/tslocum/twins) or any other [server](https://github.com/kr1sp1n/awesome-gemini#servers) for `gemtext` statics)

4
composer.json

@ -15,7 +15,5 @@
"name": "YGGverse" "name": "YGGverse"
} }
], ],
"require": { "require": {}
"yggverse/titan-ii": "^1.0"
}
} }

88
config/example.json

@ -0,0 +1,88 @@
{
"database":
{
"location":"example.sqlite",
"username":null,
"password":null
},
"crawler":
{
"channel":
[
{
"source":"https://www.omglinux.com/feed",
"enabled":true,
"item":
{
"link":
{
"enabled":true,
"required":false
},
"pubDate":
{
"enabled":true,
"required":false
},
"title":
{
"enabled":true,
"required":false
},
"description":
{
"enabled":true,
"required":false
},
"content":{
"enabled":false,
"required":false
}
},
"debug":
{
"info":true,
"warning":true,
"error":true
}
},
{
"source":"https://omgubuntu.co.uk/feed",
"enabled":false,
"item":
{
"link":
{
"enabled":true,
"required":false
},
"pubDate":
{
"enabled":true,
"required":false
},
"title":
{
"enabled":true,
"required":false
},
"description":
{
"enabled":true,
"required":false
},
"content":{
"enabled":false,
"required":false
}
},
"debug":
{
"info":true,
"warning":true,
"error":true
}
}
]
}
}

23
example/config.json

@ -1,23 +0,0 @@
{
"feed":
[
{
"source":"https://www.omglinux.com/feed",
"target":"server/127.0.0.1/public/omglinux/feed.gmi",
"item":
{
"template":"=> {link} {title}{nl}{nl}{description}",
"limit":20
}
},
{
"source":"https://omgubuntu.co.uk/feed",
"target":"server/127.0.0.1/public/omgubuntu/feed.gmi",
"item":
{
"template":"=> {link} {title}{nl}{nl}{description}",
"limit":20
}
}
]
}

171
src/Model/Database.php

@ -0,0 +1,171 @@
<?php
declare(strict_types=1);
namespace Yggverse\Pulsar\Model;
/**
 * Thin PDO/SQLite storage layer for crawled channels and their items.
 *
 * The constructor opens the SQLite file and creates the "channel" and
 * "channelItem" tables when they do not exist yet.
 */
class Database
{
    /**
     * Underlying PDO connection (kept public for backward compatibility).
     */
    public \PDO $_database;

    /**
     * Open the database and make sure the schema exists.
     *
     * @param string      $database Path passed to the "sqlite:" DSN
     * @param string|null $username Forwarded to the PDO constructor
     * @param string|null $password Forwarded to the PDO constructor
     *
     * @throws \PDOException When the connection or schema creation fails
     */
    public function __construct(
        string $database,
        ?string $username = null,
        ?string $password = null
    ) {
        $this->_database = new \PDO(
            sprintf(
                'sqlite:%s',
                $database
            ),
            $username,
            $password
        );

        // Fail loudly: every SQL error becomes a \PDOException
        $this->_database->setAttribute(
            \PDO::ATTR_ERRMODE,
            \PDO::ERRMODE_EXCEPTION
        );

        // Rows are fetched as stdClass objects ($row->column)
        $this->_database->setAttribute(
            \PDO::ATTR_DEFAULT_FETCH_MODE,
            \PDO::FETCH_OBJ
        );

        // DDL returns no result set, so exec() is used instead of query()
        $this->_database->exec('
            CREATE TABLE IF NOT EXISTS "channel"
            (
                "id"          INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                "time"        INTEGER NOT NULL,
                "source"      TEXT NOT NULL,
                "link"        TEXT,
                "title"       TEXT,
                "description" TEXT
            )
        ');

        $this->_database->exec('
            CREATE TABLE IF NOT EXISTS "channelItem"
            (
                "id"          INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                "channelId"   INTEGER NOT NULL,
                "time"        INTEGER NOT NULL,
                "pubTime"     INTEGER,
                "guid"        TEXT NOT NULL,
                "link"        TEXT,
                "title"       TEXT,
                "description" TEXT,
                "content"     TEXT
            )
        ');
    }

    /**
     * Find the channel registered for the given source.
     *
     * The comparison is exact (`=`): with LIKE, the `_` and `%` characters
     * that are common in URLs would act as wildcards and could return the
     * ID of a different channel.
     *
     * @param string $source Feed source exactly as stored by addChannel()
     *
     * @return int|null Channel ID, or null when no row matches
     */
    public function getChannelIdBySource(
        string $source
    ): ?int
    {
        $query = $this->_database->prepare(
            'SELECT "id" FROM "channel" WHERE "source" = :source LIMIT 1'
        );

        $query->execute(
            [
                ':source' => $source
            ]
        );

        if ($result = $query->fetch())
        {
            // Explicit cast: some PDO configurations return integer columns
            // as strings, which would break the ?int return type in strict mode
            return (int) $result->id;
        }

        return null;
    }

    /**
     * Insert a new channel row.
     *
     * @param string      $source      Feed source
     * @param string|null $link        Channel link
     * @param string|null $title       Channel title
     * @param string|null $description Channel description
     * @param int|null    $time        Record time; current time() when empty
     *
     * @return int|null ID of the inserted row, or null when the driver
     *                  reports no insert ID
     */
    public function addChannel(
        string $source,
        ?string $link,
        ?string $title,
        ?string $description,
        ?int $time = null
    ): ?int
    {
        $query = $this->_database->prepare(
            'INSERT INTO "channel" ("source", "link", "title", "description", "time")
                          VALUES (:source, :link, :title, :description, :time)'
        );

        $query->execute(
            [
                ':source'      => $source,
                ':link'        => $link,
                ':title'       => $title,
                ':description' => $description,
                ':time'        => $time ?: time()
            ]
        );

        if ($id = $this->_database->lastInsertId())
        {
            return (int) $id;
        }

        return null;
    }

    /**
     * Check whether an item with this guid is already stored for the channel.
     *
     * The guid is compared exactly (`=`); LIKE would interpret `_` / `%`
     * inside the guid as wildcards and could wrongly report a new item as
     * already existing.
     *
     * @param int    $channelId Owning channel ID
     * @param string $guid      Item guid
     *
     * @return bool True when a matching row exists
     */
    public function isChannelItemExist(
        int $channelId,
        string $guid
    ): bool
    {
        $query = $this->_database->prepare(
            'SELECT NULL FROM "channelItem" WHERE "channelId" = :channelId AND "guid" = :guid LIMIT 1'
        );

        $query->execute(
            [
                ':channelId' => $channelId,
                ':guid'      => $guid
            ]
        );

        return (bool) $query->fetch();
    }

    /**
     * Insert a new channel item row.
     *
     * @param int         $channelId   Owning channel ID
     * @param string      $guid        Item guid
     * @param string|null $link        Item link
     * @param string|null $title       Item title
     * @param string|null $description Item description
     * @param string|null $content     Item content
     * @param int|null    $pubTime     Publication time
     * @param int|null    $time        Record time; current time() when empty
     *
     * @return int|null ID of the inserted row, or null when the driver
     *                  reports no insert ID
     */
    public function addChannelItem(
        int $channelId,
        string $guid,
        ?string $link,
        ?string $title,
        ?string $description,
        ?string $content,
        ?int $pubTime,
        ?int $time = null
    ): ?int
    {
        $query = $this->_database->prepare(
            'INSERT INTO "channelItem" ("channelId", "guid", "link", "title", "description", "content", "pubTime", "time")
                              VALUES (:channelId, :guid, :link, :title, :description, :content, :pubTime, :time)'
        );

        $query->execute(
            [
                ':channelId'   => $channelId,
                ':guid'        => $guid,
                ':link'        => $link,
                ':title'       => $title,
                ':description' => $description,
                ':content'     => $content,
                ':pubTime'     => $pubTime,
                ':time'        => $time ?: time()
            ]
        );

        if ($id = $this->_database->lastInsertId())
        {
            return (int) $id;
        }

        return null;
    }
}

69
src/Model/Filter.php

@ -0,0 +1,69 @@
<?php
declare(strict_types=1);
namespace Yggverse\Pulsar\Model;
/**
 * Stateless string filters for feed values.
 *
 * All methods are static; helpers are reached through self:: because
 * $this is not available in a static context.
 */
class Filter
{
    /**
     * URL-decode the value and trim surrounding whitespace.
     *
     * @param string $value Raw URL
     *
     * @return string Decoded, trimmed URL
     */
    public static function url(
        string $value
    ): string
    {
        return trim(
            urldecode(
                $value
            )
        );
    }

    /**
     * Produce a single-line title: the value is passed through text(),
     * then line breaks become spaces and whitespace runs are collapsed.
     *
     * @param string $value Raw title
     *
     * @return string Filtered single-line title
     */
    public static function title(
        string $value
    ): string
    {
        return trim(
            preg_replace(
                [
                    // "+" (not "*"): a pattern that can match the empty string
                    // would insert a space between every character
                    '/[\n\r]+/',
                    '/[\s]{2,}/',
                ],
                ' ',
                self::text(
                    $value
                )
            )
        );
    }

    /**
     * Filter a description; currently the same as text().
     *
     * @param string $value Raw description
     *
     * @return string Filtered description
     */
    public static function description(
        string $value
    ): string
    {
        return self::text(
            $value
        );
    }

    /**
     * Convert markup to plain text: decode HTML entities, strip tags,
     * replace runs of 2+ line breaks with PHP_EOL, replace runs of 2+
     * whitespace characters with a single space, then trim.
     *
     * @param string $value Raw text, possibly containing HTML
     *
     * @return string Plain text
     */
    public static function text(
        string $value
    ): string
    {
        return trim(
            preg_replace(
                [
                    '/[\n\r]{2,}/',
                    '/[\s]{2,}/',
                ],
                [
                    PHP_EOL,
                    ' '
                ],
                strip_tags(
                    html_entity_decode(
                        $value
                    )
                )
            )
        );
    }
}

422
src/crawler.php

@ -7,126 +7,354 @@ $semaphore = sem_get(
), 1 ), 1
); );
if (false === sem_acquire($semaphore, true)) if (false === sem_acquire($semaphore, true)) exit;
{
exit; // Load dependencies
} require_once __DIR__ .
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'vendor' .
DIRECTORY_SEPARATOR . 'autoload.php';
// Init profile argument
if (empty($argv[1])) throw new \Exception();
// Init config // Init config
$config = json_decode( $config = json_decode(
file_get_contents( file_get_contents(
__DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'config.json' str_starts_with(
$argv[1],
DIRECTORY_SEPARATOR
) ? $argv[1] // absolute
: __DIR__ . // relative
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'config'.
DIRECTORY_SEPARATOR . $argv[1]
) )
); if (!$config) throw new \Exception();
// Init database
$database = new \Yggverse\Pulsar\Model\Database(
str_starts_with(
$config->database->location,
DIRECTORY_SEPARATOR
) ? $config->database->location
: __DIR__ .
DIRECTORY_SEPARATOR . '..'.
DIRECTORY_SEPARATOR . 'config'.
DIRECTORY_SEPARATOR . $config->database->location,
$config->database->username,
$config->database->password
); );
// Update feeds // Begin channels crawl
foreach ($config->feed as $feed) foreach ($config->crawler->channel as $channel)
{ {
// Init feed location // Check channel enabled
$filename = str_starts_with( if (!$channel->enabled)
$feed->target,
DIRECTORY_SEPARATOR
) ? $feed->target : __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $feed->target;
// Init destination storage
@mkdir(
dirname(
$filename
),
0755,
true
);
// Get feed data
if (!$channel = simplexml_load_file($feed->source)->channel)
{ {
if ($channel->debug->info)
{
printf(
_('[%s] [info] skip disabled channel "%s"') . PHP_EOL,
date('c'),
$channel->source
) . PHP_EOL;
}
continue; continue;
} }
// Update title // Get channel data
if (!empty($channel->title)) if (!$remoteChannel = simplexml_load_file($channel->source)->channel)
{ {
$title = trim( if ($channel->debug->warning)
strip_tags( {
html_entity_decode( printf(
$channel->title _('[%s] [warning] channel "%s" not accessible') . PHP_EOL,
) date('c'),
) $channel->source
); ) . PHP_EOL;
} }
else continue;
{
$title = parse_url(
$feed->source,
PHP_URL_HOST
);
} }
file_put_contents( // Init channel
$filename, if (!$channelId = $database->getChannelIdBySource($channel->source))
sprintf(
'# %s',
$title
) . PHP_EOL
);
// Append description
if (!empty($channel->description))
{ {
file_put_contents( // Create new one if not exists
$filename, $channelId = $database->addChannel(
PHP_EOL . trim( $channel->source,
strip_tags( isset($remoteChannel->link) ? (string) $remoteChannel->link : null,
html_entity_decode( isset($remoteChannel->title) ? (string) $remoteChannel->title : null,
$channel->description isset($remoteChannel->description) ? (string) $remoteChannel->description : null
)
)
) . PHP_EOL,
FILE_APPEND | LOCK_EX
); );
if ($channel->debug->info)
{
printf(
_('[%s] [info] channel "%s" registered as #%d') . PHP_EOL,
date('c'),
$channel->source,
$channelId
) . PHP_EOL;
}
} }
// Append items // Process items
$i = 1; foreach ($channel->item as $item) if (!empty($remoteChannel->item))
{ {
// Apply items limit foreach ($remoteChannel->item as $remoteChannelItem)
if ($i > $feed->item->limit)
{ {
break; // Prepare link
} $link = null;
// Format item if ($channel->item->link->enabled)
file_put_contents( {
$filename, if (isset($remoteChannelItem->link))
PHP_EOL . trim( {
preg_replace( $link = (string) $remoteChannelItem->link;
'/[\s]{3,}/ui', }
PHP_EOL . PHP_EOL,
str_replace( else
[ {
'{nl}', if ($channel->debug->info)
'{link}', {
'{guid}', printf(
'{pubDate}', _('[%s] [info] item link enabled but not defined in channel #%d') . PHP_EOL,
'{title}', date('c'),
'{description}' $channelId
], ) . PHP_EOL;
[ }
PHP_EOL, }
!empty($item->link) ? trim($item->link) : '',
!empty($item->guid) ? trim($item->guid) : '', if ($channel->item->link->required && !$link)
!empty($item->pubDate) ? trim($item->pubDate) : '', {
!empty($item->title) ? trim(strip_tags(html_entity_decode($item->title))) : '', if ($channel->debug->warning)
!empty($item->description) ? trim(strip_tags(html_entity_decode($item->description))) : '' {
], printf(
$feed->item->template _('[%s] [warning] could not get item link for channel #%d') . PHP_EOL,
) . PHP_EOL date('c'),
) $channelId
) . PHP_EOL, ) . PHP_EOL;
FILE_APPEND | LOCK_EX }
);
continue;
}
}
// Prepare guid or define it from link
$guid = null;
$i++; if (isset($remoteChannelItem->guid))
{
$guid = (string) $remoteChannelItem->guid;
}
else
{
$guid = $link;
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] item guid defined as link in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
// Prepare title
$title = null;
if ($channel->item->title->enabled)
{
if (isset($remoteChannelItem->title))
{
$title = (string) $remoteChannelItem->title;
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item title enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->title->required && !$title)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item title in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare description
$description = null;
if ($channel->item->description->enabled)
{
if (isset($remoteChannelItem->description))
{
$description = (string) $remoteChannelItem->description;
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item description enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->description->required && !$description)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item description in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare content
$content = null;
if ($channel->item->content->enabled)
{
if ($_content = $remoteChannelItem->children('content', true))
{
if (isset($_content->encoded))
{
$content = (string) $_content->encoded;
}
}
if (!$content && $channel->debug->info)
{
printf(
_('[%s] [info] item content enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
if ($channel->item->content->required && !$content)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item content in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Prepare pubDate
$pubTime = null;
if ($channel->item->pubDate->enabled)
{
if (isset($remoteChannelItem->pubDate))
{
if ($_pubTime = strtotime((string) $remoteChannelItem->pubDate))
{
$pubTime = $_pubTime;
}
else
{
if ($channel->debug->warning)
{
printf(
_('[%s] [info] could not convert item pubDate to pubTime in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
}
else
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] item pubDate enabled but not defined in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
}
if ($channel->item->pubDate->required && !$pubTime)
{
if ($channel->debug->warning)
{
printf(
_('[%s] [warning] could not get item pubDate in channel #%d') . PHP_EOL,
date('c'),
$channelId
) . PHP_EOL;
}
continue;
}
}
// Check item not registered yet
if (!$database->isChannelItemExist($channelId, $guid))
{
// Create new one if not exists
$channelItemId = $database->addChannelItem(
$channelId,
$guid,
$link,
$title,
$description,
$content,
$pubTime
);
if ($channelItemId)
{
if ($channel->debug->info)
{
printf(
_('[%s] [info] registered new item #%d for channel #%d') . PHP_EOL,
date('c'),
$channelItemId,
$channelId
) . PHP_EOL;
}
}
}
}
} }
} }
Loading…
Cancel
Save