mirror of https://github.com/YGGverse/YGGo.git
Topics: php, yggdrasil, mysql, crawler, js-less, alt-web, sphinx, spider, distributed, web, search-engine, open-source, sphinxsearch, federative, web-archive, pdo, curl, parser, fts5, privacy-oriented
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
2.2 KiB
65 lines
2.2 KiB
# Base source holding the MySQL connection settings.
# Other sources reuse it through inheritance (`source <name> : common`).
# The credential fields are intentionally blank in this template and must be
# filled in before running the indexer.
source common
{
  type     = mysql

  sql_host = localhost
  sql_user =
  sql_pass =
  sql_db   =
  sql_port = 3306 # optional, default is 3306
}

# Document source for the page index: one document per `hostPage` row joined
# with its `host` row. Connection settings are inherited from `common`.
#
# Columns produced by sql_query:
#   hostPageId       - document id (first column)
#   uri              - page URI as stored
#   REPLACE(...) x4  - the same URI with '.', '-', '_' and '/' turned into
#                      spaces (presumably so URI segments become separate
#                      searchable words - NOTE(review): confirm intent)
#   rank             - declared below as an unsigned integer attribute
#   name             - host name
#   hostURL          - scheme://name[:port]
#   hostPageURL      - scheme://name[:port] followed by the URI
#   mime             - top-level MIME type only (text before the first '/'),
#                      declared below as a string attribute
#   pageDescription  - title, description and keywords of every matching
#                      `hostPageDescription` row, concatenated
#
# Only pages of hosts with status '1', HTTP code 200, no ban timestamp and a
# known MIME type are selected.
source hostPage : common
{
  # GROUP_CONCAT() output is truncated at group_concat_max_len (1024 bytes by
  # default), which would silently cut `pageDescription`. Raise the limit for
  # the indexing session before the main query runs.
  sql_query_pre = SET SESSION group_concat_max_len = 1048576

  # SUBSTRING_INDEX(mime, '/', 1) extracts the top-level type without a regex:
  # it also handles subtypes that start with a digit or contain '.', '+',
  # parameters such as "; charset=...", and returns the value unchanged when
  # there is no '/'.
  sql_query = \
    SELECT `hostPage`.`hostPageId`, \
           `hostPage`.`uri`, \
           REPLACE(`hostPage`.`uri`, '.', ' '), \
           REPLACE(`hostPage`.`uri`, '-', ' '), \
           REPLACE(`hostPage`.`uri`, '_', ' '), \
           REPLACE(`hostPage`.`uri`, '/', ' '), \
           `hostPage`.`rank`, \
           `host`.`name`, \
           IF (`host`.`port` IS NOT NULL, \
               CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`), \
               CONCAT(`host`.`scheme`, '://', `host`.`name`)) AS `hostURL`, \
           IF (`host`.`port` IS NOT NULL, \
               CONCAT(`host`.`scheme`, '://', `host`.`name`, ':', `host`.`port`, `hostPage`.`uri`), \
               CONCAT(`host`.`scheme`, '://', `host`.`name`, `hostPage`.`uri`)) AS `hostPageURL`, \
           SUBSTRING_INDEX(`hostPage`.`mime`, '/', 1) AS `mime`, \
           (SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
                                               `hostPageDescription`.`description`, \
                                               `hostPageDescription`.`keywords`)) \
              FROM `hostPageDescription` \
             WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
      FROM `hostPage` \
      JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
     WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL AND `hostPage`.`mime` IS NOT NULL

  # Columns stored as attributes instead of full-text fields.
  sql_attr_uint   = rank
  sql_attr_string = mime
}

# Full-text index built from the `hostPage` source.
# NOTE(review): no charset_table is set here; confirm that the effective
# character table covers the scripts targeted by the morphology list below
# (Czech, Arabic, German), otherwise those processors may see few tokens.
index hostPage
{
  source     = hostPage
  # Morphology processors applied to indexed words; the lemmatizers need the
  # dictionaries referenced by `lemmatizer_base`.
  morphology = stem_cz, stem_ar, lemmatize_de_all, lemmatize_ru_all, lemmatize_en_all # stem_enru
  # Base path (without extension) of the index files on disk.
  path       = /var/lib/sphinxsearch/data/hostPage

  # Minimum indexed word length and minimum prefix length for prefix indexing.
  min_word_len   = 2
  min_prefix_len = 2

  # Strip HTML markup from incoming text before indexing.
  html_strip = 1

  # Index original word forms alongside the morphology-processed ones.
  index_exact_words = 1
}

# Settings for the `indexer` tool.
indexer
{
  # Memory limit for indexing.
  mem_limit        = 256M
  # Size of the lemmatizer cache.
  lemmatizer_cache = 256M
}

# Settings shared by all Sphinx programs.
common
{
  # Directory holding the lemmatizer dictionaries used by the lemmatize_*
  # morphology processors.
  lemmatizer_base = /var/lib/sphinxsearch/dicts # http://sphinxsearch.com/downloads/dicts
}