From 2495a2bbc79ff4cd5eb62acafe389bd825f0e566 Mon Sep 17 00:00:00 2001 From: ghost Date: Fri, 7 Apr 2023 04:04:24 +0300 Subject: [PATCH] implement MySQL/Sphinx data model #3, add basical robots.txt support #2 --- .gitignore | 3 + README.md | 15 +++- config/app.php.txt | 16 +++- config/sphinx.conf.txt | 22 +++++ crontab/crawler.php | 182 +++++++++++++++++++++++++------------- database/yggo.mwb | Bin 0 -> 9355 bytes library/filter.php | 11 +-- library/mysql.php | 196 +++++++++++++++++++++++++++++++++++++++++ library/parser.php | 73 +++++++++++++++ library/robots.php | 14 +-- library/sphinxql.php | 31 +++++++ library/sqlite.php | 170 ----------------------------------- public/index.php | 4 +- public/search.php | 102 +++++++++++++++++---- 14 files changed, 561 insertions(+), 278 deletions(-) create mode 100644 config/sphinx.conf.txt create mode 100644 database/yggo.mwb create mode 100644 library/mysql.php create mode 100644 library/parser.php create mode 100644 library/sphinxql.php delete mode 100644 library/sqlite.php diff --git a/.gitignore b/.gitignore index b3e2ff2..3d95099 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,8 @@ .ftpignore config/app.php +config/sphinx.conf + +database/yggo.mwb.bak storage diff --git a/README.md b/README.md index f8dc048..791185a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The project goal - simple interface, clear architecture and lightweight server r #### Online examples -* [http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo](http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo) +* [http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo](http://[201:23b4:991a:634d:8359:4521:5576:15b7]/yggo) * [http://94.140.114.241/yggo/](http://94.140.114.241/yggo) #### Screenshots @@ -28,7 +28,8 @@ php-dom php-pdo php-curl php-gd -sqlite / fts5 +php-mysql +sphinx search server ``` #### Installation @@ -39,12 +40,16 @@ sqlite / fts5 * Set up the `/crontab/crawler.php` script for execution every the minute, but it mostly related of 
the configs and targetal network volume, there is no debug implemented yet, so let's silentize it by `/dev/null` * Script has no MVC model, because of super simple. It's is just 2 files, and everything else stored incapsulated in `/library` classes. +#### Configuration + +todo + #### Roadmap / ideas * [x] Web pages full text ranking search * [x] Make search results pagination * [ ] Blacklist domains (useful for some mirrors) -* [ ] Add robots.txt support (Issue #2) +* [x] Add robots.txt support (Issue #2) * [ ] Improve yggdrasil links detection, add .ygg domain zone support * [ ] Make page description visible - based on the cached content dump, when website description tag not available, add condition highlights * [ ] Images search (basically implemented but requires testing and some performance optimization) @@ -57,7 +62,7 @@ sqlite / fts5 * [x] An idea to make unique gravatars for sites without favicons, because simpler to ident, comparing to ipv6 * [ ] An idea to make some visitors counters, like in good old times? 
-#### Contributions +#### Contributions Please make a new master branch for each patch in your fork before create PR @@ -66,6 +71,8 @@ git checkout master git checkout -b my-pr-branch-name ``` +See also: [SQLite tree](https://github.com/YGGverse/YGGo/tree/sqliteway) + #### Donate to contributors * @d47081: [BTC](https://www.blockchain.com/explorer/addresses/btc/bc1qngdf2kwty6djjqpk0ynkpq9wmlrmtm7e0c534y) | [DOGE](https://dogechain.info/address/D5Sez493ibLqTpyB3xwQUspZvJ1cxEdRNQ) diff --git a/config/app.php.txt b/config/app.php.txt index 9880af4..169c388 100644 --- a/config/app.php.txt +++ b/config/app.php.txt @@ -11,16 +11,24 @@ define('WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT', 100); define('WEBSITE_IDENTICON_IMAGE_CACHE', true); // Database -define('DB_NAME', 'database.sqlite'); +define('DB_HOST', 'localhost'); +define('DB_PORT', 3306); +define('DB_NAME', ''); define('DB_USERNAME', ''); define('DB_PASSWORD', ''); -// Crawl settings -define('CRAWL_IMAGE', false); // @TODO +// Sphinx +define('SPHINX_HOST', '127.0.0.1'); +define('SPHINX_PORT', 9306); +// Crawl settings define('CRAWL_PAGE_LIMIT', 10); define('CRAWL_PAGE_SECONDS_OFFSET', 3600); define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui' -define('CRAWL_META_ONLY', false); +define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000); +define('CRAWL_HOST_DEFAULT_STATUS', true); +define('CRAWL_HOST_DEFAULT_META_ONLY', false); + +define('CRAWL_ROBOTS_DEFAULT_RULES', ""); diff --git a/config/sphinx.conf.txt b/config/sphinx.conf.txt new file mode 100644 index 0000000..f0f12a9 --- /dev/null +++ b/config/sphinx.conf.txt @@ -0,0 +1,22 @@ +source hostPage +{ + type = mysql + + sql_host = localhost + sql_user = + sql_pass = + sql_db = + sql_port = 3306 # optional, default is 3306 + + sql_query = \ + SELECT hostPageId, metaTitle, metaDescription, metaKeywords, data, uri \ + FROM hostPage + + sql_attr_uint = hostPageId +} + +index hostPage +{ + source = hostPage + path = +} \ No newline at end of file diff 
--git a/crontab/crawler.php b/crontab/crawler.php index 59879c7..4c1aece 100644 --- a/crontab/crawler.php +++ b/crontab/crawler.php @@ -11,31 +11,46 @@ if (false === sem_acquire($semaphore, true)) { // Load system dependencies require_once('../config/app.php'); require_once('../library/curl.php'); +require_once('../library/robots.php'); require_once('../library/filter.php'); -require_once('../library/sqlite.php'); +require_once('../library/parser.php'); +require_once('../library/mysql.php'); + +// Debug +$timeStart = microtime(true); + +$hostPagesProcessed = 0; +$hostPagesIndexed = 0; +$hostPagesAdded = 0; +$hostsAdded = 0; // Connect database -$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); +$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); // Process crawl queue -foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queue) { +foreach ($db->getCrawlQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) as $queueHostPage) { + + // Build URL from the DB + $queueHostPageURL = $queueHostPage->scheme . '://' . $queueHostPage->name . ($queueHostPage->port ? ':' . $queueHostPage->port : false) . 
$queueHostPage->uri; - $url = new Curl($queue->url); + $curl = new Curl($queueHostPageURL); - $db->updatePageQueue($queue->pageId, time(), $url->getCode()); + // Update page index anyway, with the current time and http code + $hostPagesProcessed += $db->updateCrawlQueue($queueHostPage->hostPageId, time(), $curl->getCode()); - // Skip processing non 200 code - if (200 != $url->getCode()) { + // Skip next page processing non 200 code + if (200 != $curl->getCode()) { continue; } - // Skip processing pages without returned data - if (!$content = $url->getContent()) { + // Skip next page processing pages without returned data + if (!$content = $curl->getContent()) { continue; } + // Grab page content $dom = new DomDocument(); @$dom->loadHTML($content); @@ -62,48 +77,12 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) } } - // Index page data - $db->updatePage($queue->pageId, - Filter::pageTitle($title->item(0)->nodeValue), - Filter::pageDescription($description), - Filter::pageKeywords($keywords), - CRAWL_META_ONLY ? '' : Filter::pageData($content), - time()); - - // Update images - $db->deleteImages($queue->pageId); - - if (CRAWL_IMAGE) { - - foreach (@$dom->getElementsByTagName('img') as $image) { - - // Skip images without required attributes - if (!$src = @$image->getAttribute('src')) { - - continue; - } - - if (!$alt = @$image->getAttribute('alt')) { - - continue; - } - - // Add domain to the relative links - if (!parse_url($src, PHP_URL_HOST)) { - - $src = parse_url($queue->url, PHP_URL_SCHEME) . '://' . - parse_url($queue->url, PHP_URL_HOST) . - parse_url($queue->url, PHP_URL_PORT) . 
- $src; // @TODO sometimes wrong URL prefix available - } - - // Add page images - $db->addImage($queue->pageId, - Filter::url($src), - crc32($src), - Filter::imageAlt($alt)); - } - } + // Update queued page data + $hostPagesIndexed += $db->updateHostPage($queueHostPage->hostPageId, + Filter::pageTitle($title->item(0)->nodeValue), + Filter::pageDescription($description), + Filter::pageKeywords($keywords), + CRAWL_HOST_DEFAULT_META_ONLY ? null : Filter::pageData($content)); // Collect internal links from page content foreach(@$dom->getElementsByTagName('a') as $a) { @@ -120,22 +99,101 @@ foreach ($db->getPageQueue(CRAWL_PAGE_LIMIT, time() - CRAWL_PAGE_SECONDS_OFFSET) continue; } - // Add absolute prefixes to the relative links + // Add absolute URL prefixes to the relative links found if (!parse_url($href, PHP_URL_HOST)) { - $href = parse_url($queue->url, PHP_URL_SCHEME) . '://' . - parse_url($queue->url, PHP_URL_HOST) . - parse_url($queue->url, PHP_URL_PORT) . - $href; + $href = $queueHostPage->scheme . '://' . + $queueHostPage->name . + ($queueHostPage->port ? ':' . $queueHostPage->port : '') . + '/' . ltrim($href, '/'); } - // Filter href URL - $href = Filter::url($href); - - // Save valid internal links to the index queue + // Validate formatted link if (filter_var($href, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $href)) { - $db->initPage($href, crc32($href), time()); + $db->beginTransaction(); + + try { + + // Parse formatted link + $hostURL = Parser::hostURL($href); + $hostPageURI = Parser::uri($href); + + // Host exists + if ($host = $db->getHost(crc32($hostURL->string))) { + + $hostStatus = $host->status; + $hostPageLimit = $host->crawlPageLimit; + $hostId = $host->hostId; + $hostRobots = $host->robots; + + // Register new host + } else { + + // Get robots.txt if exists + $curl = new Curl($hostURL->string . 
'/robots.txt'); + + if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { + $hostRobots = $curl->getContent(); + } else { + $hostRobots = null; + } + + $hostStatus = CRAWL_HOST_DEFAULT_STATUS; + $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + $hostId = $db->addHost($hostURL->scheme, + $hostURL->name, + $hostURL->port, + crc32($hostURL->string), + time(), + null, + $hostPageLimit, + (string) CRAWL_HOST_DEFAULT_META_ONLY, + (string) $hostStatus, + $hostRobots); + + if ($hostId) { + + echo 'hostmane ' . $hostURL->string . PHP_EOL; + + $hostsAdded++; + + } else { + + continue; + } + } + + // Init robots parser + $robots = new Robots(!$hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES); + + // Save page info + if ($hostStatus && // host enabled + $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules + $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit + !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists + + if ($db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time())) { + + $hostPagesAdded++; + } + } + + $db->commit(); + + } catch(Exception $e){ + + var_dump($e); + + $db->rollBack(); + } } } -} \ No newline at end of file +} + +// Debug +echo 'Pages processed: ' . $hostPagesProcessed . PHP_EOL; +echo 'Pages indexed: ' . $hostPagesIndexed . PHP_EOL; +echo 'Pages added: ' . $hostPagesAdded . PHP_EOL; +echo 'Hosts added: ' . $hostsAdded . PHP_EOL; +echo 'Total time: ' . microtime(true) - $timeStart . 
PHP_EOL; diff --git a/database/yggo.mwb b/database/yggo.mwb new file mode 100644 index 0000000000000000000000000000000000000000..14fad6b51497020d325a5517885a78497b498f92 GIT binary patch literal 9355 zcmZ{K18^ovvvzFTPBt6cPTpW++qP|Iqm6Czjcwc3#@4J5y8LgRZHX zs(GgS=~0jdhrj><1Azg#CJI+uWYuzhCjtQp#Q+0A0RaIqu{UP4zCXv#qt>hk zpn>ehOBN~}N-o|QH-J8@#7IBOe|e=1dT%?VNLb$8cl+gCex2VC3z8}XbUCVh;FhdG zUzvX&+^ts-)bi>|(_G0H{q;Ua+tl~GhemA+SVZ$`hi)Q zRO)La!9Sh)!ZY#53G}A#&RQn^^WHWatiiL+zL~!{vAJWN$e3gjgUXJ!-mK|>&7IRy z5W~1>DhYY7Ek0i(YRpv*-=QK;z@cSMzDPa2lAcn5z~En5>UF!W}hmn`IG}^h&^&qc9Au` z?&SrQZYoxQ0&e%bZz!Jm(RO#C(Hr)t-92EimygbM>NWIBs)aTKNWQp1GWNvxOSkGj z>RUDRCZ0K)wI^!*VmnxXwpWg};W^hNTiQSR$>8CTsD8cr?*$vPSdG@|B=+s5^s$1l z(2r;$jKV!g6}&lug6Vdar_?y+#0rWt|4AxtI86-H*3-h(pQ;Z*Gr>ijk`bZ===qVR zSU$e$&^HEnTahdr*pZ0B`@rK3vofElqH2UesOpq9ko{QBpww=AsM5?hygeLiV1WCD z(M(ksyz?~~L8uWCudd&A5SJgL=;@kbDAtzvyDiS*bpWw#lsuIjAoxovgO6b|*ym4- zG@N=61}8}bHr)4<8XpqZQYxPx0&Y-33vW#*q;=62YcE1>A>}G}W#8hvPjN_o!pkUyzb2>KQNXIG0 zn;AsEhG+%{wUB_<_O8KTr)GZW9Zf@bxON+_Pmc_WGP2zG2z53}A~2>5Gv_e~&S=~A z3MUT<_@)h`Mn}>rgeee$IKr4Y-r|4iHey#BWgVtAaG71nG2$a}#~`*)`QQXQ>hXSX z^gQjZOw6s7PAnvHfQG7|9;R9{uLLlJp{O6dW$%0rQuJEbp^K3^p%ogWY-|K6u*8Ya znc(do$BKK#^WeURGi9rYbBp#-)Ce%Xy$Ykm{4}T}{k~nBIT=%R^O7d9-D0ViMQRww zCZ0;b6;#I`oB zHp^zFxtXn}84o2^JIcf#((iPohI*j@gSE>EsQvm3ej50?`po;f5gf?Zy0}%k{1u7R zA|1|^F54^{#vN~(;$AAkkp+rs7!Xfn)vF}NMM5@6nChZZ)JFL>00ffB0;;90wgQJeJW)9Mkim25V-UUE1fQZ_ndi*cUN+o)Yv$nNa+eFnW-}^?T${n;e{QQc!Iq znXAxWO7KC3>cIC_XCE;~p5l!~XD((ifQB29FtDh8;JAcs8&sliExhpcc?duM$k0Sz ze}BVA`t;1m@2P8=+$cz!Ka_d<^5K%sD53;!Dk@K5drf6KmEwT484Y8l5I|rXK(2*@ zh8_x62o3%$5`5qv5N4<6iL;!u%)s1g`vqw#85J|Sk_UTgWSo~~4>AeuO5kCamsVr{ zS0lzZ`>UBxs_%qXE9<8~S&|wu(&A4CoXmh4TBKoZO~BCu{K4}C!MQme7V~3?(DMz$ z!yp$C$}&JPxprZ$_Gyj685}^nDQ*j+3uCk5zzZZc9O&g2NMvd9P8?D03rHYRGO#aq zJE<*OQw}^LF_t00{I6F)lPi)Mk zAzD<&NA~SAzT9qWsu=VUgp_K3Af1gCkXb)N^tAWj^IUMNGR5hElMQQ@=N!f-=F!qz=5OmN^@sp9KcMr9$`i#EFeEFFrbEap$~P4p+R z{4Nfg(Gd6AfS2SWX?(gvC8X&QYt)b6);X97=L&T%Jh9)T)GENyANTnzNbn&Cl23tc 
zlCGTd*Tqn{j9OOygtjK@-q2bU>K`VFEZnP$5}8QJelJ&FF~^oz@sG$M>liP99VP0Z zb`u!p8Do|%_?oP}8CPAR!SPLZ&Cm(JrDEoXh2w$P1x{&M1{;3_WVNS^rAap*YtckG z%>HD6!erJEJxeNBak$N7Q`{MQkl%xH!Repy2rnKP&+yT{MbyP0gKsPL`uOt_26~Zi zkCTyMY4B^>`Ay?B?DGy$6Yc)8(SDR=KH0v^XF&ZPafU%&&j0!X*J_)ebc2L&pjRo@ zP>?`04Cx)tFaLTDk}q3u?p^J(zt{Dh%baVaX{Orx5c{e(i&PX4vX#SXf^#~~sN%O< z`5LOvIOF%})$Tf?Kw#A|@4%EqNpA$hI9s9l->71Kg}8sYNt~t0yavam$)scN1K`@~#Dg z^?Oumn)+s=j6L@}ryNiK+pG=AVJ;vX&DSH}V3ry& zROj3FYQl4@)XrS8ujJJAf*Ub=dr5a{B$$HZ2_1kW8&~{HykG++c8QF5>TCrHWQJhA z2yzN-$U^Al_yx2>CJ)5{ZJ&v=YntGGdxRUML#h3qsb>LMwvnd`NMN>8zvX$ztpCI0 znlD#S1wUxXQgT>>Jd;4d|4I_89Qzqt9}R=ed}<8-bCep_KM}Cc?6^)7f3&SJYQ!Fa zgV`WdcAcnAJa&=^^V>%>tB=hZQ9q+=49x&yp?nr7^E4Vvf5yZ%;8Uu3&1Notzpb4j zGy=nf^;2lI@K+hjCC^kpE4o<(c}$qI-E!uP zRXI!;rEl|Yb<@0vHl|Bn<^yOt5yp_)2r>G9&2#jl%>!mK~C{fItmyB*1^8V-sp>VM%+04|Ts5 z21oBb=F?m*B0W^WtG@LKL#o-dm7ez(y;pnXV7ip3)Njv)UN9?0>vKOM!|E9-3YI}{ z&sV+|V96m3ummAS7zdud2ioz_gb|{Q@BkwXy~pZeODos3hk`dVgCmW+vdCZ$_x?US zg8@1A5J2@38x;gD3@8^@Ow*?XP~8Fb(0H%mfDQ!?_3^9hmL^B&9VQ2#yc9-gfR93` z-zSAYk%%C`=VOqw53==Vq=v}N2m(6^8bUOR2n0JI2%JoKMsZG`QEJwC=}Hw4WJKW+ zO&`{h(yqsq z&BEIdTUwbqD5&6+t=8tMz^K}+a?(UA7sL|giP9*z{`eq9y3lZ-vx#6byuFyT0-$( zi6H`^I#ION{jinmQ*K^0(_3CuJuw6I*syDbIOmV6lwXvH!|9}mwDshoIRmSuzq3>J z@ZsWyxAv~Eu0N&fKgGt{Pj~D5FsNR#qBVtWEPe_`TCW*XL`D;V|4z?9GP(khKQM(B z<%&90jU+EMxwy#e>lu9KIVJgfm%Crk)0&xy$B4iqo1ieY@fU$+v3>4^GDICjv;KEl9m-4SO{w=54Yjm1@UrLHDu-h0 z!D?$&HIH}GP|XVM3l~kplz8AcTxETKjhfoa+1th5$=k1K*U-mUBJpqBSf-&(IjNXb z1b8p@T~)P>9*4;ja19R{no(M`-K5eao!;GsJB37VVaY(!7GSb{tM7L0yU$jN@4%I^ z-Z-a9w;}ZQ&Bfip!}D}bK9%d$DFT0|D>3tp?IHQofW*OhN6ij>Mv8qrlCdMU{8~i| zjBZc(Z=+)COzo9%?al6%Q6*E0(k?(s{O>vK3l2UX@*7_TDsVss9uZ9%-ujI~UlLr& zS#Wtxs(Fte|CSdUrPB9PQU=k)Lk z9jRIENze_uWy297h~oS%B-)1p!PLwE44DjQXXbU$;Wil`q$a8&6|bDRt%Rc$C%mD$ z2#!}%+yO+mqLQB%!~Oyzy97~&i;7`Z{cDKr8QGhFSL8N(pIc?`(d4&?_~VCGt^3Q# zjTCiv8`)!>A1$ixoBoQ7eo$mCKD^THtG_74e}m?UqMMw4j<98dD846R=Z%&_whp5V z*20ftPCK!HZ1At2fjOIte2uoSiy6x*N_^9NxDd#`%;WewE zPz#N)U><#)&dG6o(fcy(BUsbSs&}OuI5okd4R|nlmNSIw;} 
zoylEtmK{)VS+F4r*tw_6f$(H<|m|HaMTdM8JV~zGdP3YC{^HB}15ECO! z+|x|rBQ4*h>NSQM|Mp@`08f>!tKw{QLF~1MILjX&6_qe*?&-S|N6tPF!#Ry`on@K(vqCx zAS@Jj;Iq*AN7Q%tXs@K0kB^BTC|B}J=m0-nfL~}<{p=8JN{Al-OvP1RFaHYmY?OSX z%v;HJxSv=1`C#b>-ANdsUB?ydgj(xLnRfwxY~fi>()u&I?h^lQwfB5-dIk^G@9g$( zTgp3??cWgEzD>~2=58R@Gq;|${H~;qRU~71{VX?L7 zYZ(6nV#mym?HcEW0Bnhm=pnw_4WfHB+W2}=Jk3uL$tA^UBf%^{pWFxBpF&9fEf)G3 z^(I9ZcyIm(Vp`qz3XUJhpAsp~>%5nO_0bR2%i@8n?uHg%s`!9V^I#l=HLF4r1XbH4!xiz+>eX@gWpd5[k+R_wvTG`*o6HO%0d9x8U)0T~lM>9feXZ)99Mun= zWQ|Ich7IPR1$z(fBBsyH5(K@DCCiecr|OvIvpV`Ek7Wq%Z^Nekf_!#Tj-8dE|W=F%D0N z=Eq{$@cN#fyK4r(Wbm=LLO$@c)ajE*cP%}GWN;k9L>oTbLqu4i97do$&vXknGbmup zQP{bDJ=2Rp5UqbUF&LsWr+O@~J^M$(DML)B3=+oSuvlt@>1f~hbsa4<>;Y5cfzZ^S zNGGY9Z-}Q@(74^H!Kz|y#*$|j9#GdTo?E_D&{hN|Ic8S%y$H&wi`yf(!|wHTDR!&# zdbe@~wzZ})C$NMI3DKujFs7t^n{nbivE-IFJ0n6FI_MyzDNxD9=y9XQmZC=Tu9Qds zU9uE%ROxwMC`GV(38^EfQqRC5PrGhQN%DrCl0==HzI2EbHm0q3s3s|WK2Yo%&m;E3 zNieB2P^0lSDEnK8DsuG|h;Jm;O*k$aV7}P`#ZF|z`XQ!t_}s@hF4t!48$0Y>8^c|h zR3SiH>tTV7D&@S5*@Va6Hyn|3uCI53^@2pMejln>zEMVe!}8}-_t&r82A^lYI40KIM)sAFmI45&nb#cMldu&SpUN zY`Dbnas4xPlbr|V*2y@F%K5S|&t(5qZxTCh7wEI++jQoup8Z*?jRL#-B#Zkbk3HQF zpPk2_Gq<-KKW9MgY9bhM5N@{!e7OWMclgy;^fI5tOOH!OIDHERZm`N{ZnHrZ_1h|3 zg&G5uf(p4Du(?pW1)$r3EwV^1{9Wn<5g5=$wXR52ZPyo0E?)Ew+f$EgrShdKzT~QTn+~{2QxOhhA>8wBv}|e1~52=nug~o9#x|>faH$PP(=l;HeiKqGD>&25^HBEojeO=^PP&x5?Y4aWxhLz(gyH{rcs{jO z3^VpF;Afj4`1$$pVtp-H>)DBv4B!%N!j@495^N_B1fe0@KtOgd0xCQ{aI& zqJu{muppCdY##@LDf|?x3d?99OBf4HtPA&ilr7#%1yPid2=lC-Lsege@Aw#L5kxugiTeX0X?dH9CBLX^k37Zfd%2D$ zD3$V>$Go0f^a3nHk><7~%}hF}`%c?0dfC<&M>^Xg1~%vgt{iOKRMSf{N7OMD6Ig29 z%;Yo{RXnlz&NH?c<~T}hIWu%kM?{8pLOhYJXW<978*)30Q+ zC~bZqZ{fkIT+Hz_LLzGX{!mx@j5F&L0@p-(POBkVxmD%VvzxZ{eirv059$gV{^Ha4 zQaUG)H}fctr(*0#aUp7ivz%J$=)gT4%OD4Z(CpNgsKin+{JuZyrFLU{#DwoVQfz8L zXPoU+_aYo9kH+#lTL-)Ym_2ey%P&s|VIK#yKu0L5DAVxQmPgNKH_??4X~f*pbRu!m zuRquWYXF9%MOVn`SviD$r`d9ow z96)@7X*aMTR51f@u4Vvd?RTQSdoS!vJW?%4kMF5WHZq&*PG7`2&(0+`mEIh3vyKwC zxgVC#kq3IBdAMbwj9%!@)@;lWd0FO 
z_5%o!$`Y_&43G659u}5{CohXqH&rX1FFg2a)3ET~%&r2!cA6mTJv!;2f+FQ5Hue?? z9@Fl{L(U{`WbhU0I#R=@pA}~>>b$#`^gC0{kQ} zepBl`{196TRVV$-6;vN;2%&rN=<{deCT2*VfOD{VsFFN!35hV9$g*enb0W8n@Gb|M zVh~wG$EdIh6R73`r@+rUO9Ue8QZ`LQ|24YGG51tChB&T0EKH(S*XK$12xTMGKNHUa zK78DJs)R>NZ*ZcU3Bx^i?+e~ei^^CMR4no61*+l2=?}fgq`Zq_ah@8?dB?!-02=1x%P2ilH zraZ?)xOOYeZ>FuTQ&`QL&0kRT?#f*(eQmjjpL(49mwCgM<+<;my2pNc(aG4QMjMMR zUS3`5O*FYT4#u`Xm-f0aCgbP!JpT^+2Go+BAKA3;f-eoWCz35BwLzw5=Z$FtcLXW4 zQ>{YfZOxXW>rZShCjO~EN2k1bDH-R-bKI_(5~sHiYOPs)#^=`>twNbUH`UI6Eo3z* zRt&$7uO?YS-D^5@&!IJ=R1Uw^PjJ(%pz!y+(`3<%mx)Y#%SxK_Tv=ZipCnKp0mtLM zmV#{f&To95zvho_d)59<OHV+$+Nau#n|$*{#r15QY%P8V*g1jBG4O3+>L)cmKO6j{22JR|zyU)}W5xZ=Z^tq3*4Aby2gH`+Z7X#&+k| zP+JN5^)xhUf0-TCl(4ybD}0$($sdR@IV4AZ3OghV4ZiI$h(bhy)DBy-59L|KHrOgP zU#0k0f#QD_wlhnPK632|Pq1D~w4uX_dpM(?uPqU$Kh3=N!orM&Wk-ICV<(xpzfh98 zJsf~1Eir=a{oyWvmKL3OS9~iaqu^*I~|6BD#i&_gqIg$okXlpU$4){YR zrvc&l40s8uy~4zD=jYvlz!`>E;Mb1Cw@ze{DN8j!R7__7UyV9b;o<@MQlD54E)$KN z>On1tD|d~aEsc0PeZDf8&wLqTAN-5|;czeW<9120TS4^kSaMbb!i!&2oSDzOEdJsD z4zHkku)O>dBJs0K&&BTdZ224iAu-0c4DJD5u{CQEM>%%bTD=65ef)*EI*SA^VY}9` zBOE0!vR&7p6DtoZLC_`FiF#LToTs*mGQXsH`;mVQ^xg_br81EkKZuX&L5ahV25IsB z8C%dn`%;!!cm9z(S79HVrB;pZ0XYJrxPy0Da|7>3p$I$wEjj!pttt=dwaLTQ20E%&z zVhLR69z27OyaaZ^DkvC-Zj-C;Ry*Wc-T{Yl?g-E_43XwiVxIG7eiq?A+pKG0lYh#i zX5*Uuw+RH~=x}S=FYPQ0MVvWqo96Gt`4UsKzo%y&R>CmASl1_6OBhQJYUeCdoNdiI zK=zI2+4k9@TFvj&YSonB@HfxXB%R}+n;dfG%Zbgnr@|Ajb!V(T7OvAB^0lMfG=U_4 zj{6Wj&YFz7anEI?MIBO14}_zlM6AUgx$$hCKH8f|0+(C^kHJjbCyG0z{$3t&J{X)9 zatH0u*w18g-TU>zm7c|}=2)!^+n-&YvKM=f59hB^4)J6xI@|vJVopXLv5{aX+5zHA z34>9X->DiI6U$f@Rgg#{=5*P*a=uA3{IpZHBht(31Rs=blD+00EE%>sVK(65=aSlu zCMx2S?suVINf@)O2nK1*!Sa}pO)_k6+gdrsGaI7cGDCwr6-^Z2$aUx*eD*$cBRtYB zpWvElk6HFby6>{R&rk9kR9DR6pYpfYyr4$pH9fn|Fgx|(g>mJ3ns=ya9r9iRn8le= z(R~UTcU#(J0P9^M8-iXSX6!%y1}u4*%JSYJ@GLhl;6OL$KV;AhLBR8ls{(elc+Luk z#|sum{K5lKEsR5>Hdg~~*>QizW6!pxA#NsH;0Ew`!?N|dBcE4k{Yp#T5uh_|x%IkuPwCzln0VzbSDL5RbwzHMdfKcj&+RKMmq-r7O*mR(GN_!03su5*y zT8OQDm6s#%e0*KvU(!=Lvw(d(_dxOL`ZS$=@G=k>7apdVLoN%DMiW3jK} 
zGbR&_M57f=dplcRAD_nC)O_{uMZ5|}SPQ>@zw~`kcv$40>p*(urfHn~!Rj|Ft#?Fp zFnLcGQkrmFjM3LAmL`opji0SaAxT{Wmtbk8$TUMqRnH{+(Vv=2$=cxTJho77Rp`#S zk*SSeS@>%`{Swa?HZw*I{%Vu=WWWkWg`XTsT=W%k^CYvd1Nt*?)o+K`BpRjiWC^uk z>zzf+)%S(mV!w{lJedlzY`AlOPlK>2!qJRx!cYtGeQSm^5++nPE=%M1_b-O_1NqOI`NkWCbb$F2aaQ_1yiPl*WGKIfoj>zY1dG2f7MuQM6{)(PuL+EN zXcN~rzt5%6=LVGfud*`O>94|134mEbCn2$~&1`MN{o*EoJOHsISj>Gnp-eYI2kDSkt`)in?jxl@8WbkO8>8tfnT@wQzgJ ze~2;JPj3ZZAzQIrJs%G~PiX`M@sAkO+5`HVaIk|#(uUVw2v`nue5NOqm=tGcJx+oQ z<*KQ_C}%`GgV`x3*Apnnz47sD0&B36Lh70oplLLs6Jn4+Qlx9-X1vebxZYw zeSC?KE7Ts{5yzCqt08ajwv-|=cMkq0WvX;sUn9|2z#~P&r!Q;;X;3gh@c;W?&R-z; zPiYSPNBqxUI{#Y#-(dBxEeJ?+;CJGGq3gfz_rIam|F=;3Yw_POOhFnF>YqvQzqS0Y zOd$sW;Q*2KR92KBQMY%pHZrv{wjhzUH!-y#F?2GvuyiveVP;^0v;Aj;-r3l~)Yj0M Mhs4v|+#c?~02aODegFUf literal 0 HcmV?d00001 diff --git a/library/filter.php b/library/filter.php index eb925c8..cb3b545 100644 --- a/library/filter.php +++ b/library/filter.php @@ -4,16 +4,7 @@ class Filter { static public function url(string $url) { - return trim($url); - } - - static public function imageAlt(string $alt) { - - $alt = preg_replace('/[\s]+/', ' ', $alt); - - $alt = trim($alt); - - return $alt; + return trim(urldecode($url)); } static public function pageTitle(string $title) { diff --git a/library/mysql.php b/library/mysql.php new file mode 100644 index 0000000..7be3c48 --- /dev/null +++ b/library/mysql.php @@ -0,0 +1,196 @@ +_db = new PDO('mysql:dbname=' . $database . ';host=' . $host . ';port=' . $port . 
';charset=utf8', $username, $password, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); + $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); + $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); + } + + // System + public function beginTransaction() { + + $this->_db->beginTransaction(); + } + + public function commit() { + + $this->_db->commit(); + } + + public function rollBack() { + + $this->_db->rollBack(); + } + + // Host + public function getHost(int $crc32url) { + + $query = $this->_db->prepare('SELECT * FROM `host` WHERE `crc32url` = ? LIMIT 1'); + + $query->execute([$crc32url]); + + return $query->fetch(); + } + + public function addHost(string $scheme, string $name, mixed $port, int $crc32url, int $timeAdded, mixed $timeUpdated, int $crawlPageLimit, string $crawlPageMetaOnly, string $status, mixed $robots) { + + $query = $this->_db->prepare('INSERT INTO `host` (`scheme`, `name`, `port`, `crc32url`, `timeAdded`, `timeUpdated`, `crawlPageLimit`, `crawlPageMetaOnly`, `status`, `robots`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + + $query->execute([$scheme, $name, $port, $crc32url, $timeAdded, $timeUpdated, $crawlPageLimit, $crawlPageMetaOnly, $status, $robots]); + + return $this->_db->lastInsertId(); + } + + // Pages + public function getTotalHostPages(int $hostId) { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `hostId` = ?'); + + $query->execute([$hostId]); + + return $query->fetch()->total; + } + + public function getTotalPagesByHttpCode(mixed $httpCode) { + + if (is_null($httpCode)) { + + $query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` IS NULL'); + + } else { + + $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE `httpCode` = ?'); + + $query->execute([$httpCode]); + + } + + return $query->fetch()->total; + } + + public function getTotalPages() { + + $query = 
$this->_db->prepare('SELECT COUNT(*) AS `total` FROM `hostPage`'); + + $query->execute(); + + return $query->fetch()->total; + } + + public function getHostPage(int $hostId, int $crc32uri) { + + $query = $this->_db->prepare('SELECT * FROM `hostPage` WHERE `hostId` = ? AND `crc32uri` = ? LIMIT 1'); + + $query->execute([$hostId, $crc32uri]); + + return $query->fetch(); + } + + public function getFoundHostPage(int $hostPageId) { + + $query = $this->_db->prepare('SELECT `hostPage`.`metaTitle`, + `hostPage`.`metaDescription`, + `hostPage`.`data`, + `hostPage`.`uri`, + `host`.`scheme`, + `host`.`name`, + `host`.`port` + + FROM `hostPage` + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + + WHERE `hostPage`.`hostPageId` = ? + + LIMIT 1'); + + $query->execute([$hostPageId]); + + return $query->fetch(); + } + + public function addHostPage(int $hostId, + int $crc32uri, + string $uri, + int $timeAdded, + mixed $timeUpdated = null, + mixed $httpCode = null, + mixed $rank = null, + mixed $metaTitle = null, + mixed $metaDescription = null, + mixed $metaKeywords = null, + mixed $data = null) { + + $query = $this->_db->prepare('INSERT INTO `hostPage` (`hostId`, + `crc32uri`, + `uri`, + `timeAdded`, + `timeUpdated`, + `httpCode`, + `rank`, + `metaTitle`, + `metaDescription`, + `metaKeywords`, + `data`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'); + + $query->execute([$hostId, $crc32uri, $uri, $timeAdded, $timeUpdated, $httpCode, $rank, $metaTitle, $metaDescription, $metaKeywords, $data]); + + return $this->_db->lastInsertId(); + } + + public function updateHostPage( int $hostPageId, + mixed $metaTitle, + mixed $metaDescription, + mixed $metaKeywords, + mixed $data) { + + $query = $this->_db->prepare('UPDATE `hostPage` SET `metaTitle` = ?, + `metaDescription` = ?, + `metaKeywords` = ?, + `data` = ? WHERE `hostPageId` = ? 
LIMIT 1'); + + $query->execute([$metaTitle, $metaDescription, $metaKeywords, $data, $hostPageId]); + + return $query->rowCount(); + } + + // Crawl tools + public function getCrawlQueue(int $limit, int $timeFrom) { + + $query = $this->_db->prepare('SELECT `hostPage`.`hostPageId`, + `hostPage`.`uri`, + `host`.`scheme`, + `host`.`name`, + `host`.`port`, + `host`.`crawlPageLimit`, + `host`.`crawlPageMetaOnly`, + `host`.`robots` + + FROM `hostPage` + JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) + + WHERE (`hostPage`.`timeUpdated` IS NULL OR `hostPage`.`timeUpdated` < ? ) AND `host`.`status` <> 0 + + ORDER BY `hostPage`.`hostPageId` + + LIMIT ' . (int) $limit); + + $query->execute([$timeFrom]); + + return $query->fetchAll(); + } + + public function updateCrawlQueue(string $hostPageId, int $timeUpdated, int $httpCode) { + + $query = $this->_db->prepare('UPDATE `hostPage` SET `timeUpdated` = ?, `httpCode` = ? WHERE `hostPageId` = ? LIMIT 1'); + + $query->execute([$timeUpdated, $httpCode, $hostPageId]); + + return $query->rowCount(); + } +} diff --git a/library/parser.php b/library/parser.php new file mode 100644 index 0000000..5149427 --- /dev/null +++ b/library/parser.php @@ -0,0 +1,73 @@ + null, + 'scheme' => null, + 'name' => null, + 'port' => null, + ]; + + if ($hostScheme = parse_url($string, PHP_URL_SCHEME)) { + + $result['string'] = $hostScheme . '://'; + + $result['scheme'] = $hostScheme; + + } else { + + return false; + } + + if ($hostName = parse_url($string, PHP_URL_HOST)) { + + $result['string'] .= $hostName; + + $result['name'] = $hostName; + + } else { + + return false; + } + + if ($hostPort = parse_url($string, PHP_URL_PORT)) { + + $result['string'] .= ':' . 
$hostPort; + + $result['port'] = $hostPort; + + } + + return (object) $result; + } + + static public function uri(string $string) { + + $result = [ + 'string' => '/', + 'path' => '/', + 'query' => null, + ]; + + if ($path = parse_url($string, PHP_URL_PATH)) { + + $result['string'] = $path; + + $result['path'] = $path; + + } + + if ($query = parse_url($string, PHP_URL_QUERY)) { + + $result['string'] .= '?' . $query; + + $result['query'] = '?' . $query; + + } + + return (object) $result; + } +} \ No newline at end of file diff --git a/library/robots.php b/library/robots.php index 49beb10..40949cb 100644 --- a/library/robots.php +++ b/library/robots.php @@ -1,7 +1,5 @@ _rule as $rule => $value) { - if (preg_match('!^' . $rule . '!', $url)) { + if (preg_match('!^' . $rule . '!', $uri)) { $result = $value; } diff --git a/library/sphinxql.php b/library/sphinxql.php new file mode 100644 index 0000000..1780b8b --- /dev/null +++ b/library/sphinxql.php @@ -0,0 +1,31 @@ +_sphinx = new PDO('mysql:host=' . $host . ';port=' . $port . ';charset=utf8', false, false, [PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES utf8']); + $this->_sphinx->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + $this->_sphinx->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); + } + + public function searchHostPages(string $keyword, int $start, int $limit) { + + $query = $this->_sphinx->prepare('SELECT * FROM `hostPage` WHERE MATCH(?) LIMIT ' . (int) $start . ',' . (int) $limit); + + $query->execute([$keyword]); + + return $query->fetchAll(); + } + + public function searchHostPagesTotal(string $keyword) { + + $query = $this->_sphinx->prepare('SELECT COUNT(*) AS `total` FROM `hostPage` WHERE MATCH(?)'); + + $query->execute([$keyword]); + + return $query->fetch()->total; + } +} diff --git a/library/sqlite.php b/library/sqlite.php deleted file mode 100644 index 8878986..0000000 --- a/library/sqlite.php +++ /dev/null @@ -1,170 +0,0 @@ -_db = new PDO('sqlite:' . 
$database, $username, $password); - $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); - $this->_db->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_OBJ); - $this->_db->setAttribute(PDO::ATTR_TIMEOUT, 600); - - $this->_db->query(' - CREATE TABLE IF NOT EXISTS "page" ( - "pageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, - "crc32url" INTEGER NOT NULL UNIQUE, - "httpCode" INTEGER, - "timeAdded" INTEGER NOT NULL, - "timeUpdated" INTEGER, - "title" TEXT, - "data" TEXT, - "description" TEXT, - "keywords" TEXT, - "url" TEXT NOT NULL - ) - '); - - $this->_db->query(' - CREATE TABLE IF NOT EXISTS "image" ( - "imageId" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, - "crc32src" INTEGER NOT NULL UNIQUE, - "pageId" INTEGER NOT NULL, - "alt" TEXT NOT NULL, - "src" TEXT NOT NULL - ) - '); - - // FTS5 - $this->_db->query(' - CREATE VIRTUAL TABLE IF NOT EXISTS `ftsPage` USING fts5(`url`, `title`, `description`, `keywords`, `data`, tokenize=`unicode61`, content=`page`, content_rowid=`pageId`) - '); - - $this->_db->query(' - CREATE TRIGGER IF NOT EXISTS `pageInsert` AFTER INSERT ON `page` BEGIN - INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`); - END - '); - - $this->_db->query(' - CREATE TRIGGER IF NOT EXISTS `pageDelete` AFTER DELETE ON `page` BEGIN - INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`); - END - '); - - $this->_db->query(' - CREATE TRIGGER IF NOT EXISTS `pageUpdate` UPDATE ON `page` BEGIN - INSERT INTO ftsPage(`ftsPage`, `rowid`, `url`, `title`, `description`, `keywords`, `data`) VALUES ("delete", `old`.`pageId`, `old`.`url`, `old`.`title`, `old`.`description`, `old`.`keywords`, `old`.`data`); - INSERT INTO ftsPage(`rowid`, `url`, `title`, `description`, 
`keywords`, `data`) VALUES (`new`.`pageId`, `new`.`url`, `new`.`title`, `new`.`description`, `new`.`keywords`, `new`.`data`); - END - '); - } - - public function getTotalPagesByHttpCode(mixed $httpCode) { - - if (is_null($httpCode)) { - - $query = $this->_db->query('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` IS NULL'); - - } else { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page` WHERE `httpCode` = ?'); - - $query->execute([$httpCode]); - - } - - return $query->fetch()->total; - } - - public function getTotalPages() { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `page`'); - - $query->execute(); - - return $query->fetch()->total; - } - - public function updatePage(int $pageId, string $title, string $description, string $keywords, string $data, int $timeUpdated) { - - $query = $this->_db->prepare('UPDATE `page` SET `title` = ?, `description` = ?, `data` = ?, `timeUpdated` = ? WHERE `pageId` = ?'); - - $query->execute([$title, $description, $data, $timeUpdated, $pageId]); - - return $query->rowCount(); - } - - public function addPage(string $title, string $description, string $keywords, string $data, int $timeAdded) { - - $query = $this->_db->prepare('INSERT INTO `page` (`title`, `description`, `data`, `timeAdded`) VALUES (?, ?, ?, ?)'); - - $query->execute([$title, $description, $data, $timeAdded]); - - return $this->_db->lastInsertId(); - } - - public function initPage(string $url, int $crc32url, int $timeAdded) { - - $query = $this->_db->prepare('INSERT OR IGNORE INTO `page` (`url`, `crc32url`, `timeAdded`) VALUES (?, ?, ?)'); - - $query->execute([$url, $crc32url, $timeAdded]); - - return $this->_db->lastInsertId(); - } - - public function addImage(int $pageId, string $src, int $crc32src, string $alt) { - - $query = $this->_db->prepare('INSERT OR IGNORE INTO `image` (`pageId`, `src`, `crc32src`, `alt`) VALUES (?, ?, ?, ?)'); - - $query->execute([$pageId, $src, $crc32src, $alt]); - - return 
$this->_db->lastInsertId(); - } - - public function deleteImages(int $pageId) { - - $query = $this->_db->prepare('DELETE FROM `image` WHERE `pageId` = ?'); - - $query->execute([$pageId]); - - return $query->rowCount(); - } - - public function getPageQueue(int $limit, int $timeFrom) { - - $query = $this->_db->prepare('SELECT * FROM `page` WHERE `timeUpdated` IS NULL OR `timeUpdated` < ? ORDER BY `pageId` LIMIT ' . (int) $limit); - - $query->execute([$timeFrom]); - - return $query->fetchAll(); - } - - public function updatePageQueue(string $pageId, int $timeUpdated, int $httpCode) { - - $query = $this->_db->prepare('UPDATE `page` SET `timeUpdated` = ?, `httpCode` = ? WHERE `pageId` = ? LIMIT 1'); - - $query->execute([$timeUpdated, $httpCode, $pageId]); - - return $query->rowCount(); - } - - public function searchPages(string $q, int $start = 0, int $limit = 100) { - - $query = $this->_db->prepare('SELECT `title`, `description`, `url` FROM `ftsPage` WHERE `data` MATCH ? ORDER BY `rank` LIMIT ' . (int) $start . ',' . 
(int) $limit); - - $query->execute([$q]); - - return $query->fetchAll(); - } - - public function searchPagesTotal(string $q) { - - $query = $this->_db->prepare('SELECT COUNT(*) AS `total` FROM `ftsPage` WHERE `data` MATCH ?'); - - $query->execute([$q]); - - return $query->fetch()->total; - } -} diff --git a/public/index.php b/public/index.php index cf485eb..4c789eb 100644 --- a/public/index.php +++ b/public/index.php @@ -3,10 +3,10 @@ // Load system dependencies require_once('../config/app.php'); require_once('../library/filter.php'); -require_once('../library/sqlite.php'); +require_once('../library/mysql.php'); // Connect database -$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); +$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); $totalPages = $db->getTotalPages(); diff --git a/public/search.php b/public/search.php index e8ad6e9..6fcee6a 100644 --- a/public/search.php +++ b/public/search.php @@ -2,11 +2,18 @@ // Load system dependencies require_once('../config/app.php'); +require_once('../library/curl.php'); +require_once('../library/robots.php'); require_once('../library/filter.php'); -require_once('../library/sqlite.php'); +require_once('../library/parser.php'); +require_once('../library/mysql.php'); +require_once('../library/sphinxql.php'); + +// Connect Sphinx search server +$sphinx = new SphinxQL(SPHINX_HOST, SPHINX_PORT); // Connect database -$db = new SQLite(DB_NAME, DB_USERNAME, DB_PASSWORD); +$db = new MySQL(DB_HOST, DB_PORT, DB_NAME, DB_USERNAME, DB_PASSWORD); // Define page basics $totalPages = $db->getTotalPages(); @@ -23,14 +30,76 @@ $p = !empty($_GET['p']) ? 
(int) $_GET['p'] : 1; // Crawl request if (filter_var($q, FILTER_VALIDATE_URL) && preg_match(CRAWL_URL_REGEXP, $q)) { - $db->initPage($q, crc32($q), time()); + $db->beginTransaction(); + + try { + + // Parse host info + if ($hostURL = Parser::hostURL($q)) { + + // Host exists + if ($host = $db->getHost(crc32($hostURL->string))) { + + $hostStatus = $host->status; + $hostPageLimit = $host->crawlPageLimit; + $hostId = $host->hostId; + $hostRobots = $host->robots; + + // Register new host + } else { + + // Fetch robots.txt; keep it only on HTTP 200 with a "user-agent:" directive, otherwise store null + $curl = new Curl($hostURL->string . '/robots.txt'); + + if (200 == $curl->getCode() && false !== stripos($curl->getContent(), 'user-agent:')) { + $hostRobots = $curl->getContent(); + } else { + $hostRobots = null; + } + + $hostStatus = CRAWL_HOST_DEFAULT_STATUS; + $hostPageLimit = CRAWL_HOST_DEFAULT_PAGES_LIMIT; + $hostId = $db->addHost($hostURL->scheme, + $hostURL->name, + $hostURL->port, + crc32($hostURL->string), + time(), + null, + $hostPageLimit, + (string) CRAWL_HOST_DEFAULT_META_ONLY, + (string) $hostStatus, + $hostRobots); + } + + // Parse page URI + $hostPageURI = Parser::uri($q); + + // Init robots parser: use the host's own robots.txt rules when present, otherwise fall back to CRAWL_ROBOTS_DEFAULT_RULES + $robots = new Robots($hostRobots ? (string) $hostRobots : (string) CRAWL_ROBOTS_DEFAULT_RULES); + + // Save page info + if ($hostStatus && // host enabled + $robots->uriAllowed($hostPageURI->string) && // page allowed by robots.txt rules + $hostPageLimit > $db->getTotalHostPages($hostId) && // pages quantity not reached host limit + !$db->getHostPage($hostId, crc32($hostPageURI->string))) { // page not exists + + $db->addHostPage($hostId, crc32($hostPageURI->string), $hostPageURI->string, time()); + } + } + + $db->commit(); + + } catch(Exception $e){ + + $db->rollBack(); + } } // Search request if (!empty($q)) { - $results = $db->searchPages('"' . $q . 
'"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT); - $resultsTotal = $db->searchPagesTotal('"' . $q . '"'); + $results = $sphinx->searchHostPages('"' . $q . '"', $p * WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT - WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT, WEBSITE_PAGINATION_SEARCH_RESULTS_LIMIT); + $resultsTotal = $sphinx->searchHostPagesTotal('"' . $q . '"'); } else { @@ -196,16 +265,19 @@ if (!empty($q)) { -
-

title ?>

- description)) { ?> - description ?> - - - favicon - url ?> - -
+ getFoundHostPage($result->id)) { ?> + scheme . '://' . $hostPage->name . ($hostPage->port ? ':' . $hostPage->port : false) . $hostPage->uri ?> +
+

metaTitle ?>

+ metaDescription)) { ?> + metaDescription ?> + + + favicon + + +
+