From 080ac3046892acd2571b6a439b184f987de64ba5 Mon Sep 17 00:00:00 2001 From: Denis Ryabov Date: Tue, 2 Sep 2014 02:07:27 +0400 Subject: [PATCH] improve search --- src/twister.cpp | 320 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 233 insertions(+), 87 deletions(-) diff --git a/src/twister.cpp b/src/twister.cpp index 92e17f78..f260bcda 100644 --- a/src/twister.cpp +++ b/src/twister.cpp @@ -2347,33 +2347,201 @@ Value torrentstatus(const Array& params, bool fHelp) return result; } +class TextSearch +{ +public: + enum search_mode { + TEXTSEARCH_EXACT, + TEXTSEARCH_ALL, + TEXTSEARCH_ANY + }; + + TextSearch(std::string const &keyword, libtorrent::entry const ¶ms); + + bool matchText(std::string msg); + libtorrent::lazy_entry const* matchRawMessage(std::string const &rawMessage); + +private: + std::vector keywords; + search_mode mode; + bool caseInsensitive; + int64_t timeMin, timeMax; + std::string username; +}; + +TextSearch::TextSearch(string const &keyword, entry const ¶ms) : + mode(TEXTSEARCH_EXACT), + caseInsensitive(false), + timeMin(0), + timeMax(numeric_limits::max()) +{ + entry const *pMode = params.find_key("mode"); + if( pMode && pMode->type() == entry::string_t ) { + string strMode = pMode->string(); + if( strMode == "all" ) { + mode = TEXTSEARCH_ALL; + } else if( strMode == "any" ) { + mode = TEXTSEARCH_ANY; + } + } + + entry const *pCase = params.find_key("case"); + if( pCase && pCase->type() == entry::string_t && pCase->string() == "insensitive" ) { + caseInsensitive = true; + } + + int64_t now = GetAdjustedTime(); + + entry const *pAgeMin = params.find_key("agemin"); + if( pAgeMin && pAgeMin->type() == entry::int_t ) { + timeMax = now - pAgeMin->integer() * 24*60*60; + } + + entry const *pAgeMax = params.find_key("agemax"); + if( pAgeMax && pAgeMax->type() == entry::int_t ) { + timeMin = now - pAgeMax->integer() * 24*60*60; + } + + entry const *pUsername = params.find_key("username"); + if( pUsername && pUsername->type() == entry::string_t ) { + username = pUsername->string(); + } + + if( mode == TEXTSEARCH_EXACT ) { + keywords.push_back( keyword ); + } else { + stringstream stream( keyword ); + string word; + while( getline(stream, word, ' ') ) { + if( !word.empty() ) { + keywords.push_back( word ); + } + } + } + + if( caseInsensitive ) { + for( vector::iterator it=keywords.begin(); it != keywords.end(); ++it ) { +#ifdef HAVE_BOOST_LOCALE + *it = boost::locale::to_lower(*it); +#else + boost::algorithm::to_lower(*it); +#endif + } + } +} + +bool TextSearch::matchText(string msg) +{ + if( keywords.size() == 0 ) { + return false; + } + + if( caseInsensitive ) { +#ifdef HAVE_BOOST_LOCALE + msg = boost::locale::to_lower(msg); +#else + boost::algorithm::to_lower(msg); +#endif + } + + switch( mode ) { + case TEXTSEARCH_EXACT: + return msg.find(keywords[0]) != string::npos; + case TEXTSEARCH_ALL: + for( vector::const_iterator it=keywords.begin(); it != keywords.end(); ++it ) { + if( msg.find(*it) == string::npos ) { + return false; + } + } + return true; + case TEXTSEARCH_ANY: + for( vector::const_iterator it=keywords.begin(); it != keywords.end(); ++it ) { + if( msg.find(*it) != string::npos ) { + return true; + } + } + return false; + } + return false; +} + +lazy_entry const* TextSearch::matchRawMessage(string const &rawMessage) +{ + if( keywords.size() == 0 ) { + return 0; + } + // fast check + if( mode != TEXTSEARCH_ANY && rawMessage.find(keywords[0]) == string::npos ) { + return 0; + } + + lazy_entry v; + int pos; + libtorrent::error_code ec; + if (lazy_bdecode(rawMessage.data(), rawMessage.data()+rawMessage.size(), v, ec, &pos) == 0) { + lazy_entry const* vv = v.dict_find_dict("v"); + lazy_entry const* post = vv ? vv->dict_find_dict("userpost") : v.dict_find_dict("userpost"); + if( post ) { + lazy_entry const* rt = post->dict_find_dict("rt"); + lazy_entry const* p = rt ? rt : post; + + if( username.length() ) { + string user = p->dict_find_string_value("n"); + if( user != username ) { + return 0; + } + } + + int64_t time = p->dict_find_int_value("time"); + if( time < timeMin || time > timeMax ) { + return 0; + } + + string msg = p->dict_find_string_value("msg"); + return matchText( msg ) ? p : 0; + } + } + return 0; +} + Value search(const Array& params, bool fHelp) { if (fHelp || params.size() < 3 || params.size() > 4) throw runtime_error( - "search []\n" - "search text in known data\n" + "search ['{\"username\":username,\"mode\":\"exact\"|\"all\"|\"any\",\"case\":\"sensitive\"\"insensitive\",\"agemin\":agemin,\"agemax\":agemin}']\n" + "search text in available data\n" " is data area: messages, directmsgs, profiles, users, hashtags\n" " is a phrase to search\n" "up to entries are returned\n" " in messages scope is optional and allows to search in username's messages only\n" - " in directmsgs scope is required and sets whose conversation to search"); + " in directmsgs scope is required and sets whose conversation to search\n" + "\"mode\" and \"case\" are search mode options\n" + "\"agemin\" and \"agemax\" (days) are message filters\n" + "\"mode\", \"case\", \"agemin\", and \"agemax\" are optional"); string scope = params[0].get_str(); string keyword = params[1].get_str(); int count = params[2].get_int(); - string username = params.size()==4 ? params[3].get_str() : string(); + entry options = params.size()==4 ? jsonToEntry(params[3].get_obj()) : entry(); + string username; if( keyword.size() == 0 ) { throw runtime_error("Empty parameter"); } + entry const *pUsername = options.find_key("username"); + if( pUsername && pUsername->type() == entry::string_t ) { + username = pUsername->string(); + } + Array ret; if( scope == "messages" ) { // search public messages std::map< pair, pair > posts; + TextSearch searcher(keyword, options); + // search public messages in torrents { LOCK(cs_twister); @@ -2391,29 +2559,17 @@ Value search(const Array& params, bool fHelp) item.second.get_pieces(pieces, std::numeric_limits::max(), std::numeric_limits::max(), -1, ~USERPOST_FLAG_DM); BOOST_FOREACH(string const& piece, pieces) { - if( piece.find(keyword) != string::npos ) { - lazy_entry v; - int pos; - libtorrent::error_code ec; - if (lazy_bdecode(piece.data(), piece.data()+piece.size(), v, ec, &pos) == 0) { - lazy_entry const* post = v.dict_find_dict("userpost"); - if( post ) { - lazy_entry const* rt = post->dict_find_dict("rt"); - lazy_entry const* p = rt ? rt : post; - string msg = p->dict_find_string_value("msg"); - if( msg.find(keyword) != string::npos ) { - string n = p->dict_find_string_value("n"); - int k = p->dict_find_int_value("k"); - int64 time = p->dict_find_int_value("time",-1); + lazy_entry const* p = searcher.matchRawMessage(piece); + if( p ) { + string n = p->dict_find_string_value("n"); + int k = p->dict_find_int_value("k"); + int64 time = p->dict_find_int_value("time",-1); - entry vEntry; - vEntry = *p; - hexcapePost(vEntry); + entry vEntry; + vEntry = *p; + hexcapePost(vEntry); - posts[pair(n,k)] = pair(time,vEntry); - } - } - } + posts[pair(n,k)] = pair(time,vEntry); } } } @@ -2430,39 +2586,19 @@ Value search(const Array& params, bool fHelp) continue; for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) { string str_p = j->find_key("p")->string(); - if( str_p.find(keyword) != string::npos ) { - lazy_entry p; - int pos; - libtorrent::error_code err; - int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos); - - lazy_entry const* v = p.dict_find_dict("v"); - if( v ) { - lazy_entry const* post = v->dict_find_dict("userpost"); - if( post ) { - // post, mention, status - lazy_entry const* rt = post->dict_find_dict("rt"); - lazy_entry const* p = rt ? rt : post; - string msg = p->dict_find_string_value("msg"); - if( msg.find(keyword) != string::npos ) { - string n = p->dict_find_string_value("n"); - - if( username.size() == 0 || n == username ) { - int k = p->dict_find_int_value("k"); - - pair post_id(n,k); - if( posts.count(post_id) == 0 ) { - int64 time = p->dict_find_int_value("time",-1); - - entry vEntry; - vEntry = *p; - hexcapePost(vEntry); - - posts[post_id] = pair(time,vEntry); - } - } - } - } + lazy_entry const* p = searcher.matchRawMessage(str_p); + if( p ) { + string n = p->dict_find_string_value("n"); + int k = p->dict_find_int_value("k"); + pair post_id(n,k); + if( posts.count(post_id) == 0 ) { + int64 time = p->dict_find_int_value("time",-1); + + entry vEntry; + vEntry = *p; + hexcapePost(vEntry); + + posts[post_id] = pair(time,vEntry); } } } @@ -2481,16 +2617,18 @@ Value search(const Array& params, bool fHelp) } else if( scope == "directmsgs" ) { // search direct messages - if( m_users.count(username) ){ + if( m_users.count(username) ) { std::multimap postsByTime; + TextSearch searcher(keyword, options); + { LOCK(cs_twister); BOOST_FOREACH(const PAIRTYPE(std::string,std::vector)& list, m_users[username].m_directmsg) { string remoteUser = list.first; BOOST_FOREACH(const StoredDirectMsg& item, list.second) { - if( item.m_text.find(keyword) != string::npos ) { + if( searcher.matchText(item.m_text) ) { int64 time = item.m_utcTime; entry vEntry; vEntry["remoteUser"] = remoteUser; @@ -2518,38 +2656,38 @@ Value search(const Array& params, bool fHelp) entry data = ses->dht_getLocalData(); std::map users; + TextSearch searcher(keyword, options); + for (entry::dictionary_type::const_iterator i = data.dict().begin(); i != data.dict().end(); ++i) { if ( i->second.type() != entry::list_t ) continue; for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) { string str_p = j->find_key("p")->string(); - if( str_p.find(keyword) != string::npos ) { - lazy_entry p; - int pos; - libtorrent::error_code err; - int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos); - - lazy_entry const* target = p.dict_find_dict("target"); - if( target ) { - string resource = target->dict_find_string_value("r"); - if( resource == "profile" ) { - lazy_entry const* v = p.dict_find_dict("v"); - if( v ) { - string bio = v->dict_find_string_value("bio"); - string fullname = v->dict_find_string_value("fullname"); - string location = v->dict_find_string_value("location"); - string url = v->dict_find_string_value("url"); - - if( bio.find(keyword) != string::npos || - fullname.find(keyword) != string::npos || - location.find(keyword) != string::npos || - url.find(keyword) != string::npos ) { - - string n = target->dict_find_string_value("n"); - entry vEntry; - vEntry = *v; - users.insert(pair(n,vEntry)); - } + lazy_entry p; + int pos; + libtorrent::error_code err; + int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos); + + lazy_entry const* target = p.dict_find_dict("target"); + if( target ) { + string resource = target->dict_find_string_value("r"); + if( resource == "profile" ) { + lazy_entry const* v = p.dict_find_dict("v"); + if( v ) { + string bio = v->dict_find_string_value("bio"); + string fullname = v->dict_find_string_value("fullname"); + string location = v->dict_find_string_value("location"); + string url = v->dict_find_string_value("url"); + + if( searcher.matchText(bio) || + searcher.matchText(fullname) || + searcher.matchText(location) || + searcher.matchText(url) ) { + + string n = target->dict_find_string_value("n"); + entry vEntry; + vEntry = *v; + users.insert(pair(n,vEntry)); } } } @@ -2571,6 +2709,8 @@ Value search(const Array& params, bool fHelp) // @todo: there should be a faster way std::multimap usernamesByLength; + boost::algorithm::to_lower(keyword); + string allowed = "abcdefghijklmnopqrstuvwxyz0123456789_"; for( int i = 0; i < allowed.size(); ++i ) { set usernames; @@ -2593,6 +2733,12 @@ Value search(const Array& params, bool fHelp) // search hashtags std::multimap hashtagsByLength; +#ifdef HAVE_BOOST_LOCALE + keyword = boost::locale::to_lower(keyword); +#else + boost::algorithm::to_lower(keyword); +#endif + { LOCK(cs_seenHashtags);