improve search

This commit is contained in:
Denis Ryabov 2014-09-02 02:07:27 +04:00
parent c500409517
commit 080ac30468

View File

@ -2347,33 +2347,201 @@ Value torrentstatus(const Array& params, bool fHelp)
return result; return result;
} }
class TextSearch
{
public:
enum search_mode {
TEXTSEARCH_EXACT,
TEXTSEARCH_ALL,
TEXTSEARCH_ANY
};
TextSearch(std::string const &keyword, libtorrent::entry const &params);
bool matchText(std::string msg);
libtorrent::lazy_entry const* matchRawMessage(std::string const &rawMessage);
private:
std::vector<std::string> keywords;
search_mode mode;
bool caseInsensitive;
int64_t timeMin, timeMax;
std::string username;
};
TextSearch::TextSearch(string const &keyword, entry const &params) :
mode(TEXTSEARCH_EXACT),
caseInsensitive(false),
timeMin(0),
timeMax(numeric_limits<int64_t>::max())
{
entry const *pMode = params.find_key("mode");
if( pMode && pMode->type() == entry::string_t ) {
string strMode = pMode->string();
if( strMode == "all" ) {
mode = TEXTSEARCH_ALL;
} else if( strMode == "any" ) {
mode = TEXTSEARCH_ANY;
}
}
entry const *pCase = params.find_key("case");
if( pCase && pCase->type() == entry::string_t && pCase->string() == "insensitive" ) {
caseInsensitive = true;
}
int64_t now = GetAdjustedTime();
entry const *pAgeMin = params.find_key("agemin");
if( pAgeMin && pAgeMin->type() == entry::int_t ) {
timeMax = now - pAgeMin->integer() * 24*60*60;
}
entry const *pAgeMax = params.find_key("agemax");
if( pAgeMax && pAgeMax->type() == entry::int_t ) {
timeMin = now - pAgeMax->integer() * 24*60*60;
}
entry const *pUsername = params.find_key("username");
if( pUsername && pUsername->type() == entry::string_t ) {
username = pUsername->string();
}
if( mode == TEXTSEARCH_EXACT ) {
keywords.push_back( keyword );
} else {
stringstream stream( keyword );
string word;
while( getline(stream, word, ' ') ) {
if( !word.empty() ) {
keywords.push_back( word );
}
}
}
if( caseInsensitive ) {
for( vector<string>::iterator it=keywords.begin(); it != keywords.end(); ++it ) {
#ifdef HAVE_BOOST_LOCALE
*it = boost::locale::to_lower(*it);
#else
boost::algorithm::to_lower(*it);
#endif
}
}
}
bool TextSearch::matchText(string msg)
{
if( keywords.size() == 0 ) {
return false;
}
if( caseInsensitive ) {
#ifdef HAVE_BOOST_LOCALE
msg = boost::locale::to_lower(msg);
#else
boost::algorithm::to_lower(msg);
#endif
}
switch( mode ) {
case TEXTSEARCH_EXACT:
return msg.find(keywords[0]) != string::npos;
case TEXTSEARCH_ALL:
for( vector<string>::const_iterator it=keywords.begin(); it != keywords.end(); ++it ) {
if( msg.find(*it) == string::npos ) {
return false;
}
}
return true;
case TEXTSEARCH_ANY:
for( vector<string>::const_iterator it=keywords.begin(); it != keywords.end(); ++it ) {
if( msg.find(*it) != string::npos ) {
return true;
}
}
return false;
}
return false;
}
lazy_entry const* TextSearch::matchRawMessage(string const &rawMessage)
{
if( keywords.size() == 0 ) {
return 0;
}
// fast check
if( mode != TEXTSEARCH_ANY && rawMessage.find(keywords[0]) == string::npos ) {
return 0;
}
lazy_entry v;
int pos;
libtorrent::error_code ec;
if (lazy_bdecode(rawMessage.data(), rawMessage.data()+rawMessage.size(), v, ec, &pos) == 0) {
lazy_entry const* vv = v.dict_find_dict("v");
lazy_entry const* post = vv ? vv->dict_find_dict("userpost") : v.dict_find_dict("userpost");
if( post ) {
lazy_entry const* rt = post->dict_find_dict("rt");
lazy_entry const* p = rt ? rt : post;
if( username.length() ) {
string user = p->dict_find_string_value("n");
if( user != username ) {
return 0;
}
}
int64_t time = p->dict_find_int_value("time");
if( time < timeMin || time > timeMax ) {
return 0;
}
string msg = p->dict_find_string_value("msg");
return matchText( msg ) ? p : 0;
}
}
return 0;
}
Value search(const Array& params, bool fHelp) Value search(const Array& params, bool fHelp)
{ {
if (fHelp || params.size() < 3 || params.size() > 4) if (fHelp || params.size() < 3 || params.size() > 4)
throw runtime_error( throw runtime_error(
"search <scope> <text> <count> [<username>]\n" "search <scope> <text> <count> ['{\"username\":username,\"mode\":\"exact\"|\"all\"|\"any\",\"case\":\"sensitive\"\"insensitive\",\"agemin\":agemin,\"agemax\":agemin}']\n"
"search text in known data\n" "search text in available data\n"
"<scope> is data area: messages, directmsgs, profiles, users, hashtags\n" "<scope> is data area: messages, directmsgs, profiles, users, hashtags\n"
"<text> is a phrase to search\n" "<text> is a phrase to search\n"
"up to <count> entries are returned\n" "up to <count> entries are returned\n"
"<username> in messages scope is optional and allows to search in username's messages only\n" "<username> in messages scope is optional and allows to search in username's messages only\n"
"<username> in directmsgs scope is required and sets whose conversation to search"); "<username> in directmsgs scope is required and sets whose conversation to search\n"
"\"mode\" and \"case\" are search mode options\n"
"\"agemin\" and \"agemax\" (days) are message filters\n"
"\"mode\", \"case\", \"agemin\", and \"agemax\" are optional");
string scope = params[0].get_str(); string scope = params[0].get_str();
string keyword = params[1].get_str(); string keyword = params[1].get_str();
int count = params[2].get_int(); int count = params[2].get_int();
string username = params.size()==4 ? params[3].get_str() : string(); entry options = params.size()==4 ? jsonToEntry(params[3].get_obj()) : entry();
string username;
if( keyword.size() == 0 ) { if( keyword.size() == 0 ) {
throw runtime_error("Empty <text> parameter"); throw runtime_error("Empty <text> parameter");
} }
entry const *pUsername = options.find_key("username");
if( pUsername && pUsername->type() == entry::string_t ) {
username = pUsername->string();
}
Array ret; Array ret;
if( scope == "messages" ) { if( scope == "messages" ) {
// search public messages // search public messages
std::map< pair<std::string,int>, pair<int64,entry> > posts; std::map< pair<std::string,int>, pair<int64,entry> > posts;
TextSearch searcher(keyword, options);
// search public messages in torrents // search public messages in torrents
{ {
LOCK(cs_twister); LOCK(cs_twister);
@ -2391,29 +2559,17 @@ Value search(const Array& params, bool fHelp)
item.second.get_pieces(pieces, std::numeric_limits<int>::max(), std::numeric_limits<int>::max(), -1, ~USERPOST_FLAG_DM); item.second.get_pieces(pieces, std::numeric_limits<int>::max(), std::numeric_limits<int>::max(), -1, ~USERPOST_FLAG_DM);
BOOST_FOREACH(string const& piece, pieces) { BOOST_FOREACH(string const& piece, pieces) {
if( piece.find(keyword) != string::npos ) { lazy_entry const* p = searcher.matchRawMessage(piece);
lazy_entry v; if( p ) {
int pos; string n = p->dict_find_string_value("n");
libtorrent::error_code ec; int k = p->dict_find_int_value("k");
if (lazy_bdecode(piece.data(), piece.data()+piece.size(), v, ec, &pos) == 0) { int64 time = p->dict_find_int_value("time",-1);
lazy_entry const* post = v.dict_find_dict("userpost");
if( post ) {
lazy_entry const* rt = post->dict_find_dict("rt");
lazy_entry const* p = rt ? rt : post;
string msg = p->dict_find_string_value("msg");
if( msg.find(keyword) != string::npos ) {
string n = p->dict_find_string_value("n");
int k = p->dict_find_int_value("k");
int64 time = p->dict_find_int_value("time",-1);
entry vEntry; entry vEntry;
vEntry = *p; vEntry = *p;
hexcapePost(vEntry); hexcapePost(vEntry);
posts[pair<std::string,int>(n,k)] = pair<int64,entry>(time,vEntry); posts[pair<std::string,int>(n,k)] = pair<int64,entry>(time,vEntry);
}
}
}
} }
} }
} }
@ -2430,39 +2586,19 @@ Value search(const Array& params, bool fHelp)
continue; continue;
for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) { for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) {
string str_p = j->find_key("p")->string(); string str_p = j->find_key("p")->string();
if( str_p.find(keyword) != string::npos ) { lazy_entry const* p = searcher.matchRawMessage(str_p);
lazy_entry p; if( p ) {
int pos; string n = p->dict_find_string_value("n");
libtorrent::error_code err; int k = p->dict_find_int_value("k");
int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos); pair<std::string,int> post_id(n,k);
if( posts.count(post_id) == 0 ) {
int64 time = p->dict_find_int_value("time",-1);
lazy_entry const* v = p.dict_find_dict("v"); entry vEntry;
if( v ) { vEntry = *p;
lazy_entry const* post = v->dict_find_dict("userpost"); hexcapePost(vEntry);
if( post ) {
// post, mention, status
lazy_entry const* rt = post->dict_find_dict("rt");
lazy_entry const* p = rt ? rt : post;
string msg = p->dict_find_string_value("msg");
if( msg.find(keyword) != string::npos ) {
string n = p->dict_find_string_value("n");
if( username.size() == 0 || n == username ) { posts[post_id] = pair<int64,entry>(time,vEntry);
int k = p->dict_find_int_value("k");
pair<std::string,int> post_id(n,k);
if( posts.count(post_id) == 0 ) {
int64 time = p->dict_find_int_value("time",-1);
entry vEntry;
vEntry = *p;
hexcapePost(vEntry);
posts[post_id] = pair<int64,entry>(time,vEntry);
}
}
}
}
} }
} }
} }
@ -2481,16 +2617,18 @@ Value search(const Array& params, bool fHelp)
} else if( scope == "directmsgs" ) { } else if( scope == "directmsgs" ) {
// search direct messages // search direct messages
if( m_users.count(username) ){ if( m_users.count(username) ) {
std::multimap<int64,entry> postsByTime; std::multimap<int64,entry> postsByTime;
TextSearch searcher(keyword, options);
{ {
LOCK(cs_twister); LOCK(cs_twister);
BOOST_FOREACH(const PAIRTYPE(std::string,std::vector<StoredDirectMsg>)& list, m_users[username].m_directmsg) { BOOST_FOREACH(const PAIRTYPE(std::string,std::vector<StoredDirectMsg>)& list, m_users[username].m_directmsg) {
string remoteUser = list.first; string remoteUser = list.first;
BOOST_FOREACH(const StoredDirectMsg& item, list.second) { BOOST_FOREACH(const StoredDirectMsg& item, list.second) {
if( item.m_text.find(keyword) != string::npos ) { if( searcher.matchText(item.m_text) ) {
int64 time = item.m_utcTime; int64 time = item.m_utcTime;
entry vEntry; entry vEntry;
vEntry["remoteUser"] = remoteUser; vEntry["remoteUser"] = remoteUser;
@ -2518,38 +2656,38 @@ Value search(const Array& params, bool fHelp)
entry data = ses->dht_getLocalData(); entry data = ses->dht_getLocalData();
std::map<string,entry> users; std::map<string,entry> users;
TextSearch searcher(keyword, options);
for (entry::dictionary_type::const_iterator i = data.dict().begin(); i != data.dict().end(); ++i) { for (entry::dictionary_type::const_iterator i = data.dict().begin(); i != data.dict().end(); ++i) {
if ( i->second.type() != entry::list_t ) if ( i->second.type() != entry::list_t )
continue; continue;
for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) { for (entry::list_type::const_iterator j = i->second.list().begin(); j != i->second.list().end(); ++j) {
string str_p = j->find_key("p")->string(); string str_p = j->find_key("p")->string();
if( str_p.find(keyword) != string::npos ) { lazy_entry p;
lazy_entry p; int pos;
int pos; libtorrent::error_code err;
libtorrent::error_code err; int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos);
int ret = lazy_bdecode(str_p.data(), str_p.data() + str_p.size(), p, err, &pos);
lazy_entry const* target = p.dict_find_dict("target"); lazy_entry const* target = p.dict_find_dict("target");
if( target ) { if( target ) {
string resource = target->dict_find_string_value("r"); string resource = target->dict_find_string_value("r");
if( resource == "profile" ) { if( resource == "profile" ) {
lazy_entry const* v = p.dict_find_dict("v"); lazy_entry const* v = p.dict_find_dict("v");
if( v ) { if( v ) {
string bio = v->dict_find_string_value("bio"); string bio = v->dict_find_string_value("bio");
string fullname = v->dict_find_string_value("fullname"); string fullname = v->dict_find_string_value("fullname");
string location = v->dict_find_string_value("location"); string location = v->dict_find_string_value("location");
string url = v->dict_find_string_value("url"); string url = v->dict_find_string_value("url");
if( bio.find(keyword) != string::npos || if( searcher.matchText(bio) ||
fullname.find(keyword) != string::npos || searcher.matchText(fullname) ||
location.find(keyword) != string::npos || searcher.matchText(location) ||
url.find(keyword) != string::npos ) { searcher.matchText(url) ) {
string n = target->dict_find_string_value("n"); string n = target->dict_find_string_value("n");
entry vEntry; entry vEntry;
vEntry = *v; vEntry = *v;
users.insert(pair<string,entry>(n,vEntry)); users.insert(pair<string,entry>(n,vEntry));
}
} }
} }
} }
@ -2571,6 +2709,8 @@ Value search(const Array& params, bool fHelp)
// @todo: there should be a faster way // @todo: there should be a faster way
std::multimap<string::size_type,std::string> usernamesByLength; std::multimap<string::size_type,std::string> usernamesByLength;
boost::algorithm::to_lower(keyword);
string allowed = "abcdefghijklmnopqrstuvwxyz0123456789_"; string allowed = "abcdefghijklmnopqrstuvwxyz0123456789_";
for( int i = 0; i < allowed.size(); ++i ) { for( int i = 0; i < allowed.size(); ++i ) {
set<string> usernames; set<string> usernames;
@ -2593,6 +2733,12 @@ Value search(const Array& params, bool fHelp)
// search hashtags // search hashtags
std::multimap<string::size_type,std::string> hashtagsByLength; std::multimap<string::size_type,std::string> hashtagsByLength;
#ifdef HAVE_BOOST_LOCALE
keyword = boost::locale::to_lower(keyword);
#else
boost::algorithm::to_lower(keyword);
#endif
{ {
LOCK(cs_seenHashtags); LOCK(cs_seenHashtags);