From c0339d4f6a86191dde13fe8db95b00f88ac6005a Mon Sep 17 00:00:00 2001
From: Tim Delaney
Date: Sun, 5 Feb 2017 08:53:33 +1100
Subject: [PATCH] Fix regex RSS matching. Closes #6337.
--HG--
branch : magao-dev
---
src/base/rss/rssdownloadrule.cpp | 99 +++++++++++---------------
src/base/rss/rssdownloadrule.h | 2 +
src/gui/rss/automatedrssdownloader.cpp | 15 ++--
3 files changed, 56 insertions(+), 60 deletions(-)
diff --git a/src/base/rss/rssdownloadrule.cpp b/src/base/rss/rssdownloadrule.cpp
index beb861117..eef217f0d 100644
--- a/src/base/rss/rssdownloadrule.cpp
+++ b/src/base/rss/rssdownloadrule.cpp
@@ -50,48 +50,51 @@ DownloadRule::DownloadRule()
{
}
-bool DownloadRule::matches(const QString &articleTitle) const
+bool DownloadRule::matches(const QString &articleTitle, const QString &expression) const
{
- QRegExp whitespace("\\s+");
+ static QRegExp whitespace("\\s+");
+
+ if (expression.isEmpty()) {
+ // A regex of the form "expr|" will always match, so do the same for wildcards
+ return true;
+ }
+ else if (m_useRegex) {
+ QRegExp reg(expression, Qt::CaseInsensitive, QRegExp::RegExp);
+ return reg.indexIn(articleTitle) > -1;
+ }
+ else {
+ // Only match if every wildcard token (separated by spaces) is present in the article name.
+ // Order of wildcard tokens is unimportant (if order is important, they should have used *).
+ foreach (const QString &wildcard, expression.split(whitespace, QString::SplitBehavior::SkipEmptyParts)) {
+ QRegExp reg(wildcard, Qt::CaseInsensitive, QRegExp::Wildcard);
+
+ if (reg.indexIn(articleTitle) == -1)
+ return false;
+ }
+ }
+
+ return true;
+}
+bool DownloadRule::matches(const QString &articleTitle) const
+{
if (!m_mustContain.empty()) {
bool logged = false;
- bool foundMustContain = true;
+ bool foundMustContain = false;
// Each expression is either a regex, or a set of wildcards separated by whitespace.
// Accept if any complete expression matches.
foreach (const QString &expression, m_mustContain) {
- if (expression.isEmpty())
- continue;
-
if (!logged) {
- qDebug() << "Checking matching expressions:" << m_mustContain.join("|");
+ qDebug() << "Checking matching" << (m_useRegex ? "regex:" : "wildcard expressions:") << m_mustContain.join("|");
logged = true;
}
- if (m_useRegex) {
- QRegExp reg(expression, Qt::CaseInsensitive, QRegExp::RegExp);
-
- if (reg.indexIn(articleTitle) > -1)
- foundMustContain = true;
- }
- else {
- // Only accept if every wildcard token (separated by spaces) is present in the article name.
- // Order of wildcard tokens is unimportant (if order is important, they should have used *).
- foundMustContain = true;
-
- foreach (const QString &wildcard, expression.split(whitespace, QString::SplitBehavior::SkipEmptyParts)) {
- QRegExp reg(wildcard, Qt::CaseInsensitive, QRegExp::Wildcard);
-
- if (reg.indexIn(articleTitle) == -1) {
- foundMustContain = false;
- break;
- }
- }
- }
+ // A regex of the form "expr|" will always match, so do the same for wildcards
+ foundMustContain = matches(articleTitle, expression);
if (foundMustContain) {
- qDebug() << "Found matching expression:" << expression;
+ qDebug() << "Found matching" << (m_useRegex ? "regex:" : "wildcard expression:") << expression;
break;
}
}
@@ -106,38 +109,14 @@ bool DownloadRule::matches(const QString &articleTitle) const
// Each expression is either a regex, or a set of wildcards separated by whitespace.
// Reject if any complete expression matches.
foreach (const QString &expression, m_mustNotContain) {
- if (expression.isEmpty())
- continue;
-
if (!logged) {
- qDebug() << "Checking not matching expressions:" << m_mustNotContain.join("|");
+ qDebug() << "Checking not matching" << (m_useRegex ? "regex:" : "wildcard expressions:") << m_mustNotContain.join("|");
logged = true;
}
- if (m_useRegex) {
- QRegExp reg(expression, Qt::CaseInsensitive, QRegExp::RegExp);
-
- if (reg.indexIn(articleTitle) > -1) {
- qDebug() << "Found not matching expression:" << expression;
- return false;
- }
- }
-
- // Only reject if every wildcard token (separated by spaces) is present in the article name.
- // Order of wildcard tokens is unimportant (if order is important, they should have used *).
- bool foundMustNotContain = true;
-
- foreach (const QString &wildcard, expression.split(whitespace, QString::SplitBehavior::SkipEmptyParts)) {
- QRegExp reg(wildcard, Qt::CaseInsensitive, QRegExp::Wildcard);
-
- if (reg.indexIn(articleTitle) == -1) {
- foundMustNotContain = false;
- break;
- }
- }
-
- if (foundMustNotContain) {
- qDebug()<< "Found not matching expression:" << expression;
+ // A regex of the form "expr|" will always match, so do the same for wildcards
+ if (matches(articleTitle, expression)) {
+ qDebug() << "Found not matching" << (m_useRegex ? "regex:" : "wildcard expression:") << expression;
return false;
}
}
@@ -241,6 +220,10 @@ void DownloadRule::setMustContain(const QString &tokens)
m_mustContain = QStringList() << tokens;
else
m_mustContain = tokens.split("|");
+
+ // Check for single empty string - if so, no condition
+ if ((m_mustContain.size() == 1) && m_mustContain[0].isEmpty())
+ m_mustContain.clear();
}
void DownloadRule::setMustNotContain(const QString &tokens)
@@ -249,6 +232,10 @@ void DownloadRule::setMustNotContain(const QString &tokens)
m_mustNotContain = QStringList() << tokens;
else
m_mustNotContain = tokens.split("|");
+
+ // Check for single empty string - if so, no condition
+ if ((m_mustNotContain.size() == 1) && m_mustNotContain[0].isEmpty())
+ m_mustNotContain.clear();
}
QStringList DownloadRule::rssFeeds() const
diff --git a/src/base/rss/rssdownloadrule.h b/src/base/rss/rssdownloadrule.h
index c3dcfd29a..ac728f190 100644
--- a/src/base/rss/rssdownloadrule.h
+++ b/src/base/rss/rssdownloadrule.h
@@ -88,6 +88,8 @@ namespace Rss
bool operator==(const DownloadRule &other) const;
private:
+ bool matches(const QString &articleTitle, const QString &expression) const;
+
QString m_name;
QStringList m_mustContain;
QStringList m_mustNotContain;
diff --git a/src/gui/rss/automatedrssdownloader.cpp b/src/gui/rss/automatedrssdownloader.cpp
index 1337943ad..cf6bf0dd2 100644
--- a/src/gui/rss/automatedrssdownloader.cpp
+++ b/src/gui/rss/automatedrssdownloader.cpp
@@ -673,8 +673,6 @@ void AutomatedRssDownloader::updateFieldsToolTips(bool regex)
QString tip;
if (regex) {
tip = "" + tr("Regex mode: use Perl-like regular expressions") + "
";
- ui->lineContains->setToolTip(tip);
- ui->lineNotContains->setToolTip(tip);
}
else {
tip = "" + tr("Wildcard mode: you can use") + "
"
@@ -683,9 +681,18 @@ void AutomatedRssDownloader::updateFieldsToolTips(bool regex)
+ "- " + tr("Whitespaces count as AND operators (all words, any order)") + "
"
+ "- " + tr("| is used as OR operator") + "
"
+ "" + tr("If word order is important use * instead of whitespace.") + "
";
- ui->lineContains->setToolTip(tip);
- ui->lineNotContains->setToolTip(tip);
}
+
+ // Whether regex or wildcard, warn about a potential gotcha for users.
+ // Explanatory string broken over multiple lines for readability (and multiple
+ // statements to prevent uncrustify indenting excessively.
+ tip += "";
+ tip += tr("An expression with an empty %1 clause (e.g. %2)",
+ "We talk about regex/wildcards in the RSS filters section here."
+ " So a valid sentence would be: An expression with an empty | clause (e.g. expr|)"
+ ).arg("|").arg("expr|");
+ ui->lineContains->setToolTip(tip + tr(" will match all articles.") + "
");
+ ui->lineNotContains->setToolTip(tip + tr(" will exclude all articles.") + "");
}
void AutomatedRssDownloader::updateMustLineValidity()