Browse Source

Merge pull request #8607 from glassez/rss

Process loaded RSS articles in case of parsing error
adaptive-webui-19844
Vladimir Golovnev 7 years ago committed by GitHub
parent
commit
a8dbff06a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 321
      src/base/rss/private/rss_parser.cpp
  2. 2
      src/base/rss/private/rss_parser.h
  3. 27
      src/base/rss/rss_feed.cpp

321
src/base/rss/private/rss_parser.cpp

@ -25,24 +25,339 @@
* modify file(s), you may extend this exception to your version of the file(s), * modify file(s), you may extend this exception to your version of the file(s),
* but you are not obligated to do so. If you do not wish to do so, delete this * but you are not obligated to do so. If you do not wish to do so, delete this
* exception statement from your version. * exception statement from your version.
*
* Contact : chris@qbittorrent.org
*/ */
#include "rss_parser.h" #include "rss_parser.h"
#include <QDebug> #include <QDebug>
#include <QDateTime> #include <QDateTime>
#include <QGlobalStatic>
#include <QHash>
#include <QMetaObject> #include <QMetaObject>
#include <QRegExp> #include <QRegExp>
#include <QStringList> #include <QStringList>
#include <QVariant> #include <QVariant>
#include <QXmlStreamEntityResolver>
#include <QXmlStreamReader> #include <QXmlStreamReader>
#include "../rss_article.h" #include "../rss_article.h"
namespace namespace
{ {
// Lookup table of named (X)HTML character entities.
//
// Each key is an entity name without the surrounding '&' and ';' (e.g. "nbsp"),
// and each value is the equivalent numeric character reference (e.g. "&#160;").
// The table is consulted by XmlStreamEntityResolver::resolveUndeclaredEntity()
// below, so that entity names which are not declared in the XML document itself
// can still be given a replacement text.
//
// The entries and their trailing comments follow the entity sets declared in:
// http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
// http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
// http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
//
// NOTE(review): "lt", "gt", "amp", "quot" and "apos" are intentionally absent from
// this table - presumably because the XML reader resolves those predefined
// entities on its own; confirm before adding them here.
using StringHash = QHash<QString, QString>;
Q_GLOBAL_STATIC_WITH_ARGS(StringHash, HTML_ENTITIES, ({
{"nbsp", "&#160;"}, // no-break space = non-breaking space, U+00A0 ISOnum
{"iexcl", "&#161;"}, // inverted exclamation mark, U+00A1 ISOnum
{"cent", "&#162;"}, // cent sign, U+00A2 ISOnum
{"pound", "&#163;"}, // pound sign, U+00A3 ISOnum
{"curren", "&#164;"}, // currency sign, U+00A4 ISOnum
{"yen", "&#165;"}, // yen sign = yuan sign, U+00A5 ISOnum
{"brvbar", "&#166;"}, // broken bar = broken vertical bar, U+00A6 ISOnum
{"sect", "&#167;"}, // section sign, U+00A7 ISOnum
{"uml", "&#168;"}, // diaeresis = spacing diaeresis, U+00A8 ISOdia
{"copy", "&#169;"}, // copyright sign, U+00A9 ISOnum
{"ordf", "&#170;"}, // feminine ordinal indicator, U+00AA ISOnum
{"laquo", "&#171;"}, // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
{"not", "&#172;"}, // not sign = angled dash, U+00AC ISOnum
{"shy", "&#173;"}, // soft hyphen = discretionary hyphen, U+00AD ISOnum
{"reg", "&#174;"}, // registered sign = registered trade mark sign, U+00AE ISOnum
{"macr", "&#175;"}, // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
{"deg", "&#176;"}, // degree sign, U+00B0 ISOnum
{"plusmn", "&#177;"}, // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
{"sup2", "&#178;"}, // superscript two = superscript digit two = squared, U+00B2 ISOnum
{"sup3", "&#179;"}, // superscript three = superscript digit three = cubed, U+00B3 ISOnum
{"acute", "&#180;"}, // acute accent = spacing acute, U+00B4 ISOdia
{"micro", "&#181;"}, // micro sign, U+00B5 ISOnum
{"para", "&#182;"}, // pilcrow sign = paragraph sign, U+00B6 ISOnum
{"middot", "&#183;"}, // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
{"cedil", "&#184;"}, // cedilla = spacing cedilla, U+00B8 ISOdia
{"sup1", "&#185;"}, // superscript one = superscript digit one, U+00B9 ISOnum
{"ordm", "&#186;"}, // masculine ordinal indicator, U+00BA ISOnum
{"raquo", "&#187;"}, // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
{"frac14", "&#188;"}, // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
{"frac12", "&#189;"}, // vulgar fraction one half = fraction one half, U+00BD ISOnum
{"frac34", "&#190;"}, // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
{"iquest", "&#191;"}, // inverted question mark = turned question mark, U+00BF ISOnum
{"Agrave", "&#192;"}, // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
{"Aacute", "&#193;"}, // latin capital letter A with acute, U+00C1 ISOlat1
{"Acirc", "&#194;"}, // latin capital letter A with circumflex, U+00C2 ISOlat1
{"Atilde", "&#195;"}, // latin capital letter A with tilde, U+00C3 ISOlat1
{"Auml", "&#196;"}, // latin capital letter A with diaeresis, U+00C4 ISOlat1
{"Aring", "&#197;"}, // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
{"AElig", "&#198;"}, // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
{"Ccedil", "&#199;"}, // latin capital letter C with cedilla, U+00C7 ISOlat1
{"Egrave", "&#200;"}, // latin capital letter E with grave, U+00C8 ISOlat1
{"Eacute", "&#201;"}, // latin capital letter E with acute, U+00C9 ISOlat1
{"Ecirc", "&#202;"}, // latin capital letter E with circumflex, U+00CA ISOlat1
{"Euml", "&#203;"}, // latin capital letter E with diaeresis, U+00CB ISOlat1
{"Igrave", "&#204;"}, // latin capital letter I with grave, U+00CC ISOlat1
{"Iacute", "&#205;"}, // latin capital letter I with acute, U+00CD ISOlat1
{"Icirc", "&#206;"}, // latin capital letter I with circumflex, U+00CE ISOlat1
{"Iuml", "&#207;"}, // latin capital letter I with diaeresis, U+00CF ISOlat1
{"ETH", "&#208;"}, // latin capital letter ETH, U+00D0 ISOlat1
{"Ntilde", "&#209;"}, // latin capital letter N with tilde, U+00D1 ISOlat1
{"Ograve", "&#210;"}, // latin capital letter O with grave, U+00D2 ISOlat1
{"Oacute", "&#211;"}, // latin capital letter O with acute, U+00D3 ISOlat1
{"Ocirc", "&#212;"}, // latin capital letter O with circumflex, U+00D4 ISOlat1
{"Otilde", "&#213;"}, // latin capital letter O with tilde, U+00D5 ISOlat1
{"Ouml", "&#214;"}, // latin capital letter O with diaeresis, U+00D6 ISOlat1
{"times", "&#215;"}, // multiplication sign, U+00D7 ISOnum
{"Oslash", "&#216;"}, // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
{"Ugrave", "&#217;"}, // latin capital letter U with grave, U+00D9 ISOlat1
{"Uacute", "&#218;"}, // latin capital letter U with acute, U+00DA ISOlat1
{"Ucirc", "&#219;"}, // latin capital letter U with circumflex, U+00DB ISOlat1
{"Uuml", "&#220;"}, // latin capital letter U with diaeresis, U+00DC ISOlat1
{"Yacute", "&#221;"}, // latin capital letter Y with acute, U+00DD ISOlat1
{"THORN", "&#222;"}, // latin capital letter THORN, U+00DE ISOlat1
{"szlig", "&#223;"}, // latin small letter sharp s = ess-zed, U+00DF ISOlat1
{"agrave", "&#224;"}, // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
{"aacute", "&#225;"}, // latin small letter a with acute, U+00E1 ISOlat1
{"acirc", "&#226;"}, // latin small letter a with circumflex, U+00E2 ISOlat1
{"atilde", "&#227;"}, // latin small letter a with tilde, U+00E3 ISOlat1
{"auml", "&#228;"}, // latin small letter a with diaeresis, U+00E4 ISOlat1
{"aring", "&#229;"}, // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
{"aelig", "&#230;"}, // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
{"ccedil", "&#231;"}, // latin small letter c with cedilla, U+00E7 ISOlat1
{"egrave", "&#232;"}, // latin small letter e with grave, U+00E8 ISOlat1
{"eacute", "&#233;"}, // latin small letter e with acute, U+00E9 ISOlat1
{"ecirc", "&#234;"}, // latin small letter e with circumflex, U+00EA ISOlat1
{"euml", "&#235;"}, // latin small letter e with diaeresis, U+00EB ISOlat1
{"igrave", "&#236;"}, // latin small letter i with grave, U+00EC ISOlat1
{"iacute", "&#237;"}, // latin small letter i with acute, U+00ED ISOlat1
{"icirc", "&#238;"}, // latin small letter i with circumflex, U+00EE ISOlat1
{"iuml", "&#239;"}, // latin small letter i with diaeresis, U+00EF ISOlat1
{"eth", "&#240;"}, // latin small letter eth, U+00F0 ISOlat1
{"ntilde", "&#241;"}, // latin small letter n with tilde, U+00F1 ISOlat1
{"ograve", "&#242;"}, // latin small letter o with grave, U+00F2 ISOlat1
{"oacute", "&#243;"}, // latin small letter o with acute, U+00F3 ISOlat1
{"ocirc", "&#244;"}, // latin small letter o with circumflex, U+00F4 ISOlat1
{"otilde", "&#245;"}, // latin small letter o with tilde, U+00F5 ISOlat1
{"ouml", "&#246;"}, // latin small letter o with diaeresis, U+00F6 ISOlat1
{"divide", "&#247;"}, // division sign, U+00F7 ISOnum
{"oslash", "&#248;"}, // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
{"ugrave", "&#249;"}, // latin small letter u with grave, U+00F9 ISOlat1
{"uacute", "&#250;"}, // latin small letter u with acute, U+00FA ISOlat1
{"ucirc", "&#251;"}, // latin small letter u with circumflex, U+00FB ISOlat1
{"uuml", "&#252;"}, // latin small letter u with diaeresis, U+00FC ISOlat1
{"yacute", "&#253;"}, // latin small letter y with acute, U+00FD ISOlat1
{"thorn", "&#254;"}, // latin small letter thorn, U+00FE ISOlat1
{"yuml", "&#255;"}, // latin small letter y with diaeresis, U+00FF ISOlat1
// Latin Extended-A
{"OElig", "&#338;"}, // latin capital ligature OE, U+0152 ISOlat2
{"oelig", "&#339;"}, // latin small ligature oe, U+0153 ISOlat2
// ligature is a misnomer, this is a separate character in some languages
{"Scaron", "&#352;"}, // latin capital letter S with caron, U+0160 ISOlat2
{"scaron", "&#353;"}, // latin small letter s with caron, U+0161 ISOlat2
{"Yuml", "&#376;"}, // latin capital letter Y with diaeresis, U+0178 ISOlat2
// Spacing Modifier Letters
{"circ", "&#710;"}, // modifier letter circumflex accent, U+02C6 ISOpub
{"tilde", "&#732;"}, // small tilde, U+02DC ISOdia
// General Punctuation
{"ensp", "&#8194;"}, // en space, U+2002 ISOpub
{"emsp", "&#8195;"}, // em space, U+2003 ISOpub
{"thinsp", "&#8201;"}, // thin space, U+2009 ISOpub
{"zwnj", "&#8204;"}, // zero width non-joiner, U+200C NEW RFC 2070
{"zwj", "&#8205;"}, // zero width joiner, U+200D NEW RFC 2070
{"lrm", "&#8206;"}, // left-to-right mark, U+200E NEW RFC 2070
{"rlm", "&#8207;"}, // right-to-left mark, U+200F NEW RFC 2070
{"ndash", "&#8211;"}, // en dash, U+2013 ISOpub
{"mdash", "&#8212;"}, // em dash, U+2014 ISOpub
{"lsquo", "&#8216;"}, // left single quotation mark, U+2018 ISOnum
{"rsquo", "&#8217;"}, // right single quotation mark, U+2019 ISOnum
{"sbquo", "&#8218;"}, // single low-9 quotation mark, U+201A NEW
{"ldquo", "&#8220;"}, // left double quotation mark, U+201C ISOnum
{"rdquo", "&#8221;"}, // right double quotation mark, U+201D ISOnum
{"bdquo", "&#8222;"}, // double low-9 quotation mark, U+201E NEW
{"dagger", "&#8224;"}, // dagger, U+2020 ISOpub
{"Dagger", "&#8225;"}, // double dagger, U+2021 ISOpub
{"permil", "&#8240;"}, // per mille sign, U+2030 ISOtech
{"lsaquo", "&#8249;"}, // single left-pointing angle quotation mark, U+2039 ISO proposed
// lsaquo is proposed but not yet ISO standardized
{"rsaquo", "&#8250;"}, // single right-pointing angle quotation mark, U+203A ISO proposed
// rsaquo is proposed but not yet ISO standardized
// Currency Symbols
{"euro", "&#8364;"}, // euro sign, U+20AC NEW
// Latin Extended-B
{"fnof", "&#402;"}, // latin small letter f with hook = function = florin, U+0192 ISOtech
// Greek
{"Alpha", "&#913;"}, // greek capital letter alpha, U+0391
{"Beta", "&#914;"}, // greek capital letter beta, U+0392
{"Gamma", "&#915;"}, // greek capital letter gamma, U+0393 ISOgrk3
{"Delta", "&#916;"}, // greek capital letter delta, U+0394 ISOgrk3
{"Epsilon", "&#917;"}, // greek capital letter epsilon, U+0395
{"Zeta", "&#918;"}, // greek capital letter zeta, U+0396
{"Eta", "&#919;"}, // greek capital letter eta, U+0397
{"Theta", "&#920;"}, // greek capital letter theta, U+0398 ISOgrk3
{"Iota", "&#921;"}, // greek capital letter iota, U+0399
{"Kappa", "&#922;"}, // greek capital letter kappa, U+039A
{"Lambda", "&#923;"}, // greek capital letter lamda, U+039B ISOgrk3
{"Mu", "&#924;"}, // greek capital letter mu, U+039C
{"Nu", "&#925;"}, // greek capital letter nu, U+039D
{"Xi", "&#926;"}, // greek capital letter xi, U+039E ISOgrk3
{"Omicron", "&#927;"}, // greek capital letter omicron, U+039F
{"Pi", "&#928;"}, // greek capital letter pi, U+03A0 ISOgrk3
{"Rho", "&#929;"}, // greek capital letter rho, U+03A1
{"Sigma", "&#931;"}, // greek capital letter sigma, U+03A3 ISOgrk3
{"Tau", "&#932;"}, // greek capital letter tau, U+03A4
{"Upsilon", "&#933;"}, // greek capital letter upsilon, U+03A5 ISOgrk3
{"Phi", "&#934;"}, // greek capital letter phi, U+03A6 ISOgrk3
{"Chi", "&#935;"}, // greek capital letter chi, U+03A7
{"Psi", "&#936;"}, // greek capital letter psi, U+03A8 ISOgrk3
{"Omega", "&#937;"}, // greek capital letter omega, U+03A9 ISOgrk3
{"alpha", "&#945;"}, // greek small letter alpha, U+03B1 ISOgrk3
{"beta", "&#946;"}, // greek small letter beta, U+03B2 ISOgrk3
{"gamma", "&#947;"}, // greek small letter gamma, U+03B3 ISOgrk3
{"delta", "&#948;"}, // greek small letter delta, U+03B4 ISOgrk3
{"epsilon", "&#949;"}, // greek small letter epsilon, U+03B5 ISOgrk3
{"zeta", "&#950;"}, // greek small letter zeta, U+03B6 ISOgrk3
{"eta", "&#951;"}, // greek small letter eta, U+03B7 ISOgrk3
{"theta", "&#952;"}, // greek small letter theta, U+03B8 ISOgrk3
{"iota", "&#953;"}, // greek small letter iota, U+03B9 ISOgrk3
{"kappa", "&#954;"}, // greek small letter kappa, U+03BA ISOgrk3
{"lambda", "&#955;"}, // greek small letter lamda, U+03BB ISOgrk3
{"mu", "&#956;"}, // greek small letter mu, U+03BC ISOgrk3
{"nu", "&#957;"}, // greek small letter nu, U+03BD ISOgrk3
{"xi", "&#958;"}, // greek small letter xi, U+03BE ISOgrk3
{"omicron", "&#959;"}, // greek small letter omicron, U+03BF NEW
{"pi", "&#960;"}, // greek small letter pi, U+03C0 ISOgrk3
{"rho", "&#961;"}, // greek small letter rho, U+03C1 ISOgrk3
{"sigmaf", "&#962;"}, // greek small letter final sigma, U+03C2 ISOgrk3
{"sigma", "&#963;"}, // greek small letter sigma, U+03C3 ISOgrk3
{"tau", "&#964;"}, // greek small letter tau, U+03C4 ISOgrk3
{"upsilon", "&#965;"}, // greek small letter upsilon, U+03C5 ISOgrk3
{"phi", "&#966;"}, // greek small letter phi, U+03C6 ISOgrk3
{"chi", "&#967;"}, // greek small letter chi, U+03C7 ISOgrk3
{"psi", "&#968;"}, // greek small letter psi, U+03C8 ISOgrk3
{"omega", "&#969;"}, // greek small letter omega, U+03C9 ISOgrk3
{"thetasym", "&#977;"}, // greek theta symbol, U+03D1 NEW
{"upsih", "&#978;"}, // greek upsilon with hook symbol, U+03D2 NEW
{"piv", "&#982;"}, // greek pi symbol, U+03D6 ISOgrk3
// General Punctuation
{"bull", "&#8226;"}, // bullet = black small circle, U+2022 ISOpub
// bullet is NOT the same as bullet operator, U+2219
{"hellip", "&#8230;"}, // horizontal ellipsis = three dot leader, U+2026 ISOpub
{"prime", "&#8242;"}, // prime = minutes = feet, U+2032 ISOtech
{"Prime", "&#8243;"}, // double prime = seconds = inches, U+2033 ISOtech
{"oline", "&#8254;"}, // overline = spacing overscore, U+203E NEW
{"frasl", "&#8260;"}, // fraction slash, U+2044 NEW
// Letterlike Symbols
{"weierp", "&#8472;"}, // script capital P = power set = Weierstrass p, U+2118 ISOamso
{"image", "&#8465;"}, // black-letter capital I = imaginary part, U+2111 ISOamso
{"real", "&#8476;"}, // black-letter capital R = real part symbol, U+211C ISOamso
{"trade", "&#8482;"}, // trade mark sign, U+2122 ISOnum
{"alefsym", "&#8501;"}, // alef symbol = first transfinite cardinal, U+2135 NEW
// alef symbol is NOT the same as hebrew letter alef,
// U+05D0 although the same glyph could be used to depict both characters
// Arrows
{"larr", "&#8592;"}, // leftwards arrow, U+2190 ISOnum
{"uarr", "&#8593;"}, // upwards arrow, U+2191 ISOnum
{"rarr", "&#8594;"}, // rightwards arrow, U+2192 ISOnum
{"darr", "&#8595;"}, // downwards arrow, U+2193 ISOnum
{"harr", "&#8596;"}, // left right arrow, U+2194 ISOamsa
{"crarr", "&#8629;"}, // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
{"lArr", "&#8656;"}, // leftwards double arrow, U+21D0 ISOtech
// Unicode does not say that lArr is the same as the 'is implied by' arrow
// but also does not have any other character for that function. So lArr can
// be used for 'is implied by' as ISOtech suggests
{"uArr", "&#8657;"}, // upwards double arrow, U+21D1 ISOamsa
{"rArr", "&#8658;"}, // rightwards double arrow, U+21D2 ISOtech
// Unicode does not say this is the 'implies' character but does not have
// another character with this function so rArr can be used for 'implies'
// as ISOtech suggests
{"dArr", "&#8659;"}, // downwards double arrow, U+21D3 ISOamsa
{"hArr", "&#8660;"}, // left right double arrow, U+21D4 ISOamsa
// Mathematical Operators
{"forall", "&#8704;"}, // for all, U+2200 ISOtech
{"part", "&#8706;"}, // partial differential, U+2202 ISOtech
{"exist", "&#8707;"}, // there exists, U+2203 ISOtech
{"empty", "&#8709;"}, // empty set = null set, U+2205 ISOamso
{"nabla", "&#8711;"}, // nabla = backward difference, U+2207 ISOtech
{"isin", "&#8712;"}, // element of, U+2208 ISOtech
{"notin", "&#8713;"}, // not an element of, U+2209 ISOtech
{"ni", "&#8715;"}, // contains as member, U+220B ISOtech
{"prod", "&#8719;"}, // n-ary product = product sign, U+220F ISOamsb
// prod is NOT the same character as U+03A0 'greek capital letter pi' though
// the same glyph might be used for both
{"sum", "&#8721;"}, // n-ary summation, U+2211 ISOamsb
// sum is NOT the same character as U+03A3 'greek capital letter sigma'
// though the same glyph might be used for both
{"minus", "&#8722;"}, // minus sign, U+2212 ISOtech
{"lowast", "&#8727;"}, // asterisk operator, U+2217 ISOtech
{"radic", "&#8730;"}, // square root = radical sign, U+221A ISOtech
{"prop", "&#8733;"}, // proportional to, U+221D ISOtech
{"infin", "&#8734;"}, // infinity, U+221E ISOtech
{"ang", "&#8736;"}, // angle, U+2220 ISOamso
{"and", "&#8743;"}, // logical and = wedge, U+2227 ISOtech
{"or", "&#8744;"}, // logical or = vee, U+2228 ISOtech
{"cap", "&#8745;"}, // intersection = cap, U+2229 ISOtech
{"cup", "&#8746;"}, // union = cup, U+222A ISOtech
{"int", "&#8747;"}, // integral, U+222B ISOtech
{"there4", "&#8756;"}, // therefore, U+2234 ISOtech
{"sim", "&#8764;"}, // tilde operator = varies with = similar to, U+223C ISOtech
// tilde operator is NOT the same character as the tilde, U+007E,
// although the same glyph might be used to represent both
{"cong", "&#8773;"}, // approximately equal to, U+2245 ISOtech
{"asymp", "&#8776;"}, // almost equal to = asymptotic to, U+2248 ISOamsr
{"ne", "&#8800;"}, // not equal to, U+2260 ISOtech
{"equiv", "&#8801;"}, // identical to, U+2261 ISOtech
{"le", "&#8804;"}, // less-than or equal to, U+2264 ISOtech
{"ge", "&#8805;"}, // greater-than or equal to, U+2265 ISOtech
{"sub", "&#8834;"}, // subset of, U+2282 ISOtech
{"sup", "&#8835;"}, // superset of, U+2283 ISOtech
{"nsub", "&#8836;"}, // not a subset of, U+2284 ISOamsn
{"sube", "&#8838;"}, // subset of or equal to, U+2286 ISOtech
{"supe", "&#8839;"}, // superset of or equal to, U+2287 ISOtech
{"oplus", "&#8853;"}, // circled plus = direct sum, U+2295 ISOamsb
{"otimes", "&#8855;"}, // circled times = vector product, U+2297 ISOamsb
{"perp", "&#8869;"}, // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
{"sdot", "&#8901;"}, // dot operator, U+22C5 ISOamsb
// dot operator is NOT the same character as U+00B7 middle dot
// Miscellaneous Technical
{"lceil", "&#8968;"}, // left ceiling = APL upstile, U+2308 ISOamsc
{"rceil", "&#8969;"}, // right ceiling, U+2309 ISOamsc
{"lfloor", "&#8970;"}, // left floor = APL downstile, U+230A ISOamsc
{"rfloor", "&#8971;"}, // right floor, U+230B ISOamsc
{"lang", "&#9001;"}, // left-pointing angle bracket = bra, U+2329 ISOtech
// lang is NOT the same character as U+003C 'less than sign'
// or U+2039 'single left-pointing angle quotation mark'
{"rang", "&#9002;"}, // right-pointing angle bracket = ket, U+232A ISOtech
// rang is NOT the same character as U+003E 'greater than sign'
// or U+203A 'single right-pointing angle quotation mark'
// Geometric Shapes
{"loz", "&#9674;"}, // lozenge, U+25CA ISOpub
// Miscellaneous Symbols
{"spades", "&#9824;"}, // black spade suit, U+2660 ISOpub
{"clubs", "&#9827;"}, // black club suit = shamrock, U+2663 ISOpub
{"hearts", "&#9829;"}, // black heart suit = valentine, U+2665 ISOpub
{"diams", "&#9830;"} // black diamond suit, U+2666 ISOpub
}))
// Entity resolver that supplies replacement text for named (X)HTML entities
// which the XML document being read does not declare itself.
class XmlStreamEntityResolver : public QXmlStreamEntityResolver
{
public:
    // Looks up `name` (entity name without '&' and ';') in HTML_ENTITIES.
    // Returns the mapped numeric character reference, or a default-constructed
    // (null) QString when the name is not in the table.
    QString resolveUndeclaredEntity(const QString &name) override
    {
        const StringHash &knownEntities = *HTML_ENTITIES;
        const auto found = knownEntities.constFind(name);
        if (found == knownEntities.constEnd())
            return {};

        return found.value();
    }
};
const char shortDay[][4] = { const char shortDay[][4] = {
"Mon", "Tue", "Wed", "Mon", "Tue", "Wed",
"Thu", "Fri", "Sat", "Thu", "Fri", "Sat",
@ -228,6 +543,8 @@ void Parser::parse(const QByteArray &feedData)
void Parser::parse_impl(const QByteArray &feedData) void Parser::parse_impl(const QByteArray &feedData)
{ {
QXmlStreamReader xml(feedData); QXmlStreamReader xml(feedData);
XmlStreamEntityResolver resolver;
xml.setEntityResolver(&resolver);
bool foundChannel = false; bool foundChannel = false;
while (xml.readNextStartElement()) { while (xml.readNextStartElement()) {

2
src/base/rss/private/rss_parser.h

@ -25,8 +25,6 @@
* modify file(s), you may extend this exception to your version of the file(s), * modify file(s), you may extend this exception to your version of the file(s),
* but you are not obligated to do so. If you do not wish to do so, delete this * but you are not obligated to do so. If you do not wish to do so, delete this
* exception statement from your version. * exception statement from your version.
*
* Contact : chris@qbittorrent.org
*/ */
#pragma once #pragma once

27
src/base/rss/rss_feed.cpp

@ -197,12 +197,13 @@ void Feed::handleDownloadFailed(const QString &url, const QString &error)
void Feed::handleParsingFinished(const RSS::Private::ParsingResult &result) void Feed::handleParsingFinished(const RSS::Private::ParsingResult &result)
{ {
if (!result.error.isEmpty()) { m_hasError = !result.error.isEmpty();
m_hasError = true;
LogMsg(tr("Failed to parse RSS feed at '%1'. Reason: %2").arg(m_url, result.error) // For some reason, the RSS feed may contain malformed XML data and it may not be
, Log::WARNING); // successfully parsed by the XML parser. We are still trying to load as many articles
} // as possible until we encounter corrupted data. So we can have some articles here
else { // even in case of parsing error.
if (!m_hasError || !result.articles.isEmpty()) {
if (title() != result.title) { if (title() != result.title) {
m_title = result.title; m_title = result.title;
emit titleChanged(this); emit titleChanged(this);
@ -211,7 +212,7 @@ void Feed::handleParsingFinished(const RSS::Private::ParsingResult &result)
m_lastBuildDate = result.lastBuildDate; m_lastBuildDate = result.lastBuildDate;
int newArticlesCount = 0; int newArticlesCount = 0;
foreach (const QVariantHash &varHash, result.articles) { for (const QVariantHash &varHash : result.articles) {
try { try {
auto article = new Article(this, varHash); auto article = new Article(this, varHash);
if (addArticle(article)) if (addArticle(article))
@ -223,11 +224,15 @@ void Feed::handleParsingFinished(const RSS::Private::ParsingResult &result)
} }
m_dirty = (newArticlesCount > 0); m_dirty = (newArticlesCount > 0);
store(); store();
m_hasError = false;
LogMsg(tr("RSS feed at '%1' successfully updated. Added %2 new articles.") LogMsg(tr("RSS feed at '%1' updated. Added %2 new articles.")
.arg(m_url).arg(newArticlesCount)); .arg(m_url, QString::number(newArticlesCount)));
}
if (m_hasError) {
LogMsg(tr("Failed to parse RSS feed at '%1'. Reason: %2").arg(m_url, result.error)
, Log::WARNING);
} }
m_isLoading = false; m_isLoading = false;

Loading…
Cancel
Save