From 7c1e91f256c56a301790023e7bb04a4d264b056c Mon Sep 17 00:00:00 2001 From: Nick Tiskov Date: Sun, 27 Jan 2013 13:41:08 +0400 Subject: [PATCH] Support Atom feeds --- src/rss/rssparser.cpp | 131 +++++++++++++++++++++++++++++++++++++++++- src/rss/rssparser.h | 4 +- 2 files changed, 132 insertions(+), 3 deletions(-) diff --git a/src/rss/rssparser.cpp b/src/rss/rssparser.cpp index 64e16697e..2b1d73765 100644 --- a/src/rss/rssparser.cpp +++ b/src/rss/rssparser.cpp @@ -35,6 +35,7 @@ #include #include #include +#include struct ParsingJob { QString feedUrl; @@ -236,7 +237,7 @@ void RssParser::run() if (!m_queue.empty()) { ParsingJob job = m_queue.dequeue(); m_mutex.unlock(); - parseRSS(job); + parseFeed(job); } else { qDebug() << Q_FUNC_INFO << "Thread is waiting."; m_waitCondition.wait(&m_mutex); @@ -326,8 +327,129 @@ void RssParser::parseRSSChannel(QXmlStreamReader& xml, const QString& feedUrl) } } +void RssParser::parseAtomArticle(QXmlStreamReader& xml, const QString& feedUrl, const QString& baseUrl) +{ // Reads one Atom entry up to its closing tag, collects its fields into a hash and emits newArticle() unless no id, link or title was found + QVariantHash article; + bool double_content = false; + + while(!xml.atEnd()) { + xml.readNext(); + + if(xml.isEndElement() && xml.name() == "entry") + break; + + if (xml.isStartElement()) { + if (xml.name() == "title") { + // Workaround for CDATA (QString cannot parse HTML escapes on its own) + QTextDocument doc; + doc.setHtml(xml.readElementText()); + article["title"] = doc.toPlainText(); + } + else if (xml.name() == "link") { + QString theLink = ( xml.attributes().isEmpty() ? + xml.readElementText() : + xml.attributes().value("href").toString() ); + + // Atom feeds can have relative links; build the full URI here so + // the UI does not have to work out where an article lives + + // Assemble full URI by plain concatenation; NOTE(review): an absolute href would get the base prepended too - confirm + article["news_link"] = ( baseUrl.isEmpty() ? 
+ theLink : + baseUrl + theLink ); + } + else if (xml.name() == "summary" || xml.name() == "content"){ + if(double_content) { // A summary/content element was already consumed for this entry -> skip this one + xml.readNext(); + + while(!xml.atEnd() && xml.name() != "summary" && xml.name() != "content") // atEnd() guard: truncated input must not make this loop spin forever + xml.readNext(); + + continue; + } + + // Try to also parse broken articles, which don't use HTML '&' escapes + // Actually works great for non-broken content too + QString feedText = xml.readElementText(QXmlStreamReader::IncludeChildElements); + if (!feedText.isEmpty()) + article["description"] = feedText; + + double_content = true; + } + else if (xml.name() == "updated"){ + // Atom uses standards-compliant ISO dates, so no custom parsing is needed (invalid dates fall back to the current time) + QDateTime articleDate = QDateTime::fromString(xml.readElementText(), Qt::ISODate); + article["date"] = ( articleDate.isValid() ? + articleDate : + QDateTime::currentDateTime() ); + } + else if (xml.name() == "author") { + xml.readNext(); + while(!xml.atEnd() && xml.name() != "author") { // atEnd() guard: stop at end of input instead of looping forever when the closing author tag is missing + if(xml.name() == "name") + article["author"] = xml.readElementText(); + xml.readNext(); + } + } + else if (xml.name() == "id") + article["id"] = xml.readElementText(); + } + } + + if (!article.contains("id")) { + // Entry has no id element, fall back to its link, then to its title + const QString link = article.value("news_link").toString(); + if (!link.isEmpty()) + article["id"] = link; + else { + const QString title = article.value("title").toString(); + if (!title.isEmpty()) + article["id"] = title; + else { + qWarning() << "Item has no guid, link or title, ignoring it..."; + return; + } + } + } + + emit newArticle(feedUrl, article); +} + +void RssParser::parseAtomChannel(QXmlStreamReader& xml, const QString& feedUrl) +{ // Walks an Atom feed element: emits feedTitle(), stops early when the feed-level updated value is unchanged, and parses each entry + qDebug() << Q_FUNC_INFO << feedUrl; + Q_ASSERT(xml.isStartElement() && xml.name() == "feed"); + + QString baseURL = xml.attributes().value("xml:base").toString(); // Optional base URI that entry links get prefixed with + + while(!xml.atEnd()) { + xml.readNext(); + + if (xml.isStartElement()) { + if (xml.name() == "title") { + QString title = xml.readElementText(); 
+ emit feedTitle(feedUrl, title); + } + else if (xml.name() == "updated") { // Feed-level timestamp; an unchanged value means there is nothing new to parse + QString lastBuildDate = xml.readElementText(); + if (!lastBuildDate.isEmpty()) { + QMutexLocker locker(&m_mutex); + if (m_lastBuildDates.value(feedUrl) == lastBuildDate) { + qDebug() << "The RSS feed has not changed since last time, aborting parsing."; + return; + } + m_lastBuildDates[feedUrl] = lastBuildDate; + } + } + else if (xml.name() == "entry") { + parseAtomArticle(xml, feedUrl, baseURL); + } + } + } +} + // read and create items from a rss document -void RssParser::parseRSS(const ParsingJob& job) +void RssParser::parseFeed(const ParsingJob& job) { qDebug() << Q_FUNC_INFO << job.feedUrl << job.filePath; QFile fileRss(job.filePath); @@ -352,6 +474,11 @@ void RssParser::parseRSS(const ParsingJob& job) } } break; + } + else if (xml.name() == "feed") { // Atom feed + parseAtomChannel(xml, job.feedUrl); + found_channel = true; + break; } else { qDebug() << "Skip root item: " << xml.name(); xml.skipCurrentElement(); diff --git a/src/rss/rssparser.h b/src/rss/rssparser.h index cd771b587..bb3b2466f 100644 --- a/src/rss/rssparser.h +++ b/src/rss/rssparser.h @@ -61,7 +61,9 @@ protected: static QDateTime parseDate(const QString& string); void parseRssArticle(QXmlStreamReader& xml, const QString& feedUrl); void parseRSSChannel(QXmlStreamReader& xml, const QString& feedUrl); - void parseRSS(const ParsingJob& job); + void parseAtomArticle(QXmlStreamReader& xml, const QString& feedUrl, const QString& baseUrl); + void parseAtomChannel(QXmlStreamReader& xml, const QString& feedUrl); + void parseFeed(const ParsingJob& job); void reportFailure(const ParsingJob& job, const QString& error); private: