From 367c62bd39323a7ec944bbb35714c010e8bb8e94 Mon Sep 17 00:00:00 2001
From: Anthony Restaino
Date: Tue, 25 Aug 2015 20:59:23 -0400
Subject: [PATCH] Improved reading mode thanks to changes from snacktory fork by skyshard

---
 .../lightning/activity/ReadingActivity.java   |  14 +-
 .../reading/ArticleTextExtractor.java         | 902 +++++++++++++++---
 .../lightning/reading/HtmlFetcher.java        | 153 +--
 .../browser/lightning/reading/JResult.java    |  98 +-
 .../lightning/reading/OutputFormatter.java    | 145 ++-
 .../browser/lightning/reading/SHelper.java    |  94 +-
 6 files changed, 1050 insertions(+), 356 deletions(-)

diff --git a/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java b/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
index 4953dff..7cb77aa 100644
--- a/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
+++ b/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
@@ -139,7 +139,7 @@ public class ReadingActivity extends AppCompatActivity {
 
         private final Activity mActivity;
         private String mTitleText;
-        private List<String> mBodyText;
+        private String mBodyText;
 
         public PageLoader(Activity activity) {
             mActivity = activity;
@@ -163,15 +163,15 @@ public class ReadingActivity extends AppCompatActivity {
             try {
                 JResult result = fetcher.fetchAndExtract(params[0], 2500, true);
                 mTitleText = result.getTitle();
-                mBodyText = result.getTextList();
+                mBodyText = result.getText();
             } catch (Exception e) {
                 mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                 e.printStackTrace();
             } catch (OutOfMemoryError e) {
                 System.gc();
                 mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                 e.printStackTrace();
             }
             return null;
@@ -186,11 +186,7 @@ public class ReadingActivity extends AppCompatActivity {
             if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
                 setText(getString(R.string.untitled), getString(R.string.loading_failed));
             } else {
-                StringBuilder builder = new StringBuilder();
-                for (String text : mBodyText) {
-                    builder.append(text).append("\n\n");
-                }
-                setText(mTitleText, builder.toString());
+                setText(mTitleText, mBodyText);
             }
             super.onPostExecute(result);
         }
diff --git a/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java b/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
index a9596e8..7ec41a9 100644
--- a/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
+++ b/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
@@ -5,20 +5,25 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.util.Date;
 
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
+import org.jsoup.select.Selector.SelectorParseException;
 
 /**
  * This class is thread safe.
+ * Class for content extraction from string form of webpage + * 'extractContent' is main call from external programs/classes * * @author Alex P (ifesdjeen from jreadability) * @author Peter Karich @@ -36,17 +41,33 @@ public class ArticleTextExtractor { // Most likely negative candidates private String negativeStr; private Pattern NEGATIVE; - private static final Pattern NEGATIVE_STYLE = Pattern - .compile("hidden|display: ?none|font-size: ?small"); + private static final Pattern NEGATIVE_STYLE = + Pattern.compile("hidden|display: ?none|font-size: ?small"); + private static final Pattern IGNORE_AUTHOR_PARTS = + Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE); private static final Set IGNORED_TITLE_PARTS = new LinkedHashSet() { { add("hacker news"); add("facebook"); + add("home"); + add("articles"); } }; private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter(); private OutputFormatter formatter = DEFAULT_FORMATTER; + private static final int MAX_AUTHOR_NAME_LENGHT = 255; + private static final int MIN_AUTHOR_NAME_LENGTH = 4; + private static final List CLEAN_AUTHOR_PATTERNS = Collections.singletonList( + Pattern.compile("By\\S*(.*)[\\.,].*") + ); + private static final int MAX_AUTHOR_DESC_LENGHT = 1000; + private static final int MAX_IMAGE_LENGHT = 255; + + // For debugging + private static final boolean DEBUG_WEIGHTS = false; + private static final int MAX_LOG_LENGTH = 200; + public ArticleTextExtractor() { setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|" + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor" @@ -59,7 +80,7 @@ public class ArticleTextExtractor { + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard"); } - public ArticleTextExtractor setUnlikely(String unlikelyStr) { + private ArticleTextExtractor setUnlikely(String unlikelyStr) { this.unlikelyStr = unlikelyStr; UNLIKELY = Pattern.compile(unlikelyStr); return this; @@ -69,7 +90,7 @@ public class ArticleTextExtractor { return setUnlikely(unlikelyStr + '|' + unlikelyMatches); } - public ArticleTextExtractor setPositive(String positiveStr) { + private ArticleTextExtractor setPositive(String positiveStr) { this.positiveStr = positiveStr; POSITIVE = Pattern.compile(positiveStr); return this; @@ -79,7 +100,7 @@ public class ArticleTextExtractor { return setPositive(positiveStr + '|' + pos); } - public ArticleTextExtractor setNegative(String negativeStr) { + private ArticleTextExtractor setNegative(String negativeStr) { this.negativeStr = negativeStr; NEGATIVE = Pattern.compile(negativeStr); return this; @@ -95,109 +116,234 @@ public class ArticleTextExtractor { } /** - * @param doc the document to extract - * extracts article text from given html string. wasn't tested - * with improper HTML, although jSoup should be able to handle - * minor stuff. + * @param html extracts article text from given html string. wasn't tested + * with improper HTML, although jSoup should be able to handle minor stuff. 
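+ * Typical use is a sketch like:
+ *   JResult res = new ArticleTextExtractor().extractContent(rawHtml);
+ *   String body = res.getText();
+ * (rawHtml here stands for whatever page source the caller already holds.)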
+ * @returns extracted article, all HTML tags stripped */ - public JResult extractContent(Document doc) throws Exception { - return extractContent(new JResult(), doc, formatter); + public JResult extractContent(String html, int maxContentSize) throws Exception { + return extractContent(new JResult(), html, maxContentSize); } - public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception { - return extractContent(new JResult(), doc, formatter); + public JResult extractContent(String html) throws Exception { + return extractContent(new JResult(), html, 0); } - public JResult extractContent(String html) throws Exception { - return extractContent(new JResult(), html); + public JResult extractContent(JResult res, String html, int maxContentSize) throws Exception { + return extractContent(res, html, formatter, true, maxContentSize); } public JResult extractContent(JResult res, String html) throws Exception { - return extractContent(res, html, formatter); + return extractContent(res, html, formatter, true, 0); } - public JResult extractContent(JResult res, String html, OutputFormatter formatter) - throws Exception { + private JResult extractContent(JResult res, String html, OutputFormatter formatter, + Boolean extractimages, int maxContentSize) throws Exception { if (html.isEmpty()) throw new IllegalArgumentException("html string is empty!?"); // http://jsoup.org/cookbook/extracting-data/selector-syntax - return extractContent(res, Jsoup.parse(html), formatter); + return extractContent(res, Jsoup.parse(html), formatter, extractimages, maxContentSize); + } + + // Returns the best node match based on the weights (see getWeight for strategy) + private Element getBestMatchElement(Collection nodes) { + int maxWeight = -200; // why -200 now instead of 0? + Element bestMatchElement = null; + + boolean ignoreMaxWeightLimit = false; + for (Element entry : nodes) { + + int currentWeight = getWeight(entry, false); + if (currentWeight > maxWeight) { + maxWeight = currentWeight; + bestMatchElement = entry; + + /* + // NOTE: This optimization fails with large pages that + contains chunks of text that can be mistaken by articles, since we + want the best accuracy possible, I am disabling it for now. AP. + + // The original code had a limit of 200, the intention was that + // if a node had a weight greater than it, then it most likely + // it was the main content. + // However this assumption fails when the amount of text in the + // children (or grandchildren) is too large. If we detect this + // case then the limit is ignored and we try all the nodes to select + // the one with the absolute maximum weight. + if (maxWeight > 500){ + ignoreMaxWeightLimit = true; + continue; + } + + // formerly 200, increased to 250 to account for the fact + // we are not adding the weights of the grand children to the + // tally. 
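+
+            // Illustration of the failure mode: a comments div holding many
+            // medium-length children can accumulate max(50, length / 10) per
+            // child and cross the early-exit threshold before the true
+            // article body is ever weighed.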
+ + if (maxWeight > 250 && !ignoreMaxWeightLimit) + break; + */ + } + } + + return bestMatchElement; } - public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) - throws NullPointerException { + private JResult extractContent(JResult res, Document doc, OutputFormatter formatter, + Boolean extractimages, int maxContentSize) throws Exception { + Document origDoc = doc.clone(); + JResult result = extractContent(res, doc, formatter, extractimages, maxContentSize, true); + //System.out.println("result.getText().length()="+result.getText().length()); + if (result.getText().isEmpty()) { + result = extractContent(res, origDoc, formatter, extractimages, maxContentSize, false); + } + return result; + } + + + // main workhorse + private JResult extractContent(JResult res, Document doc, OutputFormatter formatter, + Boolean extractimages, int maxContentSize, boolean cleanScripts) { if (doc == null) throw new NullPointerException("missing document"); + // get the easy stuff res.setTitle(extractTitle(doc)); res.setDescription(extractDescription(doc)); res.setCanonicalUrl(extractCanonicalUrl(doc)); + res.setType(extractType(doc)); + res.setSitename(extractSitename(doc)); + res.setLanguage(extractLanguage(doc)); + + // get author information + res.setAuthorName(extractAuthorName(doc)); + res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName())); + + // add extra selection gravity to any element containing author name + // wasn't useful in the case I implemented it for, but might be later + /* + Elements authelems = doc.select(":containsOwn(" + res.getAuthorName() + ")"); + for (Element elem : authelems) { + elem.attr("extragravityscore", Integer.toString(100)); + System.out.println("modified element " + elem.toString()); + } + */ + + // get date from document, if not present, extract from URL if possible + Date docdate = extractDate(doc); + if (docdate == null) { + String dateStr = SHelper.estimateDate(res.getUrl()); + docdate = parseDate(dateStr); + res.setDate(docdate); + } else { + res.setDate(docdate); + } - // now remove the clutter - prepareDocument(doc); + // now remove the clutter + if (cleanScripts) { + prepareDocument(doc); + } - // init elements + // init elements and get the one with highest weight (see getWeight for strategy) Collection nodes = getNodes(doc); - int maxWeight = 0; - Element bestMatchElement = null; - for (Element entry : nodes) { - int currentWeight = getWeight(entry); - if (currentWeight > maxWeight) { - maxWeight = currentWeight; - bestMatchElement = entry; - if (maxWeight > 200) - break; - } - } + Element bestMatchElement = getBestMatchElement(nodes); + // do extraction from the best element if (bestMatchElement != null) { - List images = new ArrayList<>(); - Element imgEl = determineImageSource(bestMatchElement, images); - if (imgEl != null) { - res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src"))); - // TODO remove parent container of image if it is contained in - // bestMatchElement - // to avoid image subtitles flooding in - - res.setImages(images); + if (extractimages) { + List images = new ArrayList<>(); + Element imgEl = determineImageSource(bestMatchElement, images); + if (imgEl != null) { + res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src"))); + // TODO remove parent container of image if it is contained in bestMatchElement + // to avoid image subtitles flooding in + + res.setImages(images); + } } // clean before grabbing text String text = formatter.getFormattedText(bestMatchElement); text = 
removeTitleFromText(text, res.getTitle()); - // this fails for short facebook post and probably tweets: - // text.length() > res.getDescription().length() + // this fails for short facebook post and probably tweets: text.length() > res.getDescription().length() if (text.length() > res.getTitle().length()) { + if (maxContentSize > 0) { + if (text.length() > maxContentSize) { + text = utf8truncate(text, maxContentSize); + } + } res.setText(text); - // print("best element:", bestMatchElement); } - res.setTextList(formatter.getTextList(bestMatchElement)); + + // extract links from the same best element + String fullhtml = bestMatchElement.toString(); + Elements children = bestMatchElement.select("a[href]"); // a with href = link + String linkstr; + Integer linkpos; + Integer lastlinkpos = 0; + for (Element child : children) { + linkstr = child.toString(); + linkpos = fullhtml.indexOf(linkstr, lastlinkpos); + res.addLink(child.attr("abs:href"), child.text(), linkpos); + lastlinkpos = linkpos; + } } - if (res.getImageUrl().isEmpty()) { - res.setImageUrl(extractImageUrl(doc)); + if (extractimages) { + if (res.getImageUrl().isEmpty()) { + res.setImageUrl(extractImageUrl(doc)); + } } res.setRssUrl(extractRssUrl(doc)); res.setVideoUrl(extractVideoUrl(doc)); res.setFaviconUrl(extractFaviconUrl(doc)); res.setKeywords(extractKeywords(doc)); + + // Sanity checks in author + if (res.getAuthorName().length() > MAX_AUTHOR_NAME_LENGHT) { + res.setAuthorName(utf8truncate(res.getAuthorName(), MAX_AUTHOR_NAME_LENGHT)); + } + + // Sanity checks in author description. + String authorDescSnippet = getSnippet(res.getAuthorDescription()); + if (getSnippet(res.getText()).equals(authorDescSnippet) || + getSnippet(res.getDescription()).equals(authorDescSnippet)) { + res.setAuthorDescription(""); + } else { + if (res.getAuthorDescription().length() > MAX_AUTHOR_DESC_LENGHT) { + res.setAuthorDescription(utf8truncate(res.getAuthorDescription(), MAX_AUTHOR_DESC_LENGHT)); + } + } + + // Sanity checks in image name + if (res.getImageUrl().length() > MAX_IMAGE_LENGHT) { + // doesn't make sense to truncate a URL + res.setImageUrl(""); + } + return res; } - protected String extractTitle(Document doc) { + private static String getSnippet(String data) { + if (data.length() < 50) + return data; + else + return data.substring(0, 50); + } + + private static String extractTitle(Document doc) { String title = cleanTitle(doc.title()); if (title.isEmpty()) { title = SHelper.innerTrim(doc.select("head title").text()); if (title.isEmpty()) { title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content")); if (title.isEmpty()) { - title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr( - "content")); + title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content")); if (title.isEmpty()) { - title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr( - "content")); + title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content")); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("h1:first-of-type").text()); + } } } } @@ -205,41 +351,349 @@ public class ArticleTextExtractor { return title; } - protected String extractCanonicalUrl(Document doc) { + private static String extractCanonicalUrl(Document doc) { String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href")); if (url.isEmpty()) { url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content")); if (url.isEmpty()) { - url = 
SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr( - "content")); + url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content")); } } return url; } - protected String extractDescription(Document doc) { - String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr( - "content")); + private static String extractDescription(Document doc) { + String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content")); if (description.isEmpty()) { - description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr( - "content")); + description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content")); if (description.isEmpty()) { - description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]") - .attr("content")); + description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content")); } } return description; } - protected Collection extractKeywords(Document doc) { + // Returns the publication Date or null + private static Date extractDate(Document doc) { + String dateStr = ""; + + // try some locations that nytimes uses + Element elem = doc.select("meta[name=ptime]").first(); + if (elem != null) { + dateStr = SHelper.innerTrim(elem.attr("content")); + // elem.attr("extragravityscore", Integer.toString(100)); + // System.out.println("date modified element " + elem.toString()); + } + + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content")); + } + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content")); + } + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content")); + } + if ("".equals(dateStr)) { + return parseDate(dateStr); + } + + // taking this stuff directly from Juicer (and converted to Java) + // opengraph (?) + Elements elems = doc.select("meta[property=article:published_time]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + try { + if (dateStr.endsWith("Z")) { + dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00"; + } else { + dateStr = String.format(dateStr.substring(0, dateStr.length() - 6), + dateStr.substring(dateStr.length() - 6, + dateStr.length())); + } + } catch (StringIndexOutOfBoundsException ex) { + // do nothing + } + return parseDate(dateStr); + } + } + + // rnews + elems = doc.select("meta[property=dateCreated], span[property=dateCreated]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + + return parseDate(dateStr); + } else { + return parseDate(el.text()); + } + } + + // schema.org creativework + elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + + return parseDate(dateStr); + } else if (el.hasAttr("value")) { + dateStr = el.attr("value"); + + return parseDate(dateStr); + } else { + return parseDate(el.text()); + } + } + + // parsely page (?) 
+ /* skip conversion for now, seems highly specific and uses new lib + elems = doc.select("meta[name=parsely-page]"); + if (elems.size() > 0) { + implicit val formats = net.liftweb.json.DefaultFormats + + Element el = elems.get(0); + if(el.hasAttr("content")) { + val json = parse(el.attr("content")) + + return DateUtils.parseDateStrictly((json \ "pub_date").extract[String], Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz")) + } + } + */ + + // BBC + elems = doc.select("meta[name=OriginalPublicationDate]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + return parseDate(dateStr); + } + } + + // wired + elems = doc.select("meta[name=DisplayDate]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + return parseDate(dateStr); + } + } + + // wildcard + elems = doc.select("meta[name*=date]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + Date parsedDate = parseDate(dateStr); + if (parsedDate != null) { + return parsedDate; + } + } + } + + // blogger + elems = doc.select(".date-header"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + dateStr = el.text(); + return parseDate(dateStr); + } + + return null; + } + + private static Date parseDate(String dateStr) { +// String[] parsePatterns = { +// "yyyy-MM-dd'T'HH:mm:ssz", +// "yyyy-MM-dd HH:mm:ss", +// "yyyy/MM/dd HH:mm:ss", +// "yyyy-MM-dd HH:mm", +// "yyyy/MM/dd HH:mm", +// "yyyy-MM-dd", +// "yyyy/MM/dd", +// "MM/dd/yyyy HH:mm:ss", +// "MM-dd-yyyy HH:mm:ss", +// "MM/dd/yyyy HH:mm", +// "MM-dd-yyyy HH:mm", +// "MM/dd/yyyy", +// "MM-dd-yyyy", +// "EEE, MMM dd, yyyy", +// "MM/dd/yyyy hh:mm:ss a", +// "MM-dd-yyyy hh:mm:ss a", +// "MM/dd/yyyy hh:mm a", +// "MM-dd-yyyy hh:mm a", +// "yyyy-MM-dd hh:mm:ss a", +// "yyyy/MM/dd hh:mm:ss a ", +// "yyyy-MM-dd hh:mm a", +// "yyyy/MM/dd hh:mm ", +// "dd MMM yyyy", +// "dd MMMM yyyy", +// "yyyyMMddHHmm", +// "yyyyMMdd HHmm", +// "dd-MM-yyyy HH:mm:ss", +// "dd/MM/yyyy HH:mm:ss", +// "dd MMM yyyy HH:mm:ss", +// "dd MMMM yyyy HH:mm:ss", +// "dd-MM-yyyy HH:mm", +// "dd/MM/yyyy HH:mm", +// "dd MMM yyyy HH:mm", +// "dd MMMM yyyy HH:mm", +// "yyyyMMddHHmmss", +// "yyyyMMdd HHmmss", +// "yyyyMMdd" +// }; +// + return new Date(0); + +// try { +// return DateUtils.parseDateStrictly(dateStr, parsePatterns); +// } catch (Exception ex) { +// return null; +// } + } + + // Returns the author name or null + private String extractAuthorName(Document doc) { + String authorName = ""; + + // first try the Google Author tag + Element result = doc.select("body [rel*=author]").first(); + if (result != null) + authorName = SHelper.innerTrim(result.ownText()); + + // if that doesn't work, try some other methods + if (authorName.isEmpty()) { + + // meta tag approaches, get content + result = doc.select("head meta[name=author]").first(); + if (result != null) { + authorName = SHelper.innerTrim(result.attr("content")); + } + + if (authorName.isEmpty()) { // for "opengraph" + authorName = SHelper.innerTrim(doc.select("head meta[property=article:author]").attr("content")); + } + if (authorName.isEmpty()) { // OpenGraph twitter:creator tag + authorName = SHelper.innerTrim(doc.select("head meta[property=twitter:creator]").attr("content")); + } + if (authorName.isEmpty()) { // for "schema.org creativework" + authorName = 
SHelper.innerTrim(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content")); + } + + // other hacks + if (authorName.isEmpty()) { + try { + // build up a set of elements which have likely author-related terms + // .X searches for class X + Elements matches = doc.select("a[rel=author],.byline-name,.byLineTag,.byline,.author,.by,.writer,.address"); + + if (matches == null || matches.isEmpty()) { + matches = doc.select("body [class*=author]"); + } + + if (matches == null || matches.isEmpty()) { + matches = doc.select("body [title*=author]"); + } + + // a hack for huffington post + if (matches == null || matches.isEmpty()) { + matches = doc.select(".staff_info dl a[href]"); + } + + // a hack for http://sports.espn.go.com/ + if (matches == null || matches.isEmpty()) { + matches = doc.select("cite[class*=source]"); + } + + // select the best element from them + if (matches != null) { + Element bestMatch = getBestMatchElement(matches); + + if (!(bestMatch == null)) { + authorName = bestMatch.text(); + + if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) { + authorName = bestMatch.text(); + } + + authorName = SHelper.innerTrim(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll("")); + + if (authorName.contains(",")) { + authorName = authorName.split(",")[0]; + } + } + } + } catch (Exception e) { + System.out.println(e.toString()); + } + } + } + + for (Pattern pattern : CLEAN_AUTHOR_PATTERNS) { + Matcher matcher = pattern.matcher(authorName); + if (matcher.matches()) { + authorName = SHelper.innerTrim(matcher.group(1)); + break; + } + } + + return authorName; + } + + // Returns the author description or null + private String extractAuthorDescription(Document doc, String authorName) { + + String authorDesc = ""; + + if (authorName.isEmpty()) + return ""; + + // Special case for entrepreneur.com + Elements matches = doc.select(".byline > .bio"); + if (matches != null && !matches.isEmpty()) { + Element bestMatch = matches.first(); // assume it is the first. + authorDesc = bestMatch.text(); + return authorDesc; + } + + // Special case for huffingtonpost.com + matches = doc.select(".byline span[class*=teaser]"); + if (matches != null && !matches.isEmpty()) { + Element bestMatch = matches.first(); // assume it is the first. 
+ authorDesc = bestMatch.text(); + return authorDesc; + } + + try { + Elements nodes = doc.select(":containsOwn(" + authorName + ')'); + Element bestMatch = getBestMatchElement(nodes); + if (bestMatch != null) + authorDesc = bestMatch.text(); + } catch (SelectorParseException se) { + // Avoid error when selector is invalid + } + + return authorDesc; + } + + private static Collection extractKeywords(Document doc) { String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content")); - if (content.startsWith("[") && content.endsWith("]")) - content = content.substring(1, content.length() - 1); + if (content != null) { + if (content.startsWith("[") && content.endsWith("]")) + content = content.substring(1, content.length() - 1); - String[] split = content.split("\\s*,\\s*"); - if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty())) - return Arrays.asList(split); + String[] split = content.split("\\s*,\\s*"); + if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty())) + return Arrays.asList(split); + } return Collections.emptyList(); } @@ -249,62 +703,101 @@ public class ArticleTextExtractor { * * @return image url or empty str */ - protected String extractImageUrl(Document doc) { + private static String extractImageUrl(Document doc) { // use open graph tag to get image - String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr( - "content")); + String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content")); if (imageUrl.isEmpty()) { - imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr( - "content")); + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content")); if (imageUrl.isEmpty()) { // prefer link over thumbnail-meta if empty imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href")); if (imageUrl.isEmpty()) { - imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr( - "content")); + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content")); } } } return imageUrl; } - protected String extractRssUrl(Document doc) { - return SHelper.replaceSpaces(doc.select("link[rel=alternate]") - .select("link[type=application/rss+xml]").attr("href")); + private static String extractRssUrl(Document doc) { + return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href")); } - protected String extractVideoUrl(Document doc) { + private static String extractVideoUrl(Document doc) { return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content")); } - protected String extractFaviconUrl(Document doc) { + private static String extractFaviconUrl(Document doc) { String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href")); if (faviconUrl.isEmpty()) { - faviconUrl = SHelper.replaceSpaces(doc.select( - "head link[rel^=shortcut],link[rel$=icon]").attr("href")); + faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href")); } return faviconUrl; } + private static String extractType(Document doc) { + String type = SHelper.innerTrim(doc.select("head meta[property=og:type]").attr("content")); + return type; + } + + private static String extractSitename(Document doc) { + String sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content")); + if 
(sitename.isEmpty()) { + sitename = SHelper.innerTrim(doc.select("head meta[name=twitter:site]").attr("content")); + } + if (sitename.isEmpty()) { + sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content")); + } + return sitename; + } + + private static String extractLanguage(Document doc) { + String language = SHelper.innerTrim(doc.select("head meta[property=language]").attr("content")); + if (language.isEmpty()) { + language = SHelper.innerTrim(doc.select("html").attr("lang")); + if (language.isEmpty()) { + language = SHelper.innerTrim(doc.select("head meta[property=og:locale]").attr("content")); + } + } + if (!language.isEmpty()) { + if (language.length() > 2) { + language = language.substring(0, 2); + } + } + return language; + } + /** * Weights current element. By matching it with positive candidates and * weighting child nodes. Since it's impossible to predict which exactly * names, ids or class names will be used in HTML, major role is played by * child nodes * - * @param e - * Element to weight, along with child nodes + * @param e Element to weight, along with child nodes */ - protected int getWeight(Element e) { + private int getWeight(Element e, boolean checkextra) { int weight = calcWeight(e); - weight += (int) Math.round(e.ownText().length() / 100.0 * 10); - weight += weightChildNodes(e); + int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10); + weight += ownTextWeight; + int childrenWeight = weightChildNodes(e); + weight += childrenWeight; + + // add additional weight using possible 'extragravityscore' attribute + if (checkextra) { + Element xelem = e.select("[extragravityscore]").first(); + if (xelem != null) { + // System.out.println("HERE found one: " + xelem.toString()); + weight += Integer.parseInt(xelem.attr("extragravityscore")); + // System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore")); + } + } + return weight; } /** * Weights a child nodes of given Element. During tests some difficulties - * were met. For instanance, not every single document has nested paragraph + * were met. For instance, not every single document has nested paragraph * tags inside of the major article tag. Sometimes people are adding one * more nesting level. So, we're adding 4 points for every 100 symbols * contained in tag nested inside of the current weighted element, but only @@ -312,42 +805,93 @@ public class ArticleTextExtractor { * more chances to extract the element that has less nested levels, * increasing probability of the correct extraction. 
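+     * (Concretely: a direct child with 250 chars of own text adds
+     * max(50, 250 / 10) = 50 to the weight, while the same node one level
+     * deeper only counts through the grandchildren tally, which is divided
+     * by 3 before being added.)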
* - * @param rootEl - * Element, who's child nodes will be weighted + * @param rootEl Element, who's child nodes will be weighted */ - protected int weightChildNodes(Element rootEl) { + private int weightChildNodes(Element rootEl) { int weight = 0; Element caption = null; List pEls = new ArrayList<>(5); + for (Element child : rootEl.children()) { String ownText = child.ownText(); int ownTextLength = ownText.length(); if (ownTextLength < 20) continue; - if (ownTextLength > 200) - weight += Math.max(50, ownTextLength / 10); + if (ownTextLength > 200) { + int childOwnTextWeight = Math.max(50, ownTextLength / 10); + weight += childOwnTextWeight; + } if (child.tagName().equals("h1") || child.tagName().equals("h2")) { - weight += 30; + int h2h1Weight = 30; + weight += h2h1Weight; } else if (child.tagName().equals("div") || child.tagName().equals("p")) { - weight += calcWeightForChild(child, ownText); + int calcChildWeight = calcWeightForChild(child, ownText); + weight += calcChildWeight; if (child.tagName().equals("p") && ownTextLength > 50) pEls.add(child); - if (child.className().toLowerCase(Locale.getDefault()).equals("caption")) + if (child.className().toLowerCase().equals("caption")) caption = child; } } + // + // Visit grandchildren, This section visits the grandchildren + // of the node and calculate their weights. Note that grandchildren + // weights are only worth 1/3 of children's + // + int grandChildrenWeight = 0; + for (Element child2 : rootEl.children()) { + + // If the node looks negative don't include it in the weights + // instead penalize the grandparent. This is done to try to + // avoid giving weigths to navigation nodes, etc. + if (NEGATIVE.matcher(child2.id()).find() || + NEGATIVE.matcher(child2.className()).find()) { + grandChildrenWeight -= 30; + continue; + } + + for (Element grandchild : child2.children()) { + int grandchildWeight = 0; + String ownText = grandchild.ownText(); + int ownTextLength = ownText.length(); + if (ownTextLength < 20) + continue; + + if (ownTextLength > 200) { + int childOwnTextWeight = Math.max(50, ownTextLength / 10); + grandchildWeight += childOwnTextWeight; + } + + if (grandchild.tagName().equals("h1") || grandchild.tagName().equals("h2")) { + int h2h1Weight = 30; + grandchildWeight += h2h1Weight; + } else if (grandchild.tagName().equals("div") || grandchild.tagName().equals("p")) { + int calcChildWeight = calcWeightForChild(grandchild, ownText); + grandchildWeight += calcChildWeight; + } + + grandChildrenWeight += grandchildWeight; + } + } + + grandChildrenWeight = grandChildrenWeight / 3; + weight += grandChildrenWeight; + // use caption and image - if (caption != null) - weight += 30; + if (caption != null) { + int captionWeight = 30; + weight += captionWeight; + } if (pEls.size() >= 2) { for (Element subEl : rootEl.children()) { if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) { - weight += 20; + int h1h2h3Weight = 20; + weight += h1h2h3Weight; // headerEls.add(subEl); } else if ("table;li;td;th".contains(subEl.tagName())) { addScore(subEl, -30); @@ -360,26 +904,25 @@ public class ArticleTextExtractor { return weight; } - public void addScore(Element el, int score) { + private static void addScore(Element el, int score) { int old = getScore(el); setScore(el, score + old); } - public int getScore(Element el) { + private static int getScore(Element el) { int old = 0; try { old = Integer.parseInt(el.attr("gravityScore")); - } catch (Exception ex) { - ex.printStackTrace(); + } catch (Exception ignored) { } return old; } - public void 
setScore(Element el, int score) {
+    private static void setScore(Element el, int score) {
         el.attr("gravityScore", Integer.toString(score));
     }
 
-    private int calcWeightForChild(Element child, String ownText) {
+    private static int calcWeightForChild(Element child, String ownText) {
         int c = SHelper.count(ownText, "&quot;");
         c += SHelper.count(ownText, "&lt;");
         c += SHelper.count(ownText, "&gt;");
@@ -388,7 +931,7 @@ public class ArticleTextExtractor {
         if (c > 5)
             val = -30;
         else
-            val = (int) Math.round(ownText.length() / 25.0);
+            val = (int) Math.round(ownText.length() / 35.0);
 
         addScore(child, val);
         return val;
@@ -400,7 +943,7 @@ public class ArticleTextExtractor {
             weight += 35;
 
         if (POSITIVE.matcher(e.id()).find())
-            weight += 40;
+            weight += 45;
 
         if (UNLIKELY.matcher(e.className()).find())
             weight -= 20;
@@ -417,10 +960,16 @@ public class ArticleTextExtractor {
         String style = e.attr("style");
         if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
             weight -= 50;
+
+        String itemprop = e.attr("itemprop");
+        if (itemprop != null && !itemprop.isEmpty() && POSITIVE.matcher(itemprop).find()) {
+            weight += 100;
+        }
+
         return weight;
     }
 
-    public Element determineImageSource(Element el, List<ImageResult> images) {
+    private Element determineImageSource(Element el, List<ImageResult> images) {
         int maxWeight = 0;
         Element maxNode = null;
         Elements els = el.select("img");
@@ -441,8 +990,7 @@ public class ArticleTextExtractor {
                     weight += 20;
                 else
                     weight -= 20;
-            } catch (Exception ex) {
-                ex.printStackTrace();
+            } catch (Exception ignored) {
             }
 
             int width = 0;
@@ -452,8 +1000,7 @@ public class ArticleTextExtractor {
                     weight += 20;
                 else
                     weight -= 20;
-            } catch (Exception ex) {
-                ex.printStackTrace();
+            } catch (Exception ignored) {
             }
             String alt = e.attr("alt");
             if (alt.length() > 35)
@@ -480,8 +1027,7 @@ public class ArticleTextExtractor {
                 score = score / 2;
             }
 
-            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
-                    noFollow);
+            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
             images.add(image);
         }
 
@@ -494,12 +1040,11 @@ public class ArticleTextExtractor {
      * from time to time they're getting more score than good ones especially in
      * cases when major text is short.
      *
-     * @param doc
-     *            document to prepare. Passed as reference, and changed inside
+     * @param doc document to prepare. Passed as reference, and changed inside
      *            of function
      */
-    protected void prepareDocument(Document doc) {
-        // stripUnlikelyCandidates(doc);
+    private static void prepareDocument(Document doc) {
+//        stripUnlikelyCandidates(doc);
         removeScriptsAndStyles(doc);
     }
 
@@ -507,27 +1052,25 @@ public class ArticleTextExtractor {
      * Removes unlikely candidates from HTML. Currently takes id and class name
      * and matches them against list of patterns
      *
-     * @param doc
-     *            document to strip unlikely candidates from
+     * @param doc document to strip unlikely candidates from
      */
     protected void stripUnlikelyCandidates(Document doc) {
         for (Element child : doc.select("body").select("*")) {
-            String className = child.className().toLowerCase(Locale.getDefault());
-            String id = child.id().toLowerCase(Locale.getDefault());
+            String className = child.className().toLowerCase();
+            String id = child.id().toLowerCase();
 
-            if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
-                // print("REMOVE:", child);
+            if (NEGATIVE.matcher(className).find()
+                    || NEGATIVE.matcher(id).find()) {
                 child.remove();
             }
         }
     }
 
-    private Document removeScriptsAndStyles(Document doc) {
+    private static Document removeScriptsAndStyles(Document doc) {
         Elements scripts = doc.getElementsByTag("script");
         for (Element item : scripts) {
             item.remove();
         }
-
         Elements noscripts = doc.getElementsByTag("noscript");
         for (Element item : noscripts) {
             item.remove();
@@ -541,49 +1084,74 @@ public class ArticleTextExtractor {
         return doc;
     }
 
-    private boolean isAdImage(String imageUrl) {
+    private static boolean isAdImage(String imageUrl) {
         return SHelper.count(imageUrl, "ad") >= 2;
     }
 
     /**
     * Match only exact matching as longestSubstring can be too fuzzy
     */
-    public String removeTitleFromText(String text, String title) {
+    private static String removeTitleFromText(String text, String title) {
         // don't do this as it's terrible to read
-        // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
-        // if (index1 >= 0)
-        //     text = text.substring(index1 + title.length());
-        // return text.trim();
+//        int index1 = text.toLowerCase().indexOf(title.toLowerCase());
+//        if (index1 >= 0)
+//            text = text.substring(index1 + title.length());
+//        return text.trim();
         return text;
     }
 
+    /**
+     * based on a delimiter in the title take the longest piece or do some
+     * custom logic based on the site
+     *
+     * @param title
+     * @param delimeter
+     * @return
+     */
+    private static String doTitleSplits(String title, String delimeter) {
+        String largeText = "";
+        int largetTextLen = 0;
+        String[] titlePieces = title.split(delimeter);
+
+        // take the largest split
+        for (String p : titlePieces) {
+            if (p.length() > largetTextLen) {
+                largeText = p;
+                largetTextLen = p.length();
+            }
+        }
+
+        largeText = largeText.replace("&raquo;", " ");
+        largeText = largeText.replace("ยป", " ");
+        return largeText.trim();
+    }
+
     /**
      * @return a set of all important nodes
     */
-    public Collection<Element> getNodes(Document doc) {
-        Set<Element> nodes = new HashSet<>(64);
+    private static Collection<Element> getNodes(Document doc) {
+        Map<Element, Object> nodes = new LinkedHashMap<>(64);
         int score = 100;
         for (Element el : doc.select("body").select("*")) {
             if (NODES.matcher(el.tagName()).matches()) {
-                nodes.add(el);
+                nodes.put(el, null);
                 setScore(el, score);
                 score = score / 2;
             }
         }
-        return nodes;
-
+        return nodes.keySet();
     }
 
-    public String cleanTitle(String title) {
+    private static String cleanTitle(String title) {
         StringBuilder res = new StringBuilder();
-        // int index = title.lastIndexOf("|");
-        // if (index > 0 && title.length() / 2 < index)
-        //     title = title.substring(0, index + 1);
+//        int index = title.lastIndexOf("|");
+//        if (index > 0 && title.length() / 2 < index)
+//            title = title.substring(0, index + 1);
 
         int counter = 0;
         String[] strs = title.split("\\|");
         for (String part : strs) {
-            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim()))
+            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
                 continue;
 
             if (counter == strs.length - 1 && res.length() > part.length())
@@ -599,13 +1167,48 @@ public class ArticleTextExtractor {
                 continue;
 
             if (counter > 0)
                 res.append("|");
 
             res.append(part);
             counter++;
         }
 
         return SHelper.innerTrim(res.toString());
     }
 
+    /**
+     * Truncate a Java string so that its UTF-8 representation will not
+     * exceed the specified number of bytes.
+     *
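+     * For example, utf8truncate("日本abc", 7) returns "日本a": each CJK char
+     * costs 3 bytes, "a" brings the total to 7, and "b" would overflow, so
+     * the cut always lands between characters rather than mid-sequence.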

+ * For discussion of why you might want to do this, see + * http://lpar.ath0.com/2011/06/07/unicode-alchemy-with-db2/ + */ + private static String utf8truncate(String input, int length) { + StringBuilder result = new StringBuilder(length); + int resultlen = 0; + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + int charlen = 0; + if (c <= 0x7f) { + charlen = 1; + } else if (c <= 0x7ff) { + charlen = 2; + } else if (c <= 0xd7ff) { + charlen = 3; + } else if (c <= 0xdbff) { + charlen = 4; + } else if (c <= 0xdfff) { + charlen = 0; + } else if (c <= 0xffff) { + charlen = 3; + } + if (resultlen + charlen > length) { + break; + } + result.append(c); + resultlen += charlen; + } + return result.toString(); + } + + /** * Comparator for Image by weight * * @author Chris Alexander, chris@chris-alexander.co.uk - * */ - public class ImageComparator implements Comparator { + private class ImageComparator implements Comparator { @Override public int compare(ImageResult o1, ImageResult o2) { @@ -613,4 +1216,5 @@ public class ArticleTextExtractor { return o2.weight.compareTo(o1.weight); } } + } \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java b/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java index e815234..59d5635 100644 --- a/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java +++ b/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java @@ -22,22 +22,19 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; +import java.net.MalformedURLException; import java.net.Proxy; import java.net.URL; import java.util.LinkedHashSet; -import java.util.Locale; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.GZIPInputStream; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; -import acr.browser.lightning.constant.Constants; -import android.util.Log; - /** * Class to fetch articles. This class is thread safe. 
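+ * The usual entry point is fetchAndExtract(url, timeoutMs, resolve), which
+ * resolves redirects, downloads the page and hands it to ArticleTextExtractor.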
- * + * * @author Peter Karich */ public class HtmlFetcher { @@ -64,7 +61,7 @@ public class HtmlFetcher { else existing.add(domainStr); - String html = new HtmlFetcher().fetchAsString(url, 20000); + String html = new HtmlFetcher().fetchAsString(url, 2000); String outFile = domainStr + counterStr + ".html"; BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); writer.write(html); @@ -73,8 +70,8 @@ public class HtmlFetcher { reader.close(); } - private String referrer = "https://github.com/karussell/snacktory"; - private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ')'; + private String referrer = "http://jetsli.de/crawler"; + private String userAgent = "Mozilla/5.0 (compatible; Jetslide; +" + referrer + ')'; private String cacheControl = "max-age=0"; private String language = "en-us"; private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; @@ -83,7 +80,7 @@ public class HtmlFetcher { private final AtomicInteger cacheCounter = new AtomicInteger(0); private int maxTextLength = -1; private ArticleTextExtractor extractor = new ArticleTextExtractor(); - private final Set furtherResolveNecessary = new LinkedHashSet() { + private Set furtherResolveNecessary = new LinkedHashSet() { { add("bit.ly"); add("cli.gs"); @@ -202,6 +199,12 @@ public class HtmlFetcher { } public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { + return fetchAndExtract(url, timeout, resolve, 0, false); + } + + // main workhorse to call externally + public JResult fetchAndExtract(String url, int timeout, boolean resolve, + int maxContentSize, boolean forceReload) throws Exception { String originalUrl = url; url = SHelper.removeHashbang(url); String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); @@ -219,9 +222,8 @@ public class HtmlFetcher { if (res != null) return res; - String resUrl = getResolvedUrl(url, timeout); + String resUrl = getResolvedUrl(url, timeout, 0); if (resUrl.isEmpty()) { - Log.d(Constants.TAG, "resolved url is empty. Url is: " + url); JResult result = new JResult(); if (cache != null) @@ -229,10 +231,9 @@ public class HtmlFetcher { return result.setUrl(url); } - // if resolved url is longer then use it! - if (resUrl.trim().length() > url.length()) { - // this is necessary e.g. for some homebaken url resolvers which - // return + // if resolved url is different then use it! + if (!resUrl.equals(url)) { + // this is necessary e.g. for some homebaken url resolvers which return // the resolved url relative to url! url = SHelper.useDomainOfFirstArg4Second(url, resUrl); } @@ -244,20 +245,18 @@ public class HtmlFetcher { return res; JResult result = new JResult(); - // or should we use? + // or should we use? result.setUrl(url); result.setOriginalUrl(originalUrl); - result.setDate(SHelper.estimateDate(url)); - // Immediately put the url into the cache as extracting content takes - // time. + // Immediately put the url into the cache as extracting content takes time. 
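+        // (A concurrent fetchAndExtract of the same url will then hit this
+        // placeholder in getFromCache and return early instead of downloading
+        // the page a second time; its fields are filled in further below.)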
if (cache != null) { cache.put(originalUrl, result); cache.put(url, result); } - String lowerUrl = url.toLowerCase(Locale.getDefault()); + // extract content to the extent appropriate for content type + String lowerUrl = url.toLowerCase(); if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { // skip } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { @@ -265,16 +264,30 @@ public class HtmlFetcher { } else if (SHelper.isImage(lowerUrl)) { result.setImageUrl(url); } else { - extractor.extractContent(result, fetchAsString(url, timeout)); + try { + String urlToDownload = url; + if (forceReload) { + urlToDownload = getURLtoBreakCache(url); + } + extractor.extractContent(result, fetchAsString(urlToDownload, timeout), maxContentSize); + } catch (IOException io) { + // do nothing + } if (result.getFaviconUrl().isEmpty()) result.setFaviconUrl(SHelper.getDefaultFavicon(url)); - // some links are relative to root and do not include the domain of - // the url :( - result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); - result.setImageUrl(fixUrl(url, result.getImageUrl())); - result.setVideoUrl(fixUrl(url, result.getVideoUrl())); - result.setRssUrl(fixUrl(url, result.getRssUrl())); + // some links are relative to root and do not include the domain of the url :( + if (!result.getFaviconUrl().isEmpty()) + result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); + + if (!result.getImageUrl().isEmpty()) + result.setImageUrl(fixUrl(url, result.getImageUrl())); + + if (!result.getVideoUrl().isEmpty()) + result.setVideoUrl(fixUrl(url, result.getVideoUrl())); + + if (!result.getRssUrl().isEmpty()) + result.setRssUrl(fixUrl(url, result.getRssUrl())); } result.setText(lessText(result.getText())); synchronized (result) { @@ -283,6 +296,20 @@ public class HtmlFetcher { return result; } + // Ugly hack to break free from any cached versions, a few URLs required this. 
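+    // e.g. "http://example.com/a?x=1" -> "http://example.com/a?x=1&1";
+    // note that a URL with no query string at all also takes the else
+    // branch, so "http://example.com/a" -> "http://example.com/a&1".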
+ public static String getURLtoBreakCache(String url) { + try { + URL aURL = new URL(url); + if (aURL.getQuery() != null && aURL.getQuery().isEmpty()) { + return url + "?1"; + } else { + return url + "&1"; + } + } catch (MalformedURLException e) { + return url; + } + } + public String lessText(String text) { if (text == null) return ""; @@ -297,13 +324,14 @@ public class HtmlFetcher { return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); } - public String fetchAsString(String urlAsString, int timeout) throws - IOException { + public String fetchAsString(String urlAsString, int timeout) + throws MalformedURLException, IOException { return fetchAsString(urlAsString, timeout, true); } + // main routine to get raw webpage content public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) - throws IOException { + throws MalformedURLException, IOException { HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions); hConn.setInstanceFollowRedirects(true); String encoding = hConn.getContentEncoding(); @@ -317,27 +345,23 @@ public class HtmlFetcher { } String enc = Converter.extractEncoding(hConn.getContentType()); - String res = createConverter(urlAsString).streamToString(is, enc); - Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString); - return res; + return createConverter(urlAsString).streamToString(is, enc); } - public Converter createConverter(String url) { + public static Converter createConverter(String url) { return new Converter(url); } /** * On some devices we have to hack: - * http://developers.sun.com/mobility/reference - * /techart/design_guidelines/http_redirection.html - * - * @param timeout - * Sets a specified timeout value, in milliseconds + * http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html + * + * @param timeout Sets a specified timeout value, in milliseconds * @return the resolved url if any. Or null if it couldn't resolve the url - * (within the specified time) or the same url if response code is - * OK + * (within the specified time) or the same url if response code is OK */ - public String getResolvedUrl(String urlAsString, int timeout) { + public String getResolvedUrl(String urlAsString, int timeout, + int num_redirects) { String newUrl = null; int responseCode = -1; try { @@ -354,28 +378,32 @@ public class HtmlFetcher { return urlAsString; newUrl = hConn.getHeaderField("Location"); - if (responseCode / 100 == 3 && newUrl != null) { + // Note that the max recursion level is 5. + if (responseCode / 100 == 3 && newUrl != null && num_redirects < 5) { newUrl = newUrl.replaceAll(" ", "+"); - // some services use (none-standard) utf8 in their location - // header + // some services use (none-standard) utf8 in their location header if (urlAsString.startsWith("http://bit.ly") || urlAsString.startsWith("http://is.gd")) newUrl = encodeUriFromHeader(newUrl); - // fix problems if shortened twice. as it is often the case - // after twitters' t.co bullshit - if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) - newUrl = getResolvedUrl(newUrl, timeout); - + // AP: This code is not longer need, instead we always follow + // multiple redirects. + // + // fix problems if shortened twice. 
as it is often the case after twitters' t.co bullshit + //if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) + // newUrl = getResolvedUrl(newUrl, timeout); + + // Add support for URLs with multiple levels of redirection, + // call getResolvedUrl until there is no more redirects or a + // max number of redirects is reached. + newUrl = SHelper.useDomainOfFirstArg4Second(urlAsString, newUrl); + newUrl = getResolvedUrl(newUrl, timeout, num_redirects + 1); return newUrl; } else return urlAsString; } catch (Exception ex) { - Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage()); return ""; - } finally { - Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl); } } @@ -400,9 +428,9 @@ public class HtmlFetcher { } protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout, - boolean includeSomeGooseOptions) throws IOException { + boolean includeSomeGooseOptions) throws MalformedURLException, IOException { URL url = new URL(urlAsStr); - // using proxy may increase latency + //using proxy may increase latency HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY); hConn.setRequestProperty("User-Agent", userAgent); hConn.setRequestProperty("Accept", accept); @@ -415,8 +443,7 @@ public class HtmlFetcher { hConn.setRequestProperty("Cache-Control", cacheControl); } - // suggest respond to be gzipped or deflated (which is just another - // compression) + // suggest respond to be gzipped or deflated (which is just another compression) // http://stackoverflow.com/q/3932117 hConn.setRequestProperty("Accept-Encoding", "gzip, deflate"); hConn.setConnectTimeout(timeout); @@ -424,14 +451,12 @@ public class HtmlFetcher { return hConn; } - private JResult getFromCache(String url, String originalUrl) throws Exception { + private JResult getFromCache(String url, String originalUrl) { if (cache != null) { JResult res = cache.get(url); if (res != null) { - // e.g. the cache returned a shortened url as original url now - // we want to store the - // current original url! Also it can be that the cache response - // to url but the JResult + // e.g. the cache returned a shortened url as original url now we want to store the + // current original url! Also it can be that the cache response to url but the JResult // does not contain it so overwrite it: res.setUrl(url); res.setOriginalUrl(originalUrl); @@ -441,4 +466,4 @@ public class HtmlFetcher { } return null; } -} +} \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/JResult.java b/app/src/main/java/acr/browser/lightning/reading/JResult.java index 1b4a23f..dc97de2 100644 --- a/app/src/main/java/acr/browser/lightning/reading/JResult.java +++ b/app/src/main/java/acr/browser/lightning/reading/JResult.java @@ -16,14 +16,18 @@ package acr.browser.lightning.reading; import java.io.Serializable; -import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; + /** * Parsed result from web page containing important title, text and image. 
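+ * Most string getters (getTitle(), getText(), getUrl(), ...) normalize a
+ * missing value to "", while the newer getType()/getSitename()/getLanguage()
+ * return the field as-is.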
- * + * * @author Peter Karich */ public class JResult implements Serializable { @@ -38,10 +42,15 @@ public class JResult implements Serializable { private String text; private String faviconUrl; private String description; - private String dateString; - private List textList; + private String authorName; + private String authorDescription; + private Date date; private Collection keywords; private List images = null; + private List> links = new ArrayList<>(); + private String type; + private String sitename; + private String language; public JResult() { } @@ -108,6 +117,28 @@ public class JResult implements Serializable { return this; } + public String getAuthorName() { + if (authorName == null) + return ""; + return authorName; + } + + public JResult setAuthorName(String authorName) { + this.authorName = authorName; + return this; + } + + public String getAuthorDescription() { + if (authorDescription == null) + return ""; + return authorDescription; + } + + public JResult setAuthorDescription(String authorDescription) { + this.authorDescription = authorDescription; + return this; + } + public String getImageUrl() { if (imageUrl == null) return ""; @@ -131,17 +162,6 @@ public class JResult implements Serializable { return this; } - public List getTextList() { - if (this.textList == null) - return new ArrayList<>(); - return this.textList; - } - - public JResult setTextList(List textList) { - this.textList = textList; - return this; - } - public String getTitle() { if (title == null) return ""; @@ -164,8 +184,8 @@ public class JResult implements Serializable { return this; } - public JResult setDate(String date) { - this.dateString = date; + public JResult setDate(Date date) { + this.date = date; return this; } @@ -180,8 +200,8 @@ public class JResult implements Serializable { /** * @return get date from url or guessed from text */ - public String getDate() { - return dateString; + public Date getDate() { + return date; } /** @@ -209,8 +229,46 @@ public class JResult implements Serializable { this.images = images; } + public void addLink(String url, String text, Integer pos) { + Map link = new HashMap(); + link.put("url", url); + link.put("text", text); + link.put("offset", String.valueOf(pos)); + links.add(link); + } + + public List> getLinks() { + if (links == null) + return Collections.emptyList(); + return links; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getSitename() { + return sitename; + } + + public void setSitename(String sitename) { + this.sitename = sitename; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + @Override public String toString() { return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text; } -} +} \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java b/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java index 9e374a5..d05a26b 100644 --- a/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java +++ b/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java @@ -4,40 +4,46 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Locale; import java.util.regex.Pattern; + import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; /** * @author 
 * @author goose | jim
 * @author karussell
- *
- * this class will be responsible for taking our top node and stripping
- * out junk we don't want and getting it ready for how we want it
- * presented to the user
+ *

+ * this class will be responsible for taking our top node and stripping out junk
+ * we don't want and getting it ready for how we want it presented to the user
 */
public class OutputFormatter {

- public static final int MIN_PARAGRAPH_TEXT = 50;
+ private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph
+ private static final int MIN_PARAGRAPH_TEXT = 30; // Min size of any other paragraphs
 private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
 private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
- protected final int minParagraphText;
- protected final List<String> nodesToReplace;
- protected String nodesToKeepCssSelector = "p";
+ private final int minFirstParagraphText;
+ private final int minParagraphText;
+ private final List<String> nodesToReplace;
+ private String nodesToKeepCssSelector = "p, ol";

 public OutputFormatter() {
- this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
+ this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
 }

 public OutputFormatter(int minParagraphText) {
- this(minParagraphText, NODES_TO_REPLACE);
+ this(minParagraphText, minParagraphText, NODES_TO_REPLACE);
+ }
+
+ public OutputFormatter(int minFirstParagraphText, int minParagraphText) {
+ this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE);
 }

- public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
+ public OutputFormatter(int minFirstParagraphText, int minParagraphText,
+ List<String> nodesToReplace) {
+ this.minFirstParagraphText = minFirstParagraphText;
 this.minParagraphText = minParagraphText;
 this.nodesToReplace = nodesToReplace;
 }
@@ -53,36 +59,34 @@ public class OutputFormatter {
 * takes an element and turns the P tags into \n\n
 */
 public String getFormattedText(Element topNode) {
+ setParagraphIndex(topNode, nodesToKeepCssSelector);
 removeNodesWithNegativeScores(topNode);
 StringBuilder sb = new StringBuilder();
- append(topNode, sb, nodesToKeepCssSelector);
+ int countOfP = append(topNode, sb, nodesToKeepCssSelector);
 String str = SHelper.innerTrim(sb.toString());
- if (str.length() > 100)
+
+ int topNodeLength = topNode.text().length();
+ if (topNodeLength == 0) {
+ topNodeLength = 1;
+ }
+
+ boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25);
+ if (str.length() > 100 && countOfP > 0 && !lowTextRatio)
 return str;

 // no subelements
- if (str.isEmpty() || !topNode.text().isEmpty()
+ if (str.isEmpty() || (!topNode.text().isEmpty()
 && str.length() <= topNode.ownText().length())
+ || countOfP == 0 || lowTextRatio) {
 str = topNode.text();
+ }

- // if jsoup failed to parse the whole html now parse this smaller
+ // if jsoup failed to parse the whole html now parse this smaller
 // snippet again to avoid html tags disturbing our text:
 return Jsoup.parse(str).text();
 }

- /**
- * Takes an element and returns a list of texts extracted from the P tags
- */
- public List<String> getTextList(Element topNode) {
- List<String> texts = new ArrayList<>();
- for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
- if (element.hasText()) {
- texts.add(element.text());
- }
- }
- return texts;
- }
-
 /**
 * If there are elements inside our top node that have a negative gravity
 * score remove them
@@ -90,15 +94,20 @@ public class OutputFormatter {
 protected void removeNodesWithNegativeScores(Element topNode) {
 Elements gravityItems = topNode.select("*[gravityScore]");
 for (Element item : gravityItems) {
- int score = Integer.parseInt(item.attr("gravityScore"));
- if (score < 0 || item.text().length() < minParagraphText)
+ int score = getScore(item);
+ int paragraphIndex = getParagraphIndex(item);
+ if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) {
 item.remove();
+ }
 }
 }

- protected void append(Element node, StringBuilder sb, String tagName) {
+ protected int append(Element node, StringBuilder sb, String tagName) {
+ int countOfP = 0; // Number of P elements in the article
+ int paragraphWithTextIndex = 0;
 // is select more costly than getElementsByTag?
- MAIN: for (Element e : node.select(tagName)) {
+ MAIN:
+ for (Element e : node.select(tagName)) {
 Element tmpEl = e;
 // check all elements until 'node'
 while (tmpEl != null && !tmpEl.equals(node)) {
@@ -108,18 +117,56 @@ public class OutputFormatter {
 }

 String text = node2Text(e);
- if (text.isEmpty() || text.length() < minParagraphText
- || text.length() > SHelper.countLetters(text) * 2)
+ if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex)
+ || text.length() > SHelper.countLetters(text) * 2) {
 continue;
+ }
+
+ if (e.tagName().equals("p")) {
+ countOfP++;
+ }

 sb.append(text);
 sb.append("\n\n");
+ paragraphWithTextIndex += 1;
+ }
+
+ return countOfP;
+ }
+
+ protected static void setParagraphIndex(Element node, String tagName) {
+ int paragraphIndex = 0;
+ for (Element e : node.select(tagName)) {
+ e.attr("paragraphIndex", Integer.toString(paragraphIndex++));
+ }
+ }
+
+ protected int getMinParagraph(int paragraphIndex) {
+ if (paragraphIndex < 1) {
+ return minFirstParagraphText;
+ } else {
+ return minParagraphText;
+ }
+ }
+
+ protected static int getParagraphIndex(Element el) {
+ try {
+ return Integer.parseInt(el.attr("paragraphIndex"));
+ } catch (NumberFormatException ex) {
+ return -1;
+ }
+ }
+
+ protected static int getScore(Element el) {
+ try {
+ return Integer.parseInt(el.attr("gravityScore"));
+ } catch (Exception ex) {
+ return 0;
+ }
 }

 boolean unlikely(Node e) {
- if (e.attr("class") != null
- && e.attr("class").toLowerCase(Locale.getDefault()).contains("caption"))
+ if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
 return true;

 String style = e.attr("style");
@@ -127,36 +174,34 @@
 return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
 }

- void appendTextSkipHidden(Element e, StringBuilder accum) {
+ void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
 for (Node child : e.childNodes()) {
- if (unlikely(child))
+ if (unlikely(child)) {
 continue;
+ }
 if (child instanceof TextNode) {
 TextNode textNode = (TextNode) child;
 String txt = textNode.text();
 accum.append(txt);
 } else if (child instanceof Element) {
 Element element = (Element) child;
- if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
+ if (accum.length() > 0 && element.isBlock()
+ && !lastCharIsWhitespace(accum))
 accum.append(' ');
 else if (element.tagName().equals("br"))
 accum.append(' ');
- appendTextSkipHidden(element, accum);
+ appendTextSkipHidden(element, accum, indent + 1);
 }
 }
 }

- boolean lastCharIsWhitespace(StringBuilder accum) {
- return (accum.length() != 0) && Character.isWhitespace(accum.charAt(accum.length() - 1));
- }
-
- protected String node2TextOld(Element el) {
- return el.text();
+ static boolean lastCharIsWhitespace(StringBuilder accum) {
+ return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1));
 }

 protected String node2Text(Element el) {
 StringBuilder sb = new StringBuilder(200);
- appendTextSkipHidden(el, sb);
+ appendTextSkipHidden(el, sb, 0);
 return sb.toString();
 }

@@ -168,4 +213,4 @@
 public OutputFormatter appendUnlikelyPattern(String str) {
 return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str);
 }
-}
+}
\ No newline at end of file
diff --git a/app/src/main/java/acr/browser/lightning/reading/SHelper.java b/app/src/main/java/acr/browser/lightning/reading/SHelper.java
index 7274219..a71acdc 100644
--- a/app/src/main/java/acr/browser/lightning/reading/SHelper.java
+++ b/app/src/main/java/acr/browser/lightning/reading/SHelper.java
@@ -15,17 +15,19 @@
 */
package acr.browser.lightning.reading;

+import org.jsoup.nodes.Element;
+
import java.io.UnsupportedEncodingException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
-import java.text.SimpleDateFormat;
-import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -33,10 +35,8 @@ import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
-import org.jsoup.nodes.Element;

/**
- *
 * @author Peter Karich
 */
public class SHelper {
@@ -127,8 +127,7 @@ public class SHelper {
 return null;

 // dynamic programming => save already identical length into array
- // to understand this algo simply print identical length in every entry
- // of the array
+ // to understand this algo simply print identical length in every entry of the array
 // i+1, j+1 then reuses information from i,j
 // java initializes them already with 0
 int[][] num = new int[str1.length()][str2.length()];
@@ -152,7 +151,7 @@
 }
 }
 }
- return new int[] { lastSubstrBegin, endIndex };
+ return new int[]{lastSubstrBegin, endIndex};
 }

 public static String getDefaultFavicon(String url) {
@@ -160,35 +159,19 @@
 }

 /**
- * @param urlForDomain
- * extract the domain from this url
- * @param path
- * this url does not have a domain
- * @return returns the domain
+ * @param urlForDomain extract the domain from this url
+ * @param path the (possibly relative) url that lacks a domain
+ * @return the absolute url, or the unchanged path if it cannot be resolved
 */
 public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
- if (path.startsWith("http"))
+ try {
+ // See: http://stackoverflow.com/questions/1389184/building-an-absolute-url-from-a-relative-url-in-java
+ URL baseUrl = new URL(urlForDomain);
+ URL relativeUrl = new URL(baseUrl, path);
+ return relativeUrl.toString();
+ } catch (MalformedURLException ex) {
 return path;
-
- if ("favicon.ico".equals(path))
- path = "/favicon.ico";
-
- if (path.startsWith("//")) {
- // wikipedia special case, see tests
- if (urlForDomain.startsWith("https:"))
- return "https:" + path;
-
- return "http:" + path;
- } else if (path.startsWith("/"))
- return "http://" + extractHost(urlForDomain) + path;
- else if (path.startsWith("../")) {
- int slashIndex = urlForDomain.lastIndexOf("/");
- if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
- urlForDomain = urlForDomain.substring(0, slashIndex + 1);
-
- return urlForDomain + path;
 }
- return path;
 }

 public static String extractHost(String url) {
@@ -224,14 +207,12 @@
 }

 public static boolean isVideo(String url) {
- return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi")
url.endsWith(".avi") - || url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4") - || url.endsWith(".flv") || url.endsWith(".wmv"); + return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") || url.endsWith(".mov") + || url.endsWith(".mpg4") || url.endsWith(".mp4") || url.endsWith(".flv") || url.endsWith(".wmv"); } public static boolean isAudio(String url) { - return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") - || url.endsWith(".wav"); + return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") || url.endsWith(".wav"); } public static boolean isDoc(String url) { @@ -241,23 +222,20 @@ public class SHelper { public static boolean isPackage(String url) { return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip") - || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") - || url.endsWith(".7z"); + || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") || url.endsWith(".7z"); } public static boolean isApp(String url) { - return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") - || url.endsWith(".dmg"); + return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") || url.endsWith(".dmg"); } public static boolean isImage(String url) { return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif") - || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") - || url.endsWith(".eps"); + || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") || url.endsWith(".eps"); } /** - * http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se + * @see "http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se" */ public static void enableCookieMgmt() { CookieManager manager = new CookieManager(); @@ -266,7 +244,7 @@ public class SHelper { } /** - * http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection + * @see "http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection" */ public static void enableUserAgentOverwrite() { System.setProperty("http.agent", ""); @@ -377,8 +355,8 @@ public class SHelper { } else if (counter == monthCounter + 1) { try { day = Integer.parseInt(str); - } catch (Exception ex) { - ex.printStackTrace(); + } catch (Exception ignored) { + // ignored } if (day < 1 || day > 31) { day = -1; @@ -425,21 +403,11 @@ public class SHelper { return dateStr + "/01/01"; } - /** - * keep in mind: simpleDateFormatter is not thread safe! call completeDate - * before applying this formatter. 
- */ - public static SimpleDateFormat createDateFormatter() { - return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault()); - } - - // with the help of - // http://stackoverflow.com/questions/1828775/httpclient-and-ssl + // with the help of http://stackoverflow.com/questions/1828775/httpclient-and-ssl public static void enableAnySSL() { try { SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() }, - new SecureRandom()); + ctx.init(new KeyManager[0], new TrustManager[]{new DefaultTrustManager()}, new SecureRandom()); SSLContext.setDefault(ctx); } catch (Exception ex) { ex.printStackTrace(); @@ -449,13 +417,11 @@ public class SHelper { private static class DefaultTrustManager implements X509TrustManager { @Override - public void checkClientTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { + public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { } @Override - public void checkServerTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { + public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { } @Override @@ -473,4 +439,4 @@ public class SHelper { } return chars; } -} +} \ No newline at end of file
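
Usage note: with textList gone from JResult, callers consume the extraction result through plain string accessors, and outgoing links are exposed as maps. A minimal sketch of the new surface; the article url is made up, the no-arg HtmlFetcher constructor is assumed from snacktory, and the fetchAndExtract(url, 2500, true) call mirrors ReadingActivity's PageLoader:

    import acr.browser.lightning.reading.HtmlFetcher;
    import acr.browser.lightning.reading.JResult;

    import java.util.Map;

    public class JResultExample {
        public static void main(String[] args) throws Exception {
            HtmlFetcher fetcher = new HtmlFetcher();
            // hypothetical url; 2500 ms timeout with redirect resolution, as in PageLoader
            JResult result = fetcher.fetchAndExtract("http://example.com/article.html", 2500, true);

            System.out.println(result.getTitle());      // "" rather than null when absent
            System.out.println(result.getAuthorName()); // likewise null-safe
            System.out.println(result.getText());       // paragraphs already joined with \n\n

            // addLink(url, text, pos) stores each link as a map keyed "url", "text", "offset"
            for (Map<String, String> link : result.getLinks()) {
                System.out.println(link.get("offset") + " -> " + link.get("url"));
            }
        }
    }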
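The acceptance test in getFormattedText is the piece most worth illustrating: the \n\n-joined candidate string is only trusted when it is longer than 100 characters, contains at least one real p element, and covers at least a quarter of the top node's text; anything else falls back to topNode.text(). On top of that, the first paragraph must reach 50 characters and later ones 30. A self-contained sketch against jsoup, with an invented HTML snippet:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    import acr.browser.lightning.reading.OutputFormatter;

    public class FormatterExample {
        public static void main(String[] args) {
            Document doc = Jsoup.parse("<div>"
                    + "<p>A first paragraph that is comfortably longer than the fifty-character minimum.</p>"
                    + "<p>short</p>" // dropped: under the 30-character bar for later paragraphs
                    + "<p>A second paragraph that clears the thirty-character minimum easily.</p>"
                    + "</div>");

            // default thresholds: 50 chars for the first paragraph, 30 for the rest
            OutputFormatter formatter = new OutputFormatter();
            System.out.println(formatter.getFormattedText(doc.body()));
            // prints the first and third paragraphs separated by a blank line
        }
    }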
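Delegating useDomainOfFirstArg4Second to java.net.URL collapses the old hand-rolled prefix handling ("//", "/", "../") into one resolution path, and a protocol-relative url now inherits the base scheme, which covers the former wikipedia special case. A quick sketch with invented urls:

    import acr.browser.lightning.reading.SHelper;

    public class UrlResolveExample {
        public static void main(String[] args) {
            // absolute path resolved against the page's host
            System.out.println(SHelper.useDomainOfFirstArg4Second(
                    "http://example.com/news/story.html", "/favicon.ico"));
            // -> http://example.com/favicon.ico

            // protocol-relative url inherits the base scheme
            System.out.println(SHelper.useDomainOfFirstArg4Second(
                    "https://en.wikipedia.org/wiki/Foo", "//upload.wikimedia.org/logo.png"));
            // -> https://upload.wikimedia.org/logo.png

            // a malformed base falls back to returning the path untouched
            System.out.println(SHelper.useDomainOfFirstArg4Second("not a url", "/favicon.ico"));
            // -> /favicon.ico
        }
    }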