Improved reading mode thanks to changes from snacktory fork by skyshard

9 years ago · 367c62bd39
6 changed files with 1050 additions and 356 deletions
--- a/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
+++ b/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
@ -139,7 +139,7 @@ public class ReadingActivity extends AppCompatActivity {
				@@ -139,7 +139,7 @@ public class ReadingActivity extends AppCompatActivity {

        private final Activity mActivity;
        private String mTitleText;
-        private List<String> mBodyText;
+        private String mBodyText;

        public PageLoader(Activity activity) {
            mActivity = activity;
@ -163,15 +163,15 @@ public class ReadingActivity extends AppCompatActivity {
				@@ -163,15 +163,15 @@ public class ReadingActivity extends AppCompatActivity {
            try {
                JResult result = fetcher.fetchAndExtract(params[0], 2500, true);
                mTitleText = result.getTitle();
-                mBodyText = result.getTextList();
+                mBodyText = result.getText();
            } catch (Exception e) {
                mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                e.printStackTrace();
            } catch (OutOfMemoryError e) {
                System.gc();
                mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                e.printStackTrace();
            }
            return null;
@ -186,11 +186,7 @@ public class ReadingActivity extends AppCompatActivity {
				@@ -186,11 +186,7 @@ public class ReadingActivity extends AppCompatActivity {
            if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
                setText(getString(R.string.untitled), getString(R.string.loading_failed));
            } else {
-                StringBuilder builder = new StringBuilder();
-                for (String text : mBodyText) {
-                    builder.append(text).append("\n\n");
-                }
-                setText(mTitleText, builder.toString());
+                setText(mTitleText, mBodyText);
            }
            super.onPostExecute(result);
        }
--- a/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
+++ b/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
--- a/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java
+++ b/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java
@ -22,22 +22,19 @@ import java.io.FileWriter;
				@@ -22,22 +22,19 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
 import java.net.Proxy;
 import java.net.URL;
 import java.util.LinkedHashSet;
-import java.util.Locale;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.Inflater;
 import java.util.zip.InflaterInputStream;

-import acr.browser.lightning.constant.Constants;
-import android.util.Log;
-
 /**
 * Class to fetch articles. This class is thread safe.
- * 
+ *
 * @author Peter Karich
 */
 public class HtmlFetcher {
@ -64,7 +61,7 @@ public class HtmlFetcher {
				@@ -64,7 +61,7 @@ public class HtmlFetcher {
            else
                existing.add(domainStr);

-            String html = new HtmlFetcher().fetchAsString(url, 20000);
+            String html = new HtmlFetcher().fetchAsString(url, 2000);
            String outFile = domainStr + counterStr + ".html";
            BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
            writer.write(html);
@ -73,8 +70,8 @@ public class HtmlFetcher {
				@@ -73,8 +70,8 @@ public class HtmlFetcher {
        reader.close();
    }

-    private String referrer = "https://github.com/karussell/snacktory";
-    private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ')';
+    private String referrer = "http://jetsli.de/crawler";
+    private String userAgent = "Mozilla/5.0 (compatible; Jetslide; +" + referrer + ')';
    private String cacheControl = "max-age=0";
    private String language = "en-us";
    private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
@ -83,7 +80,7 @@ public class HtmlFetcher {
				@@ -83,7 +80,7 @@ public class HtmlFetcher {
    private final AtomicInteger cacheCounter = new AtomicInteger(0);
    private int maxTextLength = -1;
    private ArticleTextExtractor extractor = new ArticleTextExtractor();
-    private final Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
+    private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
        {
            add("bit.ly");
            add("cli.gs");
@ -202,6 +199,12 @@ public class HtmlFetcher {
				@@ -202,6 +199,12 @@ public class HtmlFetcher {
    }

    public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
+        return fetchAndExtract(url, timeout, resolve, 0, false);
+    }
+
+    // main workhorse to call externally
+    public JResult fetchAndExtract(String url, int timeout, boolean resolve,
+                                   int maxContentSize, boolean forceReload) throws Exception {
        String originalUrl = url;
        url = SHelper.removeHashbang(url);
        String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
@ -219,9 +222,8 @@ public class HtmlFetcher {
				@@ -219,9 +222,8 @@ public class HtmlFetcher {
            if (res != null)
                return res;

-            String resUrl = getResolvedUrl(url, timeout);
+            String resUrl = getResolvedUrl(url, timeout, 0);
            if (resUrl.isEmpty()) {
-                Log.d(Constants.TAG, "resolved url is empty. Url is: " + url);

                JResult result = new JResult();
                if (cache != null)
@ -229,10 +231,9 @@ public class HtmlFetcher {
				@@ -229,10 +231,9 @@ public class HtmlFetcher {
                return result.setUrl(url);
            }

-            // if resolved url is longer then use it!
-            if (resUrl.trim().length() > url.length()) {
-                // this is necessary e.g. for some homebaken url resolvers which
-                // return
+            // if resolved url is different then use it!
+            if (!resUrl.equals(url)) {
+                // this is necessary e.g. for some homebaken url resolvers which return
                // the resolved url relative to url!
                url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
            }
@ -244,20 +245,18 @@ public class HtmlFetcher {
				@@ -244,20 +245,18 @@ public class HtmlFetcher {
            return res;

        JResult result = new JResult();
-        // or should we use? <link rel="canonical"
-        // href="http://www.N24.de/news/newsitem_6797232.html"/>
+        // or should we use? <link rel="canonical" href="http://www.N24.de/news/newsitem_6797232.html"/>
        result.setUrl(url);
        result.setOriginalUrl(originalUrl);
-        result.setDate(SHelper.estimateDate(url));

-        // Immediately put the url into the cache as extracting content takes
-        // time.
+        // Immediately put the url into the cache as extracting content takes time.
        if (cache != null) {
            cache.put(originalUrl, result);
            cache.put(url, result);
        }

-        String lowerUrl = url.toLowerCase(Locale.getDefault());
+        // extract content to the extent appropriate for content type
+        String lowerUrl = url.toLowerCase();
        if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
            // skip
        } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
@ -265,16 +264,30 @@ public class HtmlFetcher {
				@@ -265,16 +264,30 @@ public class HtmlFetcher {
        } else if (SHelper.isImage(lowerUrl)) {
            result.setImageUrl(url);
        } else {
-            extractor.extractContent(result, fetchAsString(url, timeout));
+            try {
+                String urlToDownload = url;
+                if (forceReload) {
+                    urlToDownload = getURLtoBreakCache(url);
+                }
+                extractor.extractContent(result, fetchAsString(urlToDownload, timeout), maxContentSize);
+            } catch (IOException io) {
+                // do nothing
+            }
            if (result.getFaviconUrl().isEmpty())
                result.setFaviconUrl(SHelper.getDefaultFavicon(url));

-            // some links are relative to root and do not include the domain of
-            // the url :(
-            result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
-            result.setImageUrl(fixUrl(url, result.getImageUrl()));
-            result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
-            result.setRssUrl(fixUrl(url, result.getRssUrl()));
+            // some links are relative to root and do not include the domain of the url :(
+            if (!result.getFaviconUrl().isEmpty())
+                result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
+
+            if (!result.getImageUrl().isEmpty())
+                result.setImageUrl(fixUrl(url, result.getImageUrl()));
+
+            if (!result.getVideoUrl().isEmpty())
+                result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
+
+            if (!result.getRssUrl().isEmpty())
+                result.setRssUrl(fixUrl(url, result.getRssUrl()));
        }
        result.setText(lessText(result.getText()));
        synchronized (result) {
@ -283,6 +296,20 @@ public class HtmlFetcher {
				@@ -283,6 +296,20 @@ public class HtmlFetcher {
        return result;
    }

+    // Cache-busting hack: append a dummy "1" parameter ("?1" when the URL has no query yet).
+    public static String getURLtoBreakCache(String url) {
+        try {
+            String query = new URL(url).getQuery();
+            if (query == null) {
+                return url + "?1";
+            }
+            // a bare trailing "?" (empty query) needs no extra separator
+            return query.isEmpty() ? url + "1" : url + "&1";
+        } catch (MalformedURLException e) {
+            return url; // unparseable URL: return it unchanged
+        }
+    }
+
    public String lessText(String text) {
        if (text == null)
            return "";
@ -297,13 +324,14 @@ public class HtmlFetcher {
				@@ -297,13 +324,14 @@ public class HtmlFetcher {
        return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
    }

-    public String fetchAsString(String urlAsString, int timeout) throws
-            IOException {
+    public String fetchAsString(String urlAsString, int timeout)
+            throws MalformedURLException, IOException {
        return fetchAsString(urlAsString, timeout, true);
    }

+    // main routine to get raw webpage content
    public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
-            throws IOException {
+            throws MalformedURLException, IOException {
        HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
        hConn.setInstanceFollowRedirects(true);
        String encoding = hConn.getContentEncoding();
@ -317,27 +345,23 @@ public class HtmlFetcher {
				@@ -317,27 +345,23 @@ public class HtmlFetcher {
        }

        String enc = Converter.extractEncoding(hConn.getContentType());
-        String res = createConverter(urlAsString).streamToString(is, enc);
-        Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString);
-        return res;
+        return createConverter(urlAsString).streamToString(is, enc);
    }

-    public Converter createConverter(String url) {
+    public static Converter createConverter(String url) {
        return new Converter(url);
    }

    /**
     * On some devices we have to hack:
-     * http://developers.sun.com/mobility/reference
-     * /techart/design_guidelines/http_redirection.html
-     * 
-     * @param timeout
-     *            Sets a specified timeout value, in milliseconds
+     * http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html
+     *
+     * @param timeout Sets a specified timeout value, in milliseconds
     * @return the resolved url if any. Or null if it couldn't resolve the url
-     *         (within the specified time) or the same url if response code is
-     *         OK
+     * (within the specified time) or the same url if response code is OK
     */
-    public String getResolvedUrl(String urlAsString, int timeout) {
+    public String getResolvedUrl(String urlAsString, int timeout,
+                                 int num_redirects) {
        String newUrl = null;
        int responseCode = -1;
        try {
@ -354,28 +378,32 @@ public class HtmlFetcher {
				@@ -354,28 +378,32 @@ public class HtmlFetcher {
                return urlAsString;

            newUrl = hConn.getHeaderField("Location");
-            if (responseCode / 100 == 3 && newUrl != null) {
+            // Note that the max recursion level is 5.
+            if (responseCode / 100 == 3 && newUrl != null && num_redirects < 5) {
                newUrl = newUrl.replaceAll(" ", "+");
-                // some services use (none-standard) utf8 in their location
-                // header
+                // some services use (non-standard) utf8 in their location header
                if (urlAsString.startsWith("http://bit.ly")
                        || urlAsString.startsWith("http://is.gd"))
                    newUrl = encodeUriFromHeader(newUrl);

-                // fix problems if shortened twice. as it is often the case
-                // after twitters' t.co bullshit
-                if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
-                    newUrl = getResolvedUrl(newUrl, timeout);
-
+                // AP: This code is no longer needed; instead we always follow
+                // multiple redirects.
+                //
+                // fix problems if shortened twice. as it is often the case after twitters' t.co bullshit
+                //if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
+                //    newUrl = getResolvedUrl(newUrl, timeout);
+
+                // Add support for URLs with multiple levels of redirection:
+                // call getResolvedUrl until there are no more redirects or the
+                // max number of redirects is reached.
+                newUrl = SHelper.useDomainOfFirstArg4Second(urlAsString, newUrl);
+                newUrl = getResolvedUrl(newUrl, timeout, num_redirects + 1);
                return newUrl;
            } else
                return urlAsString;

        } catch (Exception ex) {
-            Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage());
            return "";
-        } finally {
-            Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl);
        }
    }

@ -400,9 +428,9 @@ public class HtmlFetcher {
				@@ -400,9 +428,9 @@ public class HtmlFetcher {
    }

    protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
-            boolean includeSomeGooseOptions) throws IOException {
+                                                    boolean includeSomeGooseOptions) throws MalformedURLException, IOException {
        URL url = new URL(urlAsStr);
-        // using proxy may increase latency
+        //using proxy may increase latency
        HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
        hConn.setRequestProperty("User-Agent", userAgent);
        hConn.setRequestProperty("Accept", accept);
@ -415,8 +443,7 @@ public class HtmlFetcher {
				@@ -415,8 +443,7 @@ public class HtmlFetcher {
            hConn.setRequestProperty("Cache-Control", cacheControl);
        }

-        // suggest respond to be gzipped or deflated (which is just another
-        // compression)
+        // ask for the response to be gzipped or deflated (which is just another compression)
        // http://stackoverflow.com/q/3932117
        hConn.setRequestProperty("Accept-Encoding", "gzip, deflate");
        hConn.setConnectTimeout(timeout);
@ -424,14 +451,12 @@ public class HtmlFetcher {
				@@ -424,14 +451,12 @@ public class HtmlFetcher {
        return hConn;
    }

-    private JResult getFromCache(String url, String originalUrl) throws Exception {
+    private JResult getFromCache(String url, String originalUrl) {
        if (cache != null) {
            JResult res = cache.get(url);
            if (res != null) {
-                // e.g. the cache returned a shortened url as original url now
-                // we want to store the
-                // current original url! Also it can be that the cache response
-                // to url but the JResult
+                // e.g. the cache returned a shortened url as original url now we want to store the
+                // current original url! Also it can be that the cache response to url but the JResult
                // does not contain it so overwrite it:
                res.setUrl(url);
                res.setOriginalUrl(originalUrl);
@ -441,4 +466,4 @@ public class HtmlFetcher {
				@@ -441,4 +466,4 @@ public class HtmlFetcher {
        }
        return null;
    }
-}
+}
--- a/app/src/main/java/acr/browser/lightning/reading/JResult.java
+++ b/app/src/main/java/acr/browser/lightning/reading/JResult.java
@ -16,14 +16,18 @@
				@@ -16,14 +16,18 @@
 package acr.browser.lightning.reading;

 import java.io.Serializable;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+

 /**
 * Parsed result from web page containing important title, text and image.
- * 
+ *
 * @author Peter Karich
 */
 public class JResult implements Serializable {
@ -38,10 +42,15 @@ public class JResult implements Serializable {
				@@ -38,10 +42,15 @@ public class JResult implements Serializable {
    private String text;
    private String faviconUrl;
    private String description;
-    private String dateString;
-    private List<String> textList;
+    private String authorName;
+    private String authorDescription;
+    private Date date;
    private Collection<String> keywords;
    private List<ImageResult> images = null;
+    private List<Map<String, String>> links = new ArrayList<>();
+    private String type;
+    private String sitename;
+    private String language;

    public JResult() {
    }
@ -108,6 +117,28 @@ public class JResult implements Serializable {
				@@ -108,6 +117,28 @@ public class JResult implements Serializable {
        return this;
    }

+    public String getAuthorName() {
+        if (authorName == null)
+            return "";
+        return authorName;
+    }
+
+    public JResult setAuthorName(String authorName) {
+        this.authorName = authorName;
+        return this;
+    }
+
+    public String getAuthorDescription() {
+        if (authorDescription == null)
+            return "";
+        return authorDescription;
+    }
+
+    public JResult setAuthorDescription(String authorDescription) {
+        this.authorDescription = authorDescription;
+        return this;
+    }
+
    public String getImageUrl() {
        if (imageUrl == null)
            return "";
@ -131,17 +162,6 @@ public class JResult implements Serializable {
				@@ -131,17 +162,6 @@ public class JResult implements Serializable {
        return this;
    }

-    public List<String> getTextList() {
-        if (this.textList == null)
-            return new ArrayList<>();
-        return this.textList;
-    }
-
-    public JResult setTextList(List<String> textList) {
-        this.textList = textList;
-        return this;
-    }
-
    public String getTitle() {
        if (title == null)
            return "";
@ -164,8 +184,8 @@ public class JResult implements Serializable {
				@@ -164,8 +184,8 @@ public class JResult implements Serializable {
        return this;
    }

-    public JResult setDate(String date) {
-        this.dateString = date;
+    public JResult setDate(Date date) {
+        this.date = date;
        return this;
    }

@ -180,8 +200,8 @@ public class JResult implements Serializable {
				@@ -180,8 +200,8 @@ public class JResult implements Serializable {
    /**
     * @return get date from url or guessed from text
     */
-    public String getDate() {
-        return dateString;
+    public Date getDate() {
+        return date;
    }

    /**
@ -209,8 +229,46 @@ public class JResult implements Serializable {
				@@ -209,8 +229,46 @@ public class JResult implements Serializable {
        this.images = images;
    }

+    public void addLink(String url, String text, Integer pos) {
+        Map<String, String> link = new HashMap<>(); // typed to match the element type of links
+        link.put("url", url);
+        link.put("text", text);
+        link.put("offset", String.valueOf(pos)); // position is stored in its string form
+        links.add(link);
+    }
+
+    public List<Map<String, String>> getLinks() {
+        if (links == null)
+            return Collections.emptyList();
+        return links;
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(String type) {
+        this.type = type;
+    }
+
+    public String getSitename() {
+        return sitename;
+    }
+
+    public void setSitename(String sitename) {
+        this.sitename = sitename;
+    }
+
+    public String getLanguage() {
+        return language;
+    }
+
+    public void setLanguage(String language) {
+        this.language = language;
+    }
+
    @Override
    public String toString() {
        return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text;
    }
-}
+}
--- a/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java
+++ b/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java
@ -4,40 +4,46 @@ import org.jsoup.Jsoup;
				@@ -4,40 +4,46 @@ import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;

-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import java.util.Locale;
 import java.util.regex.Pattern;
+
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;

 /**
 * @author goose | jim
 * @author karussell
- * 
- *         this class will be responsible for taking our top node and stripping
- *         out junk we don't want and getting it ready for how we want it
- *         presented to the user
+ *         <p/>
+ *         this class will be responsible for taking our top node and stripping out junk
+ *         we don't want and getting it ready for how we want it presented to the user
 */
 public class OutputFormatter {

-    public static final int MIN_PARAGRAPH_TEXT = 50;
+    private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph
+    private static final int MIN_PARAGRAPH_TEXT = 30;       // Min size of any other paragraphs
    private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
    private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
-    protected final int minParagraphText;
-    protected final List<String> nodesToReplace;
-    protected String nodesToKeepCssSelector = "p";
+    private final int minFirstParagraphText;
+    private final int minParagraphText;
+    private final List<String> nodesToReplace;
+    private String nodesToKeepCssSelector = "p, ol";

    public OutputFormatter() {
-        this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
+        this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
    }

    public OutputFormatter(int minParagraphText) {
-        this(minParagraphText, NODES_TO_REPLACE);
+        this(minParagraphText, minParagraphText, NODES_TO_REPLACE);
+    }
+
+    public OutputFormatter(int minFirstParagraphText, int minParagraphText) {
+        this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE);
    }

-    public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
+    public OutputFormatter(int minFirstParagraphText, int minParagraphText,
+                           List<String> nodesToReplace) {
+        this.minFirstParagraphText = minFirstParagraphText;
        this.minParagraphText = minParagraphText;
        this.nodesToReplace = nodesToReplace;
    }
@ -53,36 +59,34 @@ public class OutputFormatter {
				@@ -53,36 +59,34 @@ public class OutputFormatter {
     * takes an element and turns the P tags into \n\n
     */
    public String getFormattedText(Element topNode) {
+        setParagraphIndex(topNode, nodesToKeepCssSelector);
        removeNodesWithNegativeScores(topNode);
        StringBuilder sb = new StringBuilder();
-        append(topNode, sb, nodesToKeepCssSelector);
+        int countOfP = append(topNode, sb, nodesToKeepCssSelector);
        String str = SHelper.innerTrim(sb.toString());
-        if (str.length() > 100)
+
+        int topNodeLength = topNode.text().length();
+        if (topNodeLength == 0) {
+            topNodeLength = 1;
+        }
+
+
+        boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25);
+        if (str.length() > 100 && countOfP > 0 && !lowTextRatio)
            return str;

        // no subelements
-        if (str.isEmpty() || !topNode.text().isEmpty()
+        if (str.isEmpty() || (!topNode.text().isEmpty()
                && str.length() <= topNode.ownText().length())
+                || countOfP == 0 || lowTextRatio) {
            str = topNode.text();
+        }

-        // if jsoup failed to parse the whole html now parse this smaller
+        // if jsoup failed to parse the whole html now parse this smaller 
        // snippet again to avoid html tags disturbing our text:
        return Jsoup.parse(str).text();
    }

-    /**
-     * Takes an element and returns a list of texts extracted from the P tags
-     */
-    public List<String> getTextList(Element topNode) {
-        List<String> texts = new ArrayList<>();
-        for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
-            if (element.hasText()) {
-                texts.add(element.text());
-            }
-        }
-        return texts;
-    }
-
    /**
     * If there are elements inside our top node that have a negative gravity
     * score remove them
@ -90,15 +94,20 @@ public class OutputFormatter {
				@@ -90,15 +94,20 @@ public class OutputFormatter {
    protected void removeNodesWithNegativeScores(Element topNode) {
        Elements gravityItems = topNode.select("*[gravityScore]");
        for (Element item : gravityItems) {
-            int score = Integer.parseInt(item.attr("gravityScore"));
-            if (score < 0 || item.text().length() < minParagraphText)
+            int score = getScore(item);
+            int paragraphIndex = getParagraphIndex(item);
+            if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) {
                item.remove();
+            }
        }
    }

-    protected void append(Element node, StringBuilder sb, String tagName) {
+    protected int append(Element node, StringBuilder sb, String tagName) {
+        int countOfP = 0; // Number of P elements in the article
+        int paragraphWithTextIndex = 0;
        // is select more costly then getElementsByTag?
-        MAIN: for (Element e : node.select(tagName)) {
+        MAIN:
+        for (Element e : node.select(tagName)) {
            Element tmpEl = e;
            // check all elements until 'node'
            while (tmpEl != null && !tmpEl.equals(node)) {
@ -108,18 +117,56 @@ public class OutputFormatter {
				@@ -108,18 +117,56 @@ public class OutputFormatter {
            }

            String text = node2Text(e);
-            if (text.isEmpty() || text.length() < minParagraphText
-                    || text.length() > SHelper.countLetters(text) * 2)
+            if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex)
+                    || text.length() > SHelper.countLetters(text) * 2) {
                continue;
+            }
+
+            if (e.tagName().equals("p")) {
+                countOfP++;
+            }

            sb.append(text);
            sb.append("\n\n");
+            paragraphWithTextIndex += 1;
+        }
+
+        return countOfP;
+    }
+
+    protected static void setParagraphIndex(Element node, String tagName) {
+        int paragraphIndex = 0;
+        for (Element e : node.select(tagName)) {
+            e.attr("paragraphIndex", Integer.toString(paragraphIndex++));
+        }
+    }
+
+    protected int getMinParagraph(int paragraphIndex) {
+        if (paragraphIndex < 1) {
+            return minFirstParagraphText;
+        } else {
+            return minParagraphText;
+        }
+    }
+
+    protected static int getParagraphIndex(Element el) {
+        try {
+            return Integer.parseInt(el.attr("paragraphIndex"));
+        } catch (NumberFormatException ex) {
+            return -1;
+        }
+    }
+
+    protected static int getScore(Element el) {
+        try {
+            return Integer.parseInt(el.attr("gravityScore"));
+        } catch (Exception ex) {
+            return 0;
        }
    }

    boolean unlikely(Node e) {
-        if (e.attr("class") != null
-                && e.attr("class").toLowerCase(Locale.getDefault()).contains("caption"))
+        if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
            return true;

        String style = e.attr("style");
@ -127,36 +174,34 @@ public class OutputFormatter {
				@@ -127,36 +174,34 @@ public class OutputFormatter {
        return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
    }

-    void appendTextSkipHidden(Element e, StringBuilder accum) {
+    void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
        for (Node child : e.childNodes()) {
-            if (unlikely(child))
+            if (unlikely(child)) {
                continue;
+            }
            if (child instanceof TextNode) {
                TextNode textNode = (TextNode) child;
                String txt = textNode.text();
                accum.append(txt);
            } else if (child instanceof Element) {
                Element element = (Element) child;
-                if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
+                if (accum.length() > 0 && element.isBlock()
+                        && !lastCharIsWhitespace(accum))
                    accum.append(' ');
                else if (element.tagName().equals("br"))
                    accum.append(' ');
-                appendTextSkipHidden(element, accum);
+                appendTextSkipHidden(element, accum, indent + 1);
            }
        }
    }

-    boolean lastCharIsWhitespace(StringBuilder accum) {
-        return (accum.length() != 0) && Character.isWhitespace(accum.charAt(accum.length() - 1));
-    }
-
-    protected String node2TextOld(Element el) {
-        return el.text();
+    static boolean lastCharIsWhitespace(StringBuilder accum) {
+        return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1));
    }

    protected String node2Text(Element el) {
        StringBuilder sb = new StringBuilder(200);
-        appendTextSkipHidden(el, sb);
+        appendTextSkipHidden(el, sb, 0);
        return sb.toString();
    }

@ -168,4 +213,4 @@ public class OutputFormatter {
				@@ -168,4 +213,4 @@ public class OutputFormatter {
    public OutputFormatter appendUnlikelyPattern(String str) {
        return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str);
    }
-}
+}
--- a/app/src/main/java/acr/browser/lightning/reading/SHelper.java
+++ b/app/src/main/java/acr/browser/lightning/reading/SHelper.java
@ -15,17 +15,19 @@
				@@ -15,17 +15,19 @@
 */
 package acr.browser.lightning.reading;

+import org.jsoup.nodes.Element;
+
 import java.io.UnsupportedEncodingException;
 import java.net.CookieHandler;
 import java.net.CookieManager;
 import java.net.CookiePolicy;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.security.SecureRandom;
 import java.security.cert.CertificateException;
 import java.security.cert.X509Certificate;
-import java.text.SimpleDateFormat;
-import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -33,10 +35,8 @@ import javax.net.ssl.KeyManager;
				@@ -33,10 +35,8 @@ import javax.net.ssl.KeyManager;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.TrustManager;
 import javax.net.ssl.X509TrustManager;
-import org.jsoup.nodes.Element;

 /**
- * 
 * @author Peter Karich
 */
 public class SHelper {
@ -127,8 +127,7 @@ public class SHelper {
				@@ -127,8 +127,7 @@ public class SHelper {
            return null;

        // dynamic programming => save already identical length into array
-        // to understand this algo simply print identical length in every entry
-        // of the array
+        // to understand this algo simply print identical length in every entry of the array
        // i+1, j+1 then reuses information from i,j
        // java initializes them already with 0
        int[][] num = new int[str1.length()][str2.length()];
@ -152,7 +151,7 @@ public class SHelper {
				@@ -152,7 +151,7 @@ public class SHelper {
                }
            }
        }
-        return new int[] { lastSubstrBegin, endIndex };
+        return new int[]{lastSubstrBegin, endIndex};
    }

    public static String getDefaultFavicon(String url) {
@ -160,35 +159,19 @@ public class SHelper {
				@@ -160,35 +159,19 @@ public class SHelper {
    }

    /**
-     * @param urlForDomain
-     *            extract the domain from this url
-     * @param path
-     *            this url does not have a domain
-     * @return returns the domain
+     * @param urlForDomain extract the domain from this url
+     * @param path         this url does not have a domain
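+     * @return the absolute url obtained by resolving {@code path} against {@code urlForDomain}, or {@code path} unchanged if a MalformedURLException is thrown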
     */
    public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
-        if (path.startsWith("http"))
+        try {
+            // See: http://stackoverflow.com/questions/1389184/building-an-absolute-url-from-a-relative-url-in-java
+            URL baseUrl = new URL(urlForDomain);
+            URL relativeurl = new URL(baseUrl, path);
+            return relativeurl.toString();
+        } catch (MalformedURLException ex) {
            return path;
-
-        if ("favicon.ico".equals(path))
-            path = "/favicon.ico";
-
-        if (path.startsWith("//")) {
-            // wikipedia special case, see tests
-            if (urlForDomain.startsWith("https:"))
-                return "https:" + path;
-
-            return "http:" + path;
-        } else if (path.startsWith("/"))
-            return "http://" + extractHost(urlForDomain) + path;
-        else if (path.startsWith("../")) {
-            int slashIndex = urlForDomain.lastIndexOf("/");
-            if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
-                urlForDomain = urlForDomain.substring(0, slashIndex + 1);
-
-            return urlForDomain + path;
        }
-        return path;
    }

    public static String extractHost(String url) {
@ -224,14 +207,12 @@ public class SHelper {
				@@ -224,14 +207,12 @@ public class SHelper {
    }

    public static boolean isVideo(String url) {
-        return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi")
-                || url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4")
-                || url.endsWith(".flv") || url.endsWith(".wmv");
+        return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") || url.endsWith(".mov")
+                || url.endsWith(".mpg4") || url.endsWith(".mp4") || url.endsWith(".flv") || url.endsWith(".wmv");
    }

    public static boolean isAudio(String url) {
-        return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u")
-                || url.endsWith(".wav");
+        return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") || url.endsWith(".wav");
    }

    public static boolean isDoc(String url) {
@ -241,23 +222,20 @@ public class SHelper {
				@@ -241,23 +222,20 @@ public class SHelper {

    public static boolean isPackage(String url) {
        return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip")
-                || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm")
-                || url.endsWith(".7z");
+                || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") || url.endsWith(".7z");
    }

    public static boolean isApp(String url) {
-        return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat")
-                || url.endsWith(".dmg");
+        return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") || url.endsWith(".dmg");
    }

    public static boolean isImage(String url) {
        return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif")
-                || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico")
-                || url.endsWith(".eps");
+                || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") || url.endsWith(".eps");
    }

    /**
-     * http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se
+     * @see "http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se"
     */
    public static void enableCookieMgmt() {
        CookieManager manager = new CookieManager();
@ -266,7 +244,7 @@ public class SHelper {
				@@ -266,7 +244,7 @@ public class SHelper {
    }

    /**
-     * http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection
+     * @see "http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection"
     */
    public static void enableUserAgentOverwrite() {
        System.setProperty("http.agent", "");
@ -377,8 +355,8 @@ public class SHelper {
				@@ -377,8 +355,8 @@ public class SHelper {
                } else if (counter == monthCounter + 1) {
                    try {
                        day = Integer.parseInt(str);
-                    } catch (Exception ex) {
-                        ex.printStackTrace();
+                    } catch (Exception ignored) {
+                        // str is not a parsable integer: day is left unassigned here and the range check below validates it
                    }
                    if (day < 1 || day > 31) {
                        day = -1;
@ -425,21 +403,11 @@ public class SHelper {
				@@ -425,21 +403,11 @@ public class SHelper {
        return dateStr + "/01/01";
    }

-    /**
-     * keep in mind: simpleDateFormatter is not thread safe! call completeDate
-     * before applying this formatter.
-     */
-    public static SimpleDateFormat createDateFormatter() {
-        return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault());
-    }
-
-    // with the help of
-    // http://stackoverflow.com/questions/1828775/httpclient-and-ssl
+    // with the help of http://stackoverflow.com/questions/1828775/httpclient-and-ssl
    public static void enableAnySSL() {
        try {
            SSLContext ctx = SSLContext.getInstance("TLS");
-            ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() },
-                    new SecureRandom());
+            ctx.init(new KeyManager[0], new TrustManager[]{new DefaultTrustManager()}, new SecureRandom());
            SSLContext.setDefault(ctx);
        } catch (Exception ex) {
            ex.printStackTrace();
@ -449,13 +417,11 @@ public class SHelper {
				@@ -449,13 +417,11 @@ public class SHelper {
    private static class DefaultTrustManager implements X509TrustManager {

        @Override
-        public void checkClientTrusted(X509Certificate[] arg0, String arg1)
-                throws CertificateException {
+        public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
        }

        @Override
-        public void checkServerTrusted(X509Certificate[] arg0, String arg1)
-                throws CertificateException {
+        public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
        }

        @Override
@ -473,4 +439,4 @@ public class SHelper {
				@@ -473,4 +439,4 @@ public class SHelper {
        }
        return chars;
    }
-}
+}