diff --git a/AndroidManifest.xml b/AndroidManifest.xml
index 7b68d0c..5c2bcc0 100644
--- a/AndroidManifest.xml
+++ b/AndroidManifest.xml
@@ -200,6 +200,17 @@
+
+
+
+
+
+
+
diff --git a/libs/jsoup-1.8.1.jar b/libs/jsoup-1.8.1.jar
new file mode 100644
index 0000000..ae717d4
Binary files /dev/null and b/libs/jsoup-1.8.1.jar differ
diff --git a/res/layout/license_activity.xml b/res/layout/license_activity.xml
index 53a53f5..a9c024e 100644
--- a/res/layout/license_activity.xml
+++ b/res/layout/license_activity.xml
@@ -140,5 +140,39 @@
android:layout_marginLeft="10dp"
android:layout_marginRight="10dp"
android:background="#cdcdcd" />
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/res/layout/reading_view.xml b/res/layout/reading_view.xml
new file mode 100644
index 0000000..c086420
--- /dev/null
+++ b/res/layout/reading_view.xml
@@ -0,0 +1,39 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/res/menu-xlarge/main.xml b/res/menu-xlarge/main.xml
index edf26c2..b9b33df 100644
--- a/res/menu-xlarge/main.xml
+++ b/res/menu-xlarge/main.xml
@@ -1,73 +1,77 @@
-
-
-
\ No newline at end of file
diff --git a/res/values/strings.xml b/res/values/strings.xml
index cc56b3a..05287d3 100644
--- a/res/values/strings.xml
+++ b/res/values/strings.xml
@@ -198,4 +198,8 @@
Block 3rd Party Cookies
This feature is only available on Android 5.0+
Enable Color Mode
+ Reader Mode
+ Loading…
+ Couldn\'t load anything from the page.
+ Snacktory
diff --git a/src/acr/browser/lightning/BrowserActivity.java b/src/acr/browser/lightning/BrowserActivity.java
index c8a4e2b..966d80e 100644
--- a/src/acr/browser/lightning/BrowserActivity.java
+++ b/src/acr/browser/lightning/BrowserActivity.java
@@ -179,7 +179,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
mDrawerListRight.setDividerHeight(0);
setNavigationDrawerWidth();
mDrawerLayout.setDrawerListener(new DrawerLocker());
-
+
mWebpageBitmap = BitmapFactory.decodeResource(getResources(), R.drawable.ic_webpage);
mActionBar = getSupportActionBar();
final TypedArray styledAttributes = mContext.getTheme().obtainStyledAttributes(
@@ -350,7 +350,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
};
anim.setDuration(300);
anim.setInterpolator(new DecelerateInterpolator());
- anim.setAnimationListener(new AnimationListener(){
+ anim.setAnimationListener(new AnimationListener() {
@Override
public void onAnimationStart(Animation animation) {
@@ -368,7 +368,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onAnimationRepeat(Animation animation) {
}
-
+
});
new Handler().postDelayed(new Runnable() {
@@ -488,12 +488,12 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
checkForTor();
}
-
+
private class DrawerLocker implements DrawerListener {
@Override
public void onDrawerClosed(View v) {
- if(v == mDrawerRight){
+ if (v == mDrawerRight) {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerLeft);
} else {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerRight);
@@ -502,7 +502,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onDrawerOpened(View v) {
- if(v == mDrawerRight){
+ if (v == mDrawerRight) {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerLeft);
} else {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerRight);
@@ -516,7 +516,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onDrawerStateChanged(int arg) {
}
-
+
}
public boolean handleMenuItemClick(MenuItem item) {
@@ -596,6 +596,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
case R.id.action_find:
findInPage();
return true;
+ case R.id.action_reading_mode:
+ Intent read = new Intent(this, ReadingActivity.class);
+ read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
+ startActivity(read);
+ return true;
default:
return super.onOptionsItemSelected(item);
}
@@ -912,6 +917,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
case R.id.action_find:
findInPage();
return true;
+ case R.id.action_reading_mode:
+ Intent read = new Intent(this, ReadingActivity.class);
+ read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
+ startActivity(read);
+ return true;
default:
return super.onOptionsItemSelected(item);
}
@@ -1622,7 +1632,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
}
});
-
+
ViewCompat.jumpDrawablesToCurrentState(holder.exit);
LightningView web = data.get(position);
diff --git a/src/acr/browser/lightning/Constants.java b/src/acr/browser/lightning/Constants.java
index da89b36..cfb6477 100644
--- a/src/acr/browser/lightning/Constants.java
+++ b/src/acr/browser/lightning/Constants.java
@@ -29,6 +29,8 @@ public final class Constants {
public static final String JAVASCRIPT_INVERT_PAGE = "javascript:(function(){var e='img {-webkit-filter: invert(100%);'+'-moz-filter: invert(100%);'+'-o-filter: invert(100%);'+'-ms-filter: invert(100%); }',t=document.getElementsByTagName('head')[0],n=document.createElement('style');if(!window.counter){window.counter=1}else{window.counter++;if(window.counter%2==0){var e='html {-webkit-filter: invert(0%); -moz-filter: invert(0%); -o-filter: invert(0%); -ms-filter: invert(0%); }'}}n.type='text/css';if(n.styleSheet){n.styleSheet.cssText=e}else{n.appendChild(document.createTextNode(e))}t.appendChild(n)})();";
public static final String JAVASCRIPT_TEXT_REFLOW = "javascript:document.getElementsByTagName('body')[0].style.width=window.innerWidth+'px';";
+ public static final String LOAD_READING_URL = "ReadingUrl";
+
public static final String SEPARATOR = "\\|\\$\\|SEPARATOR\\|\\$\\|";
public static final String HTTP = "http://";
public static final String HTTPS = "https://";
diff --git a/src/acr/browser/lightning/LicenseActivity.java b/src/acr/browser/lightning/LicenseActivity.java
index 15e9642..7b1885c 100644
--- a/src/acr/browser/lightning/LicenseActivity.java
+++ b/src/acr/browser/lightning/LicenseActivity.java
@@ -30,6 +30,7 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
findViewById(R.id.licenseAOSP).setOnClickListener(this);
findViewById(R.id.licenseHosts).setOnClickListener(this);
findViewById(R.id.licenseOrbot).setOnClickListener(this);
+ findViewById(R.id.licenseSnactory).setOnClickListener(this);
}
@Override
@@ -47,6 +48,9 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
case R.id.licenseOrbot:
actionView("http://www.gnu.org/licenses/lgpl.html");
break;
+ case R.id.licenseSnactory:
+ actionView("http://www.apache.org/licenses/LICENSE-2.0");
+ break;
}
}
diff --git a/src/acr/browser/lightning/Reading/ArticleTextExtractor.java b/src/acr/browser/lightning/Reading/ArticleTextExtractor.java
new file mode 100644
index 0000000..3a9188a
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/ArticleTextExtractor.java
@@ -0,0 +1,619 @@
+package acr.browser.lightning.Reading;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import android.util.Log;
+
+/**
+ * This class is thread safe.
+ *
+ * @author Alex P (ifesdjeen from jreadability)
+ * @author Peter Karich
+ */
+public class ArticleTextExtractor {
+
+ // Interesting nodes: tag names that getNodes accepts as candidates
+ private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
+ // Unlikely candidates
+ private String unlikelyStr;
+ private Pattern UNLIKELY;
+ // Most likely positive candidates
+ private String positiveStr;
+ private Pattern POSITIVE;
+ // Most likely negative candidates
+ private String negativeStr;
+ private Pattern NEGATIVE;
+ // Inline style fragments that calcWeight penalises
+ private static final Pattern NEGATIVE_STYLE = Pattern
+         .compile("hidden|display: ?none|font-size: ?small");
+ // Lower-case title segments that cleanTitle drops. Built from a plain list
+ // instead of an anonymous LinkedHashSet subclass, and made read-only.
+ private static final Set<String> IGNORED_TITLE_PARTS = Collections
+         .unmodifiableSet(new LinkedHashSet<String>(Arrays.asList("hacker news", "facebook")));
+ private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
+ private OutputFormatter formatter = DEFAULT_FORMATTER;
+
+ /**
+ * Creates an extractor configured with the default unlikely, positive and
+ * negative expressions that calcWeight matches against element class names
+ * and ids.
+ */
+ public ArticleTextExtractor() {
+ // NOTE(review): the literal ending in "sponsor" is concatenated directly
+ // with "a(d|ll|...)" without a '|' in between, so the compiled alternative
+ // is "sponsora(d|ll|gegate|rchive|ttachment)". A separator looks intended;
+ // confirm before changing, because adding it alters the scoring.
+ setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
+ + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
+ + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
+ + "login|si(debar|gn|ngle)");
+ setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
+ + "|arti(cle|kel)|instapaper_body");
+ setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
+ }
+
+ /**
+ * Replaces the "unlikely" expression and recompiles its pattern.
+ *
+ * @param unlikelyStr
+ * regular expression source
+ * @return this extractor, for chaining
+ */
+ public ArticleTextExtractor setUnlikely(String unlikelyStr) {
+ // the string is stored before compiling, so it is kept even if compile throws
+ this.unlikelyStr = unlikelyStr;
+ UNLIKELY = Pattern.compile(unlikelyStr);
+ return this;
+ }
+
+ public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
+ return setUnlikely(unlikelyStr + "|" + unlikelyMatches);
+ }
+
+ /**
+ * Replaces the "positive" expression and recompiles its pattern.
+ *
+ * @param positiveStr
+ * regular expression source
+ * @return this extractor, for chaining
+ */
+ public ArticleTextExtractor setPositive(String positiveStr) {
+ // the string is stored before compiling, so it is kept even if compile throws
+ this.positiveStr = positiveStr;
+ POSITIVE = Pattern.compile(positiveStr);
+ return this;
+ }
+
+ public ArticleTextExtractor addPositive(String pos) {
+ return setPositive(positiveStr + "|" + pos);
+ }
+
+ /**
+ * Replaces the "negative" expression and recompiles its pattern.
+ *
+ * @param negativeStr
+ * regular expression source
+ * @return this extractor, for chaining
+ */
+ public ArticleTextExtractor setNegative(String negativeStr) {
+ // the string is stored before compiling, so it is kept even if compile throws
+ this.negativeStr = negativeStr;
+ NEGATIVE = Pattern.compile(negativeStr);
+ return this;
+ }
+
+ public ArticleTextExtractor addNegative(String neg) {
+ setNegative(negativeStr + "|" + neg);
+ return this;
+ }
+
+ /**
+ * Sets the formatter used by the extractContent overloads that do not take
+ * an explicit formatter.
+ */
+ public void setOutputFormatter(OutputFormatter formatter) {
+ this.formatter = formatter;
+ }
+
+ /**
+ * Extracts the article from an already parsed document into a new JResult,
+ * using this extractor's current output formatter.
+ *
+ * @param doc
+ * parsed document to extract from
+ * @return the populated result
+ */
+ public JResult extractContent(Document doc) throws Exception {
+ return extractContent(new JResult(), doc, formatter);
+ }
+
+ public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {
+ return extractContent(new JResult(), doc, formatter);
+ }
+
+ public JResult extractContent(String html) throws Exception {
+ return extractContent(new JResult(), html);
+ }
+
+ public JResult extractContent(JResult res, String html) throws Exception {
+ return extractContent(res, html, formatter);
+ }
+
+ /**
+ * Parses the HTML string with jsoup and extracts the article into res.
+ *
+ * @throws IllegalArgumentException
+ * if html is empty
+ */
+ public JResult extractContent(JResult res, String html, OutputFormatter formatter)
+ throws Exception {
+ // html is dereferenced without a null check, so null fails here with an NPE
+ if (html.isEmpty())
+ throw new IllegalArgumentException("html string is empty!?");
+
+ // http://jsoup.org/cookbook/extracting-data/selector-syntax
+ return extractContent(res, Jsoup.parse(html), formatter);
+ }
+
+ /**
+  * Extracts title, description, canonical URL, main text, images and further
+  * metadata from the document and stores them in res.
+  * The document is modified: prepareDocument removes clutter and scoring
+  * writes gravityScore attributes onto elements.
+  *
+  * @param res result object to fill
+  * @param doc parsed document
+  * @param formatter turns the best matching element into text
+  * @return res
+  * @throws NullPointerException if doc is null
+  */
+ public JResult extractContent(JResult res, Document doc, OutputFormatter formatter)
+         throws Exception {
+     if (doc == null)
+         throw new NullPointerException("missing document");
+
+     res.setTitle(extractTitle(doc));
+     res.setDescription(extractDescription(doc));
+     res.setCanonicalUrl(extractCanonicalUrl(doc));
+
+     // now remove the clutter
+     prepareDocument(doc);
+
+     // init elements; typed so the for-each below compiles
+     Collection<Element> nodes = getNodes(doc);
+     int maxWeight = 0;
+     Element bestMatchElement = null;
+     for (Element entry : nodes) {
+         int currentWeight = getWeight(entry);
+         if (currentWeight > maxWeight) {
+             maxWeight = currentWeight;
+             bestMatchElement = entry;
+             // good enough: stop at the first candidate scoring above 200
+             if (maxWeight > 200)
+                 break;
+         }
+     }
+
+     if (bestMatchElement != null) {
+         List<ImageResult> images = new ArrayList<ImageResult>();
+         Element imgEl = determineImageSource(bestMatchElement, images);
+         if (imgEl != null) {
+             res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
+             // TODO remove parent container of image if it is contained in
+             // bestMatchElement
+             // to avoid image subtitles flooding in
+
+             res.setImages(images);
+         }
+
+         // clean before grabbing text
+         String text = formatter.getFormattedText(bestMatchElement);
+         text = removeTitleFromText(text, res.getTitle());
+         // this fails for short facebook post and probably tweets:
+         // text.length() > res.getDescription().length()
+         if (text.length() > res.getTitle().length()) {
+             res.setText(text);
+         }
+         res.setTextList(formatter.getTextList(bestMatchElement));
+     }
+
+     // fall back to metadata when no image was found in the article body
+     if (res.getImageUrl().isEmpty()) {
+         res.setImageUrl(extractImageUrl(doc));
+     }
+
+     res.setRssUrl(extractRssUrl(doc));
+     res.setVideoUrl(extractVideoUrl(doc));
+     res.setFaviconUrl(extractFaviconUrl(doc));
+     res.setKeywords(extractKeywords(doc));
+     return res;
+ }
+
+ /**
+  * Returns the first non-empty title candidate: the cleaned document title,
+  * the head title text, then the title, og:title and twitter:title meta tags.
+  * The twitter:title value is returned as-is when everything before it is empty.
+  */
+ protected String extractTitle(Document doc) {
+     String candidate = cleanTitle(doc.title());
+     if (!candidate.isEmpty())
+         return candidate;
+
+     candidate = SHelper.innerTrim(doc.select("head title").text());
+     if (!candidate.isEmpty())
+         return candidate;
+
+     candidate = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
+     if (!candidate.isEmpty())
+         return candidate;
+
+     candidate = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
+     if (!candidate.isEmpty())
+         return candidate;
+
+     return SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
+ }
+
+ /**
+  * Returns the first non-empty value of the canonical link, the og:url meta
+  * tag and the twitter:url meta tag, in that order.
+  */
+ protected String extractCanonicalUrl(Document doc) {
+     String canonical = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
+     if (!canonical.isEmpty())
+         return canonical;
+
+     canonical = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
+     if (!canonical.isEmpty())
+         return canonical;
+
+     return SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
+ }
+
+ /**
+  * Returns the first non-empty content of the description, og:description and
+  * twitter:description meta tags, in that order.
+  */
+ protected String extractDescription(Document doc) {
+     String[] selectors = { "head meta[name=description]",
+             "head meta[property=og:description]", "head meta[name=twitter:description]" };
+     String found = "";
+     for (String selector : selectors) {
+         found = SHelper.innerTrim(doc.select(selector).attr("content"));
+         if (!found.isEmpty())
+             break;
+     }
+     return found;
+ }
+
+ /**
+  * Reads the keywords meta tag and splits its content on commas.
+  * A surrounding pair of square brackets is removed first.
+  *
+  * @return the keywords, or an empty list when the tag is missing or empty
+  */
+ protected Collection<String> extractKeywords(Document doc) {
+     String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
+
+     if (content != null) {
+         if (content.startsWith("[") && content.endsWith("]"))
+             content = content.substring(1, content.length() - 1);
+
+         String[] split = content.split("\\s*,\\s*");
+         // a single empty token means there were no keywords at all
+         if (split.length > 1 || (split.length > 0 && !"".equals(split[0])))
+             return Arrays.asList(split);
+     }
+     return Collections.emptyList();
+ }
+
+ /**
+  * Metadata fallback for the article image, used when determineImageSource
+  * found nothing. Tries og:image, twitter:image, link[rel=image_src] and the
+  * thumbnail meta tag, in that order.
+  *
+  * @return image url or empty str
+  */
+ protected String extractImageUrl(Document doc) {
+     // use open graph tag to get image
+     String found = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
+     if (!found.isEmpty())
+         return found;
+
+     found = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content"));
+     if (!found.isEmpty())
+         return found;
+
+     // prefer link over thumbnail-meta if empty
+     found = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
+     if (!found.isEmpty())
+         return found;
+
+     return SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
+ }
+
+ /**
+  * Returns the href of the alternate link typed application/rss+xml, passed
+  * through SHelper.replaceSpaces.
+  */
+ protected String extractRssUrl(Document doc) {
+     Elements alternates = doc.select("link[rel=alternate]");
+     String href = alternates.select("link[type=application/rss+xml]").attr("href");
+     return SHelper.replaceSpaces(href);
+ }
+
+ /**
+  * Returns the content of the og:video meta tag, passed through
+  * SHelper.replaceSpaces.
+  */
+ protected String extractVideoUrl(Document doc) {
+     String video = doc.select("head meta[property=og:video]").attr("content");
+     return SHelper.replaceSpaces(video);
+ }
+
+ /**
+  * Returns the href of link[rel=icon]; when that is empty, the href of a link
+  * whose rel starts with "shortcut" or ends with "icon".
+  */
+ protected String extractFaviconUrl(Document doc) {
+     String icon = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
+     if (!icon.isEmpty())
+         return icon;
+
+     return SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr(
+             "href"));
+ }
+
+ /**
+  * Weights an element: the class/id/style score from calcWeight, plus one
+  * point per ten characters of the element's own text (rounded), plus the
+  * score of its direct children.
+  *
+  * @param e
+  *            Element to weight, along with child nodes
+  */
+ protected int getWeight(Element e) {
+     int attributeScore = calcWeight(e);
+     int ownTextScore = (int) Math.round(e.ownText().length() / 100.0 * 10);
+     int childScore = weightChildNodes(e);
+     return attributeScore + ownTextScore + childScore;
+ }
+
+ /**
+ * Weights a child nodes of given Element. During tests some difficulties
+ * were met. For instanance, not every single document has nested paragraph
+ * tags inside of the major article tag. Sometimes people are adding one
+ * more nesting level. So, we're adding 4 points for every 100 symbols
+ * contained in tag nested inside of the current weighted element, but only
+ * 3 points for every element that's nested 2 levels deep. This way we give
+ * more chances to extract the element that has less nested levels,
+ * increasing probability of the correct extraction.
+ *
+ * @param rootEl
+ * Element, who's child nodes will be weighted
+ */
+ protected int weightChildNodes(Element rootEl) {
+ int weight = 0;
+ Element caption = null;
+ List pEls = new ArrayList(5);
+ for (Element child : rootEl.children()) {
+ String ownText = child.ownText();
+ int ownTextLength = ownText.length();
+ if (ownTextLength < 20)
+ continue;
+
+ if (ownTextLength > 200)
+ weight += Math.max(50, ownTextLength / 10);
+
+ if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
+ weight += 30;
+ } else if (child.tagName().equals("div") || child.tagName().equals("p")) {
+ weight += calcWeightForChild(child, ownText);
+ if (child.tagName().equals("p") && ownTextLength > 50)
+ pEls.add(child);
+
+ if (child.className().toLowerCase(Locale.getDefault()).equals("caption"))
+ caption = child;
+ }
+ }
+
+ // use caption and image
+ if (caption != null)
+ weight += 30;
+
+ if (pEls.size() >= 2) {
+ for (Element subEl : rootEl.children()) {
+ if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
+ weight += 20;
+ // headerEls.add(subEl);
+ } else if ("table;li;td;th".contains(subEl.tagName())) {
+ addScore(subEl, -30);
+ }
+
+ if ("p".contains(subEl.tagName()))
+ addScore(subEl, 30);
+ }
+ }
+ return weight;
+ }
+
+ /**
+  * Adds score to the element's current gravityScore.
+  */
+ public void addScore(Element el, int score) {
+     int current = getScore(el);
+     int updated = score + current;
+     setScore(el, updated);
+ }
+
+ /**
+ * Reads the element's gravityScore attribute as an int.
+ *
+ * @return the parsed score, or 0 when reading or parsing throws
+ */
+ public int getScore(Element el) {
+ int old = 0;
+ try {
+ old = Integer.parseInt(el.attr("gravityScore"));
+ } catch (Exception ex) {
+ // ignored: any failure leaves the score at 0
+ }
+ return old;
+ }
+
+ public void setScore(Element el, int score) {
+ el.attr("gravityScore", Integer.toString(score));
+ }
+
+ /**
+  * Scores a child from its own text and adds that value to the child's
+  * gravityScore. Text with more than five occurrences of escaped markup
+  * entities or "px" scores -30; otherwise one point per 25 characters.
+  *
+  * @param child element whose score is updated
+  * @param ownText the child's own text
+  * @return the value added to the child's score
+  */
+ private int calcWeightForChild(Element child, String ownText) {
+     // entity literals restored: they had been decoded into bare characters,
+     // which left an unterminated string literal for the quote case
+     int c = SHelper.count(ownText, "&quot;");
+     c += SHelper.count(ownText, "&lt;");
+     c += SHelper.count(ownText, "&gt;");
+     c += SHelper.count(ownText, "px");
+     int val;
+     if (c > 5)
+         val = -30;
+     else
+         val = (int) Math.round(ownText.length() / 25.0);
+
+     addScore(child, val);
+     return val;
+ }
+
+ /**
+  * Scores an element from its class name, id and inline style:
+  * +35 / +40 for a positive class / id, -20 each for an unlikely class or id,
+  * -50 each for a negative class or id, -50 for a negative inline style.
+  */
+ private int calcWeight(Element e) {
+     String cls = e.className();
+     String id = e.id();
+
+     int total = 0;
+     total += POSITIVE.matcher(cls).find() ? 35 : 0;
+     total += POSITIVE.matcher(id).find() ? 40 : 0;
+     total -= UNLIKELY.matcher(cls).find() ? 20 : 0;
+     total -= UNLIKELY.matcher(id).find() ? 20 : 0;
+     total -= NEGATIVE.matcher(cls).find() ? 50 : 0;
+     total -= NEGATIVE.matcher(id).find() ? 50 : 0;
+
+     String inlineStyle = e.attr("style");
+     boolean hasStyle = inlineStyle != null && !inlineStyle.isEmpty();
+     if (hasStyle && NEGATIVE_STYLE.matcher(inlineStyle).find())
+         total -= 50;
+     return total;
+ }
+
+ /**
+  * Scores the img elements inside el (or inside its parent when el contains
+  * none), appends one ImageResult per usable image to images, sorts images
+  * with ImageComparator and returns the best image element.
+  *
+  * @param el element to search for images
+  * @param images output list that receives the scored images
+  * @return the image element with the highest positive weight, or null
+  */
+ public Element determineImageSource(Element el, List<ImageResult> images) {
+     int maxWeight = 0;
+     Element maxNode = null;
+     Elements els = el.select("img");
+     // el may have no parent; only widen the search when there is one
+     if (els.isEmpty() && el.parent() != null)
+         els = el.parent().select("img");
+
+     double score = 1;
+     for (Element e : els) {
+         String sourceUrl = e.attr("src");
+         if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
+             continue;
+
+         int weight = 0;
+         int height = 0;
+         try {
+             height = Integer.parseInt(e.attr("height"));
+             if (height >= 50)
+                 weight += 20;
+             else
+                 weight -= 20;
+         } catch (NumberFormatException ignored) {
+             // missing or non-numeric height: no bonus, no penalty
+         }
+
+         int width = 0;
+         try {
+             width = Integer.parseInt(e.attr("width"));
+             if (width >= 50)
+                 weight += 20;
+             else
+                 weight -= 20;
+         } catch (NumberFormatException ignored) {
+             // missing or non-numeric width: no bonus, no penalty
+         }
+
+         String alt = e.attr("alt");
+         if (alt.length() > 35)
+             weight += 20;
+
+         String title = e.attr("title");
+         if (title.length() > 35)
+             weight += 20;
+
+         boolean noFollow = false;
+         if (e.parent() != null) {
+             String rel = e.parent().attr("rel");
+             if (rel != null && rel.contains("nofollow")) {
+                 noFollow = true;
+                 weight -= 40;
+             }
+         }
+
+         // each new best image halves the factor applied to later images
+         weight = (int) (weight * score);
+         if (weight > maxWeight) {
+             maxWeight = weight;
+             maxNode = e;
+             score = score / 2;
+         }
+
+         ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
+                 noFollow);
+         images.add(image);
+     }
+
+     Collections.sort(images, new ImageComparator());
+     return maxNode;
+ }
+
+ /**
+ * Prepares the document for scoring. Currently this only removes script,
+ * noscript and style elements; stripping unlikely candidates is disabled.
+ *
+ * @param doc
+ * document to prepare. Passed as reference, and changed inside
+ * of function
+ */
+ protected void prepareDocument(Document doc) {
+ // stripUnlikelyCandidates(doc);
+ removeScriptsAndStyles(doc);
+ }
+
+ /**
+  * Removes every element below body whose lower-cased class name or id
+  * matches the NEGATIVE pattern.
+  *
+  * @param doc
+  *            document to strip candidates from
+  */
+ protected void stripUnlikelyCandidates(Document doc) {
+     for (Element child : doc.select("body").select("*")) {
+         // Locale.ROOT keeps the mapping of 'I' stable; the default locale
+         // (e.g. Turkish) could stop "WIDGET" from matching "widget"
+         String className = child.className().toLowerCase(Locale.ROOT);
+         String id = child.id().toLowerCase(Locale.ROOT);
+
+         if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
+             child.remove();
+         }
+     }
+ }
+
+ /**
+  * Removes all script, noscript and style elements from the document.
+  *
+  * @return the same document instance
+  */
+ private Document removeScriptsAndStyles(Document doc) {
+     String[] clutterTags = { "script", "noscript", "style" };
+     for (String tag : clutterTags) {
+         for (Element clutter : doc.getElementsByTag(tag)) {
+             clutter.remove();
+         }
+     }
+     return doc;
+ }
+
+ /**
+  * Treats an image as an ad when SHelper.count finds "ad" at least twice in
+  * its URL.
+  */
+ private boolean isAdImage(String imageUrl) {
+     int hits = SHelper.count(imageUrl, "ad");
+     return hits >= 2;
+ }
+
+ /**
+ * Currently a no-op: returns text unchanged and ignores title. The former
+ * title-stripping logic is kept below, commented out.
+ */
+ public String removeTitleFromText(String text, String title) {
+ // don't do this as its terrible to read
+ // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
+ // if (index1 >= 0)
+ // text = text.substring(index1 + title.length());
+ // return text.trim();
+ return text;
+ }
+
+ /**
+  * Collects every element below body whose tag name fully matches NODES and
+  * seeds its gravityScore attribute: 100 for the first match, halved (integer
+  * division) for each following one.
+  *
+  * @return the candidate elements, iterated in the order they were found
+  */
+ public Collection<Element> getNodes(Document doc) {
+     // insertion-ordered: extractContent stops at the first candidate above
+     // its threshold, so iteration order must not depend on hash codes
+     Set<Element> nodes = new LinkedHashSet<Element>(64);
+     int score = 100;
+     for (Element el : doc.select("body").select("*")) {
+         if (NODES.matcher(el.tagName()).matches()) {
+             nodes.add(el);
+             setScore(el, score);
+             score = score / 2;
+         }
+     }
+     return nodes;
+ }
+
+ /**
+  * Splits the title on '|', drops segments listed in IGNORED_TITLE_PARTS and
+  * drops a segment when the number of kept segments equals the last index and
+  * the text kept so far is longer than that segment. The rest is re-joined
+  * with '|' and passed through SHelper.innerTrim.
+  */
+ public String cleanTitle(String title) {
+     String[] parts = title.split("\\|");
+     int lastIndex = parts.length - 1;
+     StringBuilder cleaned = new StringBuilder();
+     int kept = 0;
+     for (String part : parts) {
+         String normalized = part.toLowerCase(Locale.getDefault()).trim();
+         boolean ignored = IGNORED_TITLE_PARTS.contains(normalized);
+         boolean shortTail = kept == lastIndex && cleaned.length() > part.length();
+         if (ignored || shortTail)
+             continue;
+
+         if (kept > 0)
+             cleaned.append("|");
+
+         cleaned.append(part);
+         kept++;
+     }
+
+     return SHelper.innerTrim(cleaned.toString());
+ }
+
+ /**
+  * Comparator for ImageResult by weight, highest weight first.
+  *
+  * @author Chris Alexander, chris@chris-alexander.co.uk
+  *
+  */
+ public class ImageComparator implements Comparator<ImageResult> {
+
+     @Override
+     public int compare(ImageResult o1, ImageResult o2) {
+         // Returns the highest weight first
+         return o2.weight.compareTo(o1.weight);
+     }
+ }
+}
\ No newline at end of file
diff --git a/src/acr/browser/lightning/Reading/Converter.java b/src/acr/browser/lightning/Reading/Converter.java
new file mode 100644
index 0000000..4ed3178
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/Converter.java
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2011 Peter Karich
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package acr.browser.lightning.Reading;
+
+import java.io.*;
+import java.net.SocketTimeoutException;
+import java.nio.charset.Charset;
+import java.util.Locale;
+
+import acr.browser.lightning.Constants;
+import android.util.Log;
+
+/**
+ * This class is not thread safe. Use one new instance every time due to
+ * encoding variable.
+ *
+ * @author Peter Karich
+ */
+public class Converter {
+
+ // Charset names used as fallbacks
+ public final static String UTF8 = "UTF-8";
+ public final static String ISO = "ISO-8859-1";
+ // Buffer and chunk size in bytes
+ public final static int K2 = 2048;
+ // Default read limit for streamToString, in bytes
+ private int maxBytes = 1000000 / 2;
+ // Encoding chosen by the most recent streamToString call
+ private String encoding;
+ // Only used in log messages
+ private String url;
+
+ /**
+ * @param urlOnlyHint
+ * URL stored for use in log messages
+ */
+ public Converter(String urlOnlyHint) {
+ url = urlOnlyHint;
+ }
+
+ /** Creates a converter without a URL hint; log messages then print null for it. */
+ public Converter() {
+ }
+
+ /**
+ * Sets the read limit used by the streamToString overloads that take no
+ * explicit limit.
+ *
+ * @return this converter, for chaining
+ */
+ public Converter setMaxBytes(int maxBytes) {
+ this.maxBytes = maxBytes;
+ return this;
+ }
+
+ public static String extractEncoding(String contentType) {
+ String[] values;
+ if (contentType != null)
+ values = contentType.split(";");
+ else
+ values = new String[0];
+
+ String charset = "";
+
+ for (String value : values) {
+ value = value.trim().toLowerCase(Locale.getDefault());
+
+ if (value.startsWith("charset="))
+ charset = value.substring("charset=".length());
+ }
+
+ // http1.1 says ISO-8859-1 is the default charset
+ if (charset.length() == 0)
+ charset = ISO;
+
+ return charset;
+ }
+
+ /**
+  * Returns the encoding chosen by the last streamToString call.
+  *
+  * @return the lower-cased encoding name, or an empty string when none is set
+  */
+ public String getEncoding() {
+     if (encoding == null)
+         return "";
+     // Locale.ROOT keeps charset names stable regardless of the default locale
+     return encoding.toLowerCase(Locale.ROOT);
+ }
+
+ /**
+ * Reads the stream using the configured byte limit and the encoding field's
+ * current value as the starting encoding.
+ */
+ public String streamToString(InputStream is) {
+ return streamToString(is, maxBytes, encoding);
+ }
+
+ /**
+ * Reads the stream using the configured byte limit and the given starting
+ * encoding.
+ */
+ public String streamToString(InputStream is, String enc) {
+ return streamToString(is, maxBytes, enc);
+ }
+
+ /**
+  * Reads bytes off the stream and returns them decoded as a string.
+  * The first bytes are scanned for a "charset=" and then an "encoding="
+  * declaration; a declared charset replaces enc. An unsupported or illegal
+  * charset name falls back to UTF-8. The stream is closed before returning.
+  *
+  * @param is
+  *            stream to read
+  * @param maxBytes
+  *            The max bytes that we want to read from the input stream;
+  *            reading stops at the first chunk boundary at or past it
+  * @param enc
+  *            starting encoding; null or empty means UTF-8
+  * @return the decoded content, or an empty string when reading fails
+  */
+ public String streamToString(InputStream is, int maxBytes, String enc) {
+     encoding = enc;
+     // Http 1.1. standard is iso-8859-1 not utf8 :(
+     // but we force utf-8 as youtube assumes it ;)
+     if (encoding == null || encoding.isEmpty())
+         encoding = UTF8;
+
+     BufferedInputStream in = null;
+     try {
+         in = new BufferedInputStream(is, K2);
+         ByteArrayOutputStream output = new ByteArrayOutputStream();
+
+         // detect encoding with the help of meta tag
+         try {
+             in.mark(K2 * 2);
+             String tmpEnc = detectCharset("charset=", output, in, encoding);
+             if (tmpEnc != null)
+                 encoding = tmpEnc;
+             else {
+                 Log.d(Constants.TAG, "no charset found in first stage");
+                 // detect with the help of xml beginning ala
+                 // encoding="charset"
+                 tmpEnc = detectCharset("encoding=", output, in, encoding);
+                 if (tmpEnc != null)
+                     encoding = tmpEnc;
+                 else
+                     Log.d(Constants.TAG, "no charset found in second stage");
+             }
+
+             boolean supported;
+             try {
+                 supported = Charset.isSupported(encoding);
+             } catch (IllegalArgumentException illegalName) {
+                 // isSupported throws for syntactically illegal names;
+                 // treat those like any other unsupported charset
+                 supported = false;
+             }
+             if (!supported)
+                 throw new UnsupportedEncodingException(encoding);
+         } catch (UnsupportedEncodingException e) {
+             Log.d(Constants.TAG,
+                     "Using default encoding:" + UTF8 + " problem:" + e.getMessage()
+                             + " encoding:" + encoding + " " + url);
+             encoding = UTF8;
+         }
+
+         // SocketException: Connection reset
+         // IOException: missing CR => problem on server (probably some xml
+         // character thing?)
+         // IOException: Premature EOF => socket unexpectly closed from
+         // server
+         int bytesRead = output.size();
+         byte[] arr = new byte[K2];
+         while (true) {
+             if (bytesRead >= maxBytes) {
+                 Log.d(Constants.TAG, "Maxbyte of " + maxBytes
+                         + " exceeded! Maybe html is now broken but try it nevertheless. Url: "
+                         + url);
+                 break;
+             }
+
+             int n = in.read(arr);
+             if (n < 0)
+                 break;
+             bytesRead += n;
+             output.write(arr, 0, n);
+         }
+
+         return output.toString(encoding);
+     } catch (IOException e) {
+         // also covers SocketTimeoutException, which was logged identically
+         Log.e(Constants.TAG, e.toString() + " url:" + url);
+     } finally {
+         if (in != null) {
+             try {
+                 in.close();
+             } catch (Exception ignored) {
+                 // nothing useful can be done when closing fails
+             }
+         }
+     }
+     return "";
+ }
+
+ /**
+  * Detects the charset even if the first read only returns some bytes.
+  * Reads from in into bos until at least K2 bytes were read or the stream
+  * ends, decodes everything collected in bos with enc and looks for key
+  * followed by a quoted or unquoted value. When a value of plausible length
+  * is found, both in and bos are reset so the caller can re-read the data.
+  *
+  * @param key marker to search for, e.g. "charset="
+  * @param bos collects the bytes read so far
+  * @param in stream with a mark set by the caller
+  * @param enc encoding used to decode the collected bytes
+  * @return the cleaned-up charset name, or null when none was found or the
+  *         stream could not be reset
+  * @throws IOException if reading or decoding fails
+  */
+ protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
+         String enc) throws IOException {
+
+     // Grab better encoding from stream
+     byte[] arr = new byte[K2];
+     int nSum = 0;
+     while (nSum < K2) {
+         int n = in.read(arr);
+         if (n < 0)
+             break;
+
+         nSum += n;
+         bos.write(arr, 0, n);
+     }
+
+     String str = bos.toString(enc);
+     int encIndex = str.indexOf(key);
+     if (encIndex < 0)
+         return null;
+
+     int valueStart = encIndex + key.length();
+     // key is the very last thing read: there is no value to inspect
+     if (valueStart >= str.length())
+         return null;
+
+     char startChar = str.charAt(valueStart);
+     int lastEncIndex;
+     if (startChar == '\'' || startChar == '"') {
+         // charset='something' or charset="something": skip the opening
+         // quote and search for the matching closing one
+         valueStart++;
+         lastEncIndex = str.indexOf(startChar, valueStart);
+     } else {
+         // if we have "text/html; charset=utf-8"
+         int first = str.indexOf("\"", valueStart);
+         if (first < 0)
+             first = Integer.MAX_VALUE;
+
+         // or "text/html; charset=utf-8 "
+         int sec = str.indexOf(" ", valueStart);
+         if (sec < 0)
+             sec = Integer.MAX_VALUE;
+         lastEncIndex = Math.min(first, sec);
+
+         // or "text/html; charset=utf-8 '
+         int third = str.indexOf("'", valueStart);
+         if (third > 0)
+             lastEncIndex = Math.min(lastEncIndex, third);
+     }
+
+     // re-read byte array with different encoding
+     // assume that the encoding string cannot be greater than 40 chars
+     if (lastEncIndex > valueStart && lastEncIndex < valueStart + 40) {
+         String tmpEnc = SHelper.encodingCleanup(str.substring(valueStart, lastEncIndex));
+         try {
+             in.reset();
+             bos.reset();
+             return tmpEnc;
+         } catch (IOException ex) {
+             Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "
+                     + tmpEnc + " " + ex.toString());
+         }
+     }
+     return null;
+ }
+}
diff --git a/src/acr/browser/lightning/Reading/HtmlFetcher.java b/src/acr/browser/lightning/Reading/HtmlFetcher.java
new file mode 100644
index 0000000..c597193
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/HtmlFetcher.java
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2011 Peter Karich
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package acr.browser.lightning.Reading;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.Proxy;
+import java.net.URL;
+import java.util.LinkedHashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+
+import acr.browser.lightning.Constants;
+import android.util.Log;
+
+/**
+ * Class to fetch articles. This class is thread safe.
+ *
+ * @author Peter Karich
+ */
+public class HtmlFetcher {
+
+ // Runs once when the class is initialized and invokes three setup hooks on SHelper.
+ // NOTE(review): going by its name, enableAnySSL() presumably relaxes certificate
+ // validation for HTTPS connections -- confirm in SHelper before shipping this.
+ static {
+ SHelper.enableCookieMgmt();
+ SHelper.enableUserAgentOverwrite();
+ SHelper.enableAnySSL();
+ }
+
+    /**
+     * Command line helper: reads {@code urls.txt} line by line, takes the text between the
+     * first two double quotes of each line as a URL, downloads it and writes the HTML to
+     * {@code <domain>.html} ({@code <domain>2.html} for a domain that was already seen).
+     * Lines that do not contain a quoted URL are skipped.
+     *
+     * @param args ignored
+     * @throws Exception if reading the list, fetching a page or writing a file fails
+     */
+    public static void main(String[] args) throws Exception {
+        BufferedReader reader = new BufferedReader(new FileReader("urls.txt"));
+        try {
+            String line = null;
+            Set<String> existing = new LinkedHashSet<String>();
+            while ((line = reader.readLine()) != null) {
+                int index1 = line.indexOf("\"");
+                int index2 = line.indexOf("\"", index1 + 1);
+                // without two quotes substring() below would throw and abort the whole run
+                if (index1 < 0 || index2 < 0)
+                    continue;
+
+                String url = line.substring(index1 + 1, index2);
+                String domainStr = SHelper.extractDomain(url, true);
+                String counterStr = "";
+                // TODO more similarities
+                if (existing.contains(domainStr))
+                    counterStr = "2";
+                else
+                    existing.add(domainStr);
+
+                String html = new HtmlFetcher().fetchAsString(url, 20000);
+                String outFile = domainStr + counterStr + ".html";
+                BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+                try {
+                    writer.write(html);
+                } finally {
+                    // release the file handle even if the write fails
+                    writer.close();
+                }
+            }
+        } finally {
+            reader.close();
+        }
+    }
+
+    // Values sent as request headers by createUrlConnection().
+    private String referrer = "https://github.com/karussell/snacktory";
+    // Built once from the initial referrer; later setReferrer() calls do not change it.
+    private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ")";
+    private String cacheControl = "max-age=0";
+    private String language = "en-us";
+    private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
+    private String charset = "UTF-8";
+    // Optional result cache; null means no caching.
+    private SCache cache;
+    // Incremented for every result served from the cache.
+    private final AtomicInteger cacheCounter = new AtomicInteger(0);
+    // Negative means the extracted text is not truncated.
+    private int maxTextLength = -1;
+    private ArticleTextExtractor extractor = new ArticleTextExtractor();
+    // Shortener domains: a redirect target on one of these hosts is resolved once more.
+    private final Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
+        {
+            add("bit.ly");
+            add("cli.gs");
+            add("deck.ly");
+            add("fb.me");
+            add("feedproxy.google.com");
+            add("flic.kr");
+            add("fur.ly");
+            add("goo.gl");
+            add("is.gd");
+            add("ink.co");
+            add("j.mp");
+            add("lnkd.in");
+            add("on.fb.me");
+            add("ow.ly");
+            add("plurl.us");
+            add("sns.mx");
+            add("snurl.com");
+            add("su.pr");
+            add("t.co");
+            add("tcrn.ch");
+            add("tl.gd");
+            add("tiny.cc");
+            add("tinyurl.com");
+            add("tmi.me");
+            add("tr.im");
+            add("twurl.nl");
+        }
+    };
+
+ /** Creates a fetcher with the default header values and no cache. */
+ public HtmlFetcher() {
+ }
+
+ /** Replaces the extractor that fetchAndExtract() uses to parse downloaded HTML. */
+ public void setExtractor(ArticleTextExtractor extractor) {
+ this.extractor = extractor;
+ }
+
+ public ArticleTextExtractor getExtractor() {
+ return extractor;
+ }
+
+ /**
+ * Sets the result cache; passing null switches caching off.
+ *
+ * @return this fetcher, for call chaining
+ */
+ public HtmlFetcher setCache(SCache cache) {
+ this.cache = cache;
+ return this;
+ }
+
+ public SCache getCache() {
+ return cache;
+ }
+
+ /** @return how many results were served from the cache since the last reset */
+ public int getCacheCounter() {
+ return cacheCounter.get();
+ }
+
+ /**
+ * Resets the cache hit counter to zero.
+ *
+ * @return this fetcher, for call chaining
+ */
+ public HtmlFetcher clearCacheCounter() {
+ cacheCounter.set(0);
+ return this;
+ }
+
+ /**
+ * Limits the length of the extracted text; a negative value disables the limit.
+ *
+ * @return this fetcher, for call chaining
+ */
+ public HtmlFetcher setMaxTextLength(int maxTextLength) {
+ this.maxTextLength = maxTextLength;
+ return this;
+ }
+
+ public int getMaxTextLength() {
+ return maxTextLength;
+ }
+
+ /** Sets the value of the Accept request header. */
+ public void setAccept(String accept) {
+ this.accept = accept;
+ }
+
+ /** Sets the value sent in the "content-charset" request header. */
+ public void setCharset(String charset) {
+ this.charset = charset;
+ }
+
+ /** Sets the value of the Cache-Control request header. */
+ public void setCacheControl(String cacheControl) {
+ this.cacheControl = cacheControl;
+ }
+
+ public String getLanguage() {
+ return language;
+ }
+
+ /** Sets the value of the Accept-Language request header. */
+ public void setLanguage(String language) {
+ this.language = language;
+ }
+
+ public String getReferrer() {
+ return referrer;
+ }
+
+ /**
+ * Sets the value of the Referer request header. The user agent string is not rebuilt.
+ *
+ * @return this fetcher, for call chaining
+ */
+ public HtmlFetcher setReferrer(String referrer) {
+ this.referrer = referrer;
+ return this;
+ }
+
+ public String getUserAgent() {
+ return userAgent;
+ }
+
+ /** Sets the value of the User-Agent request header. */
+ public void setUserAgent(String userAgent) {
+ this.userAgent = userAgent;
+ }
+
+ public String getAccept() {
+ return accept;
+ }
+
+ public String getCacheControl() {
+ return cacheControl;
+ }
+
+ public String getCharset() {
+ return charset;
+ }
+
+    /**
+     * Downloads a page and extracts its article content.
+     * <p>
+     * The URL is first cleaned (hashbang, Google and Facebook redirect wrappers) and, if
+     * requested, resolved through redirects. Results are looked up in and stored to the
+     * cache when one is configured. Documents, apps and packages are skipped; video, audio
+     * and image URLs are only recorded on the result; everything else is fetched and handed
+     * to the extractor.
+     *
+     * @param url address to fetch
+     * @param timeout connect and read timeout in milliseconds
+     * @param resolve whether to follow shortener/redirect URLs before fetching
+     * @return the extracted result; an otherwise empty result carrying only the URL when the
+     *         URL could not be resolved
+     * @throws Exception if fetching or extracting fails
+     */
+    public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
+        String originalUrl = url;
+        url = SHelper.removeHashbang(url);
+        String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
+        if (gUrl != null)
+            url = gUrl;
+        else {
+            gUrl = SHelper.getUrlFromUglyFacebookRedirect(url);
+            if (gUrl != null)
+                url = gUrl;
+        }
+
+        if (resolve) {
+            // check if we can avoid resolving the URL (which hits the website!)
+            JResult res = getFromCache(url, originalUrl);
+            if (res != null)
+                return res;
+
+            String resUrl = getResolvedUrl(url, timeout);
+            // getResolvedUrl() is overridable and documented to possibly return null, so
+            // test for null before dereferencing instead of afterwards
+            if (resUrl == null || resUrl.isEmpty()) {
+                Log.d(Constants.TAG, "resolved url is empty. Url is: " + url);
+
+                JResult result = new JResult();
+                if (cache != null)
+                    cache.put(url, result);
+                return result.setUrl(url);
+            }
+
+            // if resolved url is longer then use it!
+            if (resUrl.trim().length() > url.length()) {
+                // this is necessary e.g. for some homebaken url resolvers which
+                // return
+                // the resolved url relative to url!
+                url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
+            }
+        }
+
+        // check if we have the (resolved) URL in cache
+        JResult res = getFromCache(url, originalUrl);
+        if (res != null)
+            return res;
+
+        JResult result = new JResult();
+        // or should we use?
+        result.setUrl(url);
+        result.setOriginalUrl(originalUrl);
+        result.setDate(SHelper.estimateDate(url));
+
+        // Immediately put the url into the cache as extracting content takes
+        // time.
+        if (cache != null) {
+            cache.put(originalUrl, result);
+            cache.put(url, result);
+        }
+
+        // Locale.ROOT keeps the lower-casing locale independent; with the default locale a
+        // Turkish device would turn "I" into a dotless i and e.g. ".GIF" would no longer match
+        String lowerUrl = url.toLowerCase(Locale.ROOT);
+        if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
+            // skip
+        } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
+            result.setVideoUrl(url);
+        } else if (SHelper.isImage(lowerUrl)) {
+            result.setImageUrl(url);
+        } else {
+            extractor.extractContent(result, fetchAsString(url, timeout));
+            if (result.getFaviconUrl().isEmpty())
+                result.setFaviconUrl(SHelper.getDefaultFavicon(url));
+
+            // some links are relative to root and do not include the domain of
+            // the url :(
+            result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
+            result.setImageUrl(fixUrl(url, result.getImageUrl()));
+            result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
+            result.setRssUrl(fixUrl(url, result.getRssUrl()));
+        }
+        result.setText(lessText(result.getText()));
+        // wake up anybody waiting on the placeholder that was put into the cache above
+        synchronized (result) {
+            result.notifyAll();
+        }
+        return result;
+    }
+
+    /**
+     * Cuts a text down to the configured maximum length.
+     *
+     * @param text text to shorten, may be {@code null}
+     * @return the empty string for {@code null}; the first {@code maxTextLength} characters
+     *         when a non-negative limit is set and exceeded; otherwise {@code text} unchanged
+     */
+    public String lessText(String text) {
+        if (text == null) {
+            return "";
+        }
+
+        boolean limitActive = maxTextLength >= 0;
+        boolean exceedsLimit = text.length() > maxTextLength;
+        return (limitActive && exceedsLimit) ? text.substring(0, maxTextLength) : text;
+    }
+
+ /**
+ * Hands both values to SHelper.useDomainOfFirstArg4Second; fetchAndExtract() uses this for
+ * links that are relative to the root and lack the domain of the page.
+ */
+ private static String fixUrl(String url, String urlOrPath) {
+ return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
+ }
+
+ /**
+ * Downloads the given URL as a string with the optional request headers enabled; same as
+ * calling the three-argument overload with includeSomeGooseOptions set to true.
+ *
+ * @param urlAsString address to download
+ * @param timeout connect and read timeout in milliseconds
+ */
+ public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException,
+ IOException {
+ return fetchAsString(urlAsString, timeout, true);
+ }
+
+    /**
+     * Downloads the given URL and returns the response body as a string.
+     * <p>
+     * Redirects are followed. A {@code gzip} or {@code deflate} content encoding is
+     * unpacked, and the charset taken from the Content-Type header is passed to the
+     * converter. The response stream is always closed before returning.
+     *
+     * @param urlAsString address to download
+     * @param timeout connect and read timeout in milliseconds
+     * @param includeSomeGooseOptions whether the optional request headers are sent
+     * @return the decoded response body
+     * @throws MalformedURLException if {@code urlAsString} is not a valid URL
+     * @throws IOException if connecting or reading fails
+     */
+    public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
+            throws MalformedURLException, IOException {
+        HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
+        hConn.setInstanceFollowRedirects(true);
+        String encoding = hConn.getContentEncoding();
+        InputStream is = hConn.getInputStream();
+        Inflater inflater = null;
+        try {
+            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
+                is = new GZIPInputStream(is);
+            } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
+                inflater = new Inflater(true);
+                is = new InflaterInputStream(is, inflater);
+            }
+
+            String enc = Converter.extractEncoding(hConn.getContentType());
+            String res = createConverter(urlAsString).streamToString(is, enc);
+            Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString);
+            return res;
+        } finally {
+            try {
+                // harmless if the converter has already closed the stream
+                is.close();
+            } catch (IOException ignored) {
+                // nothing sensible left to do; do not mask the primary exception
+            }
+            // an Inflater handed to InflaterInputStream is not released by close()
+            if (inflater != null)
+                inflater.end();
+        }
+    }
+
+ /**
+ * Creates the converter that fetchAsString() uses to turn the response stream into a string.
+ *
+ * @param url address of the page being converted, passed to the Converter constructor
+ */
+ public Converter createConverter(String url) {
+ return new Converter(url);
+ }
+
+    /**
+     * Resolves a (possibly shortened) URL by issuing a HEAD request without following
+     * redirects and reading the {@code Location} header.
+     * <p>
+     * On some devices we have to hack:
+     * http://developers.sun.com/mobility/reference
+     * /techart/design_guidelines/http_redirection.html
+     *
+     * @param urlAsString the url to resolve
+     * @param timeout
+     *            Sets a specified timeout value, in milliseconds
+     * @return the redirect target if the server answered with a 3xx status and a Location
+     *         header; the same url if the response code is OK or no redirect was announced;
+     *         the empty string if the request failed (e.g. within the specified time)
+     */
+    public String getResolvedUrl(String urlAsString, int timeout) {
+        String newUrl = null;
+        int responseCode = -1;
+        try {
+            HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, true);
+            // force no follow
+            hConn.setInstanceFollowRedirects(false);
+            // the program doesn't care what the content actually is !!
+            // http://java.sun.com/developer/JDCTechTips/2003/tt0422.html
+            hConn.setRequestMethod("HEAD");
+            hConn.connect();
+            responseCode = hConn.getResponseCode();
+            hConn.getInputStream().close();
+            if (responseCode == HttpURLConnection.HTTP_OK)
+                return urlAsString;
+
+            newUrl = hConn.getHeaderField("Location");
+            if (responseCode / 100 == 3 && newUrl != null) {
+                newUrl = newUrl.replace(" ", "+");
+                // some services use (none-standard) utf8 in their location
+                // header
+                if (urlAsString.startsWith("http://bit.ly")
+                        || urlAsString.startsWith("http://is.gd"))
+                    newUrl = encodeUriFromHeader(newUrl);
+
+                // fix problems if shortened twice. as it is often the case
+                // after twitters' t.co bullshit
+                // (a Location that points back at the very same url would recurse forever,
+                // so only resolve further when the target actually differs)
+                if (!newUrl.equals(urlAsString)
+                        && furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
+                    newUrl = getResolvedUrl(newUrl, timeout);
+
+                return newUrl;
+            } else
+                return urlAsString;
+
+        } catch (Exception ex) {
+            Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage());
+            return "";
+        } finally {
+            // runs for successful lookups too, hence debug rather than error level
+            Log.d(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl);
+        }
+    }
+
+    /**
+     * Percent-encodes every character outside the ASCII range of a URI that was decoded as
+     * ISO-8859-1. Works around origin servers that put raw UTF-8 bytes into the Location:
+     * header.
+     *
+     * @param badLocation header value as decoded with ISO-8859-1
+     * @return the value with each character of code 128 or above replaced by its
+     *         {@code %XX} form; ASCII characters are copied unchanged
+     */
+    static String encodeUriFromHeader(String badLocation) {
+        final int length = badLocation.length();
+        StringBuilder encoded = new StringBuilder();
+
+        for (int i = 0; i < length; i++) {
+            char current = badLocation.charAt(i);
+            if (current >= 128) {
+                // this is ONLY valid if the uri was decoded using ISO-8859-1
+                encoded.append(String.format("%%%02X", (int) current));
+                continue;
+            }
+            encoded.append(current);
+        }
+
+        return encoded.toString();
+    }
+
+    /**
+     * Opens a direct (proxy-less) HTTP connection to the given URL and applies the request
+     * headers and timeouts configured on this fetcher. Nothing is sent yet.
+     *
+     * @param urlAsStr address to connect to
+     * @param timeout used as both connect and read timeout, in milliseconds
+     * @param includeSomeGooseOptions whether the Accept-Language, content-charset, Referer
+     *        and Cache-Control headers are added as well
+     * @return the configured connection
+     * @throws MalformedURLException if {@code urlAsStr} is not a valid URL
+     * @throws IOException if the connection cannot be opened
+     */
+    protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
+            boolean includeSomeGooseOptions) throws MalformedURLException, IOException {
+        // using proxy may increase latency
+        HttpURLConnection connection =
+                (HttpURLConnection) new URL(urlAsStr).openConnection(Proxy.NO_PROXY);
+        connection.setRequestProperty("User-Agent", userAgent);
+        connection.setRequestProperty("Accept", accept);
+
+        if (includeSomeGooseOptions) {
+            connection.setRequestProperty("Accept-Language", language);
+            connection.setRequestProperty("content-charset", charset);
+            connection.addRequestProperty("Referer", referrer);
+            // avoid the cache for testing purposes only?
+            connection.setRequestProperty("Cache-Control", cacheControl);
+        }
+
+        // suggest respond to be gzipped or deflated (which is just another
+        // compression)
+        // http://stackoverflow.com/q/3932117
+        connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
+        connection.setConnectTimeout(timeout);
+        connection.setReadTimeout(timeout);
+        return connection;
+    }
+
+    /**
+     * Looks up a result in the cache, if one is configured.
+     *
+     * @param url key to look up; also written back onto a found result
+     * @param originalUrl url as originally requested; written back onto a found result
+     * @return the cached result with both urls refreshed, or {@code null} when there is no
+     *         cache or no entry for {@code url}
+     */
+    private JResult getFromCache(String url, String originalUrl) throws Exception {
+        if (cache == null) {
+            return null;
+        }
+
+        JResult cached = cache.get(url);
+        if (cached == null) {
+            return null;
+        }
+
+        // e.g. the cache returned a shortened url as original url now we want to store the
+        // current original url! Also it can be that the cache response to url but the
+        // JResult does not contain it so overwrite it:
+        cached.setUrl(url);
+        cached.setOriginalUrl(originalUrl);
+        cacheCounter.incrementAndGet();
+        return cached;
+    }
+}
diff --git a/src/acr/browser/lightning/Reading/ImageResult.java b/src/acr/browser/lightning/Reading/ImageResult.java
new file mode 100644
index 0000000..2a8321e
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/ImageResult.java
@@ -0,0 +1,31 @@
+package acr.browser.lightning.Reading;
+
+import org.jsoup.nodes.Element;
+
+/**
+ * Plain data holder for one image found under an element. All fields are public and
+ * mutable.
+ *
+ * @author Chris Alexander, chris@chris-alexander.co.uk
+ */
+public class ImageResult {
+
+    /** Image source as passed to the constructor. */
+    public String src;
+    /** Weight assigned to the image by the caller. */
+    public Integer weight;
+    /** Title as passed to the constructor. */
+    public String title;
+    /** Height as passed to the constructor. */
+    public int height;
+    /** Width as passed to the constructor. */
+    public int width;
+    /** Alternative text as passed to the constructor. */
+    public String alt;
+    /** No-follow flag as passed to the constructor. */
+    public boolean noFollow;
+    /** Not set by the constructor; stays {@code null} until assigned by the caller. */
+    public Element element;
+
+    /**
+     * Stores the given values in the equally named fields; {@link #element} is left unset.
+     */
+    public ImageResult(String src, Integer weight, String title, int height, int width, String alt,
+            boolean noFollow) {
+        this.src = src;
+        this.title = title;
+        this.alt = alt;
+        this.weight = weight;
+        this.width = width;
+        this.height = height;
+        this.noFollow = noFollow;
+    }
+}
diff --git a/src/acr/browser/lightning/Reading/JResult.java b/src/acr/browser/lightning/Reading/JResult.java
new file mode 100644
index 0000000..50ae5ea
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/JResult.java
@@ -0,0 +1,216 @@
+/*
+ * Copyright 2011 Peter Karich
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package acr.browser.lightning.Reading;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Parsed result from web page containing important title, text and image.
+ * <p>
+ * Most string getters return the empty string instead of {@code null} when the value was
+ * never set; {@link #getOriginalUrl()}, {@link #getCanonicalUrl()}, {@link #getDate()} and
+ * {@link #getKeywords()} return the stored reference as is. Setters that return
+ * {@code JResult} return this instance for call chaining.
+ * <p>
+ * NOTE(review): the class is {@code Serializable} while {@code ImageResult} is not, so
+ * serializing an instance that holds images will presumably fail -- confirm.
+ *
+ * @author Peter Karich
+ */
+public class JResult implements Serializable {
+
+    private String title;
+    private String url;
+    private String originalUrl;
+    private String canonicalUrl;
+    private String imageUrl;
+    private String videoUrl;
+    private String rssUrl;
+    private String text;
+    private String faviconUrl;
+    private String description;
+    private String dateString;
+    private List<String> textList;
+    private Collection<String> keywords;
+    private List<ImageResult> images = null;
+
+    public JResult() {
+    }
+
+    /** @return the url, or the empty string if none was set */
+    public String getUrl() {
+        if (url == null)
+            return "";
+        return url;
+    }
+
+    public JResult setUrl(String url) {
+        this.url = url;
+        return this;
+    }
+
+    public JResult setOriginalUrl(String originalUrl) {
+        this.originalUrl = originalUrl;
+        return this;
+    }
+
+    /** @return the original url, possibly {@code null} */
+    public String getOriginalUrl() {
+        return originalUrl;
+    }
+
+    public JResult setCanonicalUrl(String canonicalUrl) {
+        this.canonicalUrl = canonicalUrl;
+        return this;
+    }
+
+    /** @return the canonical url, possibly {@code null} */
+    public String getCanonicalUrl() {
+        return canonicalUrl;
+    }
+
+    /** @return the favicon url, or the empty string if none was set */
+    public String getFaviconUrl() {
+        if (faviconUrl == null)
+            return "";
+        return faviconUrl;
+    }
+
+    public JResult setFaviconUrl(String faviconUrl) {
+        this.faviconUrl = faviconUrl;
+        return this;
+    }
+
+    public JResult setRssUrl(String rssUrl) {
+        this.rssUrl = rssUrl;
+        return this;
+    }
+
+    /** @return the rss url, or the empty string if none was set */
+    public String getRssUrl() {
+        if (rssUrl == null)
+            return "";
+        return rssUrl;
+    }
+
+    /** @return the description, or the empty string if none was set */
+    public String getDescription() {
+        if (description == null)
+            return "";
+        return description;
+    }
+
+    public JResult setDescription(String description) {
+        this.description = description;
+        return this;
+    }
+
+    /** @return the image url, or the empty string if none was set */
+    public String getImageUrl() {
+        if (imageUrl == null)
+            return "";
+        return imageUrl;
+    }
+
+    public JResult setImageUrl(String imageUrl) {
+        this.imageUrl = imageUrl;
+        return this;
+    }
+
+    /** @return the text, or the empty string if none was set */
+    public String getText() {
+        if (text == null)
+            return "";
+
+        return text;
+    }
+
+    public JResult setText(String text) {
+        this.text = text;
+        return this;
+    }
+
+    /** @return the stored text list, or a fresh empty list if none was set */
+    public List<String> getTextList() {
+        if (this.textList == null)
+            return new ArrayList<String>();
+        return this.textList;
+    }
+
+    public JResult setTextList(List<String> textList) {
+        this.textList = textList;
+        return this;
+    }
+
+    /** @return the title, or the empty string if none was set */
+    public String getTitle() {
+        if (title == null)
+            return "";
+        return title;
+    }
+
+    public JResult setTitle(String title) {
+        this.title = title;
+        return this;
+    }
+
+    /** @return the video url, or the empty string if none was set */
+    public String getVideoUrl() {
+        if (videoUrl == null)
+            return "";
+        return videoUrl;
+    }
+
+    public JResult setVideoUrl(String videoUrl) {
+        this.videoUrl = videoUrl;
+        return this;
+    }
+
+    public JResult setDate(String date) {
+        this.dateString = date;
+        return this;
+    }
+
+    /** @return the keywords, possibly {@code null} */
+    public Collection<String> getKeywords() {
+        return keywords;
+    }
+
+    public void setKeywords(Collection<String> keywords) {
+        this.keywords = keywords;
+    }
+
+    /**
+     * @return get date from url or guessed from text
+     */
+    public String getDate() {
+        return dateString;
+    }
+
+    /**
+     * @return images list, or an immutable empty list if none was set
+     */
+    public List<ImageResult> getImages() {
+        if (images == null)
+            return Collections.emptyList();
+        return images;
+    }
+
+    /**
+     * @return images count
+     */
+    public int getImagesCount() {
+        if (images == null)
+            return 0;
+        return images.size();
+    }
+
+    /**
+     * set images list
+     */
+    public void setImages(List<ImageResult> images) {
+        this.images = images;
+    }
+
+    @Override
+    public String toString() {
+        return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text;
+    }
+}
diff --git a/src/acr/browser/lightning/Reading/MapEntry.java b/src/acr/browser/lightning/Reading/MapEntry.java
new file mode 100644
index 0000000..31e7c36
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/MapEntry.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright (C) 2010 Peter Karich <>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package acr.browser.lightning.Reading;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * Simple impl of Map.Entry. So that we can have ordered maps.
+ * <p>
+ * Two entries are equal when they are both {@code MapEntry} instances with equal keys and
+ * equal values; {@code null} keys and values are supported.
+ *
+ * @param <K> key type
+ * @param <V> value type
+ * @author Peter Karich, peat_hal ‘at’ users ‘dot’ sourceforge ‘dot’
+ *         net
+ */
+public class MapEntry<K, V> implements Map.Entry<K, V>, Serializable {
+
+    private static final long serialVersionUID = 1L;
+    private K key;
+    private V value;
+
+    public MapEntry(K key, V value) {
+        this.key = key;
+        this.value = value;
+    }
+
+    @Override
+    public K getKey() {
+        return key;
+    }
+
+    @Override
+    public V getValue() {
+        return value;
+    }
+
+    /**
+     * Replaces the value of this entry.
+     *
+     * @return the value the entry held before the call, as required by
+     *         {@link Map.Entry#setValue(Object)}
+     */
+    @Override
+    public V setValue(V value) {
+        V previous = this.value;
+        this.value = value;
+        return previous;
+    }
+
+    @Override
+    public String toString() {
+        return getKey() + ", " + getValue();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        final MapEntry<?, ?> other = (MapEntry<?, ?>) obj;
+        if (this.key != other.key && (this.key == null || !this.key.equals(other.key)))
+            return false;
+        if (this.value != other.value && (this.value == null || !this.value.equals(other.value)))
+            return false;
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int hash = 7;
+        hash = 19 * hash + (this.key != null ? this.key.hashCode() : 0);
+        hash = 19 * hash + (this.value != null ? this.value.hashCode() : 0);
+        return hash;
+    }
+}
diff --git a/src/acr/browser/lightning/Reading/OutputFormatter.java b/src/acr/browser/lightning/Reading/OutputFormatter.java
new file mode 100644
index 0000000..c456b1a
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/OutputFormatter.java
@@ -0,0 +1,174 @@
+package acr.browser.lightning.Reading;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Pattern;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+
+/**
+ * @author goose | jim
+ * @author karussell
+ *
+ *         this class will be responsible for taking our top node and stripping
+ *         out junk we don't want and getting it ready for how we want it
+ *         presented to the user
+ */
+public class OutputFormatter {
+
+    /** Default minimum length a paragraph needs in order to be kept. */
+    public static final int MIN_PARAGRAPH_TEXT = 50;
+    private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
+    // matched against the style and class attributes to spot hidden elements
+    private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
+    protected final int minParagraphText;
+    protected final List<String> nodesToReplace;
+    protected String nodesToKeepCssSelector = "p";
+
+    public OutputFormatter() {
+        this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
+    }
+
+    public OutputFormatter(int minParagraphText) {
+        this(minParagraphText, NODES_TO_REPLACE);
+    }
+
+    /**
+     * @param minParagraphText minimum text length for a paragraph to be kept
+     * @param nodesToReplace tag names stored in {@link #nodesToReplace}
+     */
+    public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
+        this.minParagraphText = minParagraphText;
+        this.nodesToReplace = nodesToReplace;
+    }
+
+    /**
+     * set elements to keep in output text
+     */
+    public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) {
+        this.nodesToKeepCssSelector = nodesToKeepCssSelector;
+    }
+
+    /**
+     * takes an element and turns the P tags into \n\n
+     * <p>
+     * Note that this removes low scoring children from {@code topNode} as a side effect.
+     */
+    public String getFormattedText(Element topNode) {
+        removeNodesWithNegativeScores(topNode);
+        StringBuilder sb = new StringBuilder();
+        append(topNode, sb, nodesToKeepCssSelector);
+        String str = SHelper.innerTrim(sb.toString());
+        if (str.length() > 100)
+            return str;
+
+        // no subelements
+        if (str.isEmpty()
+                || (!topNode.text().isEmpty() && str.length() <= topNode.ownText().length()))
+            str = topNode.text();
+
+        // if jsoup failed to parse the whole html now parse this smaller
+        // snippet again to avoid html tags disturbing our text:
+        return Jsoup.parse(str).text();
+    }
+
+    /**
+     * Takes an element and returns a list of texts extracted from the P tags
+     */
+    public List<String> getTextList(Element topNode) {
+        List<String> texts = new ArrayList<String>();
+        for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
+            if (element.hasText()) {
+                texts.add(element.text());
+            }
+        }
+        return texts;
+    }
+
+    /**
+     * If there are elements inside our top node that have a negative gravity
+     * score remove them. Scored elements with less text than the paragraph minimum are
+     * removed as well.
+     */
+    protected void removeNodesWithNegativeScores(Element topNode) {
+        Elements gravityItems = topNode.select("*[gravityScore]");
+        for (Element item : gravityItems) {
+            int score = Integer.parseInt(item.attr("gravityScore"));
+            if (score < 0 || item.text().length() < minParagraphText)
+                item.remove();
+        }
+    }
+
+    /**
+     * Appends the text of every element below {@code node} that matches the selector,
+     * separated by blank lines. Elements that are unlikely content themselves or sit
+     * inside such an element are skipped, and so are texts that are too short or consist
+     * of less than half letters.
+     *
+     * @param tagName CSS selector of the elements to collect
+     */
+    protected void append(Element node, StringBuilder sb, String tagName) {
+        // is select more costly then getElementsByTag?
+        MAIN: for (Element e : node.select(tagName)) {
+            Element tmpEl = e;
+            // check all elements until 'node'
+            while (tmpEl != null && !tmpEl.equals(node)) {
+                if (unlikely(tmpEl))
+                    continue MAIN;
+                tmpEl = tmpEl.parent();
+            }
+
+            String text = node2Text(e);
+            if (text.isEmpty() || text.length() < minParagraphText
+                    || text.length() > SHelper.countLetters(text) * 2)
+                continue;
+
+            sb.append(text);
+            sb.append("\n\n");
+        }
+    }
+
+    /**
+     * @return true if the node looks like a caption or is hidden according to the unlikely
+     *         pattern applied to its style and class attributes
+     */
+    boolean unlikely(Node e) {
+        String clazz = e.attr("class");
+        // Locale.ROOT: with the default locale a Turkish device lower-cases "CAPTION" with
+        // a dotless i and the check would miss it
+        if (clazz.toLowerCase(Locale.ROOT).contains("caption"))
+            return true;
+
+        String style = e.attr("style");
+        return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
+    }
+
+    /**
+     * Recursively collects the text below {@code e}, leaving out unlikely children. A blank
+     * is inserted in front of block elements and for {@code br} tags.
+     */
+    void appendTextSkipHidden(Element e, StringBuilder accum) {
+        for (Node child : e.childNodes()) {
+            if (unlikely(child))
+                continue;
+            if (child instanceof TextNode) {
+                TextNode textNode = (TextNode) child;
+                String txt = textNode.text();
+                accum.append(txt);
+            } else if (child instanceof Element) {
+                Element element = (Element) child;
+                if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
+                    accum.append(" ");
+                else if (element.tagName().equals("br"))
+                    accum.append(" ");
+                appendTextSkipHidden(element, accum);
+            }
+        }
+    }
+
+    boolean lastCharIsWhitespace(StringBuilder accum) {
+        if (accum.length() == 0)
+            return false;
+        return Character.isWhitespace(accum.charAt(accum.length() - 1));
+    }
+
+    protected String node2TextOld(Element el) {
+        return el.text();
+    }
+
+    protected String node2Text(Element el) {
+        StringBuilder sb = new StringBuilder(200);
+        appendTextSkipHidden(el, sb);
+        return sb.toString();
+    }
+
+    /**
+     * Replaces the regular expression used to detect hidden elements.
+     *
+     * @return this formatter, for call chaining
+     */
+    public OutputFormatter setUnlikelyPattern(String unlikelyPattern) {
+        this.unlikelyPattern = Pattern.compile(unlikelyPattern);
+        return this;
+    }
+
+    /**
+     * Adds an alternative to the regular expression used to detect hidden elements.
+     *
+     * @return this formatter, for call chaining
+     */
+    public OutputFormatter appendUnlikelyPattern(String str) {
+        return setUnlikelyPattern(unlikelyPattern.toString() + "|" + str);
+    }
+}
diff --git a/src/acr/browser/lightning/Reading/SCache.java b/src/acr/browser/lightning/Reading/SCache.java
new file mode 100644
index 0000000..ace929c
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/SCache.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011 Peter Karich
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package acr.browser.lightning.Reading;
+
+/**
+ * Cache of extraction results keyed by url, as used by HtmlFetcher.
+ *
+ * @author Peter Karich
+ */
+public interface SCache {
+
+ /**
+ * @return the result stored for the url; HtmlFetcher treats null as "not cached"
+ */
+ JResult get(String url);
+
+ /** Stores the result under the given url. */
+ void put(String url, JResult res);
+
+ // presumably the number of cached entries -- confirm with the implementations
+ int getSize();
+}
diff --git a/src/acr/browser/lightning/Reading/SHelper.java b/src/acr/browser/lightning/Reading/SHelper.java
new file mode 100644
index 0000000..ab58c5f
--- /dev/null
+++ b/src/acr/browser/lightning/Reading/SHelper.java
@@ -0,0 +1,480 @@
+/*
+ * Copyright 2011 Peter Karich
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package acr.browser.lightning.Reading;
+
+import java.io.UnsupportedEncodingException;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.security.SecureRandom;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+import java.text.SimpleDateFormat;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.net.ssl.KeyManager;
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+import org.jsoup.nodes.Element;
+
+/**
+ *
+ * @author Peter Karich
+ */
+public class SHelper {
+
+ /** Charset name passed to the URL encoder/decoder helpers of this class. */
+ public static final String UTF8 = "UTF-8";
+ /** Pre-compiled pattern matching a single space character. */
+ private static final Pattern SPACE = Pattern.compile(" ");
+
+ /**
+  * Trims the URL and replaces every remaining space with {@code %20}.
+  *
+  * @param url URL to clean; must not be {@code null}
+  * @return the empty string unchanged, otherwise the trimmed URL with all
+  *         spaces percent-encoded
+  */
+ public static String replaceSpaces(String url) {
+     if (url.isEmpty()) {
+         return url;
+     }
+     String trimmed = url.trim();
+     return trimmed.contains(" ") ? SPACE.matcher(trimmed).replaceAll("%20") : trimmed;
+ }
+
+ /**
+  * Counts the non-overlapping occurrences of {@code substring} in
+  * {@code str}, scanning from left to right.
+  *
+  * @param str text to search in
+  * @param substring text to search for; an empty value never matches
+  * @return number of non-overlapping occurrences, {@code 0} if none or if
+  *         {@code substring} is empty
+  */
+ public static int count(String str, String substring) {
+     // An empty needle matches at every position and would never advance.
+     if (substring.isEmpty()) {
+         return 0;
+     }
+
+     int occurrences = 0;
+     int from = str.indexOf(substring);
+     while (from >= 0) {
+         occurrences++;
+         // Continue behind the match so occurrences do not overlap.
+         from = str.indexOf(substring, from + substring.length());
+     }
+     return occurrences;
+ }
+
+ /**
+  * Collapses every run of spaces, tabs and newlines into a single space and
+  * trims the result.
+  *
+  * @param str text to normalise; must not be {@code null}
+  * @return the normalised text, or the empty string for empty input
+  */
+ public static String innerTrim(String str) {
+     if (str.isEmpty()) {
+         return "";
+     }
+
+     StringBuilder out = new StringBuilder();
+     boolean pendingGap = false;
+     for (char ch : str.toCharArray()) {
+         boolean isGap = ch == ' ' || ch == '\t' || ch == '\n';
+         if (isGap) {
+             // Remember the gap; it is emitted once the next visible char arrives.
+             pendingGap = true;
+         } else {
+             if (pendingGap) {
+                 out.append(' ');
+                 pendingGap = false;
+             }
+             out.append(ch);
+         }
+     }
+     return out.toString().trim();
+ }
+
+ /**
+  * Extracts the first run of characters that may appear in an encoding name
+  * (letters, digits, {@code -} and {@code _}): leading characters outside
+  * that set are skipped and the run ends at the next such character.
+  *
+  * @param str raw text containing an encoding name
+  * @return the extracted run, or the empty string if there is none
+  */
+ public static String encodingCleanup(String str) {
+     final int length = str.length();
+
+     int start = 0;
+     while (start < length && !isEncodingNameChar(str.charAt(start))) {
+         start++;
+     }
+
+     int end = start;
+     while (end < length && isEncodingNameChar(str.charAt(end))) {
+         end++;
+     }
+     return str.substring(start, end).trim();
+ }
+
+ /** Tells whether {@code c} is a letter, a digit, {@code -} or {@code _}. */
+ private static boolean isEncodingNameChar(char c) {
+     return Character.isDigit(c) || Character.isLetter(c) || c == '-' || c == '_';
+ }
+
+ /**
+  * Returns the longest common substring of the two arguments as text.
+  *
+  * @param str1 first string; the result is cut out of this one
+  * @param str2 second string
+  * @return {@code str1.substring(result[0], result[1])} for the range
+  *         reported by {@code longestSubstring}, or the empty string when
+  *         no range is reported or the range is empty
+  */
+ public static String getLongestSubstring(String str1, String str2) {
+     int[] range = longestSubstring(str1, str2);
+     boolean hasMatch = range != null && range[0] < range[1];
+     return hasMatch ? str1.substring(range[0], range[1]) : "";
+ }
+
+ /**
+  * Finds the longest common substring of two strings.
+  *
+  * @param str1 first string
+  * @param str2 second string
+  * @return {@code null} if either argument is {@code null} or empty;
+  *         otherwise a two-element array {@code {begin, end}} such that
+  *         {@code str1.substring(begin, end)} is the first longest common
+  *         substring ({@code {0, 0}} when nothing is shared)
+  */
+ public static int[] longestSubstring(String str1, String str2) {
+     if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty()) {
+         return null;
+     }
+
+     final int len1 = str1.length();
+     final int len2 = str2.length();
+
+     // Dynamic programming: entry j of a row holds the length of the common
+     // run ending at str1[i] / str2[j]. Row i only depends on row i - 1, so
+     // two rolling rows replace the full len1 x len2 table.
+     int[] previousRow = new int[len2];
+     int[] currentRow = new int[len2];
+
+     int maxLen = 0;
+     int begin = 0;
+     int end = 0;
+     for (int i = 0; i < len1; i++) {
+         final char c1 = str1.charAt(i);
+         for (int j = 0; j < len2; j++) {
+             if (c1 != str2.charAt(j)) {
+                 // Rows are reused, so stale values must be cleared.
+                 currentRow[j] = 0;
+                 continue;
+             }
+             int run = (i == 0 || j == 0) ? 1 : previousRow[j - 1] + 1;
+             currentRow[j] = run;
+             if (run > maxLen) {
+                 maxLen = run;
+                 // Range is expressed in str1 coordinates.
+                 begin = i - run + 1;
+                 end = i + 1;
+             }
+         }
+         int[] swap = previousRow;
+         previousRow = currentRow;
+         currentRow = swap;
+     }
+     return new int[] { begin, end };
+ }
+
+ /**
+  * Builds the conventional favicon location for the site of {@code url} by
+  * resolving {@code /favicon.ico} against it.
+  *
+  * @param url URL whose domain is used
+  * @return the resolved favicon URL
+  */
+ public static String getDefaultFavicon(String url) {
+     final String faviconPath = "/favicon.ico";
+     return useDomainOfFirstArg4Second(url, faviconPath);
+ }
+
+ /**
+  * Resolves a possibly relative {@code path} against the site of
+  * {@code urlForDomain}.
+  * <ul>
+  * <li>a path starting with {@code http} is returned unchanged;</li>
+  * <li>the bare name {@code favicon.ico} is treated as {@code /favicon.ico};</li>
+  * <li>a protocol-relative path ({@code //host/...}) and a root-relative
+  * path ({@code /...}) get the scheme of {@code urlForDomain}
+  * ({@code https} if it starts with {@code https:}, otherwise {@code http});</li>
+  * <li>a path starting with {@code ../} is appended to the base URL cut
+  * after its last slash;</li>
+  * <li>anything else is returned unchanged.</li>
+  * </ul>
+  *
+  * @param urlForDomain
+  *            extract the domain from this url
+  * @param path
+  *            this url does not have a domain
+  * @return the resolved URL
+  */
+ public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
+     if (path.startsWith("http")) {
+         return path;
+     }
+
+     if ("favicon.ico".equals(path)) {
+         path = "/favicon.ico";
+     }
+
+     // Keep the scheme of the base URL so resources of an https page are not
+     // downgraded to plain http.
+     final String scheme = urlForDomain.startsWith("https:") ? "https:" : "http:";
+
+     if (path.startsWith("//")) {
+         // wikipedia special case, see tests
+         return scheme + path;
+     }
+     if (path.startsWith("/")) {
+         return scheme + "//" + extractHost(urlForDomain) + path;
+     }
+     if (path.startsWith("../")) {
+         int slashIndex = urlForDomain.lastIndexOf("/");
+         if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length()) {
+             urlForDomain = urlForDomain.substring(0, slashIndex + 1);
+         }
+         return urlForDomain + path;
+     }
+     return path;
+ }
+
+ /**
+  * Returns the host part of a URL; shorthand for the non-aggressive form
+  * of {@code extractDomain}.
+  *
+  * @param url URL to inspect
+  * @return the URL without scheme and without anything after the first slash
+  */
+ public static String extractHost(String url) {
+     final boolean aggressive = false;
+     return extractDomain(url, aggressive);
+ }
+
+ /**
+  * Strips scheme and path from a URL.
+  *
+  * @param url URL to reduce
+  * @param aggressive when {@code true}, a leading {@code www.} and then a
+  *            leading {@code m.} (mobile sites) are removed as well
+  * @return what remains of the URL before its first slash
+  */
+ public static String extractDomain(String url, boolean aggressive) {
+     String rest = url;
+
+     // At most one scheme prefix is removed.
+     for (String scheme : new String[] { "http://", "https://" }) {
+         if (rest.startsWith(scheme)) {
+             rest = rest.substring(scheme.length());
+             break;
+         }
+     }
+
+     if (aggressive) {
+         // Order matters: "www." is stripped first, then the mobile "m." prefix.
+         for (String prefix : new String[] { "www.", "m." }) {
+             if (rest.startsWith(prefix)) {
+                 rest = rest.substring(prefix.length());
+             }
+         }
+     }
+
+     int slash = rest.indexOf('/');
+     return slash > 0 ? rest.substring(0, slash) : rest;
+ }
+
+ /**
+  * Tells whether the URL points to one of the known video sites
+  * (youtube.com, video.yahoo.com, vimeo.com, blip.tv), ignoring scheme and
+  * {@code www.}/{@code m.} prefixes.
+  *
+  * @param url URL to test
+  * @return {@code true} if the aggressively extracted domain starts with
+  *         one of the known hosts
+  */
+ public static boolean isVideoLink(String url) {
+     String domain = extractDomain(url, true);
+     String[] videoHosts = { "youtube.com", "video.yahoo.com", "vimeo.com", "blip.tv" };
+     for (String host : videoHosts) {
+         if (domain.startsWith(host)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known video file extension
+  * (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isVideo(String url) {
+     String[] extensions = { ".mpeg", ".mpg", ".avi", ".mov", ".mpg4", ".mp4", ".flv", ".wmv" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known audio file extension
+  * (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isAudio(String url) {
+     String[] extensions = { ".mp3", ".ogg", ".m3u", ".wav" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known document file extension
+  * (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isDoc(String url) {
+     String[] extensions = { ".pdf", ".ppt", ".doc", ".swf", ".rtf", ".xls" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known archive/package file extension
+  * (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isPackage(String url) {
+     String[] extensions = { ".gz", ".tgz", ".zip", ".rar", ".deb", ".rpm", ".7z" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known executable/application file
+  * extension (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isApp(String url) {
+     String[] extensions = { ".exe", ".bin", ".bat", ".dmg" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Tells whether the URL ends with a known image file extension
+  * (case-sensitive comparison).
+  *
+  * @param url URL to test
+  * @return {@code true} if the URL ends with one of the listed extensions
+  */
+ public static boolean isImage(String url) {
+     String[] extensions = { ".png", ".jpeg", ".gif", ".jpg", ".bmp", ".ico", ".eps" };
+     for (String extension : extensions) {
+         if (url.endsWith(extension)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Installs a new {@link CookieManager} with the
+  * {@link CookiePolicy#ACCEPT_ALL} policy as the process-wide default
+  * {@link CookieHandler}.
+  *
+  * @see <a href="http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se">Cookie handling in Java SE</a>
+  */
+ public static void enableCookieMgmt() {
+     // A null store makes the manager fall back to its default cookie store.
+     CookieManager acceptAll = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
+     CookieHandler.setDefault(acceptAll);
+ }
+
+ /**
+  * Sets the {@code http.agent} system property to the empty string.
+  *
+  * @see <a href="http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection">Setting user agent of a Java URLConnection</a>
+  */
+ public static void enableUserAgentOverwrite() {
+     final String agentProperty = "http.agent";
+     System.setProperty(agentProperty, "");
+ }
+
+ /**
+  * Extracts the target of a Google redirect link of the form
+  * {@code http://www.google.com/url?...&q=<target>&...}.
+  *
+  * @param url URL to inspect
+  * @return the URL-decoded value of the {@code q} parameter, or
+  *         {@code null} if the URL is not such a redirect or has no
+  *         {@code q} parameter
+  */
+ public static String getUrlFromUglyGoogleRedirect(String url) {
+     final String prefix = "http://www.google.com/url?";
+     if (!url.startsWith(prefix)) {
+         return null;
+     }
+
+     // Split the raw query first and decode only the value: decoding the
+     // whole query up front would turn an encoded '&' (%26) inside the
+     // target into a parameter separator and truncate the result.
+     String query = url.substring(prefix.length());
+     for (String param : query.split("&")) {
+         if (param.startsWith("q=")) {
+             return urlDecode(param.substring("q=".length()));
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Extracts the target of a Facebook redirect link of the form
+  * {@code http://www.facebook.com/l.php?u=<target>}.
+  *
+  * @param url URL to inspect
+  * @return everything after the {@code u=} prefix, URL-decoded, or
+  *         {@code null} if the URL does not start with that prefix
+  */
+ public static String getUrlFromUglyFacebookRedirect(String url) {
+     final String prefix = "http://www.facebook.com/l.php?u=";
+     return url.startsWith(prefix) ? urlDecode(url.substring(prefix.length())) : null;
+ }
+
+ /**
+  * URL-encodes a string using the {@link #UTF8} charset.
+  *
+  * @param str text to encode
+  * @return the encoded text, or {@code str} itself if the charset is not
+  *         supported by the runtime
+  */
+ public static String urlEncode(String str) {
+     String encoded;
+     try {
+         encoded = URLEncoder.encode(str, UTF8);
+     } catch (UnsupportedEncodingException ex) {
+         // Fall back to the raw input rather than failing.
+         encoded = str;
+     }
+     return encoded;
+ }
+
+ /**
+  * URL-decodes a string using the {@link #UTF8} charset.
+  *
+  * @param str text to decode
+  * @return the decoded text, or {@code str} itself if the charset is not
+  *         supported by the runtime
+  */
+ public static String urlDecode(String str) {
+     String decoded;
+     try {
+         decoded = URLDecoder.decode(str, UTF8);
+     } catch (UnsupportedEncodingException ex) {
+         // Fall back to the raw input rather than failing.
+         decoded = str;
+     }
+     return decoded;
+ }
+
+ /**
+  * Removes the first {@code #!} from the URL. Popular sites (facebook,
+  * twitter, gizmodo, ...) use the hashbang to mark the characters that
+  * follow as significant.
+  *
+  * @param url URL to clean
+  * @return the URL without its first {@code #!}; unchanged content if it
+  *         has none
+  */
+ public static String removeHashbang(String url) {
+     final String hashbang = "#!";
+     int at = url.indexOf(hashbang);
+     if (at < 0) {
+         return url;
+     }
+     return url.substring(0, at) + url.substring(at + hashbang.length());
+ }
+
+ /**
+  * Renders the element tree below {@code root} as text, starting without
+  * indentation.
+  *
+  * @param root element to print
+  * @return the textual dump of the tree
+  */
+ public static String printNode(Element root) {
+     final int noIndentation = 0;
+     return printNode(root, noIndentation);
+ }
+
+ /**
+  * Renders the element tree below {@code root} as text: one line per
+  * element in the form {@code tagName:ownText}, prefixed with
+  * {@code indentation} spaces, children one level deeper. Every child dump
+  * is followed by an extra newline.
+  *
+  * @param root element to print
+  * @param indentation number of leading spaces for {@code root}
+  * @return the textual dump of the tree
+  */
+ public static String printNode(Element root, int indentation) {
+     StringBuilder out = new StringBuilder();
+     int remaining = indentation;
+     while (remaining-- > 0) {
+         out.append(' ');
+     }
+     out.append(root.tagName()).append(":").append(root.ownText()).append("\n");
+
+     final int childIndentation = indentation + 1;
+     for (Element child : root.children()) {
+         out.append(printNode(child, childIndentation)).append("\n");
+     }
+     return out.toString();
+ }
+
+ /**
+  * Guesses a publication date from the path of a URL, e.g.
+  * {@code http://host/blog/2010/02/15/title} yields {@code 2010/02/15}.
+  * <p>
+  * A 4-character segment holding a number in 1970..3000 is taken as year,
+  * a 2-character segment directly after the year as month (1..12) and a
+  * 2-character segment directly after the month as day (1..31). Segments
+  * that do not qualify are ignored and never discard what was already
+  * found.
+  *
+  * @param url URL to analyse
+  * @return {@code yyyy}, {@code yyyy/MM} or {@code yyyy/MM/dd} depending on
+  *         how much could be detected, or {@code null} if no year was found
+  */
+ public static String estimateDate(String url) {
+     int index = url.indexOf("://");
+     if (index > 0) {
+         url = url.substring(index + 3);
+     }
+
+     int year = -1;
+     int yearCounter = -1;
+     int month = -1;
+     int monthCounter = -1;
+     int day = -1;
+     String[] strs = url.split("/");
+     for (int counter = 0; counter < strs.length; counter++) {
+         String str = strs[counter];
+         if (str.length() == 4) {
+             // Validate before assigning so that an unrelated number such as
+             // "0042" cannot wipe out a year found earlier.
+             int candidate = parseDatePart(str);
+             if (candidate < 1970 || candidate > 3000) {
+                 continue;
+             }
+             year = candidate;
+             yearCounter = counter;
+         } else if (str.length() == 2) {
+             if (monthCounter < 0 && yearCounter >= 0 && counter == yearCounter + 1) {
+                 // Month must directly follow a detected year.
+                 int candidate = parseDatePart(str);
+                 if (candidate < 1 || candidate > 12) {
+                     continue;
+                 }
+                 month = candidate;
+                 monthCounter = counter;
+             } else if (monthCounter >= 0 && counter == monthCounter + 1) {
+                 // Day must directly follow a detected month.
+                 int candidate = parseDatePart(str);
+                 if (candidate < 1 || candidate > 31) {
+                     continue;
+                 }
+                 day = candidate;
+                 break;
+             }
+         }
+     }
+
+     if (year < 0) {
+         return null;
+     }
+
+     StringBuilder str = new StringBuilder();
+     str.append(year);
+     if (month < 1) {
+         return str.toString();
+     }
+
+     str.append('/');
+     if (month < 10) {
+         str.append('0');
+     }
+     str.append(month);
+     if (day < 1) {
+         return str.toString();
+     }
+
+     str.append('/');
+     if (day < 10) {
+         str.append('0');
+     }
+     str.append(day);
+     return str.toString();
+ }
+
+ /** Parses a URL segment as integer; returns -1 if it is not a number. */
+ private static int parseDatePart(String segment) {
+     try {
+         return Integer.parseInt(segment);
+     } catch (NumberFormatException notANumber) {
+         // -1 lies outside every accepted year/month/day range.
+         return -1;
+     }
+ }
+
+ /**
+  * Pads a partial {@code yyyy} or {@code yyyy/MM} date with {@code /01}
+  * parts so that it has year, month and day.
+  *
+  * @param dateStr date as produced by {@code estimateDate}; may be
+  *            {@code null}
+  * @return {@code null} for {@code null} input; the input unchanged if it
+  *         already contains two slashes; otherwise the input with the
+  *         missing parts appended
+  */
+ public static String completeDate(String dateStr) {
+     if (dateStr == null) {
+         return null;
+     }
+
+     int firstSlash = dateStr.indexOf('/');
+     if (firstSlash <= 0) {
+         // Year only: month and day are missing.
+         return dateStr + "/01/01";
+     }
+
+     int secondSlash = dateStr.indexOf('/', firstSlash + 1);
+     return secondSlash > 0 ? dateStr : dateStr + "/01";
+ }
+
+ /**
+  * Creates a new formatter for the {@code yyyy/MM/dd} pattern in the
+  * default locale.
+  * <p>
+  * Keep in mind: {@link SimpleDateFormat} is not thread safe. Call
+  * {@code completeDate} on a date string before parsing it with this
+  * formatter.
+  *
+  * @return a fresh formatter instance
+  */
+ public static SimpleDateFormat createDateFormatter() {
+     final Locale locale = Locale.getDefault();
+     return new SimpleDateFormat("yyyy/MM/dd", locale);
+ }
+
+ /**
+  * Installs, as the process-wide default {@link SSLContext}, a TLS context
+  * whose only trust manager is {@code DefaultTrustManager}.
+  * <p>
+  * WARNING: that trust manager performs no certificate checks, so after
+  * this call every connection created from the default context accepts any
+  * server certificate. Failures while building the context are only
+  * printed, not propagated.
+  * <p>
+  * Written with the help of
+  * http://stackoverflow.com/questions/1828775/httpclient-and-ssl
+  */
+ public static void enableAnySSL() {
+     try {
+         SSLContext permissive = SSLContext.getInstance("TLS");
+         TrustManager[] trustEverything = { new DefaultTrustManager() };
+         permissive.init(new KeyManager[0], trustEverything, new SecureRandom());
+         SSLContext.setDefault(permissive);
+     } catch (Exception ex) {
+         ex.printStackTrace();
+     }
+ }
+
+ /**
+  * Trust manager that accepts every client and server certificate chain.
+  * <p>
+  * WARNING: both check methods are intentionally empty, i.e. no validation
+  * takes place at all. Only used by {@code enableAnySSL}.
+  */
+ private static class DefaultTrustManager implements X509TrustManager {
+
+     /** Accepts any client certificate chain without inspection. */
+     @Override
+     public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+             throws CertificateException {
+     }
+
+     /** Accepts any server certificate chain without inspection. */
+     @Override
+     public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+             throws CertificateException {
+     }
+
+     /**
+      * @return an empty array; the interface contract requires a non-null
+      *         result, and callers may dereference it without a null check
+      */
+     @Override
+     public X509Certificate[] getAcceptedIssuers() {
+         return new X509Certificate[0];
+     }
+ }
+
+ /**
+  * Counts the characters of {@code str} that are letters according to
+  * {@link Character#isLetter(char)}.
+  *
+  * @param str text to inspect
+  * @return number of letter characters
+  */
+ public static int countLetters(String str) {
+     int letters = 0;
+     for (char ch : str.toCharArray()) {
+         if (Character.isLetter(ch)) {
+             letters++;
+         }
+     }
+     return letters;
+ }
+}
diff --git a/src/acr/browser/lightning/ReadingActivity.java b/src/acr/browser/lightning/ReadingActivity.java
new file mode 100644
index 0000000..ada7513
--- /dev/null
+++ b/src/acr/browser/lightning/ReadingActivity.java
@@ -0,0 +1,153 @@
+package acr.browser.lightning;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import acr.browser.lightning.Reading.HtmlFetcher;
+import acr.browser.lightning.Reading.JResult;
+import android.animation.ObjectAnimator;
+import android.app.ProgressDialog;
+import android.content.Context;
+import android.content.Intent;
+import android.os.AsyncTask;
+import android.os.Bundle;
+import android.support.v7.app.ActionBarActivity;
+import android.support.v7.widget.Toolbar;
+import android.view.MenuItem;
+import android.view.View;
+import android.widget.TextView;
+
+public class ReadingActivity extends ActionBarActivity {
+
+ /** View showing the article title; looked up in onCreate. */
+ private TextView mTitle;
+ /** View showing the article body text; looked up in onCreate. */
+ private TextView mBody;
+
+ /**
+  * Sets up the reading layout and toolbar, puts placeholder text into the
+  * (initially invisible) title and body views and starts loading the page
+  * named by the launching intent. If loading cannot be started, the
+  * failure message is shown instead.
+  */
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+     super.onCreate(savedInstanceState);
+     setContentView(R.layout.reading_view);
+
+     setSupportActionBar((Toolbar) findViewById(R.id.toolbar));
+     getSupportActionBar().setDisplayHomeAsUpEnabled(true);
+
+     // Both views stay invisible until setText fades them in.
+     mTitle = (TextView) findViewById(R.id.textViewTitle);
+     mTitle.setText(getString(R.string.untitled));
+     mTitle.setVisibility(View.INVISIBLE);
+
+     mBody = (TextView) findViewById(R.id.textViewBody);
+     mBody.setText(getString(R.string.loading));
+     mBody.setVisibility(View.INVISIBLE);
+
+     boolean loadingStarted = loadPage(getIntent());
+     if (!loadingStarted) {
+         setText(getString(R.string.untitled), getString(R.string.loading_failed));
+     }
+ }
+
+ /**
+  * Starts the asynchronous load of the URL carried in the intent extra
+  * {@code Constants.LOAD_READING_URL} and shows the URL's domain name as
+  * action bar title.
+  *
+  * @param intent intent that launched the activity; may be {@code null}
+  * @return {@code true} if a load was started, {@code false} if the intent
+  *         or its URL extra is missing
+  */
+ protected boolean loadPage(Intent intent) {
+     String url = (intent == null) ? null : intent.getStringExtra(Constants.LOAD_READING_URL);
+     if (url == null) {
+         return false;
+     }
+
+     getSupportActionBar().setTitle(Utils.getDomainName(url));
+     PageLoader loader = new PageLoader(this);
+     loader.execute(url);
+     return true;
+ }
+
+ /**
+  * Background task that fetches a page, extracts its title and text
+  * paragraphs and hands them to {@code setText}. A non-cancelable progress
+  * dialog is shown while the task runs.
+  */
+ private class PageLoader extends AsyncTask<String, Void, Void> {
+
+     private final Context mContext;
+     private ProgressDialog mProgressDialog;
+     private String mTitleText;
+     private List<String> mBodyText;
+
+     public PageLoader(Context context) {
+         mContext = context;
+     }
+
+     /** Shows the indeterminate "loading" dialog before work starts. */
+     @Override
+     protected void onPreExecute() {
+         super.onPreExecute();
+         mProgressDialog = new ProgressDialog(mContext);
+         mProgressDialog.setProgressStyle(ProgressDialog.STYLE_SPINNER);
+         mProgressDialog.setCancelable(false);
+         mProgressDialog.setIndeterminate(true);
+         mProgressDialog.setMessage(mContext.getString(R.string.loading));
+         mProgressDialog.show();
+     }
+
+     /**
+      * Fetches and extracts the page at {@code params[0]}. Any failure,
+      * including running out of memory, leaves an empty title and body so
+      * that onPostExecute shows the failure message.
+      */
+     @Override
+     protected Void doInBackground(String... params) {
+
+         HtmlFetcher fetcher = new HtmlFetcher();
+         try {
+             JResult result = fetcher.fetchAndExtract(params[0], 5000, true);
+             mTitleText = result.getTitle();
+             mBodyText = result.getTextList();
+         } catch (Exception e) {
+             mTitleText = "";
+             mBodyText = new ArrayList<>();
+             e.printStackTrace();
+         } catch (OutOfMemoryError e) {
+             System.gc();
+             mTitleText = "";
+             mBodyText = new ArrayList<>();
+             e.printStackTrace();
+         }
+
+         // The extractor may hand back null values; normalise them so the
+         // UI thread can rely on non-null fields.
+         if (mTitleText == null) {
+             mTitleText = "";
+         }
+         if (mBodyText == null) {
+             mBodyText = new ArrayList<>();
+         }
+         return null;
+     }
+
+     /** Hides the dialog and shows either the article or the failure text. */
+     @Override
+     protected void onPostExecute(Void result) {
+         if (mProgressDialog != null && mProgressDialog.isShowing()) {
+             try {
+                 mProgressDialog.dismiss();
+             } catch (IllegalArgumentException ignored) {
+                 // The dialog's window is already gone because the activity
+                 // was torn down while loading; nothing left to dismiss.
+             }
+         }
+
+         if (isFinishing()) {
+             // Nobody is going to see the result any more.
+             super.onPostExecute(result);
+             return;
+         }
+
+         if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
+             setText(getString(R.string.untitled), getString(R.string.loading_failed));
+         } else {
+             StringBuilder builder = new StringBuilder();
+             for (String text : mBodyText) {
+                 builder.append(text).append("\n\n");
+             }
+             setText(mTitleText, builder.toString());
+         }
+         super.onPostExecute(result);
+     }
+
+ }
+
+ /**
+  * Shows the given title and body. A view that is still invisible is made
+  * visible and faded in over 300 ms; a view that is already visible just
+  * gets the new text.
+  *
+  * @param title text for the title view
+  * @param body text for the body view
+  */
+ private void setText(String title, String body) {
+     revealText(mTitle, title);
+     revealText(mBody, body);
+ }
+
+ /** Sets the text of one view, fading the view in if it was invisible. */
+ private void revealText(TextView view, String text) {
+     if (view.getVisibility() != View.INVISIBLE) {
+         view.setText(text);
+         return;
+     }
+
+     // Start fully transparent so the alpha animation produces a fade-in.
+     view.setAlpha(0.0f);
+     view.setVisibility(View.VISIBLE);
+     view.setText(text);
+     ObjectAnimator fadeIn = ObjectAnimator.ofFloat(view, "alpha", 1.0f);
+     fadeIn.setDuration(300);
+     fadeIn.start();
+ }
+
+ /**
+  * Closes the activity when the toolbar's home/up button is pressed. Any
+  * other item is left to the default handling instead of closing the
+  * reader as well.
+  *
+  * @param item the selected menu item
+  * @return {@code true} if the home button was handled here, otherwise the
+  *         result of the superclass
+  */
+ @Override
+ public boolean onOptionsItemSelected(MenuItem item) {
+     if (item.getItemId() == android.R.id.home) {
+         finish();
+         return true;
+     }
+     return super.onOptionsItemSelected(item);
+ }
+
+}