diff --git a/AndroidManifest.xml b/AndroidManifest.xml index 7b68d0c..5c2bcc0 100644 --- a/AndroidManifest.xml +++ b/AndroidManifest.xml @@ -200,6 +200,17 @@ + + + + + + + diff --git a/libs/jsoup-1.8.1.jar b/libs/jsoup-1.8.1.jar new file mode 100644 index 0000000..ae717d4 Binary files /dev/null and b/libs/jsoup-1.8.1.jar differ diff --git a/res/layout/license_activity.xml b/res/layout/license_activity.xml index 53a53f5..a9c024e 100644 --- a/res/layout/license_activity.xml +++ b/res/layout/license_activity.xml @@ -140,5 +140,39 @@ android:layout_marginLeft="10dp" android:layout_marginRight="10dp" android:background="#cdcdcd" /> + + + + + + + + + \ No newline at end of file diff --git a/res/layout/reading_view.xml b/res/layout/reading_view.xml new file mode 100644 index 0000000..c086420 --- /dev/null +++ b/res/layout/reading_view.xml @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/res/menu-xlarge/main.xml b/res/menu-xlarge/main.xml index edf26c2..b9b33df 100644 --- a/res/menu-xlarge/main.xml +++ b/res/menu-xlarge/main.xml @@ -1,73 +1,77 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/res/menu/main.xml b/res/menu/main.xml index a1a73a0..9631c95 100644 --- a/res/menu/main.xml +++ b/res/menu/main.xml @@ -23,6 +23,7 @@ + \ No newline at end of file diff --git a/res/values/strings.xml b/res/values/strings.xml index cc56b3a..05287d3 100644 --- a/res/values/strings.xml +++ b/res/values/strings.xml @@ -198,4 +198,8 @@ Block 3rd Party Cookies This feature is only available on Android 5.0+ Enable Color Mode + Reader Mode + Loading… + Couldn\'t load anything from the page. 
+ Snacktory diff --git a/src/acr/browser/lightning/BrowserActivity.java b/src/acr/browser/lightning/BrowserActivity.java index c8a4e2b..966d80e 100644 --- a/src/acr/browser/lightning/BrowserActivity.java +++ b/src/acr/browser/lightning/BrowserActivity.java @@ -179,7 +179,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl mDrawerListRight.setDividerHeight(0); setNavigationDrawerWidth(); mDrawerLayout.setDrawerListener(new DrawerLocker()); - + mWebpageBitmap = BitmapFactory.decodeResource(getResources(), R.drawable.ic_webpage); mActionBar = getSupportActionBar(); final TypedArray styledAttributes = mContext.getTheme().obtainStyledAttributes( @@ -350,7 +350,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl }; anim.setDuration(300); anim.setInterpolator(new DecelerateInterpolator()); - anim.setAnimationListener(new AnimationListener(){ + anim.setAnimationListener(new AnimationListener() { @Override public void onAnimationStart(Animation animation) { @@ -368,7 +368,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @Override public void onAnimationRepeat(Animation animation) { } - + }); new Handler().postDelayed(new Runnable() { @@ -488,12 +488,12 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl checkForTor(); } - + private class DrawerLocker implements DrawerListener { @Override public void onDrawerClosed(View v) { - if(v == mDrawerRight){ + if (v == mDrawerRight) { mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerLeft); } else { mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerRight); @@ -502,7 +502,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @Override public void onDrawerOpened(View v) { - if(v == mDrawerRight){ + if (v == mDrawerRight) { mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerLeft); } else { 
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerRight); @@ -516,7 +516,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @Override public void onDrawerStateChanged(int arg) { } - + } public boolean handleMenuItemClick(MenuItem item) { @@ -596,6 +596,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl case R.id.action_find: findInPage(); return true; + case R.id.action_reading_mode: + Intent read = new Intent(this, ReadingActivity.class); + read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl()); + startActivity(read); + return true; default: return super.onOptionsItemSelected(item); } @@ -912,6 +917,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl case R.id.action_find: findInPage(); return true; + case R.id.action_reading_mode: + Intent read = new Intent(this, ReadingActivity.class); + read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl()); + startActivity(read); + return true; default: return super.onOptionsItemSelected(item); } @@ -1622,7 +1632,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl } }); - + ViewCompat.jumpDrawablesToCurrentState(holder.exit); LightningView web = data.get(position); diff --git a/src/acr/browser/lightning/Constants.java b/src/acr/browser/lightning/Constants.java index da89b36..cfb6477 100644 --- a/src/acr/browser/lightning/Constants.java +++ b/src/acr/browser/lightning/Constants.java @@ -29,6 +29,8 @@ public final class Constants { public static final String JAVASCRIPT_INVERT_PAGE = "javascript:(function(){var e='img {-webkit-filter: invert(100%);'+'-moz-filter: invert(100%);'+'-o-filter: invert(100%);'+'-ms-filter: invert(100%); }',t=document.getElementsByTagName('head')[0],n=document.createElement('style');if(!window.counter){window.counter=1}else{window.counter++;if(window.counter%2==0){var e='html {-webkit-filter: invert(0%); -moz-filter: 
invert(0%); -o-filter: invert(0%); -ms-filter: invert(0%); }'}}n.type='text/css';if(n.styleSheet){n.styleSheet.cssText=e}else{n.appendChild(document.createTextNode(e))}t.appendChild(n)})();"; public static final String JAVASCRIPT_TEXT_REFLOW = "javascript:document.getElementsByTagName('body')[0].style.width=window.innerWidth+'px';"; + public static final String LOAD_READING_URL = "ReadingUrl"; + public static final String SEPARATOR = "\\|\\$\\|SEPARATOR\\|\\$\\|"; public static final String HTTP = "http://"; public static final String HTTPS = "https://"; diff --git a/src/acr/browser/lightning/LicenseActivity.java b/src/acr/browser/lightning/LicenseActivity.java index 15e9642..7b1885c 100644 --- a/src/acr/browser/lightning/LicenseActivity.java +++ b/src/acr/browser/lightning/LicenseActivity.java @@ -30,6 +30,7 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi findViewById(R.id.licenseAOSP).setOnClickListener(this); findViewById(R.id.licenseHosts).setOnClickListener(this); findViewById(R.id.licenseOrbot).setOnClickListener(this); + findViewById(R.id.licenseSnactory).setOnClickListener(this); } @Override @@ -47,6 +48,9 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi case R.id.licenseOrbot: actionView("http://www.gnu.org/licenses/lgpl.html"); break; + case R.id.licenseSnactory: + actionView("http://www.apache.org/licenses/LICENSE-2.0"); + break; } } diff --git a/src/acr/browser/lightning/Reading/ArticleTextExtractor.java b/src/acr/browser/lightning/Reading/ArticleTextExtractor.java new file mode 100644 index 0000000..3a9188a --- /dev/null +++ b/src/acr/browser/lightning/Reading/ArticleTextExtractor.java @@ -0,0 +1,619 @@ +package acr.browser.lightning.Reading; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; 
+import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import android.util.Log; + +/** + * This class is thread safe. + * + * @author Alex P (ifesdjeen from jreadability) + * @author Peter Karich + */ +public class ArticleTextExtractor { + + // Interesting nodes + private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section"); + // Unlikely candidates + private String unlikelyStr; + private Pattern UNLIKELY; + // Most likely positive candidates + private String positiveStr; + private Pattern POSITIVE; + // Most likely negative candidates + private String negativeStr; + private Pattern NEGATIVE; + private static final Pattern NEGATIVE_STYLE = Pattern + .compile("hidden|display: ?none|font-size: ?small"); + private static final Set IGNORED_TITLE_PARTS = new LinkedHashSet() { + { + add("hacker news"); + add("facebook"); + } + }; + private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter(); + private OutputFormatter formatter = DEFAULT_FORMATTER; + + public ArticleTextExtractor() { + setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|" + + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor" + + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|" + + "login|si(debar|gn|ngle)"); + setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))" + + "|arti(cle|kel)|instapaper_body"); + setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|" + + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|" + + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard"); + } + + public ArticleTextExtractor setUnlikely(String unlikelyStr) { + this.unlikelyStr = unlikelyStr; + UNLIKELY = Pattern.compile(unlikelyStr); + return this; + } + + public 
ArticleTextExtractor addUnlikely(String unlikelyMatches) { + return setUnlikely(unlikelyStr + "|" + unlikelyMatches); + } + + public ArticleTextExtractor setPositive(String positiveStr) { + this.positiveStr = positiveStr; + POSITIVE = Pattern.compile(positiveStr); + return this; + } + + public ArticleTextExtractor addPositive(String pos) { + return setPositive(positiveStr + "|" + pos); + } + + public ArticleTextExtractor setNegative(String negativeStr) { + this.negativeStr = negativeStr; + NEGATIVE = Pattern.compile(negativeStr); + return this; + } + + public ArticleTextExtractor addNegative(String neg) { + setNegative(negativeStr + "|" + neg); + return this; + } + + public void setOutputFormatter(OutputFormatter formatter) { + this.formatter = formatter; + } + + /** + * @param html + * extracts article text from given html string. wasn't tested + * with improper HTML, although jSoup should be able to handle + * minor stuff. + * @returns extracted article, all HTML tags stripped + */ + public JResult extractContent(Document doc) throws Exception { + return extractContent(new JResult(), doc, formatter); + } + + public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception { + return extractContent(new JResult(), doc, formatter); + } + + public JResult extractContent(String html) throws Exception { + return extractContent(new JResult(), html); + } + + public JResult extractContent(JResult res, String html) throws Exception { + return extractContent(res, html, formatter); + } + + public JResult extractContent(JResult res, String html, OutputFormatter formatter) + throws Exception { + if (html.isEmpty()) + throw new IllegalArgumentException("html string is empty!?"); + + // http://jsoup.org/cookbook/extracting-data/selector-syntax + return extractContent(res, Jsoup.parse(html), formatter); + } + + public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) + throws Exception { + if (doc == null) + throw new 
NullPointerException("missing document"); + + res.setTitle(extractTitle(doc)); + res.setDescription(extractDescription(doc)); + res.setCanonicalUrl(extractCanonicalUrl(doc)); + + // now remove the clutter + prepareDocument(doc); + + // init elements + Collection nodes = getNodes(doc); + int maxWeight = 0; + Element bestMatchElement = null; + for (Element entry : nodes) { + int currentWeight = getWeight(entry); + if (currentWeight > maxWeight) { + maxWeight = currentWeight; + bestMatchElement = entry; + if (maxWeight > 200) + break; + } + } + + if (bestMatchElement != null) { + List images = new ArrayList(); + Element imgEl = determineImageSource(bestMatchElement, images); + if (imgEl != null) { + res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src"))); + // TODO remove parent container of image if it is contained in + // bestMatchElement + // to avoid image subtitles flooding in + + res.setImages(images); + } + + // clean before grabbing text + String text = formatter.getFormattedText(bestMatchElement); + text = removeTitleFromText(text, res.getTitle()); + // this fails for short facebook post and probably tweets: + // text.length() > res.getDescription().length() + if (text.length() > res.getTitle().length()) { + res.setText(text); + // print("best element:", bestMatchElement); + } + res.setTextList(formatter.getTextList(bestMatchElement)); + } + + if (res.getImageUrl().isEmpty()) { + res.setImageUrl(extractImageUrl(doc)); + } + + res.setRssUrl(extractRssUrl(doc)); + res.setVideoUrl(extractVideoUrl(doc)); + res.setFaviconUrl(extractFaviconUrl(doc)); + res.setKeywords(extractKeywords(doc)); + return res; + } + + protected String extractTitle(Document doc) { + String title = cleanTitle(doc.title()); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("head title").text()); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content")); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("head 
meta[property=og:title]").attr( + "content")); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr( + "content")); + } + } + } + } + return title; + } + + protected String extractCanonicalUrl(Document doc) { + String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href")); + if (url.isEmpty()) { + url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content")); + if (url.isEmpty()) { + url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr( + "content")); + } + } + return url; + } + + protected String extractDescription(Document doc) { + String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr( + "content")); + if (description.isEmpty()) { + description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr( + "content")); + if (description.isEmpty()) { + description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]") + .attr("content")); + } + } + return description; + } + + protected Collection extractKeywords(Document doc) { + String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content")); + + if (content != null) { + if (content.startsWith("[") && content.endsWith("]")) + content = content.substring(1, content.length() - 1); + + String[] split = content.split("\\s*,\\s*"); + if (split.length > 1 || (split.length > 0 && !"".equals(split[0]))) + return Arrays.asList(split); + } + return Collections.emptyList(); + } + + /** + * Tries to extract an image url from metadata if determineImageSource + * failed + * + * @return image url or empty str + */ + protected String extractImageUrl(Document doc) { + // use open graph tag to get image + String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr( + "content")); + if (imageUrl.isEmpty()) { + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr( + 
"content")); + if (imageUrl.isEmpty()) { + // prefer link over thumbnail-meta if empty + imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href")); + if (imageUrl.isEmpty()) { + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr( + "content")); + } + } + } + return imageUrl; + } + + protected String extractRssUrl(Document doc) { + return SHelper.replaceSpaces(doc.select("link[rel=alternate]") + .select("link[type=application/rss+xml]").attr("href")); + } + + protected String extractVideoUrl(Document doc) { + return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content")); + } + + protected String extractFaviconUrl(Document doc) { + String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href")); + if (faviconUrl.isEmpty()) { + faviconUrl = SHelper.replaceSpaces(doc.select( + "head link[rel^=shortcut],link[rel$=icon]").attr("href")); + } + return faviconUrl; + } + + /** + * Weights current element. By matching it with positive candidates and + * weighting child nodes. Since it's impossible to predict which exactly + * names, ids or class names will be used in HTML, major role is played by + * child nodes + * + * @param e + * Element to weight, along with child nodes + */ + protected int getWeight(Element e) { + int weight = calcWeight(e); + weight += (int) Math.round(e.ownText().length() / 100.0 * 10); + weight += weightChildNodes(e); + return weight; + } + + /** + * Weights a child nodes of given Element. During tests some difficulties + * were met. For instanance, not every single document has nested paragraph + * tags inside of the major article tag. Sometimes people are adding one + * more nesting level. So, we're adding 4 points for every 100 symbols + * contained in tag nested inside of the current weighted element, but only + * 3 points for every element that's nested 2 levels deep. 
This way we give + * more chances to extract the element that has less nested levels, + * increasing probability of the correct extraction. + * + * @param rootEl + * Element, whose child nodes will be weighted + */ + protected int weightChildNodes(Element rootEl) { + int weight = 0; + Element caption = null; + List pEls = new ArrayList(5); + for (Element child : rootEl.children()) { + String ownText = child.ownText(); + int ownTextLength = ownText.length(); + if (ownTextLength < 20) + continue; + + if (ownTextLength > 200) + weight += Math.max(50, ownTextLength / 10); + + if (child.tagName().equals("h1") || child.tagName().equals("h2")) { + weight += 30; + } else if (child.tagName().equals("div") || child.tagName().equals("p")) { + weight += calcWeightForChild(child, ownText); + if (child.tagName().equals("p") && ownTextLength > 50) + pEls.add(child); + + if (child.className().toLowerCase(Locale.getDefault()).equals("caption")) + caption = child; + } + } + + // use caption and image + if (caption != null) + weight += 30; + + if (pEls.size() >= 2) { + for (Element subEl : rootEl.children()) { + if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) { + weight += 20; + // headerEls.add(subEl); + } else if ("table;li;td;th".contains(subEl.tagName())) { + addScore(subEl, -30); + } + + if ("p".contains(subEl.tagName())) + addScore(subEl, 30); + } + } + return weight; + } + + public void addScore(Element el, int score) { + int old = getScore(el); + setScore(el, score + old); + } + + public int getScore(Element el) { + int old = 0; + try { + old = Integer.parseInt(el.attr("gravityScore")); + } catch (Exception ex) { + } + return old; + } + + public void setScore(Element el, int score) { + el.attr("gravityScore", Integer.toString(score)); + } + + private int calcWeightForChild(Element child, String ownText) { + int c = SHelper.count(ownText, "&quot;"); + c += SHelper.count(ownText, "&lt;"); + c += SHelper.count(ownText, "&gt;"); + c += SHelper.count(ownText, "px"); + int val; + if (c > 
5) + val = -30; + else + val = (int) Math.round(ownText.length() / 25.0); + + addScore(child, val); + return val; + } + + private int calcWeight(Element e) { + int weight = 0; + if (POSITIVE.matcher(e.className()).find()) + weight += 35; + + if (POSITIVE.matcher(e.id()).find()) + weight += 40; + + if (UNLIKELY.matcher(e.className()).find()) + weight -= 20; + + if (UNLIKELY.matcher(e.id()).find()) + weight -= 20; + + if (NEGATIVE.matcher(e.className()).find()) + weight -= 50; + + if (NEGATIVE.matcher(e.id()).find()) + weight -= 50; + + String style = e.attr("style"); + if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) + weight -= 50; + return weight; + } + + public Element determineImageSource(Element el, List images) { + int maxWeight = 0; + Element maxNode = null; + Elements els = el.select("img"); + if (els.isEmpty()) + els = el.parent().select("img"); + + double score = 1; + for (Element e : els) { + String sourceUrl = e.attr("src"); + if (sourceUrl.isEmpty() || isAdImage(sourceUrl)) + continue; + + int weight = 0; + int height = 0; + try { + height = Integer.parseInt(e.attr("height")); + if (height >= 50) + weight += 20; + else + weight -= 20; + } catch (Exception ex) { + } + + int width = 0; + try { + width = Integer.parseInt(e.attr("width")); + if (width >= 50) + weight += 20; + else + weight -= 20; + } catch (Exception ex) { + } + String alt = e.attr("alt"); + if (alt.length() > 35) + weight += 20; + + String title = e.attr("title"); + if (title.length() > 35) + weight += 20; + + String rel = null; + boolean noFollow = false; + if (e.parent() != null) { + rel = e.parent().attr("rel"); + if (rel != null && rel.contains("nofollow")) { + noFollow = rel.contains("nofollow"); + weight -= 40; + } + } + + weight = (int) (weight * score); + if (weight > maxWeight) { + maxWeight = weight; + maxNode = e; + score = score / 2; + } + + ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, + noFollow); + 
images.add(image); + } + + Collections.sort(images, new ImageComparator()); + return maxNode; + } + + /** + * Prepares document. Currently only stripping unlikely candidates, since + * from time to time they're getting more score than good ones especially in + * cases when major text is short. + * + * @param doc + * document to prepare. Passed as reference, and changed inside + * of function + */ + protected void prepareDocument(Document doc) { + // stripUnlikelyCandidates(doc); + removeScriptsAndStyles(doc); + } + + /** + * Removes unlikely candidates from HTML. Currently takes id and class name + * and matches them against list of patterns + * + * @param doc + * document to strip unlikely candidates from + */ + protected void stripUnlikelyCandidates(Document doc) { + for (Element child : doc.select("body").select("*")) { + String className = child.className().toLowerCase(Locale.getDefault()); + String id = child.id().toLowerCase(Locale.getDefault()); + + if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) { + // print("REMOVE:", child); + child.remove(); + } + } + } + + private Document removeScriptsAndStyles(Document doc) { + Elements scripts = doc.getElementsByTag("script"); + for (Element item : scripts) { + item.remove(); + } + + Elements noscripts = doc.getElementsByTag("noscript"); + for (Element item : noscripts) { + item.remove(); + } + + Elements styles = doc.getElementsByTag("style"); + for (Element style : styles) { + style.remove(); + } + + return doc; + } + + private boolean isAdImage(String imageUrl) { + return SHelper.count(imageUrl, "ad") >= 2; + } + + /** + * Match only exact matching as longestSubstring can be too fuzzy + */ + public String removeTitleFromText(String text, String title) { + // don't do this as it's terrible to read + // int index1 = text.toLowerCase().indexOf(title.toLowerCase()); + // if (index1 >= 0) + // text = text.substring(index1 + title.length()); + // return text.trim(); + return text; + } + + /** + * 
@return a set of all important nodes + */ + public Collection getNodes(Document doc) { + Set nodes = new HashSet(64); + int score = 100; + for (Element el : doc.select("body").select("*")) { + if (NODES.matcher(el.tagName()).matches()) { + nodes.add(el); + setScore(el, score); + score = score / 2; + } + } + return nodes; + + } + + public String cleanTitle(String title) { + StringBuilder res = new StringBuilder(); + // int index = title.lastIndexOf("|"); + // if (index > 0 && title.length() / 2 < index) + // title = title.substring(0, index + 1); + + int counter = 0; + String[] strs = title.split("\\|"); + for (String part : strs) { + if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim())) + continue; + + if (counter == strs.length - 1 && res.length() > part.length()) + continue; + + if (counter > 0) + res.append("|"); + + res.append(part); + counter++; + } + + return SHelper.innerTrim(res.toString()); + } + + /** + * Comparator for Image by weight + * + * @author Chris Alexander, chris@chris-alexander.co.uk + * + */ + public class ImageComparator implements Comparator { + + @Override + public int compare(ImageResult o1, ImageResult o2) { + // Returns the highest weight first + return o2.weight.compareTo(o1.weight); + } + } +} \ No newline at end of file diff --git a/src/acr/browser/lightning/Reading/Converter.java b/src/acr/browser/lightning/Reading/Converter.java new file mode 100644 index 0000000..4ed3178 --- /dev/null +++ b/src/acr/browser/lightning/Reading/Converter.java @@ -0,0 +1,243 @@ +/* + * Copyright 2011 Peter Karich + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package acr.browser.lightning.Reading; + +import java.io.*; +import java.net.SocketTimeoutException; +import java.nio.charset.Charset; +import java.util.Locale; + +import acr.browser.lightning.Constants; +import android.util.Log; + +/** + * This class is not thread safe. Use one new instance every time due to + * encoding variable. + * + * @author Peter Karich + */ +public class Converter { + + public final static String UTF8 = "UTF-8"; + public final static String ISO = "ISO-8859-1"; + public final static int K2 = 2048; + private int maxBytes = 1000000 / 2; + private String encoding; + private String url; + + public Converter(String urlOnlyHint) { + url = urlOnlyHint; + } + + public Converter() { + } + + public Converter setMaxBytes(int maxBytes) { + this.maxBytes = maxBytes; + return this; + } + + public static String extractEncoding(String contentType) { + String[] values; + if (contentType != null) + values = contentType.split(";"); + else + values = new String[0]; + + String charset = ""; + + for (String value : values) { + value = value.trim().toLowerCase(Locale.getDefault()); + + if (value.startsWith("charset=")) + charset = value.substring("charset=".length()); + } + + // http1.1 says ISO-8859-1 is the default charset + if (charset.length() == 0) + charset = ISO; + + return charset; + } + + public String getEncoding() { + if (encoding == null) + return ""; + return encoding.toLowerCase(Locale.getDefault()); + } + + public String streamToString(InputStream is) { + return streamToString(is, maxBytes, encoding); + } + + public String 
streamToString(InputStream is, String enc) { + return streamToString(is, maxBytes, enc); + } + + /** + * reads bytes off the string and returns a string + * + * @param is + * @param maxBytes + * The max bytes that we want to read from the input stream + * @return String + */ + public String streamToString(InputStream is, int maxBytes, String enc) { + encoding = enc; + // Http 1.1. standard is iso-8859-1 not utf8 :( + // but we force utf-8 as youtube assumes it ;) + if (encoding == null || encoding.isEmpty()) + encoding = UTF8; + + BufferedInputStream in = null; + try { + in = new BufferedInputStream(is, K2); + ByteArrayOutputStream output = new ByteArrayOutputStream(); + + // detect encoding with the help of meta tag + try { + in.mark(K2 * 2); + String tmpEnc = detectCharset("charset=", output, in, encoding); + if (tmpEnc != null) + encoding = tmpEnc; + else { + Log.d(Constants.TAG, "no charset found in first stage"); + // detect with the help of xml beginning ala + // encoding="charset" + tmpEnc = detectCharset("encoding=", output, in, encoding); + if (tmpEnc != null) + encoding = tmpEnc; + else + Log.d(Constants.TAG, "no charset found in second stage"); + } + + if (!Charset.isSupported(encoding)) + throw new UnsupportedEncodingException(encoding); + } catch (UnsupportedEncodingException e) { + Log.d(Constants.TAG, + "Using default encoding:" + UTF8 + " problem:" + e.getMessage() + + " encoding:" + encoding + " " + url); + encoding = UTF8; + } + + // SocketException: Connection reset + // IOException: missing CR => problem on server (probably some xml + // character thing?) + // IOException: Premature EOF => socket unexpectly closed from + // server + int bytesRead = output.size(); + byte[] arr = new byte[K2]; + while (true) { + if (bytesRead >= maxBytes) { + Log.d(Constants.TAG, "Maxbyte of " + maxBytes + + " exceeded! Maybe html is now broken but try it nevertheless. 
Url: " + + url); + break; + } + + int n = in.read(arr); + if (n < 0) + break; + bytesRead += n; + output.write(arr, 0, n); + } + + return output.toString(encoding); + } catch (SocketTimeoutException e) { + Log.e(Constants.TAG, e.toString() + " url:" + url); + } catch (IOException e) { + Log.e(Constants.TAG, e.toString() + " url:" + url); + } finally { + if (in != null) { + try { + in.close(); + } catch (Exception e) { + } + } + } + return ""; + } + + /** + * This method detects the charset even if the first call only returns some + * bytes. It will read until 4K bytes are reached and then try to determine + * the encoding + * + * @throws IOException + */ + protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in, + String enc) throws IOException { + + // Grab better encoding from stream + byte[] arr = new byte[K2]; + int nSum = 0; + while (nSum < K2) { + int n = in.read(arr); + if (n < 0) + break; + + nSum += n; + bos.write(arr, 0, n); + } + + String str = bos.toString(enc); + int encIndex = str.indexOf(key); + int clength = key.length(); + if (encIndex > 0) { + char startChar = str.charAt(encIndex + clength); + int lastEncIndex; + if (startChar == '\'') + // if we have charset='something' + lastEncIndex = str.indexOf("'", ++encIndex + clength); + else if (startChar == '\"') + // if we have charset="something" + lastEncIndex = str.indexOf("\"", ++encIndex + clength); + else { + // if we have "text/html; charset=utf-8" + int first = str.indexOf("\"", encIndex + clength); + if (first < 0) + first = Integer.MAX_VALUE; + + // or "text/html; charset=utf-8 " + int sec = str.indexOf(" ", encIndex + clength); + if (sec < 0) + sec = Integer.MAX_VALUE; + lastEncIndex = Math.min(first, sec); + + // or "text/html; charset=utf-8 ' + int third = str.indexOf("'", encIndex + clength); + if (third > 0) + lastEncIndex = Math.min(lastEncIndex, third); + } + + // re-read byte array with different encoding + // assume that the encoding string cannot 
be greater than 40 chars + if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) { + String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength, + lastEncIndex)); + try { + in.reset(); + bos.reset(); + return tmpEnc; + } catch (IOException ex) { + Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding " + + tmpEnc + " " + ex.toString()); + } + } + } + return null; + } +} diff --git a/src/acr/browser/lightning/Reading/HtmlFetcher.java b/src/acr/browser/lightning/Reading/HtmlFetcher.java new file mode 100644 index 0000000..c597193 --- /dev/null +++ b/src/acr/browser/lightning/Reading/HtmlFetcher.java @@ -0,0 +1,445 @@ +/* + * Copyright 2011 Peter Karich + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package acr.browser.lightning.Reading; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.Proxy; +import java.net.URL; +import java.util.LinkedHashSet; +import java.util.Locale; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.zip.GZIPInputStream; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; + +import acr.browser.lightning.Constants; +import android.util.Log; + +/** + * Class to fetch articles. 
This class is thread safe. + * + * @author Peter Karich + */ +public class HtmlFetcher { + + static { + SHelper.enableCookieMgmt(); + SHelper.enableUserAgentOverwrite(); + SHelper.enableAnySSL(); + } + + public static void main(String[] args) throws Exception { + BufferedReader reader = new BufferedReader(new FileReader("urls.txt")); + String line = null; + Set existing = new LinkedHashSet(); + while ((line = reader.readLine()) != null) { + int index1 = line.indexOf("\""); + int index2 = line.indexOf("\"", index1 + 1); + String url = line.substring(index1 + 1, index2); + String domainStr = SHelper.extractDomain(url, true); + String counterStr = ""; + // TODO more similarities + if (existing.contains(domainStr)) + counterStr = "2"; + else + existing.add(domainStr); + + String html = new HtmlFetcher().fetchAsString(url, 20000); + String outFile = domainStr + counterStr + ".html"; + BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); + writer.write(html); + writer.close(); + } + reader.close(); + } + + private String referrer = "https://github.com/karussell/snacktory"; + private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ")"; + private String cacheControl = "max-age=0"; + private String language = "en-us"; + private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; + private String charset = "UTF-8"; + private SCache cache; + private AtomicInteger cacheCounter = new AtomicInteger(0); + private int maxTextLength = -1; + private ArticleTextExtractor extractor = new ArticleTextExtractor(); + private Set furtherResolveNecessary = new LinkedHashSet() { + { + add("bit.ly"); + add("cli.gs"); + add("deck.ly"); + add("fb.me"); + add("feedproxy.google.com"); + add("flic.kr"); + add("fur.ly"); + add("goo.gl"); + add("is.gd"); + add("ink.co"); + add("j.mp"); + add("lnkd.in"); + add("on.fb.me"); + add("ow.ly"); + add("plurl.us"); + add("sns.mx"); + add("snurl.com"); + 
add("su.pr"); + add("t.co"); + add("tcrn.ch"); + add("tl.gd"); + add("tiny.cc"); + add("tinyurl.com"); + add("tmi.me"); + add("tr.im"); + add("twurl.nl"); + } + }; + + public HtmlFetcher() { + } + + public void setExtractor(ArticleTextExtractor extractor) { + this.extractor = extractor; + } + + public ArticleTextExtractor getExtractor() { + return extractor; + } + + public HtmlFetcher setCache(SCache cache) { + this.cache = cache; + return this; + } + + public SCache getCache() { + return cache; + } + + public int getCacheCounter() { + return cacheCounter.get(); + } + + public HtmlFetcher clearCacheCounter() { + cacheCounter.set(0); + return this; + } + + public HtmlFetcher setMaxTextLength(int maxTextLength) { + this.maxTextLength = maxTextLength; + return this; + } + + public int getMaxTextLength() { + return maxTextLength; + } + + public void setAccept(String accept) { + this.accept = accept; + } + + public void setCharset(String charset) { + this.charset = charset; + } + + public void setCacheControl(String cacheControl) { + this.cacheControl = cacheControl; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getReferrer() { + return referrer; + } + + public HtmlFetcher setReferrer(String referrer) { + this.referrer = referrer; + return this; + } + + public String getUserAgent() { + return userAgent; + } + + public void setUserAgent(String userAgent) { + this.userAgent = userAgent; + } + + public String getAccept() { + return accept; + } + + public String getCacheControl() { + return cacheControl; + } + + public String getCharset() { + return charset; + } + + public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { + String originalUrl = url; + url = SHelper.removeHashbang(url); + String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); + if (gUrl != null) + url = gUrl; + else { + gUrl = 
SHelper.getUrlFromUglyFacebookRedirect(url); + if (gUrl != null) + url = gUrl; + } + + if (resolve) { + // check if we can avoid resolving the URL (which hits the website!) + JResult res = getFromCache(url, originalUrl); + if (res != null) + return res; + + String resUrl = getResolvedUrl(url, timeout); + if (resUrl.isEmpty()) { + Log.d(Constants.TAG, "resolved url is empty. Url is: " + url); + + JResult result = new JResult(); + if (cache != null) + cache.put(url, result); + return result.setUrl(url); + } + + // if resolved url is longer then use it! + if (resUrl != null && resUrl.trim().length() > url.length()) { + // this is necessary e.g. for some homebaken url resolvers which + // return + // the resolved url relative to url! + url = SHelper.useDomainOfFirstArg4Second(url, resUrl); + } + } + + // check if we have the (resolved) URL in cache + JResult res = getFromCache(url, originalUrl); + if (res != null) + return res; + + JResult result = new JResult(); + // or should we use? + result.setUrl(url); + result.setOriginalUrl(originalUrl); + result.setDate(SHelper.estimateDate(url)); + + // Immediately put the url into the cache as extracting content takes + // time. 
+ if (cache != null) { + cache.put(originalUrl, result); + cache.put(url, result); + } + + String lowerUrl = url.toLowerCase(Locale.getDefault()); + if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { + // skip + } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { + result.setVideoUrl(url); + } else if (SHelper.isImage(lowerUrl)) { + result.setImageUrl(url); + } else { + extractor.extractContent(result, fetchAsString(url, timeout)); + if (result.getFaviconUrl().isEmpty()) + result.setFaviconUrl(SHelper.getDefaultFavicon(url)); + + // some links are relative to root and do not include the domain of + // the url :( + result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); + result.setImageUrl(fixUrl(url, result.getImageUrl())); + result.setVideoUrl(fixUrl(url, result.getVideoUrl())); + result.setRssUrl(fixUrl(url, result.getRssUrl())); + } + result.setText(lessText(result.getText())); + synchronized (result) { + result.notifyAll(); + } + return result; + } + + public String lessText(String text) { + if (text == null) + return ""; + + if (maxTextLength >= 0 && text.length() > maxTextLength) + return text.substring(0, maxTextLength); + + return text; + } + + private static String fixUrl(String url, String urlOrPath) { + return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); + } + + public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException, + IOException { + return fetchAsString(urlAsString, timeout, true); + } + + public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) + throws MalformedURLException, IOException { + HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions); + hConn.setInstanceFollowRedirects(true); + String encoding = hConn.getContentEncoding(); + InputStream is; + if (encoding != null && encoding.equalsIgnoreCase("gzip")) { + is = new GZIPInputStream(hConn.getInputStream()); + } else if 
(encoding != null && encoding.equalsIgnoreCase("deflate")) { + is = new InflaterInputStream(hConn.getInputStream(), new Inflater(true)); + } else { + is = hConn.getInputStream(); + } + + String enc = Converter.extractEncoding(hConn.getContentType()); + String res = createConverter(urlAsString).streamToString(is, enc); + Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString); + return res; + } + + public Converter createConverter(String url) { + return new Converter(url); + } + + /** + * On some devices we have to hack: + * http://developers.sun.com/mobility/reference + * /techart/design_guidelines/http_redirection.html + * + * @param timeout + * Sets a specified timeout value, in milliseconds + * @return the resolved url if any. Or null if it couldn't resolve the url + * (within the specified time) or the same url if response code is + * OK + */ + public String getResolvedUrl(String urlAsString, int timeout) { + String newUrl = null; + int responseCode = -1; + try { + HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, true); + // force no follow + hConn.setInstanceFollowRedirects(false); + // the program doesn't care what the content actually is !! + // http://java.sun.com/developer/JDCTechTips/2003/tt0422.html + hConn.setRequestMethod("HEAD"); + hConn.connect(); + responseCode = hConn.getResponseCode(); + hConn.getInputStream().close(); + if (responseCode == HttpURLConnection.HTTP_OK) + return urlAsString; + + newUrl = hConn.getHeaderField("Location"); + if (responseCode / 100 == 3 && newUrl != null) { + newUrl = newUrl.replaceAll(" ", "+"); + // some services use (none-standard) utf8 in their location + // header + if (urlAsString.startsWith("http://bit.ly") + || urlAsString.startsWith("http://is.gd")) + newUrl = encodeUriFromHeader(newUrl); + + // fix problems if shortened twice. 
as it is often the case + // after twitters' t.co bullshit + if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) + newUrl = getResolvedUrl(newUrl, timeout); + + return newUrl; + } else + return urlAsString; + + } catch (Exception ex) { + Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage()); + return ""; + } finally { + Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl); + } + } + + /** + * Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding + * to non-ASCII characters. Workaround for broken origin servers that send + * UTF-8 in the Location: header. + */ + static String encodeUriFromHeader(String badLocation) { + StringBuilder sb = new StringBuilder(); + + for (char ch : badLocation.toCharArray()) { + if (ch < (char) 128) { + sb.append(ch); + } else { + // this is ONLY valid if the uri was decoded using ISO-8859-1 + sb.append(String.format("%%%02X", (int) ch)); + } + } + + return sb.toString(); + } + + protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout, + boolean includeSomeGooseOptions) throws MalformedURLException, IOException { + URL url = new URL(urlAsStr); + // using proxy may increase latency + HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY); + hConn.setRequestProperty("User-Agent", userAgent); + hConn.setRequestProperty("Accept", accept); + + if (includeSomeGooseOptions) { + hConn.setRequestProperty("Accept-Language", language); + hConn.setRequestProperty("content-charset", charset); + hConn.addRequestProperty("Referer", referrer); + // avoid the cache for testing purposes only? 
+ hConn.setRequestProperty("Cache-Control", cacheControl); + } + + // suggest respond to be gzipped or deflated (which is just another + // compression) + // http://stackoverflow.com/q/3932117 + hConn.setRequestProperty("Accept-Encoding", "gzip, deflate"); + hConn.setConnectTimeout(timeout); + hConn.setReadTimeout(timeout); + return hConn; + } + + private JResult getFromCache(String url, String originalUrl) throws Exception { + if (cache != null) { + JResult res = cache.get(url); + if (res != null) { + // e.g. the cache returned a shortened url as original url now + // we want to store the + // current original url! Also it can be that the cache response + // to url but the JResult + // does not contain it so overwrite it: + res.setUrl(url); + res.setOriginalUrl(originalUrl); + cacheCounter.addAndGet(1); + return res; + } + } + return null; + } +} diff --git a/src/acr/browser/lightning/Reading/ImageResult.java b/src/acr/browser/lightning/Reading/ImageResult.java new file mode 100644 index 0000000..2a8321e --- /dev/null +++ b/src/acr/browser/lightning/Reading/ImageResult.java @@ -0,0 +1,31 @@ +package acr.browser.lightning.Reading; + +import org.jsoup.nodes.Element; + +/** + * Class which encapsulates the data from an image found under an element + * + * @author Chris Alexander, chris@chris-alexander.co.uk + */ +public class ImageResult { + + public String src; + public Integer weight; + public String title; + public int height; + public int width; + public String alt; + public boolean noFollow; + public Element element; + + public ImageResult(String src, Integer weight, String title, int height, int width, String alt, + boolean noFollow) { + this.src = src; + this.weight = weight; + this.title = title; + this.height = height; + this.width = width; + this.alt = alt; + this.noFollow = noFollow; + } +} diff --git a/src/acr/browser/lightning/Reading/JResult.java b/src/acr/browser/lightning/Reading/JResult.java new file mode 100644 index 0000000..50ae5ea --- /dev/null 
+++ b/src/acr/browser/lightning/Reading/JResult.java @@ -0,0 +1,216 @@ +/* + * Copyright 2011 Peter Karich + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package acr.browser.lightning.Reading; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; + +/** + * Parsed result from web page containing important title, text and image. + * + * @author Peter Karich + */ +public class JResult implements Serializable { + + private String title; + private String url; + private String originalUrl; + private String canonicalUrl; + private String imageUrl; + private String videoUrl; + private String rssUrl; + private String text; + private String faviconUrl; + private String description; + private String dateString; + private List textList; + private Collection keywords; + private List images = null; + + public JResult() { + } + + public String getUrl() { + if (url == null) + return ""; + return url; + } + + public JResult setUrl(String url) { + this.url = url; + return this; + } + + public JResult setOriginalUrl(String originalUrl) { + this.originalUrl = originalUrl; + return this; + } + + public String getOriginalUrl() { + return originalUrl; + } + + public JResult setCanonicalUrl(String canonicalUrl) { + this.canonicalUrl = canonicalUrl; + return this; + } + + public String getCanonicalUrl() { + return canonicalUrl; + } + + public String getFaviconUrl() { + if (faviconUrl 
== null) + return ""; + return faviconUrl; + } + + public JResult setFaviconUrl(String faviconUrl) { + this.faviconUrl = faviconUrl; + return this; + } + + public JResult setRssUrl(String rssUrl) { + this.rssUrl = rssUrl; + return this; + } + + public String getRssUrl() { + if (rssUrl == null) + return ""; + return rssUrl; + } + + public String getDescription() { + if (description == null) + return ""; + return description; + } + + public JResult setDescription(String description) { + this.description = description; + return this; + } + + public String getImageUrl() { + if (imageUrl == null) + return ""; + return imageUrl; + } + + public JResult setImageUrl(String imageUrl) { + this.imageUrl = imageUrl; + return this; + } + + public String getText() { + if (text == null) + return ""; + + return text; + } + + public JResult setText(String text) { + this.text = text; + return this; + } + + public List getTextList() { + if (this.textList == null) + return new ArrayList(); + return this.textList; + } + + public JResult setTextList(List textList) { + this.textList = textList; + return this; + } + + public String getTitle() { + if (title == null) + return ""; + return title; + } + + public JResult setTitle(String title) { + this.title = title; + return this; + } + + public String getVideoUrl() { + if (videoUrl == null) + return ""; + return videoUrl; + } + + public JResult setVideoUrl(String videoUrl) { + this.videoUrl = videoUrl; + return this; + } + + public JResult setDate(String date) { + this.dateString = date; + return this; + } + + public Collection getKeywords() { + return keywords; + } + + public void setKeywords(Collection keywords) { + this.keywords = keywords; + } + + /** + * @return get date from url or guessed from text + */ + public String getDate() { + return dateString; + } + + /** + * @return images list + */ + public List getImages() { + if (images == null) + return Collections.emptyList(); + return images; + } + + /** + * @return images count + */ + 
public int getImagesCount() { + if (images == null) + return 0; + return images.size(); + } + + /** + * set images list + */ + public void setImages(List images) { + this.images = images; + } + + @Override + public String toString() { + return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text; + } +} diff --git a/src/acr/browser/lightning/Reading/MapEntry.java b/src/acr/browser/lightning/Reading/MapEntry.java new file mode 100644 index 0000000..31e7c36 --- /dev/null +++ b/src/acr/browser/lightning/Reading/MapEntry.java @@ -0,0 +1,80 @@ +/** + * Copyright (C) 2010 Peter Karich <> + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package acr.browser.lightning.Reading; + +import java.io.Serializable; +import java.util.Map; + +/** + * Simple impl of Map.Entry. So that we can have ordered maps. 
+ * + * @author Peter Karich, peat_hal ‘at’ users ‘dot’ sourceforge ‘dot’ + * net + */ +public class MapEntry implements Map.Entry, Serializable { + + private static final long serialVersionUID = 1L; + private K key; + private V value; + + public MapEntry(K key, V value) { + this.key = key; + this.value = value; + } + + @Override + public K getKey() { + return key; + } + + @Override + public V getValue() { + return value; + } + + @Override + public V setValue(V value) { + this.value = value; + return value; + } + + @Override + public String toString() { + return getKey() + ", " + getValue(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final MapEntry other = (MapEntry) obj; + if (this.key != other.key && (this.key == null || !this.key.equals(other.key))) + return false; + if (this.value != other.value && (this.value == null || !this.value.equals(other.value))) + return false; + return true; + } + + @Override + public int hashCode() { + int hash = 7; + hash = 19 * hash + (this.key != null ? this.key.hashCode() : 0); + hash = 19 * hash + (this.value != null ? 
this.value.hashCode() : 0); + return hash; + } +} diff --git a/src/acr/browser/lightning/Reading/OutputFormatter.java b/src/acr/browser/lightning/Reading/OutputFormatter.java new file mode 100644 index 0000000..c456b1a --- /dev/null +++ b/src/acr/browser/lightning/Reading/OutputFormatter.java @@ -0,0 +1,174 @@ +package acr.browser.lightning.Reading; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.regex.Pattern; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; + +/** + * @author goose | jim + * @author karussell + * + * this class will be responsible for taking our top node and stripping + * out junk we don't want and getting it ready for how we want it + * presented to the user + */ +public class OutputFormatter { + + public static final int MIN_PARAGRAPH_TEXT = 50; + private static final List NODES_TO_REPLACE = Arrays.asList("strong", "b", "i"); + private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden"); + protected final int minParagraphText; + protected final List nodesToReplace; + protected String nodesToKeepCssSelector = "p"; + + public OutputFormatter() { + this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE); + } + + public OutputFormatter(int minParagraphText) { + this(minParagraphText, NODES_TO_REPLACE); + } + + public OutputFormatter(int minParagraphText, List nodesToReplace) { + this.minParagraphText = minParagraphText; + this.nodesToReplace = nodesToReplace; + } + + /** + * set elements to keep in output text + */ + public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) { + this.nodesToKeepCssSelector = nodesToKeepCssSelector; + } + + /** + * takes an element and turns the P tags into \n\n + */ + public String getFormattedText(Element topNode) { + removeNodesWithNegativeScores(topNode); + StringBuilder sb = new StringBuilder(); + 
append(topNode, sb, nodesToKeepCssSelector); + String str = SHelper.innerTrim(sb.toString()); + if (str.length() > 100) + return str; + + // no subelements + if (str.isEmpty() || !topNode.text().isEmpty() + && str.length() <= topNode.ownText().length()) + str = topNode.text(); + + // if jsoup failed to parse the whole html now parse this smaller + // snippet again to avoid html tags disturbing our text: + return Jsoup.parse(str).text(); + } + + /** + * Takes an element and returns a list of texts extracted from the P tags + */ + public List getTextList(Element topNode) { + List texts = new ArrayList(); + for (Element element : topNode.select(this.nodesToKeepCssSelector)) { + if (element.hasText()) { + texts.add(element.text()); + } + } + return texts; + } + + /** + * If there are elements inside our top node that have a negative gravity + * score remove them + */ + protected void removeNodesWithNegativeScores(Element topNode) { + Elements gravityItems = topNode.select("*[gravityScore]"); + for (Element item : gravityItems) { + int score = Integer.parseInt(item.attr("gravityScore")); + if (score < 0 || item.text().length() < minParagraphText) + item.remove(); + } + } + + protected void append(Element node, StringBuilder sb, String tagName) { + // is select more costly then getElementsByTag? 
+ MAIN: for (Element e : node.select(tagName)) { + Element tmpEl = e; + // check all elements until 'node' + while (tmpEl != null && !tmpEl.equals(node)) { + if (unlikely(tmpEl)) + continue MAIN; + tmpEl = tmpEl.parent(); + } + + String text = node2Text(e); + if (text.isEmpty() || text.length() < minParagraphText + || text.length() > SHelper.countLetters(text) * 2) + continue; + + sb.append(text); + sb.append("\n\n"); + } + } + + boolean unlikely(Node e) { + if (e.attr("class") != null && e.attr("class").toLowerCase(Locale.getDefault()).contains("caption")) + return true; + + String style = e.attr("style"); + String clazz = e.attr("class"); + if (unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find()) + return true; + return false; + } + + void appendTextSkipHidden(Element e, StringBuilder accum) { + for (Node child : e.childNodes()) { + if (unlikely(child)) + continue; + if (child instanceof TextNode) { + TextNode textNode = (TextNode) child; + String txt = textNode.text(); + accum.append(txt); + } else if (child instanceof Element) { + Element element = (Element) child; + if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum)) + accum.append(" "); + else if (element.tagName().equals("br")) + accum.append(" "); + appendTextSkipHidden(element, accum); + } + } + } + + boolean lastCharIsWhitespace(StringBuilder accum) { + if (accum.length() == 0) + return false; + return Character.isWhitespace(accum.charAt(accum.length() - 1)); + } + + protected String node2TextOld(Element el) { + return el.text(); + } + + protected String node2Text(Element el) { + StringBuilder sb = new StringBuilder(200); + appendTextSkipHidden(el, sb); + return sb.toString(); + } + + public OutputFormatter setUnlikelyPattern(String unlikelyPattern) { + this.unlikelyPattern = Pattern.compile(unlikelyPattern); + return this; + } + + public OutputFormatter appendUnlikelyPattern(String str) { + return setUnlikelyPattern(unlikelyPattern.toString() + "|" + 
str); + } +} diff --git a/src/acr/browser/lightning/Reading/SCache.java b/src/acr/browser/lightning/Reading/SCache.java new file mode 100644 index 0000000..ace929c --- /dev/null +++ b/src/acr/browser/lightning/Reading/SCache.java @@ -0,0 +1,29 @@ +/* + * Copyright 2011 Peter Karich + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package acr.browser.lightning.Reading; + +/** + * + * @author Peter Karich + */ +public interface SCache { + + JResult get(String url); + + void put(String url, JResult res); + + int getSize(); +} diff --git a/src/acr/browser/lightning/Reading/SHelper.java b/src/acr/browser/lightning/Reading/SHelper.java new file mode 100644 index 0000000..ab58c5f --- /dev/null +++ b/src/acr/browser/lightning/Reading/SHelper.java @@ -0,0 +1,480 @@ +/* + * Copyright 2011 Peter Karich + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package acr.browser.lightning.Reading; + +import java.io.UnsupportedEncodingException; +import java.net.CookieHandler; +import java.net.CookieManager; +import java.net.CookiePolicy; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.security.SecureRandom; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.text.SimpleDateFormat; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.net.ssl.KeyManager; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import org.jsoup.nodes.Element; + +/** + * + * @author Peter Karich + */ +public class SHelper { + + public static final String UTF8 = "UTF-8"; + private static final Pattern SPACE = Pattern.compile(" "); + + public static String replaceSpaces(String url) { + if (!url.isEmpty()) { + url = url.trim(); + if (url.contains(" ")) { + Matcher spaces = SPACE.matcher(url); + url = spaces.replaceAll("%20"); + } + } + return url; + } + + public static int count(String str, String substring) { + int c = 0; + int index1 = str.indexOf(substring); + if (index1 >= 0) { + c++; + c += count(str.substring(index1 + substring.length()), substring); + } + return c; + } + + /** + * remove more than two spaces or newlines + */ + public static String innerTrim(String str) { + if (str.isEmpty()) + return ""; + + StringBuilder sb = new StringBuilder(); + boolean previousSpace = false; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (c == ' ' || (int) c == 9 || c == '\n') { + previousSpace = true; + continue; + } + + if (previousSpace) + sb.append(' '); + + previousSpace = false; + sb.append(c); + } + return sb.toString().trim(); + } + + /** + * Starts reading the encoding from the first valid character until an + * invalid encoding character occurs. 
+ */ + public static String encodingCleanup(String str) { + StringBuilder sb = new StringBuilder(); + boolean startedWithCorrectString = false; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (Character.isDigit(c) || Character.isLetter(c) || c == '-' || c == '_') { + startedWithCorrectString = true; + sb.append(c); + continue; + } + + if (startedWithCorrectString) + break; + } + return sb.toString().trim(); + } + + /** + * @return the longest substring as str1.substring(result[0], result[1]); + */ + public static String getLongestSubstring(String str1, String str2) { + int res[] = longestSubstring(str1, str2); + if (res == null || res[0] >= res[1]) + return ""; + + return str1.substring(res[0], res[1]); + } + + public static int[] longestSubstring(String str1, String str2) { + if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty()) + return null; + + // dynamic programming => save already identical length into array + // to understand this algo simply print identical length in every entry + // of the array + // i+1, j+1 then reuses information from i,j + // java initializes them already with 0 + int[][] num = new int[str1.length()][str2.length()]; + int maxlen = 0; + int lastSubstrBegin = 0; + int endIndex = 0; + for (int i = 0; i < str1.length(); i++) { + for (int j = 0; j < str2.length(); j++) { + if (str1.charAt(i) == str2.charAt(j)) { + if ((i == 0) || (j == 0)) + num[i][j] = 1; + else + num[i][j] = 1 + num[i - 1][j - 1]; + + if (num[i][j] > maxlen) { + maxlen = num[i][j]; + // generate substring from str1 => i + lastSubstrBegin = i - num[i][j] + 1; + endIndex = i + 1; + } + } + } + } + return new int[] { lastSubstrBegin, endIndex }; + } + + public static String getDefaultFavicon(String url) { + return useDomainOfFirstArg4Second(url, "/favicon.ico"); + } + + /** + * @param urlForDomain + * extract the domain from this url + * @param path + * this url does not have a domain + * @return + */ + public static String 
useDomainOfFirstArg4Second(String urlForDomain, String path) { + if (path.startsWith("http")) + return path; + + if ("favicon.ico".equals(path)) + path = "/favicon.ico"; + + if (path.startsWith("//")) { + // wikipedia special case, see tests + if (urlForDomain.startsWith("https:")) + return "https:" + path; + + return "http:" + path; + } else if (path.startsWith("/")) + return "http://" + extractHost(urlForDomain) + path; + else if (path.startsWith("../")) { + int slashIndex = urlForDomain.lastIndexOf("/"); + if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length()) + urlForDomain = urlForDomain.substring(0, slashIndex + 1); + + return urlForDomain + path; + } + return path; + } + + public static String extractHost(String url) { + return extractDomain(url, false); + } + + public static String extractDomain(String url, boolean aggressive) { + if (url.startsWith("http://")) + url = url.substring("http://".length()); + else if (url.startsWith("https://")) + url = url.substring("https://".length()); + + if (aggressive) { + if (url.startsWith("www.")) + url = url.substring("www.".length()); + + // strip mobile from start + if (url.startsWith("m.")) + url = url.substring("m.".length()); + } + + int slashIndex = url.indexOf("/"); + if (slashIndex > 0) + url = url.substring(0, slashIndex); + + return url; + } + + public static boolean isVideoLink(String url) { + url = extractDomain(url, true); + return url.startsWith("youtube.com") || url.startsWith("video.yahoo.com") + || url.startsWith("vimeo.com") || url.startsWith("blip.tv"); + } + + public static boolean isVideo(String url) { + return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") + || url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4") + || url.endsWith(".flv") || url.endsWith(".wmv"); + } + + public static boolean isAudio(String url) { + return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") + || url.endsWith(".wav"); + } + + public static boolean 
isDoc(String url) { + return url.endsWith(".pdf") || url.endsWith(".ppt") || url.endsWith(".doc") + || url.endsWith(".swf") || url.endsWith(".rtf") || url.endsWith(".xls"); + } + + public static boolean isPackage(String url) { + return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip") + || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") + || url.endsWith(".7z"); + } + + public static boolean isApp(String url) { + return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") + || url.endsWith(".dmg"); + } + + public static boolean isImage(String url) { + return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif") + || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") + || url.endsWith(".eps"); + } + + /** + * @see http + * ://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se + */ + public static void enableCookieMgmt() { + CookieManager manager = new CookieManager(); + manager.setCookiePolicy(CookiePolicy.ACCEPT_ALL); + CookieHandler.setDefault(manager); + } + + /** + * @see http + * ://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java + * -urlconnection + */ + public static void enableUserAgentOverwrite() { + System.setProperty("http.agent", ""); + } + + public static String getUrlFromUglyGoogleRedirect(String url) { + if (url.startsWith("http://www.google.com/url?")) { + url = url.substring("http://www.google.com/url?".length()); + String arr[] = urlDecode(url).split("\\&"); + if (arr != null) + for (String str : arr) { + if (str.startsWith("q=")) + return str.substring("q=".length()); + } + } + + return null; + } + + public static String getUrlFromUglyFacebookRedirect(String url) { + if (url.startsWith("http://www.facebook.com/l.php?u=")) { + url = url.substring("http://www.facebook.com/l.php?u=".length()); + return urlDecode(url); + } + + return null; + } + + public static String urlEncode(String str) { + try { + return URLEncoder.encode(str, 
UTF8); + } catch (UnsupportedEncodingException ex) { + return str; + } + } + + public static String urlDecode(String str) { + try { + return URLDecoder.decode(str, UTF8); + } catch (UnsupportedEncodingException ex) { + return str; + } + } + + /** + * Popular sites uses the #! to indicate the importance of the following + * chars. Ugly but true. Such as: facebook, twitter, gizmodo, ... + */ + public static String removeHashbang(String url) { + return url.replaceFirst("#!", ""); + } + + public static String printNode(Element root) { + return printNode(root, 0); + } + + public static String printNode(Element root, int indentation) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < indentation; i++) { + sb.append(' '); + } + sb.append(root.tagName()); + sb.append(":"); + sb.append(root.ownText()); + sb.append("\n"); + for (Element el : root.children()) { + sb.append(printNode(el, indentation + 1)); + sb.append("\n"); + } + return sb.toString(); + } + + public static String estimateDate(String url) { + int index = url.indexOf("://"); + if (index > 0) + url = url.substring(index + 3); + + int year = -1; + int yearCounter = -1; + int month = -1; + int monthCounter = -1; + int day = -1; + String strs[] = url.split("/"); + for (int counter = 0; counter < strs.length; counter++) { + String str = strs[counter]; + if (str.length() == 4) { + try { + year = Integer.parseInt(str); + } catch (Exception ex) { + continue; + } + if (year < 1970 || year > 3000) { + year = -1; + continue; + } + yearCounter = counter; + } else if (str.length() == 2) { + if (monthCounter < 0 && counter == yearCounter + 1) { + try { + month = Integer.parseInt(str); + } catch (Exception ex) { + continue; + } + if (month < 1 || month > 12) { + month = -1; + continue; + } + monthCounter = counter; + } else if (counter == monthCounter + 1) { + try { + day = Integer.parseInt(str); + } catch (Exception ex) { + } + if (day < 1 || day > 31) { + day = -1; + continue; + } + break; + } + } + } + + if 
(year < 0) + return null; + + StringBuilder str = new StringBuilder(); + str.append(year); + if (month < 1) + return str.toString(); + + str.append('/'); + if (month < 10) + str.append('0'); + str.append(month); + if (day < 1) + return str.toString(); + + str.append('/'); + if (day < 10) + str.append('0'); + str.append(day); + return str.toString(); + } + + public static String completeDate(String dateStr) { + if (dateStr == null) + return null; + + int index = dateStr.indexOf('/'); + if (index > 0) { + index = dateStr.indexOf('/', index + 1); + if (index > 0) + return dateStr; + else + return dateStr + "/01"; + } + return dateStr + "/01/01"; + } + + /** + * keep in mind: simpleDateFormatter is not thread safe! call completeDate + * before applying this formatter. + */ + public static SimpleDateFormat createDateFormatter() { + return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault()); + } + + // with the help of + // http://stackoverflow.com/questions/1828775/httpclient-and-ssl + public static void enableAnySSL() { + try { + SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() }, + new SecureRandom()); + SSLContext.setDefault(ctx); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + private static class DefaultTrustManager implements X509TrustManager { + + @Override + public void checkClientTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { + } + + @Override + public void checkServerTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + } + + public static int countLetters(String str) { + int len = str.length(); + int chars = 0; + for (int i = 0; i < len; i++) { + if (Character.isLetter(str.charAt(i))) + chars++; + } + return chars; + } +} diff --git a/src/acr/browser/lightning/ReadingActivity.java 
b/src/acr/browser/lightning/ReadingActivity.java
new file mode 100644
index 0000000..ada7513
--- /dev/null
+++ b/src/acr/browser/lightning/ReadingActivity.java
@@ -0,0 +1,153 @@
package acr.browser.lightning;

import java.util.ArrayList;
import java.util.List;

import acr.browser.lightning.Reading.HtmlFetcher;
import acr.browser.lightning.Reading.JResult;
import android.animation.ObjectAnimator;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.support.v7.app.ActionBarActivity;
import android.support.v7.widget.Toolbar;
import android.view.MenuItem;
import android.view.View;
import android.widget.TextView;

/**
 * Shows a page in "reader mode": the url passed in the launching intent under
 * {@link Constants#LOAD_READING_URL} is fetched in the background and its
 * extracted title and text paragraphs are displayed.
 */
public class ReadingActivity extends ActionBarActivity {

    /** Duration of the fade-in used when a text view first becomes visible. */
    private static final long FADE_DURATION_MS = 300;
    /** Timeout handed to the fetcher for loading the page. */
    private static final int FETCH_TIMEOUT_MS = 5000;

    private TextView mTitle;
    private TextView mBody;
    /** The running load, kept so it can be cancelled in onDestroy(). */
    private PageLoader mLoader;

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.reading_view);

        Toolbar toolbar = (Toolbar) findViewById(R.id.toolbar);
        setSupportActionBar(toolbar);

        getSupportActionBar().setDisplayHomeAsUpEnabled(true);

        mTitle = (TextView) findViewById(R.id.textViewTitle);
        mBody = (TextView) findViewById(R.id.textViewBody);

        mTitle.setText(getString(R.string.untitled));
        mBody.setText(getString(R.string.loading));

        // both views stay hidden until setText() fades them in
        mTitle.setVisibility(View.INVISIBLE);
        mBody.setVisibility(View.INVISIBLE);

        Intent intent = getIntent();
        if (!loadPage(intent)) {
            setText(getString(R.string.untitled), getString(R.string.loading_failed));
        }
    }

    @Override
    protected void onDestroy() {
        // Stop the background load and remove its dialog so that neither
        // outlives the activity window (avoids a leaked window / crash on
        // dismiss after the activity is gone).
        if (mLoader != null) {
            mLoader.cancel(true);
            mLoader.dismissDialog();
            mLoader = null;
        }
        super.onDestroy();
    }

    /**
     * Starts loading the page named by the intent.
     *
     * @return false if the intent is null or carries no url, true once a
     *         background load has been started
     */
    protected boolean loadPage(Intent intent) {
        if (intent == null) {
            return false;
        }
        String url = intent.getStringExtra(Constants.LOAD_READING_URL);
        if (url == null) {
            return false;
        }
        getSupportActionBar().setTitle(Utils.getDomainName(url));
        mLoader = new PageLoader(this);
        mLoader.execute(url);
        return true;
    }

    /**
     * Fetches the url given to execute() off the UI thread while showing a
     * non-cancelable progress dialog, then hands the result to setText().
     */
    private class PageLoader extends AsyncTask<String, Void, Void> {

        private final Context mContext;
        private ProgressDialog mProgressDialog;
        private String mTitleText;
        private List<String> mBodyText;

        public PageLoader(Context context) {
            mContext = context;
        }

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            mProgressDialog = new ProgressDialog(mContext);
            mProgressDialog.setProgressStyle(ProgressDialog.STYLE_SPINNER);
            mProgressDialog.setCancelable(false);
            mProgressDialog.setIndeterminate(true);
            mProgressDialog.setMessage(mContext.getString(R.string.loading));
            mProgressDialog.show();
        }

        @Override
        protected Void doInBackground(String... params) {
            HtmlFetcher fetcher = new HtmlFetcher();
            try {
                JResult result = fetcher.fetchAndExtract(params[0], FETCH_TIMEOUT_MS, true);
                mTitleText = result.getTitle();
                mBodyText = result.getTextList();
            } catch (Exception e) {
                clearResult();
                e.printStackTrace();
            } catch (OutOfMemoryError e) {
                // very large pages can exhaust the heap; treat as a failed load
                System.gc();
                clearResult();
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            dismissDialog();
            if (isFinishing()) {
                // the activity is going away; do not touch its views
                return;
            }
            boolean noTitle = mTitleText == null || mTitleText.isEmpty();
            boolean noBody = mBodyText == null || mBodyText.isEmpty();
            if (noTitle || noBody) {
                setText(getString(R.string.untitled), getString(R.string.loading_failed));
            } else {
                StringBuilder builder = new StringBuilder();
                for (String text : mBodyText) {
                    builder.append(text).append("\n\n");
                }
                setText(mTitleText, builder.toString());
            }
            super.onPostExecute(result);
        }

        /** Marks the load as failed by resetting title and body to empty. */
        private void clearResult() {
            mTitleText = "";
            mBodyText = new ArrayList<>();
        }

        /** Dismisses the progress dialog if it is currently showing. */
        void dismissDialog() {
            if (mProgressDialog != null && mProgressDialog.isShowing()) {
                mProgressDialog.dismiss();
            }
        }

    }

    /** Displays the given title and body, fading in views that were hidden. */
    private void setText(String title, String body) {
        showText(mTitle, title);
        showText(mBody, body);
    }

    /**
     * Sets the text of {@code view}; if the view is still INVISIBLE it is made
     * visible with an alpha fade-in.
     */
    private void showText(TextView view, String text) {
        if (view.getVisibility() != View.INVISIBLE) {
            view.setText(text);
            return;
        }
        view.setAlpha(0.0f);
        view.setVisibility(View.VISIBLE);
        view.setText(text);
        ObjectAnimator animator = ObjectAnimator.ofFloat(view, "alpha", 1.0f);
        animator.setDuration(FADE_DURATION_MS);
        animator.start();
    }

    @Override
    public boolean onOptionsItemSelected(MenuItem item) {
        // only the toolbar's Up button closes the reader
        if (item.getItemId() == android.R.id.home) {
            finish();
            return true;
        }
        return super.onOptionsItemSelected(item);
    }

}