Browse Source

Added a Reading Mode that can be accessed from the menu

Reading Mode utilizes the Snacktory library created by karussell, which is
licensed under the Apache 2.0 license.
https://github.com/karussell/snacktory
master
Anthony Restaino 10 years ago
parent
commit
10668a019b
  1. 11
      AndroidManifest.xml
  2. BIN
      libs/jsoup-1.8.1.jar
  3. 34
      res/layout/license_activity.xml
  4. 39
      res/layout/reading_view.xml
  5. 148
      res/menu-xlarge/main.xml
  6. 1
      res/menu/main.xml
  7. 4
      res/values/strings.xml
  8. 26
      src/acr/browser/lightning/BrowserActivity.java
  9. 2
      src/acr/browser/lightning/Constants.java
  10. 4
      src/acr/browser/lightning/LicenseActivity.java
  11. 619
      src/acr/browser/lightning/Reading/ArticleTextExtractor.java
  12. 243
      src/acr/browser/lightning/Reading/Converter.java
  13. 445
      src/acr/browser/lightning/Reading/HtmlFetcher.java
  14. 31
      src/acr/browser/lightning/Reading/ImageResult.java
  15. 216
      src/acr/browser/lightning/Reading/JResult.java
  16. 80
      src/acr/browser/lightning/Reading/MapEntry.java
  17. 174
      src/acr/browser/lightning/Reading/OutputFormatter.java
  18. 29
      src/acr/browser/lightning/Reading/SCache.java
  19. 480
      src/acr/browser/lightning/Reading/SHelper.java
  20. 153
      src/acr/browser/lightning/ReadingActivity.java

11
AndroidManifest.xml

@ -200,6 +200,17 @@ @@ -200,6 +200,17 @@
<intent-filter>
<action android:name="android.intent.action.BOOKMARK" />
<category android:name="android.intent.category.DEFAULT" />
</intent-filter>
</activity>
<activity
android:name="acr.browser.lightning.ReadingActivity"
android:configChanges="orientation|screenSize|keyboardHidden|keyboard"
android:label="@string/reading_mode"
android:theme="@style/Theme.SettingsTheme" >
<intent-filter>
<action android:name="android.intent.action.READING" />
<category android:name="android.intent.category.DEFAULT" />
</intent-filter>
</activity>

BIN
libs/jsoup-1.8.1.jar

Binary file not shown.

34
res/layout/license_activity.xml

@ -140,5 +140,39 @@ @@ -140,5 +140,39 @@
android:layout_marginLeft="10dp"
android:layout_marginRight="10dp"
android:background="#cdcdcd" />
<LinearLayout
android:id="@+id/licenseSnactory"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:background="?attr/listChoiceBackgroundIndicator"
android:orientation="vertical"
android:paddingBottom="10dp"
android:paddingTop="10dp" >
<TextView
android:id="@+id/textView5"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:paddingLeft="16dp"
android:text="@string/snacktory"
android:textAppearance="?android:attr/textAppearanceMedium" />
<TextView
android:id="@+id/textView6"
android:layout_width="wrap_content"
android:layout_height="wrap_content"
android:paddingLeft="16dp"
android:text="@string/apache"
android:textAppearance="?android:attr/textAppearanceSmall"
android:textColor="@color/light" />
</LinearLayout>
<LinearLayout
android:layout_width="match_parent"
android:layout_height="1dp"
android:layout_marginLeft="10dp"
android:layout_marginRight="10dp"
android:background="#cdcdcd" />
</LinearLayout>

39
res/layout/reading_view.xml

@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
<?xml version="1.0" encoding="utf-8"?>
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:orientation="vertical" >
<include layout="@layout/toolbar_settings" />
<ScrollView
android:layout_width="match_parent"
android:layout_height="match_parent" >
<LinearLayout
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:padding="20dp"
android:orientation="vertical">
<TextView
android:id="@+id/textViewTitle"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:layout_marginBottom="20dp"
android:gravity="center_horizontal|center_vertical"
android:text="Large Text"
android:textAppearance="?android:attr/textAppearanceLarge" />
<TextView
android:id="@+id/textViewBody"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:gravity="start"
android:text="Medium Text"
android:textAppearance="?android:attr/textAppearanceMedium" />
</LinearLayout>
</ScrollView>
</LinearLayout>

148
res/menu-xlarge/main.xml

@ -1,73 +1,77 @@ @@ -1,73 +1,77 @@
<!--
Copyright 2014 A.C.R. Development
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<menu xmlns:android="http://schemas.android.com/apk/res/android" >
<item
android:id="@+id/action_back"
android:checkable="false"
android:enabled="true"
android:icon="?arrowBackDrawable"
android:showAsAction="always"
android:title="@string/action_back"
android:visible="true">
</item>
<item
android:id="@+id/action_forward"
android:checkable="false"
android:enabled="true"
android:icon="?arrowForwardDrawable"
android:showAsAction="always"
android:title="@string/action_forward"
android:visible="true">
</item>
<item
android:id="@+id/action_new_tab"
android:title="@string/action_new_tab">
</item>
<item
android:id="@+id/action_incognito"
android:title="@string/action_incognito">
</item>
<item
android:id="@+id/action_share"
android:title="@string/action_share"/>
<item
android:id="@+id/action_history"
android:title="@string/action_history">
</item>
<item
android:id="@+id/action_find"
android:title="@string/action_find">
</item>
<item
android:id="@+id/action_copy"
android:title="@string/action_copy">
</item>
<item
android:id="@+id/action_bookmarks"
android:title="@string/action_bookmarks">
</item>
<item
android:id="@+id/action_add_bookmark"
android:title="@string/action_add_bookmark">
</item>
<item
android:id="@+id/action_settings"
android:title="@string/settings">
</item>
<!--
Copyright 2014 A.C.R. Development
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<menu xmlns:android="http://schemas.android.com/apk/res/android" >
<item
android:id="@+id/action_back"
android:checkable="false"
android:enabled="true"
android:icon="?arrowBackDrawable"
android:showAsAction="always"
android:title="@string/action_back"
android:visible="true">
</item>
<item
android:id="@+id/action_forward"
android:checkable="false"
android:enabled="true"
android:icon="?arrowForwardDrawable"
android:showAsAction="always"
android:title="@string/action_forward"
android:visible="true">
</item>
<item
android:id="@+id/action_new_tab"
android:title="@string/action_new_tab">
</item>
<item
android:id="@+id/action_incognito"
android:title="@string/action_incognito">
</item>
<item
android:id="@+id/action_share"
android:title="@string/action_share"/>
<item
android:id="@+id/action_history"
android:title="@string/action_history">
</item>
<item
android:id="@+id/action_find"
android:title="@string/action_find">
</item>
<item
android:id="@+id/action_copy"
android:title="@string/action_copy">
</item>
<item
android:id="@+id/action_bookmarks"
android:title="@string/action_bookmarks">
</item>
<item
android:id="@+id/action_add_bookmark"
android:title="@string/action_add_bookmark">
</item>
<item
android:id="@+id/action_reading_mode"
android:title="@string/reading_mode">
</item>
<item
android:id="@+id/action_settings"
android:title="@string/settings">
</item>
</menu>

1
res/menu/main.xml

@ -23,6 +23,7 @@ @@ -23,6 +23,7 @@
<item android:id="@+id/action_copy" android:title="@string/action_copy" ></item>
<item android:id="@+id/action_bookmarks" android:title="@string/action_bookmarks" ></item>
<item android:id="@+id/action_add_bookmark" android:title="@string/action_add_bookmark" ></item>
<item android:id="@+id/action_reading_mode" android:title="@string/reading_mode" ></item>
<item android:id="@+id/action_settings" android:title="@string/settings" ></item>
</menu>

4
res/values/strings.xml

@ -198,4 +198,8 @@ @@ -198,4 +198,8 @@
<string name="third_party">Block 3rd Party Cookies</string>
<string name="available_lollipop">This feature is only available on Android 5.0+</string>
<string name="color_mode">Enable Color Mode</string>
<string name="reading_mode">Reader Mode</string>
<string name="loading">Loading&#8230;</string>
<string name="loading_failed">Couldn\'t load anything from the page.</string>
<string name="snacktory">Snacktory</string>
</resources>

26
src/acr/browser/lightning/BrowserActivity.java

@ -179,7 +179,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -179,7 +179,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
mDrawerListRight.setDividerHeight(0);
setNavigationDrawerWidth();
mDrawerLayout.setDrawerListener(new DrawerLocker());
mWebpageBitmap = BitmapFactory.decodeResource(getResources(), R.drawable.ic_webpage);
mActionBar = getSupportActionBar();
final TypedArray styledAttributes = mContext.getTheme().obtainStyledAttributes(
@ -350,7 +350,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -350,7 +350,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
};
anim.setDuration(300);
anim.setInterpolator(new DecelerateInterpolator());
anim.setAnimationListener(new AnimationListener(){
anim.setAnimationListener(new AnimationListener() {
@Override
public void onAnimationStart(Animation animation) {
@ -368,7 +368,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -368,7 +368,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onAnimationRepeat(Animation animation) {
}
});
new Handler().postDelayed(new Runnable() {
@ -488,12 +488,12 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -488,12 +488,12 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
checkForTor();
}
private class DrawerLocker implements DrawerListener {
@Override
public void onDrawerClosed(View v) {
if(v == mDrawerRight){
if (v == mDrawerRight) {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerLeft);
} else {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerRight);
@ -502,7 +502,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -502,7 +502,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onDrawerOpened(View v) {
if(v == mDrawerRight){
if (v == mDrawerRight) {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerLeft);
} else {
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerRight);
@ -516,7 +516,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -516,7 +516,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
@Override
public void onDrawerStateChanged(int arg) {
}
}
public boolean handleMenuItemClick(MenuItem item) {
@ -596,6 +596,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -596,6 +596,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
case R.id.action_find:
findInPage();
return true;
case R.id.action_reading_mode:
Intent read = new Intent(this, ReadingActivity.class);
read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
startActivity(read);
return true;
default:
return super.onOptionsItemSelected(item);
}
@ -912,6 +917,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -912,6 +917,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
case R.id.action_find:
findInPage();
return true;
case R.id.action_reading_mode:
Intent read = new Intent(this, ReadingActivity.class);
read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
startActivity(read);
return true;
default:
return super.onOptionsItemSelected(item);
}
@ -1622,7 +1632,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl @@ -1622,7 +1632,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
}
});
ViewCompat.jumpDrawablesToCurrentState(holder.exit);
LightningView web = data.get(position);

2
src/acr/browser/lightning/Constants.java

@ -29,6 +29,8 @@ public final class Constants { @@ -29,6 +29,8 @@ public final class Constants {
public static final String JAVASCRIPT_INVERT_PAGE = "javascript:(function(){var e='img {-webkit-filter: invert(100%);'+'-moz-filter: invert(100%);'+'-o-filter: invert(100%);'+'-ms-filter: invert(100%); }',t=document.getElementsByTagName('head')[0],n=document.createElement('style');if(!window.counter){window.counter=1}else{window.counter++;if(window.counter%2==0){var e='html {-webkit-filter: invert(0%); -moz-filter: invert(0%); -o-filter: invert(0%); -ms-filter: invert(0%); }'}}n.type='text/css';if(n.styleSheet){n.styleSheet.cssText=e}else{n.appendChild(document.createTextNode(e))}t.appendChild(n)})();";
public static final String JAVASCRIPT_TEXT_REFLOW = "javascript:document.getElementsByTagName('body')[0].style.width=window.innerWidth+'px';";
public static final String LOAD_READING_URL = "ReadingUrl";
public static final String SEPARATOR = "\\|\\$\\|SEPARATOR\\|\\$\\|";
public static final String HTTP = "http://";
public static final String HTTPS = "https://";

4
src/acr/browser/lightning/LicenseActivity.java

@ -30,6 +30,7 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi @@ -30,6 +30,7 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
findViewById(R.id.licenseAOSP).setOnClickListener(this);
findViewById(R.id.licenseHosts).setOnClickListener(this);
findViewById(R.id.licenseOrbot).setOnClickListener(this);
findViewById(R.id.licenseSnactory).setOnClickListener(this);
}
@Override
@ -47,6 +48,9 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi @@ -47,6 +48,9 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
case R.id.licenseOrbot:
actionView("http://www.gnu.org/licenses/lgpl.html");
break;
case R.id.licenseSnactory:
actionView("http://www.apache.org/licenses/LICENSE-2.0");
break;
}
}

619
src/acr/browser/lightning/Reading/ArticleTextExtractor.java

@ -0,0 +1,619 @@ @@ -0,0 +1,619 @@
package acr.browser.lightning.Reading;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import android.util.Log;
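/**
 * Extracts the main article text and page metadata from an HTML document.
 * <p>
 * An instance can be shared between threads only as long as it is not
 * reconfigured (setXxx/addXxx, setOutputFormatter) while an extraction is
 * running: the patterns and the formatter are plain, unsynchronized fields.
 *
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */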
public class ArticleTextExtractor {
// Interesting nodes
private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
// Unlikely candidates
private String unlikelyStr;
private Pattern UNLIKELY;
// Most likely positive candidates
private String positiveStr;
private Pattern POSITIVE;
// Most likely negative candidates
private String negativeStr;
private Pattern NEGATIVE;
private static final Pattern NEGATIVE_STYLE = Pattern
.compile("hidden|display: ?none|font-size: ?small");
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
{
add("hacker news");
add("facebook");
}
};
private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
private OutputFormatter formatter = DEFAULT_FORMATTER;
/**
 * Creates an extractor configured with the default "unlikely", "positive"
 * and "negative" patterns that are matched against element class names and
 * ids when candidates are weighted.
 */
public ArticleTextExtractor() {
    // Note the "|" after "sponsor": without it the two adjacent string
    // literals were concatenated into the single alternative
    // "sponsora(d|ll|gegate|rchive|ttachment)", so neither "sponsor" nor the
    // a(...) group could match on its own.
    setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
            + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor|"
            + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
            + "login|si(debar|gn|ngle)");
    setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
            + "|arti(cle|kel)|instapaper_body");
    setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
            + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
}
/**
 * Replaces the pattern used to recognise unlikely article containers.
 *
 * @param unlikelyStr
 *            regular expression matched against class names and ids
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor setUnlikely(String unlikelyStr) {
    // Keep the raw expression so addUnlikely() can extend it later.
    this.unlikelyStr = unlikelyStr;
    this.UNLIKELY = Pattern.compile(unlikelyStr);
    return this;
}
/**
 * Appends another alternative to the current "unlikely" pattern.
 *
 * @param unlikelyMatches
 *            regular expression fragment that is OR-ed onto the existing one
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
    String extended = unlikelyStr + "|" + unlikelyMatches;
    return setUnlikely(extended);
}
/**
 * Replaces the pattern used to recognise likely article containers.
 *
 * @param positiveStr
 *            regular expression matched against class names and ids
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor setPositive(String positiveStr) {
    // Keep the raw expression so addPositive() can extend it later.
    this.positiveStr = positiveStr;
    this.POSITIVE = Pattern.compile(positiveStr);
    return this;
}
/**
 * Appends another alternative to the current "positive" pattern.
 *
 * @param pos
 *            regular expression fragment that is OR-ed onto the existing one
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor addPositive(String pos) {
    String extended = positiveStr + "|" + pos;
    return setPositive(extended);
}
/**
 * Replaces the pattern used to recognise containers that are most likely
 * not part of the article.
 *
 * @param negativeStr
 *            regular expression matched against class names and ids
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor setNegative(String negativeStr) {
    // Keep the raw expression so addNegative() can extend it later.
    this.negativeStr = negativeStr;
    this.NEGATIVE = Pattern.compile(negativeStr);
    return this;
}
/**
 * Appends another alternative to the current "negative" pattern.
 *
 * @param neg
 *            regular expression fragment that is OR-ed onto the existing one
 * @return this extractor, for call chaining
 */
public ArticleTextExtractor addNegative(String neg) {
    String extended = negativeStr + "|" + neg;
    setNegative(extended);
    return this;
}
/**
 * Sets the formatter used by the extractContent overloads that do not take
 * an explicit formatter.
 *
 * @param formatter
 *            formatter that turns the best matching element into text
 */
public void setOutputFormatter(OutputFormatter formatter) {
    this.formatter = formatter;
}
/**
 * Extracts the article from an already parsed document using this
 * extractor's current formatter and a fresh result object.
 *
 * @param doc
 *            parsed document to extract from
 * @return the populated extraction result
 * @throws Exception
 *             if the extraction fails
 */
public JResult extractContent(Document doc) throws Exception {
    JResult result = new JResult();
    return extractContent(result, doc, formatter);
}
/**
 * Extracts the article from an already parsed document into a fresh result
 * object, using the supplied formatter.
 *
 * @param doc
 *            parsed document to extract from
 * @param formatter
 *            formatter used to turn the best matching element into text
 * @return the populated extraction result
 * @throws Exception
 *             if the extraction fails
 */
public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {
    JResult result = new JResult();
    return extractContent(result, doc, formatter);
}
/**
 * Parses the given HTML and extracts the article into a fresh result
 * object.
 *
 * @param html
 *            HTML source of the page
 * @return the populated extraction result
 * @throws Exception
 *             if the extraction fails
 */
public JResult extractContent(String html) throws Exception {
    JResult result = new JResult();
    return extractContent(result, html);
}
/**
 * Parses the given HTML and extracts the article into {@code res}, using
 * this extractor's current formatter.
 *
 * @param res
 *            result object to populate
 * @param html
 *            HTML source of the page
 * @return {@code res}, populated
 * @throws Exception
 *             if the extraction fails
 */
public JResult extractContent(JResult res, String html) throws Exception {
    OutputFormatter current = formatter;
    return extractContent(res, html, current);
}
/**
 * Parses the given HTML with jsoup and extracts the article into
 * {@code res}.
 *
 * @param res
 *            result object to populate
 * @param html
 *            HTML source of the page; must not be empty
 * @param formatter
 *            formatter used to turn the best matching element into text
 * @return {@code res}, populated
 * @throws IllegalArgumentException
 *             if {@code html} is empty
 * @throws Exception
 *             if the extraction fails
 */
public JResult extractContent(JResult res, String html, OutputFormatter formatter)
        throws Exception {
    if (!html.isEmpty()) {
        // http://jsoup.org/cookbook/extracting-data/selector-syntax
        Document parsed = Jsoup.parse(html);
        return extractContent(res, parsed, formatter);
    }
    throw new IllegalArgumentException("html string is empty!?");
}
/**
 * Runs the complete extraction on a parsed document and stores everything
 * found in {@code res}: title, description, canonical URL, the text of the
 * best scoring element, images, and the RSS, video, favicon and keyword
 * metadata.
 * <p>
 * The document is modified in place: scripts and styles are removed and
 * candidate elements receive a {@code gravityScore} attribute.
 *
 * @param res
 *            result object to populate
 * @param doc
 *            parsed document; must not be null
 * @param formatter
 *            formatter used to turn the best matching element into text
 * @return {@code res}, populated
 * @throws NullPointerException
 *             if {@code doc} is null
 */
public JResult extractContent(JResult res, Document doc, OutputFormatter formatter)
throws Exception {
if (doc == null)
throw new NullPointerException("missing document");
// Read the head metadata first, before the document is cleaned up below.
res.setTitle(extractTitle(doc));
res.setDescription(extractDescription(doc));
res.setCanonicalUrl(extractCanonicalUrl(doc));
// now remove the clutter
prepareDocument(doc);
// init elements
Collection<Element> nodes = getNodes(doc);
int maxWeight = 0;
Element bestMatchElement = null;
// Keep the first candidate with the strictly highest weight.
for (Element entry : nodes) {
int currentWeight = getWeight(entry);
if (currentWeight > maxWeight) {
maxWeight = currentWeight;
bestMatchElement = entry;
// A weight above 200 is treated as good enough: stop searching.
if (maxWeight > 200)
break;
}
}
if (bestMatchElement != null) {
List<ImageResult> images = new ArrayList<ImageResult>();
Element imgEl = determineImageSource(bestMatchElement, images);
// The image list is only stored when a best image was determined.
if (imgEl != null) {
res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
// TODO remove parent container of image if it is contained in
// bestMatchElement
// to avoid image subtitles flooding in
res.setImages(images);
}
// clean before grabbing text
String text = formatter.getFormattedText(bestMatchElement);
text = removeTitleFromText(text, res.getTitle());
// this fails for short facebook post and probably tweets:
// text.length() > res.getDescription().length()
// Only accept the text when it is longer than the title.
if (text.length() > res.getTitle().length()) {
res.setText(text);
// print("best element:", bestMatchElement);
}
res.setTextList(formatter.getTextList(bestMatchElement));
}
// No image found in the article body: fall back to the page metadata.
if (res.getImageUrl().isEmpty()) {
res.setImageUrl(extractImageUrl(doc));
}
res.setRssUrl(extractRssUrl(doc));
res.setVideoUrl(extractVideoUrl(doc));
res.setFaviconUrl(extractFaviconUrl(doc));
res.setKeywords(extractKeywords(doc));
return res;
}
/**
 * Determines the page title. Sources are tried in order and the first
 * non-empty one wins: the cleaned document title, the {@code <title>}
 * element in the head, then the {@code title}, {@code og:title} and
 * {@code twitter:title} meta tags.
 *
 * @param doc
 *            document to read the title from
 * @return the title, or an empty string if none of the sources yields one
 */
protected String extractTitle(Document doc) {
    String title = cleanTitle(doc.title());
    if (!title.isEmpty()) {
        return title;
    }
    title = SHelper.innerTrim(doc.select("head title").text());
    if (!title.isEmpty()) {
        return title;
    }
    title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
    if (!title.isEmpty()) {
        return title;
    }
    title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
    if (!title.isEmpty()) {
        return title;
    }
    return SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
}
/**
 * Determines the canonical URL of the page. The {@code rel=canonical} link
 * is preferred, then the {@code og:url} and {@code twitter:url} meta tags.
 *
 * @param doc
 *            document to read from
 * @return the canonical URL, or an empty string if none is declared
 */
protected String extractCanonicalUrl(Document doc) {
    String canonical = SHelper.replaceSpaces(
            doc.select("head link[rel=canonical]").attr("href"));
    if (!canonical.isEmpty()) {
        return canonical;
    }
    String openGraph = SHelper.replaceSpaces(
            doc.select("head meta[property=og:url]").attr("content"));
    if (!openGraph.isEmpty()) {
        return openGraph;
    }
    return SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
}
/**
 * Determines the page description. The {@code description} meta tag is
 * preferred, then {@code og:description} and {@code twitter:description}.
 *
 * @param doc
 *            document to read from
 * @return the description, or an empty string if none is declared
 */
protected String extractDescription(Document doc) {
    String plain = SHelper.innerTrim(
            doc.select("head meta[name=description]").attr("content"));
    if (!plain.isEmpty()) {
        return plain;
    }
    String openGraph = SHelper.innerTrim(
            doc.select("head meta[property=og:description]").attr("content"));
    if (!openGraph.isEmpty()) {
        return openGraph;
    }
    return SHelper.innerTrim(
            doc.select("head meta[name=twitter:description]").attr("content"));
}
/**
 * Reads the {@code keywords} meta tag and splits it at commas. A value
 * wrapped in square brackets has the brackets removed first.
 *
 * @param doc
 *            document to read from
 * @return the keywords, or an empty list if the tag is missing or empty
 */
protected Collection<String> extractKeywords(Document doc) {
    String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
    if (content == null) {
        return Collections.emptyList();
    }
    boolean bracketed = content.startsWith("[") && content.endsWith("]");
    if (bracketed) {
        content = content.substring(1, content.length() - 1);
    }
    String[] keywords = content.split("\\s*,\\s*");
    // A single empty token means there was nothing to split.
    boolean hasKeyword = keywords.length > 1
            || (keywords.length == 1 && !keywords[0].isEmpty());
    if (hasKeyword) {
        return Arrays.asList(keywords);
    }
    return Collections.emptyList();
}
/**
 * Looks up an image URL in the page metadata; used when
 * determineImageSource did not find an image. Sources are tried in order:
 * {@code og:image}, {@code twitter:image}, the {@code image_src} link and
 * finally the {@code thumbnail} meta tag.
 *
 * @param doc
 *            document to read from
 * @return image url or empty str
 */
protected String extractImageUrl(Document doc) {
    // use open graph tag to get image
    String openGraph = SHelper.replaceSpaces(
            doc.select("head meta[property=og:image]").attr("content"));
    if (!openGraph.isEmpty()) {
        return openGraph;
    }
    String twitter = SHelper.replaceSpaces(
            doc.select("head meta[name=twitter:image]").attr("content"));
    if (!twitter.isEmpty()) {
        return twitter;
    }
    // prefer link over thumbnail-meta if empty
    String imageSrc = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
    if (!imageSrc.isEmpty()) {
        return imageSrc;
    }
    return SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
}
/**
 * Reads the RSS feed URL from the {@code rel=alternate} links of type
 * {@code application/rss+xml}.
 *
 * @param doc
 *            document to read from
 * @return the feed URL, or an empty string if none is declared
 */
protected String extractRssUrl(Document doc) {
    Elements alternates = doc.select("link[rel=alternate]");
    Elements feeds = alternates.select("link[type=application/rss+xml]");
    return SHelper.replaceSpaces(feeds.attr("href"));
}
/**
 * Reads the video URL from the {@code og:video} meta tag.
 *
 * @param doc
 *            document to read from
 * @return the video URL, or an empty string if none is declared
 */
protected String extractVideoUrl(Document doc) {
    String video = doc.select("head meta[property=og:video]").attr("content");
    return SHelper.replaceSpaces(video);
}
/**
 * Reads the favicon URL. A plain {@code rel=icon} link is preferred; if
 * there is none, links whose rel starts with "shortcut" or ends with
 * "icon" are used.
 *
 * @param doc
 *            document to read from
 * @return the favicon URL, or an empty string if none is declared
 */
protected String extractFaviconUrl(Document doc) {
    String icon = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
    if (!icon.isEmpty()) {
        return icon;
    }
    return SHelper.replaceSpaces(
            doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
}
/**
 * Computes the total weight of an element from three parts: the class/id
 * pattern weight, one point per ten characters of the element's own text
 * (rounded), and the weight contributed by its direct children.
 *
 * @param e
 *            Element to weight, along with child nodes
 * @return the combined weight
 */
protected int getWeight(Element e) {
    int patternWeight = calcWeight(e);
    int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10);
    int childWeight = weightChildNodes(e);
    return patternWeight + ownTextWeight + childWeight;
}
/**
 * Weights the direct children of the given element.
 * <p>
 * Children with fewer than 20 characters of own text are ignored. Every
 * other child adds {@code max(50, length / 10)} when its own text is longer
 * than 200 characters, 30 when it is an {@code h1}/{@code h2}, or the value
 * of calcWeightForChild when it is a {@code div}/{@code p}. A
 * {@code div}/{@code p} child with class "caption" adds another 30.
 * <p>
 * When at least two paragraphs with more than 50 characters were seen, a
 * second pass adds 20 per heading child and adjusts the
 * {@code gravityScore} attribute of the children: -30 for table, li, td and
 * th elements, +30 for paragraphs.
 *
 * @param rootEl
 *            element whose child nodes will be weighted
 * @return the weight contributed by the children
 */
protected int weightChildNodes(Element rootEl) {
    int weight = 0;
    Element caption = null;
    List<Element> pEls = new ArrayList<Element>(5);
    for (Element child : rootEl.children()) {
        String ownText = child.ownText();
        int ownTextLength = ownText.length();
        if (ownTextLength < 20) {
            continue;
        }
        if (ownTextLength > 200) {
            weight += Math.max(50, ownTextLength / 10);
        }
        String tag = child.tagName();
        if (tag.equals("h1") || tag.equals("h2")) {
            weight += 30;
        } else if (tag.equals("div") || tag.equals("p")) {
            weight += calcWeightForChild(child, ownText);
            if (tag.equals("p") && ownTextLength > 50) {
                pEls.add(child);
            }
            // Locale-independent comparison: lower-casing with the default
            // locale turns "I" into a dotless i under Turkish/Azeri locales.
            if ("caption".equalsIgnoreCase(child.className())) {
                caption = child;
            }
        }
    }
    // use caption and image
    if (caption != null) {
        weight += 30;
    }
    if (pEls.size() >= 2) {
        for (Element subEl : rootEl.children()) {
            String tag = subEl.tagName();
            // Tags are compared for equality. The previous substring test
            // ("table;li;td;th".contains(tag)) also matched <a>, <b> and <i>
            // and penalised them as if they were table elements.
            if (matchesAnyTag(tag, "h1", "h2", "h3", "h4", "h5", "h6")) {
                weight += 20;
            } else if (matchesAnyTag(tag, "table", "li", "td", "th")) {
                addScore(subEl, -30);
            }
            if ("p".equals(tag)) {
                addScore(subEl, 30);
            }
        }
    }
    return weight;
}

/** Returns true if {@code tag} is equal to one of the given tag names. */
private static boolean matchesAnyTag(String tag, String... names) {
    for (String name : names) {
        if (name.equals(tag)) {
            return true;
        }
    }
    return false;
}
/**
 * Adds {@code score} to the element's current {@code gravityScore}.
 *
 * @param el
 *            element whose score is changed
 * @param score
 *            amount to add; may be negative
 */
public void addScore(Element el, int score) {
    int current = getScore(el);
    setScore(el, score + current);
}
/**
 * Reads the element's {@code gravityScore} attribute.
 *
 * @param el
 *            element to read the score from
 * @return the stored score, or 0 if it cannot be read as an integer
 */
public int getScore(Element el) {
    try {
        return Integer.parseInt(el.attr("gravityScore"));
    } catch (Exception ignored) {
        // Deliberately ignored: an unreadable score counts as 0.
        return 0;
    }
}
/**
 * Stores the score in the element's {@code gravityScore} attribute.
 *
 * @param el
 *            element to annotate
 * @param score
 *            score to store
 */
public void setScore(Element el, int score) {
    String value = Integer.toString(score);
    el.attr("gravityScore", value);
}
/**
 * Scores a child by its own text and adds that score to the child's
 * {@code gravityScore}. Text containing more than five occurrences of
 * escaped markup or "px" scores -30; otherwise the score is one point per
 * 25 characters, rounded.
 *
 * @param child
 *            child element being scored
 * @param ownText
 *            the child's own text
 * @return the score that was added to the child
 */
private int calcWeightForChild(Element child, String ownText) {
    String[] markupTokens = { "&quot;", "&lt;", "&gt;", "px" };
    int tokenCount = 0;
    for (String token : markupTokens) {
        tokenCount += SHelper.count(ownText, token);
    }
    int val = tokenCount > 5 ? -30 : (int) Math.round(ownText.length() / 25.0);
    addScore(child, val);
    return val;
}
/**
 * Scores an element by its class name, id and inline style: +35/+40 for a
 * positive match on class/id, -20 each for an unlikely match, -50 each for
 * a negative match, and -50 when the inline style hides or shrinks the
 * element.
 *
 * @param e
 *            element to score
 * @return the pattern-based weight
 */
private int calcWeight(Element e) {
    String className = e.className();
    String id = e.id();
    int weight = 0;
    if (POSITIVE.matcher(className).find()) {
        weight += 35;
    }
    if (POSITIVE.matcher(id).find()) {
        weight += 40;
    }
    if (UNLIKELY.matcher(className).find()) {
        weight -= 20;
    }
    if (UNLIKELY.matcher(id).find()) {
        weight -= 20;
    }
    if (NEGATIVE.matcher(className).find()) {
        weight -= 50;
    }
    if (NEGATIVE.matcher(id).find()) {
        weight -= 50;
    }
    String style = e.attr("style");
    boolean hasStyle = style != null && !style.isEmpty();
    if (hasStyle && NEGATIVE_STYLE.matcher(style).find()) {
        weight -= 50;
    }
    return weight;
}
/**
 * Picks the most relevant image for the given element and collects all
 * candidate images.
 * <p>
 * Images are searched inside {@code el}; if there are none, the images of
 * its parent are used when a parent exists. Images without a source, or
 * whose URL looks like an ad, are skipped. Each remaining image is weighted
 * by its height and width attributes (+20 if at least 50, -20 if smaller,
 * nothing if not numeric), by alt and title text longer than 35 characters
 * (+20 each) and by a "nofollow" rel on its parent (-40). Every time a new
 * best image is found the multiplier applied to later images is halved.
 *
 * @param el
 *            element to search for images
 * @param images
 *            receives every candidate image, sorted by descending weight
 * @return the best image element, or null if no image has a positive weight
 */
public Element determineImageSource(Element el, List<ImageResult> images) {
    int maxWeight = 0;
    Element maxNode = null;
    Elements els = el.select("img");
    // An element without a parent has nowhere to fall back to; this used to
    // fail with a NullPointerException.
    if (els.isEmpty() && el.parent() != null) {
        els = el.parent().select("img");
    }
    double score = 1;
    for (Element e : els) {
        String sourceUrl = e.attr("src");
        if (sourceUrl.isEmpty() || isAdImage(sourceUrl)) {
            continue;
        }
        int weight = 0;
        int height = 0;
        try {
            height = Integer.parseInt(e.attr("height"));
            weight += height >= 50 ? 20 : -20;
        } catch (NumberFormatException ignored) {
            // No usable height attribute: the height does not affect the weight.
        }
        int width = 0;
        try {
            width = Integer.parseInt(e.attr("width"));
            weight += width >= 50 ? 20 : -20;
        } catch (NumberFormatException ignored) {
            // No usable width attribute: the width does not affect the weight.
        }
        String alt = e.attr("alt");
        if (alt.length() > 35) {
            weight += 20;
        }
        String title = e.attr("title");
        if (title.length() > 35) {
            weight += 20;
        }
        boolean noFollow = false;
        if (e.parent() != null) {
            String rel = e.parent().attr("rel");
            if (rel != null && rel.contains("nofollow")) {
                noFollow = true;
                weight -= 40;
            }
        }
        weight = (int) (weight * score);
        if (weight > maxWeight) {
            maxWeight = weight;
            maxNode = e;
            score = score / 2;
        }
        ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
                noFollow);
        images.add(image);
    }
    Collections.sort(images, new ImageComparator());
    return maxNode;
}
/**
 * Prepares the document for weighting. Currently this only removes
 * scripts, noscript blocks and styles; stripping unlikely candidates is
 * disabled.
 *
 * @param doc
 *            document to prepare; it is modified in place
 */
protected void prepareDocument(Document doc) {
    // Disabled on purpose: stripUnlikelyCandidates(doc);
    removeScriptsAndStyles(doc);
}
/**
 * Removes every element below {@code body} whose lower-cased class name or
 * id matches the negative pattern.
 *
 * @param doc
 *            document to strip unlikely candidates from; modified in place
 */
protected void stripUnlikelyCandidates(Document doc) {
    for (Element child : doc.select("body").select("*")) {
        // Locale.ROOT keeps the lower-casing independent of the device
        // language; under a Turkish locale "SIDEBAR" would otherwise become
        // "sıdebar" and no longer match the pattern.
        String className = child.className().toLowerCase(Locale.ROOT);
        String id = child.id().toLowerCase(Locale.ROOT);
        if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
            child.remove();
        }
    }
}
/**
 * Removes all {@code script}, {@code noscript} and {@code style} elements
 * from the document.
 *
 * @param doc
 *            document to clean; modified in place
 * @return the same document
 */
private Document removeScriptsAndStyles(Document doc) {
    String[] clutterTags = { "script", "noscript", "style" };
    for (String tag : clutterTags) {
        Elements matches = doc.getElementsByTag(tag);
        for (Element match : matches) {
            match.remove();
        }
    }
    return doc;
}
/**
 * Heuristic ad check: an image counts as an ad when SHelper.count reports
 * at least two occurrences of "ad" in its URL.
 *
 * @param imageUrl
 *            image source URL
 * @return true if the URL looks like an ad
 */
private boolean isAdImage(String imageUrl) {
    int occurrences = SHelper.count(imageUrl, "ad");
    return occurrences >= 2;
}
/**
 * Hook for removing the title from the extracted text. Currently a no-op:
 * cutting the text at the title made the result hard to read, so the text
 * is returned unchanged.
 *
 * @param text
 *            extracted article text
 * @param title
 *            page title (unused)
 * @return {@code text}, unchanged
 */
public String removeTitleFromText(String text, String title) {
    // Intentionally no stripping of the title.
    final String unchanged = text;
    return unchanged;
}
/**
 * Collects the candidate elements below {@code body} (p, div, td, h1, h2,
 * article and section) and assigns each a starting {@code gravityScore}
 * that begins at 100 and is halved for every further candidate.
 * <p>
 * The candidates are returned in document order. The order matters because
 * the caller keeps the first element with the highest weight and stops
 * early once a weight exceeds 200; a plain HashSet would make that choice
 * depend on hash order.
 *
 * @param doc
 *            document to scan
 * @return a set of all important nodes, in document order
 */
public Collection<Element> getNodes(Document doc) {
    Set<Element> nodes = new LinkedHashSet<Element>(64);
    int score = 100;
    for (Element el : doc.select("body").select("*")) {
        if (NODES.matcher(el.tagName()).matches()) {
            nodes.add(el);
            setScore(el, score);
            score = score / 2;
        }
    }
    return nodes;
}
/**
 * Cleans a page title that is made of "|"-separated parts. Parts listed in
 * IGNORED_TITLE_PARTS are dropped, and so is a part that is reached when
 * the number of kept parts equals the index of the last part and that is
 * shorter than everything kept so far.
 *
 * @param title
 *            raw title
 * @return the remaining parts joined with "|", passed through
 *         SHelper.innerTrim
 */
public String cleanTitle(String title) {
    String[] parts = title.split("\\|");
    int lastIndex = parts.length - 1;
    StringBuilder cleaned = new StringBuilder();
    int kept = 0;
    for (String part : parts) {
        String key = part.toLowerCase(Locale.getDefault()).trim();
        boolean ignored = IGNORED_TITLE_PARTS.contains(key);
        boolean shortTail = kept == lastIndex && cleaned.length() > part.length();
        if (ignored || shortTail) {
            continue;
        }
        if (kept > 0) {
            cleaned.append("|");
        }
        cleaned.append(part);
        kept++;
    }
    return SHelper.innerTrim(cleaned.toString());
}
/**
 * Orders images by descending weight, so the heaviest image sorts first.
 *
 * @author Chris Alexander, chris@chris-alexander.co.uk
 *
 */
public class ImageComparator implements Comparator<ImageResult> {
    @Override
    public int compare(ImageResult left, ImageResult right) {
        // Operands are swapped on purpose to get the highest weight first.
        return right.weight.compareTo(left.weight);
    }
}
}

243
src/acr/browser/lightning/Reading/Converter.java

@ -0,0 +1,243 @@ @@ -0,0 +1,243 @@
/*
* Copyright 2011 Peter Karich
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package acr.browser.lightning.Reading;
import java.io.*;
import java.net.SocketTimeoutException;
import java.nio.charset.Charset;
import java.util.Locale;
import acr.browser.lightning.Constants;
import android.util.Log;
/**
 * Reads an {@link InputStream} into a String. The character encoding is
 * taken from the caller's hint and then refined by sniffing the first bytes
 * of the stream for a {@code charset=} or {@code encoding=} declaration.
 * <p>
 * This class is not thread safe. Use one new instance every time due to
 * encoding variable.
 *
 * @author Peter Karich
 */
public class Converter {
    public final static String UTF8 = "UTF-8";
    public final static String ISO = "ISO-8859-1";
    /** Size in bytes of the read buffer and of one sniffing stage. */
    public final static int K2 = 2048;
    /** Upper bound of bytes read from a stream before reading is stopped. */
    private int maxBytes = 1000000 / 2;
    /** Encoding used by the most recent {@code streamToString} call. */
    private String encoding;
    /** Only used to make log messages more helpful. */
    private String url;

    /**
     * @param urlOnlyHint url that is only used in log output
     */
    public Converter(String urlOnlyHint) {
        url = urlOnlyHint;
    }

    public Converter() {
    }

    /**
     * @param maxBytes maximum number of bytes to read from a stream
     * @return this instance, for chaining
     */
    public Converter setMaxBytes(int maxBytes) {
        this.maxBytes = maxBytes;
        return this;
    }

    /**
     * Extracts the charset parameter of a Content-Type header value.
     *
     * @param contentType header value such as {@code text/html; charset=utf-8},
     *            may be null
     * @return the lower-cased charset with surrounding quotes removed, or
     *         {@link #ISO} if none is declared
     */
    public static String extractEncoding(String contentType) {
        String[] values;
        if (contentType != null)
            values = contentType.split(";");
        else
            values = new String[0];
        String charset = "";
        for (String value : values) {
            // Locale.ROOT: charset names are ASCII, a locale-sensitive
            // lower-casing (e.g. Turkish dotless i) would corrupt them
            value = value.trim().toLowerCase(Locale.ROOT);
            if (value.startsWith("charset="))
                charset = value.substring("charset=".length()).trim();
        }
        // the parameter value may be quoted: charset="utf-8"
        if (charset.length() >= 2) {
            char first = charset.charAt(0);
            char last = charset.charAt(charset.length() - 1);
            if ((first == '"' || first == '\'') && first == last)
                charset = charset.substring(1, charset.length() - 1).trim();
        }
        // http1.1 says ISO-8859-1 is the default charset
        if (charset.length() == 0)
            charset = ISO;
        return charset;
    }

    /**
     * @return the lower-cased encoding used by the last conversion, or an
     *         empty string if nothing was converted yet
     */
    public String getEncoding() {
        if (encoding == null)
            return "";
        return encoding.toLowerCase(Locale.ROOT);
    }

    /**
     * Converts the stream using the configured byte limit and the encoding
     * of the previous conversion (if any).
     */
    public String streamToString(InputStream is) {
        return streamToString(is, maxBytes, encoding);
    }

    /**
     * Converts the stream using the configured byte limit and the given
     * encoding hint.
     */
    public String streamToString(InputStream is, String enc) {
        return streamToString(is, maxBytes, enc);
    }

    /**
     * Reads bytes off the stream and returns them as a string. The stream is
     * always closed before this method returns.
     *
     * @param is
     *            the stream to read
     * @param maxBytes
     *            The max bytes that we want to read from the input stream
     * @param enc
     *            encoding hint; null or empty means UTF-8
     * @return the decoded content, or an empty string if reading failed
     */
    public String streamToString(InputStream is, int maxBytes, String enc) {
        encoding = enc;
        // Http 1.1. standard is iso-8859-1 not utf8 :(
        // but we force utf-8 as youtube assumes it ;)
        if (encoding == null || encoding.isEmpty())
            encoding = UTF8;
        BufferedInputStream in = null;
        try {
            in = new BufferedInputStream(is, K2);
            ByteArrayOutputStream output = new ByteArrayOutputStream();
            // detect encoding with the help of meta tag
            try {
                // each detection stage may consume just under 2 * K2 bytes,
                // so the mark has to stay valid for both stages
                in.mark(K2 * 4);
                String tmpEnc = detectCharset("charset=", output, in, encoding);
                if (tmpEnc != null)
                    encoding = tmpEnc;
                else {
                    Log.d(Constants.TAG, "no charset found in first stage");
                    // detect with the help of xml beginning ala
                    // encoding="charset"
                    tmpEnc = detectCharset("encoding=", output, in, encoding);
                    if (tmpEnc != null)
                        encoding = tmpEnc;
                    else
                        Log.d(Constants.TAG, "no charset found in second stage");
                }
                if (!Charset.isSupported(encoding))
                    throw new UnsupportedEncodingException(encoding);
            } catch (UnsupportedEncodingException e) {
                useDefaultEncoding(e);
            } catch (IllegalArgumentException e) {
                // Charset.isSupported rejects syntactically illegal names
                // with an unchecked IllegalCharsetNameException
                useDefaultEncoding(e);
            }
            // SocketException: Connection reset
            // IOException: missing CR => problem on server (probably some xml
            // character thing?)
            // IOException: Premature EOF => socket unexpectly closed from
            // server
            int bytesRead = output.size();
            byte[] arr = new byte[K2];
            while (true) {
                if (bytesRead >= maxBytes) {
                    Log.d(Constants.TAG, "Maxbyte of " + maxBytes
                            + " exceeded! Maybe html is now broken but try it nevertheless. Url: "
                            + url);
                    break;
                }
                int n = in.read(arr);
                if (n < 0)
                    break;
                bytesRead += n;
                output.write(arr, 0, n);
            }
            return output.toString(encoding);
        } catch (SocketTimeoutException e) {
            Log.e(Constants.TAG, e.toString() + " url:" + url);
        } catch (IOException e) {
            Log.e(Constants.TAG, e.toString() + " url:" + url);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // best effort: nothing useful can be done if close fails
                }
            }
        }
        return "";
    }

    /** Logs why the detected encoding is unusable and falls back to UTF-8. */
    private void useDefaultEncoding(Exception e) {
        Log.d(Constants.TAG,
                "Using default encoding:" + UTF8 + " problem:" + e.getMessage()
                        + " encoding:" + encoding + " " + url);
        encoding = UTF8;
    }

    /**
     * This method detects the charset even if the first call only returns some
     * bytes. It keeps reading until at least {@link #K2} bytes were consumed
     * (or the stream ended), appends them to {@code bos} and then searches
     * everything collected so far for {@code key}.
     * <p>
     * When a charset is found both {@code in} and {@code bos} are reset so
     * that the caller can re-read the content with the new encoding.
     * Otherwise the consumed bytes stay in {@code bos}.
     *
     * @param key the marker preceding the charset name, e.g. {@code charset=}
     * @param bos collects the bytes that were consumed for sniffing
     * @param in stream with a valid mark at the start of the content
     * @param enc encoding used to decode the sniffed bytes
     * @return the detected charset name or null if none was found
     * @throws IOException if reading fails or {@code enc} is unsupported
     */
    protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
            String enc) throws IOException {
        // Grab better encoding from stream
        byte[] arr = new byte[K2];
        int nSum = 0;
        while (nSum < K2) {
            int n = in.read(arr);
            if (n < 0)
                break;
            nSum += n;
            bos.write(arr, 0, n);
        }
        String str = bos.toString(enc);
        int encIndex = str.indexOf(key);
        int valueStart = encIndex + key.length();
        // nothing found, or the key is the very last thing in the buffer
        if (encIndex < 0 || valueStart >= str.length())
            return null;
        char startChar = str.charAt(valueStart);
        int lastEncIndex;
        if (startChar == '\'') {
            // if we have charset='something'
            valueStart++;
            lastEncIndex = str.indexOf("'", valueStart);
        } else if (startChar == '\"') {
            // if we have charset="something"
            valueStart++;
            lastEncIndex = str.indexOf("\"", valueStart);
        } else {
            // if we have "text/html; charset=utf-8"
            int first = str.indexOf("\"", valueStart);
            if (first < 0)
                first = Integer.MAX_VALUE;
            // or "text/html; charset=utf-8 "
            int sec = str.indexOf(" ", valueStart);
            if (sec < 0)
                sec = Integer.MAX_VALUE;
            lastEncIndex = Math.min(first, sec);
            // or "text/html; charset=utf-8 '
            int third = str.indexOf("'", valueStart);
            if (third > 0)
                lastEncIndex = Math.min(lastEncIndex, third);
        }
        // re-read byte array with different encoding
        // assume that the encoding string cannot be greater than 40 chars
        if (lastEncIndex > valueStart && lastEncIndex < valueStart + 40) {
            String tmpEnc = SHelper.encodingCleanup(str.substring(valueStart, lastEncIndex));
            // e.g. charset="  " leaves nothing usable after the cleanup
            if (tmpEnc.isEmpty())
                return null;
            try {
                in.reset();
                bos.reset();
                return tmpEnc;
            } catch (IOException ex) {
                Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "
                        + tmpEnc + " " + ex.toString());
            }
        }
        return null;
    }
}

445
src/acr/browser/lightning/Reading/HtmlFetcher.java

@ -0,0 +1,445 @@ @@ -0,0 +1,445 @@
/*
* Copyright 2011 Peter Karich
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package acr.browser.lightning.Reading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import acr.browser.lightning.Constants;
import android.util.Log;
/**
 * Class to fetch articles. The original author documents this class as
 * thread safe.
 * <p>
 * NOTE(review): the configuration setters are unsynchronized plain field
 * writes, so instances should be configured before they are shared.
 *
 * @author Peter Karich
 */
public class HtmlFetcher {
    static {
        SHelper.enableCookieMgmt();
        SHelper.enableUserAgentOverwrite();
        // NOTE(review): the name suggests certificate validation is relaxed
        // process-wide - confirm this is acceptable inside a browser app
        SHelper.enableAnySSL();
    }

    /** Upper bound for following chains of URL shorteners. */
    private static final int MAX_RESOLVE_DEPTH = 5;

    /**
     * Developer utility: reads lines from {@code urls.txt}, takes the text
     * between the first two double quotes of every line as url, fetches it
     * and stores the html as {@code <domain>[2].html}.
     */
    public static void main(String[] args) throws Exception {
        BufferedReader reader = new BufferedReader(new FileReader("urls.txt"));
        try {
            String line = null;
            Set<String> existing = new LinkedHashSet<String>();
            while ((line = reader.readLine()) != null) {
                int index1 = line.indexOf("\"");
                int index2 = line.indexOf("\"", index1 + 1);
                String url = line.substring(index1 + 1, index2);
                String domainStr = SHelper.extractDomain(url, true);
                String counterStr = "";
                // TODO more similarities
                if (existing.contains(domainStr))
                    counterStr = "2";
                else
                    existing.add(domainStr);
                String html = new HtmlFetcher().fetchAsString(url, 20000);
                String outFile = domainStr + counterStr + ".html";
                BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
                try {
                    writer.write(html);
                } finally {
                    // close even if writing fails
                    writer.close();
                }
            }
        } finally {
            reader.close();
        }
    }

    private String referrer = "https://github.com/karussell/snacktory";
    private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ")";
    private String cacheControl = "max-age=0";
    private String language = "en-us";
    private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
    private String charset = "UTF-8";
    /** Optional result cache, null means caching is disabled. */
    private SCache cache;
    /** Counts how often a result was served from the cache. */
    private AtomicInteger cacheCounter = new AtomicInteger(0);
    /** Maximum length of the extracted text, negative means unlimited. */
    private int maxTextLength = -1;
    private ArticleTextExtractor extractor = new ArticleTextExtractor();
    /** Shortener domains whose redirect target has to be resolved again. */
    private final Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
        {
            add("bit.ly");
            add("cli.gs");
            add("deck.ly");
            add("fb.me");
            add("feedproxy.google.com");
            add("flic.kr");
            add("fur.ly");
            add("goo.gl");
            add("is.gd");
            add("ink.co");
            add("j.mp");
            add("lnkd.in");
            add("on.fb.me");
            add("ow.ly");
            add("plurl.us");
            add("sns.mx");
            add("snurl.com");
            add("su.pr");
            add("t.co");
            add("tcrn.ch");
            add("tl.gd");
            add("tiny.cc");
            add("tinyurl.com");
            add("tmi.me");
            add("tr.im");
            add("twurl.nl");
        }
    };

    public HtmlFetcher() {
    }

    public void setExtractor(ArticleTextExtractor extractor) {
        this.extractor = extractor;
    }

    public ArticleTextExtractor getExtractor() {
        return extractor;
    }

    public HtmlFetcher setCache(SCache cache) {
        this.cache = cache;
        return this;
    }

    public SCache getCache() {
        return cache;
    }

    public int getCacheCounter() {
        return cacheCounter.get();
    }

    public HtmlFetcher clearCacheCounter() {
        cacheCounter.set(0);
        return this;
    }

    public HtmlFetcher setMaxTextLength(int maxTextLength) {
        this.maxTextLength = maxTextLength;
        return this;
    }

    public int getMaxTextLength() {
        return maxTextLength;
    }

    public void setAccept(String accept) {
        this.accept = accept;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public void setCacheControl(String cacheControl) {
        this.cacheControl = cacheControl;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public String getReferrer() {
        return referrer;
    }

    public HtmlFetcher setReferrer(String referrer) {
        this.referrer = referrer;
        return this;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    public String getAccept() {
        return accept;
    }

    public String getCacheControl() {
        return cacheControl;
    }

    public String getCharset() {
        return charset;
    }

    /**
     * Fetches the page behind {@code url} and extracts its content.
     *
     * @param url the url to fetch; hashbangs and google/facebook redirect
     *            wrappers are unwrapped first
     * @param timeout connect and read timeout in milliseconds
     * @param resolve if true the url is resolved via a HEAD request before
     *            the content is fetched
     * @return the (possibly cached) result; if the url could not be resolved
     *         a result containing only the url is returned
     * @throws Exception if fetching or extracting fails
     */
    public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
        String originalUrl = url;
        url = SHelper.removeHashbang(url);
        String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
        if (gUrl != null)
            url = gUrl;
        else {
            gUrl = SHelper.getUrlFromUglyFacebookRedirect(url);
            if (gUrl != null)
                url = gUrl;
        }
        if (resolve) {
            // check if we can avoid resolving the URL (which hits the website!)
            JResult res = getFromCache(url, originalUrl);
            if (res != null)
                return res;
            String resUrl = getResolvedUrl(url, timeout);
            // null is tolerated as well in case a subclass overrides
            // getResolvedUrl
            if (resUrl == null || resUrl.isEmpty()) {
                Log.d(Constants.TAG, "resolved url is empty. Url is: " + url);
                JResult result = new JResult();
                if (cache != null)
                    cache.put(url, result);
                return result.setUrl(url);
            }
            // if resolved url is longer then use it!
            if (resUrl.trim().length() > url.length()) {
                // this is necessary e.g. for some homebaken url resolvers which
                // return
                // the resolved url relative to url!
                url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
            }
        }
        // check if we have the (resolved) URL in cache
        JResult res = getFromCache(url, originalUrl);
        if (res != null)
            return res;
        JResult result = new JResult();
        // or should we use? <link rel="canonical"
        // href="http://www.N24.de/news/newsitem_6797232.html"/>
        result.setUrl(url);
        result.setOriginalUrl(originalUrl);
        result.setDate(SHelper.estimateDate(url));
        // Immediately put the url into the cache as extracting content takes
        // time.
        if (cache != null) {
            cache.put(originalUrl, result);
            cache.put(url, result);
        }
        String lowerUrl = url.toLowerCase(Locale.getDefault());
        if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
            // skip
        } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
            result.setVideoUrl(url);
        } else if (SHelper.isImage(lowerUrl)) {
            result.setImageUrl(url);
        } else {
            extractor.extractContent(result, fetchAsString(url, timeout));
            if (result.getFaviconUrl().isEmpty())
                result.setFaviconUrl(SHelper.getDefaultFavicon(url));
            // some links are relative to root and do not include the domain of
            // the url :(
            result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
            result.setImageUrl(fixUrl(url, result.getImageUrl()));
            result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
            result.setRssUrl(fixUrl(url, result.getRssUrl()));
        }
        result.setText(lessText(result.getText()));
        // wake up anybody waiting on the result object that was cached above
        synchronized (result) {
            result.notifyAll();
        }
        return result;
    }

    /**
     * @return the text cut down to {@code maxTextLength} characters (if that
     *         limit is not negative); null becomes an empty string
     */
    public String lessText(String text) {
        if (text == null)
            return "";
        if (maxTextLength >= 0 && text.length() > maxTextLength)
            return text.substring(0, maxTextLength);
        return text;
    }

    private static String fixUrl(String url, String urlOrPath) {
        return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
    }

    /**
     * Same as {@link #fetchAsString(String, int, boolean)} with the extra
     * request headers enabled.
     */
    public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException,
            IOException {
        return fetchAsString(urlAsString, timeout, true);
    }

    /**
     * Downloads the content of the url, transparently handling gzip and
     * deflate content encodings.
     *
     * @param timeout connect and read timeout in milliseconds
     * @param includeSomeGooseOptions if true the language, charset, referer
     *            and cache-control headers are sent as well
     * @return the decoded response body
     */
    public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
            throws MalformedURLException, IOException {
        HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
        hConn.setInstanceFollowRedirects(true);
        String encoding = hConn.getContentEncoding();
        InputStream is;
        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
            is = new GZIPInputStream(hConn.getInputStream());
        } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
            is = new InflaterInputStream(hConn.getInputStream(), new Inflater(true));
        } else {
            is = hConn.getInputStream();
        }
        String enc = Converter.extractEncoding(hConn.getContentType());
        String res = createConverter(urlAsString).streamToString(is, enc);
        Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString);
        return res;
    }

    public Converter createConverter(String url) {
        return new Converter(url);
    }

    /**
     * On some devices we have to hack:
     * http://developers.sun.com/mobility/reference
     * /techart/design_guidelines/http_redirection.html
     *
     * @param timeout
     *            Sets a specified timeout value, in milliseconds
     * @return the redirect target if the server answered with a redirect, the
     *         same url if the response code is OK or no redirect target was
     *         given, or an empty string if the url could not be resolved
     *         (within the specified time)
     */
    public String getResolvedUrl(String urlAsString, int timeout) {
        return getResolvedUrl(urlAsString, timeout, 0);
    }

    /**
     * Resolves one hop and follows known shorteners recursively, at most
     * {@link #MAX_RESOLVE_DEPTH} further hops so that shorteners pointing at
     * each other cannot recurse endlessly.
     */
    private String getResolvedUrl(String urlAsString, int timeout, int depth) {
        String newUrl = null;
        int responseCode = -1;
        HttpURLConnection hConn = null;
        try {
            hConn = createUrlConnection(urlAsString, timeout, true);
            // force no follow
            hConn.setInstanceFollowRedirects(false);
            // the program doesn't care what the content actually is !!
            // http://java.sun.com/developer/JDCTechTips/2003/tt0422.html
            hConn.setRequestMethod("HEAD");
            hConn.connect();
            responseCode = hConn.getResponseCode();
            hConn.getInputStream().close();
            if (responseCode == HttpURLConnection.HTTP_OK)
                return urlAsString;
            newUrl = hConn.getHeaderField("Location");
            if (responseCode / 100 == 3 && newUrl != null) {
                newUrl = newUrl.replaceAll(" ", "+");
                // some services use (none-standard) utf8 in their location
                // header
                if (urlAsString.startsWith("http://bit.ly")
                        || urlAsString.startsWith("http://is.gd"))
                    newUrl = encodeUriFromHeader(newUrl);
                // fix problems if shortened twice. as it is often the case
                // after twitters' t.co bullshit
                if (depth < MAX_RESOLVE_DEPTH
                        && furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
                    newUrl = getResolvedUrl(newUrl, timeout, depth + 1);
                return newUrl;
            } else
                return urlAsString;
        } catch (Exception ex) {
            Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage());
            return "";
        } finally {
            // release the connection, only the headers were of interest
            if (hConn != null)
                hConn.disconnect();
            // this is a routine trace, not an error
            Log.d(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl);
        }
    }

    /**
     * Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding
     * to non-ASCII characters. Workaround for broken origin servers that send
     * UTF-8 in the Location: header.
     */
    static String encodeUriFromHeader(String badLocation) {
        StringBuilder sb = new StringBuilder();
        for (char ch : badLocation.toCharArray()) {
            if (ch < (char) 128) {
                sb.append(ch);
            } else {
                // this is ONLY valid if the uri was decoded using ISO-8859-1
                sb.append(String.format("%%%02X", (int) ch));
            }
        }
        return sb.toString();
    }

    /**
     * Opens a connection (bypassing any proxy) and applies the configured
     * request headers and timeouts. Nothing is sent yet.
     *
     * @param timeout connect and read timeout in milliseconds
     * @param includeSomeGooseOptions if true the language, charset, referer
     *            and cache-control headers are set as well
     */
    protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
            boolean includeSomeGooseOptions) throws MalformedURLException, IOException {
        URL url = new URL(urlAsStr);
        // using proxy may increase latency
        HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
        hConn.setRequestProperty("User-Agent", userAgent);
        hConn.setRequestProperty("Accept", accept);
        if (includeSomeGooseOptions) {
            hConn.setRequestProperty("Accept-Language", language);
            hConn.setRequestProperty("content-charset", charset);
            hConn.addRequestProperty("Referer", referrer);
            // avoid the cache for testing purposes only?
            hConn.setRequestProperty("Cache-Control", cacheControl);
        }
        // suggest respond to be gzipped or deflated (which is just another
        // compression)
        // http://stackoverflow.com/q/3932117
        hConn.setRequestProperty("Accept-Encoding", "gzip, deflate");
        hConn.setConnectTimeout(timeout);
        hConn.setReadTimeout(timeout);
        return hConn;
    }

    /**
     * @return the cached result for {@code url} with its url and original
     *         url overwritten by the given values, or null if there is no
     *         cache or no entry
     */
    private JResult getFromCache(String url, String originalUrl) throws Exception {
        if (cache != null) {
            JResult res = cache.get(url);
            if (res != null) {
                // e.g. the cache returned a shortened url as original url now
                // we want to store the
                // current original url! Also it can be that the cache response
                // to url but the JResult
                // does not contain it so overwrite it:
                res.setUrl(url);
                res.setOriginalUrl(originalUrl);
                cacheCounter.addAndGet(1);
                return res;
            }
        }
        return null;
    }
}

31
src/acr/browser/lightning/Reading/ImageResult.java

@ -0,0 +1,31 @@ @@ -0,0 +1,31 @@
package acr.browser.lightning.Reading;
import org.jsoup.nodes.Element;
/**
 * Plain data holder describing one image found under an element. All
 * members are public and mutable.
 *
 * @author Chris Alexander, chris@chris-alexander.co.uk
 */
public class ImageResult {
    /** Value of the image's source reference. */
    public String src;
    /** Ranking weight of the image. */
    public Integer weight;
    public String title;
    public String alt;
    public int width;
    public int height;
    public boolean noFollow;
    /** Not set by the constructor; stays null until assigned. */
    public Element element;

    /**
     * Stores the given values unchanged; {@code element} is left unset.
     */
    public ImageResult(String src, Integer weight, String title, int height, int width, String alt,
            boolean noFollow) {
        this.src = src;
        this.title = title;
        this.alt = alt;
        this.weight = weight;
        this.width = width;
        this.height = height;
        this.noFollow = noFollow;
    }
}

216
src/acr/browser/lightning/Reading/JResult.java

@ -0,0 +1,216 @@ @@ -0,0 +1,216 @@
/*
* Copyright 2011 Peter Karich
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package acr.browser.lightning.Reading;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
/**
 * Parsed result from web page containing important title, text and image.
 * <p>
 * Most String getters substitute an empty string for an unset value; the
 * exceptions ({@link #getOriginalUrl()}, {@link #getCanonicalUrl()},
 * {@link #getDate()} and {@link #getKeywords()}) hand back the raw field,
 * which may be null. Setters that return {@code JResult} return this
 * instance so calls can be chained.
 *
 * @author Peter Karich
 */
public class JResult implements Serializable {
    private String title;
    private String url;
    private String originalUrl;
    private String canonicalUrl;
    private String imageUrl;
    private String videoUrl;
    private String rssUrl;
    private String text;
    private String faviconUrl;
    private String description;
    private String dateString;
    private List<String> textList;
    private Collection<String> keywords;
    private List<ImageResult> images = null;

    public JResult() {
    }

    /** @return the url, never null */
    public String getUrl() {
        return url == null ? "" : url;
    }

    public JResult setUrl(String url) {
        this.url = url;
        return this;
    }

    public JResult setOriginalUrl(String originalUrl) {
        this.originalUrl = originalUrl;
        return this;
    }

    /** @return the original url exactly as stored, may be null */
    public String getOriginalUrl() {
        return originalUrl;
    }

    public JResult setCanonicalUrl(String canonicalUrl) {
        this.canonicalUrl = canonicalUrl;
        return this;
    }

    /** @return the canonical url exactly as stored, may be null */
    public String getCanonicalUrl() {
        return canonicalUrl;
    }

    /** @return the favicon url, never null */
    public String getFaviconUrl() {
        return faviconUrl == null ? "" : faviconUrl;
    }

    public JResult setFaviconUrl(String faviconUrl) {
        this.faviconUrl = faviconUrl;
        return this;
    }

    public JResult setRssUrl(String rssUrl) {
        this.rssUrl = rssUrl;
        return this;
    }

    /** @return the rss url, never null */
    public String getRssUrl() {
        return rssUrl == null ? "" : rssUrl;
    }

    /** @return the description, never null */
    public String getDescription() {
        return description == null ? "" : description;
    }

    public JResult setDescription(String description) {
        this.description = description;
        return this;
    }

    /** @return the image url, never null */
    public String getImageUrl() {
        return imageUrl == null ? "" : imageUrl;
    }

    public JResult setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
        return this;
    }

    /** @return the text, never null */
    public String getText() {
        return text == null ? "" : text;
    }

    public JResult setText(String text) {
        this.text = text;
        return this;
    }

    /**
     * @return the stored text list, or a fresh empty list (not retained by
     *         this object) if none was set
     */
    public List<String> getTextList() {
        return this.textList == null ? new ArrayList<String>() : this.textList;
    }

    public JResult setTextList(List<String> textList) {
        this.textList = textList;
        return this;
    }

    /** @return the title, never null */
    public String getTitle() {
        return title == null ? "" : title;
    }

    public JResult setTitle(String title) {
        this.title = title;
        return this;
    }

    /** @return the video url, never null */
    public String getVideoUrl() {
        return videoUrl == null ? "" : videoUrl;
    }

    public JResult setVideoUrl(String videoUrl) {
        this.videoUrl = videoUrl;
        return this;
    }

    public JResult setDate(String date) {
        this.dateString = date;
        return this;
    }

    /** @return the keywords exactly as stored, may be null */
    public Collection<String> getKeywords() {
        return keywords;
    }

    public void setKeywords(Collection<String> keywords) {
        this.keywords = keywords;
    }

    /**
     * @return get date from url or guessed from text
     */
    public String getDate() {
        return dateString;
    }

    /**
     * @return images list, an immutable empty list if none was set
     */
    public List<ImageResult> getImages() {
        return images == null ? Collections.<ImageResult> emptyList() : images;
    }

    /**
     * @return images count
     */
    public int getImagesCount() {
        return images == null ? 0 : images.size();
    }

    /**
     * set images list
     */
    public void setImages(List<ImageResult> images) {
        this.images = images;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("title:").append(getTitle());
        sb.append(" imageUrl:").append(getImageUrl());
        // the raw field is printed here, so an unset text shows up as "null"
        sb.append(" text:").append(text);
        return sb.toString();
    }
}

80
src/acr/browser/lightning/Reading/MapEntry.java

@ -0,0 +1,80 @@ @@ -0,0 +1,80 @@
/**
* Copyright (C) 2010 Peter Karich <>
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package acr.browser.lightning.Reading;
import java.io.Serializable;
import java.util.Map;
/**
 * Simple impl of Map.Entry. So that we can have ordered maps.
 * <p>
 * {@link #setValue(Object)}, {@link #equals(Object)} and {@link #hashCode()}
 * follow the contract documented on {@link java.util.Map.Entry}, so
 * instances compare equal to (and hash like) any other entry implementation
 * holding the same key and value.
 *
 * @author Peter Karich, peat_hal at users dot sourceforge dot
 *         net
 */
public class MapEntry<K, V> implements Map.Entry<K, V>, Serializable {
    private static final long serialVersionUID = 1L;
    private K key;
    private V value;

    /**
     * @param key the key of this entry, may be null
     * @param value the initial value of this entry, may be null
     */
    public MapEntry(K key, V value) {
        this.key = key;
        this.value = value;
    }

    @Override
    public K getKey() {
        return key;
    }

    @Override
    public V getValue() {
        return value;
    }

    /**
     * Replaces the value of this entry.
     *
     * @param value the new value
     * @return the value that was stored before the call
     */
    @Override
    public V setValue(V value) {
        V previous = this.value;
        this.value = value;
        return previous;
    }

    @Override
    public String toString() {
        return getKey() + ", " + getValue();
    }

    /**
     * @return true if {@code obj} is a {@link java.util.Map.Entry} with an
     *         equal key and an equal value
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (!(obj instanceof Map.Entry))
            return false;
        Map.Entry<?, ?> other = (Map.Entry<?, ?>) obj;
        Object otherKey = other.getKey();
        Object otherValue = other.getValue();
        boolean sameKey = key == null ? otherKey == null : key.equals(otherKey);
        boolean sameValue = value == null ? otherValue == null : value.equals(otherValue);
        return sameKey && sameValue;
    }

    /**
     * @return the xor of the key's and the value's hash codes (0 for null),
     *         as specified by {@link java.util.Map.Entry#hashCode()}
     */
    @Override
    public int hashCode() {
        int keyHash = key == null ? 0 : key.hashCode();
        int valueHash = value == null ? 0 : value.hashCode();
        return keyHash ^ valueHash;
    }
}

174
src/acr/browser/lightning/Reading/OutputFormatter.java

@ -0,0 +1,174 @@ @@ -0,0 +1,174 @@
package acr.browser.lightning.Reading;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
 * @author goose | jim
 * @author karussell
 *
 *         this class will be responsible for taking our top node and stripping
 *         out junk we don't want and getting it ready for how we want it
 *         presented to the user
 */
public class OutputFormatter {
    public static final int MIN_PARAGRAPH_TEXT = 50;
    private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
    // NOTE(review): matches "display:none" but not "display: none" (with a
    // space) - confirm whether whitespace should be tolerated here
    private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
    /** Texts shorter than this are not considered to be a paragraph. */
    protected final int minParagraphText;
    protected final List<String> nodesToReplace;
    /** CSS selector of the elements whose text ends up in the output. */
    protected String nodesToKeepCssSelector = "p";

    public OutputFormatter() {
        this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
    }

    public OutputFormatter(int minParagraphText) {
        this(minParagraphText, NODES_TO_REPLACE);
    }

    public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
        this.minParagraphText = minParagraphText;
        this.nodesToReplace = nodesToReplace;
    }

    /**
     * set elements to keep in output text
     */
    public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) {
        this.nodesToKeepCssSelector = nodesToKeepCssSelector;
    }

    /**
     * takes an element and turns the P tags into \n\n
     * <p>
     * Note that low scored or short scored nodes are removed from
     * {@code topNode} as a side effect.
     *
     * @return the formatted text; if it is not longer than 100 characters a
     *         fallback based on the complete text of the node may be used
     */
    public String getFormattedText(Element topNode) {
        removeNodesWithNegativeScores(topNode);
        StringBuilder sb = new StringBuilder();
        append(topNode, sb, nodesToKeepCssSelector);
        String str = SHelper.innerTrim(sb.toString());
        if (str.length() > 100)
            return str;
        // no subelements
        if (str.isEmpty() || !topNode.text().isEmpty()
                && str.length() <= topNode.ownText().length())
            str = topNode.text();
        // if jsoup failed to parse the whole html now parse this smaller
        // snippet again to avoid html tags disturbing our text:
        return Jsoup.parse(str).text();
    }

    /**
     * Takes an element and returns a list of texts extracted from the P tags
     */
    public List<String> getTextList(Element topNode) {
        List<String> texts = new ArrayList<String>();
        for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
            if (element.hasText()) {
                texts.add(element.text());
            }
        }
        return texts;
    }

    /**
     * If there are elements inside our top node that have a negative gravity
     * score remove them. Scored elements whose text is shorter than
     * {@code minParagraphText} are removed as well.
     */
    protected void removeNodesWithNegativeScores(Element topNode) {
        Elements gravityItems = topNode.select("*[gravityScore]");
        for (Element item : gravityItems) {
            int score;
            try {
                score = Integer.parseInt(item.attr("gravityScore"));
            } catch (NumberFormatException ex) {
                // the attribute can also originate from the page itself, so a
                // malformed value must not abort the extraction; such a node
                // is judged by its text length only
                score = 0;
            }
            if (score < 0 || item.text().length() < minParagraphText)
                item.remove();
        }
    }

    /**
     * Appends the text of every element selected by {@code tagName} to
     * {@code sb}, each followed by a blank line. Elements are skipped if they
     * (or one of their ancestors below {@code node}) are unlikely to be
     * content, if the text is shorter than {@code minParagraphText} or if
     * less than half of the text consists of letters.
     *
     * @param tagName a CSS selector, evaluated relative to {@code node}
     */
    protected void append(Element node, StringBuilder sb, String tagName) {
        // is select more costly then getElementsByTag?
        MAIN: for (Element e : node.select(tagName)) {
            Element tmpEl = e;
            // check all elements until 'node'
            while (tmpEl != null && !tmpEl.equals(node)) {
                if (unlikely(tmpEl))
                    continue MAIN;
                tmpEl = tmpEl.parent();
            }
            String text = node2Text(e);
            if (text.isEmpty() || text.length() < minParagraphText
                    || text.length() > SHelper.countLetters(text) * 2)
                continue;
            sb.append(text);
            sb.append("\n\n");
        }
    }

    /**
     * @return true if the node looks like a caption or is hidden according
     *         to the unlikely pattern (checked against style and class)
     */
    boolean unlikely(Node e) {
        String clazz = e.attr("class");
        String style = e.attr("style");
        // Locale.ROOT: the comparison must not depend on the device locale
        if (clazz.toLowerCase(Locale.ROOT).contains("caption"))
            return true;
        return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
    }

    /**
     * Recursively appends the text of all children that are not
     * {@link #unlikely(Node)}; a space is inserted in front of block elements
     * and for line breaks.
     */
    void appendTextSkipHidden(Element e, StringBuilder accum) {
        for (Node child : e.childNodes()) {
            if (unlikely(child))
                continue;
            if (child instanceof TextNode) {
                TextNode textNode = (TextNode) child;
                String txt = textNode.text();
                accum.append(txt);
            } else if (child instanceof Element) {
                Element element = (Element) child;
                if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
                    accum.append(" ");
                else if (element.tagName().equals("br"))
                    accum.append(" ");
                appendTextSkipHidden(element, accum);
            }
        }
    }

    boolean lastCharIsWhitespace(StringBuilder accum) {
        if (accum.length() == 0)
            return false;
        return Character.isWhitespace(accum.charAt(accum.length() - 1));
    }

    protected String node2TextOld(Element el) {
        return el.text();
    }

    /**
     * @return the text of the element without the text of unlikely children
     */
    protected String node2Text(Element el) {
        StringBuilder sb = new StringBuilder(200);
        appendTextSkipHidden(el, sb);
        return sb.toString();
    }

    /**
     * Replaces the regular expression that marks nodes as hidden.
     *
     * @return this instance, for chaining
     */
    public OutputFormatter setUnlikelyPattern(String unlikelyPattern) {
        this.unlikelyPattern = Pattern.compile(unlikelyPattern);
        return this;
    }

    /**
     * Adds {@code str} as a further alternative to the current pattern.
     *
     * @return this instance, for chaining
     */
    public OutputFormatter appendUnlikelyPattern(String str) {
        return setUnlikelyPattern(unlikelyPattern.toString() + "|" + str);
    }
}

29
src/acr/browser/lightning/Reading/SCache.java

@ -0,0 +1,29 @@ @@ -0,0 +1,29 @@
/*
* Copyright 2011 Peter Karich
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package acr.browser.lightning.Reading;
/**
 * Minimal cache abstraction for {@link JResult} objects, keyed by url.
 *
 * @author Peter Karich
 */
public interface SCache {
/**
 * @param url the key to look up
 * @return the cached result; callers in this package treat null as
 *         "no entry"
 */
JResult get(String url);
/**
 * Stores {@code res} under {@code url}.
 */
void put(String url, JResult res);
/**
 * @return presumably the number of cached entries - TODO confirm against
 *         the implementations
 */
int getSize();
}

480
src/acr/browser/lightning/Reading/SHelper.java

@ -0,0 +1,480 @@ @@ -0,0 +1,480 @@
/*
* Copyright 2011 Peter Karich
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package acr.browser.lightning.Reading;
import java.io.UnsupportedEncodingException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.jsoup.nodes.Element;
/**
*
* @author Peter Karich
*/
public class SHelper {
public static final String UTF8 = "UTF-8";
private static final Pattern SPACE = Pattern.compile(" ");
/**
 * Trims the url and percent-encodes every remaining space as {@code %20}.
 * An empty url is returned untouched.
 */
public static String replaceSpaces(String url) {
    if (url.isEmpty())
        return url;
    String trimmed = url.trim();
    if (!trimmed.contains(" "))
        return trimmed;
    return SPACE.matcher(trimmed).replaceAll("%20");
}
/**
 * Counts the non-overlapping occurrences of {@code substring} in
 * {@code str}, scanning from left to right.
 *
 * @param str the text to search in
 * @param substring the text to search for
 * @return the number of occurrences; 0 if {@code substring} is empty (an
 *         empty needle would otherwise match endlessly)
 */
public static int count(String str, String substring) {
    if (substring.isEmpty())
        return 0;
    int c = 0;
    int from = str.indexOf(substring);
    while (from >= 0) {
        c++;
        // continue behind the current match so that matches never overlap
        from = str.indexOf(substring, from + substring.length());
    }
    return c;
}
/**
 * Collapses every run of spaces, tabs and newlines into a single space and
 * strips leading and trailing whitespace.
 */
public static String innerTrim(String str) {
    if (str.isEmpty())
        return "";
    StringBuilder out = new StringBuilder();
    boolean gapPending = false;
    for (char c : str.toCharArray()) {
        boolean blank = c == ' ' || c == '\t' || c == '\n';
        if (blank) {
            // remember the gap, it is emitted in front of the next character
            gapPending = true;
        } else {
            if (gapPending)
                out.append(' ');
            gapPending = false;
            out.append(c);
        }
    }
    return out.toString().trim();
}
/**
* Starts reading the encoding from the first valid character until an
* invalid encoding character occurs.
*/
public static String encodingCleanup(String str) {
StringBuilder sb = new StringBuilder();
boolean startedWithCorrectString = false;
for (int i = 0; i < str.length(); i++) {
char c = str.charAt(i);
if (Character.isDigit(c) || Character.isLetter(c) || c == '-' || c == '_') {
startedWithCorrectString = true;
sb.append(c);
continue;
}
if (startedWithCorrectString)
break;
}
return sb.toString().trim();
}
/**
* @return the longest substring as str1.substring(result[0], result[1]);
*/
public static String getLongestSubstring(String str1, String str2) {
int res[] = longestSubstring(str1, str2);
if (res == null || res[0] >= res[1])
return "";
return str1.substring(res[0], res[1]);
}
public static int[] longestSubstring(String str1, String str2) {
if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty())
return null;
// dynamic programming => save already identical length into array
// to understand this algo simply print identical length in every entry
// of the array
// i+1, j+1 then reuses information from i,j
// java initializes them already with 0
int[][] num = new int[str1.length()][str2.length()];
int maxlen = 0;
int lastSubstrBegin = 0;
int endIndex = 0;
for (int i = 0; i < str1.length(); i++) {
for (int j = 0; j < str2.length(); j++) {
if (str1.charAt(i) == str2.charAt(j)) {
if ((i == 0) || (j == 0))
num[i][j] = 1;
else
num[i][j] = 1 + num[i - 1][j - 1];
if (num[i][j] > maxlen) {
maxlen = num[i][j];
// generate substring from str1 => i
lastSubstrBegin = i - num[i][j] + 1;
endIndex = i + 1;
}
}
}
}
return new int[] { lastSubstrBegin, endIndex };
}
public static String getDefaultFavicon(String url) {
return useDomainOfFirstArg4Second(url, "/favicon.ico");
}
/**
* @param urlForDomain
* extract the domain from this url
* @param path
* this url does not have a domain
* @return
*/
public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
if (path.startsWith("http"))
return path;
if ("favicon.ico".equals(path))
path = "/favicon.ico";
if (path.startsWith("//")) {
// wikipedia special case, see tests
if (urlForDomain.startsWith("https:"))
return "https:" + path;
return "http:" + path;
} else if (path.startsWith("/"))
return "http://" + extractHost(urlForDomain) + path;
else if (path.startsWith("../")) {
int slashIndex = urlForDomain.lastIndexOf("/");
if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
urlForDomain = urlForDomain.substring(0, slashIndex + 1);
return urlForDomain + path;
}
return path;
}
public static String extractHost(String url) {
return extractDomain(url, false);
}
public static String extractDomain(String url, boolean aggressive) {
if (url.startsWith("http://"))
url = url.substring("http://".length());
else if (url.startsWith("https://"))
url = url.substring("https://".length());
if (aggressive) {
if (url.startsWith("www."))
url = url.substring("www.".length());
// strip mobile from start
if (url.startsWith("m."))
url = url.substring("m.".length());
}
int slashIndex = url.indexOf("/");
if (slashIndex > 0)
url = url.substring(0, slashIndex);
return url;
}
public static boolean isVideoLink(String url) {
url = extractDomain(url, true);
return url.startsWith("youtube.com") || url.startsWith("video.yahoo.com")
|| url.startsWith("vimeo.com") || url.startsWith("blip.tv");
}
public static boolean isVideo(String url) {
return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi")
|| url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4")
|| url.endsWith(".flv") || url.endsWith(".wmv");
}
public static boolean isAudio(String url) {
return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u")
|| url.endsWith(".wav");
}
public static boolean isDoc(String url) {
return url.endsWith(".pdf") || url.endsWith(".ppt") || url.endsWith(".doc")
|| url.endsWith(".swf") || url.endsWith(".rtf") || url.endsWith(".xls");
}
public static boolean isPackage(String url) {
return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip")
|| url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm")
|| url.endsWith(".7z");
}
public static boolean isApp(String url) {
return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat")
|| url.endsWith(".dmg");
}
public static boolean isImage(String url) {
return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif")
|| url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico")
|| url.endsWith(".eps");
}
/**
* @see http
* ://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se
*/
public static void enableCookieMgmt() {
CookieManager manager = new CookieManager();
manager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
CookieHandler.setDefault(manager);
}
/**
* @see http
* ://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java
* -urlconnection
*/
public static void enableUserAgentOverwrite() {
System.setProperty("http.agent", "");
}
public static String getUrlFromUglyGoogleRedirect(String url) {
if (url.startsWith("http://www.google.com/url?")) {
url = url.substring("http://www.google.com/url?".length());
String arr[] = urlDecode(url).split("\\&");
if (arr != null)
for (String str : arr) {
if (str.startsWith("q="))
return str.substring("q=".length());
}
}
return null;
}
public static String getUrlFromUglyFacebookRedirect(String url) {
if (url.startsWith("http://www.facebook.com/l.php?u=")) {
url = url.substring("http://www.facebook.com/l.php?u=".length());
return urlDecode(url);
}
return null;
}
public static String urlEncode(String str) {
try {
return URLEncoder.encode(str, UTF8);
} catch (UnsupportedEncodingException ex) {
return str;
}
}
public static String urlDecode(String str) {
try {
return URLDecoder.decode(str, UTF8);
} catch (UnsupportedEncodingException ex) {
return str;
}
}
/**
* Popular sites uses the #! to indicate the importance of the following
* chars. Ugly but true. Such as: facebook, twitter, gizmodo, ...
*/
public static String removeHashbang(String url) {
return url.replaceFirst("#!", "");
}
public static String printNode(Element root) {
return printNode(root, 0);
}
public static String printNode(Element root, int indentation) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < indentation; i++) {
sb.append(' ');
}
sb.append(root.tagName());
sb.append(":");
sb.append(root.ownText());
sb.append("\n");
for (Element el : root.children()) {
sb.append(printNode(el, indentation + 1));
sb.append("\n");
}
return sb.toString();
}
public static String estimateDate(String url) {
int index = url.indexOf("://");
if (index > 0)
url = url.substring(index + 3);
int year = -1;
int yearCounter = -1;
int month = -1;
int monthCounter = -1;
int day = -1;
String strs[] = url.split("/");
for (int counter = 0; counter < strs.length; counter++) {
String str = strs[counter];
if (str.length() == 4) {
try {
year = Integer.parseInt(str);
} catch (Exception ex) {
continue;
}
if (year < 1970 || year > 3000) {
year = -1;
continue;
}
yearCounter = counter;
} else if (str.length() == 2) {
if (monthCounter < 0 && counter == yearCounter + 1) {
try {
month = Integer.parseInt(str);
} catch (Exception ex) {
continue;
}
if (month < 1 || month > 12) {
month = -1;
continue;
}
monthCounter = counter;
} else if (counter == monthCounter + 1) {
try {
day = Integer.parseInt(str);
} catch (Exception ex) {
}
if (day < 1 || day > 31) {
day = -1;
continue;
}
break;
}
}
}
if (year < 0)
return null;
StringBuilder str = new StringBuilder();
str.append(year);
if (month < 1)
return str.toString();
str.append('/');
if (month < 10)
str.append('0');
str.append(month);
if (day < 1)
return str.toString();
str.append('/');
if (day < 10)
str.append('0');
str.append(day);
return str.toString();
}
public static String completeDate(String dateStr) {
if (dateStr == null)
return null;
int index = dateStr.indexOf('/');
if (index > 0) {
index = dateStr.indexOf('/', index + 1);
if (index > 0)
return dateStr;
else
return dateStr + "/01";
}
return dateStr + "/01/01";
}
/**
* keep in mind: simpleDateFormatter is not thread safe! call completeDate
* before applying this formatter.
*/
public static SimpleDateFormat createDateFormatter() {
return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault());
}
// with the help of
// http://stackoverflow.com/questions/1828775/httpclient-and-ssl
public static void enableAnySSL() {
try {
SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() },
new SecureRandom());
SSLContext.setDefault(ctx);
} catch (Exception ex) {
ex.printStackTrace();
}
}
private static class DefaultTrustManager implements X509TrustManager {
@Override
public void checkClientTrusted(X509Certificate[] arg0, String arg1)
throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] arg0, String arg1)
throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
}
public static int countLetters(String str) {
int len = str.length();
int chars = 0;
for (int i = 0; i < len; i++) {
if (Character.isLetter(str.charAt(i)))
chars++;
}
return chars;
}
}

153
src/acr/browser/lightning/ReadingActivity.java

@ -0,0 +1,153 @@ @@ -0,0 +1,153 @@
package acr.browser.lightning;
import java.util.ArrayList;
import java.util.List;
import acr.browser.lightning.Reading.HtmlFetcher;
import acr.browser.lightning.Reading.JResult;
import android.animation.ObjectAnimator;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.support.v7.app.ActionBarActivity;
import android.support.v7.widget.Toolbar;
import android.view.MenuItem;
import android.view.View;
import android.widget.TextView;
/**
 * Shows a text-only "reading mode" version of a web page.
 *
 * The page url is read from the {@link Constants#LOAD_READING_URL} string
 * extra of the launching intent. The page is fetched and reduced to a title
 * and a list of text blocks on a background task while a progress dialog is
 * shown; if no url was supplied or nothing could be extracted, a "loading
 * failed" text is shown instead.
 */
public class ReadingActivity extends ActionBarActivity {

    // Second argument handed to HtmlFetcher.fetchAndExtract.
    // NOTE(review): presumably a timeout in milliseconds - confirm against
    // HtmlFetcher.
    private static final int FETCH_TIMEOUT = 5000;

    private TextView mTitle;
    private TextView mBody;
    // The running (or last run) loader, kept so that onDestroy can stop it
    // and take down its dialog.
    private PageLoader mLoader;

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.reading_view);
        Toolbar toolbar = (Toolbar) findViewById(R.id.toolbar);
        setSupportActionBar(toolbar);
        getSupportActionBar().setDisplayHomeAsUpEnabled(true);
        mTitle = (TextView) findViewById(R.id.textViewTitle);
        mBody = (TextView) findViewById(R.id.textViewBody);
        mTitle.setText(getString(R.string.untitled));
        mBody.setText(getString(R.string.loading));
        // Both views stay hidden until setText fades them in.
        mTitle.setVisibility(View.INVISIBLE);
        mBody.setVisibility(View.INVISIBLE);
        Intent intent = getIntent();
        if (!loadPage(intent)) {
            setText(getString(R.string.untitled), getString(R.string.loading_failed));
        }
    }

    @Override
    protected void onDestroy() {
        if (mLoader != null) {
            // A cancelled task skips onPostExecute, so it can no longer touch
            // the views or the dialog of this destroyed activity.
            mLoader.cancel(false);
            // Dismiss while the window is still attached; otherwise the
            // dialog window is leaked and a later dismiss() would throw.
            mLoader.dismissProgress();
            mLoader = null;
        }
        super.onDestroy();
    }

    /**
     * Starts loading the page named by the intent's
     * {@link Constants#LOAD_READING_URL} extra.
     *
     * @param intent
     *            the intent this activity was started with, may be null
     * @return true if loading was started, false if the intent is null or
     *         carries no url
     */
    protected boolean loadPage(Intent intent) {
        if (intent == null) {
            return false;
        }
        String url = intent.getStringExtra(Constants.LOAD_READING_URL);
        if (url == null) {
            return false;
        }
        getSupportActionBar().setTitle(Utils.getDomainName(url));
        mLoader = new PageLoader(this);
        mLoader.execute(url);
        return true;
    }

    /**
     * Fetches the url passed to execute() and extracts its title and text in
     * the background, then hands the result to
     * {@link ReadingActivity#setText(String, String)} on the UI thread.
     */
    private class PageLoader extends AsyncTask<String, Void, Void> {

        private Context mContext;
        private ProgressDialog mProgressDialog;
        private String mTitleText;
        private List<String> mBodyText;

        public PageLoader(Context context) {
            mContext = context;
        }

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            mProgressDialog = new ProgressDialog(mContext);
            mProgressDialog.setProgressStyle(ProgressDialog.STYLE_SPINNER);
            mProgressDialog.setCancelable(false);
            mProgressDialog.setIndeterminate(true);
            mProgressDialog.setMessage(mContext.getString(R.string.loading));
            mProgressDialog.show();
        }

        @Override
        protected Void doInBackground(String... params) {
            HtmlFetcher fetcher = new HtmlFetcher();
            try {
                JResult result = fetcher.fetchAndExtract(params[0], FETCH_TIMEOUT, true);
                mTitleText = result.getTitle();
                mBodyText = result.getTextList();
            } catch (Exception e) {
                clearResult();
                e.printStackTrace();
            } catch (OutOfMemoryError e) {
                System.gc();
                clearResult();
                e.printStackTrace();
            }
            return null;
        }

        @Override
        protected void onPostExecute(Void result) {
            dismissProgress();
            // Treat a missing title or body like an empty one instead of
            // crashing on it.
            boolean hasTitle = mTitleText != null && !mTitleText.isEmpty();
            boolean hasBody = mBodyText != null && !mBodyText.isEmpty();
            if (hasTitle && hasBody) {
                StringBuilder builder = new StringBuilder();
                for (String text : mBodyText) {
                    builder.append(text).append("\n\n");
                }
                setText(mTitleText, builder.toString());
            } else {
                setText(getString(R.string.untitled), getString(R.string.loading_failed));
            }
            super.onPostExecute(result);
        }

        /** Marks the extraction as failed: empty title and no text blocks. */
        private void clearResult() {
            mTitleText = "";
            mBodyText = new ArrayList<>();
        }

        /** Dismisses the progress dialog if it is still showing. */
        private void dismissProgress() {
            if (mProgressDialog != null && mProgressDialog.isShowing()) {
                mProgressDialog.dismiss();
            }
            mProgressDialog = null;
        }
    }

    /**
     * Shows the title and body text. A view that is still invisible is made
     * visible and faded in over 300 ms; a visible view only gets its text
     * replaced.
     */
    private void setText(String title, String body) {
        if (mTitle.getVisibility() == View.INVISIBLE) {
            mTitle.setAlpha(0.0f);
            mTitle.setVisibility(View.VISIBLE);
            mTitle.setText(title);
            ObjectAnimator animator = ObjectAnimator.ofFloat(mTitle, "alpha", 1.0f);
            animator.setDuration(300);
            animator.start();
        } else {
            mTitle.setText(title);
        }
        if (mBody.getVisibility() == View.INVISIBLE) {
            mBody.setAlpha(0.0f);
            mBody.setVisibility(View.VISIBLE);
            mBody.setText(body);
            ObjectAnimator animator = ObjectAnimator.ofFloat(mBody, "alpha", 1.0f);
            animator.setDuration(300);
            animator.start();
        } else {
            mBody.setText(body);
        }
    }

    /**
     * Closes the activity when a toolbar item is selected.
     *
     * NOTE(review): this finishes for every menu item, not just the up
     * button. This class inflates no menu of its own; revisit if one is
     * added.
     */
    @Override
    public boolean onOptionsItemSelected(MenuItem item) {
        finish();
        return super.onOptionsItemSelected(item);
    }
}
Loading…
Cancel
Save