Added a Reading Mode that can be accessed from the menu
Reading Mode utilizes the Snacktory library created by karussell, which is licensed under the Apache 2.0 license: https://github.com/karussell/snacktory
This commit is contained in:
parent
313f9fb105
commit
10668a019b
@ -200,6 +200,17 @@
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.BOOKMARK" />
|
||||
|
||||
<category android:name="android.intent.category.DEFAULT" />
|
||||
</intent-filter>
|
||||
</activity>
|
||||
<activity
|
||||
android:name="acr.browser.lightning.ReadingActivity"
|
||||
android:configChanges="orientation|screenSize|keyboardHidden|keyboard"
|
||||
android:label="@string/reading_mode"
|
||||
android:theme="@style/Theme.SettingsTheme" >
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.READING" />
|
||||
|
||||
<category android:name="android.intent.category.DEFAULT" />
|
||||
</intent-filter>
|
||||
</activity>
|
||||
|
BIN
libs/jsoup-1.8.1.jar
Normal file
BIN
libs/jsoup-1.8.1.jar
Normal file
Binary file not shown.
@ -140,5 +140,39 @@
|
||||
android:layout_marginLeft="10dp"
|
||||
android:layout_marginRight="10dp"
|
||||
android:background="#cdcdcd" />
|
||||
|
||||
<LinearLayout
|
||||
android:id="@+id/licenseSnactory"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="wrap_content"
|
||||
android:background="?attr/listChoiceBackgroundIndicator"
|
||||
android:orientation="vertical"
|
||||
android:paddingBottom="10dp"
|
||||
android:paddingTop="10dp" >
|
||||
|
||||
<TextView
|
||||
android:id="@+id/textView5"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:paddingLeft="16dp"
|
||||
android:text="@string/snacktory"
|
||||
android:textAppearance="?android:attr/textAppearanceMedium" />
|
||||
|
||||
<TextView
|
||||
android:id="@+id/textView6"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:paddingLeft="16dp"
|
||||
android:text="@string/apache"
|
||||
android:textAppearance="?android:attr/textAppearanceSmall"
|
||||
android:textColor="@color/light" />
|
||||
</LinearLayout>
|
||||
|
||||
<LinearLayout
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="1dp"
|
||||
android:layout_marginLeft="10dp"
|
||||
android:layout_marginRight="10dp"
|
||||
android:background="#cdcdcd" />
|
||||
|
||||
</LinearLayout>
|
39
res/layout/reading_view.xml
Normal file
39
res/layout/reading_view.xml
Normal file
@ -0,0 +1,39 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
android:orientation="vertical" >
|
||||
|
||||
<include layout="@layout/toolbar_settings" />
|
||||
|
||||
<ScrollView
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent" >
|
||||
|
||||
<LinearLayout
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="wrap_content"
|
||||
android:padding="20dp"
|
||||
android:orientation="vertical">
|
||||
|
||||
<TextView
|
||||
android:id="@+id/textViewTitle"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="wrap_content"
|
||||
android:layout_marginBottom="20dp"
|
||||
android:gravity="center_horizontal|center_vertical"
|
||||
android:text="Large Text"
|
||||
android:textAppearance="?android:attr/textAppearanceLarge" />
|
||||
|
||||
<TextView
|
||||
android:id="@+id/textViewBody"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="wrap_content"
|
||||
android:gravity="start"
|
||||
android:text="Medium Text"
|
||||
android:textAppearance="?android:attr/textAppearanceMedium" />
|
||||
|
||||
</LinearLayout>
|
||||
</ScrollView>
|
||||
|
||||
</LinearLayout>
|
@ -1,73 +1,77 @@
|
||||
<!--
|
||||
Copyright 2014 A.C.R. Development
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<menu xmlns:android="http://schemas.android.com/apk/res/android" >
|
||||
|
||||
<item
|
||||
android:id="@+id/action_back"
|
||||
android:checkable="false"
|
||||
android:enabled="true"
|
||||
android:icon="?arrowBackDrawable"
|
||||
android:showAsAction="always"
|
||||
android:title="@string/action_back"
|
||||
android:visible="true">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_forward"
|
||||
android:checkable="false"
|
||||
android:enabled="true"
|
||||
android:icon="?arrowForwardDrawable"
|
||||
android:showAsAction="always"
|
||||
android:title="@string/action_forward"
|
||||
android:visible="true">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_new_tab"
|
||||
android:title="@string/action_new_tab">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_incognito"
|
||||
android:title="@string/action_incognito">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_share"
|
||||
android:title="@string/action_share"/>
|
||||
<item
|
||||
android:id="@+id/action_history"
|
||||
android:title="@string/action_history">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_find"
|
||||
android:title="@string/action_find">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_copy"
|
||||
android:title="@string/action_copy">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_bookmarks"
|
||||
android:title="@string/action_bookmarks">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_add_bookmark"
|
||||
android:title="@string/action_add_bookmark">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_settings"
|
||||
android:title="@string/settings">
|
||||
</item>
|
||||
|
||||
<!--
|
||||
Copyright 2014 A.C.R. Development
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<menu xmlns:android="http://schemas.android.com/apk/res/android" >
|
||||
|
||||
<item
|
||||
android:id="@+id/action_back"
|
||||
android:checkable="false"
|
||||
android:enabled="true"
|
||||
android:icon="?arrowBackDrawable"
|
||||
android:showAsAction="always"
|
||||
android:title="@string/action_back"
|
||||
android:visible="true">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_forward"
|
||||
android:checkable="false"
|
||||
android:enabled="true"
|
||||
android:icon="?arrowForwardDrawable"
|
||||
android:showAsAction="always"
|
||||
android:title="@string/action_forward"
|
||||
android:visible="true">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_new_tab"
|
||||
android:title="@string/action_new_tab">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_incognito"
|
||||
android:title="@string/action_incognito">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_share"
|
||||
android:title="@string/action_share"/>
|
||||
<item
|
||||
android:id="@+id/action_history"
|
||||
android:title="@string/action_history">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_find"
|
||||
android:title="@string/action_find">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_copy"
|
||||
android:title="@string/action_copy">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_bookmarks"
|
||||
android:title="@string/action_bookmarks">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_add_bookmark"
|
||||
android:title="@string/action_add_bookmark">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_reading_mode"
|
||||
android:title="@string/reading_mode">
|
||||
</item>
|
||||
<item
|
||||
android:id="@+id/action_settings"
|
||||
android:title="@string/settings">
|
||||
</item>
|
||||
|
||||
</menu>
|
@ -23,6 +23,7 @@
|
||||
<item android:id="@+id/action_copy" android:title="@string/action_copy" ></item>
|
||||
<item android:id="@+id/action_bookmarks" android:title="@string/action_bookmarks" ></item>
|
||||
<item android:id="@+id/action_add_bookmark" android:title="@string/action_add_bookmark" ></item>
|
||||
<item android:id="@+id/action_reading_mode" android:title="@string/reading_mode" ></item>
|
||||
<item android:id="@+id/action_settings" android:title="@string/settings" ></item>
|
||||
|
||||
</menu>
|
@ -198,4 +198,8 @@
|
||||
<string name="third_party">Block 3rd Party Cookies</string>
|
||||
<string name="available_lollipop">This feature is only available on Android 5.0+</string>
|
||||
<string name="color_mode">Enable Color Mode</string>
|
||||
<string name="reading_mode">Reader Mode</string>
|
||||
<string name="loading">Loading…</string>
|
||||
<string name="loading_failed">Couldn\'t load anything from the page.</string>
|
||||
<string name="snacktory">Snacktory</string>
|
||||
</resources>
|
||||
|
@ -179,7 +179,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
mDrawerListRight.setDividerHeight(0);
|
||||
setNavigationDrawerWidth();
|
||||
mDrawerLayout.setDrawerListener(new DrawerLocker());
|
||||
|
||||
|
||||
mWebpageBitmap = BitmapFactory.decodeResource(getResources(), R.drawable.ic_webpage);
|
||||
mActionBar = getSupportActionBar();
|
||||
final TypedArray styledAttributes = mContext.getTheme().obtainStyledAttributes(
|
||||
@ -350,7 +350,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
};
|
||||
anim.setDuration(300);
|
||||
anim.setInterpolator(new DecelerateInterpolator());
|
||||
anim.setAnimationListener(new AnimationListener(){
|
||||
anim.setAnimationListener(new AnimationListener() {
|
||||
|
||||
@Override
|
||||
public void onAnimationStart(Animation animation) {
|
||||
@ -368,7 +368,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
@Override
|
||||
public void onAnimationRepeat(Animation animation) {
|
||||
}
|
||||
|
||||
|
||||
});
|
||||
new Handler().postDelayed(new Runnable() {
|
||||
|
||||
@ -488,12 +488,12 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
checkForTor();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private class DrawerLocker implements DrawerListener {
|
||||
|
||||
@Override
|
||||
public void onDrawerClosed(View v) {
|
||||
if(v == mDrawerRight){
|
||||
if (v == mDrawerRight) {
|
||||
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerLeft);
|
||||
} else {
|
||||
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_UNLOCKED, mDrawerRight);
|
||||
@ -502,7 +502,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
|
||||
@Override
|
||||
public void onDrawerOpened(View v) {
|
||||
if(v == mDrawerRight){
|
||||
if (v == mDrawerRight) {
|
||||
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerLeft);
|
||||
} else {
|
||||
mDrawerLayout.setDrawerLockMode(DrawerLayout.LOCK_MODE_LOCKED_CLOSED, mDrawerRight);
|
||||
@ -516,7 +516,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
@Override
|
||||
public void onDrawerStateChanged(int arg) {
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
public boolean handleMenuItemClick(MenuItem item) {
|
||||
@ -596,6 +596,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
case R.id.action_find:
|
||||
findInPage();
|
||||
return true;
|
||||
case R.id.action_reading_mode:
|
||||
Intent read = new Intent(this, ReadingActivity.class);
|
||||
read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
|
||||
startActivity(read);
|
||||
return true;
|
||||
default:
|
||||
return super.onOptionsItemSelected(item);
|
||||
}
|
||||
@ -912,6 +917,11 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
case R.id.action_find:
|
||||
findInPage();
|
||||
return true;
|
||||
case R.id.action_reading_mode:
|
||||
Intent read = new Intent(this, ReadingActivity.class);
|
||||
read.putExtra(Constants.LOAD_READING_URL, mCurrentView.getUrl());
|
||||
startActivity(read);
|
||||
return true;
|
||||
default:
|
||||
return super.onOptionsItemSelected(item);
|
||||
}
|
||||
@ -1622,7 +1632,7 @@ public class BrowserActivity extends ActionBarActivity implements BrowserControl
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
|
||||
ViewCompat.jumpDrawablesToCurrentState(holder.exit);
|
||||
|
||||
LightningView web = data.get(position);
|
||||
|
@ -29,6 +29,8 @@ public final class Constants {
|
||||
public static final String JAVASCRIPT_INVERT_PAGE = "javascript:(function(){var e='img {-webkit-filter: invert(100%);'+'-moz-filter: invert(100%);'+'-o-filter: invert(100%);'+'-ms-filter: invert(100%); }',t=document.getElementsByTagName('head')[0],n=document.createElement('style');if(!window.counter){window.counter=1}else{window.counter++;if(window.counter%2==0){var e='html {-webkit-filter: invert(0%); -moz-filter: invert(0%); -o-filter: invert(0%); -ms-filter: invert(0%); }'}}n.type='text/css';if(n.styleSheet){n.styleSheet.cssText=e}else{n.appendChild(document.createTextNode(e))}t.appendChild(n)})();";
|
||||
public static final String JAVASCRIPT_TEXT_REFLOW = "javascript:document.getElementsByTagName('body')[0].style.width=window.innerWidth+'px';";
|
||||
|
||||
public static final String LOAD_READING_URL = "ReadingUrl";
|
||||
|
||||
public static final String SEPARATOR = "\\|\\$\\|SEPARATOR\\|\\$\\|";
|
||||
public static final String HTTP = "http://";
|
||||
public static final String HTTPS = "https://";
|
||||
|
@ -30,6 +30,7 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
|
||||
findViewById(R.id.licenseAOSP).setOnClickListener(this);
|
||||
findViewById(R.id.licenseHosts).setOnClickListener(this);
|
||||
findViewById(R.id.licenseOrbot).setOnClickListener(this);
|
||||
findViewById(R.id.licenseSnactory).setOnClickListener(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -47,6 +48,9 @@ public class LicenseActivity extends ActionBarActivity implements View.OnClickLi
|
||||
case R.id.licenseOrbot:
|
||||
actionView("http://www.gnu.org/licenses/lgpl.html");
|
||||
break;
|
||||
case R.id.licenseSnactory:
|
||||
actionView("http://www.apache.org/licenses/LICENSE-2.0");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
619
src/acr/browser/lightning/Reading/ArticleTextExtractor.java
Normal file
619
src/acr/browser/lightning/Reading/ArticleTextExtractor.java
Normal file
@ -0,0 +1,619 @@
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import android.util.Log;
|
||||
|
||||
/**
|
||||
* This class is thread safe.
|
||||
*
|
||||
* @author Alex P (ifesdjeen from jreadability)
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public class ArticleTextExtractor {
|
||||
|
||||
// Interesting nodes
|
||||
private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
|
||||
// Unlikely candidates
|
||||
private String unlikelyStr;
|
||||
private Pattern UNLIKELY;
|
||||
// Most likely positive candidates
|
||||
private String positiveStr;
|
||||
private Pattern POSITIVE;
|
||||
// Most likely negative candidates
|
||||
private String negativeStr;
|
||||
private Pattern NEGATIVE;
|
||||
private static final Pattern NEGATIVE_STYLE = Pattern
|
||||
.compile("hidden|display: ?none|font-size: ?small");
|
||||
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
|
||||
{
|
||||
add("hacker news");
|
||||
add("facebook");
|
||||
}
|
||||
};
|
||||
private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
|
||||
private OutputFormatter formatter = DEFAULT_FORMATTER;
|
||||
|
||||
/**
 * Creates an extractor configured with the default "unlikely", "positive"
 * and "negative" regular expressions that are matched against element ids
 * and class names.
 */
public ArticleTextExtractor() {
    // Fix: "sponsor" and "a(d|ll|...)" used to be concatenated without a
    // separating '|', which produced the meaningless alternative
    // "sponsora(d|ll|...)" and dropped both intended alternatives.
    setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
            + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor|"
            + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
            + "login|si(debar|gn|ngle)");
    setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
            + "|arti(cle|kel)|instapaper_body");
    setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
            + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
}
|
||||
|
||||
public ArticleTextExtractor setUnlikely(String unlikelyStr) {
|
||||
this.unlikelyStr = unlikelyStr;
|
||||
UNLIKELY = Pattern.compile(unlikelyStr);
|
||||
return this;
|
||||
}
|
||||
|
||||
public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
|
||||
return setUnlikely(unlikelyStr + "|" + unlikelyMatches);
|
||||
}
|
||||
|
||||
public ArticleTextExtractor setPositive(String positiveStr) {
|
||||
this.positiveStr = positiveStr;
|
||||
POSITIVE = Pattern.compile(positiveStr);
|
||||
return this;
|
||||
}
|
||||
|
||||
public ArticleTextExtractor addPositive(String pos) {
|
||||
return setPositive(positiveStr + "|" + pos);
|
||||
}
|
||||
|
||||
public ArticleTextExtractor setNegative(String negativeStr) {
|
||||
this.negativeStr = negativeStr;
|
||||
NEGATIVE = Pattern.compile(negativeStr);
|
||||
return this;
|
||||
}
|
||||
|
||||
public ArticleTextExtractor addNegative(String neg) {
|
||||
setNegative(negativeStr + "|" + neg);
|
||||
return this;
|
||||
}
|
||||
|
||||
public void setOutputFormatter(OutputFormatter formatter) {
|
||||
this.formatter = formatter;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param html
|
||||
* extracts article text from given html string. wasn't tested
|
||||
* with improper HTML, although jSoup should be able to handle
|
||||
* minor stuff.
|
||||
* @returns extracted article, all HTML tags stripped
|
||||
*/
|
||||
public JResult extractContent(Document doc) throws Exception {
|
||||
return extractContent(new JResult(), doc, formatter);
|
||||
}
|
||||
|
||||
public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {
|
||||
return extractContent(new JResult(), doc, formatter);
|
||||
}
|
||||
|
||||
public JResult extractContent(String html) throws Exception {
|
||||
return extractContent(new JResult(), html);
|
||||
}
|
||||
|
||||
public JResult extractContent(JResult res, String html) throws Exception {
|
||||
return extractContent(res, html, formatter);
|
||||
}
|
||||
|
||||
public JResult extractContent(JResult res, String html, OutputFormatter formatter)
|
||||
throws Exception {
|
||||
if (html.isEmpty())
|
||||
throw new IllegalArgumentException("html string is empty!?");
|
||||
|
||||
// http://jsoup.org/cookbook/extracting-data/selector-syntax
|
||||
return extractContent(res, Jsoup.parse(html), formatter);
|
||||
}
|
||||
|
||||
public JResult extractContent(JResult res, Document doc, OutputFormatter formatter)
|
||||
throws Exception {
|
||||
if (doc == null)
|
||||
throw new NullPointerException("missing document");
|
||||
|
||||
res.setTitle(extractTitle(doc));
|
||||
res.setDescription(extractDescription(doc));
|
||||
res.setCanonicalUrl(extractCanonicalUrl(doc));
|
||||
|
||||
// now remove the clutter
|
||||
prepareDocument(doc);
|
||||
|
||||
// init elements
|
||||
Collection<Element> nodes = getNodes(doc);
|
||||
int maxWeight = 0;
|
||||
Element bestMatchElement = null;
|
||||
for (Element entry : nodes) {
|
||||
int currentWeight = getWeight(entry);
|
||||
if (currentWeight > maxWeight) {
|
||||
maxWeight = currentWeight;
|
||||
bestMatchElement = entry;
|
||||
if (maxWeight > 200)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestMatchElement != null) {
|
||||
List<ImageResult> images = new ArrayList<ImageResult>();
|
||||
Element imgEl = determineImageSource(bestMatchElement, images);
|
||||
if (imgEl != null) {
|
||||
res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
|
||||
// TODO remove parent container of image if it is contained in
|
||||
// bestMatchElement
|
||||
// to avoid image subtitles flooding in
|
||||
|
||||
res.setImages(images);
|
||||
}
|
||||
|
||||
// clean before grabbing text
|
||||
String text = formatter.getFormattedText(bestMatchElement);
|
||||
text = removeTitleFromText(text, res.getTitle());
|
||||
// this fails for short facebook post and probably tweets:
|
||||
// text.length() > res.getDescription().length()
|
||||
if (text.length() > res.getTitle().length()) {
|
||||
res.setText(text);
|
||||
// print("best element:", bestMatchElement);
|
||||
}
|
||||
res.setTextList(formatter.getTextList(bestMatchElement));
|
||||
}
|
||||
|
||||
if (res.getImageUrl().isEmpty()) {
|
||||
res.setImageUrl(extractImageUrl(doc));
|
||||
}
|
||||
|
||||
res.setRssUrl(extractRssUrl(doc));
|
||||
res.setVideoUrl(extractVideoUrl(doc));
|
||||
res.setFaviconUrl(extractFaviconUrl(doc));
|
||||
res.setKeywords(extractKeywords(doc));
|
||||
return res;
|
||||
}
|
||||
|
||||
protected String extractTitle(Document doc) {
|
||||
String title = cleanTitle(doc.title());
|
||||
if (title.isEmpty()) {
|
||||
title = SHelper.innerTrim(doc.select("head title").text());
|
||||
if (title.isEmpty()) {
|
||||
title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
|
||||
if (title.isEmpty()) {
|
||||
title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr(
|
||||
"content"));
|
||||
if (title.isEmpty()) {
|
||||
title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr(
|
||||
"content"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
protected String extractCanonicalUrl(Document doc) {
|
||||
String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
|
||||
if (url.isEmpty()) {
|
||||
url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
|
||||
if (url.isEmpty()) {
|
||||
url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr(
|
||||
"content"));
|
||||
}
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
protected String extractDescription(Document doc) {
|
||||
String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr(
|
||||
"content"));
|
||||
if (description.isEmpty()) {
|
||||
description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr(
|
||||
"content"));
|
||||
if (description.isEmpty()) {
|
||||
description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]")
|
||||
.attr("content"));
|
||||
}
|
||||
}
|
||||
return description;
|
||||
}
|
||||
|
||||
protected Collection<String> extractKeywords(Document doc) {
|
||||
String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
|
||||
|
||||
if (content != null) {
|
||||
if (content.startsWith("[") && content.endsWith("]"))
|
||||
content = content.substring(1, content.length() - 1);
|
||||
|
||||
String[] split = content.split("\\s*,\\s*");
|
||||
if (split.length > 1 || (split.length > 0 && !"".equals(split[0])))
|
||||
return Arrays.asList(split);
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to extract an image url from metadata if determineImageSource
|
||||
* failed
|
||||
*
|
||||
* @return image url or empty str
|
||||
*/
|
||||
protected String extractImageUrl(Document doc) {
|
||||
// use open graph tag to get image
|
||||
String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr(
|
||||
"content"));
|
||||
if (imageUrl.isEmpty()) {
|
||||
imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr(
|
||||
"content"));
|
||||
if (imageUrl.isEmpty()) {
|
||||
// prefer link over thumbnail-meta if empty
|
||||
imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
|
||||
if (imageUrl.isEmpty()) {
|
||||
imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr(
|
||||
"content"));
|
||||
}
|
||||
}
|
||||
}
|
||||
return imageUrl;
|
||||
}
|
||||
|
||||
protected String extractRssUrl(Document doc) {
|
||||
return SHelper.replaceSpaces(doc.select("link[rel=alternate]")
|
||||
.select("link[type=application/rss+xml]").attr("href"));
|
||||
}
|
||||
|
||||
protected String extractVideoUrl(Document doc) {
|
||||
return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
|
||||
}
|
||||
|
||||
protected String extractFaviconUrl(Document doc) {
|
||||
String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
|
||||
if (faviconUrl.isEmpty()) {
|
||||
faviconUrl = SHelper.replaceSpaces(doc.select(
|
||||
"head link[rel^=shortcut],link[rel$=icon]").attr("href"));
|
||||
}
|
||||
return faviconUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Weights current element. By matching it with positive candidates and
|
||||
* weighting child nodes. Since it's impossible to predict which exactly
|
||||
* names, ids or class names will be used in HTML, major role is played by
|
||||
* child nodes
|
||||
*
|
||||
* @param e
|
||||
* Element to weight, along with child nodes
|
||||
*/
|
||||
protected int getWeight(Element e) {
|
||||
int weight = calcWeight(e);
|
||||
weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
|
||||
weight += weightChildNodes(e);
|
||||
return weight;
|
||||
}
|
||||
|
||||
/**
|
||||
* Weights a child nodes of given Element. During tests some difficulties
|
||||
* were met. For instanance, not every single document has nested paragraph
|
||||
* tags inside of the major article tag. Sometimes people are adding one
|
||||
* more nesting level. So, we're adding 4 points for every 100 symbols
|
||||
* contained in tag nested inside of the current weighted element, but only
|
||||
* 3 points for every element that's nested 2 levels deep. This way we give
|
||||
* more chances to extract the element that has less nested levels,
|
||||
* increasing probability of the correct extraction.
|
||||
*
|
||||
* @param rootEl
|
||||
* Element, who's child nodes will be weighted
|
||||
*/
|
||||
protected int weightChildNodes(Element rootEl) {
|
||||
int weight = 0;
|
||||
Element caption = null;
|
||||
List<Element> pEls = new ArrayList<Element>(5);
|
||||
for (Element child : rootEl.children()) {
|
||||
String ownText = child.ownText();
|
||||
int ownTextLength = ownText.length();
|
||||
if (ownTextLength < 20)
|
||||
continue;
|
||||
|
||||
if (ownTextLength > 200)
|
||||
weight += Math.max(50, ownTextLength / 10);
|
||||
|
||||
if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
|
||||
weight += 30;
|
||||
} else if (child.tagName().equals("div") || child.tagName().equals("p")) {
|
||||
weight += calcWeightForChild(child, ownText);
|
||||
if (child.tagName().equals("p") && ownTextLength > 50)
|
||||
pEls.add(child);
|
||||
|
||||
if (child.className().toLowerCase(Locale.getDefault()).equals("caption"))
|
||||
caption = child;
|
||||
}
|
||||
}
|
||||
|
||||
// use caption and image
|
||||
if (caption != null)
|
||||
weight += 30;
|
||||
|
||||
if (pEls.size() >= 2) {
|
||||
for (Element subEl : rootEl.children()) {
|
||||
if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
|
||||
weight += 20;
|
||||
// headerEls.add(subEl);
|
||||
} else if ("table;li;td;th".contains(subEl.tagName())) {
|
||||
addScore(subEl, -30);
|
||||
}
|
||||
|
||||
if ("p".contains(subEl.tagName()))
|
||||
addScore(subEl, 30);
|
||||
}
|
||||
}
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void addScore(Element el, int score) {
|
||||
int old = getScore(el);
|
||||
setScore(el, score + old);
|
||||
}
|
||||
|
||||
public int getScore(Element el) {
|
||||
int old = 0;
|
||||
try {
|
||||
old = Integer.parseInt(el.attr("gravityScore"));
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
return old;
|
||||
}
|
||||
|
||||
public void setScore(Element el, int score) {
|
||||
el.attr("gravityScore", Integer.toString(score));
|
||||
}
|
||||
|
||||
private int calcWeightForChild(Element child, String ownText) {
|
||||
int c = SHelper.count(ownText, """);
|
||||
c += SHelper.count(ownText, "<");
|
||||
c += SHelper.count(ownText, ">");
|
||||
c += SHelper.count(ownText, "px");
|
||||
int val;
|
||||
if (c > 5)
|
||||
val = -30;
|
||||
else
|
||||
val = (int) Math.round(ownText.length() / 25.0);
|
||||
|
||||
addScore(child, val);
|
||||
return val;
|
||||
}
|
||||
|
||||
private int calcWeight(Element e) {
|
||||
int weight = 0;
|
||||
if (POSITIVE.matcher(e.className()).find())
|
||||
weight += 35;
|
||||
|
||||
if (POSITIVE.matcher(e.id()).find())
|
||||
weight += 40;
|
||||
|
||||
if (UNLIKELY.matcher(e.className()).find())
|
||||
weight -= 20;
|
||||
|
||||
if (UNLIKELY.matcher(e.id()).find())
|
||||
weight -= 20;
|
||||
|
||||
if (NEGATIVE.matcher(e.className()).find())
|
||||
weight -= 50;
|
||||
|
||||
if (NEGATIVE.matcher(e.id()).find())
|
||||
weight -= 50;
|
||||
|
||||
String style = e.attr("style");
|
||||
if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
|
||||
weight -= 50;
|
||||
return weight;
|
||||
}
|
||||
|
||||
public Element determineImageSource(Element el, List<ImageResult> images) {
|
||||
int maxWeight = 0;
|
||||
Element maxNode = null;
|
||||
Elements els = el.select("img");
|
||||
if (els.isEmpty())
|
||||
els = el.parent().select("img");
|
||||
|
||||
double score = 1;
|
||||
for (Element e : els) {
|
||||
String sourceUrl = e.attr("src");
|
||||
if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
|
||||
continue;
|
||||
|
||||
int weight = 0;
|
||||
int height = 0;
|
||||
try {
|
||||
height = Integer.parseInt(e.attr("height"));
|
||||
if (height >= 50)
|
||||
weight += 20;
|
||||
else
|
||||
weight -= 20;
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
|
||||
int width = 0;
|
||||
try {
|
||||
width = Integer.parseInt(e.attr("width"));
|
||||
if (width >= 50)
|
||||
weight += 20;
|
||||
else
|
||||
weight -= 20;
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
String alt = e.attr("alt");
|
||||
if (alt.length() > 35)
|
||||
weight += 20;
|
||||
|
||||
String title = e.attr("title");
|
||||
if (title.length() > 35)
|
||||
weight += 20;
|
||||
|
||||
String rel = null;
|
||||
boolean noFollow = false;
|
||||
if (e.parent() != null) {
|
||||
rel = e.parent().attr("rel");
|
||||
if (rel != null && rel.contains("nofollow")) {
|
||||
noFollow = rel.contains("nofollow");
|
||||
weight -= 40;
|
||||
}
|
||||
}
|
||||
|
||||
weight = (int) (weight * score);
|
||||
if (weight > maxWeight) {
|
||||
maxWeight = weight;
|
||||
maxNode = e;
|
||||
score = score / 2;
|
||||
}
|
||||
|
||||
ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
|
||||
noFollow);
|
||||
images.add(image);
|
||||
}
|
||||
|
||||
Collections.sort(images, new ImageComparator());
|
||||
return maxNode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepares document. Currently only stipping unlikely candidates, since
|
||||
* from time to time they're getting more score than good ones especially in
|
||||
* cases when major text is short.
|
||||
*
|
||||
* @param doc
|
||||
* document to prepare. Passed as reference, and changed inside
|
||||
* of function
|
||||
*/
|
||||
protected void prepareDocument(Document doc) {
|
||||
// stripUnlikelyCandidates(doc);
|
||||
removeScriptsAndStyles(doc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes unlikely candidates from HTML. Currently takes id and class name
|
||||
* and matches them against list of patterns
|
||||
*
|
||||
* @param doc
|
||||
* document to strip unlikely candidates from
|
||||
*/
|
||||
protected void stripUnlikelyCandidates(Document doc) {
|
||||
for (Element child : doc.select("body").select("*")) {
|
||||
String className = child.className().toLowerCase(Locale.getDefault());
|
||||
String id = child.id().toLowerCase(Locale.getDefault());
|
||||
|
||||
if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
|
||||
// print("REMOVE:", child);
|
||||
child.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Document removeScriptsAndStyles(Document doc) {
|
||||
Elements scripts = doc.getElementsByTag("script");
|
||||
for (Element item : scripts) {
|
||||
item.remove();
|
||||
}
|
||||
|
||||
Elements noscripts = doc.getElementsByTag("noscript");
|
||||
for (Element item : noscripts) {
|
||||
item.remove();
|
||||
}
|
||||
|
||||
Elements styles = doc.getElementsByTag("style");
|
||||
for (Element style : styles) {
|
||||
style.remove();
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
private boolean isAdImage(String imageUrl) {
|
||||
return SHelper.count(imageUrl, "ad") >= 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Match only exact matching as longestSubstring can be too fuzzy
|
||||
*/
|
||||
public String removeTitleFromText(String text, String title) {
|
||||
// don't do this as its terrible to read
|
||||
// int index1 = text.toLowerCase().indexOf(title.toLowerCase());
|
||||
// if (index1 >= 0)
|
||||
// text = text.substring(index1 + title.length());
|
||||
// return text.trim();
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a set of all important nodes
|
||||
*/
|
||||
public Collection<Element> getNodes(Document doc) {
|
||||
Set<Element> nodes = new HashSet<Element>(64);
|
||||
int score = 100;
|
||||
for (Element el : doc.select("body").select("*")) {
|
||||
if (NODES.matcher(el.tagName()).matches()) {
|
||||
nodes.add(el);
|
||||
setScore(el, score);
|
||||
score = score / 2;
|
||||
}
|
||||
}
|
||||
return nodes;
|
||||
|
||||
}
|
||||
|
||||
public String cleanTitle(String title) {
|
||||
StringBuilder res = new StringBuilder();
|
||||
// int index = title.lastIndexOf("|");
|
||||
// if (index > 0 && title.length() / 2 < index)
|
||||
// title = title.substring(0, index + 1);
|
||||
|
||||
int counter = 0;
|
||||
String[] strs = title.split("\\|");
|
||||
for (String part : strs) {
|
||||
if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim()))
|
||||
continue;
|
||||
|
||||
if (counter == strs.length - 1 && res.length() > part.length())
|
||||
continue;
|
||||
|
||||
if (counter > 0)
|
||||
res.append("|");
|
||||
|
||||
res.append(part);
|
||||
counter++;
|
||||
}
|
||||
|
||||
return SHelper.innerTrim(res.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Comparator for Image by weight
|
||||
*
|
||||
* @author Chris Alexander, chris@chris-alexander.co.uk
|
||||
*
|
||||
*/
|
||||
public class ImageComparator implements Comparator<ImageResult> {
|
||||
|
||||
@Override
|
||||
public int compare(ImageResult o1, ImageResult o2) {
|
||||
// Returns the highest weight first
|
||||
return o2.weight.compareTo(o1.weight);
|
||||
}
|
||||
}
|
||||
}
|
243
src/acr/browser/lightning/Reading/Converter.java
Normal file
243
src/acr/browser/lightning/Reading/Converter.java
Normal file
@ -0,0 +1,243 @@
|
||||
/*
|
||||
* Copyright 2011 Peter Karich
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Locale;
|
||||
|
||||
import acr.browser.lightning.Constants;
|
||||
import android.util.Log;
|
||||
|
||||
/**
|
||||
* This class is not thread safe. Use one new instance every time due to
|
||||
* encoding variable.
|
||||
*
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public class Converter {
|
||||
|
||||
public final static String UTF8 = "UTF-8";
|
||||
public final static String ISO = "ISO-8859-1";
|
||||
public final static int K2 = 2048;
|
||||
private int maxBytes = 1000000 / 2;
|
||||
private String encoding;
|
||||
private String url;
|
||||
|
||||
public Converter(String urlOnlyHint) {
|
||||
url = urlOnlyHint;
|
||||
}
|
||||
|
||||
public Converter() {
|
||||
}
|
||||
|
||||
public Converter setMaxBytes(int maxBytes) {
|
||||
this.maxBytes = maxBytes;
|
||||
return this;
|
||||
}
|
||||
|
||||
public static String extractEncoding(String contentType) {
|
||||
String[] values;
|
||||
if (contentType != null)
|
||||
values = contentType.split(";");
|
||||
else
|
||||
values = new String[0];
|
||||
|
||||
String charset = "";
|
||||
|
||||
for (String value : values) {
|
||||
value = value.trim().toLowerCase(Locale.getDefault());
|
||||
|
||||
if (value.startsWith("charset="))
|
||||
charset = value.substring("charset=".length());
|
||||
}
|
||||
|
||||
// http1.1 says ISO-8859-1 is the default charset
|
||||
if (charset.length() == 0)
|
||||
charset = ISO;
|
||||
|
||||
return charset;
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
if (encoding == null)
|
||||
return "";
|
||||
return encoding.toLowerCase(Locale.getDefault());
|
||||
}
|
||||
|
||||
public String streamToString(InputStream is) {
|
||||
return streamToString(is, maxBytes, encoding);
|
||||
}
|
||||
|
||||
public String streamToString(InputStream is, String enc) {
|
||||
return streamToString(is, maxBytes, enc);
|
||||
}
|
||||
|
||||
/**
|
||||
* reads bytes off the string and returns a string
|
||||
*
|
||||
* @param is
|
||||
* @param maxBytes
|
||||
* The max bytes that we want to read from the input stream
|
||||
* @return String
|
||||
*/
|
||||
public String streamToString(InputStream is, int maxBytes, String enc) {
|
||||
encoding = enc;
|
||||
// Http 1.1. standard is iso-8859-1 not utf8 :(
|
||||
// but we force utf-8 as youtube assumes it ;)
|
||||
if (encoding == null || encoding.isEmpty())
|
||||
encoding = UTF8;
|
||||
|
||||
BufferedInputStream in = null;
|
||||
try {
|
||||
in = new BufferedInputStream(is, K2);
|
||||
ByteArrayOutputStream output = new ByteArrayOutputStream();
|
||||
|
||||
// detect encoding with the help of meta tag
|
||||
try {
|
||||
in.mark(K2 * 2);
|
||||
String tmpEnc = detectCharset("charset=", output, in, encoding);
|
||||
if (tmpEnc != null)
|
||||
encoding = tmpEnc;
|
||||
else {
|
||||
Log.d(Constants.TAG, "no charset found in first stage");
|
||||
// detect with the help of xml beginning ala
|
||||
// encoding="charset"
|
||||
tmpEnc = detectCharset("encoding=", output, in, encoding);
|
||||
if (tmpEnc != null)
|
||||
encoding = tmpEnc;
|
||||
else
|
||||
Log.d(Constants.TAG, "no charset found in second stage");
|
||||
}
|
||||
|
||||
if (!Charset.isSupported(encoding))
|
||||
throw new UnsupportedEncodingException(encoding);
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
Log.d(Constants.TAG,
|
||||
"Using default encoding:" + UTF8 + " problem:" + e.getMessage()
|
||||
+ " encoding:" + encoding + " " + url);
|
||||
encoding = UTF8;
|
||||
}
|
||||
|
||||
// SocketException: Connection reset
|
||||
// IOException: missing CR => problem on server (probably some xml
|
||||
// character thing?)
|
||||
// IOException: Premature EOF => socket unexpectly closed from
|
||||
// server
|
||||
int bytesRead = output.size();
|
||||
byte[] arr = new byte[K2];
|
||||
while (true) {
|
||||
if (bytesRead >= maxBytes) {
|
||||
Log.d(Constants.TAG, "Maxbyte of " + maxBytes
|
||||
+ " exceeded! Maybe html is now broken but try it nevertheless. Url: "
|
||||
+ url);
|
||||
break;
|
||||
}
|
||||
|
||||
int n = in.read(arr);
|
||||
if (n < 0)
|
||||
break;
|
||||
bytesRead += n;
|
||||
output.write(arr, 0, n);
|
||||
}
|
||||
|
||||
return output.toString(encoding);
|
||||
} catch (SocketTimeoutException e) {
|
||||
Log.e(Constants.TAG, e.toString() + " url:" + url);
|
||||
} catch (IOException e) {
|
||||
Log.e(Constants.TAG, e.toString() + " url:" + url);
|
||||
} finally {
|
||||
if (in != null) {
|
||||
try {
|
||||
in.close();
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* This method detects the charset even if the first call only returns some
|
||||
* bytes. It will read until 4K bytes are reached and then try to determine
|
||||
* the encoding
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
|
||||
String enc) throws IOException {
|
||||
|
||||
// Grab better encoding from stream
|
||||
byte[] arr = new byte[K2];
|
||||
int nSum = 0;
|
||||
while (nSum < K2) {
|
||||
int n = in.read(arr);
|
||||
if (n < 0)
|
||||
break;
|
||||
|
||||
nSum += n;
|
||||
bos.write(arr, 0, n);
|
||||
}
|
||||
|
||||
String str = bos.toString(enc);
|
||||
int encIndex = str.indexOf(key);
|
||||
int clength = key.length();
|
||||
if (encIndex > 0) {
|
||||
char startChar = str.charAt(encIndex + clength);
|
||||
int lastEncIndex;
|
||||
if (startChar == '\'')
|
||||
// if we have charset='something'
|
||||
lastEncIndex = str.indexOf("'", ++encIndex + clength);
|
||||
else if (startChar == '\"')
|
||||
// if we have charset="something"
|
||||
lastEncIndex = str.indexOf("\"", ++encIndex + clength);
|
||||
else {
|
||||
// if we have "text/html; charset=utf-8"
|
||||
int first = str.indexOf("\"", encIndex + clength);
|
||||
if (first < 0)
|
||||
first = Integer.MAX_VALUE;
|
||||
|
||||
// or "text/html; charset=utf-8 "
|
||||
int sec = str.indexOf(" ", encIndex + clength);
|
||||
if (sec < 0)
|
||||
sec = Integer.MAX_VALUE;
|
||||
lastEncIndex = Math.min(first, sec);
|
||||
|
||||
// or "text/html; charset=utf-8 '
|
||||
int third = str.indexOf("'", encIndex + clength);
|
||||
if (third > 0)
|
||||
lastEncIndex = Math.min(lastEncIndex, third);
|
||||
}
|
||||
|
||||
// re-read byte array with different encoding
|
||||
// assume that the encoding string cannot be greater than 40 chars
|
||||
if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) {
|
||||
String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength,
|
||||
lastEncIndex));
|
||||
try {
|
||||
in.reset();
|
||||
bos.reset();
|
||||
return tmpEnc;
|
||||
} catch (IOException ex) {
|
||||
Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "
|
||||
+ tmpEnc + " " + ex.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
445
src/acr/browser/lightning/Reading/HtmlFetcher.java
Normal file
445
src/acr/browser/lightning/Reading/HtmlFetcher.java
Normal file
@ -0,0 +1,445 @@
|
||||
/*
|
||||
* Copyright 2011 Peter Karich
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileReader;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.Proxy;
|
||||
import java.net.URL;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.Inflater;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
|
||||
import acr.browser.lightning.Constants;
|
||||
import android.util.Log;
|
||||
|
||||
/**
|
||||
* Class to fetch articles. This class is thread safe.
|
||||
*
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public class HtmlFetcher {
|
||||
|
||||
static {
|
||||
SHelper.enableCookieMgmt();
|
||||
SHelper.enableUserAgentOverwrite();
|
||||
SHelper.enableAnySSL();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
BufferedReader reader = new BufferedReader(new FileReader("urls.txt"));
|
||||
String line = null;
|
||||
Set<String> existing = new LinkedHashSet<String>();
|
||||
while ((line = reader.readLine()) != null) {
|
||||
int index1 = line.indexOf("\"");
|
||||
int index2 = line.indexOf("\"", index1 + 1);
|
||||
String url = line.substring(index1 + 1, index2);
|
||||
String domainStr = SHelper.extractDomain(url, true);
|
||||
String counterStr = "";
|
||||
// TODO more similarities
|
||||
if (existing.contains(domainStr))
|
||||
counterStr = "2";
|
||||
else
|
||||
existing.add(domainStr);
|
||||
|
||||
String html = new HtmlFetcher().fetchAsString(url, 20000);
|
||||
String outFile = domainStr + counterStr + ".html";
|
||||
BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
|
||||
writer.write(html);
|
||||
writer.close();
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
private String referrer = "https://github.com/karussell/snacktory";
|
||||
private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ")";
|
||||
private String cacheControl = "max-age=0";
|
||||
private String language = "en-us";
|
||||
private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
|
||||
private String charset = "UTF-8";
|
||||
private SCache cache;
|
||||
private AtomicInteger cacheCounter = new AtomicInteger(0);
|
||||
private int maxTextLength = -1;
|
||||
private ArticleTextExtractor extractor = new ArticleTextExtractor();
|
||||
private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
|
||||
{
|
||||
add("bit.ly");
|
||||
add("cli.gs");
|
||||
add("deck.ly");
|
||||
add("fb.me");
|
||||
add("feedproxy.google.com");
|
||||
add("flic.kr");
|
||||
add("fur.ly");
|
||||
add("goo.gl");
|
||||
add("is.gd");
|
||||
add("ink.co");
|
||||
add("j.mp");
|
||||
add("lnkd.in");
|
||||
add("on.fb.me");
|
||||
add("ow.ly");
|
||||
add("plurl.us");
|
||||
add("sns.mx");
|
||||
add("snurl.com");
|
||||
add("su.pr");
|
||||
add("t.co");
|
||||
add("tcrn.ch");
|
||||
add("tl.gd");
|
||||
add("tiny.cc");
|
||||
add("tinyurl.com");
|
||||
add("tmi.me");
|
||||
add("tr.im");
|
||||
add("twurl.nl");
|
||||
}
|
||||
};
|
||||
|
||||
public HtmlFetcher() {
|
||||
}
|
||||
|
||||
public void setExtractor(ArticleTextExtractor extractor) {
|
||||
this.extractor = extractor;
|
||||
}
|
||||
|
||||
public ArticleTextExtractor getExtractor() {
|
||||
return extractor;
|
||||
}
|
||||
|
||||
public HtmlFetcher setCache(SCache cache) {
|
||||
this.cache = cache;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SCache getCache() {
|
||||
return cache;
|
||||
}
|
||||
|
||||
public int getCacheCounter() {
|
||||
return cacheCounter.get();
|
||||
}
|
||||
|
||||
public HtmlFetcher clearCacheCounter() {
|
||||
cacheCounter.set(0);
|
||||
return this;
|
||||
}
|
||||
|
||||
public HtmlFetcher setMaxTextLength(int maxTextLength) {
|
||||
this.maxTextLength = maxTextLength;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int getMaxTextLength() {
|
||||
return maxTextLength;
|
||||
}
|
||||
|
||||
public void setAccept(String accept) {
|
||||
this.accept = accept;
|
||||
}
|
||||
|
||||
public void setCharset(String charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public void setCacheControl(String cacheControl) {
|
||||
this.cacheControl = cacheControl;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getReferrer() {
|
||||
return referrer;
|
||||
}
|
||||
|
||||
public HtmlFetcher setReferrer(String referrer) {
|
||||
this.referrer = referrer;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getUserAgent() {
|
||||
return userAgent;
|
||||
}
|
||||
|
||||
public void setUserAgent(String userAgent) {
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
public String getAccept() {
|
||||
return accept;
|
||||
}
|
||||
|
||||
public String getCacheControl() {
|
||||
return cacheControl;
|
||||
}
|
||||
|
||||
public String getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
|
||||
String originalUrl = url;
|
||||
url = SHelper.removeHashbang(url);
|
||||
String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
|
||||
if (gUrl != null)
|
||||
url = gUrl;
|
||||
else {
|
||||
gUrl = SHelper.getUrlFromUglyFacebookRedirect(url);
|
||||
if (gUrl != null)
|
||||
url = gUrl;
|
||||
}
|
||||
|
||||
if (resolve) {
|
||||
// check if we can avoid resolving the URL (which hits the website!)
|
||||
JResult res = getFromCache(url, originalUrl);
|
||||
if (res != null)
|
||||
return res;
|
||||
|
||||
String resUrl = getResolvedUrl(url, timeout);
|
||||
if (resUrl.isEmpty()) {
|
||||
Log.d(Constants.TAG, "resolved url is empty. Url is: " + url);
|
||||
|
||||
JResult result = new JResult();
|
||||
if (cache != null)
|
||||
cache.put(url, result);
|
||||
return result.setUrl(url);
|
||||
}
|
||||
|
||||
// if resolved url is longer then use it!
|
||||
if (resUrl != null && resUrl.trim().length() > url.length()) {
|
||||
// this is necessary e.g. for some homebaken url resolvers which
|
||||
// return
|
||||
// the resolved url relative to url!
|
||||
url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
|
||||
}
|
||||
}
|
||||
|
||||
// check if we have the (resolved) URL in cache
|
||||
JResult res = getFromCache(url, originalUrl);
|
||||
if (res != null)
|
||||
return res;
|
||||
|
||||
JResult result = new JResult();
|
||||
// or should we use? <link rel="canonical"
|
||||
// href="http://www.N24.de/news/newsitem_6797232.html"/>
|
||||
result.setUrl(url);
|
||||
result.setOriginalUrl(originalUrl);
|
||||
result.setDate(SHelper.estimateDate(url));
|
||||
|
||||
// Immediately put the url into the cache as extracting content takes
|
||||
// time.
|
||||
if (cache != null) {
|
||||
cache.put(originalUrl, result);
|
||||
cache.put(url, result);
|
||||
}
|
||||
|
||||
String lowerUrl = url.toLowerCase(Locale.getDefault());
|
||||
if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
|
||||
// skip
|
||||
} else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
|
||||
result.setVideoUrl(url);
|
||||
} else if (SHelper.isImage(lowerUrl)) {
|
||||
result.setImageUrl(url);
|
||||
} else {
|
||||
extractor.extractContent(result, fetchAsString(url, timeout));
|
||||
if (result.getFaviconUrl().isEmpty())
|
||||
result.setFaviconUrl(SHelper.getDefaultFavicon(url));
|
||||
|
||||
// some links are relative to root and do not include the domain of
|
||||
// the url :(
|
||||
result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
|
||||
result.setImageUrl(fixUrl(url, result.getImageUrl()));
|
||||
result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
|
||||
result.setRssUrl(fixUrl(url, result.getRssUrl()));
|
||||
}
|
||||
result.setText(lessText(result.getText()));
|
||||
synchronized (result) {
|
||||
result.notifyAll();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public String lessText(String text) {
|
||||
if (text == null)
|
||||
return "";
|
||||
|
||||
if (maxTextLength >= 0 && text.length() > maxTextLength)
|
||||
return text.substring(0, maxTextLength);
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
private static String fixUrl(String url, String urlOrPath) {
|
||||
return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
|
||||
}
|
||||
|
||||
public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException,
|
||||
IOException {
|
||||
return fetchAsString(urlAsString, timeout, true);
|
||||
}
|
||||
|
||||
public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
|
||||
throws MalformedURLException, IOException {
|
||||
HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
|
||||
hConn.setInstanceFollowRedirects(true);
|
||||
String encoding = hConn.getContentEncoding();
|
||||
InputStream is;
|
||||
if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
|
||||
is = new GZIPInputStream(hConn.getInputStream());
|
||||
} else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
|
||||
is = new InflaterInputStream(hConn.getInputStream(), new Inflater(true));
|
||||
} else {
|
||||
is = hConn.getInputStream();
|
||||
}
|
||||
|
||||
String enc = Converter.extractEncoding(hConn.getContentType());
|
||||
String res = createConverter(urlAsString).streamToString(is, enc);
|
||||
Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString);
|
||||
return res;
|
||||
}
|
||||
|
||||
public Converter createConverter(String url) {
|
||||
return new Converter(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* On some devices we have to hack:
|
||||
* http://developers.sun.com/mobility/reference
|
||||
* /techart/design_guidelines/http_redirection.html
|
||||
*
|
||||
* @param timeout
|
||||
* Sets a specified timeout value, in milliseconds
|
||||
* @return the resolved url if any. Or null if it couldn't resolve the url
|
||||
* (within the specified time) or the same url if response code is
|
||||
* OK
|
||||
*/
|
||||
public String getResolvedUrl(String urlAsString, int timeout) {
|
||||
String newUrl = null;
|
||||
int responseCode = -1;
|
||||
try {
|
||||
HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, true);
|
||||
// force no follow
|
||||
hConn.setInstanceFollowRedirects(false);
|
||||
// the program doesn't care what the content actually is !!
|
||||
// http://java.sun.com/developer/JDCTechTips/2003/tt0422.html
|
||||
hConn.setRequestMethod("HEAD");
|
||||
hConn.connect();
|
||||
responseCode = hConn.getResponseCode();
|
||||
hConn.getInputStream().close();
|
||||
if (responseCode == HttpURLConnection.HTTP_OK)
|
||||
return urlAsString;
|
||||
|
||||
newUrl = hConn.getHeaderField("Location");
|
||||
if (responseCode / 100 == 3 && newUrl != null) {
|
||||
newUrl = newUrl.replaceAll(" ", "+");
|
||||
// some services use (none-standard) utf8 in their location
|
||||
// header
|
||||
if (urlAsString.startsWith("http://bit.ly")
|
||||
|| urlAsString.startsWith("http://is.gd"))
|
||||
newUrl = encodeUriFromHeader(newUrl);
|
||||
|
||||
// fix problems if shortened twice. as it is often the case
|
||||
// after twitters' t.co bullshit
|
||||
if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
|
||||
newUrl = getResolvedUrl(newUrl, timeout);
|
||||
|
||||
return newUrl;
|
||||
} else
|
||||
return urlAsString;
|
||||
|
||||
} catch (Exception ex) {
|
||||
Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage());
|
||||
return "";
|
||||
} finally {
|
||||
Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding
|
||||
* to non-ASCII characters. Workaround for broken origin servers that send
|
||||
* UTF-8 in the Location: header.
|
||||
*/
|
||||
static String encodeUriFromHeader(String badLocation) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (char ch : badLocation.toCharArray()) {
|
||||
if (ch < (char) 128) {
|
||||
sb.append(ch);
|
||||
} else {
|
||||
// this is ONLY valid if the uri was decoded using ISO-8859-1
|
||||
sb.append(String.format("%%%02X", (int) ch));
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
|
||||
boolean includeSomeGooseOptions) throws MalformedURLException, IOException {
|
||||
URL url = new URL(urlAsStr);
|
||||
// using proxy may increase latency
|
||||
HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
|
||||
hConn.setRequestProperty("User-Agent", userAgent);
|
||||
hConn.setRequestProperty("Accept", accept);
|
||||
|
||||
if (includeSomeGooseOptions) {
|
||||
hConn.setRequestProperty("Accept-Language", language);
|
||||
hConn.setRequestProperty("content-charset", charset);
|
||||
hConn.addRequestProperty("Referer", referrer);
|
||||
// avoid the cache for testing purposes only?
|
||||
hConn.setRequestProperty("Cache-Control", cacheControl);
|
||||
}
|
||||
|
||||
// suggest respond to be gzipped or deflated (which is just another
|
||||
// compression)
|
||||
// http://stackoverflow.com/q/3932117
|
||||
hConn.setRequestProperty("Accept-Encoding", "gzip, deflate");
|
||||
hConn.setConnectTimeout(timeout);
|
||||
hConn.setReadTimeout(timeout);
|
||||
return hConn;
|
||||
}
|
||||
|
||||
private JResult getFromCache(String url, String originalUrl) throws Exception {
|
||||
if (cache != null) {
|
||||
JResult res = cache.get(url);
|
||||
if (res != null) {
|
||||
// e.g. the cache returned a shortened url as original url now
|
||||
// we want to store the
|
||||
// current original url! Also it can be that the cache response
|
||||
// to url but the JResult
|
||||
// does not contain it so overwrite it:
|
||||
res.setUrl(url);
|
||||
res.setOriginalUrl(originalUrl);
|
||||
cacheCounter.addAndGet(1);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
31
src/acr/browser/lightning/Reading/ImageResult.java
Normal file
31
src/acr/browser/lightning/Reading/ImageResult.java
Normal file
@ -0,0 +1,31 @@
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
/**
|
||||
* Class which encapsulates the data from an image found under an element
|
||||
*
|
||||
* @author Chris Alexander, chris@chris-alexander.co.uk
|
||||
*/
|
||||
public class ImageResult {
|
||||
|
||||
public String src;
|
||||
public Integer weight;
|
||||
public String title;
|
||||
public int height;
|
||||
public int width;
|
||||
public String alt;
|
||||
public boolean noFollow;
|
||||
public Element element;
|
||||
|
||||
public ImageResult(String src, Integer weight, String title, int height, int width, String alt,
|
||||
boolean noFollow) {
|
||||
this.src = src;
|
||||
this.weight = weight;
|
||||
this.title = title;
|
||||
this.height = height;
|
||||
this.width = width;
|
||||
this.alt = alt;
|
||||
this.noFollow = noFollow;
|
||||
}
|
||||
}
|
216
src/acr/browser/lightning/Reading/JResult.java
Normal file
216
src/acr/browser/lightning/Reading/JResult.java
Normal file
@ -0,0 +1,216 @@
|
||||
/*
|
||||
* Copyright 2011 Peter Karich
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Parsed result from web page containing important title, text and image.
|
||||
*
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public class JResult implements Serializable {
|
||||
|
||||
private String title;
|
||||
private String url;
|
||||
private String originalUrl;
|
||||
private String canonicalUrl;
|
||||
private String imageUrl;
|
||||
private String videoUrl;
|
||||
private String rssUrl;
|
||||
private String text;
|
||||
private String faviconUrl;
|
||||
private String description;
|
||||
private String dateString;
|
||||
private List<String> textList;
|
||||
private Collection<String> keywords;
|
||||
private List<ImageResult> images = null;
|
||||
|
||||
public JResult() {
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
if (url == null)
|
||||
return "";
|
||||
return url;
|
||||
}
|
||||
|
||||
public JResult setUrl(String url) {
|
||||
this.url = url;
|
||||
return this;
|
||||
}
|
||||
|
||||
public JResult setOriginalUrl(String originalUrl) {
|
||||
this.originalUrl = originalUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getOriginalUrl() {
|
||||
return originalUrl;
|
||||
}
|
||||
|
||||
public JResult setCanonicalUrl(String canonicalUrl) {
|
||||
this.canonicalUrl = canonicalUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getCanonicalUrl() {
|
||||
return canonicalUrl;
|
||||
}
|
||||
|
||||
public String getFaviconUrl() {
|
||||
if (faviconUrl == null)
|
||||
return "";
|
||||
return faviconUrl;
|
||||
}
|
||||
|
||||
public JResult setFaviconUrl(String faviconUrl) {
|
||||
this.faviconUrl = faviconUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public JResult setRssUrl(String rssUrl) {
|
||||
this.rssUrl = rssUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getRssUrl() {
|
||||
if (rssUrl == null)
|
||||
return "";
|
||||
return rssUrl;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
if (description == null)
|
||||
return "";
|
||||
return description;
|
||||
}
|
||||
|
||||
public JResult setDescription(String description) {
|
||||
this.description = description;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getImageUrl() {
|
||||
if (imageUrl == null)
|
||||
return "";
|
||||
return imageUrl;
|
||||
}
|
||||
|
||||
public JResult setImageUrl(String imageUrl) {
|
||||
this.imageUrl = imageUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
if (text == null)
|
||||
return "";
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
public JResult setText(String text) {
|
||||
this.text = text;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<String> getTextList() {
|
||||
if (this.textList == null)
|
||||
return new ArrayList<String>();
|
||||
return this.textList;
|
||||
}
|
||||
|
||||
public JResult setTextList(List<String> textList) {
|
||||
this.textList = textList;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
if (title == null)
|
||||
return "";
|
||||
return title;
|
||||
}
|
||||
|
||||
public JResult setTitle(String title) {
|
||||
this.title = title;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getVideoUrl() {
|
||||
if (videoUrl == null)
|
||||
return "";
|
||||
return videoUrl;
|
||||
}
|
||||
|
||||
public JResult setVideoUrl(String videoUrl) {
|
||||
this.videoUrl = videoUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public JResult setDate(String date) {
|
||||
this.dateString = date;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Collection<String> getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(Collection<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return get date from url or guessed from text
|
||||
*/
|
||||
public String getDate() {
|
||||
return dateString;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return images list
|
||||
*/
|
||||
public List<ImageResult> getImages() {
|
||||
if (images == null)
|
||||
return Collections.emptyList();
|
||||
return images;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return images count
|
||||
*/
|
||||
public int getImagesCount() {
|
||||
if (images == null)
|
||||
return 0;
|
||||
return images.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* set images list
|
||||
*/
|
||||
public void setImages(List<ImageResult> images) {
|
||||
this.images = images;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text;
|
||||
}
|
||||
}
|
80
src/acr/browser/lightning/Reading/MapEntry.java
Normal file
80
src/acr/browser/lightning/Reading/MapEntry.java
Normal file
@ -0,0 +1,80 @@
|
||||
/**
|
||||
* Copyright (C) 2010 Peter Karich <>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
* use this file except in compliance with the License. You may obtain a copy of
|
||||
* the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations under
|
||||
* the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Simple impl of Map.Entry. So that we can have ordered maps.
|
||||
*
|
||||
* @author Peter Karich, peat_hal ‘at’ users ‘dot’ sourceforge ‘dot’
|
||||
* net
|
||||
*/
|
||||
public class MapEntry<K, V> implements Map.Entry<K, V>, Serializable {

    private static final long serialVersionUID = 1L;

    /** Key of this entry; assigned once in the constructor. */
    private final K key;

    /** Current value of this entry; replaceable through {@link #setValue}. */
    private V value;

    /**
     * Creates an entry holding the given key and value. Both may be null.
     *
     * @param key   key of the entry
     * @param value initial value of the entry
     */
    public MapEntry(K key, V value) {
        this.key = key;
        this.value = value;
    }

    /** @return the key given at construction time */
    @Override
    public K getKey() {
        return key;
    }

    /** @return the current value */
    @Override
    public V getValue() {
        return value;
    }

    /**
     * Replaces the value of this entry.
     *
     * @param value the new value
     * @return the value held before the call, as required by
     *         {@link Map.Entry#setValue(Object)}
     */
    @Override
    public V setValue(V value) {
        V previous = this.value;
        this.value = value;
        return previous;
    }

    /** @return key and value separated by ", " */
    @Override
    public String toString() {
        return getKey() + ", " + getValue();
    }

    /**
     * Two entries are equal when they are of exactly the same class and
     * both key and value are equal (null matches only null).
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final MapEntry<?, ?> other = (MapEntry<?, ?>) obj;
        return sameOrEqual(this.key, other.key) && sameOrEqual(this.value, other.value);
    }

    /** Null-safe equality check used by {@link #equals(Object)}. */
    private static boolean sameOrEqual(Object left, Object right) {
        return left == right || (left != null && left.equals(right));
    }

    /** Hash derived from key and value so that equal entries share a hash. */
    @Override
    public int hashCode() {
        int hash = 7;
        hash = 19 * hash + (this.key != null ? this.key.hashCode() : 0);
        hash = 19 * hash + (this.value != null ? this.value.hashCode() : 0);
        return hash;
    }
}
|
174
src/acr/browser/lightning/Reading/OutputFormatter.java
Normal file
174
src/acr/browser/lightning/Reading/OutputFormatter.java
Normal file
@ -0,0 +1,174 @@
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Pattern;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
|
||||
/**
|
||||
* @author goose | jim
|
||||
* @author karussell
|
||||
*
|
||||
* this class will be responsible for taking our top node and stripping
|
||||
* out junk we don't want and getting it ready for how we want it
|
||||
* presented to the user
|
||||
*/
|
||||
public class OutputFormatter {
|
||||
|
||||
public static final int MIN_PARAGRAPH_TEXT = 50;
|
||||
private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
|
||||
private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
|
||||
protected final int minParagraphText;
|
||||
protected final List<String> nodesToReplace;
|
||||
protected String nodesToKeepCssSelector = "p";
|
||||
|
||||
public OutputFormatter() {
|
||||
this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
|
||||
}
|
||||
|
||||
public OutputFormatter(int minParagraphText) {
|
||||
this(minParagraphText, NODES_TO_REPLACE);
|
||||
}
|
||||
|
||||
public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
|
||||
this.minParagraphText = minParagraphText;
|
||||
this.nodesToReplace = nodesToReplace;
|
||||
}
|
||||
|
||||
/**
|
||||
* set elements to keep in output text
|
||||
*/
|
||||
public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) {
|
||||
this.nodesToKeepCssSelector = nodesToKeepCssSelector;
|
||||
}
|
||||
|
||||
/**
|
||||
* takes an element and turns the P tags into \n\n
|
||||
*/
|
||||
public String getFormattedText(Element topNode) {
|
||||
removeNodesWithNegativeScores(topNode);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
append(topNode, sb, nodesToKeepCssSelector);
|
||||
String str = SHelper.innerTrim(sb.toString());
|
||||
if (str.length() > 100)
|
||||
return str;
|
||||
|
||||
// no subelements
|
||||
if (str.isEmpty() || !topNode.text().isEmpty()
|
||||
&& str.length() <= topNode.ownText().length())
|
||||
str = topNode.text();
|
||||
|
||||
// if jsoup failed to parse the whole html now parse this smaller
|
||||
// snippet again to avoid html tags disturbing our text:
|
||||
return Jsoup.parse(str).text();
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes an element and returns a list of texts extracted from the P tags
|
||||
*/
|
||||
public List<String> getTextList(Element topNode) {
|
||||
List<String> texts = new ArrayList<String>();
|
||||
for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
|
||||
if (element.hasText()) {
|
||||
texts.add(element.text());
|
||||
}
|
||||
}
|
||||
return texts;
|
||||
}
|
||||
|
||||
/**
|
||||
* If there are elements inside our top node that have a negative gravity
|
||||
* score remove them
|
||||
*/
|
||||
protected void removeNodesWithNegativeScores(Element topNode) {
|
||||
Elements gravityItems = topNode.select("*[gravityScore]");
|
||||
for (Element item : gravityItems) {
|
||||
int score = Integer.parseInt(item.attr("gravityScore"));
|
||||
if (score < 0 || item.text().length() < minParagraphText)
|
||||
item.remove();
|
||||
}
|
||||
}
|
||||
|
||||
protected void append(Element node, StringBuilder sb, String tagName) {
|
||||
// is select more costly then getElementsByTag?
|
||||
MAIN: for (Element e : node.select(tagName)) {
|
||||
Element tmpEl = e;
|
||||
// check all elements until 'node'
|
||||
while (tmpEl != null && !tmpEl.equals(node)) {
|
||||
if (unlikely(tmpEl))
|
||||
continue MAIN;
|
||||
tmpEl = tmpEl.parent();
|
||||
}
|
||||
|
||||
String text = node2Text(e);
|
||||
if (text.isEmpty() || text.length() < minParagraphText
|
||||
|| text.length() > SHelper.countLetters(text) * 2)
|
||||
continue;
|
||||
|
||||
sb.append(text);
|
||||
sb.append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
boolean unlikely(Node e) {
|
||||
if (e.attr("class") != null && e.attr("class").toLowerCase(Locale.getDefault()).contains("caption"))
|
||||
return true;
|
||||
|
||||
String style = e.attr("style");
|
||||
String clazz = e.attr("class");
|
||||
if (unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void appendTextSkipHidden(Element e, StringBuilder accum) {
|
||||
for (Node child : e.childNodes()) {
|
||||
if (unlikely(child))
|
||||
continue;
|
||||
if (child instanceof TextNode) {
|
||||
TextNode textNode = (TextNode) child;
|
||||
String txt = textNode.text();
|
||||
accum.append(txt);
|
||||
} else if (child instanceof Element) {
|
||||
Element element = (Element) child;
|
||||
if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
|
||||
accum.append(" ");
|
||||
else if (element.tagName().equals("br"))
|
||||
accum.append(" ");
|
||||
appendTextSkipHidden(element, accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean lastCharIsWhitespace(StringBuilder accum) {
|
||||
if (accum.length() == 0)
|
||||
return false;
|
||||
return Character.isWhitespace(accum.charAt(accum.length() - 1));
|
||||
}
|
||||
|
||||
protected String node2TextOld(Element el) {
|
||||
return el.text();
|
||||
}
|
||||
|
||||
protected String node2Text(Element el) {
|
||||
StringBuilder sb = new StringBuilder(200);
|
||||
appendTextSkipHidden(el, sb);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public OutputFormatter setUnlikelyPattern(String unlikelyPattern) {
|
||||
this.unlikelyPattern = Pattern.compile(unlikelyPattern);
|
||||
return this;
|
||||
}
|
||||
|
||||
public OutputFormatter appendUnlikelyPattern(String str) {
|
||||
return setUnlikelyPattern(unlikelyPattern.toString() + "|" + str);
|
||||
}
|
||||
}
|
29
src/acr/browser/lightning/Reading/SCache.java
Normal file
29
src/acr/browser/lightning/Reading/SCache.java
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright 2011 Peter Karich
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public interface SCache {
|
||||
|
||||
JResult get(String url);
|
||||
|
||||
void put(String url, JResult res);
|
||||
|
||||
int getSize();
|
||||
}
|
480
src/acr/browser/lightning/Reading/SHelper.java
Normal file
480
src/acr/browser/lightning/Reading/SHelper.java
Normal file
@ -0,0 +1,480 @@
|
||||
/*
|
||||
* Copyright 2011 Peter Karich
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package acr.browser.lightning.Reading;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.CookieHandler;
|
||||
import java.net.CookieManager;
|
||||
import java.net.CookiePolicy;
|
||||
import java.net.URLDecoder;
|
||||
import java.net.URLEncoder;
|
||||
import java.security.SecureRandom;
|
||||
import java.security.cert.CertificateException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.net.ssl.KeyManager;
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Peter Karich
|
||||
*/
|
||||
public class SHelper {
|
||||
|
||||
public static final String UTF8 = "UTF-8";
|
||||
private static final Pattern SPACE = Pattern.compile(" ");
|
||||
|
||||
public static String replaceSpaces(String url) {
|
||||
if (!url.isEmpty()) {
|
||||
url = url.trim();
|
||||
if (url.contains(" ")) {
|
||||
Matcher spaces = SPACE.matcher(url);
|
||||
url = spaces.replaceAll("%20");
|
||||
}
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
public static int count(String str, String substring) {
|
||||
int c = 0;
|
||||
int index1 = str.indexOf(substring);
|
||||
if (index1 >= 0) {
|
||||
c++;
|
||||
c += count(str.substring(index1 + substring.length()), substring);
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* remove more than two spaces or newlines
|
||||
*/
|
||||
public static String innerTrim(String str) {
|
||||
if (str.isEmpty())
|
||||
return "";
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean previousSpace = false;
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
char c = str.charAt(i);
|
||||
if (c == ' ' || (int) c == 9 || c == '\n') {
|
||||
previousSpace = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (previousSpace)
|
||||
sb.append(' ');
|
||||
|
||||
previousSpace = false;
|
||||
sb.append(c);
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts reading the encoding from the first valid character until an
|
||||
* invalid encoding character occurs.
|
||||
*/
|
||||
public static String encodingCleanup(String str) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean startedWithCorrectString = false;
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
char c = str.charAt(i);
|
||||
if (Character.isDigit(c) || Character.isLetter(c) || c == '-' || c == '_') {
|
||||
startedWithCorrectString = true;
|
||||
sb.append(c);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (startedWithCorrectString)
|
||||
break;
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the longest substring as str1.substring(result[0], result[1]);
|
||||
*/
|
||||
public static String getLongestSubstring(String str1, String str2) {
|
||||
int res[] = longestSubstring(str1, str2);
|
||||
if (res == null || res[0] >= res[1])
|
||||
return "";
|
||||
|
||||
return str1.substring(res[0], res[1]);
|
||||
}
|
||||
|
||||
public static int[] longestSubstring(String str1, String str2) {
|
||||
if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty())
|
||||
return null;
|
||||
|
||||
// dynamic programming => save already identical length into array
|
||||
// to understand this algo simply print identical length in every entry
|
||||
// of the array
|
||||
// i+1, j+1 then reuses information from i,j
|
||||
// java initializes them already with 0
|
||||
int[][] num = new int[str1.length()][str2.length()];
|
||||
int maxlen = 0;
|
||||
int lastSubstrBegin = 0;
|
||||
int endIndex = 0;
|
||||
for (int i = 0; i < str1.length(); i++) {
|
||||
for (int j = 0; j < str2.length(); j++) {
|
||||
if (str1.charAt(i) == str2.charAt(j)) {
|
||||
if ((i == 0) || (j == 0))
|
||||
num[i][j] = 1;
|
||||
else
|
||||
num[i][j] = 1 + num[i - 1][j - 1];
|
||||
|
||||
if (num[i][j] > maxlen) {
|
||||
maxlen = num[i][j];
|
||||
// generate substring from str1 => i
|
||||
lastSubstrBegin = i - num[i][j] + 1;
|
||||
endIndex = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return new int[] { lastSubstrBegin, endIndex };
|
||||
}
|
||||
|
||||
public static String getDefaultFavicon(String url) {
|
||||
return useDomainOfFirstArg4Second(url, "/favicon.ico");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param urlForDomain
|
||||
* extract the domain from this url
|
||||
* @param path
|
||||
* this url does not have a domain
|
||||
* @return
|
||||
*/
|
||||
public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
|
||||
if (path.startsWith("http"))
|
||||
return path;
|
||||
|
||||
if ("favicon.ico".equals(path))
|
||||
path = "/favicon.ico";
|
||||
|
||||
if (path.startsWith("//")) {
|
||||
// wikipedia special case, see tests
|
||||
if (urlForDomain.startsWith("https:"))
|
||||
return "https:" + path;
|
||||
|
||||
return "http:" + path;
|
||||
} else if (path.startsWith("/"))
|
||||
return "http://" + extractHost(urlForDomain) + path;
|
||||
else if (path.startsWith("../")) {
|
||||
int slashIndex = urlForDomain.lastIndexOf("/");
|
||||
if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
|
||||
urlForDomain = urlForDomain.substring(0, slashIndex + 1);
|
||||
|
||||
return urlForDomain + path;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
public static String extractHost(String url) {
|
||||
return extractDomain(url, false);
|
||||
}
|
||||
|
||||
public static String extractDomain(String url, boolean aggressive) {
|
||||
if (url.startsWith("http://"))
|
||||
url = url.substring("http://".length());
|
||||
else if (url.startsWith("https://"))
|
||||
url = url.substring("https://".length());
|
||||
|
||||
if (aggressive) {
|
||||
if (url.startsWith("www."))
|
||||
url = url.substring("www.".length());
|
||||
|
||||
// strip mobile from start
|
||||
if (url.startsWith("m."))
|
||||
url = url.substring("m.".length());
|
||||
}
|
||||
|
||||
int slashIndex = url.indexOf("/");
|
||||
if (slashIndex > 0)
|
||||
url = url.substring(0, slashIndex);
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
public static boolean isVideoLink(String url) {
|
||||
url = extractDomain(url, true);
|
||||
return url.startsWith("youtube.com") || url.startsWith("video.yahoo.com")
|
||||
|| url.startsWith("vimeo.com") || url.startsWith("blip.tv");
|
||||
}
|
||||
|
||||
public static boolean isVideo(String url) {
|
||||
return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi")
|
||||
|| url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4")
|
||||
|| url.endsWith(".flv") || url.endsWith(".wmv");
|
||||
}
|
||||
|
||||
public static boolean isAudio(String url) {
|
||||
return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u")
|
||||
|| url.endsWith(".wav");
|
||||
}
|
||||
|
||||
public static boolean isDoc(String url) {
|
||||
return url.endsWith(".pdf") || url.endsWith(".ppt") || url.endsWith(".doc")
|
||||
|| url.endsWith(".swf") || url.endsWith(".rtf") || url.endsWith(".xls");
|
||||
}
|
||||
|
||||
public static boolean isPackage(String url) {
|
||||
return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip")
|
||||
|| url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm")
|
||||
|| url.endsWith(".7z");
|
||||
}
|
||||
|
||||
public static boolean isApp(String url) {
|
||||
return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat")
|
||||
|| url.endsWith(".dmg");
|
||||
}
|
||||
|
||||
public static boolean isImage(String url) {
|
||||
return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif")
|
||||
|| url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico")
|
||||
|| url.endsWith(".eps");
|
||||
}
|
||||
|
||||
/**
|
||||
* @see http
|
||||
* ://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se
|
||||
*/
|
||||
public static void enableCookieMgmt() {
|
||||
CookieManager manager = new CookieManager();
|
||||
manager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
|
||||
CookieHandler.setDefault(manager);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see http
|
||||
* ://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java
|
||||
* -urlconnection
|
||||
*/
|
||||
public static void enableUserAgentOverwrite() {
|
||||
System.setProperty("http.agent", "");
|
||||
}
|
||||
|
||||
public static String getUrlFromUglyGoogleRedirect(String url) {
|
||||
if (url.startsWith("http://www.google.com/url?")) {
|
||||
url = url.substring("http://www.google.com/url?".length());
|
||||
String arr[] = urlDecode(url).split("\\&");
|
||||
if (arr != null)
|
||||
for (String str : arr) {
|
||||
if (str.startsWith("q="))
|
||||
return str.substring("q=".length());
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String getUrlFromUglyFacebookRedirect(String url) {
|
||||
if (url.startsWith("http://www.facebook.com/l.php?u=")) {
|
||||
url = url.substring("http://www.facebook.com/l.php?u=".length());
|
||||
return urlDecode(url);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String urlEncode(String str) {
|
||||
try {
|
||||
return URLEncoder.encode(str, UTF8);
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
return str;
|
||||
}
|
||||
}
|
||||
|
||||
public static String urlDecode(String str) {
|
||||
try {
|
||||
return URLDecoder.decode(str, UTF8);
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
return str;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Popular sites uses the #! to indicate the importance of the following
|
||||
* chars. Ugly but true. Such as: facebook, twitter, gizmodo, ...
|
||||
*/
|
||||
public static String removeHashbang(String url) {
|
||||
return url.replaceFirst("#!", "");
|
||||
}
|
||||
|
||||
public static String printNode(Element root) {
|
||||
return printNode(root, 0);
|
||||
}
|
||||
|
||||
public static String printNode(Element root, int indentation) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < indentation; i++) {
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(root.tagName());
|
||||
sb.append(":");
|
||||
sb.append(root.ownText());
|
||||
sb.append("\n");
|
||||
for (Element el : root.children()) {
|
||||
sb.append(printNode(el, indentation + 1));
|
||||
sb.append("\n");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static String estimateDate(String url) {
|
||||
int index = url.indexOf("://");
|
||||
if (index > 0)
|
||||
url = url.substring(index + 3);
|
||||
|
||||
int year = -1;
|
||||
int yearCounter = -1;
|
||||
int month = -1;
|
||||
int monthCounter = -1;
|
||||
int day = -1;
|
||||
String strs[] = url.split("/");
|
||||
for (int counter = 0; counter < strs.length; counter++) {
|
||||
String str = strs[counter];
|
||||
if (str.length() == 4) {
|
||||
try {
|
||||
year = Integer.parseInt(str);
|
||||
} catch (Exception ex) {
|
||||
continue;
|
||||
}
|
||||
if (year < 1970 || year > 3000) {
|
||||
year = -1;
|
||||
continue;
|
||||
}
|
||||
yearCounter = counter;
|
||||
} else if (str.length() == 2) {
|
||||
if (monthCounter < 0 && counter == yearCounter + 1) {
|
||||
try {
|
||||
month = Integer.parseInt(str);
|
||||
} catch (Exception ex) {
|
||||
continue;
|
||||
}
|
||||
if (month < 1 || month > 12) {
|
||||
month = -1;
|
||||
continue;
|
||||
}
|
||||
monthCounter = counter;
|
||||
} else if (counter == monthCounter + 1) {
|
||||
try {
|
||||
day = Integer.parseInt(str);
|
||||
} catch (Exception ex) {
|
||||
}
|
||||
if (day < 1 || day > 31) {
|
||||
day = -1;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (year < 0)
|
||||
return null;
|
||||
|
||||
StringBuilder str = new StringBuilder();
|
||||
str.append(year);
|
||||
if (month < 1)
|
||||
return str.toString();
|
||||
|
||||
str.append('/');
|
||||
if (month < 10)
|
||||
str.append('0');
|
||||
str.append(month);
|
||||
if (day < 1)
|
||||
return str.toString();
|
||||
|
||||
str.append('/');
|
||||
if (day < 10)
|
||||
str.append('0');
|
||||
str.append(day);
|
||||
return str.toString();
|
||||
}
|
||||
|
||||
public static String completeDate(String dateStr) {
|
||||
if (dateStr == null)
|
||||
return null;
|
||||
|
||||
int index = dateStr.indexOf('/');
|
||||
if (index > 0) {
|
||||
index = dateStr.indexOf('/', index + 1);
|
||||
if (index > 0)
|
||||
return dateStr;
|
||||
else
|
||||
return dateStr + "/01";
|
||||
}
|
||||
return dateStr + "/01/01";
|
||||
}
|
||||
|
||||
/**
|
||||
* keep in mind: simpleDateFormatter is not thread safe! call completeDate
|
||||
* before applying this formatter.
|
||||
*/
|
||||
public static SimpleDateFormat createDateFormatter() {
|
||||
return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault());
|
||||
}
|
||||
|
||||
// with the help of
|
||||
// http://stackoverflow.com/questions/1828775/httpclient-and-ssl
|
||||
public static void enableAnySSL() {
|
||||
try {
|
||||
SSLContext ctx = SSLContext.getInstance("TLS");
|
||||
ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() },
|
||||
new SecureRandom());
|
||||
SSLContext.setDefault(ctx);
|
||||
} catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static class DefaultTrustManager implements X509TrustManager {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] arg0, String arg1)
|
||||
throws CertificateException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] arg0, String arg1)
|
||||
throws CertificateException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static int countLetters(String str) {
|
||||
int len = str.length();
|
||||
int chars = 0;
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (Character.isLetter(str.charAt(i)))
|
||||
chars++;
|
||||
}
|
||||
return chars;
|
||||
}
|
||||
}
|
153
src/acr/browser/lightning/ReadingActivity.java
Normal file
153
src/acr/browser/lightning/ReadingActivity.java
Normal file
@ -0,0 +1,153 @@
|
||||
package acr.browser.lightning;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import acr.browser.lightning.Reading.HtmlFetcher;
|
||||
import acr.browser.lightning.Reading.JResult;
|
||||
import android.animation.ObjectAnimator;
|
||||
import android.app.ProgressDialog;
|
||||
import android.content.Context;
|
||||
import android.content.Intent;
|
||||
import android.os.AsyncTask;
|
||||
import android.os.Bundle;
|
||||
import android.support.v7.app.ActionBarActivity;
|
||||
import android.support.v7.widget.Toolbar;
|
||||
import android.view.MenuItem;
|
||||
import android.view.View;
|
||||
import android.widget.TextView;
|
||||
|
||||
public class ReadingActivity extends ActionBarActivity {
|
||||
|
||||
private TextView mTitle;
|
||||
private TextView mBody;
|
||||
|
||||
@Override
|
||||
protected void onCreate(Bundle savedInstanceState) {
|
||||
super.onCreate(savedInstanceState);
|
||||
setContentView(R.layout.reading_view);
|
||||
|
||||
Toolbar toolbar = (Toolbar) findViewById(R.id.toolbar);
|
||||
setSupportActionBar(toolbar);
|
||||
|
||||
getSupportActionBar().setDisplayHomeAsUpEnabled(true);
|
||||
|
||||
mTitle = (TextView) findViewById(R.id.textViewTitle);
|
||||
mBody = (TextView) findViewById(R.id.textViewBody);
|
||||
|
||||
mTitle.setText(getString(R.string.untitled));
|
||||
mBody.setText(getString(R.string.loading));
|
||||
|
||||
mTitle.setVisibility(View.INVISIBLE);
|
||||
mBody.setVisibility(View.INVISIBLE);
|
||||
|
||||
Intent intent = getIntent();
|
||||
if (!loadPage(intent)) {
|
||||
setText(getString(R.string.untitled), getString(R.string.loading_failed));
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean loadPage(Intent intent) {
|
||||
if (intent == null) {
|
||||
return false;
|
||||
}
|
||||
String url = intent.getStringExtra(Constants.LOAD_READING_URL);
|
||||
if (url == null) {
|
||||
return false;
|
||||
}
|
||||
getSupportActionBar().setTitle(Utils.getDomainName(url));
|
||||
new PageLoader(this).execute(url);
|
||||
return true;
|
||||
}
|
||||
|
||||
private class PageLoader extends AsyncTask<String, Void, Void> {
|
||||
|
||||
private Context mContext;
|
||||
private ProgressDialog mProgressDialog;
|
||||
private String mTitleText;
|
||||
private List<String> mBodyText;
|
||||
|
||||
public PageLoader(Context context) {
|
||||
mContext = context;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onPreExecute() {
|
||||
super.onPreExecute();
|
||||
mProgressDialog = new ProgressDialog(mContext);
|
||||
mProgressDialog.setProgressStyle(ProgressDialog.STYLE_SPINNER);
|
||||
mProgressDialog.setCancelable(false);
|
||||
mProgressDialog.setIndeterminate(true);
|
||||
mProgressDialog.setMessage(mContext.getString(R.string.loading));
|
||||
mProgressDialog.show();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Void doInBackground(String... params) {
|
||||
|
||||
HtmlFetcher fetcher = new HtmlFetcher();
|
||||
try {
|
||||
JResult result = fetcher.fetchAndExtract(params[0], 5000, true);
|
||||
mTitleText = result.getTitle();
|
||||
mBodyText = result.getTextList();
|
||||
} catch (Exception e) {
|
||||
mTitleText = "";
|
||||
mBodyText = new ArrayList<>();
|
||||
e.printStackTrace();
|
||||
} catch (OutOfMemoryError e) {
|
||||
System.gc();
|
||||
mTitleText = "";
|
||||
mBodyText = new ArrayList<>();
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onPostExecute(Void result) {
|
||||
mProgressDialog.dismiss();
|
||||
if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
|
||||
setText(getString(R.string.untitled), getString(R.string.loading_failed));
|
||||
} else {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (String text : mBodyText) {
|
||||
builder.append(text + "\n\n");
|
||||
}
|
||||
setText(mTitleText, builder.toString());
|
||||
}
|
||||
super.onPostExecute(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void setText(String title, String body) {
|
||||
if (mTitle.getVisibility() == View.INVISIBLE) {
|
||||
mTitle.setAlpha(0.0f);
|
||||
mTitle.setVisibility(View.VISIBLE);
|
||||
mTitle.setText(title);
|
||||
ObjectAnimator animator = ObjectAnimator.ofFloat(mTitle, "alpha", 1.0f);
|
||||
animator.setDuration(300);
|
||||
animator.start();
|
||||
} else {
|
||||
mTitle.setText(title);
|
||||
}
|
||||
|
||||
if (mBody.getVisibility() == View.INVISIBLE) {
|
||||
mBody.setAlpha(0.0f);
|
||||
mBody.setVisibility(View.VISIBLE);
|
||||
mBody.setText(body);
|
||||
ObjectAnimator animator = ObjectAnimator.ofFloat(mBody, "alpha", 1.0f);
|
||||
animator.setDuration(300);
|
||||
animator.start();
|
||||
} else {
|
||||
mBody.setText(body);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean onOptionsItemSelected(MenuItem item) {
|
||||
finish();
|
||||
return super.onOptionsItemSelected(item);
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user