Improved reading mode thanks to changes from snacktory fork by skyshard

This commit is contained in:
Anthony Restaino 2015-08-25 20:59:23 -04:00
parent 04c9f75a90
commit 367c62bd39
6 changed files with 1051 additions and 357 deletions

View File

@ -139,7 +139,7 @@ public class ReadingActivity extends AppCompatActivity {
private final Activity mActivity; private final Activity mActivity;
private String mTitleText; private String mTitleText;
private List<String> mBodyText; private String mBodyText;
public PageLoader(Activity activity) { public PageLoader(Activity activity) {
mActivity = activity; mActivity = activity;
@ -163,15 +163,15 @@ public class ReadingActivity extends AppCompatActivity {
try { try {
JResult result = fetcher.fetchAndExtract(params[0], 2500, true); JResult result = fetcher.fetchAndExtract(params[0], 2500, true);
mTitleText = result.getTitle(); mTitleText = result.getTitle();
mBodyText = result.getTextList(); mBodyText = result.getText();
} catch (Exception e) { } catch (Exception e) {
mTitleText = ""; mTitleText = "";
mBodyText = new ArrayList<>(); mBodyText = "";
e.printStackTrace(); e.printStackTrace();
} catch (OutOfMemoryError e) { } catch (OutOfMemoryError e) {
System.gc(); System.gc();
mTitleText = ""; mTitleText = "";
mBodyText = new ArrayList<>(); mBodyText = "";
e.printStackTrace(); e.printStackTrace();
} }
return null; return null;
@ -186,11 +186,7 @@ public class ReadingActivity extends AppCompatActivity {
if (mTitleText.isEmpty() || mBodyText.isEmpty()) { if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
setText(getString(R.string.untitled), getString(R.string.loading_failed)); setText(getString(R.string.untitled), getString(R.string.loading_failed));
} else { } else {
StringBuilder builder = new StringBuilder(); setText(mTitleText, mBodyText);
for (String text : mBodyText) {
builder.append(text).append("\n\n");
}
setText(mTitleText, builder.toString());
} }
super.onPostExecute(result); super.onPostExecute(result);
} }

View File

@ -22,22 +22,19 @@ import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URL; import java.net.URL;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater; import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream; import java.util.zip.InflaterInputStream;
import acr.browser.lightning.constant.Constants;
import android.util.Log;
/** /**
* Class to fetch articles. This class is thread safe. * Class to fetch articles. This class is thread safe.
* *
* @author Peter Karich * @author Peter Karich
*/ */
public class HtmlFetcher { public class HtmlFetcher {
@ -64,7 +61,7 @@ public class HtmlFetcher {
else else
existing.add(domainStr); existing.add(domainStr);
String html = new HtmlFetcher().fetchAsString(url, 20000); String html = new HtmlFetcher().fetchAsString(url, 2000);
String outFile = domainStr + counterStr + ".html"; String outFile = domainStr + counterStr + ".html";
BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
writer.write(html); writer.write(html);
@ -73,8 +70,8 @@ public class HtmlFetcher {
reader.close(); reader.close();
} }
private String referrer = "https://github.com/karussell/snacktory"; private String referrer = "http://jetsli.de/crawler";
private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ')'; private String userAgent = "Mozilla/5.0 (compatible; Jetslide; +" + referrer + ')';
private String cacheControl = "max-age=0"; private String cacheControl = "max-age=0";
private String language = "en-us"; private String language = "en-us";
private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
@ -83,7 +80,7 @@ public class HtmlFetcher {
private final AtomicInteger cacheCounter = new AtomicInteger(0); private final AtomicInteger cacheCounter = new AtomicInteger(0);
private int maxTextLength = -1; private int maxTextLength = -1;
private ArticleTextExtractor extractor = new ArticleTextExtractor(); private ArticleTextExtractor extractor = new ArticleTextExtractor();
private final Set<String> furtherResolveNecessary = new LinkedHashSet<String>() { private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() {
{ {
add("bit.ly"); add("bit.ly");
add("cli.gs"); add("cli.gs");
@ -202,6 +199,12 @@ public class HtmlFetcher {
} }
public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception {
return fetchAndExtract(url, timeout, resolve, 0, false);
}
// main workhorse to call externally
public JResult fetchAndExtract(String url, int timeout, boolean resolve,
int maxContentSize, boolean forceReload) throws Exception {
String originalUrl = url; String originalUrl = url;
url = SHelper.removeHashbang(url); url = SHelper.removeHashbang(url);
String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url);
@ -219,9 +222,8 @@ public class HtmlFetcher {
if (res != null) if (res != null)
return res; return res;
String resUrl = getResolvedUrl(url, timeout); String resUrl = getResolvedUrl(url, timeout, 0);
if (resUrl.isEmpty()) { if (resUrl.isEmpty()) {
Log.d(Constants.TAG, "resolved url is empty. Url is: " + url);
JResult result = new JResult(); JResult result = new JResult();
if (cache != null) if (cache != null)
@ -229,10 +231,9 @@ public class HtmlFetcher {
return result.setUrl(url); return result.setUrl(url);
} }
// if resolved url is longer then use it! // if resolved url is different then use it!
if (resUrl.trim().length() > url.length()) { if (!resUrl.equals(url)) {
// this is necessary e.g. for some homebaken url resolvers which // this is necessary e.g. for some homebaken url resolvers which return
// return
// the resolved url relative to url! // the resolved url relative to url!
url = SHelper.useDomainOfFirstArg4Second(url, resUrl); url = SHelper.useDomainOfFirstArg4Second(url, resUrl);
} }
@ -244,20 +245,18 @@ public class HtmlFetcher {
return res; return res;
JResult result = new JResult(); JResult result = new JResult();
// or should we use? <link rel="canonical" // or should we use? <link rel="canonical" href="http://www.N24.de/news/newsitem_6797232.html"/>
// href="http://www.N24.de/news/newsitem_6797232.html"/>
result.setUrl(url); result.setUrl(url);
result.setOriginalUrl(originalUrl); result.setOriginalUrl(originalUrl);
result.setDate(SHelper.estimateDate(url));
// Immediately put the url into the cache as extracting content takes // Immediately put the url into the cache as extracting content takes time.
// time.
if (cache != null) { if (cache != null) {
cache.put(originalUrl, result); cache.put(originalUrl, result);
cache.put(url, result); cache.put(url, result);
} }
String lowerUrl = url.toLowerCase(Locale.getDefault()); // extract content to the extent appropriate for content type
String lowerUrl = url.toLowerCase();
if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) {
// skip // skip
} else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) {
@ -265,16 +264,30 @@ public class HtmlFetcher {
} else if (SHelper.isImage(lowerUrl)) { } else if (SHelper.isImage(lowerUrl)) {
result.setImageUrl(url); result.setImageUrl(url);
} else { } else {
extractor.extractContent(result, fetchAsString(url, timeout)); try {
String urlToDownload = url;
if (forceReload) {
urlToDownload = getURLtoBreakCache(url);
}
extractor.extractContent(result, fetchAsString(urlToDownload, timeout), maxContentSize);
} catch (IOException io) {
// do nothing
}
if (result.getFaviconUrl().isEmpty()) if (result.getFaviconUrl().isEmpty())
result.setFaviconUrl(SHelper.getDefaultFavicon(url)); result.setFaviconUrl(SHelper.getDefaultFavicon(url));
// some links are relative to root and do not include the domain of // some links are relative to root and do not include the domain of the url :(
// the url :( if (!result.getFaviconUrl().isEmpty())
result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); result.setFaviconUrl(fixUrl(url, result.getFaviconUrl()));
result.setImageUrl(fixUrl(url, result.getImageUrl()));
result.setVideoUrl(fixUrl(url, result.getVideoUrl())); if (!result.getImageUrl().isEmpty())
result.setRssUrl(fixUrl(url, result.getRssUrl())); result.setImageUrl(fixUrl(url, result.getImageUrl()));
if (!result.getVideoUrl().isEmpty())
result.setVideoUrl(fixUrl(url, result.getVideoUrl()));
if (!result.getRssUrl().isEmpty())
result.setRssUrl(fixUrl(url, result.getRssUrl()));
} }
result.setText(lessText(result.getText())); result.setText(lessText(result.getText()));
synchronized (result) { synchronized (result) {
@ -283,6 +296,20 @@ public class HtmlFetcher {
return result; return result;
} }
/**
 * Ugly hack to break free from any cached versions, a few URLs required this.
 * <p>
 * Appends a dummy {@code 1} query parameter to {@code url}:
 * <ul>
 * <li>no query at all: {@code ?1} is appended,</li>
 * <li>an empty query (the URL ends in a bare {@code ?}): only {@code 1} is appended,</li>
 * <li>an existing non-empty query: {@code &1} is appended.</li>
 * </ul>
 * A {@code #fragment}, if present, is kept at the very end so the marker
 * becomes part of the query rather than of the fragment.
 *
 * @param url the absolute URL to modify
 * @return the cache-breaking variant of {@code url}, or {@code url} itself
 * when it cannot be parsed as a URL
 */
public static String getURLtoBreakCache(String url) {
    try {
        String query = new URL(url).getQuery();

        // Split off the fragment; anything appended after '#' would not be
        // part of the query.
        int hashIndex = url.indexOf('#');
        String base = hashIndex >= 0 ? url.substring(0, hashIndex) : url;
        String fragment = hashIndex >= 0 ? url.substring(hashIndex) : "";

        String marker;
        if (query == null) {
            // No '?' in the URL yet, so start a query string.
            marker = "?1";
        } else if (query.isEmpty()) {
            // The URL already ends with a bare '?'.
            marker = "1";
        } else {
            marker = "&1";
        }
        return base + marker + fragment;
    } catch (MalformedURLException e) {
        // Not a parseable URL: hand it back untouched.
        return url;
    }
}
public String lessText(String text) { public String lessText(String text) {
if (text == null) if (text == null)
return ""; return "";
@ -297,13 +324,14 @@ public class HtmlFetcher {
return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); return SHelper.useDomainOfFirstArg4Second(url, urlOrPath);
} }
public String fetchAsString(String urlAsString, int timeout) throws public String fetchAsString(String urlAsString, int timeout)
IOException { throws MalformedURLException, IOException {
return fetchAsString(urlAsString, timeout, true); return fetchAsString(urlAsString, timeout, true);
} }
// main routine to get raw webpage content
public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions)
throws IOException { throws MalformedURLException, IOException {
HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions); HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions);
hConn.setInstanceFollowRedirects(true); hConn.setInstanceFollowRedirects(true);
String encoding = hConn.getContentEncoding(); String encoding = hConn.getContentEncoding();
@ -317,27 +345,23 @@ public class HtmlFetcher {
} }
String enc = Converter.extractEncoding(hConn.getContentType()); String enc = Converter.extractEncoding(hConn.getContentType());
String res = createConverter(urlAsString).streamToString(is, enc); return createConverter(urlAsString).streamToString(is, enc);
Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString);
return res;
} }
public Converter createConverter(String url) { public static Converter createConverter(String url) {
return new Converter(url); return new Converter(url);
} }
/** /**
* On some devices we have to hack: * On some devices we have to hack:
* http://developers.sun.com/mobility/reference * http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html
* /techart/design_guidelines/http_redirection.html *
* * @param timeout Sets a specified timeout value, in milliseconds
* @param timeout
* Sets a specified timeout value, in milliseconds
* @return the resolved url if any. Or an empty string if it couldn't resolve the url * @return the resolved url if any. Or an empty string if it couldn't resolve the url
* (within the specified time) or the same url if response code is * (within the specified time) or the same url if response code is OK
* OK
*/ */
public String getResolvedUrl(String urlAsString, int timeout) { public String getResolvedUrl(String urlAsString, int timeout,
int num_redirects) {
String newUrl = null; String newUrl = null;
int responseCode = -1; int responseCode = -1;
try { try {
@ -354,28 +378,32 @@ public class HtmlFetcher {
return urlAsString; return urlAsString;
newUrl = hConn.getHeaderField("Location"); newUrl = hConn.getHeaderField("Location");
if (responseCode / 100 == 3 && newUrl != null) { // Note that the max recursion level is 5.
if (responseCode / 100 == 3 && newUrl != null && num_redirects < 5) {
newUrl = newUrl.replaceAll(" ", "+"); newUrl = newUrl.replaceAll(" ", "+");
// some services use (non-standard) utf8 in their location // some services use (non-standard) utf8 in their location header
// header
if (urlAsString.startsWith("http://bit.ly") if (urlAsString.startsWith("http://bit.ly")
|| urlAsString.startsWith("http://is.gd")) || urlAsString.startsWith("http://is.gd"))
newUrl = encodeUriFromHeader(newUrl); newUrl = encodeUriFromHeader(newUrl);
// fix problems if shortened twice. as it is often the case // AP: This code is no longer needed, instead we always follow
// after twitters' t.co bullshit // multiple redirects.
if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) //
newUrl = getResolvedUrl(newUrl, timeout); // fix problems if shortened twice. as it is often the case after twitters' t.co bullshit
//if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
// newUrl = getResolvedUrl(newUrl, timeout);
// Add support for URLs with multiple levels of redirection,
// call getResolvedUrl until there is no more redirects or a
// max number of redirects is reached.
newUrl = SHelper.useDomainOfFirstArg4Second(urlAsString, newUrl);
newUrl = getResolvedUrl(newUrl, timeout, num_redirects + 1);
return newUrl; return newUrl;
} else } else
return urlAsString; return urlAsString;
} catch (Exception ex) { } catch (Exception ex) {
Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage());
return ""; return "";
} finally {
Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl);
} }
} }
@ -400,9 +428,9 @@ public class HtmlFetcher {
} }
protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout, protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout,
boolean includeSomeGooseOptions) throws IOException { boolean includeSomeGooseOptions) throws MalformedURLException, IOException {
URL url = new URL(urlAsStr); URL url = new URL(urlAsStr);
// using proxy may increase latency //using proxy may increase latency
HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY); HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
hConn.setRequestProperty("User-Agent", userAgent); hConn.setRequestProperty("User-Agent", userAgent);
hConn.setRequestProperty("Accept", accept); hConn.setRequestProperty("Accept", accept);
@ -415,8 +443,7 @@ public class HtmlFetcher {
hConn.setRequestProperty("Cache-Control", cacheControl); hConn.setRequestProperty("Cache-Control", cacheControl);
} }
// suggest respond to be gzipped or deflated (which is just another // suggest respond to be gzipped or deflated (which is just another compression)
// compression)
// http://stackoverflow.com/q/3932117 // http://stackoverflow.com/q/3932117
hConn.setRequestProperty("Accept-Encoding", "gzip, deflate"); hConn.setRequestProperty("Accept-Encoding", "gzip, deflate");
hConn.setConnectTimeout(timeout); hConn.setConnectTimeout(timeout);
@ -424,14 +451,12 @@ public class HtmlFetcher {
return hConn; return hConn;
} }
private JResult getFromCache(String url, String originalUrl) throws Exception { private JResult getFromCache(String url, String originalUrl) {
if (cache != null) { if (cache != null) {
JResult res = cache.get(url); JResult res = cache.get(url);
if (res != null) { if (res != null) {
// e.g. the cache returned a shortened url as original url now // e.g. the cache returned a shortened url as original url now we want to store the
// we want to store the // current original url! Also it can be that the cache response to url but the JResult
// current original url! Also it can be that the cache response
// to url but the JResult
// does not contain it so overwrite it: // does not contain it so overwrite it:
res.setUrl(url); res.setUrl(url);
res.setOriginalUrl(originalUrl); res.setOriginalUrl(originalUrl);
@ -441,4 +466,4 @@ public class HtmlFetcher {
} }
return null; return null;
} }
} }

View File

@ -16,14 +16,18 @@
package acr.browser.lightning.reading; package acr.browser.lightning.reading;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/** /**
* Parsed result from web page containing important title, text and image. * Parsed result from web page containing important title, text and image.
* *
* @author Peter Karich * @author Peter Karich
*/ */
public class JResult implements Serializable { public class JResult implements Serializable {
@ -38,10 +42,15 @@ public class JResult implements Serializable {
private String text; private String text;
private String faviconUrl; private String faviconUrl;
private String description; private String description;
private String dateString; private String authorName;
private List<String> textList; private String authorDescription;
private Date date;
private Collection<String> keywords; private Collection<String> keywords;
private List<ImageResult> images = null; private List<ImageResult> images = null;
private List<Map<String, String>> links = new ArrayList<>();
private String type;
private String sitename;
private String language;
public JResult() { public JResult() {
} }
@ -108,6 +117,28 @@ public class JResult implements Serializable {
return this; return this;
} }
/**
 * @return the author name, or an empty string when none has been set
 */
public String getAuthorName() {
    return authorName == null ? "" : authorName;
}
/**
 * Stores the author name.
 *
 * @param authorName the author name to store
 * @return this instance, so calls can be chained
 */
public JResult setAuthorName(String authorName) {
this.authorName = authorName;
return this;
}
/**
 * @return the author description, or an empty string when none has been set
 */
public String getAuthorDescription() {
    return authorDescription == null ? "" : authorDescription;
}
/**
 * Stores the author description.
 *
 * @param authorDescription the author description to store
 * @return this instance, so calls can be chained
 */
public JResult setAuthorDescription(String authorDescription) {
this.authorDescription = authorDescription;
return this;
}
public String getImageUrl() { public String getImageUrl() {
if (imageUrl == null) if (imageUrl == null)
return ""; return "";
@ -131,17 +162,6 @@ public class JResult implements Serializable {
return this; return this;
} }
public List<String> getTextList() {
if (this.textList == null)
return new ArrayList<>();
return this.textList;
}
public JResult setTextList(List<String> textList) {
this.textList = textList;
return this;
}
public String getTitle() { public String getTitle() {
if (title == null) if (title == null)
return ""; return "";
@ -164,8 +184,8 @@ public class JResult implements Serializable {
return this; return this;
} }
public JResult setDate(String date) { public JResult setDate(Date date) {
this.dateString = date; this.date = date;
return this; return this;
} }
@ -180,8 +200,8 @@ public class JResult implements Serializable {
/** /**
* @return get date from url or guessed from text * @return get date from url or guessed from text
*/ */
public String getDate() { public Date getDate() {
return dateString; return date;
} }
/** /**
@ -209,8 +229,46 @@ public class JResult implements Serializable {
this.images = images; this.images = images;
} }
/**
 * Records a link as a map with the keys {@code "url"}, {@code "text"} and
 * {@code "offset"} and appends it to the list of links.
 *
 * @param url  the link target
 * @param text the link text
 * @param pos  stored under {@code "offset"} in its string form
 *             ({@code String.valueOf(pos)}, so {@code null} becomes {@code "null"})
 */
public void addLink(String url, String text, Integer pos) {
    // Parameterized map instead of the raw type, matching List<Map<String, String>>.
    Map<String, String> link = new HashMap<>();
    link.put("url", url);
    link.put("text", text);
    link.put("offset", String.valueOf(pos));
    links.add(link);
}
/**
 * @return the recorded links, or an empty list when the backing list is {@code null}
 */
public List<Map<String, String>> getLinks() {
    if (links != null) {
        return links;
    }
    return Collections.emptyList();
}
/**
 * @return the stored type; returned as-is, so {@code null} when it was never set
 */
public String getType() {
return type;
}
/**
 * Stores the type.
 *
 * @param type the value to store
 */
public void setType(String type) {
this.type = type;
}
/**
 * @return the stored site name; returned as-is, so {@code null} when it was never set
 */
public String getSitename() {
return sitename;
}
/**
 * Stores the site name.
 *
 * @param sitename the value to store
 */
public void setSitename(String sitename) {
this.sitename = sitename;
}
/**
 * @return the stored language; returned as-is, so {@code null} when it was never set
 */
public String getLanguage() {
return language;
}
/**
 * Stores the language.
 *
 * @param language the value to store
 */
public void setLanguage(String language) {
this.language = language;
}
@Override @Override
public String toString() { public String toString() {
return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text; return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text;
} }
} }

View File

@ -4,40 +4,46 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode; import org.jsoup.nodes.TextNode;
/** /**
* @author goose | jim * @author goose | jim
* @author karussell * @author karussell
* * <p/>
* this class will be responsible for taking our top node and stripping * this class will be responsible for taking our top node and stripping out junk
* out junk we don't want and getting it ready for how we want it * we don't want and getting it ready for how we want it presented to the user
* presented to the user
*/ */
public class OutputFormatter { public class OutputFormatter {
public static final int MIN_PARAGRAPH_TEXT = 50; private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph
private static final int MIN_PARAGRAPH_TEXT = 30; // Min size of any other paragraphs
private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i"); private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden"); private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
protected final int minParagraphText; private final int minFirstParagraphText;
protected final List<String> nodesToReplace; private final int minParagraphText;
protected String nodesToKeepCssSelector = "p"; private final List<String> nodesToReplace;
private String nodesToKeepCssSelector = "p, ol";
public OutputFormatter() { public OutputFormatter() {
this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE); this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
} }
public OutputFormatter(int minParagraphText) { public OutputFormatter(int minParagraphText) {
this(minParagraphText, NODES_TO_REPLACE); this(minParagraphText, minParagraphText, NODES_TO_REPLACE);
} }
public OutputFormatter(int minParagraphText, List<String> nodesToReplace) { public OutputFormatter(int minFirstParagraphText, int minParagraphText) {
this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE);
}
public OutputFormatter(int minFirstParagraphText, int minParagraphText,
List<String> nodesToReplace) {
this.minFirstParagraphText = minFirstParagraphText;
this.minParagraphText = minParagraphText; this.minParagraphText = minParagraphText;
this.nodesToReplace = nodesToReplace; this.nodesToReplace = nodesToReplace;
} }
@ -53,36 +59,34 @@ public class OutputFormatter {
* takes an element and turns the P tags into \n\n * takes an element and turns the P tags into \n\n
*/ */
public String getFormattedText(Element topNode) { public String getFormattedText(Element topNode) {
setParagraphIndex(topNode, nodesToKeepCssSelector);
removeNodesWithNegativeScores(topNode); removeNodesWithNegativeScores(topNode);
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
append(topNode, sb, nodesToKeepCssSelector); int countOfP = append(topNode, sb, nodesToKeepCssSelector);
String str = SHelper.innerTrim(sb.toString()); String str = SHelper.innerTrim(sb.toString());
if (str.length() > 100)
int topNodeLength = topNode.text().length();
if (topNodeLength == 0) {
topNodeLength = 1;
}
boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25);
if (str.length() > 100 && countOfP > 0 && !lowTextRatio)
return str; return str;
// no subelements // no subelements
if (str.isEmpty() || !topNode.text().isEmpty() if (str.isEmpty() || (!topNode.text().isEmpty()
&& str.length() <= topNode.ownText().length()) && str.length() <= topNode.ownText().length())
|| countOfP == 0 || lowTextRatio) {
str = topNode.text(); str = topNode.text();
}
// if jsoup failed to parse the whole html now parse this smaller // if jsoup failed to parse the whole html now parse this smaller
// snippet again to avoid html tags disturbing our text: // snippet again to avoid html tags disturbing our text:
return Jsoup.parse(str).text(); return Jsoup.parse(str).text();
} }
/**
* Takes an element and returns a list of texts extracted from the P tags
*/
public List<String> getTextList(Element topNode) {
List<String> texts = new ArrayList<>();
for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
if (element.hasText()) {
texts.add(element.text());
}
}
return texts;
}
/** /**
* If there are elements inside our top node that have a negative gravity * If there are elements inside our top node that have a negative gravity
* score remove them * score remove them
@ -90,15 +94,20 @@ public class OutputFormatter {
protected void removeNodesWithNegativeScores(Element topNode) { protected void removeNodesWithNegativeScores(Element topNode) {
Elements gravityItems = topNode.select("*[gravityScore]"); Elements gravityItems = topNode.select("*[gravityScore]");
for (Element item : gravityItems) { for (Element item : gravityItems) {
int score = Integer.parseInt(item.attr("gravityScore")); int score = getScore(item);
if (score < 0 || item.text().length() < minParagraphText) int paragraphIndex = getParagraphIndex(item);
if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) {
item.remove(); item.remove();
}
} }
} }
protected void append(Element node, StringBuilder sb, String tagName) { protected int append(Element node, StringBuilder sb, String tagName) {
int countOfP = 0; // Number of P elements in the article
int paragraphWithTextIndex = 0;
// is select more costly then getElementsByTag? // is select more costly then getElementsByTag?
MAIN: for (Element e : node.select(tagName)) { MAIN:
for (Element e : node.select(tagName)) {
Element tmpEl = e; Element tmpEl = e;
// check all elements until 'node' // check all elements until 'node'
while (tmpEl != null && !tmpEl.equals(node)) { while (tmpEl != null && !tmpEl.equals(node)) {
@ -108,18 +117,56 @@ public class OutputFormatter {
} }
String text = node2Text(e); String text = node2Text(e);
if (text.isEmpty() || text.length() < minParagraphText if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex)
|| text.length() > SHelper.countLetters(text) * 2) || text.length() > SHelper.countLetters(text) * 2) {
continue; continue;
}
if (e.tagName().equals("p")) {
countOfP++;
}
sb.append(text); sb.append(text);
sb.append("\n\n"); sb.append("\n\n");
paragraphWithTextIndex += 1;
}
return countOfP;
}
/**
 * Tags every element under {@code node} that matches {@code tagName} with a
 * {@code paragraphIndex} attribute holding its zero-based position in the
 * selection result.
 *
 * @param node    the element whose matching descendants are numbered
 * @param tagName the CSS selector passed to {@code select}
 */
protected static void setParagraphIndex(Element node, String tagName) {
    Elements matches = node.select(tagName);
    for (int position = 0; position < matches.size(); position++) {
        matches.get(position).attr("paragraphIndex", Integer.toString(position));
    }
}
/**
 * Picks the minimum text length that applies to a paragraph.
 *
 * @param paragraphIndex the paragraph position; any value below 1 is treated
 *                       as the first paragraph
 * @return {@code minFirstParagraphText} for indexes below 1, otherwise
 * {@code minParagraphText}
 */
protected int getMinParagraph(int paragraphIndex) {
    return paragraphIndex < 1 ? minFirstParagraphText : minParagraphText;
}
/**
 * Reads the {@code paragraphIndex} attribute of an element.
 *
 * @param el the element to inspect
 * @return the attribute parsed as an int, or -1 when it is not a valid integer
 */
protected static int getParagraphIndex(Element el) {
    String rawIndex = el.attr("paragraphIndex");
    try {
        return Integer.parseInt(rawIndex);
    } catch (NumberFormatException ex) {
        return -1;
    }
}
/**
 * Reads the {@code gravityScore} attribute of an element.
 *
 * @param el the element to inspect
 * @return the attribute parsed as an int, or 0 when reading or parsing it
 * throws any exception
 */
protected static int getScore(Element el) {
try {
return Integer.parseInt(el.attr("gravityScore"));
} catch (Exception ex) {
// Deliberately broad: any failure is treated as a neutral score of 0.
return 0;
} }
} }
boolean unlikely(Node e) { boolean unlikely(Node e) {
if (e.attr("class") != null if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
&& e.attr("class").toLowerCase(Locale.getDefault()).contains("caption"))
return true; return true;
String style = e.attr("style"); String style = e.attr("style");
@ -127,36 +174,34 @@ public class OutputFormatter {
return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find(); return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
} }
void appendTextSkipHidden(Element e, StringBuilder accum) { void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
for (Node child : e.childNodes()) { for (Node child : e.childNodes()) {
if (unlikely(child)) if (unlikely(child)) {
continue; continue;
}
if (child instanceof TextNode) { if (child instanceof TextNode) {
TextNode textNode = (TextNode) child; TextNode textNode = (TextNode) child;
String txt = textNode.text(); String txt = textNode.text();
accum.append(txt); accum.append(txt);
} else if (child instanceof Element) { } else if (child instanceof Element) {
Element element = (Element) child; Element element = (Element) child;
if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum)) if (accum.length() > 0 && element.isBlock()
&& !lastCharIsWhitespace(accum))
accum.append(' '); accum.append(' ');
else if (element.tagName().equals("br")) else if (element.tagName().equals("br"))
accum.append(' '); accum.append(' ');
appendTextSkipHidden(element, accum); appendTextSkipHidden(element, accum, indent + 1);
} }
} }
} }
boolean lastCharIsWhitespace(StringBuilder accum) { static boolean lastCharIsWhitespace(StringBuilder accum) {
return (accum.length() != 0) && Character.isWhitespace(accum.charAt(accum.length() - 1)); return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1));
}
protected String node2TextOld(Element el) {
return el.text();
} }
protected String node2Text(Element el) { protected String node2Text(Element el) {
StringBuilder sb = new StringBuilder(200); StringBuilder sb = new StringBuilder(200);
appendTextSkipHidden(el, sb); appendTextSkipHidden(el, sb, 0);
return sb.toString(); return sb.toString();
} }
@ -168,4 +213,4 @@ public class OutputFormatter {
public OutputFormatter appendUnlikelyPattern(String str) { public OutputFormatter appendUnlikelyPattern(String str) {
return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str); return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str);
} }
} }

View File

@ -15,17 +15,19 @@
*/ */
package acr.browser.lightning.reading; package acr.browser.lightning.reading;
import org.jsoup.nodes.Element;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.CookieHandler; import java.net.CookieHandler;
import java.net.CookieManager; import java.net.CookieManager;
import java.net.CookiePolicy; import java.net.CookiePolicy;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.security.SecureRandom; import java.security.SecureRandom;
import java.security.cert.CertificateException; import java.security.cert.CertificateException;
import java.security.cert.X509Certificate; import java.security.cert.X509Certificate;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -33,10 +35,8 @@ import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext; import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager; import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager; import javax.net.ssl.X509TrustManager;
import org.jsoup.nodes.Element;
/** /**
*
* @author Peter Karich * @author Peter Karich
*/ */
public class SHelper { public class SHelper {
@ -127,8 +127,7 @@ public class SHelper {
return null; return null;
// dynamic programming => save already identical length into array // dynamic programming => save already identical length into array
// to understand this algo simply print identical length in every entry // to understand this algo simply print identical length in every entry of the array
// of the array
// i+1, j+1 then reuses information from i,j // i+1, j+1 then reuses information from i,j
// java initializes them already with 0 // java initializes them already with 0
int[][] num = new int[str1.length()][str2.length()]; int[][] num = new int[str1.length()][str2.length()];
@ -152,7 +151,7 @@ public class SHelper {
} }
} }
} }
return new int[] { lastSubstrBegin, endIndex }; return new int[]{lastSubstrBegin, endIndex};
} }
public static String getDefaultFavicon(String url) { public static String getDefaultFavicon(String url) {
@ -160,35 +159,19 @@ public class SHelper {
} }
/** /**
* @param urlForDomain * @param urlForDomain extract the domain from this url
* extract the domain from this url * @param path this url does not have a domain
* @param path * @return
* this url does not have a domain
* @return returns the domain
*/ */
public static String useDomainOfFirstArg4Second(String urlForDomain, String path) { public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
if (path.startsWith("http")) try {
// See: http://stackoverflow.com/questions/1389184/building-an-absolute-url-from-a-relative-url-in-java
URL baseUrl = new URL(urlForDomain);
URL relativeurl = new URL(baseUrl, path);
return relativeurl.toString();
} catch (MalformedURLException ex) {
return path; return path;
if ("favicon.ico".equals(path))
path = "/favicon.ico";
if (path.startsWith("//")) {
// wikipedia special case, see tests
if (urlForDomain.startsWith("https:"))
return "https:" + path;
return "http:" + path;
} else if (path.startsWith("/"))
return "http://" + extractHost(urlForDomain) + path;
else if (path.startsWith("../")) {
int slashIndex = urlForDomain.lastIndexOf("/");
if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
urlForDomain = urlForDomain.substring(0, slashIndex + 1);
return urlForDomain + path;
} }
return path;
} }
public static String extractHost(String url) { public static String extractHost(String url) {
@ -224,14 +207,12 @@ public class SHelper {
} }
public static boolean isVideo(String url) { public static boolean isVideo(String url) {
return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") || url.endsWith(".mov")
|| url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4") || url.endsWith(".mpg4") || url.endsWith(".mp4") || url.endsWith(".flv") || url.endsWith(".wmv");
|| url.endsWith(".flv") || url.endsWith(".wmv");
} }
public static boolean isAudio(String url) { public static boolean isAudio(String url) {
return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") || url.endsWith(".wav");
|| url.endsWith(".wav");
} }
public static boolean isDoc(String url) { public static boolean isDoc(String url) {
@ -241,23 +222,20 @@ public class SHelper {
public static boolean isPackage(String url) { public static boolean isPackage(String url) {
return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip") return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip")
|| url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") || url.endsWith(".7z");
|| url.endsWith(".7z");
} }
public static boolean isApp(String url) { public static boolean isApp(String url) {
return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") || url.endsWith(".dmg");
|| url.endsWith(".dmg");
} }
public static boolean isImage(String url) { public static boolean isImage(String url) {
return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif") return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif")
|| url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") || url.endsWith(".eps");
|| url.endsWith(".eps");
} }
/** /**
* http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se * @see "http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se"
*/ */
public static void enableCookieMgmt() { public static void enableCookieMgmt() {
CookieManager manager = new CookieManager(); CookieManager manager = new CookieManager();
@ -266,7 +244,7 @@ public class SHelper {
} }
/** /**
* http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection * @see "http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection"
*/ */
public static void enableUserAgentOverwrite() { public static void enableUserAgentOverwrite() {
System.setProperty("http.agent", ""); System.setProperty("http.agent", "");
@ -377,8 +355,8 @@ public class SHelper {
} else if (counter == monthCounter + 1) { } else if (counter == monthCounter + 1) {
try { try {
day = Integer.parseInt(str); day = Integer.parseInt(str);
} catch (Exception ex) { } catch (Exception ignored) {
ex.printStackTrace(); // ignored
} }
if (day < 1 || day > 31) { if (day < 1 || day > 31) {
day = -1; day = -1;
@ -425,21 +403,11 @@ public class SHelper {
return dateStr + "/01/01"; return dateStr + "/01/01";
} }
/** // with the help of http://stackoverflow.com/questions/1828775/httpclient-and-ssl
* keep in mind: simpleDateFormatter is not thread safe! call completeDate
* before applying this formatter.
*/
public static SimpleDateFormat createDateFormatter() {
return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault());
}
// with the help of
// http://stackoverflow.com/questions/1828775/httpclient-and-ssl
public static void enableAnySSL() { public static void enableAnySSL() {
try { try {
SSLContext ctx = SSLContext.getInstance("TLS"); SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() }, ctx.init(new KeyManager[0], new TrustManager[]{new DefaultTrustManager()}, new SecureRandom());
new SecureRandom());
SSLContext.setDefault(ctx); SSLContext.setDefault(ctx);
} catch (Exception ex) { } catch (Exception ex) {
ex.printStackTrace(); ex.printStackTrace();
@ -449,13 +417,11 @@ public class SHelper {
private static class DefaultTrustManager implements X509TrustManager { private static class DefaultTrustManager implements X509TrustManager {
@Override @Override
public void checkClientTrusted(X509Certificate[] arg0, String arg1) public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
throws CertificateException {
} }
@Override @Override
public void checkServerTrusted(X509Certificate[] arg0, String arg1) public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
throws CertificateException {
} }
@Override @Override
@ -473,4 +439,4 @@ public class SHelper {
} }
return chars; return chars;
} }
} }