From 367c62bd39323a7ec944bbb35714c010e8bb8e94 Mon Sep 17 00:00:00 2001
From: Anthony Restaino
Date: Tue, 25 Aug 2015 20:59:23 -0400
Subject: [PATCH] Improved reading mode thanks to changes from snacktory fork by skyshard

---
 .../lightning/activity/ReadingActivity.java   |  14 +-
 .../reading/ArticleTextExtractor.java         | 902 +++++++++++++++---
 .../lightning/reading/HtmlFetcher.java        | 153 +--
 .../browser/lightning/reading/JResult.java    |  98 +-
 .../lightning/reading/OutputFormatter.java    | 145 ++-
 .../browser/lightning/reading/SHelper.java    |  94 +-
 6 files changed, 1050 insertions(+), 356 deletions(-)

diff --git a/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java b/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
index 4953dff..7cb77aa 100644
--- a/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
+++ b/app/src/main/java/acr/browser/lightning/activity/ReadingActivity.java
@@ -139,7 +139,7 @@ public class ReadingActivity extends AppCompatActivity {
 
         private final Activity mActivity;
         private String mTitleText;
-        private List<String> mBodyText;
+        private String mBodyText;
 
         public PageLoader(Activity activity) {
             mActivity = activity;
@@ -163,15 +163,15 @@ public class ReadingActivity extends AppCompatActivity {
             try {
                 JResult result = fetcher.fetchAndExtract(params[0], 2500, true);
                 mTitleText = result.getTitle();
-                mBodyText = result.getTextList();
+                mBodyText = result.getText();
             } catch (Exception e) {
                 mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                 e.printStackTrace();
             } catch (OutOfMemoryError e) {
                 System.gc();
                 mTitleText = "";
-                mBodyText = new ArrayList<>();
+                mBodyText = "";
                 e.printStackTrace();
             }
             return null;
@@ -186,11 +186,7 @@ public class ReadingActivity extends AppCompatActivity {
             if (mTitleText.isEmpty() || mBodyText.isEmpty()) {
                 setText(getString(R.string.untitled), getString(R.string.loading_failed));
             } else {
-                StringBuilder builder = new StringBuilder();
-                for (String text : mBodyText) {
-                    builder.append(text).append("\n\n");
-                }
-                setText(mTitleText, builder.toString());
+                setText(mTitleText, mBodyText);
             }
             super.onPostExecute(result);
         }
diff --git a/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java b/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
index a9596e8..7ec41a9 100644
--- a/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
+++ b/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java
@@ -5,20 +5,25 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.util.Date;
 
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
+import org.jsoup.select.Selector.SelectorParseException;
 
 /**
  * This class is thread safe.
+ * Class for content extraction from string form of webpage + * 'extractContent' is main call from external programs/classes * * @author Alex P (ifesdjeen from jreadability) * @author Peter Karich @@ -36,17 +41,33 @@ public class ArticleTextExtractor { // Most likely negative candidates private String negativeStr; private Pattern NEGATIVE; - private static final Pattern NEGATIVE_STYLE = Pattern - .compile("hidden|display: ?none|font-size: ?small"); + private static final Pattern NEGATIVE_STYLE = + Pattern.compile("hidden|display: ?none|font-size: ?small"); + private static final Pattern IGNORE_AUTHOR_PARTS = + Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE); private static final Set IGNORED_TITLE_PARTS = new LinkedHashSet() { { add("hacker news"); add("facebook"); + add("home"); + add("articles"); } }; private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter(); private OutputFormatter formatter = DEFAULT_FORMATTER; + private static final int MAX_AUTHOR_NAME_LENGHT = 255; + private static final int MIN_AUTHOR_NAME_LENGTH = 4; + private static final List CLEAN_AUTHOR_PATTERNS = Collections.singletonList( + Pattern.compile("By\\S*(.*)[\\.,].*") + ); + private static final int MAX_AUTHOR_DESC_LENGHT = 1000; + private static final int MAX_IMAGE_LENGHT = 255; + + // For debugging + private static final boolean DEBUG_WEIGHTS = false; + private static final int MAX_LOG_LENGTH = 200; + public ArticleTextExtractor() { setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|" + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor" @@ -59,7 +80,7 @@ public class ArticleTextExtractor { + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard"); } - public ArticleTextExtractor setUnlikely(String unlikelyStr) { + private ArticleTextExtractor setUnlikely(String unlikelyStr) { this.unlikelyStr = unlikelyStr; UNLIKELY = Pattern.compile(unlikelyStr); return this; @@ -69,7 +90,7 @@ public class ArticleTextExtractor { return setUnlikely(unlikelyStr + '|' + unlikelyMatches); } - public ArticleTextExtractor setPositive(String positiveStr) { + private ArticleTextExtractor setPositive(String positiveStr) { this.positiveStr = positiveStr; POSITIVE = Pattern.compile(positiveStr); return this; @@ -79,7 +100,7 @@ public class ArticleTextExtractor { return setPositive(positiveStr + '|' + pos); } - public ArticleTextExtractor setNegative(String negativeStr) { + private ArticleTextExtractor setNegative(String negativeStr) { this.negativeStr = negativeStr; NEGATIVE = Pattern.compile(negativeStr); return this; @@ -95,109 +116,234 @@ public class ArticleTextExtractor { } /** - * @param doc the document to extract - * extracts article text from given html string. wasn't tested - * with improper HTML, although jSoup should be able to handle - * minor stuff. + * @param html extracts article text from given html string. wasn't tested + * with improper HTML, although jSoup should be able to handle minor stuff. 
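+ * Typical use is a sketch like:
+ *   JResult res = new ArticleTextExtractor().extractContent(rawHtml);
+ *   String body = res.getText();
+ * (rawHtml here stands for whatever page source the caller already holds.)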
+ * @returns extracted article, all HTML tags stripped */ - public JResult extractContent(Document doc) throws Exception { - return extractContent(new JResult(), doc, formatter); + public JResult extractContent(String html, int maxContentSize) throws Exception { + return extractContent(new JResult(), html, maxContentSize); } - public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception { - return extractContent(new JResult(), doc, formatter); + public JResult extractContent(String html) throws Exception { + return extractContent(new JResult(), html, 0); } - public JResult extractContent(String html) throws Exception { - return extractContent(new JResult(), html); + public JResult extractContent(JResult res, String html, int maxContentSize) throws Exception { + return extractContent(res, html, formatter, true, maxContentSize); } public JResult extractContent(JResult res, String html) throws Exception { - return extractContent(res, html, formatter); + return extractContent(res, html, formatter, true, 0); } - public JResult extractContent(JResult res, String html, OutputFormatter formatter) - throws Exception { + private JResult extractContent(JResult res, String html, OutputFormatter formatter, + Boolean extractimages, int maxContentSize) throws Exception { if (html.isEmpty()) throw new IllegalArgumentException("html string is empty!?"); // http://jsoup.org/cookbook/extracting-data/selector-syntax - return extractContent(res, Jsoup.parse(html), formatter); + return extractContent(res, Jsoup.parse(html), formatter, extractimages, maxContentSize); + } + + // Returns the best node match based on the weights (see getWeight for strategy) + private Element getBestMatchElement(Collection nodes) { + int maxWeight = -200; // why -200 now instead of 0? + Element bestMatchElement = null; + + boolean ignoreMaxWeightLimit = false; + for (Element entry : nodes) { + + int currentWeight = getWeight(entry, false); + if (currentWeight > maxWeight) { + maxWeight = currentWeight; + bestMatchElement = entry; + + /* + // NOTE: This optimization fails with large pages that + contains chunks of text that can be mistaken by articles, since we + want the best accuracy possible, I am disabling it for now. AP. + + // The original code had a limit of 200, the intention was that + // if a node had a weight greater than it, then it most likely + // it was the main content. + // However this assumption fails when the amount of text in the + // children (or grandchildren) is too large. If we detect this + // case then the limit is ignored and we try all the nodes to select + // the one with the absolute maximum weight. + if (maxWeight > 500){ + ignoreMaxWeightLimit = true; + continue; + } + + // formerly 200, increased to 250 to account for the fact + // we are not adding the weights of the grand children to the + // tally. 
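+
+            // Illustration of the failure mode: a comments div holding many
+            // medium-length children can accumulate max(50, length / 10) per
+            // child and cross the early-exit threshold before the true
+            // article body is ever weighed.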
+ + if (maxWeight > 250 && !ignoreMaxWeightLimit) + break; + */ + } + } + + return bestMatchElement; } - public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) - throws NullPointerException { + private JResult extractContent(JResult res, Document doc, OutputFormatter formatter, + Boolean extractimages, int maxContentSize) throws Exception { + Document origDoc = doc.clone(); + JResult result = extractContent(res, doc, formatter, extractimages, maxContentSize, true); + //System.out.println("result.getText().length()="+result.getText().length()); + if (result.getText().isEmpty()) { + result = extractContent(res, origDoc, formatter, extractimages, maxContentSize, false); + } + return result; + } + + + // main workhorse + private JResult extractContent(JResult res, Document doc, OutputFormatter formatter, + Boolean extractimages, int maxContentSize, boolean cleanScripts) { if (doc == null) throw new NullPointerException("missing document"); + // get the easy stuff res.setTitle(extractTitle(doc)); res.setDescription(extractDescription(doc)); res.setCanonicalUrl(extractCanonicalUrl(doc)); + res.setType(extractType(doc)); + res.setSitename(extractSitename(doc)); + res.setLanguage(extractLanguage(doc)); + + // get author information + res.setAuthorName(extractAuthorName(doc)); + res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName())); + + // add extra selection gravity to any element containing author name + // wasn't useful in the case I implemented it for, but might be later + /* + Elements authelems = doc.select(":containsOwn(" + res.getAuthorName() + ")"); + for (Element elem : authelems) { + elem.attr("extragravityscore", Integer.toString(100)); + System.out.println("modified element " + elem.toString()); + } + */ + + // get date from document, if not present, extract from URL if possible + Date docdate = extractDate(doc); + if (docdate == null) { + String dateStr = SHelper.estimateDate(res.getUrl()); + docdate = parseDate(dateStr); + res.setDate(docdate); + } else { + res.setDate(docdate); + } - // now remove the clutter - prepareDocument(doc); + // now remove the clutter + if (cleanScripts) { + prepareDocument(doc); + } - // init elements + // init elements and get the one with highest weight (see getWeight for strategy) Collection nodes = getNodes(doc); - int maxWeight = 0; - Element bestMatchElement = null; - for (Element entry : nodes) { - int currentWeight = getWeight(entry); - if (currentWeight > maxWeight) { - maxWeight = currentWeight; - bestMatchElement = entry; - if (maxWeight > 200) - break; - } - } + Element bestMatchElement = getBestMatchElement(nodes); + // do extraction from the best element if (bestMatchElement != null) { - List images = new ArrayList<>(); - Element imgEl = determineImageSource(bestMatchElement, images); - if (imgEl != null) { - res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src"))); - // TODO remove parent container of image if it is contained in - // bestMatchElement - // to avoid image subtitles flooding in - - res.setImages(images); + if (extractimages) { + List images = new ArrayList<>(); + Element imgEl = determineImageSource(bestMatchElement, images); + if (imgEl != null) { + res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src"))); + // TODO remove parent container of image if it is contained in bestMatchElement + // to avoid image subtitles flooding in + + res.setImages(images); + } } // clean before grabbing text String text = formatter.getFormattedText(bestMatchElement); text = 
removeTitleFromText(text, res.getTitle()); - // this fails for short facebook post and probably tweets: - // text.length() > res.getDescription().length() + // this fails for short facebook post and probably tweets: text.length() > res.getDescription().length() if (text.length() > res.getTitle().length()) { + if (maxContentSize > 0) { + if (text.length() > maxContentSize) { + text = utf8truncate(text, maxContentSize); + } + } res.setText(text); - // print("best element:", bestMatchElement); } - res.setTextList(formatter.getTextList(bestMatchElement)); + + // extract links from the same best element + String fullhtml = bestMatchElement.toString(); + Elements children = bestMatchElement.select("a[href]"); // a with href = link + String linkstr; + Integer linkpos; + Integer lastlinkpos = 0; + for (Element child : children) { + linkstr = child.toString(); + linkpos = fullhtml.indexOf(linkstr, lastlinkpos); + res.addLink(child.attr("abs:href"), child.text(), linkpos); + lastlinkpos = linkpos; + } } - if (res.getImageUrl().isEmpty()) { - res.setImageUrl(extractImageUrl(doc)); + if (extractimages) { + if (res.getImageUrl().isEmpty()) { + res.setImageUrl(extractImageUrl(doc)); + } } res.setRssUrl(extractRssUrl(doc)); res.setVideoUrl(extractVideoUrl(doc)); res.setFaviconUrl(extractFaviconUrl(doc)); res.setKeywords(extractKeywords(doc)); + + // Sanity checks in author + if (res.getAuthorName().length() > MAX_AUTHOR_NAME_LENGHT) { + res.setAuthorName(utf8truncate(res.getAuthorName(), MAX_AUTHOR_NAME_LENGHT)); + } + + // Sanity checks in author description. + String authorDescSnippet = getSnippet(res.getAuthorDescription()); + if (getSnippet(res.getText()).equals(authorDescSnippet) || + getSnippet(res.getDescription()).equals(authorDescSnippet)) { + res.setAuthorDescription(""); + } else { + if (res.getAuthorDescription().length() > MAX_AUTHOR_DESC_LENGHT) { + res.setAuthorDescription(utf8truncate(res.getAuthorDescription(), MAX_AUTHOR_DESC_LENGHT)); + } + } + + // Sanity checks in image name + if (res.getImageUrl().length() > MAX_IMAGE_LENGHT) { + // doesn't make sense to truncate a URL + res.setImageUrl(""); + } + return res; } - protected String extractTitle(Document doc) { + private static String getSnippet(String data) { + if (data.length() < 50) + return data; + else + return data.substring(0, 50); + } + + private static String extractTitle(Document doc) { String title = cleanTitle(doc.title()); if (title.isEmpty()) { title = SHelper.innerTrim(doc.select("head title").text()); if (title.isEmpty()) { title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content")); if (title.isEmpty()) { - title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr( - "content")); + title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content")); if (title.isEmpty()) { - title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr( - "content")); + title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content")); + if (title.isEmpty()) { + title = SHelper.innerTrim(doc.select("h1:first-of-type").text()); + } } } } @@ -205,41 +351,349 @@ public class ArticleTextExtractor { return title; } - protected String extractCanonicalUrl(Document doc) { + private static String extractCanonicalUrl(Document doc) { String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href")); if (url.isEmpty()) { url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content")); if (url.isEmpty()) { - url = 
SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr( - "content")); + url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content")); } } return url; } - protected String extractDescription(Document doc) { - String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr( - "content")); + private static String extractDescription(Document doc) { + String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content")); if (description.isEmpty()) { - description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr( - "content")); + description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content")); if (description.isEmpty()) { - description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]") - .attr("content")); + description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content")); } } return description; } - protected Collection extractKeywords(Document doc) { + // Returns the publication Date or null + private static Date extractDate(Document doc) { + String dateStr = ""; + + // try some locations that nytimes uses + Element elem = doc.select("meta[name=ptime]").first(); + if (elem != null) { + dateStr = SHelper.innerTrim(elem.attr("content")); + // elem.attr("extragravityscore", Integer.toString(100)); + // System.out.println("date modified element " + elem.toString()); + } + + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content")); + } + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content")); + } + if ("".equals(dateStr)) { + dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content")); + } + if ("".equals(dateStr)) { + return parseDate(dateStr); + } + + // taking this stuff directly from Juicer (and converted to Java) + // opengraph (?) + Elements elems = doc.select("meta[property=article:published_time]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + try { + if (dateStr.endsWith("Z")) { + dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00"; + } else { + dateStr = String.format(dateStr.substring(0, dateStr.length() - 6), + dateStr.substring(dateStr.length() - 6, + dateStr.length())); + } + } catch (StringIndexOutOfBoundsException ex) { + // do nothing + } + return parseDate(dateStr); + } + } + + // rnews + elems = doc.select("meta[property=dateCreated], span[property=dateCreated]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + + return parseDate(dateStr); + } else { + return parseDate(el.text()); + } + } + + // schema.org creativework + elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + + return parseDate(dateStr); + } else if (el.hasAttr("value")) { + dateStr = el.attr("value"); + + return parseDate(dateStr); + } else { + return parseDate(el.text()); + } + } + + // parsely page (?) 
+ /* skip conversion for now, seems highly specific and uses new lib + elems = doc.select("meta[name=parsely-page]"); + if (elems.size() > 0) { + implicit val formats = net.liftweb.json.DefaultFormats + + Element el = elems.get(0); + if(el.hasAttr("content")) { + val json = parse(el.attr("content")) + + return DateUtils.parseDateStrictly((json \ "pub_date").extract[String], Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz")) + } + } + */ + + // BBC + elems = doc.select("meta[name=OriginalPublicationDate]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + return parseDate(dateStr); + } + } + + // wired + elems = doc.select("meta[name=DisplayDate]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + return parseDate(dateStr); + } + } + + // wildcard + elems = doc.select("meta[name*=date]"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + if (el.hasAttr("content")) { + dateStr = el.attr("content"); + Date parsedDate = parseDate(dateStr); + if (parsedDate != null) { + return parsedDate; + } + } + } + + // blogger + elems = doc.select(".date-header"); + if (!elems.isEmpty()) { + Element el = elems.get(0); + dateStr = el.text(); + return parseDate(dateStr); + } + + return null; + } + + private static Date parseDate(String dateStr) { +// String[] parsePatterns = { +// "yyyy-MM-dd'T'HH:mm:ssz", +// "yyyy-MM-dd HH:mm:ss", +// "yyyy/MM/dd HH:mm:ss", +// "yyyy-MM-dd HH:mm", +// "yyyy/MM/dd HH:mm", +// "yyyy-MM-dd", +// "yyyy/MM/dd", +// "MM/dd/yyyy HH:mm:ss", +// "MM-dd-yyyy HH:mm:ss", +// "MM/dd/yyyy HH:mm", +// "MM-dd-yyyy HH:mm", +// "MM/dd/yyyy", +// "MM-dd-yyyy", +// "EEE, MMM dd, yyyy", +// "MM/dd/yyyy hh:mm:ss a", +// "MM-dd-yyyy hh:mm:ss a", +// "MM/dd/yyyy hh:mm a", +// "MM-dd-yyyy hh:mm a", +// "yyyy-MM-dd hh:mm:ss a", +// "yyyy/MM/dd hh:mm:ss a ", +// "yyyy-MM-dd hh:mm a", +// "yyyy/MM/dd hh:mm ", +// "dd MMM yyyy", +// "dd MMMM yyyy", +// "yyyyMMddHHmm", +// "yyyyMMdd HHmm", +// "dd-MM-yyyy HH:mm:ss", +// "dd/MM/yyyy HH:mm:ss", +// "dd MMM yyyy HH:mm:ss", +// "dd MMMM yyyy HH:mm:ss", +// "dd-MM-yyyy HH:mm", +// "dd/MM/yyyy HH:mm", +// "dd MMM yyyy HH:mm", +// "dd MMMM yyyy HH:mm", +// "yyyyMMddHHmmss", +// "yyyyMMdd HHmmss", +// "yyyyMMdd" +// }; +// + return new Date(0); + +// try { +// return DateUtils.parseDateStrictly(dateStr, parsePatterns); +// } catch (Exception ex) { +// return null; +// } + } + + // Returns the author name or null + private String extractAuthorName(Document doc) { + String authorName = ""; + + // first try the Google Author tag + Element result = doc.select("body [rel*=author]").first(); + if (result != null) + authorName = SHelper.innerTrim(result.ownText()); + + // if that doesn't work, try some other methods + if (authorName.isEmpty()) { + + // meta tag approaches, get content + result = doc.select("head meta[name=author]").first(); + if (result != null) { + authorName = SHelper.innerTrim(result.attr("content")); + } + + if (authorName.isEmpty()) { // for "opengraph" + authorName = SHelper.innerTrim(doc.select("head meta[property=article:author]").attr("content")); + } + if (authorName.isEmpty()) { // OpenGraph twitter:creator tag + authorName = SHelper.innerTrim(doc.select("head meta[property=twitter:creator]").attr("content")); + } + if (authorName.isEmpty()) { // for "schema.org creativework" + authorName = 
SHelper.innerTrim(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content")); + } + + // other hacks + if (authorName.isEmpty()) { + try { + // build up a set of elements which have likely author-related terms + // .X searches for class X + Elements matches = doc.select("a[rel=author],.byline-name,.byLineTag,.byline,.author,.by,.writer,.address"); + + if (matches == null || matches.isEmpty()) { + matches = doc.select("body [class*=author]"); + } + + if (matches == null || matches.isEmpty()) { + matches = doc.select("body [title*=author]"); + } + + // a hack for huffington post + if (matches == null || matches.isEmpty()) { + matches = doc.select(".staff_info dl a[href]"); + } + + // a hack for http://sports.espn.go.com/ + if (matches == null || matches.isEmpty()) { + matches = doc.select("cite[class*=source]"); + } + + // select the best element from them + if (matches != null) { + Element bestMatch = getBestMatchElement(matches); + + if (!(bestMatch == null)) { + authorName = bestMatch.text(); + + if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) { + authorName = bestMatch.text(); + } + + authorName = SHelper.innerTrim(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll("")); + + if (authorName.contains(",")) { + authorName = authorName.split(",")[0]; + } + } + } + } catch (Exception e) { + System.out.println(e.toString()); + } + } + } + + for (Pattern pattern : CLEAN_AUTHOR_PATTERNS) { + Matcher matcher = pattern.matcher(authorName); + if (matcher.matches()) { + authorName = SHelper.innerTrim(matcher.group(1)); + break; + } + } + + return authorName; + } + + // Returns the author description or null + private String extractAuthorDescription(Document doc, String authorName) { + + String authorDesc = ""; + + if (authorName.isEmpty()) + return ""; + + // Special case for entrepreneur.com + Elements matches = doc.select(".byline > .bio"); + if (matches != null && !matches.isEmpty()) { + Element bestMatch = matches.first(); // assume it is the first. + authorDesc = bestMatch.text(); + return authorDesc; + } + + // Special case for huffingtonpost.com + matches = doc.select(".byline span[class*=teaser]"); + if (matches != null && !matches.isEmpty()) { + Element bestMatch = matches.first(); // assume it is the first. 
+ authorDesc = bestMatch.text(); + return authorDesc; + } + + try { + Elements nodes = doc.select(":containsOwn(" + authorName + ')'); + Element bestMatch = getBestMatchElement(nodes); + if (bestMatch != null) + authorDesc = bestMatch.text(); + } catch (SelectorParseException se) { + // Avoid error when selector is invalid + } + + return authorDesc; + } + + private static Collection extractKeywords(Document doc) { String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content")); - if (content.startsWith("[") && content.endsWith("]")) - content = content.substring(1, content.length() - 1); + if (content != null) { + if (content.startsWith("[") && content.endsWith("]")) + content = content.substring(1, content.length() - 1); - String[] split = content.split("\\s*,\\s*"); - if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty())) - return Arrays.asList(split); + String[] split = content.split("\\s*,\\s*"); + if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty())) + return Arrays.asList(split); + } return Collections.emptyList(); } @@ -249,62 +703,101 @@ public class ArticleTextExtractor { * * @return image url or empty str */ - protected String extractImageUrl(Document doc) { + private static String extractImageUrl(Document doc) { // use open graph tag to get image - String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr( - "content")); + String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content")); if (imageUrl.isEmpty()) { - imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr( - "content")); + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content")); if (imageUrl.isEmpty()) { // prefer link over thumbnail-meta if empty imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href")); if (imageUrl.isEmpty()) { - imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr( - "content")); + imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content")); } } } return imageUrl; } - protected String extractRssUrl(Document doc) { - return SHelper.replaceSpaces(doc.select("link[rel=alternate]") - .select("link[type=application/rss+xml]").attr("href")); + private static String extractRssUrl(Document doc) { + return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href")); } - protected String extractVideoUrl(Document doc) { + private static String extractVideoUrl(Document doc) { return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content")); } - protected String extractFaviconUrl(Document doc) { + private static String extractFaviconUrl(Document doc) { String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href")); if (faviconUrl.isEmpty()) { - faviconUrl = SHelper.replaceSpaces(doc.select( - "head link[rel^=shortcut],link[rel$=icon]").attr("href")); + faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href")); } return faviconUrl; } + private static String extractType(Document doc) { + String type = SHelper.innerTrim(doc.select("head meta[property=og:type]").attr("content")); + return type; + } + + private static String extractSitename(Document doc) { + String sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content")); + if 
(sitename.isEmpty()) { + sitename = SHelper.innerTrim(doc.select("head meta[name=twitter:site]").attr("content")); + } + if (sitename.isEmpty()) { + sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content")); + } + return sitename; + } + + private static String extractLanguage(Document doc) { + String language = SHelper.innerTrim(doc.select("head meta[property=language]").attr("content")); + if (language.isEmpty()) { + language = SHelper.innerTrim(doc.select("html").attr("lang")); + if (language.isEmpty()) { + language = SHelper.innerTrim(doc.select("head meta[property=og:locale]").attr("content")); + } + } + if (!language.isEmpty()) { + if (language.length() > 2) { + language = language.substring(0, 2); + } + } + return language; + } + /** * Weights current element. By matching it with positive candidates and * weighting child nodes. Since it's impossible to predict which exactly * names, ids or class names will be used in HTML, major role is played by * child nodes * - * @param e - * Element to weight, along with child nodes + * @param e Element to weight, along with child nodes */ - protected int getWeight(Element e) { + private int getWeight(Element e, boolean checkextra) { int weight = calcWeight(e); - weight += (int) Math.round(e.ownText().length() / 100.0 * 10); - weight += weightChildNodes(e); + int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10); + weight += ownTextWeight; + int childrenWeight = weightChildNodes(e); + weight += childrenWeight; + + // add additional weight using possible 'extragravityscore' attribute + if (checkextra) { + Element xelem = e.select("[extragravityscore]").first(); + if (xelem != null) { + // System.out.println("HERE found one: " + xelem.toString()); + weight += Integer.parseInt(xelem.attr("extragravityscore")); + // System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore")); + } + } + return weight; } /** * Weights a child nodes of given Element. During tests some difficulties - * were met. For instanance, not every single document has nested paragraph + * were met. For instance, not every single document has nested paragraph * tags inside of the major article tag. Sometimes people are adding one * more nesting level. So, we're adding 4 points for every 100 symbols * contained in tag nested inside of the current weighted element, but only @@ -312,42 +805,93 @@ public class ArticleTextExtractor { * more chances to extract the element that has less nested levels, * increasing probability of the correct extraction. 
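+     * (Concretely: a direct child with 250 chars of own text adds
+     * max(50, 250 / 10) = 50 to the weight, while the same node one level
+     * deeper only counts through the grandchildren tally, which is divided
+     * by 3 before being added.)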
* - * @param rootEl - * Element, who's child nodes will be weighted + * @param rootEl Element, who's child nodes will be weighted */ - protected int weightChildNodes(Element rootEl) { + private int weightChildNodes(Element rootEl) { int weight = 0; Element caption = null; List pEls = new ArrayList<>(5); + for (Element child : rootEl.children()) { String ownText = child.ownText(); int ownTextLength = ownText.length(); if (ownTextLength < 20) continue; - if (ownTextLength > 200) - weight += Math.max(50, ownTextLength / 10); + if (ownTextLength > 200) { + int childOwnTextWeight = Math.max(50, ownTextLength / 10); + weight += childOwnTextWeight; + } if (child.tagName().equals("h1") || child.tagName().equals("h2")) { - weight += 30; + int h2h1Weight = 30; + weight += h2h1Weight; } else if (child.tagName().equals("div") || child.tagName().equals("p")) { - weight += calcWeightForChild(child, ownText); + int calcChildWeight = calcWeightForChild(child, ownText); + weight += calcChildWeight; if (child.tagName().equals("p") && ownTextLength > 50) pEls.add(child); - if (child.className().toLowerCase(Locale.getDefault()).equals("caption")) + if (child.className().toLowerCase().equals("caption")) caption = child; } } + // + // Visit grandchildren, This section visits the grandchildren + // of the node and calculate their weights. Note that grandchildren + // weights are only worth 1/3 of children's + // + int grandChildrenWeight = 0; + for (Element child2 : rootEl.children()) { + + // If the node looks negative don't include it in the weights + // instead penalize the grandparent. This is done to try to + // avoid giving weigths to navigation nodes, etc. + if (NEGATIVE.matcher(child2.id()).find() || + NEGATIVE.matcher(child2.className()).find()) { + grandChildrenWeight -= 30; + continue; + } + + for (Element grandchild : child2.children()) { + int grandchildWeight = 0; + String ownText = grandchild.ownText(); + int ownTextLength = ownText.length(); + if (ownTextLength < 20) + continue; + + if (ownTextLength > 200) { + int childOwnTextWeight = Math.max(50, ownTextLength / 10); + grandchildWeight += childOwnTextWeight; + } + + if (grandchild.tagName().equals("h1") || grandchild.tagName().equals("h2")) { + int h2h1Weight = 30; + grandchildWeight += h2h1Weight; + } else if (grandchild.tagName().equals("div") || grandchild.tagName().equals("p")) { + int calcChildWeight = calcWeightForChild(grandchild, ownText); + grandchildWeight += calcChildWeight; + } + + grandChildrenWeight += grandchildWeight; + } + } + + grandChildrenWeight = grandChildrenWeight / 3; + weight += grandChildrenWeight; + // use caption and image - if (caption != null) - weight += 30; + if (caption != null) { + int captionWeight = 30; + weight += captionWeight; + } if (pEls.size() >= 2) { for (Element subEl : rootEl.children()) { if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) { - weight += 20; + int h1h2h3Weight = 20; + weight += h1h2h3Weight; // headerEls.add(subEl); } else if ("table;li;td;th".contains(subEl.tagName())) { addScore(subEl, -30); @@ -360,26 +904,25 @@ public class ArticleTextExtractor { return weight; } - public void addScore(Element el, int score) { + private static void addScore(Element el, int score) { int old = getScore(el); setScore(el, score + old); } - public int getScore(Element el) { + private static int getScore(Element el) { int old = 0; try { old = Integer.parseInt(el.attr("gravityScore")); - } catch (Exception ex) { - ex.printStackTrace(); + } catch (Exception ignored) { } return old; } - public void 
setScore(Element el, int score) {
+    private static void setScore(Element el, int score) {
         el.attr("gravityScore", Integer.toString(score));
     }
 
-    private int calcWeightForChild(Element child, String ownText) {
+    private static int calcWeightForChild(Element child, String ownText) {
         int c = SHelper.count(ownText, "&quot;");
         c += SHelper.count(ownText, "&lt;");
         c += SHelper.count(ownText, "&gt;");
@@ -388,7 +931,7 @@ public class ArticleTextExtractor {
         if (c > 5)
             val = -30;
         else
-            val = (int) Math.round(ownText.length() / 25.0);
+            val = (int) Math.round(ownText.length() / 35.0);
 
         addScore(child, val);
         return val;
@@ -400,7 +943,7 @@ public class ArticleTextExtractor {
             weight += 35;
 
         if (POSITIVE.matcher(e.id()).find())
-            weight += 40;
+            weight += 45;
 
         if (UNLIKELY.matcher(e.className()).find())
             weight -= 20;
@@ -417,10 +960,16 @@ public class ArticleTextExtractor {
         String style = e.attr("style");
         if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
             weight -= 50;
+
+        String itemprop = e.attr("itemprop");
+        if (itemprop != null && !itemprop.isEmpty() && POSITIVE.matcher(itemprop).find()) {
+            weight += 100;
+        }
+
         return weight;
     }
 
-    public Element determineImageSource(Element el, List<ImageResult> images) {
+    private Element determineImageSource(Element el, List<ImageResult> images) {
         int maxWeight = 0;
         Element maxNode = null;
         Elements els = el.select("img");
@@ -441,8 +990,7 @@ public class ArticleTextExtractor {
                     weight += 20;
                 else
                     weight -= 20;
-            } catch (Exception ex) {
-                ex.printStackTrace();
+            } catch (Exception ignored) {
             }
 
             int width = 0;
@@ -452,8 +1000,7 @@ public class ArticleTextExtractor {
                     weight += 20;
                 else
                     weight -= 20;
-            } catch (Exception ex) {
-                ex.printStackTrace();
+            } catch (Exception ignored) {
             }
             String alt = e.attr("alt");
             if (alt.length() > 35)
@@ -480,8 +1027,7 @@ public class ArticleTextExtractor {
                 score = score / 2;
             }
 
-            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
-                    noFollow);
+            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
             images.add(image);
         }
 
@@ -494,12 +1040,11 @@ public class ArticleTextExtractor {
      * from time to time they're getting more score than good ones especially in
      * cases when major text is short.
      *
-     * @param doc
-     *            document to prepare. Passed as reference, and changed inside
+     * @param doc document to prepare. Passed as reference, and changed inside
      *            of function
      */
-    protected void prepareDocument(Document doc) {
-        // stripUnlikelyCandidates(doc);
+    private static void prepareDocument(Document doc) {
+//        stripUnlikelyCandidates(doc);
         removeScriptsAndStyles(doc);
     }
 
@@ -507,27 +1052,25 @@ public class ArticleTextExtractor {
      * Removes unlikely candidates from HTML. Currently takes id and class name
      * and matches them against list of patterns
      *
-     * @param doc
-     *            document to strip unlikely candidates from
+     * @param doc document to strip unlikely candidates from
      */
     protected void stripUnlikelyCandidates(Document doc) {
         for (Element child : doc.select("body").select("*")) {
-            String className = child.className().toLowerCase(Locale.getDefault());
-            String id = child.id().toLowerCase(Locale.getDefault());
+            String className = child.className().toLowerCase();
+            String id = child.id().toLowerCase();
 
-            if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
-                // print("REMOVE:", child);
+            if (NEGATIVE.matcher(className).find()
+                    || NEGATIVE.matcher(id).find()) {
                 child.remove();
             }
         }
     }
 
-    private Document removeScriptsAndStyles(Document doc) {
+    private static Document removeScriptsAndStyles(Document doc) {
         Elements scripts = doc.getElementsByTag("script");
         for (Element item : scripts) {
             item.remove();
         }
-
         Elements noscripts = doc.getElementsByTag("noscript");
         for (Element item : noscripts) {
             item.remove();
@@ -541,49 +1084,74 @@ public class ArticleTextExtractor {
         return doc;
     }
 
-    private boolean isAdImage(String imageUrl) {
+    private static boolean isAdImage(String imageUrl) {
         return SHelper.count(imageUrl, "ad") >= 2;
     }
 
     /**
     * Match only exact matching as longestSubstring can be too fuzzy
     */
-    public String removeTitleFromText(String text, String title) {
+    private static String removeTitleFromText(String text, String title) {
         // don't do this as it's terrible to read
-        // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
-        // if (index1 >= 0)
-        //     text = text.substring(index1 + title.length());
-        // return text.trim();
+//        int index1 = text.toLowerCase().indexOf(title.toLowerCase());
+//        if (index1 >= 0)
+//            text = text.substring(index1 + title.length());
+//        return text.trim();
         return text;
     }
 
+    /**
+     * based on a delimiter in the title take the longest piece or do some
+     * custom logic based on the site
+     *
+     * @param title
+     * @param delimeter
+     * @return
+     */
+    private static String doTitleSplits(String title, String delimeter) {
+        String largeText = "";
+        int largetTextLen = 0;
+        String[] titlePieces = title.split(delimeter);
+
+        // take the largest split
+        for (String p : titlePieces) {
+            if (p.length() > largetTextLen) {
+                largeText = p;
+                largetTextLen = p.length();
+            }
+        }
+
+        largeText = largeText.replace("&raquo;", " ");
+        largeText = largeText.replace("ยป", " ");
+        return largeText.trim();
+    }
+
     /**
      * @return a set of all important nodes
     */
-    public Collection<Element> getNodes(Document doc) {
-        Set<Element> nodes = new HashSet<>(64);
+    private static Collection<Element> getNodes(Document doc) {
+        Map<Element, Object> nodes = new LinkedHashMap<>(64);
         int score = 100;
         for (Element el : doc.select("body").select("*")) {
             if (NODES.matcher(el.tagName()).matches()) {
-                nodes.add(el);
+                nodes.put(el, null);
                 setScore(el, score);
                 score = score / 2;
             }
         }
-        return nodes;
-
+        return nodes.keySet();
     }
 
-    public String cleanTitle(String title) {
+    private static String cleanTitle(String title) {
         StringBuilder res = new StringBuilder();
-        // int index = title.lastIndexOf("|");
-        // if (index > 0 && title.length() / 2 < index)
-        //     title = title.substring(0, index + 1);
+//        int index = title.lastIndexOf("|");
+//        if (index > 0 && title.length() / 2 < index)
+//            title = title.substring(0, index + 1);
 
         int counter = 0;
         String[] strs = title.split("\\|");
         for (String part : strs) {
-            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim()))
+            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
                 continue;
 
             if (counter == strs.length - 1 && res.length() > part.length())
@@ -599,13 +1167,48 @@ public class ArticleTextExtractor {
                 continue;
 
             if (counter > 0)
                 res.append("|");
 
             res.append(part);
             counter++;
         }
 
         return SHelper.innerTrim(res.toString());
     }
 
+    /**
+     * Truncate a Java string so that its UTF-8 representation will not
+     * exceed the specified number of bytes.
+     *
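+     * For example, utf8truncate("日本abc", 7) returns "日本a": each CJK char
+     * costs 3 bytes, "a" brings the total to 7, and "b" would overflow, so
+     * the cut always lands between characters rather than mid-sequence.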

+ * For discussion of why you might want to do this, see + * http://lpar.ath0.com/2011/06/07/unicode-alchemy-with-db2/ + */ + private static String utf8truncate(String input, int length) { + StringBuilder result = new StringBuilder(length); + int resultlen = 0; + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + int charlen = 0; + if (c <= 0x7f) { + charlen = 1; + } else if (c <= 0x7ff) { + charlen = 2; + } else if (c <= 0xd7ff) { + charlen = 3; + } else if (c <= 0xdbff) { + charlen = 4; + } else if (c <= 0xdfff) { + charlen = 0; + } else if (c <= 0xffff) { + charlen = 3; + } + if (resultlen + charlen > length) { + break; + } + result.append(c); + resultlen += charlen; + } + return result.toString(); + } + + /** * Comparator for Image by weight * * @author Chris Alexander, chris@chris-alexander.co.uk - * */ - public class ImageComparator implements Comparator { + private class ImageComparator implements Comparator { @Override public int compare(ImageResult o1, ImageResult o2) { @@ -613,4 +1216,5 @@ public class ArticleTextExtractor { return o2.weight.compareTo(o1.weight); } } + } \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java b/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java index e815234..59d5635 100644 --- a/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java +++ b/app/src/main/java/acr/browser/lightning/reading/HtmlFetcher.java @@ -22,22 +22,19 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; +import java.net.MalformedURLException; import java.net.Proxy; import java.net.URL; import java.util.LinkedHashSet; -import java.util.Locale; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.GZIPInputStream; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; -import acr.browser.lightning.constant.Constants; -import android.util.Log; - /** * Class to fetch articles. This class is thread safe. 
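+ * The usual entry point is fetchAndExtract(url, timeoutMs, resolve), which
+ * resolves redirects, downloads the page and hands it to ArticleTextExtractor.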
- * + * * @author Peter Karich */ public class HtmlFetcher { @@ -64,7 +61,7 @@ public class HtmlFetcher { else existing.add(domainStr); - String html = new HtmlFetcher().fetchAsString(url, 20000); + String html = new HtmlFetcher().fetchAsString(url, 2000); String outFile = domainStr + counterStr + ".html"; BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); writer.write(html); @@ -73,8 +70,8 @@ public class HtmlFetcher { reader.close(); } - private String referrer = "https://github.com/karussell/snacktory"; - private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ')'; + private String referrer = "http://jetsli.de/crawler"; + private String userAgent = "Mozilla/5.0 (compatible; Jetslide; +" + referrer + ')'; private String cacheControl = "max-age=0"; private String language = "en-us"; private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; @@ -83,7 +80,7 @@ public class HtmlFetcher { private final AtomicInteger cacheCounter = new AtomicInteger(0); private int maxTextLength = -1; private ArticleTextExtractor extractor = new ArticleTextExtractor(); - private final Set furtherResolveNecessary = new LinkedHashSet() { + private Set furtherResolveNecessary = new LinkedHashSet() { { add("bit.ly"); add("cli.gs"); @@ -202,6 +199,12 @@ public class HtmlFetcher { } public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { + return fetchAndExtract(url, timeout, resolve, 0, false); + } + + // main workhorse to call externally + public JResult fetchAndExtract(String url, int timeout, boolean resolve, + int maxContentSize, boolean forceReload) throws Exception { String originalUrl = url; url = SHelper.removeHashbang(url); String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); @@ -219,9 +222,8 @@ public class HtmlFetcher { if (res != null) return res; - String resUrl = getResolvedUrl(url, timeout); + String resUrl = getResolvedUrl(url, timeout, 0); if (resUrl.isEmpty()) { - Log.d(Constants.TAG, "resolved url is empty. Url is: " + url); JResult result = new JResult(); if (cache != null) @@ -229,10 +231,9 @@ public class HtmlFetcher { return result.setUrl(url); } - // if resolved url is longer then use it! - if (resUrl.trim().length() > url.length()) { - // this is necessary e.g. for some homebaken url resolvers which - // return + // if resolved url is different then use it! + if (!resUrl.equals(url)) { + // this is necessary e.g. for some homebaken url resolvers which return // the resolved url relative to url! url = SHelper.useDomainOfFirstArg4Second(url, resUrl); } @@ -244,20 +245,18 @@ public class HtmlFetcher { return res; JResult result = new JResult(); - // or should we use? + // or should we use? result.setUrl(url); result.setOriginalUrl(originalUrl); - result.setDate(SHelper.estimateDate(url)); - // Immediately put the url into the cache as extracting content takes - // time. + // Immediately put the url into the cache as extracting content takes time. 
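+        // (A concurrent fetchAndExtract of the same url will then hit this
+        // placeholder in getFromCache and return early instead of downloading
+        // the page a second time; its fields are filled in further below.)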
if (cache != null) { cache.put(originalUrl, result); cache.put(url, result); } - String lowerUrl = url.toLowerCase(Locale.getDefault()); + // extract content to the extent appropriate for content type + String lowerUrl = url.toLowerCase(); if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { // skip } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { @@ -265,16 +264,30 @@ public class HtmlFetcher { } else if (SHelper.isImage(lowerUrl)) { result.setImageUrl(url); } else { - extractor.extractContent(result, fetchAsString(url, timeout)); + try { + String urlToDownload = url; + if (forceReload) { + urlToDownload = getURLtoBreakCache(url); + } + extractor.extractContent(result, fetchAsString(urlToDownload, timeout), maxContentSize); + } catch (IOException io) { + // do nothing + } if (result.getFaviconUrl().isEmpty()) result.setFaviconUrl(SHelper.getDefaultFavicon(url)); - // some links are relative to root and do not include the domain of - // the url :( - result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); - result.setImageUrl(fixUrl(url, result.getImageUrl())); - result.setVideoUrl(fixUrl(url, result.getVideoUrl())); - result.setRssUrl(fixUrl(url, result.getRssUrl())); + // some links are relative to root and do not include the domain of the url :( + if (!result.getFaviconUrl().isEmpty()) + result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); + + if (!result.getImageUrl().isEmpty()) + result.setImageUrl(fixUrl(url, result.getImageUrl())); + + if (!result.getVideoUrl().isEmpty()) + result.setVideoUrl(fixUrl(url, result.getVideoUrl())); + + if (!result.getRssUrl().isEmpty()) + result.setRssUrl(fixUrl(url, result.getRssUrl())); } result.setText(lessText(result.getText())); synchronized (result) { @@ -283,6 +296,20 @@ public class HtmlFetcher { return result; } + // Ugly hack to break free from any cached versions, a few URLs required this. 
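+    // e.g. "http://example.com/a?x=1" -> "http://example.com/a?x=1&1";
+    // note that a URL with no query string at all also takes the else
+    // branch, so "http://example.com/a" -> "http://example.com/a&1".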
+ public static String getURLtoBreakCache(String url) { + try { + URL aURL = new URL(url); + if (aURL.getQuery() != null && aURL.getQuery().isEmpty()) { + return url + "?1"; + } else { + return url + "&1"; + } + } catch (MalformedURLException e) { + return url; + } + } + public String lessText(String text) { if (text == null) return ""; @@ -297,13 +324,14 @@ public class HtmlFetcher { return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); } - public String fetchAsString(String urlAsString, int timeout) throws - IOException { + public String fetchAsString(String urlAsString, int timeout) + throws MalformedURLException, IOException { return fetchAsString(urlAsString, timeout, true); } + // main routine to get raw webpage content public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) - throws IOException { + throws MalformedURLException, IOException { HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions); hConn.setInstanceFollowRedirects(true); String encoding = hConn.getContentEncoding(); @@ -317,27 +345,23 @@ public class HtmlFetcher { } String enc = Converter.extractEncoding(hConn.getContentType()); - String res = createConverter(urlAsString).streamToString(is, enc); - Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString); - return res; + return createConverter(urlAsString).streamToString(is, enc); } - public Converter createConverter(String url) { + public static Converter createConverter(String url) { return new Converter(url); } /** * On some devices we have to hack: - * http://developers.sun.com/mobility/reference - * /techart/design_guidelines/http_redirection.html - * - * @param timeout - * Sets a specified timeout value, in milliseconds + * http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html + * + * @param timeout Sets a specified timeout value, in milliseconds * @return the resolved url if any. Or null if it couldn't resolve the url - * (within the specified time) or the same url if response code is - * OK + * (within the specified time) or the same url if response code is OK */ - public String getResolvedUrl(String urlAsString, int timeout) { + public String getResolvedUrl(String urlAsString, int timeout, + int num_redirects) { String newUrl = null; int responseCode = -1; try { @@ -354,28 +378,32 @@ public class HtmlFetcher { return urlAsString; newUrl = hConn.getHeaderField("Location"); - if (responseCode / 100 == 3 && newUrl != null) { + // Note that the max recursion level is 5. + if (responseCode / 100 == 3 && newUrl != null && num_redirects < 5) { newUrl = newUrl.replaceAll(" ", "+"); - // some services use (none-standard) utf8 in their location - // header + // some services use (none-standard) utf8 in their location header if (urlAsString.startsWith("http://bit.ly") || urlAsString.startsWith("http://is.gd")) newUrl = encodeUriFromHeader(newUrl); - // fix problems if shortened twice. as it is often the case - // after twitters' t.co bullshit - if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) - newUrl = getResolvedUrl(newUrl, timeout); - + // AP: This code is not longer need, instead we always follow + // multiple redirects. + // + // fix problems if shortened twice. 
as it is often the case after twitters' t.co bullshit + //if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) + // newUrl = getResolvedUrl(newUrl, timeout); + + // Add support for URLs with multiple levels of redirection, + // call getResolvedUrl until there is no more redirects or a + // max number of redirects is reached. + newUrl = SHelper.useDomainOfFirstArg4Second(urlAsString, newUrl); + newUrl = getResolvedUrl(newUrl, timeout, num_redirects + 1); return newUrl; } else return urlAsString; } catch (Exception ex) { - Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage()); return ""; - } finally { - Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl); } } @@ -400,9 +428,9 @@ public class HtmlFetcher { } protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout, - boolean includeSomeGooseOptions) throws IOException { + boolean includeSomeGooseOptions) throws MalformedURLException, IOException { URL url = new URL(urlAsStr); - // using proxy may increase latency + //using proxy may increase latency HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY); hConn.setRequestProperty("User-Agent", userAgent); hConn.setRequestProperty("Accept", accept); @@ -415,8 +443,7 @@ public class HtmlFetcher { hConn.setRequestProperty("Cache-Control", cacheControl); } - // suggest respond to be gzipped or deflated (which is just another - // compression) + // suggest respond to be gzipped or deflated (which is just another compression) // http://stackoverflow.com/q/3932117 hConn.setRequestProperty("Accept-Encoding", "gzip, deflate"); hConn.setConnectTimeout(timeout); @@ -424,14 +451,12 @@ public class HtmlFetcher { return hConn; } - private JResult getFromCache(String url, String originalUrl) throws Exception { + private JResult getFromCache(String url, String originalUrl) { if (cache != null) { JResult res = cache.get(url); if (res != null) { - // e.g. the cache returned a shortened url as original url now - // we want to store the - // current original url! Also it can be that the cache response - // to url but the JResult + // e.g. the cache returned a shortened url as original url now we want to store the + // current original url! Also it can be that the cache response to url but the JResult // does not contain it so overwrite it: res.setUrl(url); res.setOriginalUrl(originalUrl); @@ -441,4 +466,4 @@ public class HtmlFetcher { } return null; } -} +} \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/JResult.java b/app/src/main/java/acr/browser/lightning/reading/JResult.java index 1b4a23f..dc97de2 100644 --- a/app/src/main/java/acr/browser/lightning/reading/JResult.java +++ b/app/src/main/java/acr/browser/lightning/reading/JResult.java @@ -16,14 +16,18 @@ package acr.browser.lightning.reading; import java.io.Serializable; -import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; + /** * Parsed result from web page containing important title, text and image. 
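+ * Most string getters (getTitle(), getText(), getUrl(), ...) normalize a
+ * missing value to "", while the newer getType()/getSitename()/getLanguage()
+ * return the field as-is.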
- * + * * @author Peter Karich */ public class JResult implements Serializable { @@ -38,10 +42,15 @@ public class JResult implements Serializable { private String text; private String faviconUrl; private String description; - private String dateString; - private List textList; + private String authorName; + private String authorDescription; + private Date date; private Collection keywords; private List images = null; + private List> links = new ArrayList<>(); + private String type; + private String sitename; + private String language; public JResult() { } @@ -108,6 +117,28 @@ public class JResult implements Serializable { return this; } + public String getAuthorName() { + if (authorName == null) + return ""; + return authorName; + } + + public JResult setAuthorName(String authorName) { + this.authorName = authorName; + return this; + } + + public String getAuthorDescription() { + if (authorDescription == null) + return ""; + return authorDescription; + } + + public JResult setAuthorDescription(String authorDescription) { + this.authorDescription = authorDescription; + return this; + } + public String getImageUrl() { if (imageUrl == null) return ""; @@ -131,17 +162,6 @@ public class JResult implements Serializable { return this; } - public List getTextList() { - if (this.textList == null) - return new ArrayList<>(); - return this.textList; - } - - public JResult setTextList(List textList) { - this.textList = textList; - return this; - } - public String getTitle() { if (title == null) return ""; @@ -164,8 +184,8 @@ public class JResult implements Serializable { return this; } - public JResult setDate(String date) { - this.dateString = date; + public JResult setDate(Date date) { + this.date = date; return this; } @@ -180,8 +200,8 @@ public class JResult implements Serializable { /** * @return get date from url or guessed from text */ - public String getDate() { - return dateString; + public Date getDate() { + return date; } /** @@ -209,8 +229,46 @@ public class JResult implements Serializable { this.images = images; } + public void addLink(String url, String text, Integer pos) { + Map link = new HashMap(); + link.put("url", url); + link.put("text", text); + link.put("offset", String.valueOf(pos)); + links.add(link); + } + + public List> getLinks() { + if (links == null) + return Collections.emptyList(); + return links; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getSitename() { + return sitename; + } + + public void setSitename(String sitename) { + this.sitename = sitename; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + @Override public String toString() { return "title:" + getTitle() + " imageUrl:" + getImageUrl() + " text:" + text; } -} +} \ No newline at end of file diff --git a/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java b/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java index 9e374a5..d05a26b 100644 --- a/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java +++ b/app/src/main/java/acr/browser/lightning/reading/OutputFormatter.java @@ -4,40 +4,46 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Locale; import java.util.regex.Pattern; + import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; /** * @author 
 * @author goose | jim
 * @author karussell
- *
- * this class will be responsible for taking our top node and stripping
- * out junk we don't want and getting it ready for how we want it
- * presented to the user
+ *

+ * this class will be responsible for taking our top node and stripping out junk
+ * we don't want and getting it ready for how we want it presented to the user
 */
public class OutputFormatter {

- public static final int MIN_PARAGRAPH_TEXT = 50;
+ private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph
+ private static final int MIN_PARAGRAPH_TEXT = 30; // Min size of any other paragraphs
 private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
 private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden");
- protected final int minParagraphText;
- protected final List<String> nodesToReplace;
- protected String nodesToKeepCssSelector = "p";
+ private final int minFirstParagraphText;
+ private final int minParagraphText;
+ private final List<String> nodesToReplace;
+ private String nodesToKeepCssSelector = "p, ol";

 public OutputFormatter() {
- this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
+ this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
 }

 public OutputFormatter(int minParagraphText) {
- this(minParagraphText, NODES_TO_REPLACE);
+ this(minParagraphText, minParagraphText, NODES_TO_REPLACE);
+ }
+
+ public OutputFormatter(int minFirstParagraphText, int minParagraphText) {
+ this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE);
 }

- public OutputFormatter(int minParagraphText, List<String> nodesToReplace) {
+ public OutputFormatter(int minFirstParagraphText, int minParagraphText,
+ List<String> nodesToReplace) {
+ this.minFirstParagraphText = minFirstParagraphText;
 this.minParagraphText = minParagraphText;
 this.nodesToReplace = nodesToReplace;
 }
@@ -53,36 +59,34 @@ public class OutputFormatter {
 * takes an element and turns the P tags into \n\n
 */
 public String getFormattedText(Element topNode) {
+ setParagraphIndex(topNode, nodesToKeepCssSelector);
 removeNodesWithNegativeScores(topNode);
 StringBuilder sb = new StringBuilder();
- append(topNode, sb, nodesToKeepCssSelector);
+ int countOfP = append(topNode, sb, nodesToKeepCssSelector);
 String str = SHelper.innerTrim(sb.toString());
- if (str.length() > 100)
+
+ int topNodeLength = topNode.text().length();
+ if (topNodeLength == 0) {
+ topNodeLength = 1;
+ }
+
+ boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25);
+ if (str.length() > 100 && countOfP > 0 && !lowTextRatio)
 return str;

 // no subelements
- if (str.isEmpty() || !topNode.text().isEmpty()
+ if (str.isEmpty() || (!topNode.text().isEmpty()
 && str.length() <= topNode.ownText().length())
+ || countOfP == 0 || lowTextRatio) {
 str = topNode.text();
+ }

- // if jsoup failed to parse the whole html now parse this smaller
+ // if jsoup failed to parse the whole html now parse this smaller
 // snippet again to avoid html tags disturbing our text:
 return Jsoup.parse(str).text();
 }

- /**
- * Takes an element and returns a list of texts extracted from the P tags
- */
- public List<String> getTextList(Element topNode) {
- List<String> texts = new ArrayList<>();
- for (Element element : topNode.select(this.nodesToKeepCssSelector)) {
- if (element.hasText()) {
- texts.add(element.text());
- }
- }
- return texts;
- }
-
 /**
 * If there are elements inside our top node that have a negative gravity
 * score remove them
@@ -90,15 +94,20 @@ public class OutputFormatter {
 protected void removeNodesWithNegativeScores(Element topNode) {
 Elements gravityItems = topNode.select("*[gravityScore]");
 for (Element item : gravityItems) {
- int score = Integer.parseInt(item.attr("gravityScore"));
- if (score < 0 || item.text().length() < minParagraphText)
+ int score = getScore(item);
+ int paragraphIndex = getParagraphIndex(item);
+ if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) {
 item.remove();
+ }
 }
 }

- protected void append(Element node, StringBuilder sb, String tagName) {
+ protected int append(Element node, StringBuilder sb, String tagName) {
+ int countOfP = 0; // Number of P elements in the article
+ int paragraphWithTextIndex = 0;
 // is select more costly than getElementsByTag?
- MAIN: for (Element e : node.select(tagName)) {
+ MAIN:
+ for (Element e : node.select(tagName)) {
 Element tmpEl = e;
 // check all elements until 'node'
 while (tmpEl != null && !tmpEl.equals(node)) {
@@ -108,18 +117,56 @@ public class OutputFormatter {
 }

 String text = node2Text(e);
- if (text.isEmpty() || text.length() < minParagraphText
- || text.length() > SHelper.countLetters(text) * 2)
+ if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex)
+ || text.length() > SHelper.countLetters(text) * 2) {
 continue;
+ }
+
+ if (e.tagName().equals("p")) {
+ countOfP++;
+ }

 sb.append(text);
 sb.append("\n\n");
+ paragraphWithTextIndex += 1;
+ }
+
+ return countOfP;
+ }
+
+ protected static void setParagraphIndex(Element node, String tagName) {
+ int paragraphIndex = 0;
+ for (Element e : node.select(tagName)) {
+ e.attr("paragraphIndex", Integer.toString(paragraphIndex++));
+ }
+ }
+
+ protected int getMinParagraph(int paragraphIndex) {
+ if (paragraphIndex < 1) {
+ return minFirstParagraphText;
+ } else {
+ return minParagraphText;
+ }
+ }
+
+ protected static int getParagraphIndex(Element el) {
+ try {
+ return Integer.parseInt(el.attr("paragraphIndex"));
+ } catch (NumberFormatException ex) {
+ return -1;
+ }
+ }
+
+ protected static int getScore(Element el) {
+ try {
+ return Integer.parseInt(el.attr("gravityScore"));
+ } catch (Exception ex) {
+ return 0;
+ }
 }

 boolean unlikely(Node e) {
- if (e.attr("class") != null
- && e.attr("class").toLowerCase(Locale.getDefault()).contains("caption"))
+ if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
 return true;

 String style = e.attr("style");
@@ -127,36 +174,34 @@
 return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
 }

- void appendTextSkipHidden(Element e, StringBuilder accum) {
+ void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
 for (Node child : e.childNodes()) {
- if (unlikely(child))
+ if (unlikely(child)) {
 continue;
+ }
 if (child instanceof TextNode) {
 TextNode textNode = (TextNode) child;
 String txt = textNode.text();
 accum.append(txt);
 } else if (child instanceof Element) {
 Element element = (Element) child;
- if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
+ if (accum.length() > 0 && element.isBlock()
+ && !lastCharIsWhitespace(accum))
 accum.append(' ');
 else if (element.tagName().equals("br"))
 accum.append(' ');
- appendTextSkipHidden(element, accum);
+ appendTextSkipHidden(element, accum, indent + 1);
 }
 }
 }

- boolean lastCharIsWhitespace(StringBuilder accum) {
- return (accum.length() != 0) && Character.isWhitespace(accum.charAt(accum.length() - 1));
- }
-
- protected String node2TextOld(Element el) {
- return el.text();
+ static boolean lastCharIsWhitespace(StringBuilder accum) {
+ return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1));
 }

 protected String node2Text(Element el) {
 StringBuilder sb = new StringBuilder(200);
- appendTextSkipHidden(el, sb);
+ appendTextSkipHidden(el, sb, 0);
 return sb.toString();
 }

@@ -168,4 +213,4 @@
 public OutputFormatter appendUnlikelyPattern(String str) {
 return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str);
 }
-}
+}
\ No newline at end of file
diff --git a/app/src/main/java/acr/browser/lightning/reading/SHelper.java b/app/src/main/java/acr/browser/lightning/reading/SHelper.java
index 7274219..a71acdc 100644
--- a/app/src/main/java/acr/browser/lightning/reading/SHelper.java
+++ b/app/src/main/java/acr/browser/lightning/reading/SHelper.java
@@ -15,17 +15,19 @@
 */
package acr.browser.lightning.reading;

+import org.jsoup.nodes.Element;
+
import java.io.UnsupportedEncodingException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
-import java.text.SimpleDateFormat;
-import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -33,10 +35,8 @@ import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
-import org.jsoup.nodes.Element;

/**
- *
 * @author Peter Karich
 */
public class SHelper {
@@ -127,8 +127,7 @@ public class SHelper {
 return null;

 // dynamic programming => save already identical length into array
- // to understand this algo simply print identical length in every entry
- // of the array
+ // to understand this algo simply print identical length in every entry of the array
 // i+1, j+1 then reuses information from i,j
 // java initializes them already with 0
 int[][] num = new int[str1.length()][str2.length()];
@@ -152,7 +151,7 @@
 }
 }
 }
- return new int[] { lastSubstrBegin, endIndex };
+ return new int[]{lastSubstrBegin, endIndex};
 }

 public static String getDefaultFavicon(String url) {
@@ -160,35 +159,19 @@
 }

 /**
- * @param urlForDomain
- * extract the domain from this url
- * @param path
- * this url does not have a domain
- * @return returns the domain
+ * @param urlForDomain extract the domain from this url
+ * @param path the (possibly relative) url that lacks a domain
+ * @return the absolute url, or the unchanged path if it cannot be resolved
 */
 public static String useDomainOfFirstArg4Second(String urlForDomain, String path) {
- if (path.startsWith("http"))
+ try {
+ // See: http://stackoverflow.com/questions/1389184/building-an-absolute-url-from-a-relative-url-in-java
+ URL baseUrl = new URL(urlForDomain);
+ URL relativeUrl = new URL(baseUrl, path);
+ return relativeUrl.toString();
+ } catch (MalformedURLException ex) {
 return path;
-
- if ("favicon.ico".equals(path))
- path = "/favicon.ico";
-
- if (path.startsWith("//")) {
- // wikipedia special case, see tests
- if (urlForDomain.startsWith("https:"))
- return "https:" + path;
-
- return "http:" + path;
- } else if (path.startsWith("/"))
- return "http://" + extractHost(urlForDomain) + path;
- else if (path.startsWith("../")) {
- int slashIndex = urlForDomain.lastIndexOf("/");
- if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length())
- urlForDomain = urlForDomain.substring(0, slashIndex + 1);
-
- return urlForDomain + path;
 }
- return path;
 }

 public static String extractHost(String url) {
@@ -224,14 +207,12 @@
 }

 public static boolean isVideo(String url) {
- return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi")
url.endsWith(".avi") - || url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4") - || url.endsWith(".flv") || url.endsWith(".wmv"); + return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") || url.endsWith(".mov") + || url.endsWith(".mpg4") || url.endsWith(".mp4") || url.endsWith(".flv") || url.endsWith(".wmv"); } public static boolean isAudio(String url) { - return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") - || url.endsWith(".wav"); + return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") || url.endsWith(".wav"); } public static boolean isDoc(String url) { @@ -241,23 +222,20 @@ public class SHelper { public static boolean isPackage(String url) { return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip") - || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") - || url.endsWith(".7z"); + || url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") || url.endsWith(".7z"); } public static boolean isApp(String url) { - return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") - || url.endsWith(".dmg"); + return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") || url.endsWith(".dmg"); } public static boolean isImage(String url) { return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif") - || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") - || url.endsWith(".eps"); + || url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") || url.endsWith(".eps"); } /** - * http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se + * @see "http://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se" */ public static void enableCookieMgmt() { CookieManager manager = new CookieManager(); @@ -266,7 +244,7 @@ public class SHelper { } /** - * http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection + * @see "http://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java-urlconnection" */ public static void enableUserAgentOverwrite() { System.setProperty("http.agent", ""); @@ -377,8 +355,8 @@ public class SHelper { } else if (counter == monthCounter + 1) { try { day = Integer.parseInt(str); - } catch (Exception ex) { - ex.printStackTrace(); + } catch (Exception ignored) { + // ignored } if (day < 1 || day > 31) { day = -1; @@ -425,21 +403,11 @@ public class SHelper { return dateStr + "/01/01"; } - /** - * keep in mind: simpleDateFormatter is not thread safe! call completeDate - * before applying this formatter. 
- */ - public static SimpleDateFormat createDateFormatter() { - return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault()); - } - - // with the help of - // http://stackoverflow.com/questions/1828775/httpclient-and-ssl + // with the help of http://stackoverflow.com/questions/1828775/httpclient-and-ssl public static void enableAnySSL() { try { SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() }, - new SecureRandom()); + ctx.init(new KeyManager[0], new TrustManager[]{new DefaultTrustManager()}, new SecureRandom()); SSLContext.setDefault(ctx); } catch (Exception ex) { ex.printStackTrace(); @@ -449,13 +417,11 @@ public class SHelper { private static class DefaultTrustManager implements X509TrustManager { @Override - public void checkClientTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { + public void checkClientTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { } @Override - public void checkServerTrusted(X509Certificate[] arg0, String arg1) - throws CertificateException { + public void checkServerTrusted(X509Certificate[] arg0, String arg1) throws CertificateException { } @Override @@ -473,4 +439,4 @@ public class SHelper { } return chars; } -} +} \ No newline at end of file
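
Usage note: with textList gone from JResult, callers consume the extraction result through plain string accessors, and outgoing links are exposed as maps. A minimal sketch of the new surface; the article url is made up, the no-arg HtmlFetcher constructor is assumed from snacktory, and the fetchAndExtract(url, 2500, true) call mirrors ReadingActivity's PageLoader:

    import acr.browser.lightning.reading.HtmlFetcher;
    import acr.browser.lightning.reading.JResult;

    import java.util.Map;

    public class JResultExample {
        public static void main(String[] args) throws Exception {
            HtmlFetcher fetcher = new HtmlFetcher();
            // hypothetical url; 2500 ms timeout with redirect resolution, as in PageLoader
            JResult result = fetcher.fetchAndExtract("http://example.com/article.html", 2500, true);

            System.out.println(result.getTitle());      // "" rather than null when absent
            System.out.println(result.getAuthorName()); // likewise null-safe
            System.out.println(result.getText());       // paragraphs already joined with \n\n

            // addLink(url, text, pos) stores each link as a map keyed "url", "text", "offset"
            for (Map<String, String> link : result.getLinks()) {
                System.out.println(link.get("offset") + " -> " + link.get("url"));
            }
        }
    }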
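The acceptance test in getFormattedText is the piece most worth illustrating: the \n\n-joined candidate string is only trusted when it is longer than 100 characters, contains at least one real p element, and covers at least a quarter of the top node's text; anything else falls back to topNode.text(). On top of that, the first paragraph must reach 50 characters and later ones 30. A self-contained sketch against jsoup, with an invented HTML snippet:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    import acr.browser.lightning.reading.OutputFormatter;

    public class FormatterExample {
        public static void main(String[] args) {
            Document doc = Jsoup.parse("<div>"
                    + "<p>A first paragraph that is comfortably longer than the fifty-character minimum.</p>"
                    + "<p>short</p>" // dropped: under the 30-character bar for later paragraphs
                    + "<p>A second paragraph that clears the thirty-character minimum easily.</p>"
                    + "</div>");

            // default thresholds: 50 chars for the first paragraph, 30 for the rest
            OutputFormatter formatter = new OutputFormatter();
            System.out.println(formatter.getFormattedText(doc.body()));
            // prints the first and third paragraphs separated by a blank line
        }
    }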
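Delegating useDomainOfFirstArg4Second to java.net.URL collapses the old hand-rolled prefix handling ("//", "/", "../") into one resolution path, and a protocol-relative url now inherits the base scheme, which covers the former wikipedia special case. A quick sketch with invented urls:

    import acr.browser.lightning.reading.SHelper;

    public class UrlResolveExample {
        public static void main(String[] args) {
            // absolute path resolved against the page's host
            System.out.println(SHelper.useDomainOfFirstArg4Second(
                    "http://example.com/news/story.html", "/favicon.ico"));
            // -> http://example.com/favicon.ico

            // protocol-relative url inherits the base scheme
            System.out.println(SHelper.useDomainOfFirstArg4Second(
                    "https://en.wikipedia.org/wiki/Foo", "//upload.wikimedia.org/logo.png"));
            // -> https://upload.wikimedia.org/logo.png

            // a malformed base falls back to returning the path untouched
            System.out.println(SHelper.useDomainOfFirstArg4Second("not a url", "/favicon.ico"));
            // -> /favicon.ico
        }
    }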