Lightning browser with I2P configuration
package acr.browser.lightning.reading;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Date;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector.SelectorParseException;
/**
* Class for content extraction from the string form of a webpage.
* 'extractContent' is the main call from external programs/classes.
* This class is thread safe.
*
* @author Alex P (ifesdjeen from jreadability)
* @author Peter Karich
*/
public class ArticleTextExtractor {
// Interesting nodes
private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
// Unlikely candidates
private String unlikelyStr;
private Pattern UNLIKELY;
// Most likely positive candidates
private String positiveStr;
private Pattern POSITIVE;
// Most likely negative candidates
private String negativeStr;
private Pattern NEGATIVE;
private static final Pattern NEGATIVE_STYLE =
Pattern.compile("hidden|display: ?none|font-size: ?small");
private static final Pattern IGNORE_AUTHOR_PARTS =
Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE);
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
{
add("hacker news");
add("facebook");
add("home");
add("articles");
}
};
private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
private OutputFormatter formatter = DEFAULT_FORMATTER;
private static final int MAX_AUTHOR_NAME_LENGTH = 255;
private static final int MIN_AUTHOR_NAME_LENGTH = 4;
private static final List<Pattern> CLEAN_AUTHOR_PATTERNS = Collections.singletonList(
Pattern.compile("By\\S*(.*)[\\.,].*")
);
private static final int MAX_AUTHOR_DESC_LENGTH = 1000;
private static final int MAX_IMAGE_LENGTH = 255;
// For debugging
private static final boolean DEBUG_WEIGHTS = false;
private static final int MAX_LOG_LENGTH = 200;
public ArticleTextExtractor() {
setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
+ "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
+ "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
+ "login|si(debar|gn|ngle)");
setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
+ "|arti(cle|kel)|instapaper_body");
setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
}
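// A sketch of how the heuristics above can be extended at runtime via the
// add* methods below (the class names are purely illustrative, not taken
// from any particular site):
//
//   ArticleTextExtractor extractor = new ArticleTextExtractor()
//           .addPositive("entry-content")
//           .addNegative("cookie-banner");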
private ArticleTextExtractor setUnlikely(String unlikelyStr) {
this.unlikelyStr = unlikelyStr;
UNLIKELY = Pattern.compile(unlikelyStr);
return this;
}
public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
return setUnlikely(unlikelyStr + '|' + unlikelyMatches);
}
private ArticleTextExtractor setPositive(String positiveStr) {
this.positiveStr = positiveStr;
POSITIVE = Pattern.compile(positiveStr);
return this;
}
public ArticleTextExtractor addPositive(String pos) {
return setPositive(positiveStr + '|' + pos);
}
private ArticleTextExtractor setNegative(String negativeStr) {
this.negativeStr = negativeStr;
NEGATIVE = Pattern.compile(negativeStr);
return this;
}
public ArticleTextExtractor addNegative(String neg) {
setNegative(negativeStr + '|' + neg);
return this;
}
public void setOutputFormatter(OutputFormatter formatter) {
this.formatter = formatter;
}
/**
* Extracts article text from the given HTML string. Wasn't tested with
* improper HTML, although jsoup should be able to handle minor stuff.
*
* @param html the HTML of the page to extract from
* @param maxContentSize maximum length of the extracted text; 0 means unlimited
* @return extracted article, all HTML tags stripped
*/
public JResult extractContent(String html, int maxContentSize) throws Exception {
return extractContent(new JResult(), html, maxContentSize);
}
public JResult extractContent(String html) throws Exception {
return extractContent(new JResult(), html, 0);
}
public JResult extractContent(JResult res, String html, int maxContentSize) throws Exception {
return extractContent(res, html, formatter, true, maxContentSize);
}
public JResult extractContent(JResult res, String html) throws Exception {
return extractContent(res, html, formatter, true, 0);
}
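// A minimal usage sketch of the public API above ('htmlString' is an
// illustrative variable; getTitle()/getText() are the JResult getters
// matching the setters used in this class):
//
//   ArticleTextExtractor extractor = new ArticleTextExtractor();
//   JResult res = extractor.extractContent(htmlString, 10000);
//   String title = res.getTitle();
//   String body = res.getText(); // tags stripped, truncated to 10000 bytes of UTF-8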
private JResult extractContent(JResult res, String html, OutputFormatter formatter,
boolean extractimages, int maxContentSize) throws Exception {
if (html.isEmpty())
throw new IllegalArgumentException("html string is empty!?");
// http://jsoup.org/cookbook/extracting-data/selector-syntax
return extractContent(res, Jsoup.parse(html), formatter, extractimages, maxContentSize);
}
// Returns the best node match based on the weights (see getWeight for strategy)
private Element getBestMatchElement(Collection<Element> nodes) {
int maxWeight = -200; // start well below zero so a best match is still chosen when every candidate weighs negative
Element bestMatchElement = null;
boolean ignoreMaxWeightLimit = false;
for (Element entry : nodes) {
int currentWeight = getWeight(entry, false);
if (currentWeight > maxWeight) {
maxWeight = currentWeight;
bestMatchElement = entry;
/*
// NOTE: This optimization fails with large pages that contain chunks
// of text that can be mistaken for articles; since we want the best
// accuracy possible, I am disabling it for now. AP.
// The original code had a limit of 200; the intention was that if a
// node had a weight greater than that, it was most likely the main
// content.
// However, this assumption fails when the amount of text in the
// children (or grandchildren) is too large. If we detect this case
// then the limit is ignored and we try all the nodes to select the
// one with the absolute maximum weight.
if (maxWeight > 500) {
ignoreMaxWeightLimit = true;
continue;
}
// formerly 200, increased to 250 to account for the fact that we are
// not adding the weights of the grandchildren to the tally
if (maxWeight > 250 && !ignoreMaxWeightLimit)
break;
*/
}
}
return bestMatchElement;
}
private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
boolean extractimages, int maxContentSize) throws Exception {
Document origDoc = doc.clone();
JResult result = extractContent(res, doc, formatter, extractimages, maxContentSize, true);
//System.out.println("result.getText().length()="+result.getText().length());
if (result.getText().isEmpty()) {
result = extractContent(res, origDoc, formatter, extractimages, maxContentSize, false);
}
return result;
}
// main workhorse
private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
boolean extractimages, int maxContentSize, boolean cleanScripts) {
if (doc == null)
throw new NullPointerException("missing document");
// get the easy stuff
res.setTitle(extractTitle(doc));
res.setDescription(extractDescription(doc));
res.setCanonicalUrl(extractCanonicalUrl(doc));
res.setType(extractType(doc));
res.setSitename(extractSitename(doc));
res.setLanguage(extractLanguage(doc));
// get author information
res.setAuthorName(extractAuthorName(doc));
res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName()));
// add extra selection gravity to any element containing author name
// wasn't useful in the case I implemented it for, but might be later
/*
Elements authelems = doc.select(":containsOwn(" + res.getAuthorName() + ")");
for (Element elem : authelems) {
elem.attr("extragravityscore", Integer.toString(100));
System.out.println("modified element " + elem.toString());
}
*/
// get date from the document; if not present, try to extract it from the URL
Date docdate = extractDate(doc);
if (docdate == null) {
String dateStr = SHelper.estimateDate(res.getUrl());
docdate = parseDate(dateStr);
}
res.setDate(docdate);
// now remove the clutter
if (cleanScripts) {
prepareDocument(doc);
}
// init elements and get the one with highest weight (see getWeight for strategy)
Collection<Element> nodes = getNodes(doc);
Element bestMatchElement = getBestMatchElement(nodes);
// do extraction from the best element
if (bestMatchElement != null) {
if (extractimages) {
List<ImageResult> images = new ArrayList<>();
Element imgEl = determineImageSource(bestMatchElement, images);
if (imgEl != null) {
res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
// TODO remove parent container of image if it is contained in bestMatchElement
// to avoid image subtitles flooding in
res.setImages(images);
}
}
// clean before grabbing text
String text = formatter.getFormattedText(bestMatchElement);
text = removeTitleFromText(text, res.getTitle());
// this fails for short facebook posts and probably tweets: text.length() > res.getDescription().length()
if (text.length() > res.getTitle().length()) {
if (maxContentSize > 0) {
if (text.length() > maxContentSize) {
text = utf8truncate(text, maxContentSize);
}
}
res.setText(text);
}
// extract links from the same best element
String fullhtml = bestMatchElement.toString();
Elements children = bestMatchElement.select("a[href]"); // a with href = link
String linkstr;
int linkpos;
int lastlinkpos = 0;
for (Element child : children) {
linkstr = child.toString();
linkpos = fullhtml.indexOf(linkstr, lastlinkpos);
res.addLink(child.attr("abs:href"), child.text(), linkpos);
lastlinkpos = linkpos;
}
}
if (extractimages && res.getImageUrl().isEmpty()) {
res.setImageUrl(extractImageUrl(doc));
}
res.setRssUrl(extractRssUrl(doc));
res.setVideoUrl(extractVideoUrl(doc));
res.setFaviconUrl(extractFaviconUrl(doc));
res.setKeywords(extractKeywords(doc));
// Sanity checks in author
if (res.getAuthorName().length() > MAX_AUTHOR_NAME_LENGTH) {
res.setAuthorName(utf8truncate(res.getAuthorName(), MAX_AUTHOR_NAME_LENGTH));
}
// Sanity checks in author description.
String authorDescSnippet = getSnippet(res.getAuthorDescription());
if (getSnippet(res.getText()).equals(authorDescSnippet) ||
getSnippet(res.getDescription()).equals(authorDescSnippet)) {
res.setAuthorDescription("");
} else {
if (res.getAuthorDescription().length() > MAX_AUTHOR_DESC_LENGTH) {
res.setAuthorDescription(utf8truncate(res.getAuthorDescription(), MAX_AUTHOR_DESC_LENGTH));
}
}
// Sanity checks in image name
if (res.getImageUrl().length() > MAX_IMAGE_LENGTH) {
// doesn't make sense to truncate a URL
res.setImageUrl("");
}
return res;
}
private static String getSnippet(String data) {
if (data.length() < 50)
return data;
else
return data.substring(0, 50);
}
private static String extractTitle(Document doc) {
String title = cleanTitle(doc.title());
if (title.isEmpty()) {
title = SHelper.innerTrim(doc.select("head title").text());
if (title.isEmpty()) {
title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
if (title.isEmpty()) {
title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
if (title.isEmpty()) {
title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
if (title.isEmpty()) {
title = SHelper.innerTrim(doc.select("h1:first-of-type").text());
}
}
}
}
}
return title;
}
private static String extractCanonicalUrl(Document doc) {
String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
if (url.isEmpty()) {
url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
if (url.isEmpty()) {
url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
}
}
return url;
}
private static String extractDescription(Document doc) {
String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
if (description.isEmpty()) {
description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content"));
if (description.isEmpty()) {
description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content"));
}
}
return description;
}
// Returns the publication Date or null
private static Date extractDate(Document doc) {
String dateStr = "";
// try some locations that nytimes uses
Element elem = doc.select("meta[name=ptime]").first();
if (elem != null) {
dateStr = SHelper.innerTrim(elem.attr("content"));
// elem.attr("extragravityscore", Integer.toString(100));
// System.out.println("date modified element " + elem.toString());
}
if (dateStr.isEmpty()) {
dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content"));
}
if (dateStr.isEmpty()) {
dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content"));
}
if (dateStr.isEmpty()) {
dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content"));
}
// return the date found in the simple meta tags above, if any
if (!dateStr.isEmpty()) {
return parseDate(dateStr);
}
// taking this stuff directly from Juicer (and converted to Java)
// opengraph (?)
Elements elems = doc.select("meta[property=article:published_time]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
try {
if (dateStr.endsWith("Z")) {
dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00";
} else {
// insert "GMT" before the numeric offset, e.g. "+01:00" -> "GMT+01:00"
dateStr = String.format("%sGMT%s", dateStr.substring(0, dateStr.length() - 6),
dateStr.substring(dateStr.length() - 6));
}
} catch (StringIndexOutOfBoundsException ex) {
// do nothing
}
return parseDate(dateStr);
}
}
// rnews
elems = doc.select("meta[property=dateCreated], span[property=dateCreated]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
return parseDate(dateStr);
} else {
return parseDate(el.text());
}
}
// schema.org creativework
elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
return parseDate(dateStr);
} else if (el.hasAttr("value")) {
dateStr = el.attr("value");
return parseDate(dateStr);
} else {
return parseDate(el.text());
}
}
// parsely page (?)
/* skip conversion for now, seems highly specific and uses new lib
elems = doc.select("meta[name=parsely-page]");
if (elems.size() > 0) {
implicit val formats = net.liftweb.json.DefaultFormats
Element el = elems.get(0);
if(el.hasAttr("content")) {
val json = parse(el.attr("content"))
return DateUtils.parseDateStrictly((json \ "pub_date").extract[String], Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz"))
}
}
*/
// BBC
elems = doc.select("meta[name=OriginalPublicationDate]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
return parseDate(dateStr);
}
}
// wired
elems = doc.select("meta[name=DisplayDate]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
return parseDate(dateStr);
}
}
// wildcard
elems = doc.select("meta[name*=date]");
if (!elems.isEmpty()) {
Element el = elems.get(0);
if (el.hasAttr("content")) {
dateStr = el.attr("content");
return parseDate(dateStr);
}
}
// blogger
elems = doc.select(".date-header");
if (!elems.isEmpty()) {
Element el = elems.get(0);
dateStr = el.text();
return parseDate(dateStr);
}
return null;
}
private static Date parseDate(String dateStr) {
// String[] parsePatterns = {
// "yyyy-MM-dd'T'HH:mm:ssz",
// "yyyy-MM-dd HH:mm:ss",
// "yyyy/MM/dd HH:mm:ss",
// "yyyy-MM-dd HH:mm",
// "yyyy/MM/dd HH:mm",
// "yyyy-MM-dd",
// "yyyy/MM/dd",
// "MM/dd/yyyy HH:mm:ss",
// "MM-dd-yyyy HH:mm:ss",
// "MM/dd/yyyy HH:mm",
// "MM-dd-yyyy HH:mm",
// "MM/dd/yyyy",
// "MM-dd-yyyy",
// "EEE, MMM dd, yyyy",
// "MM/dd/yyyy hh:mm:ss a",
// "MM-dd-yyyy hh:mm:ss a",
// "MM/dd/yyyy hh:mm a",
// "MM-dd-yyyy hh:mm a",
// "yyyy-MM-dd hh:mm:ss a",
// "yyyy/MM/dd hh:mm:ss a ",
// "yyyy-MM-dd hh:mm a",
// "yyyy/MM/dd hh:mm ",
// "dd MMM yyyy",
// "dd MMMM yyyy",
// "yyyyMMddHHmm",
// "yyyyMMdd HHmm",
// "dd-MM-yyyy HH:mm:ss",
// "dd/MM/yyyy HH:mm:ss",
// "dd MMM yyyy HH:mm:ss",
// "dd MMMM yyyy HH:mm:ss",
// "dd-MM-yyyy HH:mm",
// "dd/MM/yyyy HH:mm",
// "dd MMM yyyy HH:mm",
// "dd MMMM yyyy HH:mm",
// "yyyyMMddHHmmss",
// "yyyyMMdd HHmmss",
// "yyyyMMdd"
// };
//
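// Pattern-based parsing (the list above) is disabled in this port, so
// every caller currently receives the epoch as a placeholder date.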
return new Date(0);
// try {
// return DateUtils.parseDateStrictly(dateStr, parsePatterns);
// } catch (Exception ex) {
// return null;
// }
}
// Returns the author name or an empty string
private String extractAuthorName(Document doc) {
String authorName = "";
// first try the Google Author tag
Element result = doc.select("body [rel*=author]").first();
if (result != null)
authorName = SHelper.innerTrim(result.ownText());
// if that doesn't work, try some other methods
if (authorName.isEmpty()) {
// meta tag approaches, get content
result = doc.select("head meta[name=author]").first();
if (result != null) {
authorName = SHelper.innerTrim(result.attr("content"));
}
if (authorName.isEmpty()) { // for "opengraph"
authorName = SHelper.innerTrim(doc.select("head meta[property=article:author]").attr("content"));
}
if (authorName.isEmpty()) { // OpenGraph twitter:creator tag
authorName = SHelper.innerTrim(doc.select("head meta[property=twitter:creator]").attr("content"));
}
if (authorName.isEmpty()) { // for "schema.org creativework"
authorName = SHelper.innerTrim(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content"));
}
// other hacks
if (authorName.isEmpty()) {
try {
// build up a set of elements which have likely author-related terms
// .X searches for class X
Elements matches = doc.select("a[rel=author],.byline-name,.byLineTag,.byline,.author,.by,.writer,.address");
if (matches == null || matches.isEmpty()) {
matches = doc.select("body [class*=author]");
}
if (matches == null || matches.isEmpty()) {
matches = doc.select("body [title*=author]");
}
// a hack for huffington post
if (matches == null || matches.isEmpty()) {
matches = doc.select(".staff_info dl a[href]");
}
// a hack for http://sports.espn.go.com/
if (matches == null || matches.isEmpty()) {
matches = doc.select("cite[class*=source]");
}
// select the best element from them
if (matches != null) {
Element bestMatch = getBestMatchElement(matches);
if (bestMatch != null) {
// use the element's own text first; fall back to the full text,
// including children, when the own text looks too short (assumed
// intent of the MIN_AUTHOR_NAME_LENGTH check)
authorName = bestMatch.ownText();
if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) {
authorName = bestMatch.text();
}
authorName = SHelper.innerTrim(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll(""));
if (authorName.contains(",")) {
authorName = authorName.split(",")[0];
}
}
}
} catch (Exception e) {
System.out.println(e.toString());
}
}
}
for (Pattern pattern : CLEAN_AUTHOR_PATTERNS) {
Matcher matcher = pattern.matcher(authorName);
if (matcher.matches()) {
authorName = SHelper.innerTrim(matcher.group(1));
break;
}
}
return authorName;
}
// Returns the author description or an empty string
private String extractAuthorDescription(Document doc, String authorName) {
String authorDesc = "";
if (authorName.isEmpty())
return "";
// Special case for entrepreneur.com
Elements matches = doc.select(".byline > .bio");
if (matches != null && !matches.isEmpty()) {
Element bestMatch = matches.first(); // assume it is the first.
authorDesc = bestMatch.text();
return authorDesc;
}
// Special case for huffingtonpost.com
matches = doc.select(".byline span[class*=teaser]");
if (matches != null && !matches.isEmpty()) {
Element bestMatch = matches.first(); // assume it is the first.
authorDesc = bestMatch.text();
return authorDesc;
}
try {
Elements nodes = doc.select(":containsOwn(" + authorName + ')');
Element bestMatch = getBestMatchElement(nodes);
if (bestMatch != null)
authorDesc = bestMatch.text();
} catch (SelectorParseException se) {
// Avoid error when selector is invalid
}
return authorDesc;
}
private static Collection<String> extractKeywords(Document doc) {
String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
if (content.startsWith("[") && content.endsWith("]"))
content = content.substring(1, content.length() - 1);
String[] split = content.split("\\s*,\\s*");
if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty()))
return Arrays.asList(split);
return Collections.emptyList();
}
/**
* Tries to extract an image url from metadata if determineImageSource
* failed
*
* @return image url or empty str
*/
private static String extractImageUrl(Document doc) {
// use open graph tag to get image
String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
if (imageUrl.isEmpty()) {
imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content"));
if (imageUrl.isEmpty()) {
// prefer link over thumbnail-meta if empty
imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
if (imageUrl.isEmpty()) {
imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
}
}
}
return imageUrl;
}
private static String extractRssUrl(Document doc) {
return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
}
private static String extractVideoUrl(Document doc) {
return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
}
private static String extractFaviconUrl(Document doc) {
String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
if (faviconUrl.isEmpty()) {
faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
}
return faviconUrl;
}
private static String extractType(Document doc) {
return SHelper.innerTrim(doc.select("head meta[property=og:type]").attr("content"));
}
private static String extractSitename(Document doc) {
String sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content"));
if (sitename.isEmpty()) {
sitename = SHelper.innerTrim(doc.select("head meta[name=twitter:site]").attr("content"));
}
return sitename;
}
private static String extractLanguage(Document doc) {
String language = SHelper.innerTrim(doc.select("head meta[property=language]").attr("content"));
if (language.isEmpty()) {
language = SHelper.innerTrim(doc.select("html").attr("lang"));
if (language.isEmpty()) {
language = SHelper.innerTrim(doc.select("head meta[property=og:locale]").attr("content"));
}
}
if (!language.isEmpty()) {
if (language.length() > 2) {
language = language.substring(0, 2);
}
}
return language;
}
/**
* Weights the current element by matching it against the positive
* candidates and by weighting its child nodes. Since it is impossible
* to predict exactly which names, ids or class names will be used in
* the HTML, the major role is played by the child nodes.
*
* @param e Element to weight, along with its child nodes
*/
private int getWeight(Element e, boolean checkextra) {
int weight = calcWeight(e);
int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10);
weight += ownTextWeight;
int childrenWeight = weightChildNodes(e);
weight += childrenWeight;
// add additional weight using possible 'extragravityscore' attribute
if (checkextra) {
Element xelem = e.select("[extragravityscore]").first();
if (xelem != null) {
// System.out.println("HERE found one: " + xelem.toString());
weight += Integer.parseInt(xelem.attr("extragravityscore"));
// System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore"));
}
}
return weight;
}
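// Worked example for getWeight (illustrative markup, not from any real
// site): a <div class="article" id="post"> with 300 characters of own
// text scores 35 (positive class) + 45 (positive id) + 30 (own text,
// 10 points per 100 chars) = 110, before weightChildNodes is added.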
/**
* Weights the child nodes of the given Element. During tests some
* difficulties were met; for instance, not every document nests
* paragraph tags inside of the major article tag, and sometimes people
* add one more nesting level. So we add roughly one point for every 35
* characters of text in a tag nested directly inside the currently
* weighted element, but only a third of that for text nested two levels
* deep. This favors elements with fewer nesting levels, increasing the
* probability of a correct extraction.
*
* @param rootEl Element whose child nodes will be weighted
*/
private int weightChildNodes(Element rootEl) {
int weight = 0;
Element caption = null;
List<Element> pEls = new ArrayList<>(5);
for (Element child : rootEl.children()) {
String ownText = child.ownText();
int ownTextLength = ownText.length();
if (ownTextLength < 20)
continue;
if (ownTextLength > 200) {
int childOwnTextWeight = Math.max(50, ownTextLength / 10);
weight += childOwnTextWeight;
}
if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
int h2h1Weight = 30;
weight += h2h1Weight;
} else if (child.tagName().equals("div") || child.tagName().equals("p")) {
int calcChildWeight = calcWeightForChild(child, ownText);
weight += calcChildWeight;
if (child.tagName().equals("p") && ownTextLength > 50)
pEls.add(child);
if (child.className().toLowerCase().equals("caption"))
caption = child;
}
}
//
// Visit the grandchildren of the node and calculate their weights.
// Note that grandchildren weights are only worth 1/3 of the children's.
//
int grandChildrenWeight = 0;
for (Element child2 : rootEl.children()) {
// If the node looks negative don't include it in the weights;
// instead penalize the grandparent. This is done to try to
// avoid giving weights to navigation nodes, etc.
if (NEGATIVE.matcher(child2.id()).find() ||
NEGATIVE.matcher(child2.className()).find()) {
grandChildrenWeight -= 30;
continue;
}
for (Element grandchild : child2.children()) {
int grandchildWeight = 0;
String ownText = grandchild.ownText();
int ownTextLength = ownText.length();
if (ownTextLength < 20)
continue;
if (ownTextLength > 200) {
int childOwnTextWeight = Math.max(50, ownTextLength / 10);
grandchildWeight += childOwnTextWeight;
}
if (grandchild.tagName().equals("h1") || grandchild.tagName().equals("h2")) {
int h2h1Weight = 30;
grandchildWeight += h2h1Weight;
} else if (grandchild.tagName().equals("div") || grandchild.tagName().equals("p")) {
int calcChildWeight = calcWeightForChild(grandchild, ownText);
grandchildWeight += calcChildWeight;
}
grandChildrenWeight += grandchildWeight;
}
}
grandChildrenWeight = grandChildrenWeight / 3;
weight += grandChildrenWeight;
// use caption and image
if (caption != null) {
int captionWeight = 30;
weight += captionWeight;
}
if (pEls.size() >= 2) {
for (Element subEl : rootEl.children()) {
if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
int h1h2h3Weight = 20;
weight += h1h2h3Weight;
// headerEls.add(subEl);
} else if ("table;li;td;th".contains(subEl.tagName())) {
addScore(subEl, -30);
}
if ("p".contains(subEl.tagName()))
addScore(subEl, 30);
}
}
return weight;
}
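// Worked example for weightChildNodes (illustrative): a direct child
// <p> with 105 characters of plain text and no markup entities gets
// calcWeightForChild = round(105 / 35.0) = 3; the same text found in a
// grandchild contributes only a third of that, via the
// grandChildrenWeight / 3 division above.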
private static void addScore(Element el, int score) {
int old = getScore(el);
setScore(el, score + old);
}
private static int getScore(Element el) {
int old = 0;
try {
old = Integer.parseInt(el.attr("gravityScore"));
} catch (Exception ignored) {
}
return old;
}
private static void setScore(Element el, int score) {
el.attr("gravityScore", Integer.toString(score));
}
private static int calcWeightForChild(Element child, String ownText) {
int c = SHelper.count(ownText, "&quot;");
c += SHelper.count(ownText, "&lt;");
c += SHelper.count(ownText, "&gt;");
c += SHelper.count(ownText, "px");
int val;
if (c > 5)
val = -30;
else
val = (int) Math.round(ownText.length() / 35.0);
addScore(child, val);
return val;
}
private int calcWeight(Element e) {
int weight = 0;
if (POSITIVE.matcher(e.className()).find())
weight += 35;
if (POSITIVE.matcher(e.id()).find())
weight += 45;
if (UNLIKELY.matcher(e.className()).find())
weight -= 20;
if (UNLIKELY.matcher(e.id()).find())
weight -= 20;
if (NEGATIVE.matcher(e.className()).find())
weight -= 50;
if (NEGATIVE.matcher(e.id()).find())
weight -= 50;
String style = e.attr("style");
if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
weight -= 50;
String itemprop = e.attr("itemprop");
if (itemprop != null && !itemprop.isEmpty() && POSITIVE.matcher(itemprop).find()) {
weight += 100;
}
return weight;
}
private static Element determineImageSource(Element el, List<ImageResult> images) {
int maxWeight = 0;
Element maxNode = null;
Elements els = el.select("img");
if (els.isEmpty())
els = el.parent().select("img");
double score = 1;
for (Element e : els) {
String sourceUrl = e.attr("src");
if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
continue;
int weight = 0;
int height = 0;
try {
height = Integer.parseInt(e.attr("height"));
if (height >= 50)
weight += 20;
else
weight -= 20;
} catch (Exception ignored) {
}
int width = 0;
try {
width = Integer.parseInt(e.attr("width"));
if (width >= 50)
weight += 20;
else
weight -= 20;
} catch (Exception ignored) {
}
String alt = e.attr("alt");
if (alt.length() > 35)
weight += 20;
String title = e.attr("title");
if (title.length() > 35)
weight += 20;
boolean noFollow = false;
if (e.parent() != null) {
String rel = e.parent().attr("rel");
if (rel.contains("nofollow")) {
noFollow = true;
weight -= 40;
}
}
weight = (int) (weight * score);
if (weight > maxWeight) {
maxWeight = weight;
maxNode = e;
score = score / 2;
}
ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
images.add(image);
}
Collections.sort(images, new ImageComparator());
return maxNode;
}
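// Note on the scoring above: 'score' halves every time a new best image
// is found, so later candidates need progressively higher raw weights to
// replace an earlier match; in practice the first reasonably sized image
// in the best element usually wins.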
/**
* Prepares the document. Currently only removes scripts and styles;
* stripping unlikely candidates is disabled, since from time to time
* they score higher than good candidates, especially when the major
* text is short.
*
* @param doc document to prepare. Passed as reference and modified
* inside the function.
*/
private static void prepareDocument(Document doc) {
// stripUnlikelyCandidates(doc);
removeScriptsAndStyles(doc);
}
/**
* Removes unlikely candidates from HTML. Currently takes id and class name
* and matches them against list of patterns
*
* @param doc document to strip unlikely candidates from
*/
protected void stripUnlikelyCandidates(Document doc) {
for (Element child : doc.select("body").select("*")) {
String className = child.className().toLowerCase();
String id = child.id().toLowerCase();
if (NEGATIVE.matcher(className).find()
|| NEGATIVE.matcher(id).find()) {
child.remove();
}
}
}
private static Document removeScriptsAndStyles(Document doc) {
Elements scripts = doc.getElementsByTag("script");
for (Element item : scripts) {
item.remove();
}
Elements noscripts = doc.getElementsByTag("noscript");
for (Element item : noscripts) {
item.remove();
}
Elements styles = doc.getElementsByTag("style");
for (Element style : styles) {
style.remove();
}
return doc;
}
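// crude heuristic: a URL containing "ad" at least twice is assumed to
// point at an advertisement image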
private static boolean isAdImage(String imageUrl) {
return SHelper.count(imageUrl, "ad") >= 2;
}
/**
* Match only exact occurrences, as longestSubstring can be too fuzzy;
* currently a no-op because even exact removal made the text terrible
* to read (see the disabled code below).
*/
private static String removeTitleFromText(String text, String title) {
// don't do this as its terrible to read
// int index1 = text.toLowerCase().indexOf(title.toLowerCase());
// if (index1 >= 0)
// text = text.substring(index1 + title.length());
// return text.trim();
return text;
}
/**
* Based on a delimiter in the title, take the longest piece or do some
* custom logic based on the site
*
* @param title the title to split
* @param delimiter the regex to split on
* @return the chosen title fragment
*/
private static String doTitleSplits(String title, String delimiter) {
String largeText = "";
int largestTextLen = 0;
String[] titlePieces = title.split(delimiter);
// take the largest split
for (String p : titlePieces) {
if (p.length() > largestTextLen) {
largeText = p;
largestTextLen = p.length();
}
}
largeText = largeText.replace("&raquo;", " ");
largeText = largeText.replace("»", " ");
return largeText.trim();
}
/**
* @return a set of all important nodes
*/
private static Collection<Element> getNodes(Document doc) {
Map<Element, Object> nodes = new LinkedHashMap<>(64);
int score = 100;
for (Element el : doc.select("body").select("*")) {
if (NODES.matcher(el.tagName()).matches()) {
nodes.put(el, null);
setScore(el, score);
score = score / 2;
}
}
return nodes.keySet();
}
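// Note: each candidate is seeded with a decaying gravityScore attribute
// (100, 50, 25, ...) in document order; that attribute is adjusted further
// by addScore/calcWeightForChild but is not read back by getWeight itself
// within this class.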
private static String cleanTitle(String title) {
// int index = title.lastIndexOf("|");
// if (index > 0 && title.length() / 2 < index)
// title = title.substring(0, index + 1);
int counter = 0;
String[] strs = title.split("\\|");
StringBuilder res = new StringBuilder(title.length());
for (String part : strs) {
if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
continue;
if (counter == strs.length - 1 && res.length() > part.length())
continue;
if (counter > 0)
res.append('|');
res.append(part);
counter++;
}
return SHelper.innerTrim(res.toString());
}
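// Worked example for cleanTitle (illustrative input): a document titled
// "Some Article | Facebook" splits on '|', the "facebook" part is in
// IGNORED_TITLE_PARTS and is dropped, and "Some Article" is returned.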
/**
* Truncate a Java string so that its UTF-8 representation will not
* exceed the specified number of bytes.
* <p/>
* For discussion of why you might want to do this, see
* http://lpar.ath0.com/2011/06/07/unicode-alchemy-with-db2/
*/
private static String utf8truncate(String input, int length) {
StringBuilder result = new StringBuilder(length);
int resultlen = 0;
for (int i = 0; i < input.length(); i++) {
char c = input.charAt(i);
int charlen = 0;
if (c <= 0x7f) {
charlen = 1;
} else if (c <= 0x7ff) {
charlen = 2;
} else if (c <= 0xd7ff) {
charlen = 3;
} else if (c <= 0xdbff) {
charlen = 4;
} else if (c <= 0xdfff) {
charlen = 0;
} else {
charlen = 3;
}
if (resultlen + charlen > length) {
break;
}
result.append(c);
resultlen += charlen;
}
return result.toString();
}
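// Worked example for utf8truncate (illustrative): "héllo" truncated to 4
// bytes keeps "hél", since 'h' and 'l' are 1 byte each and 'é' (U+00E9)
// is 2 bytes in UTF-8; appending the second 'l' would exceed the limit.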
/**
* Comparator for Image by weight
*
* @author Chris Alexander, chris@chris-alexander.co.uk
*/
private static class ImageComparator implements Comparator<ImageResult> {
@Override
public int compare(ImageResult o1, ImageResult o2) {
// Returns the highest weight first
return o2.weight.compareTo(o1.weight);
}
}
}