package acr.browser.lightning.reading;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector.SelectorParseException;

/**
 * This class is thread safe.
 * Extracts article content from the string form of a web page;
 * 'extractContent' is the main entry point for external programs/classes.
 *
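 * <p>A minimal usage sketch; error handling omitted:
 * <pre>{@code
 * ArticleTextExtractor extractor = new ArticleTextExtractor();
 * JResult res = extractor.extractContent(htmlString);
 * String title = res.getTitle();
 * String text = res.getText();
 * }</pre>
 *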
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */
public class ArticleTextExtractor {

    // Interesting nodes
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
    // Unlikely candidates
    private String unlikelyStr;
    private Pattern UNLIKELY;
    // Most likely positive candidates
    private String positiveStr;
    private Pattern POSITIVE;
    // Most likely negative candidates
    private String negativeStr;
    private Pattern NEGATIVE;
    private static final Pattern NEGATIVE_STYLE =
            Pattern.compile("hidden|display: ?none|font-size: ?small");
    private static final Pattern IGNORE_AUTHOR_PARTS =
            Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE);
    private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
        {
            add("hacker news");
            add("facebook");
            add("home");
            add("articles");
        }
    };
    private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
    private OutputFormatter formatter = DEFAULT_FORMATTER;

    private static final int MAX_AUTHOR_NAME_LENGTH = 255;
    private static final int MIN_AUTHOR_NAME_LENGTH = 4;
    private static final List<Pattern> CLEAN_AUTHOR_PATTERNS = Collections.singletonList(
            Pattern.compile("By\\S*(.*)[\\.,].*")
    );
    private static final int MAX_AUTHOR_DESC_LENGTH = 1000;
    private static final int MAX_IMAGE_LENGTH = 255;

    // For debugging
    private static final boolean DEBUG_WEIGHTS = false;
    private static final int MAX_LOG_LENGTH = 200;

    public ArticleTextExtractor() {
        setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
                + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor|"
                + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
                + "login|si(debar|gn|ngle)");
        setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
                + "|arti(cle|kel)|instapaper_body");
        setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
                + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
                + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
    }

    private ArticleTextExtractor setUnlikely(String unlikelyStr) {
        this.unlikelyStr = unlikelyStr;
        UNLIKELY = Pattern.compile(unlikelyStr);
        return this;
    }

    public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
        return setUnlikely(unlikelyStr + '|' + unlikelyMatches);
    }

    private ArticleTextExtractor setPositive(String positiveStr) {
        this.positiveStr = positiveStr;
        POSITIVE = Pattern.compile(positiveStr);
        return this;
    }

    public ArticleTextExtractor addPositive(String pos) {
        return setPositive(positiveStr + '|' + pos);
    }

    private ArticleTextExtractor setNegative(String negativeStr) {
        this.negativeStr = negativeStr;
        NEGATIVE = Pattern.compile(negativeStr);
        return this;
    }

    public ArticleTextExtractor addNegative(String neg) {
        setNegative(negativeStr + '|' + neg);
        return this;
    }

    public void setOutputFormatter(OutputFormatter formatter) {
        this.formatter = formatter;
    }

    /**
     * Extracts article text from the given HTML string. Not tested with
     * improper HTML, although jsoup should be able to handle minor issues.
     *
     * @param html the HTML string to extract from
     * @return the extracted article, all HTML tags stripped
     */
    public JResult extractContent(String html, int maxContentSize) throws Exception {
        return extractContent(new JResult(), html, maxContentSize);
    }

    public JResult extractContent(String html) throws Exception {
        return extractContent(new JResult(), html, 0);
    }

    public JResult extractContent(JResult res, String html, int maxContentSize) throws Exception {
        return extractContent(res, html, formatter, true, maxContentSize);
    }

    public JResult extractContent(JResult res, String html) throws Exception {
        return extractContent(res, html, formatter, true, 0);
    }

    private JResult extractContent(JResult res, String html, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize) throws Exception {
        if (html.isEmpty())
            throw new IllegalArgumentException("html string is empty!?");

        // http://jsoup.org/cookbook/extracting-data/selector-syntax
        return extractContent(res, Jsoup.parse(html), formatter, extractImages, maxContentSize);
    }

    // Returns the best node match based on the weights (see getWeight for strategy)
    private Element getBestMatchElement(Collection<Element> nodes) {
        int maxWeight = -200; // why -200 now instead of 0?
        Element bestMatchElement = null;

        boolean ignoreMaxWeightLimit = false;
        for (Element entry : nodes) {

            int currentWeight = getWeight(entry, false);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;

                /*
                // NOTE: This optimization fails with large pages that
                // contain chunks of text that can be mistaken for articles; since we
                // want the best accuracy possible, I am disabling it for now. AP.

                // The original code had a limit of 200; the intention was that
                // if a node had a weight greater than that, then most likely
                // it was the main content.
                // However this assumption fails when the amount of text in the
                // children (or grandchildren) is too large. If we detect this
                // case then the limit is ignored and we try all the nodes to select
                // the one with the absolute maximum weight.
                if (maxWeight > 500) {
                    ignoreMaxWeightLimit = true;
                    continue;
                }

                // formerly 200, increased to 250 to account for the fact
                // that we are not adding the weights of the grandchildren to the
                // tally.
                if (maxWeight > 250 && !ignoreMaxWeightLimit)
                    break;
                */
            }
        }

        return bestMatchElement;
    }

    private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize) throws Exception {
        Document origDoc = doc.clone();
        JResult result = extractContent(res, doc, formatter, extractImages, maxContentSize, true);
        //System.out.println("result.getText().length()=" + result.getText().length());
        if (result.getText().isEmpty()) {
            result = extractContent(res, origDoc, formatter, extractImages, maxContentSize, false);
        }
        return result;
    }

    // main workhorse
    private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize, boolean cleanScripts) {
        if (doc == null)
            throw new NullPointerException("missing document");

        // get the easy stuff
        res.setTitle(extractTitle(doc));
        res.setDescription(extractDescription(doc));
        res.setCanonicalUrl(extractCanonicalUrl(doc));
        res.setType(extractType(doc));
        res.setSitename(extractSitename(doc));
        res.setLanguage(extractLanguage(doc));

        // get author information
        res.setAuthorName(extractAuthorName(doc));
        res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName()));

        // add extra selection gravity to any element containing author name
        // wasn't useful in the case I implemented it for, but might be later
        /*
        Elements authelems = doc.select(":containsOwn(" + res.getAuthorName() + ")");
        for (Element elem : authelems) {
            elem.attr("extragravityscore", Integer.toString(100));
            System.out.println("modified element " + elem.toString());
        }
        */

        // get date from document; if not present, extract from URL if possible
        Date docdate = extractDate(doc);
        if (docdate == null) {
            String dateStr = SHelper.estimateDate(res.getUrl());
            docdate = parseDate(dateStr);
        }
        res.setDate(docdate);

        // now remove the clutter
        if (cleanScripts) {
            prepareDocument(doc);
        }

        // init elements and get the one with the highest weight (see getWeight for strategy)
        Collection<Element> nodes = getNodes(doc);
        Element bestMatchElement = getBestMatchElement(nodes);

        // do extraction from the best element
        if (bestMatchElement != null) {
            if (extractImages) {
                List<ImageResult> images = new ArrayList<>();
                Element imgEl = determineImageSource(bestMatchElement, images);
                if (imgEl != null) {
                    res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
                    // TODO remove parent container of image if it is contained in bestMatchElement
                    // to avoid image subtitles flooding in

                    res.setImages(images);
                }
            }

            // clean before grabbing text
            String text = formatter.getFormattedText(bestMatchElement);
            text = removeTitleFromText(text, res.getTitle());
            // this fails for short facebook posts and probably tweets: text.length() > res.getDescription().length()
            if (text.length() > res.getTitle().length()) {
                if (maxContentSize > 0) {
                    if (text.length() > maxContentSize) {
                        text = utf8truncate(text, maxContentSize);
                    }
                }
                res.setText(text);
            }

            // extract links from the same best element
            String fullhtml = bestMatchElement.toString();
            Elements children = bestMatchElement.select("a[href]"); // a with href = link
            String linkstr;
            int linkpos;
            int lastlinkpos = 0;
            for (Element child : children) {
                linkstr = child.toString();
                linkpos = fullhtml.indexOf(linkstr, lastlinkpos);
                res.addLink(child.attr("abs:href"), child.text(), linkpos);
                lastlinkpos = linkpos;
            }
        }

        if (extractImages) {
            if (res.getImageUrl().isEmpty()) {
                res.setImageUrl(extractImageUrl(doc));
            }
        }

        res.setRssUrl(extractRssUrl(doc));
        res.setVideoUrl(extractVideoUrl(doc));
        res.setFaviconUrl(extractFaviconUrl(doc));
        res.setKeywords(extractKeywords(doc));

        // Sanity checks on the author name
        if (res.getAuthorName().length() > MAX_AUTHOR_NAME_LENGTH) {
            res.setAuthorName(utf8truncate(res.getAuthorName(), MAX_AUTHOR_NAME_LENGTH));
        }

        // Sanity checks on the author description
        String authorDescSnippet = getSnippet(res.getAuthorDescription());
        if (getSnippet(res.getText()).equals(authorDescSnippet) ||
                getSnippet(res.getDescription()).equals(authorDescSnippet)) {
            res.setAuthorDescription("");
        } else {
            if (res.getAuthorDescription().length() > MAX_AUTHOR_DESC_LENGTH) {
                res.setAuthorDescription(utf8truncate(res.getAuthorDescription(), MAX_AUTHOR_DESC_LENGTH));
            }
        }

        // Sanity checks on the image URL
        if (res.getImageUrl().length() > MAX_IMAGE_LENGTH) {
            // doesn't make sense to truncate a URL
            res.setImageUrl("");
        }

        return res;
    }

    private static String getSnippet(String data) {
        if (data.length() < 50)
            return data;
        else
            return data.substring(0, 50);
    }

    private static String extractTitle(Document doc) {
        String title = cleanTitle(doc.title());
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head title").text());
            if (title.isEmpty()) {
                title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
                if (title.isEmpty()) {
                    title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
                    if (title.isEmpty()) {
                        title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
                        if (title.isEmpty()) {
                            title = SHelper.innerTrim(doc.select("h1:first-of-type").text());
                        }
                    }
                }
            }
        }
        return title;
    }

    private static String extractCanonicalUrl(Document doc) {
        String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
        if (url.isEmpty()) {
            url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
            if (url.isEmpty()) {
                url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
            }
        }
        return url;
    }

    private static String extractDescription(Document doc) {
        String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
        if (description.isEmpty()) {
            description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content"));
            if (description.isEmpty()) {
                description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content"));
            }
        }
        return description;
    }

    // Returns the publication date or null
    private static Date extractDate(Document doc) {
        String dateStr = "";

        // try some locations that nytimes uses
        Element elem = doc.select("meta[name=ptime]").first();
        if (elem != null) {
            dateStr = SHelper.innerTrim(elem.attr("content"));
            // elem.attr("extragravityscore", Integer.toString(100));
            // System.out.println("date modified element " + elem.toString());
        }

        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content"));
        }
        // if any of the locations above produced a date string, parse it;
        // otherwise fall through to the remaining strategies below
        if (!dateStr.isEmpty()) {
            return parseDate(dateStr);
        }

        // taking this stuff directly from Juicer (and converted to Java)
        // opengraph (?)
        Elements elems = doc.select("meta[property=article:published_time]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                try {
                    if (dateStr.endsWith("Z")) {
                        dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00";
                    } else {
                        // insert "GMT" before the numeric offset (e.g. "+05:00")
                        // so the result matches the form produced by the "Z" branch
                        dateStr = dateStr.substring(0, dateStr.length() - 6)
                                + "GMT"
                                + dateStr.substring(dateStr.length() - 6);
                    }
                } catch (StringIndexOutOfBoundsException ex) {
                    // do nothing
                }
                return parseDate(dateStr);
            }
        }

        // rnews
        elems = doc.select("meta[property=dateCreated], span[property=dateCreated]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // schema.org creativework
        elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else if (el.hasAttr("value")) {
                dateStr = el.attr("value");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // parsely page (?)
        /* skip conversion for now, seems highly specific and uses a new lib
        elems = doc.select("meta[name=parsely-page]");
        if (elems.size() > 0) {
            implicit val formats = net.liftweb.json.DefaultFormats

            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                val json = parse(el.attr("content"))

                return DateUtils.parseDateStrictly((json \ "pub_date").extract[String], Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz"))
            }
        }
        */

        // BBC
        elems = doc.select("meta[name=OriginalPublicationDate]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wired
        elems = doc.select("meta[name=DisplayDate]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wildcard
        elems = doc.select("meta[name*=date]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // blogger
        elems = doc.select(".date-header");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            dateStr = el.text();
            return parseDate(dateStr);
        }

        return null;
    }

    // Date parsing is currently stubbed out and always returns the epoch; the
    // commented pattern list below documents the formats a real implementation
    // (e.g. commons-lang DateUtils, see the disabled code at the bottom) would accept.
    private static Date parseDate(String dateStr) {
        // String[] parsePatterns = {
        //     "yyyy-MM-dd'T'HH:mm:ssz",
        //     "yyyy-MM-dd HH:mm:ss",
        //     "yyyy/MM/dd HH:mm:ss",
        //     "yyyy-MM-dd HH:mm",
        //     "yyyy/MM/dd HH:mm",
        //     "yyyy-MM-dd",
        //     "yyyy/MM/dd",
        //     "MM/dd/yyyy HH:mm:ss",
        //     "MM-dd-yyyy HH:mm:ss",
        //     "MM/dd/yyyy HH:mm",
        //     "MM-dd-yyyy HH:mm",
        //     "MM/dd/yyyy",
        //     "MM-dd-yyyy",
        //     "EEE, MMM dd, yyyy",
        //     "MM/dd/yyyy hh:mm:ss a",
        //     "MM-dd-yyyy hh:mm:ss a",
        //     "MM/dd/yyyy hh:mm a",
        //     "MM-dd-yyyy hh:mm a",
        //     "yyyy-MM-dd hh:mm:ss a",
        //     "yyyy/MM/dd hh:mm:ss a ",
        //     "yyyy-MM-dd hh:mm a",
        //     "yyyy/MM/dd hh:mm ",
        //     "dd MMM yyyy",
        //     "dd MMMM yyyy",
        //     "yyyyMMddHHmm",
        //     "yyyyMMdd HHmm",
        //     "dd-MM-yyyy HH:mm:ss",
        //     "dd/MM/yyyy HH:mm:ss",
        //     "dd MMM yyyy HH:mm:ss",
        //     "dd MMMM yyyy HH:mm:ss",
        //     "dd-MM-yyyy HH:mm",
        //     "dd/MM/yyyy HH:mm",
        //     "dd MMM yyyy HH:mm",
        //     "dd MMMM yyyy HH:mm",
        //     "yyyyMMddHHmmss",
        //     "yyyyMMdd HHmmss",
        //     "yyyyMMdd"
        // };

        return new Date(0);

        // try {
        //     return DateUtils.parseDateStrictly(dateStr, parsePatterns);
        // } catch (Exception ex) {
        //     return null;
        // }
    }

    // Returns the author name or an empty string
    private String extractAuthorName(Document doc) {
        String authorName = "";

        // first try the Google Author tag
        Element result = doc.select("body [rel*=author]").first();
        if (result != null)
            authorName = SHelper.innerTrim(result.ownText());

        // if that doesn't work, try some other methods
        if (authorName.isEmpty()) {

            // meta tag approaches, get content
            result = doc.select("head meta[name=author]").first();
            if (result != null) {
                authorName = SHelper.innerTrim(result.attr("content"));
            }

            if (authorName.isEmpty()) { // for "opengraph"
                authorName = SHelper.innerTrim(doc.select("head meta[property=article:author]").attr("content"));
            }
            if (authorName.isEmpty()) { // OpenGraph twitter:creator tag
                authorName = SHelper.innerTrim(doc.select("head meta[property=twitter:creator]").attr("content"));
            }
            if (authorName.isEmpty()) { // for "schema.org creativework"
                authorName = SHelper.innerTrim(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content"));
            }

            // other hacks
            if (authorName.isEmpty()) {
                try {
                    // build up a set of elements which have likely author-related terms
                    // .X searches for class X
                    Elements matches = doc.select("a[rel=author],.byline-name,.byLineTag,.byline,.author,.by,.writer,.address");

                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("body [class*=author]");
                    }

                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("body [title*=author]");
                    }

                    // a hack for huffington post
                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select(".staff_info dl a[href]");
                    }

                    // a hack for http://sports.espn.go.com/
                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("cite[class*=source]");
                    }

                    // select the best element from them
                    if (matches != null) {
                        Element bestMatch = getBestMatchElement(matches);

                        if (bestMatch != null) {
                            authorName = bestMatch.ownText();

                            // fall back to the element's full text (including
                            // children) when its own text is too short to be a name
                            if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) {
                                authorName = bestMatch.text();
                            }

                            authorName = SHelper.innerTrim(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll(""));

                            if (authorName.contains(",")) {
                                authorName = authorName.split(",")[0];
                            }
                        }
                    }
                } catch (Exception e) {
                    System.out.println(e.toString());
                }
            }
        }

        for (Pattern pattern : CLEAN_AUTHOR_PATTERNS) {
            Matcher matcher = pattern.matcher(authorName);
            if (matcher.matches()) {
                authorName = SHelper.innerTrim(matcher.group(1));
                break;
            }
        }

        return authorName;
    }

    // Returns the author description or an empty string
    private String extractAuthorDescription(Document doc, String authorName) {

        String authorDesc = "";

        if (authorName.isEmpty())
            return "";

        // Special case for entrepreneur.com
        Elements matches = doc.select(".byline > .bio");
        if (matches != null && !matches.isEmpty()) {
            Element bestMatch = matches.first(); // assume it is the first
            authorDesc = bestMatch.text();
            return authorDesc;
        }

        // Special case for huffingtonpost.com
        matches = doc.select(".byline span[class*=teaser]");
        if (matches != null && !matches.isEmpty()) {
            Element bestMatch = matches.first(); // assume it is the first
            authorDesc = bestMatch.text();
            return authorDesc;
        }

        try {
            Elements nodes = doc.select(":containsOwn(" + authorName + ')');
            Element bestMatch = getBestMatchElement(nodes);
            if (bestMatch != null)
                authorDesc = bestMatch.text();
        } catch (SelectorParseException se) {
            // Avoid error when the selector is invalid
        }

        return authorDesc;
    }

    private static Collection<String> extractKeywords(Document doc) {
        String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));

        if (content.startsWith("[") && content.endsWith("]"))
            content = content.substring(1, content.length() - 1);

        String[] split = content.split("\\s*,\\s*");
        if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty()))
            return Arrays.asList(split);
        return Collections.emptyList();
    }

    /**
     * Tries to extract an image URL from metadata if determineImageSource
     * failed.
     *
     * @return image URL or empty string
     */
    private static String extractImageUrl(Document doc) {
        // use open graph tag to get image
        String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
        if (imageUrl.isEmpty()) {
            imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content"));
            if (imageUrl.isEmpty()) {
                // prefer link over thumbnail-meta if empty
                imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
                if (imageUrl.isEmpty()) {
                    imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
                }
            }
        }
        return imageUrl;
    }

    private static String extractRssUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
    }

    private static String extractVideoUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
    }

    private static String extractFaviconUrl(Document doc) {
        String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
        if (faviconUrl.isEmpty()) {
            faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
        }
        return faviconUrl;
    }

    private static String extractType(Document doc) {
        return SHelper.innerTrim(doc.select("head meta[property=og:type]").attr("content"));
    }

    private static String extractSitename(Document doc) {
        String sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content"));
        if (sitename.isEmpty()) {
            sitename = SHelper.innerTrim(doc.select("head meta[name=twitter:site]").attr("content"));
        }
        return sitename;
    }

    private static String extractLanguage(Document doc) {
        String language = SHelper.innerTrim(doc.select("head meta[property=language]").attr("content"));
        if (language.isEmpty()) {
            language = SHelper.innerTrim(doc.select("html").attr("lang"));
            if (language.isEmpty()) {
                language = SHelper.innerTrim(doc.select("head meta[property=og:locale]").attr("content"));
            }
        }
        // keep only the two-letter language code, e.g. "en" from "en_US"
        if (language.length() > 2) {
            language = language.substring(0, 2);
        }
        return language;
    }

    /**
     * Weights the current element by matching it against positive candidates
     * and weighting its child nodes. Since it is impossible to predict exactly
     * which names, ids or class names will be used in the HTML, the major role
     * is played by the child nodes.
     *
     * @param e          Element to weight, along with its child nodes
     * @param checkextra whether to honor a possible 'extragravityscore' attribute
     */
    private int getWeight(Element e, boolean checkextra) {
        int weight = calcWeight(e);
        int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += ownTextWeight;
        int childrenWeight = weightChildNodes(e);
        weight += childrenWeight;

        // add additional weight using a possible 'extragravityscore' attribute
        if (checkextra) {
            Element xelem = e.select("[extragravityscore]").first();
            if (xelem != null) {
                // System.out.println("HERE found one: " + xelem.toString());
                weight += Integer.parseInt(xelem.attr("extragravityscore"));
                // System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore"));
            }
        }

        return weight;
    }

    /**
     * Weights the child nodes of the given Element. During tests some
     * difficulties were met. For instance, not every document has nested
     * paragraph tags inside of the major article tag; sometimes people add one
     * more nesting level. So we add 4 points for every 100 characters
     * contained in a tag nested inside the current weighted element, but only
     * 3 points for every element that is nested 2 levels deep. This way we
     * give more chances to extract the element that has fewer nested levels,
     * increasing the probability of a correct extraction.
     *
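     * <p>Illustrative sketch with assumed numbers (not taken from a real
     * page): a direct child {@code <p>} with 350 characters of own text
     * contributes about {@code round(350 / 35.0) = 10} points via
     * calcWeightForChild, while the same paragraph one level deeper counts
     * for only a third of its weight through the grandchildren pass below.
     *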
     * @param rootEl Element whose child nodes will be weighted
     */
    private int weightChildNodes(Element rootEl) {
        int weight = 0;
        Element caption = null;
        List<Element> pEls = new ArrayList<>(5);

        for (Element child : rootEl.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20)
                continue;

            if (ownTextLength > 200) {
                int childOwnTextWeight = Math.max(50, ownTextLength / 10);
                weight += childOwnTextWeight;
            }

            if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
                int h2h1Weight = 30;
                weight += h2h1Weight;
            } else if (child.tagName().equals("div") || child.tagName().equals("p")) {
                int calcChildWeight = calcWeightForChild(child, ownText);
                weight += calcChildWeight;
                if (child.tagName().equals("p") && ownTextLength > 50)
                    pEls.add(child);

                if (child.className().toLowerCase().equals("caption"))
                    caption = child;
            }
        }

        //
        // Visit grandchildren. This section visits the grandchildren
        // of the node and calculates their weights. Note that grandchildren
        // weights are only worth 1/3 of the children's.
        //
        int grandChildrenWeight = 0;
        for (Element child2 : rootEl.children()) {

            // If the node looks negative, don't include it in the weights;
            // instead penalize the grandparent. This is done to try to
            // avoid giving weights to navigation nodes, etc.
            if (NEGATIVE.matcher(child2.id()).find() ||
                    NEGATIVE.matcher(child2.className()).find()) {
                grandChildrenWeight -= 30;
                continue;
            }

            for (Element grandchild : child2.children()) {
                int grandchildWeight = 0;
                String ownText = grandchild.ownText();
                int ownTextLength = ownText.length();
                if (ownTextLength < 20)
                    continue;

                if (ownTextLength > 200) {
                    int childOwnTextWeight = Math.max(50, ownTextLength / 10);
                    grandchildWeight += childOwnTextWeight;
                }

                if (grandchild.tagName().equals("h1") || grandchild.tagName().equals("h2")) {
                    int h2h1Weight = 30;
                    grandchildWeight += h2h1Weight;
                } else if (grandchild.tagName().equals("div") || grandchild.tagName().equals("p")) {
                    int calcChildWeight = calcWeightForChild(grandchild, ownText);
                    grandchildWeight += calcChildWeight;
                }

                grandChildrenWeight += grandchildWeight;
            }
        }

        grandChildrenWeight = grandChildrenWeight / 3;
        weight += grandChildrenWeight;

        // use caption and image
        if (caption != null) {
            int captionWeight = 30;
            weight += captionWeight;
        }

        if (pEls.size() >= 2) {
            for (Element subEl : rootEl.children()) {
                if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
                    int h1h2h3Weight = 20;
                    weight += h1h2h3Weight;
                    // headerEls.add(subEl);
                } else if ("table;li;td;th".contains(subEl.tagName())) {
                    addScore(subEl, -30);
                }

                if ("p".equals(subEl.tagName()))
                    addScore(subEl, 30);
            }
        }
        return weight;
    }

    private static void addScore(Element el, int score) {
        int old = getScore(el);
        setScore(el, score + old);
    }

    private static int getScore(Element el) {
        int old = 0;
        try {
            old = Integer.parseInt(el.attr("gravityScore"));
        } catch (Exception ignored) {
        }
        return old;
    }

    private static void setScore(Element el, int score) {
        el.attr("gravityScore", Integer.toString(score));
    }

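    // Penalize children whose own text looks like leftover markup or CSS
    // (many quote or angle-bracket entities, or "px" tokens); otherwise
    // reward roughly one point per 35 characters of own text.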
    private static int calcWeightForChild(Element child, String ownText) {
        int c = SHelper.count(ownText, "&quot;");
        c += SHelper.count(ownText, "&lt;");
        c += SHelper.count(ownText, "&gt;");
        c += SHelper.count(ownText, "px");
        int val;
        if (c > 5)
            val = -30;
        else
            val = (int) Math.round(ownText.length() / 35.0);

        addScore(child, val);
        return val;
    }

    private int calcWeight(Element e) {
        int weight = 0;
        if (POSITIVE.matcher(e.className()).find())
            weight += 35;

        if (POSITIVE.matcher(e.id()).find())
            weight += 45;

        if (UNLIKELY.matcher(e.className()).find())
            weight -= 20;

        if (UNLIKELY.matcher(e.id()).find())
            weight -= 20;

        if (NEGATIVE.matcher(e.className()).find())
            weight -= 50;

        if (NEGATIVE.matcher(e.id()).find())
            weight -= 50;

        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
            weight -= 50;

        String itemprop = e.attr("itemprop");
        if (itemprop != null && !itemprop.isEmpty() && POSITIVE.matcher(itemprop).find()) {
            weight += 100;
        }

        return weight;
    }

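    // Scores every <img> inside the element (or its parent, if the element
    // itself has none) by size hints, alt/title length and rel=nofollow; the
    // score multiplier halves each time a new best image is found, so images
    // appearing earlier are preferred. Returns the best candidate or null.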
    private static Element determineImageSource(Element el, List<ImageResult> images) {
        int maxWeight = 0;
        Element maxNode = null;
        Elements els = el.select("img");
        if (els.isEmpty())
            els = el.parent().select("img");

        double score = 1;
        for (Element e : els) {
            String sourceUrl = e.attr("src");
            if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
                continue;

            int weight = 0;
            int height = 0;
            try {
                height = Integer.parseInt(e.attr("height"));
                if (height >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ignored) {
            }

            int width = 0;
            try {
                width = Integer.parseInt(e.attr("width"));
                if (width >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ignored) {
            }
            String alt = e.attr("alt");
            if (alt.length() > 35)
                weight += 20;

            String title = e.attr("title");
            if (title.length() > 35)
                weight += 20;

            String rel;
            boolean noFollow = false;
            if (e.parent() != null) {
                rel = e.parent().attr("rel");
                if (rel != null && rel.contains("nofollow")) {
                    noFollow = true;
                    weight -= 40;
                }
            }

            weight = (int) (weight * score);
            if (weight > maxWeight) {
                maxWeight = weight;
                maxNode = e;
                score = score / 2;
            }

            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
            images.add(image);
        }

        Collections.sort(images, new ImageComparator());
        return maxNode;
    }

    /**
     * Prepares the document. Unlikely candidates can score higher than good
     * ones, especially when the main text is short, which is why
     * stripUnlikelyCandidates exists; that step is currently disabled, so
     * only scripts and styles are removed.
     *
     * @param doc document to prepare. Passed as reference, and changed inside
     *            of function
     */
    private static void prepareDocument(Document doc) {
        // stripUnlikelyCandidates(doc);
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes unlikely candidates from the HTML. Currently takes the id and
     * class name and matches them against a list of patterns.
     *
     * @param doc document to strip unlikely candidates from
     */
    protected void stripUnlikelyCandidates(Document doc) {
        for (Element child : doc.select("body").select("*")) {
            String className = child.className().toLowerCase();
            String id = child.id().toLowerCase();

            if (NEGATIVE.matcher(className).find()
                    || NEGATIVE.matcher(id).find()) {
                child.remove();
            }
        }
    }

    private static Document removeScriptsAndStyles(Document doc) {
        Elements scripts = doc.getElementsByTag("script");
        for (Element item : scripts) {
            item.remove();
        }
        Elements noscripts = doc.getElementsByTag("noscript");
        for (Element item : noscripts) {
            item.remove();
        }

        Elements styles = doc.getElementsByTag("style");
        for (Element style : styles) {
            style.remove();
        }

        return doc;
    }

    private static boolean isAdImage(String imageUrl) {
        return SHelper.count(imageUrl, "ad") >= 2;
    }

    /**
     * Match only exact matches, as longestSubstring can be too fuzzy.
     */
    private static String removeTitleFromText(String text, String title) {
        // don't do this, as it's terrible to read
        // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
        // if (index1 >= 0)
        //     text = text.substring(index1 + title.length());
        // return text.trim();
        return text;
    }

    /**
     * Based on a delimiter in the title, take the longest piece or do some
     * custom logic based on the site.
     *
     * @param title     the full title to split
     * @param delimiter the delimiter to split on
     * @return the chosen piece, trimmed
     */
    private static String doTitleSplits(String title, String delimiter) {
        String largeText = "";
        int largestTextLen = 0;
        String[] titlePieces = title.split(delimiter);

        // take the largest split
        for (String p : titlePieces) {
            if (p.length() > largestTextLen) {
                largeText = p;
                largestTextLen = p.length();
            }
        }

        largeText = largeText.replace("&raquo;", " ");
        largeText = largeText.replace("»", " ");
        return largeText.trim();
    }

    /**
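     * Matching elements are seeded with a decaying {@code gravityScore}:
     * the first match gets 100, the next 50, then 25, and so on.
     *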
     * @return a set of all important nodes
     */
    private static Collection<Element> getNodes(Document doc) {
        Map<Element, Object> nodes = new LinkedHashMap<>(64);
        int score = 100;
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.put(el, null);
                setScore(el, score);
                score = score / 2;
            }
        }
        return nodes.keySet();
    }

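    /**
     * Strips ignored segments such as site names (e.g. "facebook", "home")
     * from a pipe-delimited title. Illustrative example:
     * {@code cleanTitle("Some Article | Facebook")} yields
     * {@code "Some Article"}.
     */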
    private static String cleanTitle(String title) {

        // int index = title.lastIndexOf("|");
        // if (index > 0 && title.length() / 2 < index)
        //     title = title.substring(0, index + 1);

        int counter = 0;
        String[] strs = title.split("\\|");
        StringBuilder res = new StringBuilder(strs.length);
        for (String part : strs) {
            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
                continue;

            if (counter == strs.length - 1 && res.length() > part.length())
                continue;

            if (counter > 0)
                res.append('|');

            res.append(part);
            counter++;
        }

        return SHelper.innerTrim(res.toString());
    }

    /**
     * Truncate a Java string so that its UTF-8 representation will not
     * exceed the specified number of bytes.
     * <p>
     * For discussion of why you might want to do this, see
     * http://lpar.ath0.com/2011/06/07/unicode-alchemy-with-db2/
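     * <p>Illustrative example: {@code utf8truncate("héllo", 5)} returns
     * {@code "héll"}, since 'é' encodes to two UTF-8 bytes and keeping the
     * final 'o' would push the total to six bytes.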
     */
    private static String utf8truncate(String input, int length) {
        StringBuilder result = new StringBuilder(length);
        int resultlen = 0;
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            int charlen = 0;
            if (c <= 0x7f) {
                charlen = 1;
            } else if (c <= 0x7ff) {
                charlen = 2;
            } else if (c <= 0xd7ff) {
                charlen = 3;
            } else if (c <= 0xdbff) {
                charlen = 4;
            } else if (c <= 0xdfff) {
                charlen = 0;
            } else {
                charlen = 3;
            }
            if (resultlen + charlen > length) {
                break;
            }
            result.append(c);
            resultlen += charlen;
        }
        return result.toString();
    }

    /**
     * Comparator for Image by weight
     *
     * @author Chris Alexander, chris@chris-alexander.co.uk
     */
    private static class ImageComparator implements Comparator<ImageResult> {

        @Override
        public int compare(ImageResult o1, ImageResult o2) {
            // Returns the highest weight first
            return o2.weight.compareTo(o1.weight);
        }
    }

}