lightning-i2p/app/src/main/java/acr/browser/lightning/reading/ArticleTextExtractor.java

package acr.browser.lightning.reading;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * This class is thread safe.
 * 
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */
public class ArticleTextExtractor {

	// Interessting nodes
	private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
	// Unlikely candidates
	private String unlikelyStr;
	private Pattern UNLIKELY;
	// Most likely positive candidates
	private String positiveStr;
	private Pattern POSITIVE;
	// Most likely negative candidates
	private String negativeStr;
	private Pattern NEGATIVE;
	private static final Pattern NEGATIVE_STYLE = Pattern
			.compile("hidden|display: ?none|font-size: ?small");
	private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
		{
			add("hacker news");
			add("facebook");
		}
	};
	private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
	private OutputFormatter formatter = DEFAULT_FORMATTER;

	public ArticleTextExtractor() {
		setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
				+ "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
				+ "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
				+ "login|si(debar|gn|ngle)");
		setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
				+ "|arti(cle|kel)|instapaper_body");
		setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
				+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
				+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
	}

	public ArticleTextExtractor setUnlikely(String unlikelyStr) {
		this.unlikelyStr = unlikelyStr;
		UNLIKELY = Pattern.compile(unlikelyStr);
		return this;
	}

	public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
		return setUnlikely(unlikelyStr + "|" + unlikelyMatches);
	}

	public ArticleTextExtractor setPositive(String positiveStr) {
		this.positiveStr = positiveStr;
		POSITIVE = Pattern.compile(positiveStr);
		return this;
	}

	public ArticleTextExtractor addPositive(String pos) {
		return setPositive(positiveStr + "|" + pos);
	}

	public ArticleTextExtractor setNegative(String negativeStr) {
		this.negativeStr = negativeStr;
		NEGATIVE = Pattern.compile(negativeStr);
		return this;
	}

	public ArticleTextExtractor addNegative(String neg) {
		setNegative(negativeStr + "|" + neg);
		return this;
	}

	public void setOutputFormatter(OutputFormatter formatter) {
		this.formatter = formatter;
	}

	/**
	 * @param html
	 *            extracts article text from given html string. wasn't tested
	 *            with improper HTML, although jSoup should be able to handle
	 *            minor stuff.
	 * @returns extracted article, all HTML tags stripped
	 */
	public JResult extractContent(Document doc) throws Exception {
		return extractContent(new JResult(), doc, formatter);
	}

	public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {
		return extractContent(new JResult(), doc, formatter);
	}

	public JResult extractContent(String html) throws Exception {
		return extractContent(new JResult(), html);
	}

	public JResult extractContent(JResult res, String html) throws Exception {
		return extractContent(res, html, formatter);
	}

	public JResult extractContent(JResult res, String html, OutputFormatter formatter)
			throws Exception {
		if (html.isEmpty())
			throw new IllegalArgumentException("html string is empty!?");

		// http://jsoup.org/cookbook/extracting-data/selector-syntax
		return extractContent(res, Jsoup.parse(html), formatter);
	}

	public JResult extractContent(JResult res, Document doc, OutputFormatter formatter)
			throws Exception {
		if (doc == null)
			throw new NullPointerException("missing document");

		res.setTitle(extractTitle(doc));
		res.setDescription(extractDescription(doc));
		res.setCanonicalUrl(extractCanonicalUrl(doc));

		// now remove the clutter
		prepareDocument(doc);

		// init elements
		Collection<Element> nodes = getNodes(doc);
		int maxWeight = 0;
		Element bestMatchElement = null;
		for (Element entry : nodes) {
			int currentWeight = getWeight(entry);
			if (currentWeight > maxWeight) {
				maxWeight = currentWeight;
				bestMatchElement = entry;
				if (maxWeight > 200)
					break;
			}
		}

		if (bestMatchElement != null) {
			List<ImageResult> images = new ArrayList<ImageResult>();
			Element imgEl = determineImageSource(bestMatchElement, images);
			if (imgEl != null) {
				res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
				// TODO remove parent container of image if it is contained in
				// bestMatchElement
				// to avoid image subtitles flooding in

				res.setImages(images);
			}

			// clean before grabbing text
			String text = formatter.getFormattedText(bestMatchElement);
			text = removeTitleFromText(text, res.getTitle());
			// this fails for short facebook post and probably tweets:
			// text.length() > res.getDescription().length()
			if (text.length() > res.getTitle().length()) {
				res.setText(text);
				// print("best element:", bestMatchElement);
			}
			res.setTextList(formatter.getTextList(bestMatchElement));
		}

		if (res.getImageUrl().isEmpty()) {
			res.setImageUrl(extractImageUrl(doc));
		}

		res.setRssUrl(extractRssUrl(doc));
		res.setVideoUrl(extractVideoUrl(doc));
		res.setFaviconUrl(extractFaviconUrl(doc));
		res.setKeywords(extractKeywords(doc));
		return res;
	}

	protected String extractTitle(Document doc) {
		String title = cleanTitle(doc.title());
		if (title.isEmpty()) {
			title = SHelper.innerTrim(doc.select("head title").text());
			if (title.isEmpty()) {
				title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
				if (title.isEmpty()) {
					title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr(
							"content"));
					if (title.isEmpty()) {
						title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr(
								"content"));
					}
				}
			}
		}
		return title;
	}

	protected String extractCanonicalUrl(Document doc) {
		String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
		if (url.isEmpty()) {
			url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
			if (url.isEmpty()) {
				url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr(
						"content"));
			}
		}
		return url;
	}

	protected String extractDescription(Document doc) {
		String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr(
				"content"));
		if (description.isEmpty()) {
			description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr(
					"content"));
			if (description.isEmpty()) {
				description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]")
						.attr("content"));
			}
		}
		return description;
	}

	protected Collection<String> extractKeywords(Document doc) {
		String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));

		if (content != null) {
			if (content.startsWith("[") && content.endsWith("]"))
				content = content.substring(1, content.length() - 1);

			String[] split = content.split("\\s*,\\s*");
			if (split.length > 1 || (split.length > 0 && !"".equals(split[0])))
				return Arrays.asList(split);
		}
		return Collections.emptyList();
	}

	/**
	 * Tries to extract an image url from metadata if determineImageSource
	 * failed
	 * 
	 * @return image url or empty str
	 */
	protected String extractImageUrl(Document doc) {
		// use open graph tag to get image
		String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr(
				"content"));
		if (imageUrl.isEmpty()) {
			imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr(
					"content"));
			if (imageUrl.isEmpty()) {
				// prefer link over thumbnail-meta if empty
				imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
				if (imageUrl.isEmpty()) {
					imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr(
							"content"));
				}
			}
		}
		return imageUrl;
	}

	protected String extractRssUrl(Document doc) {
		return SHelper.replaceSpaces(doc.select("link[rel=alternate]")
				.select("link[type=application/rss+xml]").attr("href"));
	}

	protected String extractVideoUrl(Document doc) {
		return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
	}

	protected String extractFaviconUrl(Document doc) {
		String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
		if (faviconUrl.isEmpty()) {
			faviconUrl = SHelper.replaceSpaces(doc.select(
					"head link[rel^=shortcut],link[rel$=icon]").attr("href"));
		}
		return faviconUrl;
	}

	/**
	 * Weights current element. By matching it with positive candidates and
	 * weighting child nodes. Since it's impossible to predict which exactly
	 * names, ids or class names will be used in HTML, major role is played by
	 * child nodes
	 * 
	 * @param e
	 *            Element to weight, along with child nodes
	 */
	protected int getWeight(Element e) {
		int weight = calcWeight(e);
		weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
		weight += weightChildNodes(e);
		return weight;
	}

	/**
	 * Weights a child nodes of given Element. During tests some difficulties
	 * were met. For instanance, not every single document has nested paragraph
	 * tags inside of the major article tag. Sometimes people are adding one
	 * more nesting level. So, we're adding 4 points for every 100 symbols
	 * contained in tag nested inside of the current weighted element, but only
	 * 3 points for every element that's nested 2 levels deep. This way we give
	 * more chances to extract the element that has less nested levels,
	 * increasing probability of the correct extraction.
	 * 
	 * @param rootEl
	 *            Element, who's child nodes will be weighted
	 */
	protected int weightChildNodes(Element rootEl) {
		int weight = 0;
		Element caption = null;
		List<Element> pEls = new ArrayList<Element>(5);
		for (Element child : rootEl.children()) {
			String ownText = child.ownText();
			int ownTextLength = ownText.length();
			if (ownTextLength < 20)
				continue;

			if (ownTextLength > 200)
				weight += Math.max(50, ownTextLength / 10);

			if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
				weight += 30;
			} else if (child.tagName().equals("div") || child.tagName().equals("p")) {
				weight += calcWeightForChild(child, ownText);
				if (child.tagName().equals("p") && ownTextLength > 50)
					pEls.add(child);

				if (child.className().toLowerCase(Locale.getDefault()).equals("caption"))
					caption = child;
			}
		}

		// use caption and image
		if (caption != null)
			weight += 30;

		if (pEls.size() >= 2) {
			for (Element subEl : rootEl.children()) {
				if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
					weight += 20;
					// headerEls.add(subEl);
				} else if ("table;li;td;th".contains(subEl.tagName())) {
					addScore(subEl, -30);
				}

				if ("p".contains(subEl.tagName()))
					addScore(subEl, 30);
			}
		}
		return weight;
	}

	public void addScore(Element el, int score) {
		int old = getScore(el);
		setScore(el, score + old);
	}

	public int getScore(Element el) {
		int old = 0;
		try {
			old = Integer.parseInt(el.attr("gravityScore"));
		} catch (Exception ex) {
		}
		return old;
	}

	public void setScore(Element el, int score) {
		el.attr("gravityScore", Integer.toString(score));
	}

	private int calcWeightForChild(Element child, String ownText) {
		int c = SHelper.count(ownText, "&quot;");
		c += SHelper.count(ownText, "&lt;");
		c += SHelper.count(ownText, "&gt;");
		c += SHelper.count(ownText, "px");
		int val;
		if (c > 5)
			val = -30;
		else
			val = (int) Math.round(ownText.length() / 25.0);

		addScore(child, val);
		return val;
	}

	private int calcWeight(Element e) {
		int weight = 0;
		if (POSITIVE.matcher(e.className()).find())
			weight += 35;

		if (POSITIVE.matcher(e.id()).find())
			weight += 40;

		if (UNLIKELY.matcher(e.className()).find())
			weight -= 20;

		if (UNLIKELY.matcher(e.id()).find())
			weight -= 20;

		if (NEGATIVE.matcher(e.className()).find())
			weight -= 50;

		if (NEGATIVE.matcher(e.id()).find())
			weight -= 50;

		String style = e.attr("style");
		if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
			weight -= 50;
		return weight;
	}

	public Element determineImageSource(Element el, List<ImageResult> images) {
		int maxWeight = 0;
		Element maxNode = null;
		Elements els = el.select("img");
		if (els.isEmpty())
			els = el.parent().select("img");

		double score = 1;
		for (Element e : els) {
			String sourceUrl = e.attr("src");
			if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
				continue;

			int weight = 0;
			int height = 0;
			try {
				height = Integer.parseInt(e.attr("height"));
				if (height >= 50)
					weight += 20;
				else
					weight -= 20;
			} catch (Exception ex) {
			}

			int width = 0;
			try {
				width = Integer.parseInt(e.attr("width"));
				if (width >= 50)
					weight += 20;
				else
					weight -= 20;
			} catch (Exception ex) {
			}
			String alt = e.attr("alt");
			if (alt.length() > 35)
				weight += 20;

			String title = e.attr("title");
			if (title.length() > 35)
				weight += 20;

			String rel = null;
			boolean noFollow = false;
			if (e.parent() != null) {
				rel = e.parent().attr("rel");
				if (rel != null && rel.contains("nofollow")) {
					noFollow = rel.contains("nofollow");
					weight -= 40;
				}
			}

			weight = (int) (weight * score);
			if (weight > maxWeight) {
				maxWeight = weight;
				maxNode = e;
				score = score / 2;
			}

			ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,
					noFollow);
			images.add(image);
		}

		Collections.sort(images, new ImageComparator());
		return maxNode;
	}

	/**
	 * Prepares document. Currently only stipping unlikely candidates, since
	 * from time to time they're getting more score than good ones especially in
	 * cases when major text is short.
	 * 
	 * @param doc
	 *            document to prepare. Passed as reference, and changed inside
	 *            of function
	 */
	protected void prepareDocument(Document doc) {
		// stripUnlikelyCandidates(doc);
		removeScriptsAndStyles(doc);
	}

	/**
	 * Removes unlikely candidates from HTML. Currently takes id and class name
	 * and matches them against list of patterns
	 * 
	 * @param doc
	 *            document to strip unlikely candidates from
	 */
	protected void stripUnlikelyCandidates(Document doc) {
		for (Element child : doc.select("body").select("*")) {
			String className = child.className().toLowerCase(Locale.getDefault());
			String id = child.id().toLowerCase(Locale.getDefault());

			if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
				// print("REMOVE:", child);
				child.remove();
			}
		}
	}

	private Document removeScriptsAndStyles(Document doc) {
		Elements scripts = doc.getElementsByTag("script");
		for (Element item : scripts) {
			item.remove();
		}

		Elements noscripts = doc.getElementsByTag("noscript");
		for (Element item : noscripts) {
			item.remove();
		}

		Elements styles = doc.getElementsByTag("style");
		for (Element style : styles) {
			style.remove();
		}

		return doc;
	}

	private boolean isAdImage(String imageUrl) {
		return SHelper.count(imageUrl, "ad") >= 2;
	}

	/**
	 * Match only exact matching as longestSubstring can be too fuzzy
	 */
	public String removeTitleFromText(String text, String title) {
		// don't do this as its terrible to read
		// int index1 = text.toLowerCase().indexOf(title.toLowerCase());
		// if (index1 >= 0)
		// text = text.substring(index1 + title.length());
		// return text.trim();
		return text;
	}

	/**
	 * @return a set of all important nodes
	 */
	public Collection<Element> getNodes(Document doc) {
		Set<Element> nodes = new HashSet<Element>(64);
		int score = 100;
		for (Element el : doc.select("body").select("*")) {
			if (NODES.matcher(el.tagName()).matches()) {
				nodes.add(el);
				setScore(el, score);
				score = score / 2;
			}
		}
		return nodes;

	}

	public String cleanTitle(String title) {
		StringBuilder res = new StringBuilder();
		// int index = title.lastIndexOf("|");
		// if (index > 0 && title.length() / 2 < index)
		// title = title.substring(0, index + 1);

		int counter = 0;
		String[] strs = title.split("\\|");
		for (String part : strs) {
			if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim()))
				continue;

			if (counter == strs.length - 1 && res.length() > part.length())
				continue;

			if (counter > 0)
				res.append("|");

			res.append(part);
			counter++;
		}

		return SHelper.innerTrim(res.toString());
	}

	/**
	 * Comparator for Image by weight
	 * 
	 * @author Chris Alexander, chris@chris-alexander.co.uk
	 * 
	 */
	public class ImageComparator implements Comparator<ImageResult> {

		@Override
		public int compare(ImageResult o1, ImageResult o2) {
			// Returns the highest weight first
			return o2.weight.compareTo(o1.weight);
		}
	}
}
Change Reading mode package name to lower case 9 years ago			`package acr.browser.lightning.reading;`
Added a Reading Mode that can be accessed from the menu Reading Mode utilizes the Snacktory library created by karussel which is licensed under the Apache 2.0 license. https://github.com/karussell/snacktory 9 years ago
			`import java.util.ArrayList;`
			`import java.util.Arrays;`
			`import java.util.Collection;`
			`import java.util.Collections;`
			`import java.util.Comparator;`
			`import java.util.HashSet;`
			`import java.util.LinkedHashSet;`
			`import java.util.List;`
			`import java.util.Locale;`
			`import java.util.Set;`
			`import java.util.regex.Pattern;`
Fix deprecation issues, fix a couple rendering issues 9 years ago
Added a Reading Mode that can be accessed from the menu Reading Mode utilizes the Snacktory library created by karussel which is licensed under the Apache 2.0 license. https://github.com/karussell/snacktory 9 years ago			`import org.jsoup.Jsoup;`
			`import org.jsoup.nodes.Document;`
			`import org.jsoup.nodes.Element;`
			`import org.jsoup.select.Elements;`

			`/**`
			`* This class is thread safe.`
			`*`
			`* @author Alex P (ifesdjeen from jreadability)`
			`* @author Peter Karich`
			`*/`
			`public class ArticleTextExtractor {`

			`// Interessting nodes`
			`private static final Pattern NODES = Pattern.compile("p\|div\|td\|h1\|h2\|article\|section");`
			`// Unlikely candidates`
			`private String unlikelyStr;`
			`private Pattern UNLIKELY;`
			`// Most likely positive candidates`
			`private String positiveStr;`
			`private Pattern POSITIVE;`
			`// Most likely negative candidates`
			`private String negativeStr;`
			`private Pattern NEGATIVE;`
			`private static final Pattern NEGATIVE_STYLE = Pattern`
			`.compile("hidden\|display: ?none\|font-size: ?small");`
			`private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {`
			`{`
			`add("hacker news");`
			`add("facebook");`
			`}`
			`};`
			`private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();`
			`private OutputFormatter formatter = DEFAULT_FORMATTER;`

			`public ArticleTextExtractor() {`
			`setUnlikely("com(bx\|ment\|munity)\|dis(qus\|cuss)\|e(xtra\|[-]?mail)\|foot\|"`
			`+ "header\|menu\|re(mark\|ply)\|rss\|sh(are\|outbox)\|sponsor"`
			`+ "a(d\|ll\|gegate\|rchive\|ttachment)\|(pag(er\|ination))\|popup\|print\|"`
			`+ "login\|si(debar\|gn\|ngle)");`
			`setPositive("(^(body\|content\|h?entry\|main\|page\|post\|text\|blog\|story\|haupt))"`
			`+ "\|arti(cle\|kel)\|instapaper_body");`
			`setNegative("nav($\|igation)\|user\|com(ment\|bx)\|(^com-)\|contact\|"`
			`+ "foot\|masthead\|(me(dia\|ta))\|outbrain\|promo\|related\|scroll\|(sho(utbox\|pping))\|"`
			`+ "sidebar\|sponsor\|tags\|tool\|widget\|player\|disclaimer\|toc\|infobox\|vcard");`
			`}`

			`public ArticleTextExtractor setUnlikely(String unlikelyStr) {`
			`this.unlikelyStr = unlikelyStr;`
			`UNLIKELY = Pattern.compile(unlikelyStr);`
			`return this;`
			`}`

			`public ArticleTextExtractor addUnlikely(String unlikelyMatches) {`
			`return setUnlikely(unlikelyStr + "\|" + unlikelyMatches);`
			`}`

			`public ArticleTextExtractor setPositive(String positiveStr) {`
			`this.positiveStr = positiveStr;`
			`POSITIVE = Pattern.compile(positiveStr);`
			`return this;`
			`}`

			`public ArticleTextExtractor addPositive(String pos) {`
			`return setPositive(positiveStr + "\|" + pos);`
			`}`

			`public ArticleTextExtractor setNegative(String negativeStr) {`
			`this.negativeStr = negativeStr;`
			`NEGATIVE = Pattern.compile(negativeStr);`
			`return this;`
			`}`

			`public ArticleTextExtractor addNegative(String neg) {`
			`setNegative(negativeStr + "\|" + neg);`
			`return this;`
			`}`

			`public void setOutputFormatter(OutputFormatter formatter) {`
			`this.formatter = formatter;`
			`}`

			`/**`
			`* @param html`
			`* extracts article text from given html string. wasn't tested`
			`* with improper HTML, although jSoup should be able to handle`
			`* minor stuff.`
			`* @returns extracted article, all HTML tags stripped`
			`*/`
			`public JResult extractContent(Document doc) throws Exception {`
			`return extractContent(new JResult(), doc, formatter);`
			`}`

			`public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {`
			`return extractContent(new JResult(), doc, formatter);`
			`}`

			`public JResult extractContent(String html) throws Exception {`
			`return extractContent(new JResult(), html);`
			`}`

			`public JResult extractContent(JResult res, String html) throws Exception {`
			`return extractContent(res, html, formatter);`
			`}`

			`public JResult extractContent(JResult res, String html, OutputFormatter formatter)`
			`throws Exception {`
			`if (html.isEmpty())`
			`throw new IllegalArgumentException("html string is empty!?");`

			`// http://jsoup.org/cookbook/extracting-data/selector-syntax`
			`return extractContent(res, Jsoup.parse(html), formatter);`
			`}`

			`public JResult extractContent(JResult res, Document doc, OutputFormatter formatter)`
			`throws Exception {`
			`if (doc == null)`
			`throw new NullPointerException("missing document");`

			`res.setTitle(extractTitle(doc));`
			`res.setDescription(extractDescription(doc));`
			`res.setCanonicalUrl(extractCanonicalUrl(doc));`

			`// now remove the clutter`
			`prepareDocument(doc);`

			`// init elements`
			`Collection<Element> nodes = getNodes(doc);`
			`int maxWeight = 0;`
			`Element bestMatchElement = null;`
			`for (Element entry : nodes) {`
			`int currentWeight = getWeight(entry);`
			`if (currentWeight > maxWeight) {`
			`maxWeight = currentWeight;`
			`bestMatchElement = entry;`
			`if (maxWeight > 200)`
			`break;`
			`}`
			`}`

			`if (bestMatchElement != null) {`
			`List<ImageResult> images = new ArrayList<ImageResult>();`
			`Element imgEl = determineImageSource(bestMatchElement, images);`
			`if (imgEl != null) {`
			`res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));`
			`// TODO remove parent container of image if it is contained in`
			`// bestMatchElement`
			`// to avoid image subtitles flooding in`

			`res.setImages(images);`
			`}`

			`// clean before grabbing text`
			`String text = formatter.getFormattedText(bestMatchElement);`
			`text = removeTitleFromText(text, res.getTitle());`
			`// this fails for short facebook post and probably tweets:`
			`// text.length() > res.getDescription().length()`
			`if (text.length() > res.getTitle().length()) {`
			`res.setText(text);`
			`// print("best element:", bestMatchElement);`
			`}`
			`res.setTextList(formatter.getTextList(bestMatchElement));`
			`}`

			`if (res.getImageUrl().isEmpty()) {`
			`res.setImageUrl(extractImageUrl(doc));`
			`}`

			`res.setRssUrl(extractRssUrl(doc));`
			`res.setVideoUrl(extractVideoUrl(doc));`
			`res.setFaviconUrl(extractFaviconUrl(doc));`
			`res.setKeywords(extractKeywords(doc));`
			`return res;`
			`}`

			`protected String extractTitle(Document doc) {`
			`String title = cleanTitle(doc.title());`
			`if (title.isEmpty()) {`
			`title = SHelper.innerTrim(doc.select("head title").text());`
			`if (title.isEmpty()) {`
			`title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));`
			`if (title.isEmpty()) {`
			`title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr(`
			`"content"));`
			`if (title.isEmpty()) {`
			`title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr(`
			`"content"));`
			`}`
			`}`
			`}`
			`}`
			`return title;`
			`}`

			`protected String extractCanonicalUrl(Document doc) {`
			`String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));`
			`if (url.isEmpty()) {`
			`url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));`
			`if (url.isEmpty()) {`
			`url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr(`
			`"content"));`
			`}`
			`}`
			`return url;`
			`}`

			`protected String extractDescription(Document doc) {`
			`String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr(`
			`"content"));`
			`if (description.isEmpty()) {`
			`description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr(`
			`"content"));`
			`if (description.isEmpty()) {`
			`description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]")`
			`.attr("content"));`
			`}`
			`}`
			`return description;`
			`}`

			`protected Collection<String> extractKeywords(Document doc) {`
			`String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));`

			`if (content != null) {`
			`if (content.startsWith("[") && content.endsWith("]"))`
			`content = content.substring(1, content.length() - 1);`

			`String[] split = content.split("\\s,\\s");`
			`if (split.length > 1 \|\| (split.length > 0 && !"".equals(split[0])))`
			`return Arrays.asList(split);`
			`}`
			`return Collections.emptyList();`
			`}`

			`/**`
			`* Tries to extract an image url from metadata if determineImageSource`
			`* failed`
			`*`
			`* @return image url or empty str`
			`*/`
			`protected String extractImageUrl(Document doc) {`
			`// use open graph tag to get image`
			`String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr(`
			`"content"));`
			`if (imageUrl.isEmpty()) {`
			`imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr(`
			`"content"));`
			`if (imageUrl.isEmpty()) {`
			`// prefer link over thumbnail-meta if empty`
			`imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));`
			`if (imageUrl.isEmpty()) {`
			`imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr(`
			`"content"));`
			`}`
			`}`
			`}`
			`return imageUrl;`
			`}`

			`protected String extractRssUrl(Document doc) {`
			`return SHelper.replaceSpaces(doc.select("link[rel=alternate]")`
			`.select("link[type=application/rss+xml]").attr("href"));`
			`}`

			`protected String extractVideoUrl(Document doc) {`
			`return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));`
			`}`

			`protected String extractFaviconUrl(Document doc) {`
			`String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));`
			`if (faviconUrl.isEmpty()) {`
			`faviconUrl = SHelper.replaceSpaces(doc.select(`
			`"head link[rel^=shortcut],link[rel$=icon]").attr("href"));`
			`}`
			`return faviconUrl;`
			`}`

			`/**`
			`* Weights current element. By matching it with positive candidates and`
			`* weighting child nodes. Since it's impossible to predict which exactly`
			`* names, ids or class names will be used in HTML, major role is played by`
			`* child nodes`
			`*`
			`* @param e`
			`* Element to weight, along with child nodes`
			`*/`
			`protected int getWeight(Element e) {`
			`int weight = calcWeight(e);`
			`weight += (int) Math.round(e.ownText().length() / 100.0 * 10);`
			`weight += weightChildNodes(e);`
			`return weight;`
			`}`

			`/**`
			`* Weights a child nodes of given Element. During tests some difficulties`
			`* were met. For instanance, not every single document has nested paragraph`
			`* tags inside of the major article tag. Sometimes people are adding one`
			`* more nesting level. So, we're adding 4 points for every 100 symbols`
			`* contained in tag nested inside of the current weighted element, but only`
			`* 3 points for every element that's nested 2 levels deep. This way we give`
			`* more chances to extract the element that has less nested levels,`
			`* increasing probability of the correct extraction.`
			`*`
			`* @param rootEl`
			`* Element, who's child nodes will be weighted`
			`*/`
			`protected int weightChildNodes(Element rootEl) {`
			`int weight = 0;`
			`Element caption = null;`
			`List<Element> pEls = new ArrayList<Element>(5);`
			`for (Element child : rootEl.children()) {`
			`String ownText = child.ownText();`
			`int ownTextLength = ownText.length();`
			`if (ownTextLength < 20)`
			`continue;`

			`if (ownTextLength > 200)`
			`weight += Math.max(50, ownTextLength / 10);`

			`if (child.tagName().equals("h1") \|\| child.tagName().equals("h2")) {`
			`weight += 30;`
			`} else if (child.tagName().equals("div") \|\| child.tagName().equals("p")) {`
			`weight += calcWeightForChild(child, ownText);`
			`if (child.tagName().equals("p") && ownTextLength > 50)`
			`pEls.add(child);`

			`if (child.className().toLowerCase(Locale.getDefault()).equals("caption"))`
			`caption = child;`
			`}`
			`}`

			`// use caption and image`
			`if (caption != null)`
			`weight += 30;`

			`if (pEls.size() >= 2) {`
			`for (Element subEl : rootEl.children()) {`
			`if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {`
			`weight += 20;`
			`// headerEls.add(subEl);`
			`} else if ("table;li;td;th".contains(subEl.tagName())) {`
			`addScore(subEl, -30);`
			`}`

			`if ("p".contains(subEl.tagName()))`
			`addScore(subEl, 30);`
			`}`
			`}`
			`return weight;`
			`}`

			`public void addScore(Element el, int score) {`
			`int old = getScore(el);`
			`setScore(el, score + old);`
			`}`

			`public int getScore(Element el) {`
			`int old = 0;`
			`try {`
			`old = Integer.parseInt(el.attr("gravityScore"));`
			`} catch (Exception ex) {`
			`}`
			`return old;`
			`}`

			`public void setScore(Element el, int score) {`
			`el.attr("gravityScore", Integer.toString(score));`
			`}`

			`private int calcWeightForChild(Element child, String ownText) {`
			`int c = SHelper.count(ownText, """);`
			`c += SHelper.count(ownText, "<");`
			`c += SHelper.count(ownText, ">");`
			`c += SHelper.count(ownText, "px");`
			`int val;`
			`if (c > 5)`
			`val = -30;`
			`else`
			`val = (int) Math.round(ownText.length() / 25.0);`

			`addScore(child, val);`
			`return val;`
			`}`

			`private int calcWeight(Element e) {`
			`int weight = 0;`
			`if (POSITIVE.matcher(e.className()).find())`
			`weight += 35;`

			`if (POSITIVE.matcher(e.id()).find())`
			`weight += 40;`

			`if (UNLIKELY.matcher(e.className()).find())`
			`weight -= 20;`

			`if (UNLIKELY.matcher(e.id()).find())`
			`weight -= 20;`

			`if (NEGATIVE.matcher(e.className()).find())`
			`weight -= 50;`

			`if (NEGATIVE.matcher(e.id()).find())`
			`weight -= 50;`

			`String style = e.attr("style");`
			`if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())`
			`weight -= 50;`
			`return weight;`
			`}`

			`public Element determineImageSource(Element el, List<ImageResult> images) {`
			`int maxWeight = 0;`
			`Element maxNode = null;`
			`Elements els = el.select("img");`
			`if (els.isEmpty())`
			`els = el.parent().select("img");`

			`double score = 1;`
			`for (Element e : els) {`
			`String sourceUrl = e.attr("src");`
			`if (sourceUrl.isEmpty() \|\| isAdImage(sourceUrl))`
			`continue;`

			`int weight = 0;`
			`int height = 0;`
			`try {`
			`height = Integer.parseInt(e.attr("height"));`
			`if (height >= 50)`
			`weight += 20;`
			`else`
			`weight -= 20;`
			`} catch (Exception ex) {`
			`}`

			`int width = 0;`
			`try {`
			`width = Integer.parseInt(e.attr("width"));`
			`if (width >= 50)`
			`weight += 20;`
			`else`
			`weight -= 20;`
			`} catch (Exception ex) {`
			`}`
			`String alt = e.attr("alt");`
			`if (alt.length() > 35)`
			`weight += 20;`

			`String title = e.attr("title");`
			`if (title.length() > 35)`
			`weight += 20;`

			`String rel = null;`
			`boolean noFollow = false;`
			`if (e.parent() != null) {`
			`rel = e.parent().attr("rel");`
			`if (rel != null && rel.contains("nofollow")) {`
			`noFollow = rel.contains("nofollow");`
			`weight -= 40;`
			`}`
			`}`

			`weight = (int) (weight * score);`
			`if (weight > maxWeight) {`
			`maxWeight = weight;`
			`maxNode = e;`
			`score = score / 2;`
			`}`

			`ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt,`
			`noFollow);`
			`images.add(image);`
			`}`

			`Collections.sort(images, new ImageComparator());`
			`return maxNode;`
			`}`

			`/**`
			`* Prepares document. Currently only stipping unlikely candidates, since`
			`* from time to time they're getting more score than good ones especially in`
			`* cases when major text is short.`
			`*`
			`* @param doc`
			`* document to prepare. Passed as reference, and changed inside`
			`* of function`
			`*/`
			`protected void prepareDocument(Document doc) {`
			`// stripUnlikelyCandidates(doc);`
			`removeScriptsAndStyles(doc);`
			`}`

			`/**`
			`* Removes unlikely candidates from HTML. Currently takes id and class name`
			`* and matches them against list of patterns`
			`*`
			`* @param doc`
			`* document to strip unlikely candidates from`
			`*/`
			`protected void stripUnlikelyCandidates(Document doc) {`
			`for (Element child : doc.select("body").select("*")) {`
			`String className = child.className().toLowerCase(Locale.getDefault());`
			`String id = child.id().toLowerCase(Locale.getDefault());`

			`if (NEGATIVE.matcher(className).find() \|\| NEGATIVE.matcher(id).find()) {`
			`// print("REMOVE:", child);`
			`child.remove();`
			`}`
			`}`
			`}`

			`private Document removeScriptsAndStyles(Document doc) {`
			`Elements scripts = doc.getElementsByTag("script");`
			`for (Element item : scripts) {`
			`item.remove();`
			`}`

			`Elements noscripts = doc.getElementsByTag("noscript");`
			`for (Element item : noscripts) {`
			`item.remove();`
			`}`

			`Elements styles = doc.getElementsByTag("style");`
			`for (Element style : styles) {`
			`style.remove();`
			`}`

			`return doc;`
			`}`

			`private boolean isAdImage(String imageUrl) {`
			`return SHelper.count(imageUrl, "ad") >= 2;`
			`}`

			`/**`
			`* Match only exact matching as longestSubstring can be too fuzzy`
			`*/`
			`public String removeTitleFromText(String text, String title) {`
			`// don't do this as its terrible to read`
			`// int index1 = text.toLowerCase().indexOf(title.toLowerCase());`
			`// if (index1 >= 0)`
			`// text = text.substring(index1 + title.length());`
			`// return text.trim();`
			`return text;`
			`}`

			`/**`
			`* @return a set of all important nodes`
			`*/`
			`public Collection<Element> getNodes(Document doc) {`
			`Set<Element> nodes = new HashSet<Element>(64);`
			`int score = 100;`
			`for (Element el : doc.select("body").select("*")) {`
			`if (NODES.matcher(el.tagName()).matches()) {`
			`nodes.add(el);`
			`setScore(el, score);`
			`score = score / 2;`
			`}`
			`}`
			`return nodes;`

			`}`

			`public String cleanTitle(String title) {`
			`StringBuilder res = new StringBuilder();`
			`// int index = title.lastIndexOf("\|");`
			`// if (index > 0 && title.length() / 2 < index)`
			`// title = title.substring(0, index + 1);`

			`int counter = 0;`
			`String[] strs = title.split("\\\|");`
			`for (String part : strs) {`
			`if (IGNORED_TITLE_PARTS.contains(part.toLowerCase(Locale.getDefault()).trim()))`
			`continue;`

			`if (counter == strs.length - 1 && res.length() > part.length())`
			`continue;`

			`if (counter > 0)`
			`res.append("\|");`

			`res.append(part);`
			`counter++;`
			`}`

			`return SHelper.innerTrim(res.toString());`
			`}`

			`/**`
			`* Comparator for Image by weight`
			`*`
			`* @author Chris Alexander, chris@chris-alexander.co.uk`
			`*`
			`*/`
			`public class ImageComparator implements Comparator<ImageResult> {`

			`@Override`
			`public int compare(ImageResult o1, ImageResult o2) {`
			`// Returns the highest weight first`
			`return o2.weight.compareTo(o1.weight);`
			`}`
			`}`
			`}`