package acr.browser.lightning.reading;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector.SelectorParseException;

/**
 * This class is thread safe.
 * Extracts article content from the string form of a web page;
 * 'extractContent' is the main entry point for external programs/classes.
 *
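 * <p>A minimal usage sketch; error handling omitted:
 * <pre>{@code
 * ArticleTextExtractor extractor = new ArticleTextExtractor();
 * JResult res = extractor.extractContent(htmlString);
 * String title = res.getTitle();
 * String text = res.getText();
 * }</pre>
 *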
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */
public class ArticleTextExtractor {

    // Interesting nodes
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
    // Unlikely candidates
    private String unlikelyStr;
    private Pattern UNLIKELY;
    // Most likely positive candidates
    private String positiveStr;
    private Pattern POSITIVE;
    // Most likely negative candidates
    private String negativeStr;
    private Pattern NEGATIVE;
    private static final Pattern NEGATIVE_STYLE =
            Pattern.compile("hidden|display: ?none|font-size: ?small");
    private static final Pattern IGNORE_AUTHOR_PARTS =
            Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE);
    private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
        {
            add("hacker news");
            add("facebook");
            add("home");
            add("articles");
        }
    };
    private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
    private OutputFormatter formatter = DEFAULT_FORMATTER;

    private static final int MAX_AUTHOR_NAME_LENGTH = 255;
    private static final int MIN_AUTHOR_NAME_LENGTH = 4;
    private static final List<Pattern> CLEAN_AUTHOR_PATTERNS = Collections.singletonList(
            Pattern.compile("By\\S*(.*)[\\.,].*")
    );
    private static final int MAX_AUTHOR_DESC_LENGTH = 1000;
    private static final int MAX_IMAGE_LENGTH = 255;

    // For debugging
    private static final boolean DEBUG_WEIGHTS = false;
    private static final int MAX_LOG_LENGTH = 200;

    public ArticleTextExtractor() {
        setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
                + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor|"
                + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
                + "login|si(debar|gn|ngle)");
        setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
                + "|arti(cle|kel)|instapaper_body");
        setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
                + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
                + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
    }

    private ArticleTextExtractor setUnlikely(String unlikelyStr) {
        this.unlikelyStr = unlikelyStr;
        UNLIKELY = Pattern.compile(unlikelyStr);
        return this;
    }

    public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
        return setUnlikely(unlikelyStr + '|' + unlikelyMatches);
    }

    private ArticleTextExtractor setPositive(String positiveStr) {
        this.positiveStr = positiveStr;
        POSITIVE = Pattern.compile(positiveStr);
        return this;
    }

    public ArticleTextExtractor addPositive(String pos) {
        return setPositive(positiveStr + '|' + pos);
    }

    private ArticleTextExtractor setNegative(String negativeStr) {
        this.negativeStr = negativeStr;
        NEGATIVE = Pattern.compile(negativeStr);
        return this;
    }

    public ArticleTextExtractor addNegative(String neg) {
        setNegative(negativeStr + '|' + neg);
        return this;
    }

    public void setOutputFormatter(OutputFormatter formatter) {
        this.formatter = formatter;
    }

    /**
     * Extracts article text from the given HTML string. Not tested with
     * improper HTML, although jsoup should be able to handle minor issues.
     *
     * @param html the HTML string to extract from
     * @return the extracted article, all HTML tags stripped
     */
    public JResult extractContent(String html, int maxContentSize) throws Exception {
        return extractContent(new JResult(), html, maxContentSize);
    }

    public JResult extractContent(String html) throws Exception {
        return extractContent(new JResult(), html, 0);
    }

    public JResult extractContent(JResult res, String html, int maxContentSize) throws Exception {
        return extractContent(res, html, formatter, true, maxContentSize);
    }

    public JResult extractContent(JResult res, String html) throws Exception {
        return extractContent(res, html, formatter, true, 0);
    }

    private JResult extractContent(JResult res, String html, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize) throws Exception {
        if (html.isEmpty())
            throw new IllegalArgumentException("html string is empty!?");

        // http://jsoup.org/cookbook/extracting-data/selector-syntax
        return extractContent(res, Jsoup.parse(html), formatter, extractImages, maxContentSize);
    }

    // Returns the best node match based on the weights (see getWeight for strategy)
    private Element getBestMatchElement(Collection<Element> nodes) {
        int maxWeight = -200; // why -200 now instead of 0?
        Element bestMatchElement = null;

        boolean ignoreMaxWeightLimit = false;
        for (Element entry : nodes) {

            int currentWeight = getWeight(entry, false);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;

                /*
                // NOTE: This optimization fails with large pages that
                // contain chunks of text that can be mistaken for articles; since we
                // want the best accuracy possible, I am disabling it for now. AP.

                // The original code had a limit of 200; the intention was that
                // if a node had a weight greater than that, then most likely
                // it was the main content.
                // However this assumption fails when the amount of text in the
                // children (or grandchildren) is too large. If we detect this
                // case then the limit is ignored and we try all the nodes to select
                // the one with the absolute maximum weight.
                if (maxWeight > 500) {
                    ignoreMaxWeightLimit = true;
                    continue;
                }

                // formerly 200, increased to 250 to account for the fact
                // that we are not adding the weights of the grandchildren to the
                // tally.
                if (maxWeight > 250 && !ignoreMaxWeightLimit)
                    break;
                */
            }
        }

        return bestMatchElement;
    }

    private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize) throws Exception {
        Document origDoc = doc.clone();
        JResult result = extractContent(res, doc, formatter, extractImages, maxContentSize, true);
        //System.out.println("result.getText().length()=" + result.getText().length());
        if (result.getText().isEmpty()) {
            result = extractContent(res, origDoc, formatter, extractImages, maxContentSize, false);
        }
        return result;
    }

    // main workhorse
    private JResult extractContent(JResult res, Document doc, OutputFormatter formatter,
                                   boolean extractImages, int maxContentSize, boolean cleanScripts) {
        if (doc == null)
            throw new NullPointerException("missing document");

        // get the easy stuff
        res.setTitle(extractTitle(doc));
        res.setDescription(extractDescription(doc));
        res.setCanonicalUrl(extractCanonicalUrl(doc));
        res.setType(extractType(doc));
        res.setSitename(extractSitename(doc));
        res.setLanguage(extractLanguage(doc));

        // get author information
        res.setAuthorName(extractAuthorName(doc));
        res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName()));

        // add extra selection gravity to any element containing author name
        // wasn't useful in the case I implemented it for, but might be later
        /*
        Elements authelems = doc.select(":containsOwn(" + res.getAuthorName() + ")");
        for (Element elem : authelems) {
            elem.attr("extragravityscore", Integer.toString(100));
            System.out.println("modified element " + elem.toString());
        }
        */

        // get date from document; if not present, extract from URL if possible
        Date docdate = extractDate(doc);
        if (docdate == null) {
            String dateStr = SHelper.estimateDate(res.getUrl());
            docdate = parseDate(dateStr);
        }
        res.setDate(docdate);

        // now remove the clutter
        if (cleanScripts) {
            prepareDocument(doc);
        }

        // init elements and get the one with the highest weight (see getWeight for strategy)
        Collection<Element> nodes = getNodes(doc);
        Element bestMatchElement = getBestMatchElement(nodes);

        // do extraction from the best element
        if (bestMatchElement != null) {
            if (extractImages) {
                List<ImageResult> images = new ArrayList<>();
                Element imgEl = determineImageSource(bestMatchElement, images);
                if (imgEl != null) {
                    res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
                    // TODO remove parent container of image if it is contained in bestMatchElement
                    // to avoid image subtitles flooding in

                    res.setImages(images);
                }
            }

            // clean before grabbing text
            String text = formatter.getFormattedText(bestMatchElement);
            text = removeTitleFromText(text, res.getTitle());
            // this fails for short facebook posts and probably tweets: text.length() > res.getDescription().length()
            if (text.length() > res.getTitle().length()) {
                if (maxContentSize > 0) {
                    if (text.length() > maxContentSize) {
                        text = utf8truncate(text, maxContentSize);
                    }
                }
                res.setText(text);
            }

            // extract links from the same best element
            String fullhtml = bestMatchElement.toString();
            Elements children = bestMatchElement.select("a[href]"); // a with href = link
            String linkstr;
            int linkpos;
            int lastlinkpos = 0;
            for (Element child : children) {
                linkstr = child.toString();
                linkpos = fullhtml.indexOf(linkstr, lastlinkpos);
                res.addLink(child.attr("abs:href"), child.text(), linkpos);
                lastlinkpos = linkpos;
            }
        }

        if (extractImages) {
            if (res.getImageUrl().isEmpty()) {
                res.setImageUrl(extractImageUrl(doc));
            }
        }

        res.setRssUrl(extractRssUrl(doc));
        res.setVideoUrl(extractVideoUrl(doc));
        res.setFaviconUrl(extractFaviconUrl(doc));
        res.setKeywords(extractKeywords(doc));

        // Sanity checks on the author name
        if (res.getAuthorName().length() > MAX_AUTHOR_NAME_LENGTH) {
            res.setAuthorName(utf8truncate(res.getAuthorName(), MAX_AUTHOR_NAME_LENGTH));
        }

        // Sanity checks on the author description
        String authorDescSnippet = getSnippet(res.getAuthorDescription());
        if (getSnippet(res.getText()).equals(authorDescSnippet) ||
                getSnippet(res.getDescription()).equals(authorDescSnippet)) {
            res.setAuthorDescription("");
        } else {
            if (res.getAuthorDescription().length() > MAX_AUTHOR_DESC_LENGTH) {
                res.setAuthorDescription(utf8truncate(res.getAuthorDescription(), MAX_AUTHOR_DESC_LENGTH));
            }
        }

        // Sanity checks on the image URL
        if (res.getImageUrl().length() > MAX_IMAGE_LENGTH) {
            // doesn't make sense to truncate a URL
            res.setImageUrl("");
        }

        return res;
    }

    private static String getSnippet(String data) {
        if (data.length() < 50)
            return data;
        else
            return data.substring(0, 50);
    }

    private static String extractTitle(Document doc) {
        String title = cleanTitle(doc.title());
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head title").text());
            if (title.isEmpty()) {
                title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
                if (title.isEmpty()) {
                    title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
                    if (title.isEmpty()) {
                        title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
                        if (title.isEmpty()) {
                            title = SHelper.innerTrim(doc.select("h1:first-of-type").text());
                        }
                    }
                }
            }
        }
        return title;
    }

    private static String extractCanonicalUrl(Document doc) {
        String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
        if (url.isEmpty()) {
            url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
            if (url.isEmpty()) {
                url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
            }
        }
        return url;
    }

    private static String extractDescription(Document doc) {
        String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
        if (description.isEmpty()) {
            description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content"));
            if (description.isEmpty()) {
                description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content"));
            }
        }
        return description;
    }

    // Returns the publication date or null
    private static Date extractDate(Document doc) {
        String dateStr = "";

        // try some locations that nytimes uses
        Element elem = doc.select("meta[name=ptime]").first();
        if (elem != null) {
            dateStr = SHelper.innerTrim(elem.attr("content"));
            // elem.attr("extragravityscore", Integer.toString(100));
            // System.out.println("date modified element " + elem.toString());
        }

        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content"));
        }
        // if any of the locations above produced a date string, parse it;
        // otherwise fall through to the remaining strategies below
        if (!dateStr.isEmpty()) {
            return parseDate(dateStr);
        }

        // taking this stuff directly from Juicer (and converted to Java)
        // opengraph (?)
        Elements elems = doc.select("meta[property=article:published_time]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                try {
                    if (dateStr.endsWith("Z")) {
                        dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00";
                    } else {
                        // insert "GMT" before the numeric offset (e.g. "+05:00")
                        // so the result matches the form produced by the "Z" branch
                        dateStr = dateStr.substring(0, dateStr.length() - 6)
                                + "GMT"
                                + dateStr.substring(dateStr.length() - 6);
                    }
                } catch (StringIndexOutOfBoundsException ex) {
                    // do nothing
                }
                return parseDate(dateStr);
            }
        }

        // rnews
        elems = doc.select("meta[property=dateCreated], span[property=dateCreated]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // schema.org creativework
        elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else if (el.hasAttr("value")) {
                dateStr = el.attr("value");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // parsely page (?)
        /* skip conversion for now, seems highly specific and uses a new lib
        elems = doc.select("meta[name=parsely-page]");
        if (elems.size() > 0) {
            implicit val formats = net.liftweb.json.DefaultFormats

            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                val json = parse(el.attr("content"))

                return DateUtils.parseDateStrictly((json \ "pub_date").extract[String], Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz"))
            }
        }
        */

        // BBC
        elems = doc.select("meta[name=OriginalPublicationDate]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wired
        elems = doc.select("meta[name=DisplayDate]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wildcard
        elems = doc.select("meta[name*=date]");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // blogger
        elems = doc.select(".date-header");
        if (!elems.isEmpty()) {
            Element el = elems.get(0);
            dateStr = el.text();
            return parseDate(dateStr);
        }

        return null;
    }

    // Date parsing is currently stubbed out and always returns the epoch; the
    // commented pattern list below documents the formats a real implementation
    // (e.g. commons-lang DateUtils, see the disabled code at the bottom) would accept.
    private static Date parseDate(String dateStr) {
        // String[] parsePatterns = {
        //     "yyyy-MM-dd'T'HH:mm:ssz",
        //     "yyyy-MM-dd HH:mm:ss",
        //     "yyyy/MM/dd HH:mm:ss",
        //     "yyyy-MM-dd HH:mm",
        //     "yyyy/MM/dd HH:mm",
        //     "yyyy-MM-dd",
        //     "yyyy/MM/dd",
        //     "MM/dd/yyyy HH:mm:ss",
        //     "MM-dd-yyyy HH:mm:ss",
        //     "MM/dd/yyyy HH:mm",
        //     "MM-dd-yyyy HH:mm",
        //     "MM/dd/yyyy",
        //     "MM-dd-yyyy",
        //     "EEE, MMM dd, yyyy",
        //     "MM/dd/yyyy hh:mm:ss a",
        //     "MM-dd-yyyy hh:mm:ss a",
        //     "MM/dd/yyyy hh:mm a",
        //     "MM-dd-yyyy hh:mm a",
        //     "yyyy-MM-dd hh:mm:ss a",
        //     "yyyy/MM/dd hh:mm:ss a ",
        //     "yyyy-MM-dd hh:mm a",
        //     "yyyy/MM/dd hh:mm ",
        //     "dd MMM yyyy",
        //     "dd MMMM yyyy",
        //     "yyyyMMddHHmm",
        //     "yyyyMMdd HHmm",
        //     "dd-MM-yyyy HH:mm:ss",
        //     "dd/MM/yyyy HH:mm:ss",
        //     "dd MMM yyyy HH:mm:ss",
        //     "dd MMMM yyyy HH:mm:ss",
        //     "dd-MM-yyyy HH:mm",
        //     "dd/MM/yyyy HH:mm",
        //     "dd MMM yyyy HH:mm",
        //     "dd MMMM yyyy HH:mm",
        //     "yyyyMMddHHmmss",
        //     "yyyyMMdd HHmmss",
        //     "yyyyMMdd"
        // };

        return new Date(0);

        // try {
        //     return DateUtils.parseDateStrictly(dateStr, parsePatterns);
        // } catch (Exception ex) {
        //     return null;
        // }
    }

    // Returns the author name or an empty string
    private String extractAuthorName(Document doc) {
        String authorName = "";

        // first try the Google Author tag
        Element result = doc.select("body [rel*=author]").first();
        if (result != null)
            authorName = SHelper.innerTrim(result.ownText());

        // if that doesn't work, try some other methods
        if (authorName.isEmpty()) {

            // meta tag approaches, get content
            result = doc.select("head meta[name=author]").first();
            if (result != null) {
                authorName = SHelper.innerTrim(result.attr("content"));
            }

            if (authorName.isEmpty()) { // for "opengraph"
                authorName = SHelper.innerTrim(doc.select("head meta[property=article:author]").attr("content"));
            }
            if (authorName.isEmpty()) { // OpenGraph twitter:creator tag
                authorName = SHelper.innerTrim(doc.select("head meta[property=twitter:creator]").attr("content"));
            }
            if (authorName.isEmpty()) { // for "schema.org creativework"
                authorName = SHelper.innerTrim(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content"));
            }

            // other hacks
            if (authorName.isEmpty()) {
                try {
                    // build up a set of elements which have likely author-related terms
                    // .X searches for class X
                    Elements matches = doc.select("a[rel=author],.byline-name,.byLineTag,.byline,.author,.by,.writer,.address");

                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("body [class*=author]");
                    }

                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("body [title*=author]");
                    }

                    // a hack for huffington post
                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select(".staff_info dl a[href]");
                    }

                    // a hack for http://sports.espn.go.com/
                    if (matches == null || matches.isEmpty()) {
                        matches = doc.select("cite[class*=source]");
                    }

                    // select the best element from them
                    if (matches != null) {
                        Element bestMatch = getBestMatchElement(matches);

                        if (bestMatch != null) {
                            authorName = bestMatch.ownText();

                            // fall back to the element's full text (including
                            // children) when its own text is too short to be a name
                            if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) {
                                authorName = bestMatch.text();
                            }

                            authorName = SHelper.innerTrim(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll(""));

                            if (authorName.contains(",")) {
                                authorName = authorName.split(",")[0];
                            }
                        }
                    }
                } catch (Exception e) {
                    System.out.println(e.toString());
                }
            }
        }

        for (Pattern pattern : CLEAN_AUTHOR_PATTERNS) {
            Matcher matcher = pattern.matcher(authorName);
            if (matcher.matches()) {
                authorName = SHelper.innerTrim(matcher.group(1));
                break;
            }
        }

        return authorName;
    }

    // Returns the author description or an empty string
    private String extractAuthorDescription(Document doc, String authorName) {

        String authorDesc = "";

        if (authorName.isEmpty())
            return "";

        // Special case for entrepreneur.com
        Elements matches = doc.select(".byline > .bio");
        if (matches != null && !matches.isEmpty()) {
            Element bestMatch = matches.first(); // assume it is the first
            authorDesc = bestMatch.text();
            return authorDesc;
        }

        // Special case for huffingtonpost.com
        matches = doc.select(".byline span[class*=teaser]");
        if (matches != null && !matches.isEmpty()) {
            Element bestMatch = matches.first(); // assume it is the first
            authorDesc = bestMatch.text();
            return authorDesc;
        }

        try {
            Elements nodes = doc.select(":containsOwn(" + authorName + ')');
            Element bestMatch = getBestMatchElement(nodes);
            if (bestMatch != null)
                authorDesc = bestMatch.text();
        } catch (SelectorParseException se) {
            // Avoid error when the selector is invalid
        }

        return authorDesc;
    }

    private static Collection<String> extractKeywords(Document doc) {
        String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));

        if (content.startsWith("[") && content.endsWith("]"))
            content = content.substring(1, content.length() - 1);

        String[] split = content.split("\\s*,\\s*");
        if (split.length > 1 || (split.length > 0 && split[0] != null && !split[0].isEmpty()))
            return Arrays.asList(split);
        return Collections.emptyList();
    }

    /**
     * Tries to extract an image URL from metadata if determineImageSource
     * failed.
     *
     * @return image URL or empty string
     */
    private static String extractImageUrl(Document doc) {
        // use open graph tag to get image
        String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
        if (imageUrl.isEmpty()) {
            imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content"));
            if (imageUrl.isEmpty()) {
                // prefer link over thumbnail-meta if empty
                imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
                if (imageUrl.isEmpty()) {
                    imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
                }
            }
        }
        return imageUrl;
    }

    private static String extractRssUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
    }

    private static String extractVideoUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
    }

    private static String extractFaviconUrl(Document doc) {
        String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
        if (faviconUrl.isEmpty()) {
            faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
        }
        return faviconUrl;
    }

    private static String extractType(Document doc) {
        return SHelper.innerTrim(doc.select("head meta[property=og:type]").attr("content"));
    }

    private static String extractSitename(Document doc) {
        String sitename = SHelper.innerTrim(doc.select("head meta[property=og:site_name]").attr("content"));
        if (sitename.isEmpty()) {
            sitename = SHelper.innerTrim(doc.select("head meta[name=twitter:site]").attr("content"));
        }
        return sitename;
    }

    private static String extractLanguage(Document doc) {
        String language = SHelper.innerTrim(doc.select("head meta[property=language]").attr("content"));
        if (language.isEmpty()) {
            language = SHelper.innerTrim(doc.select("html").attr("lang"));
            if (language.isEmpty()) {
                language = SHelper.innerTrim(doc.select("head meta[property=og:locale]").attr("content"));
            }
        }
        // keep only the two-letter language code, e.g. "en" from "en_US"
        if (language.length() > 2) {
            language = language.substring(0, 2);
        }
        return language;
    }

    /**
     * Weights the current element by matching it against positive candidates
     * and weighting its child nodes. Since it is impossible to predict exactly
     * which names, ids or class names will be used in the HTML, the major role
     * is played by the child nodes.
     *
     * @param e          Element to weight, along with its child nodes
     * @param checkextra whether to honor a possible 'extragravityscore' attribute
     */
    private int getWeight(Element e, boolean checkextra) {
        int weight = calcWeight(e);
        int ownTextWeight = (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += ownTextWeight;
        int childrenWeight = weightChildNodes(e);
        weight += childrenWeight;

        // add additional weight using a possible 'extragravityscore' attribute
        if (checkextra) {
            Element xelem = e.select("[extragravityscore]").first();
            if (xelem != null) {
                // System.out.println("HERE found one: " + xelem.toString());
                weight += Integer.parseInt(xelem.attr("extragravityscore"));
                // System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore"));
            }
        }

        return weight;
    }

    /**
     * Weights the child nodes of the given Element. During tests some
     * difficulties were met. For instance, not every document has nested
     * paragraph tags inside of the major article tag; sometimes people add one
     * more nesting level. So we add 4 points for every 100 characters
     * contained in a tag nested inside the current weighted element, but only
     * 3 points for every element that is nested 2 levels deep. This way we
     * give more chances to extract the element that has fewer nested levels,
     * increasing the probability of a correct extraction.
     *
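     * <p>Illustrative sketch with assumed numbers (not taken from a real
     * page): a direct child {@code <p>} with 350 characters of own text
     * contributes about {@code round(350 / 35.0) = 10} points via
     * calcWeightForChild, while the same paragraph one level deeper counts
     * for only a third of its weight through the grandchildren pass below.
     *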
     * @param rootEl Element whose child nodes will be weighted
     */
    private int weightChildNodes(Element rootEl) {
        int weight = 0;
        Element caption = null;
        List<Element> pEls = new ArrayList<>(5);

        for (Element child : rootEl.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20)
                continue;

            if (ownTextLength > 200) {
                int childOwnTextWeight = Math.max(50, ownTextLength / 10);
                weight += childOwnTextWeight;
            }

            if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
                int h2h1Weight = 30;
                weight += h2h1Weight;
            } else if (child.tagName().equals("div") || child.tagName().equals("p")) {
                int calcChildWeight = calcWeightForChild(child, ownText);
                weight += calcChildWeight;
                if (child.tagName().equals("p") && ownTextLength > 50)
                    pEls.add(child);

                if (child.className().toLowerCase().equals("caption"))
                    caption = child;
            }
        }

        //
        // Visit grandchildren. This section visits the grandchildren
        // of the node and calculates their weights. Note that grandchildren
        // weights are only worth 1/3 of the children's.
        //
        int grandChildrenWeight = 0;
        for (Element child2 : rootEl.children()) {

            // If the node looks negative, don't include it in the weights;
            // instead penalize the grandparent. This is done to try to
            // avoid giving weights to navigation nodes, etc.
            if (NEGATIVE.matcher(child2.id()).find() ||
                    NEGATIVE.matcher(child2.className()).find()) {
                grandChildrenWeight -= 30;
                continue;
            }

            for (Element grandchild : child2.children()) {
                int grandchildWeight = 0;
                String ownText = grandchild.ownText();
                int ownTextLength = ownText.length();
                if (ownTextLength < 20)
                    continue;

                if (ownTextLength > 200) {
                    int childOwnTextWeight = Math.max(50, ownTextLength / 10);
                    grandchildWeight += childOwnTextWeight;
                }

                if (grandchild.tagName().equals("h1") || grandchild.tagName().equals("h2")) {
                    int h2h1Weight = 30;
                    grandchildWeight += h2h1Weight;
                } else if (grandchild.tagName().equals("div") || grandchild.tagName().equals("p")) {
                    int calcChildWeight = calcWeightForChild(grandchild, ownText);
                    grandchildWeight += calcChildWeight;
                }

                grandChildrenWeight += grandchildWeight;
            }
        }

        grandChildrenWeight = grandChildrenWeight / 3;
        weight += grandChildrenWeight;

        // use caption and image
        if (caption != null) {
            int captionWeight = 30;
            weight += captionWeight;
        }

        if (pEls.size() >= 2) {
            for (Element subEl : rootEl.children()) {
                if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
                    int h1h2h3Weight = 20;
                    weight += h1h2h3Weight;
                    // headerEls.add(subEl);
                } else if ("table;li;td;th".contains(subEl.tagName())) {
                    addScore(subEl, -30);
                }

                if ("p".equals(subEl.tagName()))
                    addScore(subEl, 30);
            }
        }
        return weight;
    }

    private static void addScore(Element el, int score) {
        int old = getScore(el);
        setScore(el, score + old);
    }

    private static int getScore(Element el) {
        int old = 0;
        try {
            old = Integer.parseInt(el.attr("gravityScore"));
        } catch (Exception ignored) {
        }
        return old;
    }

    private static void setScore(Element el, int score) {
        el.attr("gravityScore", Integer.toString(score));
    }

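    // Penalize children whose own text looks like leftover markup or CSS
    // (many quote or angle-bracket entities, or "px" tokens); otherwise
    // reward roughly one point per 35 characters of own text.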
    private static int calcWeightForChild(Element child, String ownText) {
        int c = SHelper.count(ownText, "&quot;");
        c += SHelper.count(ownText, "&lt;");
        c += SHelper.count(ownText, "&gt;");
        c += SHelper.count(ownText, "px");
        int val;
        if (c > 5)
            val = -30;
        else
            val = (int) Math.round(ownText.length() / 35.0);

        addScore(child, val);
        return val;
    }

    private int calcWeight(Element e) {
        int weight = 0;
        if (POSITIVE.matcher(e.className()).find())
            weight += 35;

        if (POSITIVE.matcher(e.id()).find())
            weight += 45;

        if (UNLIKELY.matcher(e.className()).find())
            weight -= 20;

        if (UNLIKELY.matcher(e.id()).find())
            weight -= 20;

        if (NEGATIVE.matcher(e.className()).find())
            weight -= 50;

        if (NEGATIVE.matcher(e.id()).find())
            weight -= 50;

        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
            weight -= 50;

        String itemprop = e.attr("itemprop");
        if (itemprop != null && !itemprop.isEmpty() && POSITIVE.matcher(itemprop).find()) {
            weight += 100;
        }

        return weight;
    }

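    // Scores every <img> inside the element (or its parent, if the element
    // itself has none) by size hints, alt/title length and rel=nofollow; the
    // score multiplier halves each time a new best image is found, so images
    // appearing earlier are preferred. Returns the best candidate or null.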
    private static Element determineImageSource(Element el, List<ImageResult> images) {
        int maxWeight = 0;
        Element maxNode = null;
        Elements els = el.select("img");
        if (els.isEmpty())
            els = el.parent().select("img");

        double score = 1;
        for (Element e : els) {
            String sourceUrl = e.attr("src");
            if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
                continue;

            int weight = 0;
            int height = 0;
            try {
                height = Integer.parseInt(e.attr("height"));
                if (height >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ignored) {
            }

            int width = 0;
            try {
                width = Integer.parseInt(e.attr("width"));
                if (width >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ignored) {
            }
            String alt = e.attr("alt");
            if (alt.length() > 35)
                weight += 20;

            String title = e.attr("title");
            if (title.length() > 35)
                weight += 20;

            String rel;
            boolean noFollow = false;
            if (e.parent() != null) {
                rel = e.parent().attr("rel");
                if (rel != null && rel.contains("nofollow")) {
                    noFollow = true;
                    weight -= 40;
                }
            }

            weight = (int) (weight * score);
            if (weight > maxWeight) {
                maxWeight = weight;
                maxNode = e;
                score = score / 2;
            }

            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
            images.add(image);
        }

        Collections.sort(images, new ImageComparator());
        return maxNode;
    }

    /**
     * Prepares the document. Unlikely candidates can score higher than good
     * ones, especially when the main text is short, which is why
     * stripUnlikelyCandidates exists; that step is currently disabled, so
     * only scripts and styles are removed.
     *
     * @param doc document to prepare. Passed as reference, and changed inside
     *            of function
     */
    private static void prepareDocument(Document doc) {
        // stripUnlikelyCandidates(doc);
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes unlikely candidates from the HTML. Currently takes the id and
     * class name and matches them against a list of patterns.
     *
     * @param doc document to strip unlikely candidates from
     */
    protected void stripUnlikelyCandidates(Document doc) {
        for (Element child : doc.select("body").select("*")) {
            String className = child.className().toLowerCase();
            String id = child.id().toLowerCase();

            if (NEGATIVE.matcher(className).find()
                    || NEGATIVE.matcher(id).find()) {
                child.remove();
            }
        }
    }

    private static Document removeScriptsAndStyles(Document doc) {
        Elements scripts = doc.getElementsByTag("script");
        for (Element item : scripts) {
            item.remove();
        }
        Elements noscripts = doc.getElementsByTag("noscript");
        for (Element item : noscripts) {
            item.remove();
        }

        Elements styles = doc.getElementsByTag("style");
        for (Element style : styles) {
            style.remove();
        }

        return doc;
    }

    private static boolean isAdImage(String imageUrl) {
        return SHelper.count(imageUrl, "ad") >= 2;
    }

    /**
     * Match only exact matches, as longestSubstring can be too fuzzy.
     */
    private static String removeTitleFromText(String text, String title) {
        // don't do this, as it's terrible to read
        // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
        // if (index1 >= 0)
        //     text = text.substring(index1 + title.length());
        // return text.trim();
        return text;
    }

    /**
     * Based on a delimiter in the title, take the longest piece or do some
     * custom logic based on the site.
     *
     * @param title     the full title to split
     * @param delimiter the delimiter to split on
     * @return the chosen piece, trimmed
     */
    private static String doTitleSplits(String title, String delimiter) {
        String largeText = "";
        int largestTextLen = 0;
        String[] titlePieces = title.split(delimiter);

        // take the largest split
        for (String p : titlePieces) {
            if (p.length() > largestTextLen) {
                largeText = p;
                largestTextLen = p.length();
            }
        }

        largeText = largeText.replace("&raquo;", " ");
        largeText = largeText.replace("»", " ");
        return largeText.trim();
    }

    /**
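     * Matching elements are seeded with a decaying {@code gravityScore}:
     * the first match gets 100, the next 50, then 25, and so on.
     *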
     * @return a set of all important nodes
     */
    private static Collection<Element> getNodes(Document doc) {
        Map<Element, Object> nodes = new LinkedHashMap<>(64);
        int score = 100;
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.put(el, null);
                setScore(el, score);
                score = score / 2;
            }
        }
        return nodes.keySet();
    }

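    /**
     * Strips ignored segments such as site names (e.g. "facebook", "home")
     * from a pipe-delimited title. Illustrative example:
     * {@code cleanTitle("Some Article | Facebook")} yields
     * {@code "Some Article"}.
     */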
    private static String cleanTitle(String title) {

        // int index = title.lastIndexOf("|");
        // if (index > 0 && title.length() / 2 < index)
        //     title = title.substring(0, index + 1);

        int counter = 0;
        String[] strs = title.split("\\|");
        StringBuilder res = new StringBuilder(strs.length);
        for (String part : strs) {
            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
                continue;

            if (counter == strs.length - 1 && res.length() > part.length())
                continue;

            if (counter > 0)
                res.append('|');

            res.append(part);
            counter++;
        }

        return SHelper.innerTrim(res.toString());
    }

    /**
     * Truncate a Java string so that its UTF-8 representation will not
     * exceed the specified number of bytes.
     * <p>
     * For discussion of why you might want to do this, see
     * http://lpar.ath0.com/2011/06/07/unicode-alchemy-with-db2/
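     * <p>Illustrative example: {@code utf8truncate("héllo", 5)} returns
     * {@code "héll"}, since 'é' encodes to two UTF-8 bytes and keeping the
     * final 'o' would push the total to six bytes.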
     */
    private static String utf8truncate(String input, int length) {
        StringBuilder result = new StringBuilder(length);
        int resultlen = 0;
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            int charlen = 0;
            if (c <= 0x7f) {
                charlen = 1;
            } else if (c <= 0x7ff) {
                charlen = 2;
            } else if (c <= 0xd7ff) {
                charlen = 3;
            } else if (c <= 0xdbff) {
                charlen = 4;
            } else if (c <= 0xdfff) {
                charlen = 0;
            } else {
                charlen = 3;
            }
            if (resultlen + charlen > length) {
                break;
            }
            result.append(c);
            resultlen += charlen;
        }
        return result.toString();
    }

    /**
     * Comparator for Image by weight
     *
     * @author Chris Alexander, chris@chris-alexander.co.uk
     */
    private static class ImageComparator implements Comparator<ImageResult> {

        @Override
        public int compare(ImageResult o1, ImageResult o2) {
            // Returns the highest weight first
            return o2.weight.compareTo(o1.weight);
        }
    }

}