lightning-i2p/app/src/main/java/org/purplei2p/lightning/reading/OutputFormatter.java

package org.purplei2p.lightning.reading;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

/**
 * @author goose | jim
 * @author karussell
 *         <p/>
 *         this class will be responsible for taking our top node and stripping out junk
 *         we don't want and getting it ready for how we want it presented to the user
 */
public class OutputFormatter {

    private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph
    private static final int MIN_PARAGRAPH_TEXT = 30;       // Min size of any other paragraphs
    private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i");
    private Pattern unlikelyPattern = Pattern.compile("display:none|visibility:hidden");
    private final int minFirstParagraphText;
    private final int minParagraphText;
    private final List<String> nodesToReplace;
    private String nodesToKeepCssSelector = "p, ol";

    public OutputFormatter() {
        this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE);
    }

    public OutputFormatter(int minParagraphText) {
        this(minParagraphText, minParagraphText, NODES_TO_REPLACE);
    }

    public OutputFormatter(int minFirstParagraphText, int minParagraphText) {
        this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE);
    }

    private OutputFormatter(int minFirstParagraphText, int minParagraphText,
                            List<String> nodesToReplace) {
        this.minFirstParagraphText = minFirstParagraphText;
        this.minParagraphText = minParagraphText;
        this.nodesToReplace = nodesToReplace;
    }

    /**
     * set elements to keep in output text
     */
    public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) {
        this.nodesToKeepCssSelector = nodesToKeepCssSelector;
    }

    /**
     * takes an element and turns the P tags into \n\n
     */
    public String getFormattedText(Element topNode) {
        setParagraphIndex(topNode, nodesToKeepCssSelector);
        removeNodesWithNegativeScores(topNode);
        StringBuilder sb = new StringBuilder();
        int countOfP = append(topNode, sb, nodesToKeepCssSelector);
        String str = SHelper.innerTrim(sb.toString());

        int topNodeLength = topNode.text().length();
        if (topNodeLength == 0) {
            topNodeLength = 1;
        }


        boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25);
        if (str.length() > 100 && countOfP > 0 && !lowTextRatio)
            return str;

        // no subelements
        if (str.isEmpty() || (!topNode.text().isEmpty()
                && str.length() <= topNode.ownText().length())
                || countOfP == 0 || lowTextRatio) {
            str = topNode.text();
        }

        // if jsoup failed to parse the whole html now parse this smaller
        // snippet again to avoid html tags disturbing our text:
        return Jsoup.parse(str).text();
    }

    /**
     * If there are elements inside our top node that have a negative gravity
     * score remove them
     */
    private void removeNodesWithNegativeScores(Element topNode) {
        Elements gravityItems = topNode.select("*[gravityScore]");
        for (Element item : gravityItems) {
            int score = getScore(item);
            int paragraphIndex = getParagraphIndex(item);
            if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) {
                item.remove();
            }
        }
    }

    private int append(Element node, StringBuilder sb, String tagName) {
        int countOfP = 0; // Number of P elements in the article
        int paragraphWithTextIndex = 0;
        // is select more costly then getElementsByTag?
        MAIN:
        for (Element e : node.select(tagName)) {
            Element tmpEl = e;
            // check all elements until 'node'
            while (tmpEl != null && !tmpEl.equals(node)) {
                if (unlikely(tmpEl))
                    continue MAIN;
                tmpEl = tmpEl.parent();
            }

            String text = node2Text(e);
            if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex)
                    || text.length() > SHelper.countLetters(text) * 2) {
                continue;
            }

            if (e.tagName().equals("p")) {
                countOfP++;
            }

            sb.append(text);
            sb.append("\n\n");
            paragraphWithTextIndex += 1;
        }

        return countOfP;
    }

    private static void setParagraphIndex(Element node, String tagName) {
        int paragraphIndex = 0;
        for (Element e : node.select(tagName)) {
            e.attr("paragraphIndex", Integer.toString(paragraphIndex++));
        }
    }

    private int getMinParagraph(int paragraphIndex) {
        if (paragraphIndex < 1) {
            return minFirstParagraphText;
        } else {
            return minParagraphText;
        }
    }

    private static int getParagraphIndex(Element el) {
        try {
            return Integer.parseInt(el.attr("paragraphIndex"));
        } catch (NumberFormatException ex) {
            return -1;
        }
    }

    private static int getScore(Element el) {
        try {
            return Integer.parseInt(el.attr("gravityScore"));
        } catch (Exception ex) {
            return 0;
        }
    }

    private boolean unlikely(Node e) {
        if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
            return true;

        String style = e.attr("style");
        String clazz = e.attr("class");
        return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
    }

    private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
        for (Node child : e.childNodes()) {
            if (unlikely(child)) {
                continue;
            }
            if (child instanceof TextNode) {
                TextNode textNode = (TextNode) child;
                String txt = textNode.text();
                accum.append(txt);
            } else if (child instanceof Element) {
                Element element = (Element) child;
                if (accum.length() > 0 && element.isBlock()
                        && !lastCharIsWhitespace(accum))
                    accum.append(' ');
                else if (element.tagName().equals("br"))
                    accum.append(' ');
                appendTextSkipHidden(element, accum, indent + 1);
            }
        }
    }

    private static boolean lastCharIsWhitespace(StringBuilder accum) {
        return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1));
    }

    private String node2Text(Element el) {
        StringBuilder sb = new StringBuilder(200);
        appendTextSkipHidden(el, sb, 0);
        return sb.toString();
    }

    private OutputFormatter setUnlikelyPattern(String unlikelyPattern) {
        this.unlikelyPattern = Pattern.compile(unlikelyPattern);
        return this;
    }

    public OutputFormatter appendUnlikelyPattern(String str) {
        return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str);
    }
}