You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
216 lines
7.2 KiB
216 lines
7.2 KiB
package org.purplei2p.lightning.reading;

import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

/** |
|
* @author goose | jim |
|
* @author karussell |
|
* <p/> |
|
* this class will be responsible for taking our top node and stripping out junk |
|
* we don't want and getting it ready for how we want it presented to the user |
|
*/ |
|
public class OutputFormatter { |
|
|
|
private static final int MIN_FIRST_PARAGRAPH_TEXT = 50; // Min size of first paragraph |
|
private static final int MIN_PARAGRAPH_TEXT = 30; // Min size of any other paragraphs |
|
private static final List<String> NODES_TO_REPLACE = Arrays.asList("strong", "b", "i"); |
|
private Pattern unlikelyPattern = Pattern.compile("display:none|visibility:hidden"); |
|
private final int minFirstParagraphText; |
|
private final int minParagraphText; |
|
private final List<String> nodesToReplace; |
|
private String nodesToKeepCssSelector = "p, ol"; |
|
|
|
public OutputFormatter() { |
|
this(MIN_FIRST_PARAGRAPH_TEXT, MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE); |
|
} |
|
|
|
public OutputFormatter(int minParagraphText) { |
|
this(minParagraphText, minParagraphText, NODES_TO_REPLACE); |
|
} |
|
|
|
public OutputFormatter(int minFirstParagraphText, int minParagraphText) { |
|
this(minFirstParagraphText, minParagraphText, NODES_TO_REPLACE); |
|
} |
|
|
|
private OutputFormatter(int minFirstParagraphText, int minParagraphText, |
|
List<String> nodesToReplace) { |
|
this.minFirstParagraphText = minFirstParagraphText; |
|
this.minParagraphText = minParagraphText; |
|
this.nodesToReplace = nodesToReplace; |
|
} |
|
|
|
/** |
|
* set elements to keep in output text |
|
*/ |
|
public void setNodesToKeepCssSelector(String nodesToKeepCssSelector) { |
|
this.nodesToKeepCssSelector = nodesToKeepCssSelector; |
|
} |
|
|
|
/** |
|
* takes an element and turns the P tags into \n\n |
|
*/ |
|
public String getFormattedText(Element topNode) { |
|
setParagraphIndex(topNode, nodesToKeepCssSelector); |
|
removeNodesWithNegativeScores(topNode); |
|
StringBuilder sb = new StringBuilder(); |
|
int countOfP = append(topNode, sb, nodesToKeepCssSelector); |
|
String str = SHelper.innerTrim(sb.toString()); |
|
|
|
int topNodeLength = topNode.text().length(); |
|
if (topNodeLength == 0) { |
|
topNodeLength = 1; |
|
} |
|
|
|
|
|
boolean lowTextRatio = ((str.length() / (topNodeLength * 1.0)) < 0.25); |
|
if (str.length() > 100 && countOfP > 0 && !lowTextRatio) |
|
return str; |
|
|
|
// no subelements |
|
if (str.isEmpty() || (!topNode.text().isEmpty() |
|
&& str.length() <= topNode.ownText().length()) |
|
|| countOfP == 0 || lowTextRatio) { |
|
str = topNode.text(); |
|
} |
|
|
|
// if jsoup failed to parse the whole html now parse this smaller |
|
// snippet again to avoid html tags disturbing our text: |
|
return Jsoup.parse(str).text(); |
|
} |
|
|
|
/** |
|
* If there are elements inside our top node that have a negative gravity |
|
* score remove them |
|
*/ |
|
private void removeNodesWithNegativeScores(Element topNode) { |
|
Elements gravityItems = topNode.select("*[gravityScore]"); |
|
for (Element item : gravityItems) { |
|
int score = getScore(item); |
|
int paragraphIndex = getParagraphIndex(item); |
|
if (score < 0 || item.text().length() < getMinParagraph(paragraphIndex)) { |
|
item.remove(); |
|
} |
|
} |
|
} |
|
|
|
private int append(Element node, StringBuilder sb, String tagName) { |
|
int countOfP = 0; // Number of P elements in the article |
|
int paragraphWithTextIndex = 0; |
|
// is select more costly then getElementsByTag? |
|
MAIN: |
|
for (Element e : node.select(tagName)) { |
|
Element tmpEl = e; |
|
// check all elements until 'node' |
|
while (tmpEl != null && !tmpEl.equals(node)) { |
|
if (unlikely(tmpEl)) |
|
continue MAIN; |
|
tmpEl = tmpEl.parent(); |
|
} |
|
|
|
String text = node2Text(e); |
|
if (text.isEmpty() || text.length() < getMinParagraph(paragraphWithTextIndex) |
|
|| text.length() > SHelper.countLetters(text) * 2) { |
|
continue; |
|
} |
|
|
|
if (e.tagName().equals("p")) { |
|
countOfP++; |
|
} |
|
|
|
sb.append(text); |
|
sb.append("\n\n"); |
|
paragraphWithTextIndex += 1; |
|
} |
|
|
|
return countOfP; |
|
} |
|
|
|
private static void setParagraphIndex(Element node, String tagName) { |
|
int paragraphIndex = 0; |
|
for (Element e : node.select(tagName)) { |
|
e.attr("paragraphIndex", Integer.toString(paragraphIndex++)); |
|
} |
|
} |
|
|
|
private int getMinParagraph(int paragraphIndex) { |
|
if (paragraphIndex < 1) { |
|
return minFirstParagraphText; |
|
} else { |
|
return minParagraphText; |
|
} |
|
} |
|
|
|
private static int getParagraphIndex(Element el) { |
|
try { |
|
return Integer.parseInt(el.attr("paragraphIndex")); |
|
} catch (NumberFormatException ex) { |
|
return -1; |
|
} |
|
} |
|
|
|
private static int getScore(Element el) { |
|
try { |
|
return Integer.parseInt(el.attr("gravityScore")); |
|
} catch (Exception ex) { |
|
return 0; |
|
} |
|
} |
|
|
|
private boolean unlikely(Node e) { |
|
if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption")) |
|
return true; |
|
|
|
String style = e.attr("style"); |
|
String clazz = e.attr("class"); |
|
return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find(); |
|
} |
|
|
|
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) { |
|
for (Node child : e.childNodes()) { |
|
if (unlikely(child)) { |
|
continue; |
|
} |
|
if (child instanceof TextNode) { |
|
TextNode textNode = (TextNode) child; |
|
String txt = textNode.text(); |
|
accum.append(txt); |
|
} else if (child instanceof Element) { |
|
Element element = (Element) child; |
|
if (accum.length() > 0 && element.isBlock() |
|
&& !lastCharIsWhitespace(accum)) |
|
accum.append(' '); |
|
else if (element.tagName().equals("br")) |
|
accum.append(' '); |
|
appendTextSkipHidden(element, accum, indent + 1); |
|
} |
|
} |
|
} |
|
|
|
private static boolean lastCharIsWhitespace(StringBuilder accum) { |
|
return accum.length() != 0 && Character.isWhitespace(accum.charAt(accum.length() - 1)); |
|
} |
|
|
|
private String node2Text(Element el) { |
|
StringBuilder sb = new StringBuilder(200); |
|
appendTextSkipHidden(el, sb, 0); |
|
return sb.toString(); |
|
} |
|
|
|
private OutputFormatter setUnlikelyPattern(String unlikelyPattern) { |
|
this.unlikelyPattern = Pattern.compile(unlikelyPattern); |
|
return this; |
|
} |
|
|
|
public OutputFormatter appendUnlikelyPattern(String str) { |
|
return setUnlikelyPattern(unlikelyPattern.toString() + '|' + str); |
|
} |
|
} |