/*
 *  Copyright 2011 Peter Karich
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
|
package acr.browser.lightning.reading;

import java.io.UnsupportedEncodingException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.jsoup.nodes.Element;
|
|
|
/** |
|
* |
|
* @author Peter Karich |
|
*/ |
|
public class SHelper { |
|
|
|
public static final String UTF8 = "UTF-8"; |
|
private static final Pattern SPACE = Pattern.compile(" "); |
|
|
|
public static String replaceSpaces(String url) { |
|
if (!url.isEmpty()) { |
|
url = url.trim(); |
|
if (url.contains(" ")) { |
|
Matcher spaces = SPACE.matcher(url); |
|
url = spaces.replaceAll("%20"); |
|
} |
|
} |
|
return url; |
|
} |
|
|
|
public static int count(String str, String substring) { |
|
int c = 0; |
|
int index1 = str.indexOf(substring); |
|
if (index1 >= 0) { |
|
c++; |
|
c += count(str.substring(index1 + substring.length()), substring); |
|
} |
|
return c; |
|
} |
|
|
|
/** |
|
* remove more than two spaces or newlines |
|
*/ |
|
public static String innerTrim(String str) { |
|
if (str.isEmpty()) |
|
return ""; |
|
|
|
StringBuilder sb = new StringBuilder(); |
|
boolean previousSpace = false; |
|
for (int i = 0; i < str.length(); i++) { |
|
char c = str.charAt(i); |
|
if (c == ' ' || (int) c == 9 || c == '\n') { |
|
previousSpace = true; |
|
continue; |
|
} |
|
|
|
if (previousSpace) |
|
sb.append(' '); |
|
|
|
previousSpace = false; |
|
sb.append(c); |
|
} |
|
return sb.toString().trim(); |
|
} |
|
|
|
/** |
|
* Starts reading the encoding from the first valid character until an |
|
* invalid encoding character occurs. |
|
*/ |
|
public static String encodingCleanup(String str) { |
|
StringBuilder sb = new StringBuilder(); |
|
boolean startedWithCorrectString = false; |
|
for (int i = 0; i < str.length(); i++) { |
|
char c = str.charAt(i); |
|
if (Character.isDigit(c) || Character.isLetter(c) || c == '-' || c == '_') { |
|
startedWithCorrectString = true; |
|
sb.append(c); |
|
continue; |
|
} |
|
|
|
if (startedWithCorrectString) |
|
break; |
|
} |
|
return sb.toString().trim(); |
|
} |
|
|
|
/** |
|
* @return the longest substring as str1.substring(result[0], result[1]); |
|
*/ |
|
public static String getLongestSubstring(String str1, String str2) { |
|
int res[] = longestSubstring(str1, str2); |
|
if (res == null || res[0] >= res[1]) |
|
return ""; |
|
|
|
return str1.substring(res[0], res[1]); |
|
} |
|
|
|
public static int[] longestSubstring(String str1, String str2) { |
|
if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty()) |
|
return null; |
|
|
|
// dynamic programming => save already identical length into array |
|
// to understand this algo simply print identical length in every entry |
|
// of the array |
|
// i+1, j+1 then reuses information from i,j |
|
// java initializes them already with 0 |
|
int[][] num = new int[str1.length()][str2.length()]; |
|
int maxlen = 0; |
|
int lastSubstrBegin = 0; |
|
int endIndex = 0; |
|
for (int i = 0; i < str1.length(); i++) { |
|
for (int j = 0; j < str2.length(); j++) { |
|
if (str1.charAt(i) == str2.charAt(j)) { |
|
if ((i == 0) || (j == 0)) |
|
num[i][j] = 1; |
|
else |
|
num[i][j] = 1 + num[i - 1][j - 1]; |
|
|
|
if (num[i][j] > maxlen) { |
|
maxlen = num[i][j]; |
|
// generate substring from str1 => i |
|
lastSubstrBegin = i - num[i][j] + 1; |
|
endIndex = i + 1; |
|
} |
|
} |
|
} |
|
} |
|
return new int[] { lastSubstrBegin, endIndex }; |
|
} |
|
|
|
public static String getDefaultFavicon(String url) { |
|
return useDomainOfFirstArg4Second(url, "/favicon.ico"); |
|
} |
|
|
|
/** |
|
* @param urlForDomain |
|
* extract the domain from this url |
|
* @param path |
|
* this url does not have a domain |
|
* @return |
|
*/ |
|
public static String useDomainOfFirstArg4Second(String urlForDomain, String path) { |
|
if (path.startsWith("http")) |
|
return path; |
|
|
|
if ("favicon.ico".equals(path)) |
|
path = "/favicon.ico"; |
|
|
|
if (path.startsWith("//")) { |
|
// wikipedia special case, see tests |
|
if (urlForDomain.startsWith("https:")) |
|
return "https:" + path; |
|
|
|
return "http:" + path; |
|
} else if (path.startsWith("/")) |
|
return "http://" + extractHost(urlForDomain) + path; |
|
else if (path.startsWith("../")) { |
|
int slashIndex = urlForDomain.lastIndexOf("/"); |
|
if (slashIndex > 0 && slashIndex + 1 < urlForDomain.length()) |
|
urlForDomain = urlForDomain.substring(0, slashIndex + 1); |
|
|
|
return urlForDomain + path; |
|
} |
|
return path; |
|
} |
|
|
|
public static String extractHost(String url) { |
|
return extractDomain(url, false); |
|
} |
|
|
|
public static String extractDomain(String url, boolean aggressive) { |
|
if (url.startsWith("http://")) |
|
url = url.substring("http://".length()); |
|
else if (url.startsWith("https://")) |
|
url = url.substring("https://".length()); |
|
|
|
if (aggressive) { |
|
if (url.startsWith("www.")) |
|
url = url.substring("www.".length()); |
|
|
|
// strip mobile from start |
|
if (url.startsWith("m.")) |
|
url = url.substring("m.".length()); |
|
} |
|
|
|
int slashIndex = url.indexOf("/"); |
|
if (slashIndex > 0) |
|
url = url.substring(0, slashIndex); |
|
|
|
return url; |
|
} |
|
|
|
public static boolean isVideoLink(String url) { |
|
url = extractDomain(url, true); |
|
return url.startsWith("youtube.com") || url.startsWith("video.yahoo.com") |
|
|| url.startsWith("vimeo.com") || url.startsWith("blip.tv"); |
|
} |
|
|
|
public static boolean isVideo(String url) { |
|
return url.endsWith(".mpeg") || url.endsWith(".mpg") || url.endsWith(".avi") |
|
|| url.endsWith(".mov") || url.endsWith(".mpg4") || url.endsWith(".mp4") |
|
|| url.endsWith(".flv") || url.endsWith(".wmv"); |
|
} |
|
|
|
public static boolean isAudio(String url) { |
|
return url.endsWith(".mp3") || url.endsWith(".ogg") || url.endsWith(".m3u") |
|
|| url.endsWith(".wav"); |
|
} |
|
|
|
public static boolean isDoc(String url) { |
|
return url.endsWith(".pdf") || url.endsWith(".ppt") || url.endsWith(".doc") |
|
|| url.endsWith(".swf") || url.endsWith(".rtf") || url.endsWith(".xls"); |
|
} |
|
|
|
public static boolean isPackage(String url) { |
|
return url.endsWith(".gz") || url.endsWith(".tgz") || url.endsWith(".zip") |
|
|| url.endsWith(".rar") || url.endsWith(".deb") || url.endsWith(".rpm") |
|
|| url.endsWith(".7z"); |
|
} |
|
|
|
public static boolean isApp(String url) { |
|
return url.endsWith(".exe") || url.endsWith(".bin") || url.endsWith(".bat") |
|
|| url.endsWith(".dmg"); |
|
} |
|
|
|
public static boolean isImage(String url) { |
|
return url.endsWith(".png") || url.endsWith(".jpeg") || url.endsWith(".gif") |
|
|| url.endsWith(".jpg") || url.endsWith(".bmp") || url.endsWith(".ico") |
|
|| url.endsWith(".eps"); |
|
} |
|
|
|
/** |
|
* @see http |
|
* ://blogs.sun.com/CoreJavaTechTips/entry/cookie_handling_in_java_se |
|
*/ |
|
public static void enableCookieMgmt() { |
|
CookieManager manager = new CookieManager(); |
|
manager.setCookiePolicy(CookiePolicy.ACCEPT_ALL); |
|
CookieHandler.setDefault(manager); |
|
} |
|
|
|
/** |
|
* @see http |
|
* ://stackoverflow.com/questions/2529682/setting-user-agent-of-a-java |
|
* -urlconnection |
|
*/ |
|
public static void enableUserAgentOverwrite() { |
|
System.setProperty("http.agent", ""); |
|
} |
|
|
|
public static String getUrlFromUglyGoogleRedirect(String url) { |
|
if (url.startsWith("http://www.google.com/url?")) { |
|
url = url.substring("http://www.google.com/url?".length()); |
|
String arr[] = urlDecode(url).split("\\&"); |
|
if (arr != null) |
|
for (String str : arr) { |
|
if (str.startsWith("q=")) |
|
return str.substring("q=".length()); |
|
} |
|
} |
|
|
|
return null; |
|
} |
|
|
|
public static String getUrlFromUglyFacebookRedirect(String url) { |
|
if (url.startsWith("http://www.facebook.com/l.php?u=")) { |
|
url = url.substring("http://www.facebook.com/l.php?u=".length()); |
|
return urlDecode(url); |
|
} |
|
|
|
return null; |
|
} |
|
|
|
public static String urlEncode(String str) { |
|
try { |
|
return URLEncoder.encode(str, UTF8); |
|
} catch (UnsupportedEncodingException ex) { |
|
return str; |
|
} |
|
} |
|
|
|
public static String urlDecode(String str) { |
|
try { |
|
return URLDecoder.decode(str, UTF8); |
|
} catch (UnsupportedEncodingException ex) { |
|
return str; |
|
} |
|
} |
|
|
|
/** |
|
* Popular sites uses the #! to indicate the importance of the following |
|
* chars. Ugly but true. Such as: facebook, twitter, gizmodo, ... |
|
*/ |
|
public static String removeHashbang(String url) { |
|
return url.replaceFirst("#!", ""); |
|
} |
|
|
|
public static String printNode(Element root) { |
|
return printNode(root, 0); |
|
} |
|
|
|
public static String printNode(Element root, int indentation) { |
|
StringBuilder sb = new StringBuilder(); |
|
for (int i = 0; i < indentation; i++) { |
|
sb.append(' '); |
|
} |
|
sb.append(root.tagName()); |
|
sb.append(":"); |
|
sb.append(root.ownText()); |
|
sb.append("\n"); |
|
for (Element el : root.children()) { |
|
sb.append(printNode(el, indentation + 1)); |
|
sb.append("\n"); |
|
} |
|
return sb.toString(); |
|
} |
|
|
|
public static String estimateDate(String url) { |
|
int index = url.indexOf("://"); |
|
if (index > 0) |
|
url = url.substring(index + 3); |
|
|
|
int year = -1; |
|
int yearCounter = -1; |
|
int month = -1; |
|
int monthCounter = -1; |
|
int day = -1; |
|
String strs[] = url.split("/"); |
|
for (int counter = 0; counter < strs.length; counter++) { |
|
String str = strs[counter]; |
|
if (str.length() == 4) { |
|
try { |
|
year = Integer.parseInt(str); |
|
} catch (Exception ex) { |
|
continue; |
|
} |
|
if (year < 1970 || year > 3000) { |
|
year = -1; |
|
continue; |
|
} |
|
yearCounter = counter; |
|
} else if (str.length() == 2) { |
|
if (monthCounter < 0 && counter == yearCounter + 1) { |
|
try { |
|
month = Integer.parseInt(str); |
|
} catch (Exception ex) { |
|
continue; |
|
} |
|
if (month < 1 || month > 12) { |
|
month = -1; |
|
continue; |
|
} |
|
monthCounter = counter; |
|
} else if (counter == monthCounter + 1) { |
|
try { |
|
day = Integer.parseInt(str); |
|
} catch (Exception ex) { |
|
} |
|
if (day < 1 || day > 31) { |
|
day = -1; |
|
continue; |
|
} |
|
break; |
|
} |
|
} |
|
} |
|
|
|
if (year < 0) |
|
return null; |
|
|
|
StringBuilder str = new StringBuilder(); |
|
str.append(year); |
|
if (month < 1) |
|
return str.toString(); |
|
|
|
str.append('/'); |
|
if (month < 10) |
|
str.append('0'); |
|
str.append(month); |
|
if (day < 1) |
|
return str.toString(); |
|
|
|
str.append('/'); |
|
if (day < 10) |
|
str.append('0'); |
|
str.append(day); |
|
return str.toString(); |
|
} |
|
|
|
public static String completeDate(String dateStr) { |
|
if (dateStr == null) |
|
return null; |
|
|
|
int index = dateStr.indexOf('/'); |
|
if (index > 0) { |
|
index = dateStr.indexOf('/', index + 1); |
|
if (index > 0) |
|
return dateStr; |
|
else |
|
return dateStr + "/01"; |
|
} |
|
return dateStr + "/01/01"; |
|
} |
|
|
|
/** |
|
* keep in mind: simpleDateFormatter is not thread safe! call completeDate |
|
* before applying this formatter. |
|
*/ |
|
public static SimpleDateFormat createDateFormatter() { |
|
return new SimpleDateFormat("yyyy/MM/dd", Locale.getDefault()); |
|
} |
|
|
|
// with the help of |
|
// http://stackoverflow.com/questions/1828775/httpclient-and-ssl |
|
public static void enableAnySSL() { |
|
try { |
|
SSLContext ctx = SSLContext.getInstance("TLS"); |
|
ctx.init(new KeyManager[0], new TrustManager[] { new DefaultTrustManager() }, |
|
new SecureRandom()); |
|
SSLContext.setDefault(ctx); |
|
} catch (Exception ex) { |
|
ex.printStackTrace(); |
|
} |
|
} |
|
|
|
private static class DefaultTrustManager implements X509TrustManager { |
|
|
|
@Override |
|
public void checkClientTrusted(X509Certificate[] arg0, String arg1) |
|
throws CertificateException { |
|
} |
|
|
|
@Override |
|
public void checkServerTrusted(X509Certificate[] arg0, String arg1) |
|
throws CertificateException { |
|
} |
|
|
|
@Override |
|
public X509Certificate[] getAcceptedIssuers() { |
|
return null; |
|
} |
|
} |
|
|
|
public static int countLetters(String str) { |
|
int len = str.length(); |
|
int chars = 0; |
|
for (int i = 0; i < len; i++) { |
|
if (Character.isLetter(str.charAt(i))) |
|
chars++; |
|
} |
|
return chars; |
|
} |
|
}
|
|
|