You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
445 lines
13 KiB
445 lines
13 KiB
/* |
|
* Copyright 2011 Peter Karich |
|
* |
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
* you may not use this file except in compliance with the License. |
|
* You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
package acr.browser.lightning.reading; |
|
|
|
import java.io.BufferedReader; |
|
import java.io.BufferedWriter; |
|
import java.io.FileReader; |
|
import java.io.FileWriter; |
|
import java.io.IOException; |
|
import java.io.InputStream; |
|
import java.net.HttpURLConnection; |
|
import java.net.MalformedURLException; |
|
import java.net.Proxy; |
|
import java.net.URL; |
|
import java.util.LinkedHashSet; |
|
import java.util.Locale; |
|
import java.util.Set; |
|
import java.util.concurrent.atomic.AtomicInteger; |
|
import java.util.zip.GZIPInputStream; |
|
import java.util.zip.Inflater; |
|
import java.util.zip.InflaterInputStream; |
|
|
|
import acr.browser.lightning.Constants; |
|
import android.util.Log; |
|
|
|
/** |
|
* Class to fetch articles. This class is thread safe. |
|
* |
|
* @author Peter Karich |
|
*/ |
|
public class HtmlFetcher { |
|
|
|
// One-time, process-wide HTTP setup performed when this class is loaded.
static {
    SHelper.enableCookieMgmt();
    SHelper.enableUserAgentOverwrite();
    // NOTE(review): judging by its name this relaxes SSL/TLS verification for
    // the whole process - confirm this is intended before shipping.
    SHelper.enableAnySSL();
}
|
|
|
public static void main(String[] args) throws Exception { |
|
BufferedReader reader = new BufferedReader(new FileReader("urls.txt")); |
|
String line = null; |
|
Set<String> existing = new LinkedHashSet<String>(); |
|
while ((line = reader.readLine()) != null) { |
|
int index1 = line.indexOf("\""); |
|
int index2 = line.indexOf("\"", index1 + 1); |
|
String url = line.substring(index1 + 1, index2); |
|
String domainStr = SHelper.extractDomain(url, true); |
|
String counterStr = ""; |
|
// TODO more similarities |
|
if (existing.contains(domainStr)) |
|
counterStr = "2"; |
|
else |
|
existing.add(domainStr); |
|
|
|
String html = new HtmlFetcher().fetchAsString(url, 20000); |
|
String outFile = domainStr + counterStr + ".html"; |
|
BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); |
|
writer.write(html); |
|
writer.close(); |
|
} |
|
reader.close(); |
|
} |
|
|
|
private String referrer = "https://github.com/karussell/snacktory"; |
|
private String userAgent = "Mozilla/5.0 (compatible; Snacktory; +" + referrer + ")"; |
|
private String cacheControl = "max-age=0"; |
|
private String language = "en-us"; |
|
private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; |
|
private String charset = "UTF-8"; |
|
private SCache cache; |
|
private AtomicInteger cacheCounter = new AtomicInteger(0); |
|
private int maxTextLength = -1; |
|
private ArticleTextExtractor extractor = new ArticleTextExtractor(); |
|
private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() { |
|
{ |
|
add("bit.ly"); |
|
add("cli.gs"); |
|
add("deck.ly"); |
|
add("fb.me"); |
|
add("feedproxy.google.com"); |
|
add("flic.kr"); |
|
add("fur.ly"); |
|
add("goo.gl"); |
|
add("is.gd"); |
|
add("ink.co"); |
|
add("j.mp"); |
|
add("lnkd.in"); |
|
add("on.fb.me"); |
|
add("ow.ly"); |
|
add("plurl.us"); |
|
add("sns.mx"); |
|
add("snurl.com"); |
|
add("su.pr"); |
|
add("t.co"); |
|
add("tcrn.ch"); |
|
add("tl.gd"); |
|
add("tiny.cc"); |
|
add("tinyurl.com"); |
|
add("tmi.me"); |
|
add("tr.im"); |
|
add("twurl.nl"); |
|
} |
|
}; |
|
|
|
/** Creates a fetcher with the default headers, no cache and no text limit. */
public HtmlFetcher() {
}

/** Replaces the extractor used by {@code fetchAndExtract}. */
public void setExtractor(ArticleTextExtractor extractor) {
    this.extractor = extractor;
}

/** @return the extractor used by {@code fetchAndExtract} */
public ArticleTextExtractor getExtractor() {
    return this.extractor;
}

/**
 * Sets the result cache; {@code null} disables caching.
 *
 * @return this fetcher, for call chaining
 */
public HtmlFetcher setCache(SCache cache) {
    this.cache = cache;
    return this;
}

/** @return the result cache, or {@code null} if none was set */
public SCache getCache() {
    return this.cache;
}

/** @return the number of cache hits counted so far */
public int getCacheCounter() {
    return this.cacheCounter.get();
}

/**
 * Resets the cache hit counter to zero.
 *
 * @return this fetcher, for call chaining
 */
public HtmlFetcher clearCacheCounter() {
    this.cacheCounter.set(0);
    return this;
}

/**
 * Sets the maximum length of the extracted text; a negative value disables
 * truncation.
 *
 * @return this fetcher, for call chaining
 */
public HtmlFetcher setMaxTextLength(int maxTextLength) {
    this.maxTextLength = maxTextLength;
    return this;
}

/** @return the maximum text length; negative means unlimited */
public int getMaxTextLength() {
    return this.maxTextLength;
}

/** Sets the value sent as the {@code Accept} request header. */
public void setAccept(String accept) {
    this.accept = accept;
}

/** Sets the value sent as the {@code content-charset} request property. */
public void setCharset(String charset) {
    this.charset = charset;
}

/** Sets the value sent as the {@code Cache-Control} request header. */
public void setCacheControl(String cacheControl) {
    this.cacheControl = cacheControl;
}

/** @return the value sent as the {@code Accept-Language} request header */
public String getLanguage() {
    return this.language;
}

/** Sets the value sent as the {@code Accept-Language} request header. */
public void setLanguage(String language) {
    this.language = language;
}

/** @return the value sent as the {@code Referer} request header */
public String getReferrer() {
    return this.referrer;
}

/**
 * Sets the value sent as the {@code Referer} request header.
 *
 * @return this fetcher, for call chaining
 */
public HtmlFetcher setReferrer(String referrer) {
    this.referrer = referrer;
    return this;
}

/** @return the value sent as the {@code User-Agent} request header */
public String getUserAgent() {
    return this.userAgent;
}

/** Sets the value sent as the {@code User-Agent} request header. */
public void setUserAgent(String userAgent) {
    this.userAgent = userAgent;
}

/** @return the value sent as the {@code Accept} request header */
public String getAccept() {
    return this.accept;
}

/** @return the value sent as the {@code Cache-Control} request header */
public String getCacheControl() {
    return this.cacheControl;
}

/** @return the value sent as the {@code content-charset} request property */
public String getCharset() {
    return this.charset;
}
|
|
|
public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { |
|
String originalUrl = url; |
|
url = SHelper.removeHashbang(url); |
|
String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); |
|
if (gUrl != null) |
|
url = gUrl; |
|
else { |
|
gUrl = SHelper.getUrlFromUglyFacebookRedirect(url); |
|
if (gUrl != null) |
|
url = gUrl; |
|
} |
|
|
|
if (resolve) { |
|
// check if we can avoid resolving the URL (which hits the website!) |
|
JResult res = getFromCache(url, originalUrl); |
|
if (res != null) |
|
return res; |
|
|
|
String resUrl = getResolvedUrl(url, timeout); |
|
if (resUrl.isEmpty()) { |
|
Log.d(Constants.TAG, "resolved url is empty. Url is: " + url); |
|
|
|
JResult result = new JResult(); |
|
if (cache != null) |
|
cache.put(url, result); |
|
return result.setUrl(url); |
|
} |
|
|
|
// if resolved url is longer then use it! |
|
if (resUrl != null && resUrl.trim().length() > url.length()) { |
|
// this is necessary e.g. for some homebaken url resolvers which |
|
// return |
|
// the resolved url relative to url! |
|
url = SHelper.useDomainOfFirstArg4Second(url, resUrl); |
|
} |
|
} |
|
|
|
// check if we have the (resolved) URL in cache |
|
JResult res = getFromCache(url, originalUrl); |
|
if (res != null) |
|
return res; |
|
|
|
JResult result = new JResult(); |
|
// or should we use? <link rel="canonical" |
|
// href="http://www.N24.de/news/newsitem_6797232.html"/> |
|
result.setUrl(url); |
|
result.setOriginalUrl(originalUrl); |
|
result.setDate(SHelper.estimateDate(url)); |
|
|
|
// Immediately put the url into the cache as extracting content takes |
|
// time. |
|
if (cache != null) { |
|
cache.put(originalUrl, result); |
|
cache.put(url, result); |
|
} |
|
|
|
String lowerUrl = url.toLowerCase(Locale.getDefault()); |
|
if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { |
|
// skip |
|
} else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { |
|
result.setVideoUrl(url); |
|
} else if (SHelper.isImage(lowerUrl)) { |
|
result.setImageUrl(url); |
|
} else { |
|
extractor.extractContent(result, fetchAsString(url, timeout)); |
|
if (result.getFaviconUrl().isEmpty()) |
|
result.setFaviconUrl(SHelper.getDefaultFavicon(url)); |
|
|
|
// some links are relative to root and do not include the domain of |
|
// the url :( |
|
result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); |
|
result.setImageUrl(fixUrl(url, result.getImageUrl())); |
|
result.setVideoUrl(fixUrl(url, result.getVideoUrl())); |
|
result.setRssUrl(fixUrl(url, result.getRssUrl())); |
|
} |
|
result.setText(lessText(result.getText())); |
|
synchronized (result) { |
|
result.notifyAll(); |
|
} |
|
return result; |
|
} |
|
|
|
public String lessText(String text) { |
|
if (text == null) |
|
return ""; |
|
|
|
if (maxTextLength >= 0 && text.length() > maxTextLength) |
|
return text.substring(0, maxTextLength); |
|
|
|
return text; |
|
} |
|
|
|
private static String fixUrl(String url, String urlOrPath) { |
|
return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); |
|
} |
|
|
|
public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException, |
|
IOException { |
|
return fetchAsString(urlAsString, timeout, true); |
|
} |
|
|
|
public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) |
|
throws MalformedURLException, IOException { |
|
HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions); |
|
hConn.setInstanceFollowRedirects(true); |
|
String encoding = hConn.getContentEncoding(); |
|
InputStream is; |
|
if (encoding != null && encoding.equalsIgnoreCase("gzip")) { |
|
is = new GZIPInputStream(hConn.getInputStream()); |
|
} else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { |
|
is = new InflaterInputStream(hConn.getInputStream(), new Inflater(true)); |
|
} else { |
|
is = hConn.getInputStream(); |
|
} |
|
|
|
String enc = Converter.extractEncoding(hConn.getContentType()); |
|
String res = createConverter(urlAsString).streamToString(is, enc); |
|
Log.d(Constants.TAG, res.length() + " FetchAsString:" + urlAsString); |
|
return res; |
|
} |
|
|
|
public Converter createConverter(String url) { |
|
return new Converter(url); |
|
} |
|
|
|
/** |
|
* On some devices we have to hack: |
|
* http://developers.sun.com/mobility/reference |
|
* /techart/design_guidelines/http_redirection.html |
|
* |
|
* @param timeout |
|
* Sets a specified timeout value, in milliseconds |
|
* @return the resolved url if any. Or null if it couldn't resolve the url |
|
* (within the specified time) or the same url if response code is |
|
* OK |
|
*/ |
|
public String getResolvedUrl(String urlAsString, int timeout) { |
|
String newUrl = null; |
|
int responseCode = -1; |
|
try { |
|
HttpURLConnection hConn = createUrlConnection(urlAsString, timeout, true); |
|
// force no follow |
|
hConn.setInstanceFollowRedirects(false); |
|
// the program doesn't care what the content actually is !! |
|
// http://java.sun.com/developer/JDCTechTips/2003/tt0422.html |
|
hConn.setRequestMethod("HEAD"); |
|
hConn.connect(); |
|
responseCode = hConn.getResponseCode(); |
|
hConn.getInputStream().close(); |
|
if (responseCode == HttpURLConnection.HTTP_OK) |
|
return urlAsString; |
|
|
|
newUrl = hConn.getHeaderField("Location"); |
|
if (responseCode / 100 == 3 && newUrl != null) { |
|
newUrl = newUrl.replaceAll(" ", "+"); |
|
// some services use (none-standard) utf8 in their location |
|
// header |
|
if (urlAsString.startsWith("http://bit.ly") |
|
|| urlAsString.startsWith("http://is.gd")) |
|
newUrl = encodeUriFromHeader(newUrl); |
|
|
|
// fix problems if shortened twice. as it is often the case |
|
// after twitters' t.co bullshit |
|
if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) |
|
newUrl = getResolvedUrl(newUrl, timeout); |
|
|
|
return newUrl; |
|
} else |
|
return urlAsString; |
|
|
|
} catch (Exception ex) { |
|
Log.e(Constants.TAG, "getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage()); |
|
return ""; |
|
} finally { |
|
Log.e(Constants.TAG, responseCode + " url:" + urlAsString + " resolved:" + newUrl); |
|
} |
|
} |
|
|
|
/** |
|
* Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding |
|
* to non-ASCII characters. Workaround for broken origin servers that send |
|
* UTF-8 in the Location: header. |
|
*/ |
|
static String encodeUriFromHeader(String badLocation) { |
|
StringBuilder sb = new StringBuilder(); |
|
|
|
for (char ch : badLocation.toCharArray()) { |
|
if (ch < (char) 128) { |
|
sb.append(ch); |
|
} else { |
|
// this is ONLY valid if the uri was decoded using ISO-8859-1 |
|
sb.append(String.format("%%%02X", (int) ch)); |
|
} |
|
} |
|
|
|
return sb.toString(); |
|
} |
|
|
|
protected HttpURLConnection createUrlConnection(String urlAsStr, int timeout, |
|
boolean includeSomeGooseOptions) throws MalformedURLException, IOException { |
|
URL url = new URL(urlAsStr); |
|
// using proxy may increase latency |
|
HttpURLConnection hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY); |
|
hConn.setRequestProperty("User-Agent", userAgent); |
|
hConn.setRequestProperty("Accept", accept); |
|
|
|
if (includeSomeGooseOptions) { |
|
hConn.setRequestProperty("Accept-Language", language); |
|
hConn.setRequestProperty("content-charset", charset); |
|
hConn.addRequestProperty("Referer", referrer); |
|
// avoid the cache for testing purposes only? |
|
hConn.setRequestProperty("Cache-Control", cacheControl); |
|
} |
|
|
|
// suggest respond to be gzipped or deflated (which is just another |
|
// compression) |
|
// http://stackoverflow.com/q/3932117 |
|
hConn.setRequestProperty("Accept-Encoding", "gzip, deflate"); |
|
hConn.setConnectTimeout(timeout); |
|
hConn.setReadTimeout(timeout); |
|
return hConn; |
|
} |
|
|
|
private JResult getFromCache(String url, String originalUrl) throws Exception { |
|
if (cache != null) { |
|
JResult res = cache.get(url); |
|
if (res != null) { |
|
// e.g. the cache returned a shortened url as original url now |
|
// we want to store the |
|
// current original url! Also it can be that the cache response |
|
// to url but the JResult |
|
// does not contain it so overwrite it: |
|
res.setUrl(url); |
|
res.setOriginalUrl(originalUrl); |
|
cacheCounter.addAndGet(1); |
|
return res; |
|
} |
|
} |
|
return null; |
|
} |
|
}
|
|
|