@ -22,19 +22,16 @@ import java.io.FileWriter;
@@ -22,19 +22,16 @@ import java.io.FileWriter;
import java.io.IOException ;
import java.io.InputStream ;
import java.net.HttpURLConnection ;
import java.net.MalformedURLException ;
import java.net.Proxy ;
import java.net.URL ;
import java.util.LinkedHashSet ;
import java.util.Locale ;
import java.util.Set ;
import java.util.concurrent.atomic.AtomicInteger ;
import java.util.zip.GZIPInputStream ;
import java.util.zip.Inflater ;
import java.util.zip.InflaterInputStream ;
import acr.browser.lightning.constant.Constants ;
import android.util.Log ;
/ * *
* Class to fetch articles . This class is thread safe .
*
@ -64,7 +61,7 @@ public class HtmlFetcher {
@@ -64,7 +61,7 @@ public class HtmlFetcher {
else
existing . add ( domainStr ) ;
String html = new HtmlFetcher ( ) . fetchAsString ( url , 20000 ) ;
String html = new HtmlFetcher ( ) . fetchAsString ( url , 2000 ) ;
String outFile = domainStr + counterStr + ".html" ;
BufferedWriter writer = new BufferedWriter ( new FileWriter ( outFile ) ) ;
writer . write ( html ) ;
@ -73,8 +70,8 @@ public class HtmlFetcher {
@@ -73,8 +70,8 @@ public class HtmlFetcher {
reader . close ( ) ;
}
private String referrer = "https://github.com/karussell/snacktory " ;
private String userAgent = "Mozilla/5.0 (compatible; Snacktory ; +" + referrer + ')' ;
private String referrer = "http://jetsli.de/crawler " ;
private String userAgent = "Mozilla/5.0 (compatible; Jetslide ; +" + referrer + ')' ;
private String cacheControl = "max-age=0" ;
private String language = "en-us" ;
private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5" ;
@ -83,7 +80,7 @@ public class HtmlFetcher {
@@ -83,7 +80,7 @@ public class HtmlFetcher {
private final AtomicInteger cacheCounter = new AtomicInteger ( 0 ) ;
private int maxTextLength = - 1 ;
private ArticleTextExtractor extractor = new ArticleTextExtractor ( ) ;
private final Set < String > furtherResolveNecessary = new LinkedHashSet < String > ( ) {
private Set < String > furtherResolveNecessary = new LinkedHashSet < String > ( ) {
{
add ( "bit.ly" ) ;
add ( "cli.gs" ) ;
@ -202,6 +199,12 @@ public class HtmlFetcher {
@@ -202,6 +199,12 @@ public class HtmlFetcher {
}
public JResult fetchAndExtract ( String url , int timeout , boolean resolve ) throws Exception {
return fetchAndExtract ( url , timeout , resolve , 0 , false ) ;
}
// main workhorse to call externally
public JResult fetchAndExtract ( String url , int timeout , boolean resolve ,
int maxContentSize , boolean forceReload ) throws Exception {
String originalUrl = url ;
url = SHelper . removeHashbang ( url ) ;
String gUrl = SHelper . getUrlFromUglyGoogleRedirect ( url ) ;
@ -219,9 +222,8 @@ public class HtmlFetcher {
@@ -219,9 +222,8 @@ public class HtmlFetcher {
if ( res ! = null )
return res ;
String resUrl = getResolvedUrl ( url , timeout ) ;
String resUrl = getResolvedUrl ( url , timeout , 0 ) ;
if ( resUrl . isEmpty ( ) ) {
Log . d ( Constants . TAG , "resolved url is empty. Url is: " + url ) ;
JResult result = new JResult ( ) ;
if ( cache ! = null )
@ -229,10 +231,9 @@ public class HtmlFetcher {
@@ -229,10 +231,9 @@ public class HtmlFetcher {
return result . setUrl ( url ) ;
}
// if resolved url is longer then use it!
if ( resUrl . trim ( ) . length ( ) > url . length ( ) ) {
// this is necessary e.g. for some homebaken url resolvers which
// return
// if resolved url is different then use it!
if ( ! resUrl . equals ( url ) ) {
// this is necessary e.g. for some homebaken url resolvers which return
// the resolved url relative to url!
url = SHelper . useDomainOfFirstArg4Second ( url , resUrl ) ;
}
@ -244,20 +245,18 @@ public class HtmlFetcher {
@@ -244,20 +245,18 @@ public class HtmlFetcher {
return res ;
JResult result = new JResult ( ) ;
// or should we use? <link rel="canonical"
// href="http://www.N24.de/news/newsitem_6797232.html"/>
// or should we use? <link rel="canonical" href="http://www.N24.de/news/newsitem_6797232.html"/>
result . setUrl ( url ) ;
result . setOriginalUrl ( originalUrl ) ;
result . setDate ( SHelper . estimateDate ( url ) ) ;
// Immediately put the url into the cache as extracting content takes
// time.
// Immediately put the url into the cache as extracting content takes time.
if ( cache ! = null ) {
cache . put ( originalUrl , result ) ;
cache . put ( url , result ) ;
}
String lowerUrl = url . toLowerCase ( Locale . getDefault ( ) ) ;
// extract content to the extent appropriate for content type
String lowerUrl = url . toLowerCase ( ) ;
if ( SHelper . isDoc ( lowerUrl ) | | SHelper . isApp ( lowerUrl ) | | SHelper . isPackage ( lowerUrl ) ) {
// skip
} else if ( SHelper . isVideo ( lowerUrl ) | | SHelper . isAudio ( lowerUrl ) ) {
@ -265,15 +264,29 @@ public class HtmlFetcher {
@@ -265,15 +264,29 @@ public class HtmlFetcher {
} else if ( SHelper . isImage ( lowerUrl ) ) {
result . setImageUrl ( url ) ;
} else {
extractor . extractContent ( result , fetchAsString ( url , timeout ) ) ;
try {
String urlToDownload = url ;
if ( forceReload ) {
urlToDownload = getURLtoBreakCache ( url ) ;
}
extractor . extractContent ( result , fetchAsString ( urlToDownload , timeout ) , maxContentSize ) ;
} catch ( IOException io ) {
// do nothing
}
if ( result . getFaviconUrl ( ) . isEmpty ( ) )
result . setFaviconUrl ( SHelper . getDefaultFavicon ( url ) ) ;
// some links are relative to root and do not include the domain of
// the url :(
// some links are relative to root and do not include the domain of the url :(
if ( ! result . getFaviconUrl ( ) . isEmpty ( ) )
result . setFaviconUrl ( fixUrl ( url , result . getFaviconUrl ( ) ) ) ;
if ( ! result . getImageUrl ( ) . isEmpty ( ) )
result . setImageUrl ( fixUrl ( url , result . getImageUrl ( ) ) ) ;
if ( ! result . getVideoUrl ( ) . isEmpty ( ) )
result . setVideoUrl ( fixUrl ( url , result . getVideoUrl ( ) ) ) ;
if ( ! result . getRssUrl ( ) . isEmpty ( ) )
result . setRssUrl ( fixUrl ( url , result . getRssUrl ( ) ) ) ;
}
result . setText ( lessText ( result . getText ( ) ) ) ;
@ -283,6 +296,20 @@ public class HtmlFetcher {
@@ -283,6 +296,20 @@ public class HtmlFetcher {
return result ;
}
// Ugly hack to break free from any cached versions, a few URLs required this.
public static String getURLtoBreakCache ( String url ) {
try {
URL aURL = new URL ( url ) ;
if ( aURL . getQuery ( ) ! = null & & aURL . getQuery ( ) . isEmpty ( ) ) {
return url + "?1" ;
} else {
return url + "&1" ;
}
} catch ( MalformedURLException e ) {
return url ;
}
}
public String lessText ( String text ) {
if ( text = = null )
return "" ;
@ -297,13 +324,14 @@ public class HtmlFetcher {
@@ -297,13 +324,14 @@ public class HtmlFetcher {
return SHelper . useDomainOfFirstArg4Second ( url , urlOrPath ) ;
}
public String fetchAsString ( String urlAsString , int timeout ) throws
IOException {
public String fetchAsString ( String urlAsString , int timeout )
throws MalformedURLException , IOException {
return fetchAsString ( urlAsString , timeout , true ) ;
}
// main routine to get raw webpage content
public String fetchAsString ( String urlAsString , int timeout , boolean includeSomeGooseOptions )
throws IOException {
throws MalformedURLException , IOException {
HttpURLConnection hConn = createUrlConnection ( urlAsString , timeout , includeSomeGooseOptions ) ;
hConn . setInstanceFollowRedirects ( true ) ;
String encoding = hConn . getContentEncoding ( ) ;
@ -317,27 +345,23 @@ public class HtmlFetcher {
@@ -317,27 +345,23 @@ public class HtmlFetcher {
}
String enc = Converter . extractEncoding ( hConn . getContentType ( ) ) ;
String res = createConverter ( urlAsString ) . streamToString ( is , enc ) ;
Log . d ( Constants . TAG , res . length ( ) + " FetchAsString:" + urlAsString ) ;
return res ;
return createConverter ( urlAsString ) . streamToString ( is , enc ) ;
}
public Converter createConverter ( String url ) {
public static Converter createConverter ( String url ) {
return new Converter ( url ) ;
}
/ * *
* On some devices we have to hack :
* http : //developers.sun.com/mobility/reference
* / techart / design_guidelines / http_redirection . html
* http : //developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html
*
* @param timeout
* Sets a specified timeout value , in milliseconds
* @param timeout Sets a specified timeout value , in milliseconds
* @return the resolved url if any . Or null if it couldn ' t resolve the url
* ( within the specified time ) or the same url if response code is
* OK
* ( within the specified time ) or the same url if response code is OK
* /
public String getResolvedUrl ( String urlAsString , int timeout ) {
public String getResolvedUrl ( String urlAsString , int timeout ,
int num_redirects ) {
String newUrl = null ;
int responseCode = - 1 ;
try {
@ -354,28 +378,32 @@ public class HtmlFetcher {
@@ -354,28 +378,32 @@ public class HtmlFetcher {
return urlAsString ;
newUrl = hConn . getHeaderField ( "Location" ) ;
if ( responseCode / 100 = = 3 & & newUrl ! = null ) {
// Note that the max recursion level is 5.
if ( responseCode / 100 = = 3 & & newUrl ! = null & & num_redirects < 5 ) {
newUrl = newUrl . replaceAll ( " " , "+" ) ;
// some services use (none-standard) utf8 in their location
// header
// some services use (none-standard) utf8 in their location header
if ( urlAsString . startsWith ( "http://bit.ly" )
| | urlAsString . startsWith ( "http://is.gd" ) )
newUrl = encodeUriFromHeader ( newUrl ) ;
// fix problems if shortened twice. as it is often the case
// after twitters' t.co bullshit
if ( furtherResolveNecessary . contains ( SHelper . extractDomain ( newUrl , true ) ) )
newUrl = getResolvedUrl ( newUrl , timeout ) ;
// AP: This code is not longer need, instead we always follow
// multiple redirects.
//
// fix problems if shortened twice. as it is often the case after twitters' t.co bullshit
//if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true)))
// newUrl = getResolvedUrl(newUrl, timeout);
// Add support for URLs with multiple levels of redirection,
// call getResolvedUrl until there is no more redirects or a
// max number of redirects is reached.
newUrl = SHelper . useDomainOfFirstArg4Second ( urlAsString , newUrl ) ;
newUrl = getResolvedUrl ( newUrl , timeout , num_redirects + 1 ) ;
return newUrl ;
} else
return urlAsString ;
} catch ( Exception ex ) {
Log . e ( Constants . TAG , "getResolvedUrl:" + urlAsString + " Error:" + ex . getMessage ( ) ) ;
return "" ;
} finally {
Log . e ( Constants . TAG , responseCode + " url:" + urlAsString + " resolved:" + newUrl ) ;
}
}
@ -400,7 +428,7 @@ public class HtmlFetcher {
@@ -400,7 +428,7 @@ public class HtmlFetcher {
}
protected HttpURLConnection createUrlConnection ( String urlAsStr , int timeout ,
boolean includeSomeGooseOptions ) throws IOException {
boolean includeSomeGooseOptions ) throws MalformedURLException , IOException {
URL url = new URL ( urlAsStr ) ;
//using proxy may increase latency
HttpURLConnection hConn = ( HttpURLConnection ) url . openConnection ( Proxy . NO_PROXY ) ;
@ -415,8 +443,7 @@ public class HtmlFetcher {
@@ -415,8 +443,7 @@ public class HtmlFetcher {
hConn . setRequestProperty ( "Cache-Control" , cacheControl ) ;
}
// suggest respond to be gzipped or deflated (which is just another
// compression)
// suggest respond to be gzipped or deflated (which is just another compression)
// http://stackoverflow.com/q/3932117
hConn . setRequestProperty ( "Accept-Encoding" , "gzip, deflate" ) ;
hConn . setConnectTimeout ( timeout ) ;
@ -424,14 +451,12 @@ public class HtmlFetcher {
@@ -424,14 +451,12 @@ public class HtmlFetcher {
return hConn ;
}
private JResult getFromCache ( String url , String originalUrl ) throws Exception {
private JResult getFromCache ( String url , String originalUrl ) {
if ( cache ! = null ) {
JResult res = cache . get ( url ) ;
if ( res ! = null ) {
// e.g. the cache returned a shortened url as original url now
// we want to store the
// current original url! Also it can be that the cache response
// to url but the JResult
// e.g. the cache returned a shortened url as original url now we want to store the
// current original url! Also it can be that the cache response to url but the JResult
// does not contain it so overwrite it:
res . setUrl ( url ) ;
res . setOriginalUrl ( originalUrl ) ;