You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
243 lines
6.5 KiB
243 lines
6.5 KiB
/*
 *  Copyright 2011 Peter Karich
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
|
package acr.browser.lightning.reading; |
|
|
|
import java.io.*; |
|
import java.net.SocketTimeoutException; |
|
import java.nio.charset.Charset; |
|
import java.util.Locale; |
|
|
|
import acr.browser.lightning.Constants; |
|
import android.util.Log; |
|
|
|
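/**
 * Reads an input stream into a string, sniffing the charset from the first
 * bytes of the content.
 * <p>
 * This class is not thread safe: every conversion overwrites the mutable
 * {@code encoding} field, so use a new instance for each stream.
 *
 * @author Peter Karich
 */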
|
public class Converter { |
|
|
|
public final static String UTF8 = "UTF-8"; |
|
public final static String ISO = "ISO-8859-1"; |
|
public final static int K2 = 2048; |
|
private int maxBytes = 1000000 / 2; |
|
private String encoding; |
|
private String url; |
|
|
|
public Converter(String urlOnlyHint) { |
|
url = urlOnlyHint; |
|
} |
|
|
|
public Converter() { |
|
} |
|
|
|
public Converter setMaxBytes(int maxBytes) { |
|
this.maxBytes = maxBytes; |
|
return this; |
|
} |
|
|
|
public static String extractEncoding(String contentType) { |
|
String[] values; |
|
if (contentType != null) |
|
values = contentType.split(";"); |
|
else |
|
values = new String[0]; |
|
|
|
String charset = ""; |
|
|
|
for (String value : values) { |
|
value = value.trim().toLowerCase(Locale.getDefault()); |
|
|
|
if (value.startsWith("charset=")) |
|
charset = value.substring("charset=".length()); |
|
} |
|
|
|
// http1.1 says ISO-8859-1 is the default charset |
|
if (charset.length() == 0) |
|
charset = ISO; |
|
|
|
return charset; |
|
} |
|
|
|
public String getEncoding() { |
|
if (encoding == null) |
|
return ""; |
|
return encoding.toLowerCase(Locale.getDefault()); |
|
} |
|
|
|
public String streamToString(InputStream is) { |
|
return streamToString(is, maxBytes, encoding); |
|
} |
|
|
|
public String streamToString(InputStream is, String enc) { |
|
return streamToString(is, maxBytes, enc); |
|
} |
|
|
|
/** |
|
* reads bytes off the string and returns a string |
|
* |
|
* @param is |
|
* @param maxBytes |
|
* The max bytes that we want to read from the input stream |
|
* @return String |
|
*/ |
|
public String streamToString(InputStream is, int maxBytes, String enc) { |
|
encoding = enc; |
|
// Http 1.1. standard is iso-8859-1 not utf8 :( |
|
// but we force utf-8 as youtube assumes it ;) |
|
if (encoding == null || encoding.isEmpty()) |
|
encoding = UTF8; |
|
|
|
BufferedInputStream in = null; |
|
try { |
|
in = new BufferedInputStream(is, K2); |
|
ByteArrayOutputStream output = new ByteArrayOutputStream(); |
|
|
|
// detect encoding with the help of meta tag |
|
try { |
|
in.mark(K2 * 2); |
|
String tmpEnc = detectCharset("charset=", output, in, encoding); |
|
if (tmpEnc != null) |
|
encoding = tmpEnc; |
|
else { |
|
Log.d(Constants.TAG, "no charset found in first stage"); |
|
// detect with the help of xml beginning ala |
|
// encoding="charset" |
|
tmpEnc = detectCharset("encoding=", output, in, encoding); |
|
if (tmpEnc != null) |
|
encoding = tmpEnc; |
|
else |
|
Log.d(Constants.TAG, "no charset found in second stage"); |
|
} |
|
|
|
if (!Charset.isSupported(encoding)) |
|
throw new UnsupportedEncodingException(encoding); |
|
} catch (UnsupportedEncodingException e) { |
|
Log.d(Constants.TAG, |
|
"Using default encoding:" + UTF8 + " problem:" + e.getMessage() |
|
+ " encoding:" + encoding + " " + url); |
|
encoding = UTF8; |
|
} |
|
|
|
// SocketException: Connection reset |
|
// IOException: missing CR => problem on server (probably some xml |
|
// character thing?) |
|
// IOException: Premature EOF => socket unexpectly closed from |
|
// server |
|
int bytesRead = output.size(); |
|
byte[] arr = new byte[K2]; |
|
while (true) { |
|
if (bytesRead >= maxBytes) { |
|
Log.d(Constants.TAG, "Maxbyte of " + maxBytes |
|
+ " exceeded! Maybe html is now broken but try it nevertheless. Url: " |
|
+ url); |
|
break; |
|
} |
|
|
|
int n = in.read(arr); |
|
if (n < 0) |
|
break; |
|
bytesRead += n; |
|
output.write(arr, 0, n); |
|
} |
|
|
|
return output.toString(encoding); |
|
} catch (SocketTimeoutException e) { |
|
Log.e(Constants.TAG, e.toString() + " url:" + url); |
|
} catch (IOException e) { |
|
Log.e(Constants.TAG, e.toString() + " url:" + url); |
|
} finally { |
|
if (in != null) { |
|
try { |
|
in.close(); |
|
} catch (Exception e) { |
|
} |
|
} |
|
} |
|
return ""; |
|
} |
|
|
|
/** |
|
* This method detects the charset even if the first call only returns some |
|
* bytes. It will read until 4K bytes are reached and then try to determine |
|
* the encoding |
|
* |
|
* @throws IOException |
|
*/ |
|
protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in, |
|
String enc) throws IOException { |
|
|
|
// Grab better encoding from stream |
|
byte[] arr = new byte[K2]; |
|
int nSum = 0; |
|
while (nSum < K2) { |
|
int n = in.read(arr); |
|
if (n < 0) |
|
break; |
|
|
|
nSum += n; |
|
bos.write(arr, 0, n); |
|
} |
|
|
|
String str = bos.toString(enc); |
|
int encIndex = str.indexOf(key); |
|
int clength = key.length(); |
|
if (encIndex > 0) { |
|
char startChar = str.charAt(encIndex + clength); |
|
int lastEncIndex; |
|
if (startChar == '\'') |
|
// if we have charset='something' |
|
lastEncIndex = str.indexOf("'", ++encIndex + clength); |
|
else if (startChar == '\"') |
|
// if we have charset="something" |
|
lastEncIndex = str.indexOf("\"", ++encIndex + clength); |
|
else { |
|
// if we have "text/html; charset=utf-8" |
|
int first = str.indexOf("\"", encIndex + clength); |
|
if (first < 0) |
|
first = Integer.MAX_VALUE; |
|
|
|
// or "text/html; charset=utf-8 " |
|
int sec = str.indexOf(" ", encIndex + clength); |
|
if (sec < 0) |
|
sec = Integer.MAX_VALUE; |
|
lastEncIndex = Math.min(first, sec); |
|
|
|
// or "text/html; charset=utf-8 ' |
|
int third = str.indexOf("'", encIndex + clength); |
|
if (third > 0) |
|
lastEncIndex = Math.min(lastEncIndex, third); |
|
} |
|
|
|
// re-read byte array with different encoding |
|
// assume that the encoding string cannot be greater than 40 chars |
|
if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) { |
|
String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength, |
|
lastEncIndex)); |
|
try { |
|
in.reset(); |
|
bos.reset(); |
|
return tmpEnc; |
|
} catch (IOException ex) { |
|
Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding " |
|
+ tmpEnc + " " + ex.toString()); |
|
} |
|
} |
|
} |
|
return null; |
|
} |
|
}
|
|
|