You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
244 lines
6.5 KiB
244 lines
6.5 KiB
9 years ago
|
/*
|
||
|
* Copyright 2011 Peter Karich
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
9 years ago
|
package acr.browser.lightning.reading;
|
||
9 years ago
|
|
||
|
import java.io.*;
|
||
|
import java.net.SocketTimeoutException;
|
||
|
import java.nio.charset.Charset;
|
||
|
import java.util.Locale;
|
||
|
|
||
|
import acr.browser.lightning.Constants;
|
||
|
import android.util.Log;
|
||
|
|
||
|
/**
|
||
|
* This class is not thread safe. Use one new instance every time due to
|
||
|
* encoding variable.
|
||
|
*
|
||
|
* @author Peter Karich
|
||
|
*/
|
||
|
public class Converter {
|
||
|
|
||
|
public final static String UTF8 = "UTF-8";
|
||
|
public final static String ISO = "ISO-8859-1";
|
||
|
public final static int K2 = 2048;
|
||
|
private int maxBytes = 1000000 / 2;
|
||
|
private String encoding;
|
||
|
private String url;
|
||
|
|
||
|
public Converter(String urlOnlyHint) {
|
||
|
url = urlOnlyHint;
|
||
|
}
|
||
|
|
||
|
public Converter() {
|
||
|
}
|
||
|
|
||
|
public Converter setMaxBytes(int maxBytes) {
|
||
|
this.maxBytes = maxBytes;
|
||
|
return this;
|
||
|
}
|
||
|
|
||
|
public static String extractEncoding(String contentType) {
|
||
|
String[] values;
|
||
|
if (contentType != null)
|
||
|
values = contentType.split(";");
|
||
|
else
|
||
|
values = new String[0];
|
||
|
|
||
|
String charset = "";
|
||
|
|
||
|
for (String value : values) {
|
||
|
value = value.trim().toLowerCase(Locale.getDefault());
|
||
|
|
||
|
if (value.startsWith("charset="))
|
||
|
charset = value.substring("charset=".length());
|
||
|
}
|
||
|
|
||
|
// http1.1 says ISO-8859-1 is the default charset
|
||
|
if (charset.length() == 0)
|
||
|
charset = ISO;
|
||
|
|
||
|
return charset;
|
||
|
}
|
||
|
|
||
|
public String getEncoding() {
|
||
|
if (encoding == null)
|
||
|
return "";
|
||
|
return encoding.toLowerCase(Locale.getDefault());
|
||
|
}
|
||
|
|
||
|
public String streamToString(InputStream is) {
|
||
|
return streamToString(is, maxBytes, encoding);
|
||
|
}
|
||
|
|
||
|
public String streamToString(InputStream is, String enc) {
|
||
|
return streamToString(is, maxBytes, enc);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* reads bytes off the string and returns a string
|
||
|
*
|
||
|
* @param is
|
||
|
* @param maxBytes
|
||
|
* The max bytes that we want to read from the input stream
|
||
|
* @return String
|
||
|
*/
|
||
|
public String streamToString(InputStream is, int maxBytes, String enc) {
|
||
|
encoding = enc;
|
||
|
// Http 1.1. standard is iso-8859-1 not utf8 :(
|
||
|
// but we force utf-8 as youtube assumes it ;)
|
||
|
if (encoding == null || encoding.isEmpty())
|
||
|
encoding = UTF8;
|
||
|
|
||
|
BufferedInputStream in = null;
|
||
|
try {
|
||
|
in = new BufferedInputStream(is, K2);
|
||
|
ByteArrayOutputStream output = new ByteArrayOutputStream();
|
||
|
|
||
|
// detect encoding with the help of meta tag
|
||
|
try {
|
||
|
in.mark(K2 * 2);
|
||
|
String tmpEnc = detectCharset("charset=", output, in, encoding);
|
||
|
if (tmpEnc != null)
|
||
|
encoding = tmpEnc;
|
||
|
else {
|
||
|
Log.d(Constants.TAG, "no charset found in first stage");
|
||
|
// detect with the help of xml beginning ala
|
||
|
// encoding="charset"
|
||
|
tmpEnc = detectCharset("encoding=", output, in, encoding);
|
||
|
if (tmpEnc != null)
|
||
|
encoding = tmpEnc;
|
||
|
else
|
||
|
Log.d(Constants.TAG, "no charset found in second stage");
|
||
|
}
|
||
|
|
||
|
if (!Charset.isSupported(encoding))
|
||
|
throw new UnsupportedEncodingException(encoding);
|
||
|
} catch (UnsupportedEncodingException e) {
|
||
|
Log.d(Constants.TAG,
|
||
|
"Using default encoding:" + UTF8 + " problem:" + e.getMessage()
|
||
|
+ " encoding:" + encoding + " " + url);
|
||
|
encoding = UTF8;
|
||
|
}
|
||
|
|
||
|
// SocketException: Connection reset
|
||
|
// IOException: missing CR => problem on server (probably some xml
|
||
|
// character thing?)
|
||
|
// IOException: Premature EOF => socket unexpectly closed from
|
||
|
// server
|
||
|
int bytesRead = output.size();
|
||
|
byte[] arr = new byte[K2];
|
||
|
while (true) {
|
||
|
if (bytesRead >= maxBytes) {
|
||
|
Log.d(Constants.TAG, "Maxbyte of " + maxBytes
|
||
|
+ " exceeded! Maybe html is now broken but try it nevertheless. Url: "
|
||
|
+ url);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
int n = in.read(arr);
|
||
|
if (n < 0)
|
||
|
break;
|
||
|
bytesRead += n;
|
||
|
output.write(arr, 0, n);
|
||
|
}
|
||
|
|
||
|
return output.toString(encoding);
|
||
|
} catch (SocketTimeoutException e) {
|
||
|
Log.e(Constants.TAG, e.toString() + " url:" + url);
|
||
|
} catch (IOException e) {
|
||
|
Log.e(Constants.TAG, e.toString() + " url:" + url);
|
||
|
} finally {
|
||
|
if (in != null) {
|
||
|
try {
|
||
|
in.close();
|
||
|
} catch (Exception e) {
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return "";
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This method detects the charset even if the first call only returns some
|
||
|
* bytes. It will read until 4K bytes are reached and then try to determine
|
||
|
* the encoding
|
||
|
*
|
||
|
* @throws IOException
|
||
|
*/
|
||
|
protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
|
||
|
String enc) throws IOException {
|
||
|
|
||
|
// Grab better encoding from stream
|
||
|
byte[] arr = new byte[K2];
|
||
|
int nSum = 0;
|
||
|
while (nSum < K2) {
|
||
|
int n = in.read(arr);
|
||
|
if (n < 0)
|
||
|
break;
|
||
|
|
||
|
nSum += n;
|
||
|
bos.write(arr, 0, n);
|
||
|
}
|
||
|
|
||
|
String str = bos.toString(enc);
|
||
|
int encIndex = str.indexOf(key);
|
||
|
int clength = key.length();
|
||
|
if (encIndex > 0) {
|
||
|
char startChar = str.charAt(encIndex + clength);
|
||
|
int lastEncIndex;
|
||
|
if (startChar == '\'')
|
||
|
// if we have charset='something'
|
||
|
lastEncIndex = str.indexOf("'", ++encIndex + clength);
|
||
|
else if (startChar == '\"')
|
||
|
// if we have charset="something"
|
||
|
lastEncIndex = str.indexOf("\"", ++encIndex + clength);
|
||
|
else {
|
||
|
// if we have "text/html; charset=utf-8"
|
||
|
int first = str.indexOf("\"", encIndex + clength);
|
||
|
if (first < 0)
|
||
|
first = Integer.MAX_VALUE;
|
||
|
|
||
|
// or "text/html; charset=utf-8 "
|
||
|
int sec = str.indexOf(" ", encIndex + clength);
|
||
|
if (sec < 0)
|
||
|
sec = Integer.MAX_VALUE;
|
||
|
lastEncIndex = Math.min(first, sec);
|
||
|
|
||
|
// or "text/html; charset=utf-8 '
|
||
|
int third = str.indexOf("'", encIndex + clength);
|
||
|
if (third > 0)
|
||
|
lastEncIndex = Math.min(lastEncIndex, third);
|
||
|
}
|
||
|
|
||
|
// re-read byte array with different encoding
|
||
|
// assume that the encoding string cannot be greater than 40 chars
|
||
|
if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) {
|
||
|
String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength,
|
||
|
lastEncIndex));
|
||
|
try {
|
||
|
in.reset();
|
||
|
bos.reset();
|
||
|
return tmpEnc;
|
||
|
} catch (IOException ex) {
|
||
|
Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "
|
||
|
+ tmpEnc + " " + ex.toString());
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return null;
|
||
|
}
|
||
|
}
|