lightning-i2p/app/src/main/java/acr/browser/lightning/reading/Converter.java

/*
 *  Copyright 2011 Peter Karich 
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package acr.browser.lightning.reading;

import java.io.*;
import java.net.SocketTimeoutException;
import java.nio.charset.Charset;
import java.util.Locale;

import acr.browser.lightning.Constants;
import android.util.Log;

/**
 * Reads an {@link InputStream} into a {@link String}. The character encoding
 * is taken from the caller's hint and then refined by sniffing the first bytes
 * of the content for a <code>charset=</code> or <code>encoding=</code>
 * declaration.
 * <p>
 * This class is not thread safe. Use one new instance every time due to
 * encoding variable.
 * 
 * @author Peter Karich
 */
public class Converter {

	public final static String UTF8 = "UTF-8";
	public final static String ISO = "ISO-8859-1";
	public final static int K2 = 2048;

	/** Upper bound for the length of a charset name found in the content. */
	private static final int MAX_ENCODING_LENGTH = 40;

	private int maxBytes = 1000000 / 2;
	private String encoding;
	private String url;

	/**
	 * @param urlOnlyHint
	 *            url of the document; it is only appended to log messages
	 */
	public Converter(String urlOnlyHint) {
		url = urlOnlyHint;
	}

	public Converter() {
	}

	/**
	 * Sets the byte limit used by the <code>streamToString</code> overloads
	 * that do not take an explicit limit.
	 * 
	 * @return this instance, to allow call chaining
	 */
	public Converter setMaxBytes(int maxBytes) {
		this.maxBytes = maxBytes;
		return this;
	}

	/**
	 * Extracts the charset parameter from a Content-Type header value such as
	 * <code>text/html; charset=utf-8</code>.
	 * 
	 * @param contentType
	 *            the header value, may be null
	 * @return the lower-cased charset name without surrounding quotes, or
	 *         {@link #ISO} if the header declares none
	 */
	public static String extractEncoding(String contentType) {
		String charset = "";

		if (contentType != null) {
			for (String value : contentType.split(";")) {
				// Locale.ROOT: a locale-sensitive lower-casing would turn 'I'
				// into a dotless 'ı' under e.g. a Turkish default locale and
				// corrupt names like ISO-8859-1
				value = value.trim().toLowerCase(Locale.ROOT);

				if (value.startsWith("charset="))
					charset = stripQuotes(value.substring("charset=".length()).trim());
			}
		}

		// http1.1 says ISO-8859-1 is the default charset
		if (charset.length() == 0)
			charset = ISO;

		return charset;
	}

	/**
	 * Removes one pair of matching single or double quotes around the value.
	 * HTTP allows a parameter value to be sent as a quoted string, e.g.
	 * <code>charset="utf-8"</code>.
	 */
	private static String stripQuotes(String value) {
		int len = value.length();
		if (len >= 2) {
			char first = value.charAt(0);
			if ((first == '"' || first == '\'') && value.charAt(len - 1) == first)
				return value.substring(1, len - 1).trim();
		}
		return value;
	}

	/**
	 * @return the lower-cased encoding used by the last
	 *         <code>streamToString</code> call, or an empty string if none
	 *         was set yet
	 */
	public String getEncoding() {
		if (encoding == null)
			return "";
		return encoding.toLowerCase(Locale.ROOT);
	}

	/**
	 * Reads the stream using the configured byte limit and the encoding that
	 * is currently stored in this instance.
	 */
	public String streamToString(InputStream is) {
		return streamToString(is, maxBytes, encoding);
	}

	/**
	 * Reads the stream using the configured byte limit and the given encoding
	 * hint.
	 */
	public String streamToString(InputStream is, String enc) {
		return streamToString(is, maxBytes, enc);
	}

	/**
	 * Reads bytes off the stream and returns them decoded as a string. The
	 * stream is always closed before this method returns.
	 * 
	 * @param is
	 *            the stream to read from
	 * @param maxBytes
	 *            The max bytes that we want to read from the input stream
	 * @param enc
	 *            encoding hint; UTF-8 is used if it is null, empty or not
	 *            supported. A charset declared inside the content overrides
	 *            the hint.
	 * @return the decoded content, or an empty string if reading failed
	 */
	public String streamToString(InputStream is, int maxBytes, String enc) {
		encoding = enc;
		// Http 1.1. standard is iso-8859-1 not utf8 :(
		// but we force utf-8 as youtube assumes it ;)
		if (encoding == null || encoding.isEmpty())
			encoding = UTF8;
		else if (!isUsableCharset(encoding))
			// validate the hint up front so that a bad hint does not prevent
			// the charset detection below from running
			fallBackToUtf8(encoding);

		BufferedInputStream in = null;
		try {
			in = new BufferedInputStream(is, K2);
			ByteArrayOutputStream output = new ByteArrayOutputStream();

			// detect encoding with the help of meta tag
			try {
				// every detection stage may consume up to 2 * K2 - 1 bytes, so
				// the mark has to survive two stages for reset() to succeed
				in.mark(K2 * 4);
				String tmpEnc = detectCharset("charset=", output, in, encoding);
				if (tmpEnc == null) {
					Log.d(Constants.TAG, "no charset found in first stage");
					// detect with the help of xml beginning ala
					// encoding="charset"
					tmpEnc = detectCharset("encoding=", output, in, encoding);
					if (tmpEnc == null)
						Log.d(Constants.TAG, "no charset found in second stage");
				}
				if (tmpEnc != null)
					encoding = tmpEnc;

				if (!isUsableCharset(encoding))
					throw new UnsupportedEncodingException(encoding);
			} catch (UnsupportedEncodingException e) {
				fallBackToUtf8(e.getMessage());
			}

			// SocketException: Connection reset
			// IOException: missing CR => problem on server (probably some xml
			// character thing?)
			// IOException: Premature EOF => socket unexpectly closed from
			// server
			int bytesRead = output.size();
			byte[] arr = new byte[K2];
			while (true) {
				if (bytesRead >= maxBytes) {
					Log.d(Constants.TAG, "Maxbyte of " + maxBytes
							+ " exceeded! Maybe html is now broken but try it nevertheless. Url: "
							+ url);
					break;
				}

				int n = in.read(arr);
				if (n < 0)
					break;
				bytesRead += n;
				output.write(arr, 0, n);
			}

			return output.toString(encoding);
		} catch (IOException e) {
			// also covers SocketTimeoutException, which is an IOException
			Log.e(Constants.TAG, e.toString() + " url:" + url);
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (Exception ignored) {
					// best effort: a failure while closing must not hide the
					// result or the original error
				}
			}
		}
		return "";
	}

	/**
	 * Tells whether the name can be handed to the charset-aware APIs.
	 * {@link Charset#isSupported(String)} throws an unchecked exception for
	 * null, empty or syntactically illegal names; those are reported as not
	 * usable instead.
	 */
	private static boolean isUsableCharset(String name) {
		if (name == null || name.isEmpty())
			return false;
		try {
			return Charset.isSupported(name);
		} catch (IllegalArgumentException ex) {
			// IllegalCharsetNameException: name contains illegal characters
			return false;
		}
	}

	/** Logs why the current encoding is rejected and switches to UTF-8. */
	private void fallBackToUtf8(String problem) {
		Log.d(Constants.TAG,
				"Using default encoding:" + UTF8 + " problem:" + problem
						+ " encoding:" + encoding + " " + url);
		encoding = UTF8;
	}

	/**
	 * This method detects the charset even if the first call only returns some
	 * bytes. It reads until at least {@link #K2} bytes were consumed (or the
	 * stream ended), appends them to <code>bos</code> and then searches the
	 * decoded content of <code>bos</code> for the value following
	 * <code>key</code>.
	 * <p>
	 * On success both <code>in</code> and <code>bos</code> are reset, so the
	 * caller can re-read the stream from its mark with the new encoding. If
	 * nothing is found the consumed bytes stay in <code>bos</code>.
	 * 
	 * @param key
	 *            the text preceding the charset name, e.g. "charset="
	 * @param bos
	 *            collects the bytes that were read from the stream
	 * @param in
	 *            the stream; the caller must have marked it
	 * @param enc
	 *            encoding used to decode the bytes for the search
	 * @return the detected charset name as cleaned by
	 *         <code>SHelper.encodingCleanup</code>, or null if none was found
	 *         or the stream could not be reset
	 * @throws IOException
	 *             if reading fails or <code>enc</code> is not supported
	 */
	protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,
			String enc) throws IOException {

		// Grab better encoding from stream
		byte[] arr = new byte[K2];
		int nSum = 0;
		while (nSum < K2) {
			int n = in.read(arr);
			if (n < 0)
				break;

			nSum += n;
			bos.write(arr, 0, n);
		}

		String str = bos.toString(enc);
		int keyIndex = str.indexOf(key);
		if (keyIndex < 0)
			return null;

		int valueStart = keyIndex + key.length();
		// the key may be the very last thing we have read so far
		if (valueStart >= str.length())
			return null;

		char startChar = str.charAt(valueStart);
		int valueEnd;
		if (startChar == '\'' || startChar == '"') {
			// charset='something' or charset="something": skip the opening
			// quote and search for the matching closing one
			valueStart++;
			valueEnd = str.indexOf(startChar, valueStart);
		} else {
			// "text/html; charset=utf-8" and friends
			valueEnd = indexOfUnquotedValueEnd(str, valueStart);
		}

		// re-read byte array with different encoding
		// assume that the encoding string cannot be greater than 40 chars
		if (valueEnd > valueStart && valueEnd < valueStart + MAX_ENCODING_LENGTH) {
			String tmpEnc = SHelper.encodingCleanup(str.substring(valueStart, valueEnd));
			try {
				// reset the stream first: if that fails the collected bytes
				// must stay in bos because they cannot be read again
				in.reset();
				bos.reset();
				return tmpEnc;
			} catch (IOException ex) {
				Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "
						+ tmpEnc + " " + ex.toString());
			}
		}
		return null;
	}

	/**
	 * Finds the end of an unquoted charset value: the first double quote,
	 * single quote, space or '>' at or after <code>from</code>.
	 * 
	 * @return the index of the terminating character, or -1 if there is none
	 *         within the next {@link #MAX_ENCODING_LENGTH} characters
	 */
	private static int indexOfUnquotedValueEnd(String str, int from) {
		int limit = Math.min(str.length(), from + MAX_ENCODING_LENGTH);
		for (int i = from; i < limit; i++) {
			char c = str.charAt(i);
			if (c == '"' || c == '\'' || c == ' ' || c == '>')
				return i;
		}
		return -1;
	}
}
Added a Reading Mode that can be accessed from the menu Reading Mode utilizes the Snacktory library created by karussel which is licensed under the Apache 2.0 license. https://github.com/karussell/snacktory 9 years ago			`/*`
			`* Copyright 2011 Peter Karich`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`
Change Reading mode package name to lower case 9 years ago			`package acr.browser.lightning.reading;`
Added a Reading Mode that can be accessed from the menu Reading Mode utilizes the Snacktory library created by karussel which is licensed under the Apache 2.0 license. https://github.com/karussell/snacktory 9 years ago
			`import java.io.*;`
			`import java.net.SocketTimeoutException;`
			`import java.nio.charset.Charset;`
			`import java.util.Locale;`

			`import acr.browser.lightning.Constants;`
			`import android.util.Log;`

			`/**`
			`* This class is not thread safe. Use one new instance every time due to`
			`* encoding variable.`
			`*`
			`* @author Peter Karich`
			`*/`
			`public class Converter {`

			`public final static String UTF8 = "UTF-8";`
			`public final static String ISO = "ISO-8859-1";`
			`public final static int K2 = 2048;`
			`private int maxBytes = 1000000 / 2;`
			`private String encoding;`
			`private String url;`

			`public Converter(String urlOnlyHint) {`
			`url = urlOnlyHint;`
			`}`

			`public Converter() {`
			`}`

			`public Converter setMaxBytes(int maxBytes) {`
			`this.maxBytes = maxBytes;`
			`return this;`
			`}`

			`public static String extractEncoding(String contentType) {`
			`String[] values;`
			`if (contentType != null)`
			`values = contentType.split(";");`
			`else`
			`values = new String[0];`

			`String charset = "";`

			`for (String value : values) {`
			`value = value.trim().toLowerCase(Locale.getDefault());`

			`if (value.startsWith("charset="))`
			`charset = value.substring("charset=".length());`
			`}`

			`// http1.1 says ISO-8859-1 is the default charset`
			`if (charset.length() == 0)`
			`charset = ISO;`

			`return charset;`
			`}`

			`public String getEncoding() {`
			`if (encoding == null)`
			`return "";`
			`return encoding.toLowerCase(Locale.getDefault());`
			`}`

			`public String streamToString(InputStream is) {`
			`return streamToString(is, maxBytes, encoding);`
			`}`

			`public String streamToString(InputStream is, String enc) {`
			`return streamToString(is, maxBytes, enc);`
			`}`

			`/**`
			`* reads bytes off the string and returns a string`
			`*`
			`* @param is`
			`* @param maxBytes`
			`* The max bytes that we want to read from the input stream`
			`* @return String`
			`*/`
			`public String streamToString(InputStream is, int maxBytes, String enc) {`
			`encoding = enc;`
			`// Http 1.1. standard is iso-8859-1 not utf8 :(`
			`// but we force utf-8 as youtube assumes it ;)`
			`if (encoding == null \|\| encoding.isEmpty())`
			`encoding = UTF8;`

			`BufferedInputStream in = null;`
			`try {`
			`in = new BufferedInputStream(is, K2);`
			`ByteArrayOutputStream output = new ByteArrayOutputStream();`

			`// detect encoding with the help of meta tag`
			`try {`
			`in.mark(K2 * 2);`
			`String tmpEnc = detectCharset("charset=", output, in, encoding);`
			`if (tmpEnc != null)`
			`encoding = tmpEnc;`
			`else {`
			`Log.d(Constants.TAG, "no charset found in first stage");`
			`// detect with the help of xml beginning ala`
			`// encoding="charset"`
			`tmpEnc = detectCharset("encoding=", output, in, encoding);`
			`if (tmpEnc != null)`
			`encoding = tmpEnc;`
			`else`
			`Log.d(Constants.TAG, "no charset found in second stage");`
			`}`

			`if (!Charset.isSupported(encoding))`
			`throw new UnsupportedEncodingException(encoding);`
			`} catch (UnsupportedEncodingException e) {`
			`Log.d(Constants.TAG,`
			`"Using default encoding:" + UTF8 + " problem:" + e.getMessage()`
			`+ " encoding:" + encoding + " " + url);`
			`encoding = UTF8;`
			`}`

			`// SocketException: Connection reset`
			`// IOException: missing CR => problem on server (probably some xml`
			`// character thing?)`
			`// IOException: Premature EOF => socket unexpectly closed from`
			`// server`
			`int bytesRead = output.size();`
			`byte[] arr = new byte[K2];`
			`while (true) {`
			`if (bytesRead >= maxBytes) {`
			`Log.d(Constants.TAG, "Maxbyte of " + maxBytes`
			`+ " exceeded! Maybe html is now broken but try it nevertheless. Url: "`
			`+ url);`
			`break;`
			`}`

			`int n = in.read(arr);`
			`if (n < 0)`
			`break;`
			`bytesRead += n;`
			`output.write(arr, 0, n);`
			`}`

			`return output.toString(encoding);`
			`} catch (SocketTimeoutException e) {`
			`Log.e(Constants.TAG, e.toString() + " url:" + url);`
			`} catch (IOException e) {`
			`Log.e(Constants.TAG, e.toString() + " url:" + url);`
			`} finally {`
			`if (in != null) {`
			`try {`
			`in.close();`
			`} catch (Exception e) {`
			`}`
			`}`
			`}`
			`return "";`
			`}`

			`/**`
			`* This method detects the charset even if the first call only returns some`
			`* bytes. It will read until 4K bytes are reached and then try to determine`
			`* the encoding`
			`*`
			`* @throws IOException`
			`*/`
			`protected String detectCharset(String key, ByteArrayOutputStream bos, BufferedInputStream in,`
			`String enc) throws IOException {`

			`// Grab better encoding from stream`
			`byte[] arr = new byte[K2];`
			`int nSum = 0;`
			`while (nSum < K2) {`
			`int n = in.read(arr);`
			`if (n < 0)`
			`break;`

			`nSum += n;`
			`bos.write(arr, 0, n);`
			`}`

			`String str = bos.toString(enc);`
			`int encIndex = str.indexOf(key);`
			`int clength = key.length();`
			`if (encIndex > 0) {`
			`char startChar = str.charAt(encIndex + clength);`
			`int lastEncIndex;`
			`if (startChar == '\'')`
			`// if we have charset='something'`
			`lastEncIndex = str.indexOf("'", ++encIndex + clength);`
			`else if (startChar == '\"')`
			`// if we have charset="something"`
			`lastEncIndex = str.indexOf("\"", ++encIndex + clength);`
			`else {`
			`// if we have "text/html; charset=utf-8"`
			`int first = str.indexOf("\"", encIndex + clength);`
			`if (first < 0)`
			`first = Integer.MAX_VALUE;`

			`// or "text/html; charset=utf-8 "`
			`int sec = str.indexOf(" ", encIndex + clength);`
			`if (sec < 0)`
			`sec = Integer.MAX_VALUE;`
			`lastEncIndex = Math.min(first, sec);`

			`// or "text/html; charset=utf-8 '`
			`int third = str.indexOf("'", encIndex + clength);`
			`if (third > 0)`
			`lastEncIndex = Math.min(lastEncIndex, third);`
			`}`

			`// re-read byte array with different encoding`
			`// assume that the encoding string cannot be greater than 40 chars`
			`if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) {`
			`String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength,`
			`lastEncIndex));`
			`try {`
			`in.reset();`
			`bos.reset();`
			`return tmpEnc;`
			`} catch (IOException ex) {`
			`Log.e(Constants.TAG, "Couldn't reset stream to re-read with new encoding "`
			`+ tmpEnc + " " + ex.toString());`
			`}`
			`}`
			`}`
			`return null;`
			`}`
			`}`