You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
311 lines
6.6 KiB
311 lines
6.6 KiB
// Package idn implements encoding from and to punycode as speficied by RFC 3492. |
|
package idn |
|
|
|
import ( |
|
"bytes" |
|
"strings" |
|
"unicode" |
|
|
|
"github.com/miekg/dns" |
|
) |
|
|
|
// Implementation idea from RFC itself and from from IDNA::Punycode created by |
|
// Tatsuhiko Miyagawa <miyagawa@bulknews.net> and released under Perl Artistic |
|
// License in 2002. |
|
|
|
const ( |
|
_MIN rune = 1 |
|
_MAX rune = 26 |
|
_SKEW rune = 38 |
|
_BASE rune = 36 |
|
_BIAS rune = 72 |
|
_N rune = 128 |
|
_DAMP rune = 700 |
|
|
|
_DELIMITER = '-' |
|
_PREFIX = "xn--" |
|
) |
|
|
|
// ToPunycode converts unicode domain names to DNS-appropriate punycode names. |
|
// This function would return an empty string result for domain names with |
|
// invalid unicode strings. This function expects domain names in lowercase. |
|
func ToPunycode(s string) string { |
|
tokens := dns.SplitDomainName(s) |
|
switch { |
|
case s == "": |
|
return "" |
|
case tokens == nil: // s == . |
|
return "." |
|
case s[len(s)-1] == '.': |
|
tokens = append(tokens, "") |
|
} |
|
|
|
for i := range tokens { |
|
t := encode([]byte(tokens[i])) |
|
if t == nil { |
|
return "" |
|
} |
|
tokens[i] = string(t) |
|
} |
|
return strings.Join(tokens, ".") |
|
} |
|
|
|
// FromPunycode returns unicode domain name from provided punycode string. |
|
func FromPunycode(s string) string { |
|
tokens := dns.SplitDomainName(s) |
|
switch { |
|
case s == "": |
|
return "" |
|
case tokens == nil: // s == . |
|
return "." |
|
case s[len(s)-1] == '.': |
|
tokens = append(tokens, "") |
|
} |
|
for i := range tokens { |
|
tokens[i] = string(decode([]byte(tokens[i]))) |
|
} |
|
return strings.Join(tokens, ".") |
|
} |
|
|
|
// digitval converts single byte into meaningful value that's used to calculate decoded unicode character. |
|
const errdigit = 0xffff |
|
|
|
func digitval(code rune) rune { |
|
switch { |
|
case code >= 'A' && code <= 'Z': |
|
return code - 'A' |
|
case code >= 'a' && code <= 'z': |
|
return code - 'a' |
|
case code >= '0' && code <= '9': |
|
return code - '0' + 26 |
|
} |
|
return errdigit |
|
} |
|
|
|
// lettercode finds BASE36 byte (a-z0-9) based on calculated number. |
|
func lettercode(digit rune) rune { |
|
switch { |
|
case digit >= 0 && digit <= 25: |
|
return digit + 'a' |
|
case digit >= 26 && digit <= 36: |
|
return digit - 26 + '0' |
|
} |
|
panic("dns: not reached") |
|
} |
|
|
|
// adapt calculates next bias to be used for next iteration delta. |
|
func adapt(delta rune, numpoints int, firsttime bool) rune { |
|
if firsttime { |
|
delta /= _DAMP |
|
} else { |
|
delta /= 2 |
|
} |
|
|
|
var k rune |
|
for delta = delta + delta/rune(numpoints); delta > (_BASE-_MIN)*_MAX/2; k += _BASE { |
|
delta /= _BASE - _MIN |
|
} |
|
|
|
return k + ((_BASE-_MIN+1)*delta)/(delta+_SKEW) |
|
} |
|
|
|
// next finds minimal rune (one with lowest codepoint value) that should be equal or above boundary. |
|
func next(b []rune, boundary rune) rune { |
|
if len(b) == 0 { |
|
panic("dns: invalid set of runes to determine next one") |
|
} |
|
m := b[0] |
|
for _, x := range b[1:] { |
|
if x >= boundary && (m < boundary || x < m) { |
|
m = x |
|
} |
|
} |
|
return m |
|
} |
|
|
|
// preprune converts unicode rune to lower case. At this time it's not |
|
// supporting all things described in RFCs |
|
func preprune(r rune) rune { |
|
if unicode.IsUpper(r) { |
|
r = unicode.ToLower(r) |
|
} |
|
return r |
|
} |
|
|
|
// tfunc is a function that helps calculate each character weight |
|
func tfunc(k, bias rune) rune { |
|
switch { |
|
case k <= bias: |
|
return _MIN |
|
case k >= bias+_MAX: |
|
return _MAX |
|
} |
|
return k - bias |
|
} |
|
|
|
// encode transforms Unicode input bytes (that represent DNS label) into |
|
// punycode bytestream. This function would return nil if there's an invalid |
|
// character in the label. |
|
func encode(input []byte) []byte { |
|
n, bias := _N, _BIAS |
|
|
|
b := bytes.Runes(input) |
|
for i := range b { |
|
if !isValidRune(b[i]) { |
|
return nil |
|
} |
|
|
|
b[i] = preprune(b[i]) |
|
} |
|
|
|
basic := make([]byte, 0, len(b)) |
|
for _, ltr := range b { |
|
if ltr <= 0x7f { |
|
basic = append(basic, byte(ltr)) |
|
} |
|
} |
|
basiclen := len(basic) |
|
fulllen := len(b) |
|
if basiclen == fulllen { |
|
return basic |
|
} |
|
|
|
var out bytes.Buffer |
|
|
|
out.WriteString(_PREFIX) |
|
if basiclen > 0 { |
|
out.Write(basic) |
|
out.WriteByte(_DELIMITER) |
|
} |
|
|
|
var ( |
|
ltr, nextltr rune |
|
delta, q rune // delta calculation (see rfc) |
|
t, k, cp rune // weight and codepoint calculation |
|
) |
|
|
|
s := &bytes.Buffer{} |
|
for h := basiclen; h < fulllen; n, delta = n+1, delta+1 { |
|
nextltr = next(b, n) |
|
s.Truncate(0) |
|
s.WriteRune(nextltr) |
|
delta, n = delta+(nextltr-n)*rune(h+1), nextltr |
|
|
|
for _, ltr = range b { |
|
if ltr < n { |
|
delta++ |
|
} |
|
if ltr == n { |
|
q = delta |
|
for k = _BASE; ; k += _BASE { |
|
t = tfunc(k, bias) |
|
if q < t { |
|
break |
|
} |
|
cp = t + ((q - t) % (_BASE - t)) |
|
out.WriteRune(lettercode(cp)) |
|
q = (q - t) / (_BASE - t) |
|
} |
|
|
|
out.WriteRune(lettercode(q)) |
|
|
|
bias = adapt(delta, h+1, h == basiclen) |
|
h, delta = h+1, 0 |
|
} |
|
} |
|
} |
|
return out.Bytes() |
|
} |
|
|
|
// decode transforms punycode input bytes (that represent DNS label) into Unicode bytestream |
|
func decode(b []byte) []byte { |
|
src := b // b would move and we need to keep it |
|
|
|
n, bias := _N, _BIAS |
|
if !bytes.HasPrefix(b, []byte(_PREFIX)) { |
|
return b |
|
} |
|
out := make([]rune, 0, len(b)) |
|
b = b[len(_PREFIX):] |
|
for pos := len(b) - 1; pos >= 0; pos-- { |
|
// only last delimiter is our interest |
|
if b[pos] == _DELIMITER { |
|
out = append(out, bytes.Runes(b[:pos])...) |
|
b = b[pos+1:] // trim source string |
|
break |
|
} |
|
} |
|
if len(b) == 0 { |
|
return src |
|
} |
|
var ( |
|
i, oldi, w rune |
|
ch byte |
|
t, digit rune |
|
ln int |
|
) |
|
|
|
for i = 0; len(b) > 0; i++ { |
|
oldi, w = i, 1 |
|
for k := _BASE; len(b) > 0; k += _BASE { |
|
ch, b = b[0], b[1:] |
|
digit = digitval(rune(ch)) |
|
if digit == errdigit { |
|
return src |
|
} |
|
i += digit * w |
|
|
|
t = tfunc(k, bias) |
|
if digit < t { |
|
break |
|
} |
|
|
|
w *= _BASE - t |
|
} |
|
ln = len(out) + 1 |
|
bias = adapt(i-oldi, ln, oldi == 0) |
|
n += i / rune(ln) |
|
i = i % rune(ln) |
|
// insert |
|
out = append(out, 0) |
|
copy(out[i+1:], out[i:]) |
|
out[i] = n |
|
} |
|
|
|
var ret bytes.Buffer |
|
for _, r := range out { |
|
ret.WriteRune(r) |
|
} |
|
return ret.Bytes() |
|
} |
|
|
|
// isValidRune checks if the character is valid. We will look for the |
|
// character property in the code points list. For now we aren't checking special |
|
// rules in case of contextual property |
|
func isValidRune(r rune) bool { |
|
return findProperty(r) == propertyPVALID |
|
} |
|
|
|
// findProperty will try to check the code point property of the given |
|
// character. It will use a binary search algorithm as we have a slice of |
|
// ordered ranges (average case performance O(log n)) |
|
func findProperty(r rune) property { |
|
imin, imax := 0, len(codePoints) |
|
|
|
for imax >= imin { |
|
imid := (imin + imax) / 2 |
|
|
|
codePoint := codePoints[imid] |
|
if (codePoint.start == r && codePoint.end == 0) || (codePoint.start <= r && codePoint.end >= r) { |
|
return codePoint.state |
|
} |
|
|
|
if (codePoint.end > 0 && codePoint.end < r) || (codePoint.end == 0 && codePoint.start < r) { |
|
imin = imid + 1 |
|
} else { |
|
imax = imid - 1 |
|
} |
|
} |
|
|
|
return propertyUnknown |
|
}
|
|
|