%PDF- %PDF-
Direktori : /usr/share/perl5/vendor_perl/ |
Current File : //usr/share/perl5/vendor_perl/Punycode.pm |
package LifeMedien::IDNA::Punycode; =head1 NAME IDNA::Punycode - encodes Unicode string in Punycode =head1 SYNOPSIS use IDNA::Punycode; $punycode = encode_punycode($unicode); $unicode = decode_punycode($punycode); =head1 DESCRIPTION IDNA::Punycode is a module to encode / decode Unicode strings into Punycode, an efficient encoding of Unicode for use with IDNA. This module requires Perl 5.6.0 or over to handle UTF8 flagged Unicode strings. =head1 FUNCTIONS This module exports following functions by default. =cut use strict; #use Error qw(:try); our $VERSION = 0.02; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(url_to_puny decode_htmlsite decode_line decode_domain_to_html decode_string_to_html encode_domain encode_string encode_punycode decode_punycode decode_puny_to_html to_puny to_ascii to_ascii_html); use integer; our $DEBUG = 0; use constant BASE => 36; use constant TMIN => 1; use constant TMAX => 26; use constant SKEW => 38; use constant DAMP => 700; use constant INITIAL_BIAS => 72; use constant INITIAL_N => 128; my $Delimiter = chr 0x2D; my $BasicRE = qr/[\x00-\x7f]/; sub _croak { require Carp; Carp::croak(@_); } #################### # Basis Funktionen (original) #################### =over 4 =item B<decode_punycode> (original) $unicode = decode_punycode($punycode) takes Punycode encoding and returns original Unicode string. =cut sub decode_punycode { my $code = shift; my $n = INITIAL_N; my $i = 0; my $bias = INITIAL_BIAS; my @output; if ($code =~ s/(.*)$Delimiter//o) { push @output, map ord, split //, $1; return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o; } while ($code) { my $oldi = $i; my $w = 1; LOOP: for (my $k = BASE; 1; $k += BASE) { my $cp = substr($code, 0, 1, ''); my $digit = digit_value($cp); defined $digit or return _croak("invalid punycode input"); $i += $digit * $w; my $t = ($k <= $bias) ? TMIN : ($k >= $bias + TMAX) ? TMAX : $k - $bias; last LOOP if $digit < $t; $w *= (BASE - $t); } $bias = adapt($i - $oldi, @output + 1, $oldi == 0); warn "bias becomes $bias" if $DEBUG; $n += $i / (@output + 1); $i = $i % (@output + 1); splice(@output, $i, 0, $n); warn join " ", map sprintf('%04x', $_), @output if $DEBUG; $i++; } return join '', map chr, @output; } ################### sub encode_punycode { =item B<encode_punycode> (original) $punycode = encode_punycode($unicode); takes Unicode string (UTF8-flagged variable) and returns Punycode encoding for it. =cut my $input = shift; # my @input = split //, $input; # doesn't work in 5.6.x! my @input = map substr($input, $_, 1), 0..length($input)-1; my $n = INITIAL_N; my $delta = 0; my $bias = INITIAL_BIAS; my @output; my @basic = grep /$BasicRE/, @input; my $h = my $b = @basic; push @output, @basic, $Delimiter if $b > 0; warn "basic codepoints: (@output)" if $DEBUG; while ($h < @input) { my $m = min(grep { $_ >= $n } map ord, @input); warn sprintf "next code point to insert is %04x", $m if $DEBUG; $delta += ($m - $n) * ($h + 1); $n = $m; for my $i (@input) { my $c = ord($i); $delta++ if $c < $n; if ($c == $n) { my $q = $delta; LOOP: for (my $k = BASE; 1; $k += BASE) { my $t = ($k <= $bias) ? TMIN : ($k >= $bias + TMAX) ? TMAX : $k - $bias; last LOOP if $q < $t; my $cp = code_point($t + (($q - $t) % (BASE - $t))); push @output, chr($cp); $q = ($q - $t) / (BASE - $t); } push @output, chr(code_point($q)); $bias = adapt($delta, $h + 1, $h == $b); warn "bias becomes $bias" if $DEBUG; $delta = 0; $h++; } } $delta++; $n++; } return join '', @output; } #################### # Basis Funktionen (geaendert) #################### =item B<Zus�tzliche Funktionen> ---------------------- =cut sub encode_html_to_puny { =item B<encode_html_to_puny> (added by life medien) $punystring = encode_html_to_puny($string); wie encode_punycode, aber nimmt String mit ASCII, Umlauten oder HTML-Entities (in Dezimalform) and gibt reinen Punycode (ohne xn-- am Anfang) zurueck. =cut my @ordinput = (); my $i; my $c; my $zeichen = ''; my $j; my $input = shift; for ($i=0; $i<=length($input)-1; $i++) { $c = substr($input, $i, 1); if ($c eq "\x26") { if (substr($input, $i+1, 1) ne "\x23") { push @ordinput, ord($c); next; } else { $j = $i+2; $zeichen = ''; while (substr($input, $j, 1) ne "\x3B" && $j < $i+5) { $zeichen .= substr($input, $j, 1); $j++; } if ($j == length($input)) { push @ordinput, ord($c); next; } $zeichen = sprintf("%d",$zeichen); push @ordinput,$zeichen; $i = $j; next; } } push @ordinput, ord(substr($input, $i, 1)); } my $n = INITIAL_N; my $delta = 0; my $bias = INITIAL_BIAS; my @output; my @basic = grep /$BasicRE/, map chr,@ordinput; my $h = my $b = @basic; push @output, @basic, $Delimiter if $b > 0; warn "basic codepoints: (@output)" if $DEBUG; while ($h < @ordinput) { my $m = min(grep { $_ >= $n } @ordinput); warn sprintf "next code point to insert is %04x", $m if $DEBUG; $delta += ($m - $n) * ($h + 1); $n = $m; for my $i (@ordinput) { my $c = $i; $delta++ if $c < $n; if ($c == $n) { my $q = $delta; LOOP: for (my $k = BASE; 1; $k += BASE) { my $t = ($k <= $bias) ? TMIN : ($k >= $bias + TMAX) ? TMAX : $k - $bias; last LOOP if $q < $t; my $cp = code_point($t + (($q - $t) % (BASE - $t))); push @output, chr($cp); $q = ($q - $t) / (BASE - $t); } push @output, chr(code_point($q)); $bias = adapt($delta, $h + 1, $h == $b); warn "bias becomes $bias" if $DEBUG; $delta = 0; $h++; } } $delta++; $n++; } return join '', @output; } ###################### sub encode_string($) { =item B<encode_string> (added by life medien) ($punystring) = encode_string($string); wie encode_html_to_puny, aber nimmt einen String und gibt einen punycodierten String zurueck (mit xn-- am Anfang). [Slobodnik] =cut my $string=$_[0]; #my $exc=new Exception 'PunyCodec'; # eval { if ($string !~ m/^xn--/) { $string = encode_html_to_puny($string); if ($string =~ m/-$/) { $string =~ s/-$//; } else { $string =~ s/^/xn--/; } } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$string\n"); # return 0; # }; return ($string); } ####################### sub encode_domain($) { =item B<encode_domain> (added by life medien) ($punydomain) = encode_domain($domainname); wie encode_html_to_puny, aber nimmt einen Domainnamen mit beliebig vielen Punkten und gibt einen punycodierten Domainnamen zurueck. [Slobodnik] =cut my $i; my $string = $_[0]; my @teilen = split ('\.', $string); my $n = @teilen; return 0 if ($n < 2); #my $exc = new Exception 'PunyCodec'; # try{ for ($i=0; $i<=$n-1; $i++) { if ($teilen[$i] !~ m/^xn--/) { #$teilen[$i] = uml_to_hex($teilen[$i]); $teilen[$i] = encode_html_to_puny($teilen[$i]); if ($teilen[$i] =~ m/-$/) { $teilen[$i] =~ s/-$//; } else { $teilen[$i] =~ s/^/xn--/; } } } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$teilen[$i]\n"); # return 0; # }; return(join '.' ,@teilen); } ########################## =item B<decode_puny_mit_html> (added by life medien) ($asciistring, $htmlstring) = decode_puny_mit_html($punystring); wie decode_punycode, aber nimmt den punycodirten String (ohne xn-- am Anfang) entgegen und gibt ein Array mit zwei Strings zurueck: ASCII-decodiert und mit HTML-Entities (in Dezimalform). [Slobodnik] =cut sub decode_puny_mit_html { my $code = shift; #Codierter String my $n = INITIAL_N; #128 my $i = 0; my $bias = INITIAL_BIAS; #72 my @output; my $nhtml = ''; my @htmloutput = (); if ($code =~ s/(.*)$Delimiter//o) { #wenn "-" vorkommt.. push @output, map ord, split //, $1; #alle Zeichen davor @htmloutput = map chr, @output; return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o; #wenn kein ASCII-Zeichen } while ($code) { my $oldi = $i; my $w = 1; LOOP: for (my $k = BASE; 1; $k += BASE) { #fangen von 36 an my $cp = substr($code, 0, 1, ''); #Erster Zeichen wegnehmen my $digit = digit_value($cp); defined $digit or return _croak("invalid punycode input"); $i += $digit * $w; my $t = ($k <= $bias) ? TMIN : ($k >= $bias + TMAX) ? TMAX : $k - $bias; last LOOP if $digit < $t; $w *= (BASE - $t); } $bias = adapt($i - $oldi, @output + 1, $oldi == 0); warn "bias becomes $bias" if $DEBUG; $n += $i / (@output + 1); $nhtml = noascii_to_html($n); $i = $i % (@output + 1); splice(@output, $i, 0, $n); splice(@htmloutput, $i, 0, $nhtml); warn join " ", map sprintf('%04x', $_), @output if $DEBUG; $i++; } return ((join '', map chr, @output), (join '', @htmloutput)); # return join '', map chr, @output; } ###################### sub decode_puny_to_html { =item B<decode_puny_to_html> (added by life medien) ($htmlstring) = decode_puny_to_html($punystring); wie decode_punycode, aber nimmt den punycodierten String (ohne xn-- vorne) entgegen und gibt einen String mit HTML-Entities in Dezimalform zurueck. [Hoelzler] =cut my $code = shift; my $n = INITIAL_N; my $i = 0; my $bias = INITIAL_BIAS; my @output; if ($code =~ s/(.*)$Delimiter//o) { push @output, map ord, split //, $1; return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o; } while ($code) { my $oldi = $i; my $w = 1; LOOP: for (my $k = BASE; 1; $k += BASE) { my $cp = substr($code, 0, 1, ''); my $digit = digit_value($cp); defined $digit or return _croak("invalid punycode input"); $i += $digit * $w; my $t = ($k <= $bias) ? TMIN : ($k >= $bias + TMAX) ? TMAX : $k - $bias; last LOOP if $digit < $t; $w *= (BASE - $t); } $bias = adapt($i - $oldi, @output + 1, $oldi == 0); warn "bias becomes $bias" if $DEBUG; $n += $i / (@output + 1); $i = $i % (@output + 1); splice(@output, $i, 0, $n); warn @output if $DEBUG; warn join " ", map sprintf('%04x', $_), @output if $DEBUG; $i+=1; } my @newout; foreach my $test (@output) { if($test >= INITIAL_N) { push @newout, (ord("&"), ord("#")); foreach (split(//, $test."")) { push @newout,ord("$_"); } push @newout,ord(";"); }else{ push @newout,$test; } } @output = @newout; return join '', map chr, @output; } ########################## sub decode_string_to_html($) { =item B<decode_string_to_html> (added by life medien) ($htmlstring) = decode_string_to_html($punystring); nimmt einen punycodierten String mit (xn--) und gibt einen String mit HTML-Entities in Dezimalform zurueck. [Slobodnik] =cut my $string = $_[0]; #my $exc=new Exception 'PunyCodec'; eval { $string =~ s/^xn--//; my $tmp = $string; #$string = decode_puny_to_html($string); my @ret = decode_puny_mit_html($string); $string = $ret[1]; warn ("unable to decode $string\n") if ($tmp eq $string); }; if ($@) { warn("xn--$string : $@\n"); return 0; } return($string); } ##################### sub decode_domain_to_html($) { =item B<decode_domain_to_html> (added by life medien) ($htmldomain) = decode_domain_to_html($punydomain); wie decode_puny_to_html, aber nimmt einen Domainnamen mit beliebig vielen Punkten und xn-- entgegen und gibt einen Domainname mit HTML-Entities in Dezimalform zurueck. [Slobodnik] =cut my $i; my $string=$_[0]; my @teilen = split ('\.', $string); my $n = @teilen; return 0 if ($n < 2); eval { for ($i=0; $i<=$n-1; $i++) { if ($teilen[$i] =~ m/^xn--/) { $teilen[$i] =~ s/^xn--//; my $tmp = $teilen[$i]; $teilen[$i] = decode_puny_to_html($teilen[$i]); warn ("unable to decode $teilen[$i]\n") if ($tmp eq $teilen[$i]); } } }; if ($@) { warn("xn--$teilen[$i] $@\n"); return 0; } return(join ('.',@teilen)); } ############################## # Drei nicht konditionelle Kodierungsvarianten. # En/Decodet oder gibt den Originalstring zur�ck. # Gedacht f�r Domainnamen, nicht beliebige Strings. ############################## sub to_puny ($) { =item B<to_puny> (added by life medien) $punycode = to_puny($unicode_or_html) Takes Unicode or code with html-entities and transforms to punycode One dot (.) is mandatory, indicating an url, however to_puny will encode nearly any given string. This funtion requires at least perl 5.6.1 to operate correctly [Hoelzler] =cut my $i; my $string = $_[0]; $string =~ s/&#(\d+)\;/chr($1);/eg; my @teilen = split ('\.', $string); my $n = @teilen; return 0 if ($n < 2); eval { for ($i=0; $i<=$n-1; $i++) { if ($teilen[$i] !~ m/^xn--/) { $teilen[$i] = encode_punycode($teilen[$i]); if ($teilen[$i] =~ m/-$/) { $teilen[$i] =~ s/-$//; } else { $teilen[$i] =~ s/^/xn--/; } } } }; if ($@) { warn("xn--$teilen[$i] : $@\n"); return 0; }; return(join ('.',@teilen)); } ######################## sub to_ascii ($) { =item B<to_ascii> (added by life medien) It shoud have been to_iso or to_uni... $unicode = to_ascii($punycode) Takes a string containing punycode and at least one dot (.), and returns the original iso-decoded string. Depending on local settings, this my not be what you expected. This funtion requires at least perl 5.6.1 to operate correctly [Hoelzler] =cut my $i; my $string = $_[0]; my @teilen = split ('\.', $string); my $n = @teilen; return 0 if ($n < 2); eval { for ($i=0; $i<=$n-1; $i++) { if ($teilen[$i] =~ m/^xn--/) { $teilen[$i] =~ s/^xn--//; my $tmp = $teilen[$i]; $teilen[$i] = decode_punycode($teilen[$i]); warn("unable to decode $teilen[$i]\n") if ($tmp eq $teilen[$i]); } } }; if ($@) { warn("xn--$teilen[$i] : $@\n"); return 0; }; return(join '.', @teilen); } ######################### sub to_ascii_html ($) { =item B<to_ascii_html> (added by life medien) $html-code = to_ascii_html($punycode) Takes a string containing punycode and at least one dot(.), and retunrs a decoded string with special characters escaped in numerical html-entitis This funtion requires at least perl 5.6.1 to operate correctly [Hoelzler] =cut my $i; my $string = $_[0]; my @teilen = split ('\.', $string); my $n = @teilen; return 0 if ($n < 2); eval { for ($i=0; $i<=$n-1; $i++) { if ($teilen[$i] =~ m/^xn--/) { $teilen[$i] =~ s/^xn--//; my $tmp = $teilen[$i]; $teilen[$i] = decode_puny_to_html($teilen[$i]); warn ("unable to decode $teilen[$i]\n") if ($tmp eq $teilen[$i]); } else { my @output; push @output, map ord, split //, $teilen[$i]; my @newout; foreach my $test (@output){ if($test >= INITIAL_N){ push @newout, (ord("&"), ord("#")); foreach (split(//, $test."")){ push @newout, ord("$_"); } push @newout,ord(";"); } else { push @newout,$test; } } $teilen[$i] = join '', map chr, @newout; } } }; if ($@) { warn("xn--$teilen[$i] : $@\n"); return 0; }; return(join '.', @teilen); } ###################### ###################### #sub PunyCodecHtml ($) { # #=item B<PunyCodecHtml> (deprecated) # # $string = PunyCodecHtml($string); # # decodiert einen String mit 'xn--' in HTML-String # (mit HTML-Entities) und codiert einen nicht codierten # String in Punycode mit 'xn--'. # #=cut # # my $i; # my $string=$_[0]; # # my $exc=new Exception 'PunyCodec'; # # try{ # if ($string =~ m/^xn--/) { # $string =~ s/^xn--//; # my $tmp=$string; # $string = decode_puny_to_html($string); # # $exc->raise("unable to decode\n") if ($tmp eq $string); # } else { # $string = encode_punycode($string); # if ($string =~ m/-$/) { # $string =~ s/-$//; # } else { # $string =~ s/^/xn--/; # } # } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$string\n"); # return 0; # }; # # return($string); #} ####################### #sub PunyCodec($) { # #=item B<PunyCodec> (deprecated) # # $domain = PunyCodec($domain); # # decodiert einen Domainname mit 'xn--' in ASCII-String # und codiert einen nicht codierten Domainname in Punycode # mit 'xn--'. # #=cut # # # my $i; # my $string=$_[0]; # # my @teilen = split ('\.', $string); # my $n = @teilen; # return 0 if ($n < 2); # # my $exc=new Exception 'PunyCodec'; # # try{ # for ($i=0; $i<=$n-1; $i++) { # if ($teilen[$i] =~ m/^xn--/) { # $teilen[$i] =~ s/^xn--//; # my $tmp=$teilen[$i]; # $teilen[$i] = decode_punycode($teilen[$i]); # $exc->raise("unable to decode\n") if ($tmp eq $teilen[$i]); # } else { # $teilen[$i] = encode_punycode($teilen[$i]); # if ($teilen[$i] =~ m/-$/) { # $teilen[$i] =~ s/-$//; # } else { # $teilen[$i] =~ s/^/xn--/; # } # } # } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$teilen[$i]\n"); # return 0; # }; # # return(join ('.',@teilen)); #} ######################### #sub PunyCodecString($) { # #=item B<PunyCodecString> (deprecated) # # $string = PunyCodecString($string); # # decodiert einen String mit 'xn--' in ASCII-String # und codiert einen nicht codierten String in Punycode # mit 'xn--'. # #=cut # # my $i; # my $string=$_[0]; # # my $exc=new Exception 'PunyCodec'; # # try{ # if ($string =~ m/^xn--/) { # $string =~ s/^xn--//; # my $tmp=$string; # $string = decode_punycode($string); # # $exc->raise("unable to decode\n") if ($tmp eq $string); # } else { # $string = encode_punycode($string); # if ($string =~ m/-$/) { # $string =~ s/-$//; # } else { # $string =~ s/^/xn--/; # } # } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$string\n"); # return 0; # }; # # return($string); #} ############################### #sub PunyCodecStringHtml($) { # #=item B<PunyCodecStringHtml> (deprecated) # # $string = PunyCodecStringHtml($string); # # decodiert einen String mit 'xn--' in HTML-String # (mit HTML-Entities) und codiert einen nicht codierten # String in Punycode mit 'xn--'. # #=cut # # my $i; # my $string=$_[0]; # # my $exc=new Exception 'PunyCodec'; # # try{ # if ($string =~ m/^xn--/) { # $string =~ s/^xn--//; # my $tmp=$string; # $string = decode_puny_to_html($string); # # $exc->raise("unable to decode\n") if ($tmp eq $string); # } else { # $string = encode_punycode($string); # if ($string =~ m/-$/) { # $string =~ s/-$//; # } else { # $string =~ s/^/xn--/; # } # } # }when ['die',$exc], except{ # my $err .= shift; # chomp($err); # warn("$err xn--$string\n"); # return 0; # }; # # return($string); #} ###################### # Hilfsfunktionen (original) ###################### sub digit_value { my $code = shift; return ord($code) - ord("A") if $code =~ /[A-Z]/; return ord($code) - ord("a") if $code =~ /[a-z]/; return ord($code) - ord("0") + 26 if $code =~ /[0-9]/; return; } ##################### sub code_point { my $digit = shift; return $digit + ord('a') if 0 <= $digit && $digit <= 25; return $digit + ord('0') - 26 if 26 <= $digit && $digit <= 36; die 'NOT COME HERE'; } #################### sub adapt { my($delta, $numpoints, $firsttime) = @_; $delta = $firsttime ? $delta / DAMP : $delta / 2; $delta += $delta / $numpoints; my $k = 0; while ($delta > ((BASE - TMIN) * TMAX) / 2) { $delta /= BASE - TMIN; $k += BASE; } return $k + (((BASE - TMIN + 1) * $delta) / ($delta + SKEW)); } ##################### sub min { my $min = shift; for (@_) { $min = $_ if $_ <= $min } return $min; } ###################### # Hilfsfunktionen (lifemedien) ###################### sub noascii_to_html { my $k= 0x26; my $r= 0x23; my $sem = 0x3B; # & und # my @output = (); my $hex = shift; my @hexarray = map ord,split(// ,$hex); push @output,$k,$r,@hexarray,$sem; return join '', map chr, @output; } sub decode_line($) { my $x; my $y; my $line = shift; while ($line =~ /(.*)xn--([a-z0-9-]+)(.*)$/) { return 0 unless ($x,$y) = decode_puny_mit_html($2); $line = $1.$x.$3; } return $line } sub href_decode($){ my $prot = qr/href\s*=\s*"?(http|https|ftp):\/\//i; my $x; my $string = shift; $string =~ m/^(.*)xn--(.*)$/; if ($1 =~ /^$prot.*$/) { return $string; } else { if ($x = decode_punycode($2)) { return $1.$x; } else { return $string; } } } sub href_encode($){ my $x; my $string = shift; if ($x = encode_domain($string)) { return $x; } else { return $string; } } =item B<decode_htmlsite> (original) $punyline = decode_htmlsite($line); takes a Line from the HTML-Site with Punycode and returns HTML decoding for it. =cut sub decode_htmlsite($) { my $x; my $prot = qr/href\s*\=\s*"?(http|https|ftp):\/\//i; my $line = shift; while ($line =~ m/\G(.*)xn--([a-z0-9-]+)(.+)$/g) { if ($1 !~ /$prot[a-z0-9-\.]+$/) { eval { if ($x = decode_puny_mit_html($2)) { $line = $1.$x.$3; } else { warn ("ERROR Punycode: [$line]\n"); } }; if ($@) { warn ("Error decode_puny_mit_html($2) : $@\n"); } } } return $line; } =item B<url_to_puny> (original) $punyurl = url_to_puny($url); takes url String and returns Punycode encoding for it. =cut sub url_to_puny($) { my $domain; my $url = shift; my $prot = qr/(http|https|ftp):\/\//i; $url =~ m/^($prot)([^:\/ ]+)(.*)$/i; # m/^(http|https|ftp)*(\/\/:)*([^:\/]+)(.*)$/; if ($domain = encode_domain($3)) { $url = $1.$domain.$4; } else { print STDERR "ERROR Punycode [$url]\n"; } return $url; } 1; __END__ =back These functions throws exceptionsn on failure. You can catch 'em via C<eval>. =head1 AUTHOR Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt> This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO http://www.ietf.org/internet-drafts/draft-ietf-idn-punycode-01.txt L<Encode::Punycode> =cut