| File: | local/lib/perl5/Sort/Naturally.pm |
| Coverage: | 13.2% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | ||||||
| 2 | require 5; | |||||
| 3 | package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST" | |||||
| 4 | $VERSION = '1.03'; | |||||
| 5 | @EXPORT = ('nsort', 'ncmp'); | |||||
| 6 | require Exporter; | |||||
| 7 | @ISA = ('Exporter'); | |||||
| 8 | ||||||
| 9 | 2 2 2 | 4 2 33 | use strict; | |||
| 10 | 2 2 2 | 222 193 6 | use locale; | |||
| 11 | 2 2 2 | 25 1 6 | use integer; | |||
| 12 | ||||||
| 13 | #----------------------------------------------------------------------------- | |||||
| 14 | # constants: | |||||
| 15 | 2 | 55 | BEGIN { *DEBUG = sub () {0} unless defined &DEBUG } | |||
| 16 | ||||||
| 17 | 2 2 2 | 4 0 16 | use Config (); | |||
| 18 | BEGIN { | |||||
| 19 | # Make a constant such that if a whole-number string is that long | |||||
| 20 | # or shorter, we KNOW it's treatable as an integer | |||||
| 21 | 2 2 2 | 3 1 4 | no integer; | |||
| 22 | 2 | 116 | my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1; | |||
| 23 | 2 | 5 | die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4; | |||
| 24 | 2 | 40 | eval 'sub MAX_INT_SIZE () {' . $x . '}'; | |||
| 25 | 2 | 5 | die $@ if $@; | |||
| 26 | 2 | 202 | print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG; | |||
| 27 | } | |||||
| 28 | ||||||
| 29 | sub X_FIRST () {-1} | |||||
| 30 | sub Y_FIRST () { 1} | |||||
| 31 | ||||||
| 32 | my @ORD = ('same', 'swap', 'asis'); | |||||
| 33 | ||||||
| 34 | #----------------------------------------------------------------------------- | |||||
| 35 | # For lack of a preprocessor: | |||||
| 36 | ||||||
| 37 | my($code, $guts); | |||||
| 38 | $guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort: | |||||
| 39 | ||||||
| 40 | if($x eq $y) { | |||||
| 41 | # trap this expensive case first, and then fall thru to tiebreaker | |||||
| 42 | $rv = 0; | |||||
| 43 | ||||||
| 44 | # Convoluted hack to get numerics to sort first, at string start: | |||||
| 45 | } elsif($x =~ m/^\d/s) { | |||||
| 46 | if($y =~ m/^\d/s) { | |||||
| 47 | $rv = 0; # fall thru to normal comparison for the two numbers | |||||
| 48 | } else { | |||||
| 49 | $rv = X_FIRST; | |||||
| 50 | DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n"; | |||||
| 51 | } | |||||
| 52 | } elsif($y =~ m/^\d/s) { | |||||
| 53 | $rv = Y_FIRST; | |||||
| 54 | DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n"; | |||||
| 55 | } else { | |||||
| 56 | $rv = 0; | |||||
| 57 | } | |||||
| 58 | ||||||
| 59 | unless($rv) { | |||||
| 60 | # Normal case: | |||||
| 61 | $rv = 0; | |||||
| 62 | DEBUG and print "<$x> and <$y> compared...\n"; | |||||
| 63 | ||||||
| 64 | Consideration: | |||||
| 65 | while(length $x and length $y) { | |||||
| 66 | ||||||
| 67 | DEBUG > 2 and print " <$x> and <$y>...\n"; | |||||
| 68 | ||||||
| 69 | # First, non-numeric comparison: | |||||
| 70 | $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0; | |||||
| 71 | $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0; | |||||
| 72 | # Now make x2 the min length of the two: | |||||
| 73 | $x2 = $y2 if $x2 > $y2; | |||||
| 74 | if($x2) { | |||||
| 75 | DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n", | |||||
| 76 | substr($x,0,$x2), substr($y,0,$x2); | |||||
| 77 | do { | |||||
| 78 | my $i = substr($x,0,$x2); | |||||
| 79 | my $j = substr($y,0,$x2); | |||||
| 80 | my $sv = $i cmp $j; | |||||
| 81 | print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv; | |||||
| 82 | last; | |||||
| 83 | } | |||||
| 84 | ||||||
| 85 | ||||||
| 86 | if $rv = | |||||
| 87 | # The ''. things here force a copy that seems to work around a | |||||
| 88 | # mysterious intermittent bug that 'use locale' provokes in | |||||
| 89 | # many versions of Perl. | |||||
| 90 | $cmp | |||||
| 91 | ? $cmp->(substr($x,0,$x2) . '', | |||||
| 92 | substr($y,0,$x2) . '', | |||||
| 93 | ) | |||||
| 94 | : | |||||
| 95 | scalar(( substr($x,0,$x2) . '' ) cmp | |||||
| 96 | ( substr($y,0,$x2) . '' ) | |||||
| 97 | ) | |||||
| 98 | ; | |||||
| 99 | # otherwise trim and keep going: | |||||
| 100 | substr($x,0,$x2) = ''; | |||||
| 101 | substr($y,0,$x2) = ''; | |||||
| 102 | } | |||||
| 103 | ||||||
| 104 | # Now numeric: | |||||
| 105 | # (actually just using $x2 and $y2 as scratch) | |||||
| 106 | ||||||
| 107 | if( $x =~ s/^(\d+)//s ) { | |||||
| 108 | $x2 = $1; | |||||
| 109 | if( $y =~ s/^(\d+)//s ) { | |||||
| 110 | # We have two numbers here. | |||||
| 111 | DEBUG > 1 and print " <$x2> and <$1> numerically\n"; | |||||
| 112 | if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) { | |||||
| 113 | # small numbers: we can compare happily | |||||
| 114 | last if $rv = $x2 <=> $1; | |||||
| 115 | } else { | |||||
| 116 | # ARBITRARILY large integers! | |||||
| 117 | ||||||
| 118 | # This saves on loss of precision that could happen | |||||
| 119 | # with actual stringification. | |||||
| 120 | # Also, I sense that very large numbers aren't too | |||||
| 121 | # terribly common in sort data. | |||||
| 122 | ||||||
| 123 | # trim leading 0's: | |||||
| 124 | ($y2 = $1) =~ s/^0+//s; | |||||
| 125 | $x2 =~ s/^0+//s; | |||||
| 126 | print " Treating $x2 and $y2 as bigint\n" if DEBUG; | |||||
| 127 | ||||||
| 128 | no locale; # we want the dumb cmp back. | |||||
| 129 | last if $rv = ( | |||||
| 130 | # works only for non-negative whole numbers: | |||||
| 131 | length($x2) <=> length($y2) | |||||
| 132 | # the longer the numeral, the larger the value | |||||
| 133 | or $x2 cmp $y2 | |||||
| 134 | # between equals, compare lexically!! amazing but true. | |||||
| 135 | ); | |||||
| 136 | } | |||||
| 137 | } else { | |||||
| 138 | # X is numeric but Y isn't | |||||
| 139 | $rv = Y_FIRST; | |||||
| 140 | last; | |||||
| 141 | } | |||||
| 142 | } elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring | |||||
| 143 | $rv = X_FIRST; | |||||
| 144 | last; | |||||
| 145 | } | |||||
| 146 | # else one of them is 0-length. | |||||
| 147 | ||||||
| 148 | # end-while | |||||
| 149 | } | |||||
| 150 | } | |||||
| 151 | EOGUTS | |||||
| 152 | ||||||
| 153 | sub maker { | |||||
| 154 | my $code = $_[0]; | |||||
| 155 | $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~"; | |||||
| 156 | 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | 5 1 4 5 2 3 | eval $code; | |||
| 157 | die $@ if $@; | |||||
| 158 | } | |||||
| 159 | ||||||
| 160 | ############################################################################## | |||||
| 161 | ||||||
| 162 | maker(<<'EONSORT'); | |||||
| 163 | sub nsort { | |||||
| 164 | # get options: | |||||
| 165 | my($cmp, $lc); | |||||
| 166 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; | |||||
| 167 | ||||||
| 168 | return @_ unless @_ > 1 or wantarray; # be clever | |||||
| 169 | ||||||
| 170 | my($x, $x2, $y, $y2, $rv); # scratch vars | |||||
| 171 | ||||||
| 172 | # We use a Schwartzian xform to memoize the lc'ing and \W-removal | |||||
| 173 | ||||||
| 174 | map $_->[0], | |||||
| 175 | sort { | |||||
| 176 | if($a->[0] eq $b->[0]) { 0 } # trap this expensive case | |||||
| 177 | else { | |||||
| 178 | ||||||
| 179 | $x = $a->[1]; | |||||
| 180 | $y = $b->[1]; | |||||
| 181 | ||||||
| 182 | ~COMPARATOR~ | |||||
| 183 | ||||||
| 184 | # Tiebreakers... | |||||
| 185 | DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; | |||||
| 186 | $rv ||= (length($x) <=> length($y)) # shorter is always first | |||||
| 187 | || ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0])) | |||||
| 188 | || ($x cmp $y ) | |||||
| 189 | || ($a->[0] cmp $b->[0]) | |||||
| 190 | ; | |||||
| 191 | ||||||
| 192 | DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; | |||||
| 193 | $rv; | |||||
| 194 | }} | |||||
| 195 | ||||||
| 196 | map {; | |||||
| 197 | $x = $lc ? $lc->($_) : lc($_); # x as scratch | |||||
| 198 | $x =~ s/\W+//s; | |||||
| 199 | [$_, $x]; | |||||
| 200 | } | |||||
| 201 | @_ | |||||
| 202 | } | |||||
| 203 | EONSORT | |||||
| 204 | ||||||
| 205 | #----------------------------------------------------------------------------- | |||||
| 206 | maker(<<'EONCMP'); | |||||
| 207 | sub ncmp { | |||||
| 208 | # The guts are basically the same as above... | |||||
| 209 | ||||||
| 210 | # get options: | |||||
| 211 | my($cmp, $lc); | |||||
| 212 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; | |||||
| 213 | ||||||
| 214 | if(@_ == 0) { | |||||
| 215 | @_ = ($a, $b); # bit of a hack! | |||||
| 216 | DEBUG > 1 and print "Hacking in <$a><$b>\n"; | |||||
| 217 | } elsif(@_ != 2) { | |||||
| 218 | require Carp; | |||||
| 219 | Carp::croak("Not enough options to ncmp!"); | |||||
| 220 | } | |||||
| 221 | my($a,$b) = @_; | |||||
| 222 | my($x, $x2, $y, $y2, $rv); # scratch vars | |||||
| 223 | ||||||
| 224 | DEBUG > 1 and print "ncmp args <$a><$b>\n"; | |||||
| 225 | if($a eq $b) { # trap this expensive case | |||||
| 226 | 0; | |||||
| 227 | } else { | |||||
| 228 | $x = ($lc ? $lc->($a) : lc($a)); | |||||
| 229 | $x =~ s/\W+//s; | |||||
| 230 | $y = ($lc ? $lc->($b) : lc($b)); | |||||
| 231 | $y =~ s/\W+//s; | |||||
| 232 | ||||||
| 233 | ~COMPARATOR~ | |||||
| 234 | ||||||
| 235 | ||||||
| 236 | # Tiebreakers... | |||||
| 237 | DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n"; | |||||
| 238 | $rv ||= (length($x) <=> length($y)) # shorter is always first | |||||
| 239 | || ($cmp and $cmp->($x,$y) || $cmp->($a,$b)) | |||||
| 240 | || ($x cmp $y) | |||||
| 241 | || ($a cmp $b) | |||||
| 242 | ; | |||||
| 243 | ||||||
| 244 | DEBUG > 1 and print " <$a> cmp <$b> is $rv\n"; | |||||
| 245 | $rv; | |||||
| 246 | } | |||||
| 247 | } | |||||
| 248 | EONCMP | |||||
| 249 | ||||||
| 250 | # clean up: | |||||
| 251 | undef $guts; | |||||
| 252 | undef &maker; | |||||
| 253 | ||||||
| 254 | #----------------------------------------------------------------------------- | |||||
| 255 | 1; | |||||
| 256 | ||||||
| 257 | ############### END OF MAIN SOURCE ########################################### | |||||