File: | local/lib/perl5/Sort/Naturally.pm |
Coverage: | 13.2% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | ||||||
2 | require 5; | |||||
3 | package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST" | |||||
4 | $VERSION = '1.03'; | |||||
5 | @EXPORT = ('nsort', 'ncmp'); | |||||
6 | require Exporter; | |||||
7 | @ISA = ('Exporter'); | |||||
8 | ||||||
9 | 2 2 2 | 4 2 33 | use strict; | |||
10 | 2 2 2 | 222 193 6 | use locale; | |||
11 | 2 2 2 | 25 1 6 | use integer; | |||
12 | ||||||
13 | #----------------------------------------------------------------------------- | |||||
14 | # constants: | |||||
15 | 2 | 55 | BEGIN { *DEBUG = sub () {0} unless defined &DEBUG } | |||
16 | ||||||
17 | 2 2 2 | 4 0 16 | use Config (); | |||
18 | BEGIN { | |||||
19 | # Make a constant such that if a whole-number string is that long | |||||
20 | # or shorter, we KNOW it's treatable as an integer | |||||
21 | 2 2 2 | 3 1 4 | no integer; | |||
22 | 2 | 116 | my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1; | |||
23 | 2 | 5 | die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4; | |||
24 | 2 | 40 | eval 'sub MAX_INT_SIZE () {' . $x . '}'; | |||
25 | 2 | 5 | die $@ if $@; | |||
26 | 2 | 202 | print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG; | |||
27 | } | |||||
28 | ||||||
29 | sub X_FIRST () {-1} | |||||
30 | sub Y_FIRST () { 1} | |||||
31 | ||||||
32 | my @ORD = ('same', 'swap', 'asis'); | |||||
33 | ||||||
34 | #----------------------------------------------------------------------------- | |||||
35 | # For lack of a preprocessor: | |||||
36 | ||||||
37 | my($code, $guts); | |||||
38 | $guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort: | |||||
39 | ||||||
40 | if($x eq $y) { | |||||
41 | # trap this expensive case first, and then fall thru to tiebreaker | |||||
42 | $rv = 0; | |||||
43 | ||||||
44 | # Convoluted hack to get numerics to sort first, at string start: | |||||
45 | } elsif($x =~ m/^\d/s) { | |||||
46 | if($y =~ m/^\d/s) { | |||||
47 | $rv = 0; # fall thru to normal comparison for the two numbers | |||||
48 | } else { | |||||
49 | $rv = X_FIRST; | |||||
50 | DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n"; | |||||
51 | } | |||||
52 | } elsif($y =~ m/^\d/s) { | |||||
53 | $rv = Y_FIRST; | |||||
54 | DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n"; | |||||
55 | } else { | |||||
56 | $rv = 0; | |||||
57 | } | |||||
58 | ||||||
59 | unless($rv) { | |||||
60 | # Normal case: | |||||
61 | $rv = 0; | |||||
62 | DEBUG and print "<$x> and <$y> compared...\n"; | |||||
63 | ||||||
64 | Consideration: | |||||
65 | while(length $x and length $y) { | |||||
66 | ||||||
67 | DEBUG > 2 and print " <$x> and <$y>...\n"; | |||||
68 | ||||||
69 | # First, non-numeric comparison: | |||||
70 | $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0; | |||||
71 | $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0; | |||||
72 | # Now make x2 the min length of the two: | |||||
73 | $x2 = $y2 if $x2 > $y2; | |||||
74 | if($x2) { | |||||
75 | DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n", | |||||
76 | substr($x,0,$x2), substr($y,0,$x2); | |||||
77 | do { | |||||
78 | my $i = substr($x,0,$x2); | |||||
79 | my $j = substr($y,0,$x2); | |||||
80 | my $sv = $i cmp $j; | |||||
81 | print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv; | |||||
82 | last; | |||||
83 | } | |||||
84 | ||||||
85 | ||||||
86 | if $rv = | |||||
87 | # The ''. things here force a copy that seems to work around a | |||||
88 | # mysterious intermittent bug that 'use locale' provokes in | |||||
89 | # many versions of Perl. | |||||
90 | $cmp | |||||
91 | ? $cmp->(substr($x,0,$x2) . '', | |||||
92 | substr($y,0,$x2) . '', | |||||
93 | ) | |||||
94 | : | |||||
95 | scalar(( substr($x,0,$x2) . '' ) cmp | |||||
96 | ( substr($y,0,$x2) . '' ) | |||||
97 | ) | |||||
98 | ; | |||||
99 | # otherwise trim and keep going: | |||||
100 | substr($x,0,$x2) = ''; | |||||
101 | substr($y,0,$x2) = ''; | |||||
102 | } | |||||
103 | ||||||
104 | # Now numeric: | |||||
105 | # (actually just using $x2 and $y2 as scratch) | |||||
106 | ||||||
107 | if( $x =~ s/^(\d+)//s ) { | |||||
108 | $x2 = $1; | |||||
109 | if( $y =~ s/^(\d+)//s ) { | |||||
110 | # We have two numbers here. | |||||
111 | DEBUG > 1 and print " <$x2> and <$1> numerically\n"; | |||||
112 | if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) { | |||||
113 | # small numbers: we can compare happily | |||||
114 | last if $rv = $x2 <=> $1; | |||||
115 | } else { | |||||
116 | # ARBITRARILY large integers! | |||||
117 | ||||||
118 | # This saves on loss of precision that could happen | |||||
119 | # with actual stringification. | |||||
120 | # Also, I sense that very large numbers aren't too | |||||
121 | # terribly common in sort data. | |||||
122 | ||||||
123 | # trim leading 0's: | |||||
124 | ($y2 = $1) =~ s/^0+//s; | |||||
125 | $x2 =~ s/^0+//s; | |||||
126 | print " Treating $x2 and $y2 as bigint\n" if DEBUG; | |||||
127 | ||||||
128 | no locale; # we want the dumb cmp back. | |||||
129 | last if $rv = ( | |||||
130 | # works only for non-negative whole numbers: | |||||
131 | length($x2) <=> length($y2) | |||||
132 | # the longer the numeral, the larger the value | |||||
133 | or $x2 cmp $y2 | |||||
134 | # between equals, compare lexically!! amazing but true. | |||||
135 | ); | |||||
136 | } | |||||
137 | } else { | |||||
138 | # X is numeric but Y isn't | |||||
139 | $rv = Y_FIRST; | |||||
140 | last; | |||||
141 | } | |||||
142 | } elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring | |||||
143 | $rv = X_FIRST; | |||||
144 | last; | |||||
145 | } | |||||
146 | # else one of them is 0-length. | |||||
147 | ||||||
148 | # end-while | |||||
149 | } | |||||
150 | } | |||||
151 | EOGUTS | |||||
152 | ||||||
153 | sub maker { | |||||
154 | my $code = $_[0]; | |||||
155 | $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~"; | |||||
156 | 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | 5 1 4 5 2 3 | eval $code; | |||
157 | die $@ if $@; | |||||
158 | } | |||||
159 | ||||||
160 | ############################################################################## | |||||
161 | ||||||
162 | maker(<<'EONSORT'); | |||||
163 | sub nsort { | |||||
164 | # get options: | |||||
165 | my($cmp, $lc); | |||||
166 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; | |||||
167 | ||||||
168 | return @_ unless @_ > 1 or wantarray; # be clever | |||||
169 | ||||||
170 | my($x, $x2, $y, $y2, $rv); # scratch vars | |||||
171 | ||||||
172 | # We use a Schwartzian xform to memoize the lc'ing and \W-removal | |||||
173 | ||||||
174 | map $_->[0], | |||||
175 | sort { | |||||
176 | if($a->[0] eq $b->[0]) { 0 } # trap this expensive case | |||||
177 | else { | |||||
178 | ||||||
179 | $x = $a->[1]; | |||||
180 | $y = $b->[1]; | |||||
181 | ||||||
182 | ~COMPARATOR~ | |||||
183 | ||||||
184 | # Tiebreakers... | |||||
185 | DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; | |||||
186 | $rv ||= (length($x) <=> length($y)) # shorter is always first | |||||
187 | || ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0])) | |||||
188 | || ($x cmp $y ) | |||||
189 | || ($a->[0] cmp $b->[0]) | |||||
190 | ; | |||||
191 | ||||||
192 | DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; | |||||
193 | $rv; | |||||
194 | }} | |||||
195 | ||||||
196 | map {; | |||||
197 | $x = $lc ? $lc->($_) : lc($_); # x as scratch | |||||
198 | $x =~ s/\W+//s; | |||||
199 | [$_, $x]; | |||||
200 | } | |||||
201 | @_ | |||||
202 | } | |||||
203 | EONSORT | |||||
204 | ||||||
205 | #----------------------------------------------------------------------------- | |||||
206 | maker(<<'EONCMP'); | |||||
207 | sub ncmp { | |||||
208 | # The guts are basically the same as above... | |||||
209 | ||||||
210 | # get options: | |||||
211 | my($cmp, $lc); | |||||
212 | ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; | |||||
213 | ||||||
214 | if(@_ == 0) { | |||||
215 | @_ = ($a, $b); # bit of a hack! | |||||
216 | DEBUG > 1 and print "Hacking in <$a><$b>\n"; | |||||
217 | } elsif(@_ != 2) { | |||||
218 | require Carp; | |||||
219 | Carp::croak("Not enough options to ncmp!"); | |||||
220 | } | |||||
221 | my($a,$b) = @_; | |||||
222 | my($x, $x2, $y, $y2, $rv); # scratch vars | |||||
223 | ||||||
224 | DEBUG > 1 and print "ncmp args <$a><$b>\n"; | |||||
225 | if($a eq $b) { # trap this expensive case | |||||
226 | 0; | |||||
227 | } else { | |||||
228 | $x = ($lc ? $lc->($a) : lc($a)); | |||||
229 | $x =~ s/\W+//s; | |||||
230 | $y = ($lc ? $lc->($b) : lc($b)); | |||||
231 | $y =~ s/\W+//s; | |||||
232 | ||||||
233 | ~COMPARATOR~ | |||||
234 | ||||||
235 | ||||||
236 | # Tiebreakers... | |||||
237 | DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n"; | |||||
238 | $rv ||= (length($x) <=> length($y)) # shorter is always first | |||||
239 | || ($cmp and $cmp->($x,$y) || $cmp->($a,$b)) | |||||
240 | || ($x cmp $y) | |||||
241 | || ($a cmp $b) | |||||
242 | ; | |||||
243 | ||||||
244 | DEBUG > 1 and print " <$a> cmp <$b> is $rv\n"; | |||||
245 | $rv; | |||||
246 | } | |||||
247 | } | |||||
248 | EONCMP | |||||
249 | ||||||
250 | # clean up: | |||||
251 | undef $guts; | |||||
252 | undef &maker; | |||||
253 | ||||||
254 | #----------------------------------------------------------------------------- | |||||
255 | 1; | |||||
256 | ||||||
257 | ############### END OF MAIN SOURCE ########################################### |