]>
Commit | Line | Data |
---|---|---|
1 | package Text::Levenshtein::Edlib; | |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | use Carp; | |
7 | use parent qw/Exporter/; | |
8 | ||
9 | my @constants = | |
10 | qw/ | |
11 | EDLIB_CIGAR_EXTENDED | |
12 | EDLIB_CIGAR_STANDARD | |
13 | EDLIB_EDOP_DELETE | |
14 | EDLIB_EDOP_INSERT | |
15 | EDLIB_EDOP_MATCH | |
16 | EDLIB_EDOP_MISMATCH | |
17 | EDLIB_MODE_HW | |
18 | EDLIB_MODE_NW | |
19 | EDLIB_MODE_SHW | |
20 | EDLIB_STATUS_ERROR | |
21 | EDLIB_STATUS_OK | |
22 | EDLIB_TASK_DISTANCE | |
23 | EDLIB_TASK_LOC | |
24 | EDLIB_TASK_PATH/; | |
25 | ||
26 | our %EXPORT_TAGS = | |
27 | (all => [ @constants, qw/align distance/ ], constants => \@constants); | |
28 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | |
29 | our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } ); | |
30 | our $VERSION = '0.001'; | |
31 | ||
32 | require XSLoader; | |
33 | XSLoader::load('Text::Levenshtein::Edlib', $VERSION); | |
34 | ||
35 | sub AUTOLOAD { | |
36 | # This AUTOLOAD is used to 'autoload' constants from the constant() | |
37 | # XS function. | |
38 | ||
39 | my $constname; | |
40 | our $AUTOLOAD; | |
41 | ($constname = $AUTOLOAD) =~ s/.*:://; | |
42 | croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant'; | |
43 | my ($error, $val) = constant($constname); | |
44 | if ($error) { croak $error; } | |
45 | { | |
46 | no strict 'refs'; | |
47 | *$AUTOLOAD = sub { $val }; | |
48 | } | |
49 | goto &$AUTOLOAD; | |
50 | } | |
51 | ||
52 | sub align { | |
53 | my ($q, $t, $k, $mode, $task) = @_; | |
54 | $k //= -1; | |
55 | $mode //= EDLIB_MODE_NW(); | |
56 | $task //= EDLIB_TASK_PATH(); | |
57 | my $result = edlibAlign($q, $t, $k, $mode, $task); | |
58 | my ($dist, $alpha_len, $end, $start, $align) = @$result; | |
59 | return {} if $dist == -1; | |
60 | my %ret; | |
61 | $ret{editDistance} = $dist; | |
62 | $ret{alphabetLength} = $alpha_len; | |
63 | $ret{endLocations} = $end if defined $end; | |
64 | $ret{startLocations} = $start if defined $start; | |
65 | $ret{alignment} = $align if defined $align; | |
66 | \%ret | |
67 | } | |
68 | ||
69 | sub distance { | |
70 | my ($q, $t, $k) = @_; | |
71 | align($q, $t, $k)->{editDistance} | |
72 | } | |
73 | ||
74 | 1; | |
75 | __END__ | |
76 | ||
77 | =encoding utf-8 | |
78 | ||
79 | =head1 NAME | |
80 | ||
81 | Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation | |
82 | ||
83 | =head1 SYNOPSIS | |
84 | ||
85 | use feature 'say'; | |
86 | ||
87 | use Text::Levenshtein::Edlib qw/:all/; | |
88 | say distance 'kitten', 'sitting'; # prints '3' | |
89 | say 'Distance > 2!' | |
90 | if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!' | |
91 | ||
92 | my $align = align('kitten', 'sitting'); | |
93 | say "Edit distance is: $align->{editDistance}"; | |
94 | say "Alphabet length is: $align->{alphabetLength}"; | |
95 | say "Start locations are: @{$align->{startLocations}}"; | |
96 | say "End locations are: @{$align->{endLocations}}"; | |
97 | say "Alignment path is: @{$align->{alignment}}"; | |
98 | ||
99 | ||
100 | =head1 DESCRIPTION | |
101 | ||
102 | Text::Levenshtein::Edlib is a wrapper around the edlib library that | |
103 | computes Levenshtein edit distance and optimal alignment path for a | |
104 | pair of strings. | |
105 | ||
106 | It B<does not handle UTF-8 strings>, for those | |
107 | L<Text::Levenshtein::XS> can compute edit distance but not alignment | |
108 | path. | |
109 | ||
110 | This module has two functions: | |
111 | ||
112 | =over | |
113 | ||
114 | =item B<distance>(I<$query>, I<$target>, [I<$max_distance>]) | |
115 | ||
116 | This is the basic interface to the library. It is compatible with the | |
117 | function of the same name in L<Text::Levenshtein::XS>. | |
118 | ||
119 | It returns the edit distance between the two given strings. If the | |
120 | third argument is specified, and the edit distance is greater than the | |
121 | value of the third argument, then the function finishes the | |
122 | computation early and returns undef. | |
123 | ||
124 | =item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]]) | |
125 | ||
126 | This is the full-featured interface to the library. | |
127 | ||
128 | It returns a hashref with the following keys: | |
129 | ||
130 | =over | |
131 | ||
132 | =item C<editDistance> | |
133 | ||
134 | The edit distance of the two strings. | |
135 | ||
136 | =item C<alphabetLength> | |
137 | ||
138 | The number of different characters in the query and target together. | |
139 | ||
140 | =item C<endLocations> | |
141 | ||
142 | Array of zero-based positions in target where optimal alignment paths | |
143 | end. If gap after query is penalized, gap counts as part of query | |
144 | (NW), otherwise not. | |
145 | ||
146 | =item C<startLocations> | |
147 | ||
148 | Array of zero-based positions in target where optimal alignment paths | |
149 | start, they correspond to endLocations. If gap before query is | |
150 | penalized, gap counts as part of query (NW), otherwise not. | |
151 | ||
152 | =item C<alignment> | |
153 | ||
154 | Alignment is found for first pair of start and end locations. | |
155 | Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1 | |
156 | stands for insertion to target. 2 stands for insertion to query. 3 | |
157 | stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead | |
158 | of 0, 1, 2, and 3. Alignment aligns query to target from begining of | |
159 | query till end of query. If gaps are not penalized, they are not in | |
160 | alignment. | |
161 | ||
162 | =back | |
163 | ||
164 | The third argument, I<$max_distance>, works similarly to the third | |
165 | argument of B<distance>: if the distance is more than its value, this | |
166 | function returns an empty hashref. Default value is -1, which disables | |
167 | this optimization. | |
168 | ||
169 | The fourth argument, I<$mode>, chooses how Edlib should treat gaps | |
170 | before and after query. The options are: | |
171 | ||
172 | =over | |
173 | ||
174 | =item C<EDLIB_MODE_NW> (default) | |
175 | ||
176 | Global method - gaps are not ignored. This is the standard Levenshtein | |
177 | distance, and is the default if I<$mode> is not specified. | |
178 | ||
179 | =item C<EDLIB_MODE_SHW> | |
180 | ||
181 | Prefix method - gaps at query end are ignored. So the edit distance | |
182 | between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC> | |
183 | at the end. | |
184 | ||
185 | =item C<EDLIB_MODE_HW> | |
186 | ||
187 | Infix method - gaps at both query start and end are ignored. So the | |
188 | edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at | |
189 | the beginning and C<GAC> at the end of the target are ignored. | |
190 | ||
191 | =back | |
192 | ||
193 | The fifth argument, I<$task>, chooses what we want to compute. | |
194 | If set to C<EDLIB_TASK_PATH> (default), all the keys described above will be computed. | |
195 | If set to C<EDLIB_TASK_LOC>, all keys except for C<alignment> will be computed. | |
196 | If set to C<EDLIB_TASK_DISTANCE>, all keys except for C<alignment> and C<startLocations> will be computed. | |
197 | The less we compute, the faster the function will run. | |
198 | ||
199 | =back | |
200 | ||
201 | =head2 EXPORT | |
202 | ||
203 | All constants by default. You can export the functions C<align> and | |
204 | C<distance> and any of the constants below. You can use the tags | |
205 | C<:constants> to export every constant, and C<:all> to export every | |
206 | constant, C<align> and C<distance>. | |
207 | ||
208 | =head2 Exportable constants | |
209 | ||
210 | EDLIB_CIGAR_EXTENDED | |
211 | EDLIB_CIGAR_STANDARD | |
212 | EDLIB_EDOP_DELETE | |
213 | EDLIB_EDOP_INSERT | |
214 | EDLIB_EDOP_MATCH | |
215 | EDLIB_EDOP_MISMATCH | |
216 | EDLIB_MODE_HW | |
217 | EDLIB_MODE_NW | |
218 | EDLIB_MODE_SHW | |
219 | EDLIB_STATUS_ERROR | |
220 | EDLIB_STATUS_OK | |
221 | EDLIB_TASK_DISTANCE | |
222 | EDLIB_TASK_LOC | |
223 | EDLIB_TASK_PATH | |
224 | ||
225 | =head1 SEE ALSO | |
226 | ||
227 | L<https://github.com/Martinsos/edlib/> | |
228 | ||
229 | =head1 AUTHOR | |
230 | ||
231 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
232 | ||
233 | =head1 COPYRIGHT AND LICENSE | |
234 | ||
235 | Copyright (C) 2017 by Marius Gavrilescu | |
236 | ||
237 | This library is free software; you can redistribute it and/or modify | |
238 | it under the same terms as Perl itself, either Perl version 5.22.3 or, | |
239 | at your option, any later version of Perl 5 you may have available. | |
240 | ||
241 | ||
242 | =cut |