]>
Commit | Line | Data |
---|---|---|
1 | package Text::Levenshtein::Edlib; | |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | use Carp; | |
7 | use parent qw/Exporter/; | |
8 | ||
9 | my @constants = | |
10 | qw/ | |
11 | EDLIB_CIGAR_EXTENDED | |
12 | EDLIB_CIGAR_STANDARD | |
13 | EDLIB_EDOP_DELETE | |
14 | EDLIB_EDOP_INSERT | |
15 | EDLIB_EDOP_MATCH | |
16 | EDLIB_EDOP_MISMATCH | |
17 | EDLIB_MODE_HW | |
18 | EDLIB_MODE_NW | |
19 | EDLIB_MODE_SHW | |
20 | EDLIB_STATUS_ERROR | |
21 | EDLIB_STATUS_OK | |
22 | EDLIB_TASK_DISTANCE | |
23 | EDLIB_TASK_LOC | |
24 | EDLIB_TASK_PATH/; | |
25 | ||
26 | our %EXPORT_TAGS = | |
27 | (all => [ @constants, qw/align distance to_cigar/ ], constants => \@constants); | |
28 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | |
29 | our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } ); | |
30 | our $VERSION = '0.001001'; | |
31 | ||
32 | require XSLoader; | |
33 | XSLoader::load('Text::Levenshtein::Edlib', $VERSION); | |
34 | ||
35 | sub AUTOLOAD { | |
36 | # This AUTOLOAD is used to 'autoload' constants from the constant() | |
37 | # XS function. | |
38 | ||
39 | my $constname; | |
40 | our $AUTOLOAD; | |
41 | ($constname = $AUTOLOAD) =~ s/.*:://; | |
42 | croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant'; | |
43 | my ($error, $val) = constant($constname); | |
44 | if ($error) { croak $error; } | |
45 | { | |
46 | no strict 'refs'; | |
47 | *$AUTOLOAD = sub { $val }; | |
48 | } | |
49 | goto &$AUTOLOAD; | |
50 | } | |
51 | ||
52 | sub align { | |
53 | my ($q, $t, $k, $mode, $task) = @_; | |
54 | $k //= -1; | |
55 | $mode //= EDLIB_MODE_NW(); | |
56 | $task //= EDLIB_TASK_PATH(); | |
57 | my $result = edlibAlign($q, $t, $k, $mode, $task); | |
58 | my ($dist, $alpha_len, $end, $start, $align) = @$result; | |
59 | return {} if $dist == -1; | |
60 | my %ret; | |
61 | $ret{editDistance} = $dist; | |
62 | $ret{alphabetLength} = $alpha_len; | |
63 | $ret{endLocations} = $end if defined $end; | |
64 | $ret{startLocations} = $start if defined $start; | |
65 | $ret{alignment} = $align if defined $align; | |
66 | \%ret | |
67 | } | |
68 | ||
69 | sub distance { | |
70 | my ($q, $t, $k, $mode) = @_; | |
71 | align($q, $t, $k, $mode, EDLIB_TASK_DISTANCE())->{editDistance} | |
72 | } | |
73 | ||
74 | sub to_cigar { | |
75 | my ($align, $format) = @_; | |
76 | $align = pack 'C*', @$align; | |
77 | $format //= EDLIB_CIGAR_STANDARD(); | |
78 | edlibAlignmentToCigar($align, $format); | |
79 | } | |
80 | ||
81 | 1; | |
82 | __END__ | |
83 | ||
84 | =encoding utf-8 | |
85 | ||
86 | =head1 NAME | |
87 | ||
88 | Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation | |
89 | ||
90 | =head1 SYNOPSIS | |
91 | ||
92 | use feature 'say'; | |
93 | ||
94 | use Text::Levenshtein::Edlib qw/:all/; | |
95 | say distance 'kitten', 'sitting'; # prints '3' | |
96 | say 'Distance > 2!' | |
97 | if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!' | |
98 | ||
99 | my $align = align('kitten', 'sitting'); | |
100 | say "Edit distance is: $align->{editDistance}"; | |
101 | say "Alphabet length is: $align->{alphabetLength}"; | |
102 | say "Start locations are: @{$align->{startLocations}}"; | |
103 | say "End locations are: @{$align->{endLocations}}"; | |
104 | say "Alignment path is: @{$align->{alignment}}"; | |
105 | say "Alignment path (in CIGAR format): ", to_cigar $align->{alignment}; | |
106 | say "Alignment path (in extended CIGAR format): ", | |
107 | to_cigar $align->{alignment}, EDLIB_CIGAR_EXTENDED; | |
108 | ||
109 | =head1 DESCRIPTION | |
110 | ||
111 | Text::Levenshtein::Edlib is a wrapper around the edlib library that | |
112 | computes Levenshtein edit distance and optimal alignment path for a | |
113 | pair of strings. | |
114 | ||
115 | It B<does not handle UTF-8 strings>, for those | |
116 | L<Text::Levenshtein::XS> can compute edit distance but not alignment | |
117 | path. | |
118 | ||
119 | This module has two functions: | |
120 | ||
121 | =over | |
122 | ||
123 | =item B<distance>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>]]) | |
124 | ||
125 | This is the basic interface to the library. It is compatible with the | |
126 | function of the same name in L<Text::Levenshtein::XS>. | |
127 | ||
128 | It returns the edit distance between the two given strings. If the | |
129 | third argument is specified, and the edit distance is greater than the | |
130 | value of the third argument, then the function finishes the | |
131 | computation early and returns undef. See below for the meaning of the | |
132 | optional I<$mode> argument. | |
133 | ||
134 | =item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]]) | |
135 | ||
136 | This is the full-featured interface to the library. | |
137 | ||
138 | It returns a hashref with the following keys: | |
139 | ||
140 | =over | |
141 | ||
142 | =item C<editDistance> | |
143 | ||
144 | The edit distance of the two strings. | |
145 | ||
146 | =item C<alphabetLength> | |
147 | ||
148 | The number of different characters in the query and target together. | |
149 | ||
150 | =item C<endLocations> | |
151 | ||
152 | Array of zero-based positions in target where optimal alignment paths | |
153 | end. If gap after query is penalized, gap counts as part of query | |
154 | (NW), otherwise not. | |
155 | ||
156 | =item C<startLocations> | |
157 | ||
158 | Array of zero-based positions in target where optimal alignment paths | |
159 | start, they correspond to endLocations. If gap before query is | |
160 | penalized, gap counts as part of query (NW), otherwise not. | |
161 | ||
162 | =item C<alignment> | |
163 | ||
164 | Alignment is found for first pair of start and end locations. | |
165 | Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1 | |
166 | stands for insertion to target. 2 stands for insertion to query. 3 | |
167 | stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead | |
168 | of 0, 1, 2, and 3. Alignment aligns query to target from begining of | |
169 | query till end of query. If gaps are not penalized, they are not in | |
170 | alignment. | |
171 | ||
172 | =back | |
173 | ||
174 | The third argument, I<$max_distance>, works similarly to the third | |
175 | argument of B<distance>: if the distance is more than its value, this | |
176 | function returns an empty hashref. Default value is -1, which disables | |
177 | this optimization. | |
178 | ||
179 | The fourth argument, I<$mode>, chooses how Edlib should treat gaps | |
180 | before and after query. The options are: | |
181 | ||
182 | =over | |
183 | ||
184 | =item C<EDLIB_MODE_NW> (default) | |
185 | ||
186 | Global method - gaps are not ignored. This is the standard Levenshtein | |
187 | distance, and is the default if I<$mode> is not specified. | |
188 | ||
189 | =item C<EDLIB_MODE_SHW> | |
190 | ||
191 | Prefix method - gaps at query end are ignored. So the edit distance | |
192 | between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC> | |
193 | at the end. | |
194 | ||
195 | =item C<EDLIB_MODE_HW> | |
196 | ||
197 | Infix method - gaps at both query start and end are ignored. So the | |
198 | edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at | |
199 | the beginning and C<GAC> at the end of the target are ignored. | |
200 | ||
201 | =back | |
202 | ||
203 | The fifth argument, I<$task>, chooses what we want to compute. The options are: | |
204 | ||
205 | =over | |
206 | ||
207 | =item C<EDLIB_TASK_PATH> (default, slowest) | |
208 | ||
209 | All the keys described above will be computed. | |
210 | ||
211 | =item C<EDLIB_TASK_LOC> | |
212 | ||
213 | All keys except for C<alignment> will be computed. | |
214 | ||
215 | =item C<EDLIB_TASK_DISTANCE> (fastest) | |
216 | ||
217 | All keys except for C<alignment> and C<startLocations> will be computed. | |
218 | ||
219 | =back | |
220 | ||
221 | The less the function computes, the faster it runs. | |
222 | ||
223 | =back | |
224 | ||
225 | =head2 EXPORT | |
226 | ||
227 | All constants by default. You can export the functions C<align>, | |
228 | C<distance> and C<to_cigar> and any of the constants below. You can | |
229 | use the tags C<:constants> to export every constant, and C<:all> to | |
230 | export every constant, C<align>, C<distance> and C<to_cigar>. | |
231 | ||
232 | =head2 Exportable constants | |
233 | ||
234 | EDLIB_CIGAR_EXTENDED | |
235 | EDLIB_CIGAR_STANDARD | |
236 | EDLIB_EDOP_DELETE | |
237 | EDLIB_EDOP_INSERT | |
238 | EDLIB_EDOP_MATCH | |
239 | EDLIB_EDOP_MISMATCH | |
240 | EDLIB_MODE_HW | |
241 | EDLIB_MODE_NW | |
242 | EDLIB_MODE_SHW | |
243 | EDLIB_STATUS_ERROR | |
244 | EDLIB_STATUS_OK | |
245 | EDLIB_TASK_DISTANCE | |
246 | EDLIB_TASK_LOC | |
247 | EDLIB_TASK_PATH | |
248 | ||
249 | =head1 SEE ALSO | |
250 | ||
251 | L<https://github.com/Martinsos/edlib/>, L<http://martinsosic.com/edlib/edlib_8h.html> | |
252 | ||
253 | =head1 AUTHOR | |
254 | ||
255 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
256 | ||
257 | =head1 COPYRIGHT AND LICENSE | |
258 | ||
259 | Copyright (C) 2017 by Marius Gavrilescu | |
260 | ||
261 | This library is free software; you can redistribute it and/or modify | |
262 | it under the same terms as Perl itself, either Perl version 5.22.3 or, | |
263 | at your option, any later version of Perl 5 you may have available. | |
264 | ||
265 | ||
266 | =cut |