]>
Commit | Line | Data |
---|---|---|
a99d15a3 MG |
1 | package Text::Levenshtein::Edlib; |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | use Carp; | |
7 | use parent qw/Exporter/; | |
8 | ||
9 | my @constants = | |
10 | qw/ | |
11 | EDLIB_CIGAR_EXTENDED | |
12 | EDLIB_CIGAR_STANDARD | |
13 | EDLIB_EDOP_DELETE | |
14 | EDLIB_EDOP_INSERT | |
15 | EDLIB_EDOP_MATCH | |
16 | EDLIB_EDOP_MISMATCH | |
17 | EDLIB_MODE_HW | |
18 | EDLIB_MODE_NW | |
19 | EDLIB_MODE_SHW | |
20 | EDLIB_STATUS_ERROR | |
21 | EDLIB_STATUS_OK | |
22 | EDLIB_TASK_DISTANCE | |
23 | EDLIB_TASK_LOC | |
24 | EDLIB_TASK_PATH/; | |
25 | ||
26 | our %EXPORT_TAGS = | |
329961ff | 27 | (all => [ @constants, qw/align distance to_cigar/ ], constants => \@constants); |
a99d15a3 MG |
28 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
29 | our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } ); | |
30 | our $VERSION = '0.001'; | |
31 | ||
32 | require XSLoader; | |
33 | XSLoader::load('Text::Levenshtein::Edlib', $VERSION); | |
34 | ||
35 | sub AUTOLOAD { | |
36 | # This AUTOLOAD is used to 'autoload' constants from the constant() | |
37 | # XS function. | |
38 | ||
39 | my $constname; | |
40 | our $AUTOLOAD; | |
41 | ($constname = $AUTOLOAD) =~ s/.*:://; | |
42 | croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant'; | |
43 | my ($error, $val) = constant($constname); | |
44 | if ($error) { croak $error; } | |
45 | { | |
46 | no strict 'refs'; | |
47 | *$AUTOLOAD = sub { $val }; | |
48 | } | |
49 | goto &$AUTOLOAD; | |
50 | } | |
51 | ||
52 | sub align { | |
53 | my ($q, $t, $k, $mode, $task) = @_; | |
54 | $k //= -1; | |
55 | $mode //= EDLIB_MODE_NW(); | |
56 | $task //= EDLIB_TASK_PATH(); | |
57 | my $result = edlibAlign($q, $t, $k, $mode, $task); | |
58 | my ($dist, $alpha_len, $end, $start, $align) = @$result; | |
59 | return {} if $dist == -1; | |
60 | my %ret; | |
61 | $ret{editDistance} = $dist; | |
62 | $ret{alphabetLength} = $alpha_len; | |
63 | $ret{endLocations} = $end if defined $end; | |
64 | $ret{startLocations} = $start if defined $start; | |
65 | $ret{alignment} = $align if defined $align; | |
66 | \%ret | |
67 | } | |
68 | ||
69 | sub distance { | |
70 | my ($q, $t, $k) = @_; | |
71 | align($q, $t, $k)->{editDistance} | |
72 | } | |
73 | ||
329961ff MG |
74 | sub to_cigar { |
75 | my ($align, $format) = @_; | |
76 | $align = pack 'C*', @$align; | |
77 | $format //= EDLIB_CIGAR_STANDARD(); | |
78 | edlibAlignmentToCigar($align, $format); | |
79 | } | |
80 | ||
a99d15a3 MG |
81 | 1; |
82 | __END__ | |
83 | ||
84 | =encoding utf-8 | |
85 | ||
86 | =head1 NAME | |
87 | ||
88 | Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation | |
89 | ||
90 | =head1 SYNOPSIS | |
91 | ||
92 | use feature 'say'; | |
93 | ||
94 | use Text::Levenshtein::Edlib qw/:all/; | |
95 | say distance 'kitten', 'sitting'; # prints '3' | |
96 | say 'Distance > 2!' | |
97 | if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!' | |
98 | ||
99 | my $align = align('kitten', 'sitting'); | |
100 | say "Edit distance is: $align->{editDistance}"; | |
101 | say "Alphabet length is: $align->{alphabetLength}"; | |
102 | say "Start locations are: @{$align->{startLocations}}"; | |
103 | say "End locations are: @{$align->{endLocations}}"; | |
104 | say "Alignment path is: @{$align->{alignment}}"; | |
329961ff MG |
105 | say "Alignment path (in CIGAR format): ", to_cigar $align->{alignment}; |
106 | say "Alignment path (in extended CIGAR format): ", | |
107 | to_cigar $align->{alignment}, EDLIB_CIGAR_EXTENDED; | |
a99d15a3 MG |
108 | |
109 | =head1 DESCRIPTION | |
110 | ||
111 | Text::Levenshtein::Edlib is a wrapper around the edlib library that | |
112 | computes Levenshtein edit distance and optimal alignment path for a | |
113 | pair of strings. | |
114 | ||
115 | It B<does not handle UTF-8 strings>, for those | |
116 | L<Text::Levenshtein::XS> can compute edit distance but not alignment | |
117 | path. | |
118 | ||
119 | This module has two functions: | |
120 | ||
121 | =over | |
122 | ||
123 | =item B<distance>(I<$query>, I<$target>, [I<$max_distance>]) | |
124 | ||
125 | This is the basic interface to the library. It is compatible with the | |
126 | function of the same name in L<Text::Levenshtein::XS>. | |
127 | ||
128 | It returns the edit distance between the two given strings. If the | |
129 | third argument is specified, and the edit distance is greater than the | |
130 | value of the third argument, then the function finishes the | |
131 | computation early and returns undef. | |
132 | ||
133 | =item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]]) | |
134 | ||
135 | This is the full-featured interface to the library. | |
136 | ||
137 | It returns a hashref with the following keys: | |
138 | ||
139 | =over | |
140 | ||
141 | =item C<editDistance> | |
142 | ||
143 | The edit distance of the two strings. | |
144 | ||
145 | =item C<alphabetLength> | |
146 | ||
147 | The number of different characters in the query and target together. | |
148 | ||
149 | =item C<endLocations> | |
150 | ||
151 | Array of zero-based positions in target where optimal alignment paths | |
152 | end. If gap after query is penalized, gap counts as part of query | |
153 | (NW), otherwise not. | |
154 | ||
155 | =item C<startLocations> | |
156 | ||
157 | Array of zero-based positions in target where optimal alignment paths | |
158 | start, they correspond to endLocations. If gap before query is | |
159 | penalized, gap counts as part of query (NW), otherwise not. | |
160 | ||
161 | =item C<alignment> | |
162 | ||
163 | Alignment is found for first pair of start and end locations. | |
164 | Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1 | |
165 | stands for insertion to target. 2 stands for insertion to query. 3 | |
166 | stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead | |
167 | of 0, 1, 2, and 3. Alignment aligns query to target from begining of | |
168 | query till end of query. If gaps are not penalized, they are not in | |
169 | alignment. | |
170 | ||
171 | =back | |
172 | ||
173 | The third argument, I<$max_distance>, works similarly to the third | |
174 | argument of B<distance>: if the distance is more than its value, this | |
175 | function returns an empty hashref. Default value is -1, which disables | |
176 | this optimization. | |
177 | ||
178 | The fourth argument, I<$mode>, chooses how Edlib should treat gaps | |
179 | before and after query. The options are: | |
180 | ||
181 | =over | |
182 | ||
183 | =item C<EDLIB_MODE_NW> (default) | |
184 | ||
185 | Global method - gaps are not ignored. This is the standard Levenshtein | |
186 | distance, and is the default if I<$mode> is not specified. | |
187 | ||
188 | =item C<EDLIB_MODE_SHW> | |
189 | ||
190 | Prefix method - gaps at query end are ignored. So the edit distance | |
191 | between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC> | |
192 | at the end. | |
193 | ||
194 | =item C<EDLIB_MODE_HW> | |
195 | ||
196 | Infix method - gaps at both query start and end are ignored. So the | |
197 | edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at | |
198 | the beginning and C<GAC> at the end of the target are ignored. | |
199 | ||
200 | =back | |
201 | ||
202 | The fifth argument, I<$task>, chooses what we want to compute. | |
203 | If set to C<EDLIB_TASK_PATH> (default), all the keys described above will be computed. | |
204 | If set to C<EDLIB_TASK_LOC>, all keys except for C<alignment> will be computed. | |
205 | If set to C<EDLIB_TASK_DISTANCE>, all keys except for C<alignment> and C<startLocations> will be computed. | |
206 | The less we compute, the faster the function will run. | |
207 | ||
208 | =back | |
209 | ||
210 | =head2 EXPORT | |
211 | ||
212 | All constants by default. You can export the functions C<align> and | |
213 | C<distance> and any of the constants below. You can use the tags | |
214 | C<:constants> to export every constant, and C<:all> to export every | |
215 | constant, C<align> and C<distance>. | |
216 | ||
217 | =head2 Exportable constants | |
218 | ||
219 | EDLIB_CIGAR_EXTENDED | |
220 | EDLIB_CIGAR_STANDARD | |
221 | EDLIB_EDOP_DELETE | |
222 | EDLIB_EDOP_INSERT | |
223 | EDLIB_EDOP_MATCH | |
224 | EDLIB_EDOP_MISMATCH | |
225 | EDLIB_MODE_HW | |
226 | EDLIB_MODE_NW | |
227 | EDLIB_MODE_SHW | |
228 | EDLIB_STATUS_ERROR | |
229 | EDLIB_STATUS_OK | |
230 | EDLIB_TASK_DISTANCE | |
231 | EDLIB_TASK_LOC | |
232 | EDLIB_TASK_PATH | |
233 | ||
234 | =head1 SEE ALSO | |
235 | ||
ed67b619 | 236 | L<https://github.com/Martinsos/edlib/>, L<http://martinsosic.com/edlib/edlib_8h.html> |
a99d15a3 MG |
237 | |
238 | =head1 AUTHOR | |
239 | ||
240 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
241 | ||
242 | =head1 COPYRIGHT AND LICENSE | |
243 | ||
244 | Copyright (C) 2017 by Marius Gavrilescu | |
245 | ||
246 | This library is free software; you can redistribute it and/or modify | |
247 | it under the same terms as Perl itself, either Perl version 5.22.3 or, | |
248 | at your option, any later version of Perl 5 you may have available. | |
249 | ||
250 | ||
251 | =cut |