Better distance function
[text-levenshtein-edlib.git] / lib / Text / Levenshtein / Edlib.pm
1 package Text::Levenshtein::Edlib;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 use Carp;
7 use parent qw/Exporter/;
8
9 my @constants =
10 qw/
11 EDLIB_CIGAR_EXTENDED
12 EDLIB_CIGAR_STANDARD
13 EDLIB_EDOP_DELETE
14 EDLIB_EDOP_INSERT
15 EDLIB_EDOP_MATCH
16 EDLIB_EDOP_MISMATCH
17 EDLIB_MODE_HW
18 EDLIB_MODE_NW
19 EDLIB_MODE_SHW
20 EDLIB_STATUS_ERROR
21 EDLIB_STATUS_OK
22 EDLIB_TASK_DISTANCE
23 EDLIB_TASK_LOC
24 EDLIB_TASK_PATH/;
25
26 our %EXPORT_TAGS =
27 (all => [ @constants, qw/align distance to_cigar/ ], constants => \@constants);
28 our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
29 our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } );
30 our $VERSION = '0.001';
31
32 require XSLoader;
33 XSLoader::load('Text::Levenshtein::Edlib', $VERSION);
34
35 sub AUTOLOAD {
36 # This AUTOLOAD is used to 'autoload' constants from the constant()
37 # XS function.
38
39 my $constname;
40 our $AUTOLOAD;
41 ($constname = $AUTOLOAD) =~ s/.*:://;
42 croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant';
43 my ($error, $val) = constant($constname);
44 if ($error) { croak $error; }
45 {
46 no strict 'refs';
47 *$AUTOLOAD = sub { $val };
48 }
49 goto &$AUTOLOAD;
50 }
51
52 sub align {
53 my ($q, $t, $k, $mode, $task) = @_;
54 $k //= -1;
55 $mode //= EDLIB_MODE_NW();
56 $task //= EDLIB_TASK_PATH();
57 my $result = edlibAlign($q, $t, $k, $mode, $task);
58 my ($dist, $alpha_len, $end, $start, $align) = @$result;
59 return {} if $dist == -1;
60 my %ret;
61 $ret{editDistance} = $dist;
62 $ret{alphabetLength} = $alpha_len;
63 $ret{endLocations} = $end if defined $end;
64 $ret{startLocations} = $start if defined $start;
65 $ret{alignment} = $align if defined $align;
66 \%ret
67 }
68
69 sub distance {
70 my ($q, $t, $k, $mode) = @_;
71 align($q, $t, $k, $mode, EDLIB_TASK_DISTANCE())->{editDistance}
72 }
73
74 sub to_cigar {
75 my ($align, $format) = @_;
76 $align = pack 'C*', @$align;
77 $format //= EDLIB_CIGAR_STANDARD();
78 edlibAlignmentToCigar($align, $format);
79 }
80
81 1;
82 __END__
83
84 =encoding utf-8
85
86 =head1 NAME
87
88 Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation
89
90 =head1 SYNOPSIS
91
92 use feature 'say';
93
94 use Text::Levenshtein::Edlib qw/:all/;
95 say distance 'kitten', 'sitting'; # prints '3'
96 say 'Distance > 2!'
97 if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!'
98
99 my $align = align('kitten', 'sitting');
100 say "Edit distance is: $align->{editDistance}";
101 say "Alphabet length is: $align->{alphabetLength}";
102 say "Start locations are: @{$align->{startLocations}}";
103 say "End locations are: @{$align->{endLocations}}";
104 say "Alignment path is: @{$align->{alignment}}";
105 say "Alignment path (in CIGAR format): ", to_cigar $align->{alignment};
106 say "Alignment path (in extended CIGAR format): ",
107 to_cigar $align->{alignment}, EDLIB_CIGAR_EXTENDED;
108
109 =head1 DESCRIPTION
110
111 Text::Levenshtein::Edlib is a wrapper around the edlib library that
112 computes Levenshtein edit distance and optimal alignment path for a
113 pair of strings.
114
115 It B<does not handle UTF-8 strings>, for those
116 L<Text::Levenshtein::XS> can compute edit distance but not alignment
117 path.
118
119 This module has two functions:
120
121 =over
122
123 =item B<distance>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>]])
124
125 This is the basic interface to the library. It is compatible with the
126 function of the same name in L<Text::Levenshtein::XS>.
127
128 It returns the edit distance between the two given strings. If the
129 third argument is specified, and the edit distance is greater than the
130 value of the third argument, then the function finishes the
131 computation early and returns undef. See below for the meaning of the
132 optional I<$mode> argument.
133
134 =item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]])
135
136 This is the full-featured interface to the library.
137
138 It returns a hashref with the following keys:
139
140 =over
141
142 =item C<editDistance>
143
144 The edit distance of the two strings.
145
146 =item C<alphabetLength>
147
148 The number of different characters in the query and target together.
149
150 =item C<endLocations>
151
152 Array of zero-based positions in target where optimal alignment paths
153 end. If gap after query is penalized, gap counts as part of query
154 (NW), otherwise not.
155
156 =item C<startLocations>
157
158 Array of zero-based positions in target where optimal alignment paths
159 start, they correspond to endLocations. If gap before query is
160 penalized, gap counts as part of query (NW), otherwise not.
161
162 =item C<alignment>
163
164 Alignment is found for first pair of start and end locations.
165 Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1
166 stands for insertion to target. 2 stands for insertion to query. 3
167 stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead
168 of 0, 1, 2, and 3. Alignment aligns query to target from begining of
169 query till end of query. If gaps are not penalized, they are not in
170 alignment.
171
172 =back
173
174 The third argument, I<$max_distance>, works similarly to the third
175 argument of B<distance>: if the distance is more than its value, this
176 function returns an empty hashref. Default value is -1, which disables
177 this optimization.
178
179 The fourth argument, I<$mode>, chooses how Edlib should treat gaps
180 before and after query. The options are:
181
182 =over
183
184 =item C<EDLIB_MODE_NW> (default)
185
186 Global method - gaps are not ignored. This is the standard Levenshtein
187 distance, and is the default if I<$mode> is not specified.
188
189 =item C<EDLIB_MODE_SHW>
190
191 Prefix method - gaps at query end are ignored. So the edit distance
192 between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC>
193 at the end.
194
195 =item C<EDLIB_MODE_HW>
196
197 Infix method - gaps at both query start and end are ignored. So the
198 edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at
199 the beginning and C<GAC> at the end of the target are ignored.
200
201 =back
202
203 The fifth argument, I<$task>, chooses what we want to compute.
204 If set to C<EDLIB_TASK_PATH> (default), all the keys described above will be computed.
205 If set to C<EDLIB_TASK_LOC>, all keys except for C<alignment> will be computed.
206 If set to C<EDLIB_TASK_DISTANCE>, all keys except for C<alignment> and C<startLocations> will be computed.
207 The less we compute, the faster the function will run.
208
209 =back
210
211 =head2 EXPORT
212
213 All constants by default. You can export the functions C<align> and
214 C<distance> and any of the constants below. You can use the tags
215 C<:constants> to export every constant, and C<:all> to export every
216 constant, C<align> and C<distance>.
217
218 =head2 Exportable constants
219
220 EDLIB_CIGAR_EXTENDED
221 EDLIB_CIGAR_STANDARD
222 EDLIB_EDOP_DELETE
223 EDLIB_EDOP_INSERT
224 EDLIB_EDOP_MATCH
225 EDLIB_EDOP_MISMATCH
226 EDLIB_MODE_HW
227 EDLIB_MODE_NW
228 EDLIB_MODE_SHW
229 EDLIB_STATUS_ERROR
230 EDLIB_STATUS_OK
231 EDLIB_TASK_DISTANCE
232 EDLIB_TASK_LOC
233 EDLIB_TASK_PATH
234
235 =head1 SEE ALSO
236
237 L<https://github.com/Martinsos/edlib/>, L<http://martinsosic.com/edlib/edlib_8h.html>
238
239 =head1 AUTHOR
240
241 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
242
243 =head1 COPYRIGHT AND LICENSE
244
245 Copyright (C) 2017 by Marius Gavrilescu
246
247 This library is free software; you can redistribute it and/or modify
248 it under the same terms as Perl itself, either Perl version 5.22.3 or,
249 at your option, any later version of Perl 5 you may have available.
250
251
252 =cut
This page took 0.032906 seconds and 4 git commands to generate.