Initial commit
[text-levenshtein-edlib.git] / lib / Text / Levenshtein / Edlib.pm
1 package Text::Levenshtein::Edlib;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 use Carp;
7 use parent qw/Exporter/;
8
9 my @constants =
10 qw/
11 EDLIB_CIGAR_EXTENDED
12 EDLIB_CIGAR_STANDARD
13 EDLIB_EDOP_DELETE
14 EDLIB_EDOP_INSERT
15 EDLIB_EDOP_MATCH
16 EDLIB_EDOP_MISMATCH
17 EDLIB_MODE_HW
18 EDLIB_MODE_NW
19 EDLIB_MODE_SHW
20 EDLIB_STATUS_ERROR
21 EDLIB_STATUS_OK
22 EDLIB_TASK_DISTANCE
23 EDLIB_TASK_LOC
24 EDLIB_TASK_PATH/;
25
26 our %EXPORT_TAGS =
27 (all => [ @constants, qw/align distance/ ], constants => \@constants);
28 our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
29 our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } );
30 our $VERSION = '0.001';
31
32 require XSLoader;
33 XSLoader::load('Text::Levenshtein::Edlib', $VERSION);
34
35 sub AUTOLOAD {
36 # This AUTOLOAD is used to 'autoload' constants from the constant()
37 # XS function.
38
39 my $constname;
40 our $AUTOLOAD;
41 ($constname = $AUTOLOAD) =~ s/.*:://;
42 croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant';
43 my ($error, $val) = constant($constname);
44 if ($error) { croak $error; }
45 {
46 no strict 'refs';
47 *$AUTOLOAD = sub { $val };
48 }
49 goto &$AUTOLOAD;
50 }
51
52 sub align {
53 my ($q, $t, $k, $mode, $task) = @_;
54 $k //= -1;
55 $mode //= EDLIB_MODE_NW();
56 $task //= EDLIB_TASK_PATH();
57 my $result = edlibAlign($q, $t, $k, $mode, $task);
58 my ($dist, $alpha_len, $end, $start, $align) = @$result;
59 return {} if $dist == -1;
60 my %ret;
61 $ret{editDistance} = $dist;
62 $ret{alphabetLength} = $alpha_len;
63 $ret{endLocations} = $end if defined $end;
64 $ret{startLocations} = $start if defined $start;
65 $ret{alignment} = $align if defined $align;
66 \%ret
67 }
68
69 sub distance {
70 my ($q, $t, $k) = @_;
71 align($q, $t, $k)->{editDistance}
72 }
73
74 1;
75 __END__
76
77 =encoding utf-8
78
79 =head1 NAME
80
81 Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation
82
83 =head1 SYNOPSIS
84
85 use feature 'say';
86
87 use Text::Levenshtein::Edlib qw/:all/;
88 say distance 'kitten', 'sitting'; # prints '3'
89 say 'Distance > 2!'
90 if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!'
91
92 my $align = align('kitten', 'sitting');
93 say "Edit distance is: $align->{editDistance}";
94 say "Alphabet length is: $align->{alphabetLength}";
95 say "Start locations are: @{$align->{startLocations}}";
96 say "End locations are: @{$align->{endLocations}}";
97 say "Alignment path is: @{$align->{alignment}}";
98
99
100 =head1 DESCRIPTION
101
102 Text::Levenshtein::Edlib is a wrapper around the edlib library that
103 computes Levenshtein edit distance and optimal alignment path for a
104 pair of strings.
105
106 It B<does not handle UTF-8 strings>, for those
107 L<Text::Levenshtein::XS> can compute edit distance but not alignment
108 path.
109
110 This module has two functions:
111
112 =over
113
114 =item B<distance>(I<$query>, I<$target>, [I<$max_distance>])
115
116 This is the basic interface to the library. It is compatible with the
117 function of the same name in L<Text::Levenshtein::XS>.
118
119 It returns the edit distance between the two given strings. If the
120 third argument is specified, and the edit distance is greater than the
121 value of the third argument, then the function finishes the
122 computation early and returns undef.
123
124 =item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]])
125
126 This is the full-featured interface to the library.
127
128 It returns a hashref with the following keys:
129
130 =over
131
132 =item C<editDistance>
133
134 The edit distance of the two strings.
135
136 =item C<alphabetLength>
137
138 The number of different characters in the query and target together.
139
140 =item C<endLocations>
141
142 Array of zero-based positions in target where optimal alignment paths
143 end. If gap after query is penalized, gap counts as part of query
144 (NW), otherwise not.
145
146 =item C<startLocations>
147
148 Array of zero-based positions in target where optimal alignment paths
149 start, they correspond to endLocations. If gap before query is
150 penalized, gap counts as part of query (NW), otherwise not.
151
152 =item C<alignment>
153
154 Alignment is found for first pair of start and end locations.
155 Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1
156 stands for insertion to target. 2 stands for insertion to query. 3
157 stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead
158 of 0, 1, 2, and 3. Alignment aligns query to target from begining of
159 query till end of query. If gaps are not penalized, they are not in
160 alignment.
161
162 =back
163
164 The third argument, I<$max_distance>, works similarly to the third
165 argument of B<distance>: if the distance is more than its value, this
166 function returns an empty hashref. Default value is -1, which disables
167 this optimization.
168
169 The fourth argument, I<$mode>, chooses how Edlib should treat gaps
170 before and after query. The options are:
171
172 =over
173
174 =item C<EDLIB_MODE_NW> (default)
175
176 Global method - gaps are not ignored. This is the standard Levenshtein
177 distance, and is the default if I<$mode> is not specified.
178
179 =item C<EDLIB_MODE_SHW>
180
181 Prefix method - gaps at query end are ignored. So the edit distance
182 between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC>
183 at the end.
184
185 =item C<EDLIB_MODE_HW>
186
187 Infix method - gaps at both query start and end are ignored. So the
188 edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at
189 the beginning and C<GAC> at the end of the target are ignored.
190
191 =back
192
193 The fifth argument, I<$task>, chooses what we want to compute.
194 If set to C<EDLIB_TASK_PATH> (default), all the keys described above will be computed.
195 If set to C<EDLIB_TASK_LOC>, all keys except for C<alignment> will be computed.
196 If set to C<EDLIB_TASK_DISTANCE>, all keys except for C<alignment> and C<startLocations> will be computed.
197 The less we compute, the faster the function will run.
198
199 =back
200
201 =head2 EXPORT
202
203 All constants by default. You can export the functions C<align> and
204 C<distance> and any of the constants below. You can use the tags
205 C<:constants> to export every constant, and C<:all> to export every
206 constant, C<align> and C<distance>.
207
208 =head2 Exportable constants
209
210 EDLIB_CIGAR_EXTENDED
211 EDLIB_CIGAR_STANDARD
212 EDLIB_EDOP_DELETE
213 EDLIB_EDOP_INSERT
214 EDLIB_EDOP_MATCH
215 EDLIB_EDOP_MISMATCH
216 EDLIB_MODE_HW
217 EDLIB_MODE_NW
218 EDLIB_MODE_SHW
219 EDLIB_STATUS_ERROR
220 EDLIB_STATUS_OK
221 EDLIB_TASK_DISTANCE
222 EDLIB_TASK_LOC
223 EDLIB_TASK_PATH
224
225 =head1 SEE ALSO
226
227 L<https://github.com/Martinsos/edlib/>
228
229 =head1 AUTHOR
230
231 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
232
233 =head1 COPYRIGHT AND LICENSE
234
235 Copyright (C) 2017 by Marius Gavrilescu
236
237 This library is free software; you can redistribute it and/or modify
238 it under the same terms as Perl itself, either Perl version 5.22.3 or,
239 at your option, any later version of Perl 5 you may have available.
240
241
242 =cut
This page took 0.03161 seconds and 4 git commands to generate.