Bump version and update Changes
[text-levenshtein-edlib.git] / lib / Text / Levenshtein / Edlib.pm
CommitLineData
a99d15a3
MG
1package Text::Levenshtein::Edlib;
2
3use 5.014000;
4use strict;
5use warnings;
6use Carp;
7use parent qw/Exporter/;
8
9my @constants =
10 qw/
11 EDLIB_CIGAR_EXTENDED
12 EDLIB_CIGAR_STANDARD
13 EDLIB_EDOP_DELETE
14 EDLIB_EDOP_INSERT
15 EDLIB_EDOP_MATCH
16 EDLIB_EDOP_MISMATCH
17 EDLIB_MODE_HW
18 EDLIB_MODE_NW
19 EDLIB_MODE_SHW
20 EDLIB_STATUS_ERROR
21 EDLIB_STATUS_OK
22 EDLIB_TASK_DISTANCE
23 EDLIB_TASK_LOC
24 EDLIB_TASK_PATH/;
25
26our %EXPORT_TAGS =
329961ff 27 (all => [ @constants, qw/align distance to_cigar/ ], constants => \@constants);
a99d15a3
MG
28our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
29our @EXPORT = ( @{ $EXPORT_TAGS{'constants'} } );
c4b849c0 30our $VERSION = '0.001001';
a99d15a3
MG
31
32require XSLoader;
33XSLoader::load('Text::Levenshtein::Edlib', $VERSION);
34
35sub AUTOLOAD {
36 # This AUTOLOAD is used to 'autoload' constants from the constant()
37 # XS function.
38
39 my $constname;
40 our $AUTOLOAD;
41 ($constname = $AUTOLOAD) =~ s/.*:://;
42 croak "&Text::Levenshtein::Edlib::constant not defined" if $constname eq 'constant';
43 my ($error, $val) = constant($constname);
44 if ($error) { croak $error; }
45 {
46 no strict 'refs';
47 *$AUTOLOAD = sub { $val };
48 }
49 goto &$AUTOLOAD;
50}
51
52sub align {
53 my ($q, $t, $k, $mode, $task) = @_;
54 $k //= -1;
55 $mode //= EDLIB_MODE_NW();
56 $task //= EDLIB_TASK_PATH();
57 my $result = edlibAlign($q, $t, $k, $mode, $task);
58 my ($dist, $alpha_len, $end, $start, $align) = @$result;
59 return {} if $dist == -1;
60 my %ret;
61 $ret{editDistance} = $dist;
62 $ret{alphabetLength} = $alpha_len;
63 $ret{endLocations} = $end if defined $end;
64 $ret{startLocations} = $start if defined $start;
65 $ret{alignment} = $align if defined $align;
66 \%ret
67}
68
69sub distance {
159e7b25
MG
70 my ($q, $t, $k, $mode) = @_;
71 align($q, $t, $k, $mode, EDLIB_TASK_DISTANCE())->{editDistance}
a99d15a3
MG
72}
73
329961ff
MG
74sub to_cigar {
75 my ($align, $format) = @_;
76 $align = pack 'C*', @$align;
77 $format //= EDLIB_CIGAR_STANDARD();
78 edlibAlignmentToCigar($align, $format);
79}
80
a99d15a3
MG
811;
82__END__
83
84=encoding utf-8
85
86=head1 NAME
87
88Text::Levenshtein::Edlib - XS edit distance and optimal alignment path calculation
89
90=head1 SYNOPSIS
91
92 use feature 'say';
93
94 use Text::Levenshtein::Edlib qw/:all/;
95 say distance 'kitten', 'sitting'; # prints '3'
96 say 'Distance > 2!'
97 if !defined distance 'kitten', 'sitting', 2; # prints 'Distance > 2!'
98
99 my $align = align('kitten', 'sitting');
100 say "Edit distance is: $align->{editDistance}";
101 say "Alphabet length is: $align->{alphabetLength}";
102 say "Start locations are: @{$align->{startLocations}}";
103 say "End locations are: @{$align->{endLocations}}";
104 say "Alignment path is: @{$align->{alignment}}";
329961ff
MG
105 say "Alignment path (in CIGAR format): ", to_cigar $align->{alignment};
106 say "Alignment path (in extended CIGAR format): ",
107 to_cigar $align->{alignment}, EDLIB_CIGAR_EXTENDED;
a99d15a3
MG
108
109=head1 DESCRIPTION
110
111Text::Levenshtein::Edlib is a wrapper around the edlib library that
112computes Levenshtein edit distance and optimal alignment path for a
113pair of strings.
114
115It B<does not handle UTF-8 strings>, for those
116L<Text::Levenshtein::XS> can compute edit distance but not alignment
117path.
118
119This module has two functions:
120
121=over
122
159e7b25 123=item B<distance>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>]])
a99d15a3
MG
124
125This is the basic interface to the library. It is compatible with the
126function of the same name in L<Text::Levenshtein::XS>.
127
128It returns the edit distance between the two given strings. If the
129third argument is specified, and the edit distance is greater than the
130value of the third argument, then the function finishes the
159e7b25
MG
131computation early and returns undef. See below for the meaning of the
132optional I<$mode> argument.
a99d15a3
MG
133
134=item B<align>(I<$query>, I<$target>, [I<$max_distance>, [I<$mode>, [I<$task>]]])
135
136This is the full-featured interface to the library.
137
138It returns a hashref with the following keys:
139
140=over
141
142=item C<editDistance>
143
144The edit distance of the two strings.
145
146=item C<alphabetLength>
147
148The number of different characters in the query and target together.
149
150=item C<endLocations>
151
152Array of zero-based positions in target where optimal alignment paths
153end. If gap after query is penalized, gap counts as part of query
154(NW), otherwise not.
155
156=item C<startLocations>
157
158Array of zero-based positions in target where optimal alignment paths
159start, they correspond to endLocations. If gap before query is
160penalized, gap counts as part of query (NW), otherwise not.
161
162=item C<alignment>
163
164Alignment is found for first pair of start and end locations.
165Alignment is sequence of numbers: 0, 1, 2, 3. 0 stands for match. 1
166stands for insertion to target. 2 stands for insertion to query. 3
167stands for mismatch. You can use the C<EDLIB_EDOP_*> constants instead
168of 0, 1, 2, and 3. Alignment aligns query to target from begining of
169query till end of query. If gaps are not penalized, they are not in
170alignment.
171
172=back
173
174The third argument, I<$max_distance>, works similarly to the third
175argument of B<distance>: if the distance is more than its value, this
176function returns an empty hashref. Default value is -1, which disables
177this optimization.
178
179The fourth argument, I<$mode>, chooses how Edlib should treat gaps
180before and after query. The options are:
181
182=over
183
184=item C<EDLIB_MODE_NW> (default)
185
186Global method - gaps are not ignored. This is the standard Levenshtein
187distance, and is the default if I<$mode> is not specified.
188
189=item C<EDLIB_MODE_SHW>
190
191Prefix method - gaps at query end are ignored. So the edit distance
192between C<AACT> and C<AACTGGC> is 0, because we can ignore the C<GGC>
193at the end.
194
195=item C<EDLIB_MODE_HW>
196
197Infix method - gaps at both query start and end are ignored. So the
198edit distance between C<ACT> and C<CGACTGAC> is 0, because C<CG> at
199the beginning and C<GAC> at the end of the target are ignored.
200
201=back
202
a35dd4c3
MG
203The fifth argument, I<$task>, chooses what we want to compute. The options are:
204
205=over
206
207=item C<EDLIB_TASK_PATH> (default, slowest)
208
209All the keys described above will be computed.
210
211=item C<EDLIB_TASK_LOC>
212
213All keys except for C<alignment> will be computed.
214
215=item C<EDLIB_TASK_DISTANCE> (fastest)
216
217All keys except for C<alignment> and C<startLocations> will be computed.
218
219=back
220
221The less the function computes, the faster it runs.
a99d15a3
MG
222
223=back
224
225=head2 EXPORT
226
a35dd4c3
MG
227All constants by default. You can export the functions C<align>,
228C<distance> and C<to_cigar> and any of the constants below. You can
229use the tags C<:constants> to export every constant, and C<:all> to
230export every constant, C<align>, C<distance> and C<to_cigar>.
a99d15a3
MG
231
232=head2 Exportable constants
233
234 EDLIB_CIGAR_EXTENDED
235 EDLIB_CIGAR_STANDARD
236 EDLIB_EDOP_DELETE
237 EDLIB_EDOP_INSERT
238 EDLIB_EDOP_MATCH
239 EDLIB_EDOP_MISMATCH
240 EDLIB_MODE_HW
241 EDLIB_MODE_NW
242 EDLIB_MODE_SHW
243 EDLIB_STATUS_ERROR
244 EDLIB_STATUS_OK
245 EDLIB_TASK_DISTANCE
246 EDLIB_TASK_LOC
247 EDLIB_TASK_PATH
248
249=head1 SEE ALSO
250
ed67b619 251L<https://github.com/Martinsos/edlib/>, L<http://martinsosic.com/edlib/edlib_8h.html>
a99d15a3
MG
252
253=head1 AUTHOR
254
255Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
256
257=head1 COPYRIGHT AND LICENSE
258
259Copyright (C) 2017 by Marius Gavrilescu
260
261This library is free software; you can redistribute it and/or modify
262it under the same terms as Perl itself, either Perl version 5.22.3 or,
263at your option, any later version of Perl 5 you may have available.
264
265
266=cut
This page took 0.024687 seconds and 4 git commands to generate.