Improve POD
[aard.git] / lib / Aard.pm
CommitLineData
1b53d9d3
MG
1package Aard;
2
3use 5.014000;
4use strict;
5use warnings;
6our $VERSION = '0.000_001';
7
8use IO::Uncompress::Inflate qw/inflate/;
9use IO::Uncompress::Bunzip2 qw/bunzip2/;
10use List::Util qw/sum/;
11
12use JSON qw/decode_json/;
13use UUID::Tiny qw/uuid_to_string/;
14
15use constant HEADER_SPEC => [
16 [signature => 'Z4' , 4 ],
17 [sha1sum => 'Z40', 40],
18 [version => 'S>' , 2 ],
19 [uuid => 'Z16', 16],
20 [volume => 'S>' , 2 ],
21 [total_volumes => 'S>' , 2 ],
22 [meta_length => 'L>' , 4 ],
23 [index_count => 'L>' , 4 ],
24 [article_offset => 'L>' , 4 ],
25 [index1_item_format => 'Z4' , 4 ],
26 [key_length_format => 'Z2' , 2 ],
27 [article_length_format => 'Z2' , 2 ],
28];
29
30my $header_length = sum map { $_->[2] } @{HEADER_SPEC()};
31
32sub decompress {
33 my ($input) = @_;
34 my $output = $input;
35 inflate \$input => \$output;
36 bunzip2 \$input => \$output if $input =~ /^BZ/;
37 $output
38}
39
40sub read_at {
41 my ($self, $offset, $length) = @_;
42 my $fh = $self->{fh};
43 my $part;
44 seek $fh, $offset, 0;
45 read $fh, $part, $length;
46 $part
47}
48
49sub index1 {
50 my ($self, $index) = @_;
51 unless (exists $self->{index1}{$index}) {
52 my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length});
53 $self->{index1}{$index} = [unpack $self->{index_format}, $part]
54 }
55 $self->{index1}{$index}
56}
57
58sub fh { shift->{fh} }
59sub sha1sum { shift->{sha1sum} }
60sub uuid { shift->{uuid} }
61sub uuid_string { uuid_to_string shift->uuid }
62sub volume { shift->{volume} }
63sub total_volumes { shift->{total_volumes} }
64sub count { shift->{index_count} }
65
66sub meta { shift->{meta} }
67sub article_count { shift->meta->{article_count} }
68sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} }
69sub index_language { shift->meta->{index_language} }
70sub article_language { shift->meta->{article_language} }
71sub title { shift->meta->{title} }
72sub version { shift->meta->{version} }
73sub description { shift->meta->{description} }
74sub copyright { shift->meta->{copyright} }
75sub license { shift->meta->{license} }
76sub source { shift->meta->{source} }
77
78sub key {
79 my ($self, $index) = @_;
80 unless (exists $self->{key}{$index}) {
81 my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2);
82 my $len = unpack 'S>', $part;
83 read $self->{fh}, $self->{key}{$index}, $len;
84 }
85 $self->{key}{$index}
86}
87
88sub article {
89 my ($self, $index) = @_;
90 unless (exists $self->{article}{$index}) {
91 my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4);
92 my $len = unpack 'L>', $part;
93 read $self->{fh}, $part, $len;
94 $self->{article}{$index} = decompress $part
95 }
96 $self->{article}{$index}
97}
98
99sub new {
100 my ($self, $file) = @_;
101 open my $fh, '<', $file or die $!;
102 binmode $fh;
103 my %header;
104 for (@{HEADER_SPEC()}) {
105 read $fh, my $part, $_->[2];
106 $header{$_->[0]} = unpack $_->[1], $part;
107 }
108
109 die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard';
110 die 'Unknown file format version' if $header{version} != 1;
111
112 read $fh, my $meta, $header{meta_length};
113 $meta = decode_json decompress $meta;
114
115 my %obj = (
116 %header,
117 fh => $fh,
118 meta => $meta,
119 index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'),
120 index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12),
121 );
122 $obj{index1_offset} = $header_length + $obj{meta_length};
123 $obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length};
124 bless \%obj, $self
125}
126
1271;
128__END__
129
130=head1 NAME
131
132Aard - Read aarddict dictionaries
133
134=head1 SYNOPSIS
135
136 use Aard;
137 my $dict = Aard->new('something.aar');
138 printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
139 printf "The tenth entry's key: %s\n", $dict->key(9);
140 printf "The tenth entry's value: %s\n", $dict->article(9);
141
142=head1 DESCRIPTION
143
144Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.
145
146=over
147
148=item B<new>(I<filename>)
149
150Creates a new Aard object for the given file.
151
152=item B<fh>
153
154Returns the open filehandle to the dictionary.
155
156=item B<count>
157
158Returns the number of entries in this dictionary.
159
160=item B<key>(I<index>)
161
162Returns the key of the I<index>th element. This method caches the keys.
163
164=item B<article>(I<index>)
165
166Returns the article of the I<index>th element. This method caches the articles.
167
168=item B<uuid>
169
170Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.
171
172=item B<uuid_string>
173
174Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.
175
176=item B<volume>
177
178Returns the volume number of this file.
179
180=item B<total_volumes>
181
182Returns the total number of volumes for this dictionary.
183
184=item B<meta>
185
186Returns the raw metadata as a hashref.
187
188=item B<article_count>
189
190Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).
191
192=item B<article_count_is_volume_total>
193
194Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0.
195
196=item B<index_language>
197
87fce6ae
MG
198Returns the dictionary's "from" language (two or three letter ISO code)
199
1b53d9d3
MG
200=item B<article_language>
201
87fce6ae
MG
202Returns the dictionary's "to" language (two or three letter ISO code)
203
1b53d9d3
MG
204=item B<title>
205
87fce6ae
MG
206Returns the dictionary title
207
1b53d9d3
MG
208=item B<version>
209
87fce6ae
MG
210Returns the dictionary version
211
1b53d9d3
MG
212=item B<description>
213
87fce6ae
MG
214Returns the dictionary description
215
1b53d9d3
MG
216=item B<copyright>
217
87fce6ae
MG
218Returns the copyright notice
219
1b53d9d3
MG
220=item B<license>
221
87fce6ae
MG
222Returns the full license text
223
1b53d9d3
MG
224=item B<source>
225
87fce6ae
MG
226Returns the dictionary data source
227
1b53d9d3
MG
228=back
229
230=head1 SEE ALSO
231
232L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>
233
234=head1 AUTHOR
235
236Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
237
238=head1 COPYRIGHT AND LICENSE
239
240Copyright (C) 2014 by Marius Gavrilescu
241
242This library is free software; you can redistribute it and/or modify
243it under the same terms as Perl itself, either Perl version 5.18.2 or,
244at your option, any later version of Perl 5 you may have available.
245
246
247=cut
This page took 0.024799 seconds and 4 git commands to generate.