]>
Commit | Line | Data |
---|---|---|
1b53d9d3 MG |
1 | package Aard; |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
4fa2eeb6 | 6 | our $VERSION = '0.001'; |
1b53d9d3 MG |
7 | |
8 | use IO::Uncompress::Inflate qw/inflate/; | |
9 | use IO::Uncompress::Bunzip2 qw/bunzip2/; | |
10 | use List::Util qw/sum/; | |
11 | ||
fef4a7db | 12 | use JSON::MaybeXS qw/decode_json/; |
1b53d9d3 MG |
13 | use UUID::Tiny qw/uuid_to_string/; |
14 | ||
15 | use constant HEADER_SPEC => [ | |
16 | [signature => 'Z4' , 4 ], | |
17 | [sha1sum => 'Z40', 40], | |
18 | [version => 'S>' , 2 ], | |
19 | [uuid => 'Z16', 16], | |
20 | [volume => 'S>' , 2 ], | |
21 | [total_volumes => 'S>' , 2 ], | |
22 | [meta_length => 'L>' , 4 ], | |
23 | [index_count => 'L>' , 4 ], | |
24 | [article_offset => 'L>' , 4 ], | |
25 | [index1_item_format => 'Z4' , 4 ], | |
26 | [key_length_format => 'Z2' , 2 ], | |
27 | [article_length_format => 'Z2' , 2 ], | |
28 | ]; | |
29 | ||
30 | my $header_length = sum map { $_->[2] } @{HEADER_SPEC()}; | |
31 | ||
32 | sub decompress { | |
33 | my ($input) = @_; | |
34 | my $output = $input; | |
35 | inflate \$input => \$output; | |
36 | bunzip2 \$input => \$output if $input =~ /^BZ/; | |
37 | $output | |
38 | } | |
39 | ||
40 | sub read_at { | |
41 | my ($self, $offset, $length) = @_; | |
42 | my $fh = $self->{fh}; | |
43 | my $part; | |
44 | seek $fh, $offset, 0; | |
45 | read $fh, $part, $length; | |
46 | $part | |
47 | } | |
48 | ||
49 | sub index1 { | |
50 | my ($self, $index) = @_; | |
51 | unless (exists $self->{index1}{$index}) { | |
52 | my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length}); | |
53 | $self->{index1}{$index} = [unpack $self->{index_format}, $part] | |
54 | } | |
55 | $self->{index1}{$index} | |
56 | } | |
57 | ||
58 | sub fh { shift->{fh} } | |
59 | sub sha1sum { shift->{sha1sum} } | |
60 | sub uuid { shift->{uuid} } | |
61 | sub uuid_string { uuid_to_string shift->uuid } | |
62 | sub volume { shift->{volume} } | |
63 | sub total_volumes { shift->{total_volumes} } | |
64 | sub count { shift->{index_count} } | |
65 | ||
66 | sub meta { shift->{meta} } | |
67 | sub article_count { shift->meta->{article_count} } | |
68 | sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} } | |
69 | sub index_language { shift->meta->{index_language} } | |
70 | sub article_language { shift->meta->{article_language} } | |
71 | sub title { shift->meta->{title} } | |
72 | sub version { shift->meta->{version} } | |
73 | sub description { shift->meta->{description} } | |
74 | sub copyright { shift->meta->{copyright} } | |
75 | sub license { shift->meta->{license} } | |
76 | sub source { shift->meta->{source} } | |
77 | ||
78 | sub key { | |
79 | my ($self, $index) = @_; | |
80 | unless (exists $self->{key}{$index}) { | |
81 | my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2); | |
82 | my $len = unpack 'S>', $part; | |
83 | read $self->{fh}, $self->{key}{$index}, $len; | |
84 | } | |
85 | $self->{key}{$index} | |
86 | } | |
87 | ||
88 | sub article { | |
89 | my ($self, $index) = @_; | |
90 | unless (exists $self->{article}{$index}) { | |
91 | my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4); | |
92 | my $len = unpack 'L>', $part; | |
93 | read $self->{fh}, $part, $len; | |
94 | $self->{article}{$index} = decompress $part | |
95 | } | |
96 | $self->{article}{$index} | |
97 | } | |
98 | ||
99 | sub new { | |
100 | my ($self, $file) = @_; | |
101 | open my $fh, '<', $file or die $!; | |
102 | binmode $fh; | |
103 | my %header; | |
104 | for (@{HEADER_SPEC()}) { | |
105 | read $fh, my $part, $_->[2]; | |
106 | $header{$_->[0]} = unpack $_->[1], $part; | |
107 | } | |
108 | ||
109 | die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard'; | |
110 | die 'Unknown file format version' if $header{version} != 1; | |
111 | ||
112 | read $fh, my $meta, $header{meta_length}; | |
113 | $meta = decode_json decompress $meta; | |
114 | ||
115 | my %obj = ( | |
116 | %header, | |
117 | fh => $fh, | |
118 | meta => $meta, | |
119 | index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'), | |
120 | index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12), | |
121 | ); | |
122 | $obj{index1_offset} = $header_length + $obj{meta_length}; | |
123 | $obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length}; | |
124 | bless \%obj, $self | |
125 | } | |
126 | ||
127 | 1; | |
128 | __END__ | |
129 | ||
130 | =head1 NAME | |
131 | ||
132 | Aard - Read aarddict dictionaries | |
133 | ||
134 | =head1 SYNOPSIS | |
135 | ||
136 | use Aard; | |
137 | my $dict = Aard->new('something.aar'); | |
138 | printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count; | |
139 | printf "The tenth entry's key: %s\n", $dict->key(9); | |
140 | printf "The tenth entry's value: %s\n", $dict->article(9); | |
141 | ||
142 | =head1 DESCRIPTION | |
143 | ||
144 | Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata. | |
145 | ||
146 | =over | |
147 | ||
148 | =item B<new>(I<filename>) | |
149 | ||
150 | Creates a new Aard object for the given file. | |
151 | ||
152 | =item B<fh> | |
153 | ||
154 | Returns the open filehandle to the dictionary. | |
155 | ||
156 | =item B<count> | |
157 | ||
158 | Returns the number of entries in this dictionary. | |
159 | ||
160 | =item B<key>(I<index>) | |
161 | ||
162 | Returns the key of the I<index>th element. This method caches the keys. | |
163 | ||
164 | =item B<article>(I<index>) | |
165 | ||
166 | Returns the article of the I<index>th element. This method caches the articles. | |
167 | ||
168 | =item B<uuid> | |
169 | ||
170 | Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary. | |
171 | ||
172 | =item B<uuid_string> | |
173 | ||
174 | Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary. | |
175 | ||
176 | =item B<volume> | |
177 | ||
178 | Returns the volume number of this file. | |
179 | ||
180 | =item B<total_volumes> | |
181 | ||
182 | Returns the total number of volumes for this dictionary. | |
183 | ||
184 | =item B<meta> | |
185 | ||
186 | Returns the raw metadata as a hashref. | |
187 | ||
188 | =item B<article_count> | |
189 | ||
190 | Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise). | |
191 | ||
192 | =item B<article_count_is_volume_total> | |
193 | ||
194 | Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0. | |
195 | ||
196 | =item B<index_language> | |
197 | ||
87fce6ae MG |
198 | Returns the dictionary's "from" language (two or three letter ISO code) |
199 | ||
1b53d9d3 MG |
200 | =item B<article_language> |
201 | ||
87fce6ae MG |
202 | Returns the dictionary's "to" language (two or three letter ISO code) |
203 | ||
1b53d9d3 MG |
204 | =item B<title> |
205 | ||
87fce6ae MG |
206 | Returns the dictionary title |
207 | ||
1b53d9d3 MG |
208 | =item B<version> |
209 | ||
87fce6ae MG |
210 | Returns the dictionary version |
211 | ||
1b53d9d3 MG |
212 | =item B<description> |
213 | ||
87fce6ae MG |
214 | Returns the dictionary description |
215 | ||
1b53d9d3 MG |
216 | =item B<copyright> |
217 | ||
87fce6ae MG |
218 | Returns the copyright notice |
219 | ||
1b53d9d3 MG |
220 | =item B<license> |
221 | ||
87fce6ae MG |
222 | Returns the full license text |
223 | ||
1b53d9d3 MG |
224 | =item B<source> |
225 | ||
87fce6ae MG |
226 | Returns the dictionary data source |
227 | ||
1b53d9d3 MG |
228 | =back |
229 | ||
230 | =head1 SEE ALSO | |
231 | ||
232 | L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html> | |
233 | ||
234 | =head1 AUTHOR | |
235 | ||
236 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
237 | ||
238 | =head1 COPYRIGHT AND LICENSE | |
239 | ||
240 | Copyright (C) 2014 by Marius Gavrilescu | |
241 | ||
242 | This library is free software; you can redistribute it and/or modify | |
243 | it under the same terms as Perl itself, either Perl version 5.18.2 or, | |
244 | at your option, any later version of Perl 5 you may have available. | |
245 | ||
246 | ||
247 | =cut |