6 our $VERSION = '0.000_001';
8 use IO
::Uncompress
::Inflate qw
/inflate/;
9 use IO
::Uncompress
::Bunzip2 qw
/bunzip2/;
10 use List
::Util qw
/sum/;
12 use JSON qw
/decode_json/;
13 use UUID
::Tiny qw
/uuid_to_string/;
# Layout of the fixed-size file header, one row per field, in file order.
# Each row is [field name, unpack template, width]: the header reader
# reads `width` units from the file, unpacks them with `template`, and stores
# the result under `name`. The widths are also summed to get the header size.
15 use constant HEADER_SPEC
=> [
16 [signature
=> 'Z4' , 4 ],
17 [sha1sum
=> 'Z40', 40],
18 [version
=> 'S>' , 2 ],
20 [volume
=> 'S>' , 2 ],
21 [total_volumes
=> 'S>' , 2 ],
22 [meta_length
=> 'L>' , 4 ],
23 [index_count
=> 'L>' , 4 ],
24 [article_offset
=> 'L>' , 4 ],
25 [index1_item_format
=> 'Z4' , 4 ],
26 [key_length_format
=> 'Z2' , 2 ],
27 [article_length_format
=> 'Z2' , 2 ],
# Total size of the fixed header: the sum of the width column (third
# element) of every HEADER_SPEC row.
my $header_length = sum(map { $_->[2] } @{ HEADER_SPEC() });
# Always attempt a zlib inflate of $input into $output first. Its return
# value is not inspected, so a failure here goes unnoticed.
35 inflate \
$input => \
$output;
# Data beginning with "BZ" is additionally run through bunzip2, writing into
# the same $output.
36 bunzip2 \
$input => \
$output if $input =~ /^BZ/;
# Arguments: invocant, offset to read from, and number of units to read.
41 my ($self, $offset, $length) = @_;
# Pull up to $length units from $fh into $part. The return value of read
# is not checked, so a short read or an I/O error is not detected here.
45 read $fh, $part, $length;
# Looks up one index1 record, decoding it on first use and caching the result
# in $self->{index1}. Arguments: invocant and the record number.
50 my ($self, $index) = @_;
# Only touch the file when this record has not been decoded before.
51 unless (exists $self->{index1
}{$index}) {
# Records are fixed-size: record $index starts at
# index1_offset + $index * index_length and is index_length long.
52 my $part = $self->read_at($self->{index1_offset
} + $index * $self->{index_length
}, $self->{index_length
});
# Decode the raw record with the index_format unpack template and cache the
# resulting fields as an array ref.
53 $self->{index1
}{$index} = [unpack $self->{index_format
}, $part]
# The cached array ref; presumably the value handed back to the caller
# (the closing lines of the sub are not visible here -- confirm).
55 $self->{index1
}{$index}
# Accessor: returns whatever is stored in the object's "fh" slot
# (documented as the open filehandle to the dictionary).
sub fh {
    my $self = shift;
    return $self->{fh};
}
# Accessor: returns the value stored in the object's "sha1sum" slot.
sub sha1sum {
    my $self = shift;
    return $self->{sha1sum};
}
# Accessor: returns the value stored in the object's "uuid" slot
# (documented as the dictionary UUID in binary-string form).
sub uuid {
    my $self = shift;
    return $self->{uuid};
}
# Returns the dictionary UUID in human-readable form by passing the result
# of the uuid accessor through UUID::Tiny's uuid_to_string.
sub uuid_string {
    my $self = shift;
    return uuid_to_string($self->uuid);
}
# Accessor: returns the value stored in the object's "volume" slot
# (documented as the volume number of this file).
sub volume {
    my $self = shift;
    return $self->{volume};
}
# Accessor: returns the value stored in the object's "total_volumes" slot
# (documented as the total number of volumes for this dictionary).
sub total_volumes {
    my $self = shift;
    return $self->{total_volumes};
}
# Returns the number of entries in the dictionary. Note that the value lives
# in the "index_count" slot, not in a slot named after this method.
sub count {
    my $self = shift;
    return $self->{index_count};
}
# Accessor: returns the value stored in the object's "meta" slot
# (documented as the raw metadata hashref).
sub meta {
    my $self = shift;
    return $self->{meta};
}
# Returns the "article_count" field of the metadata hashref obtained from
# the object's meta method.
sub article_count {
    my $self = shift;
    return $self->meta->{article_count};
}
# Returns the "article_count_is_volume_total" field of the metadata hashref
# obtained from the object's meta method.
sub article_count_is_volume_total {
    my $self = shift;
    return $self->meta->{article_count_is_volume_total};
}
# Returns the "index_language" field of the metadata hashref obtained from
# the object's meta method (documented as the dictionary's "from" language).
sub index_language {
    my $self = shift;
    return $self->meta->{index_language};
}
# Returns the "article_language" field of the metadata hashref obtained from
# the object's meta method (documented as the dictionary's "to" language).
sub article_language {
    my $self = shift;
    return $self->meta->{article_language};
}
# Returns the "title" field of the metadata hashref obtained from the
# object's meta method.
sub title {
    my $self = shift;
    return $self->meta->{title};
}
# Returns the "version" field of the metadata hashref obtained from the
# object's meta method. This is the dictionary's own version string, read
# from the metadata rather than from the object's top-level slots.
sub version {
    my $self = shift;
    return $self->meta->{version};
}
# Returns the "description" field of the metadata hashref obtained from the
# object's meta method.
sub description {
    my $self = shift;
    return $self->meta->{description};
}
# Returns the "copyright" field of the metadata hashref obtained from the
# object's meta method.
sub copyright {
    my $self = shift;
    return $self->meta->{copyright};
}
# Returns the "license" field of the metadata hashref obtained from the
# object's meta method.
sub license {
    my $self = shift;
    return $self->meta->{license};
}
# Returns the "source" field of the metadata hashref obtained from the
# object's meta method.
sub source {
    my $self = shift;
    return $self->meta->{source};
}
# Fetches the key of entry $index, reading it from the file on first use and
# caching it in $self->{key}. Arguments: invocant and the entry number.
79 my ($self, $index) = @_;
# Skip the file access when the key is already cached.
80 unless (exists $self->{key
}{$index}) {
# The first field of the index1 record is the key's position relative to
# index2_offset; the two units read there hold the key length.
81 my $part = $self->read_at($self->{index2_offset
} + $self->index1($index)->[0], 2);
# Key length is a big-endian unsigned 16-bit integer.
82 my $len = unpack 'S>', $part;
# Read the key text straight into the cache slot. This relies on the handle
# still being positioned right after the length prefix -- presumably read_at
# leaves it there; confirm. The return value of read is not checked.
83 read $self->{fh
}, $self->{key
}{$index}, $len;
# Fetches the article of entry $index, reading and decompressing it on first
# use and caching it in $self->{article}. Arguments: invocant and entry number.
89 my ($self, $index) = @_;
# Skip the file access when the article is already cached.
90 unless (exists $self->{article
}{$index}) {
# The second field of the index1 record is the article's position relative to
# article_offset; the four units read there hold the compressed length.
91 my $part = $self->read_at($self->{article_offset
} + $self->index1($index)->[1], 4);
# Compressed length is a big-endian unsigned 32-bit integer.
92 my $len = unpack 'L>', $part;
# Reuse $part for the compressed payload. This relies on the handle still
# being positioned right after the length prefix -- presumably read_at leaves
# it there; confirm. The return value of read is not checked.
93 read $self->{fh
}, $part, $len;
# Cache the decompressed text, not the raw bytes.
94 $self->{article
}{$index} = decompress
$part
# The cached article; presumably the value handed back to the caller
# (the closing lines of the sub are not visible here -- confirm).
96 $self->{article
}{$index}
# Arguments: invocant and the path of the dictionary file to open.
100 my ($self, $file) = @_;
# Open read-only; on failure die with the bare OS error text.
101 open my $fh, '<', $file or die $!;
# Walk the header layout table in order: for each row read `width` units
# and unpack them with the row's template into %header under the row's name.
# The return value of read is not checked.
104 for (@
{HEADER_SPEC
()}) {
105 read $fh, my $part, $_->[2];
106 $header{$_->[0]} = unpack $_->[1], $part;
# Refuse anything that does not carry the 'aard' signature.
109 die 'Not a recognized aarddict dictionary file' if $header{signature
} ne 'aard';
# Only format version 1 is accepted.
110 die 'Unknown file format version' if $header{version
} != 1;
# The metadata block follows the header and is meta_length units long.
112 read $fh, my $meta, $header{meta_length
};
# Metadata is stored compressed; once decompressed it is JSON text.
113 $meta = decode_json decompress
$meta;
# The header's index1_item_format string is mapped to a Perl unpack
# template: '>LL' becomes two big-endian 32-bit values, anything else a
# 32-bit value followed by a 64-bit one.
119 index_format
=> ($header{index1_item_format
} eq '>LL' ?
'L>L>' : 'L>Q>'),
# Record size matching the template chosen above: 4+4 or 4+8.
120 index_length
=> ($header{index1_item_format
} eq '>LL' ?
8 : 12),
# index1 starts right after the fixed header and the metadata block.
122 $obj{index1_offset
} = $header_length + $obj{meta_length
};
# index2 starts right after all index_count fixed-size index1 records.
123 $obj{index2_offset
} = $obj{index1_offset
} + $obj{index_count
} * $obj{index_length
};
132 Aard - Read aarddict dictionaries
137 my $dict = Aard->new('something.aar');
138 printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
139 printf "The tenth entry's key: %s\n", $dict->key(9);
140 printf "The tenth entry's value: %s\n", $dict->article(9);
144 Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.
148 =item B<new>(I<filename>)
150 Creates a new Aard object for the given file.
154 Returns the open filehandle to the dictionary.
158 Returns the number of entries in this dictionary.
160 =item B<key>(I<index>)
162 Returns the key of the I<index>th element. This method caches the keys.
164 =item B<article>(I<index>)
166 Returns the article of the I<index>th element. This method caches the articles.
170 Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.
174 Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.
178 Returns the volume number of this file.
180 =item B<total_volumes>
182 Returns the total number of volumes for this dictionary.
186 Returns the raw metadata as a hashref.
188 =item B<article_count>
190 Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).
192 =item B<article_count_is_volume_total>
194 Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0.
196 =item B<index_language>
198 Returns the dictionary's "from" language (two- or three-letter ISO code).
200 =item B<article_language>
200 Returns the dictionary's "to" language (two- or three-letter ISO code).
206 Returns the dictionary title
210 Returns the dictionary version
214 Returns the dictionary description
218 Returns the copyright notice
222 Returns the full license text
226 Returns the dictionary data source
232 L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>
236 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
238 =head1 COPYRIGHT AND LICENSE
240 Copyright (C) 2014 by Marius Gavrilescu
242 This library is free software; you can redistribute it and/or modify
243 it under the same terms as Perl itself, either Perl version 5.18.2 or,
244 at your option, any later version of Perl 5 you may have available.