6 our $VERSION = '0.000_001';
8 use IO
::Uncompress
::Inflate qw
/inflate/;
9 use IO
::Uncompress
::Bunzip2 qw
/bunzip2/;
10 use List
::Util qw
/sum/;
12 use JSON qw
/decode_json/;
13 use UUID
::Tiny qw
/uuid_to_string/;
15 use constant HEADER_SPEC
=> [
16 [signature
=> 'Z4' , 4 ],
17 [sha1sum
=> 'Z40', 40],
18 [version
=> 'S>' , 2 ],
20 [volume
=> 'S>' , 2 ],
21 [total_volumes
=> 'S>' , 2 ],
22 [meta_length
=> 'L>' , 4 ],
23 [index_count
=> 'L>' , 4 ],
24 [article_offset
=> 'L>' , 4 ],
25 [index1_item_format
=> 'Z4' , 4 ],
26 [key_length_format
=> 'Z2' , 2 ],
27 [article_length_format
=> 'Z2' , 2 ],
# Total size of the fixed-layout file header: the sum of the third element
# of every HEADER_SPEC entry. The intermediate list is kept inside a do
# block so it does not leak into file scope.
my $header_length = do {
    my @field_widths = map { $_->[2] } @{ HEADER_SPEC() };
    sum @field_widths;
};
35 inflate \
$input => \
$output;
36 bunzip2 \
$input => \
$output if $input =~ /^BZ/;
41 my ($self, $offset, $length) = @_;
45 read $fh, $part, $length;
50 my ($self, $index) = @_;
51 unless (exists $self->{index1
}{$index}) {
52 my $part = $self->read_at($self->{index1_offset
} + $index * $self->{index_length
}, $self->{index_length
});
53 $self->{index1
}{$index} = [unpack $self->{index_format
}, $part]
55 $self->{index1
}{$index}
# Accessor: returns the value stored under the object's 'fh' key.
sub fh {
    my ($self) = @_;
    return $self->{fh};
}
# Accessor: returns the value stored under the object's 'sha1sum' key.
sub sha1sum {
    my ($self) = @_;
    return $self->{sha1sum};
}
# Accessor: returns the value stored under the object's 'uuid' key.
sub uuid {
    my ($self) = @_;
    return $self->{uuid};
}
# Returns the result of passing this object's uuid() value through
# uuid_to_string (imported from UUID::Tiny).
sub uuid_string {
    my ($self) = @_;
    return uuid_to_string($self->uuid);
}
# Accessor: returns the value stored under the object's 'volume' key.
sub volume {
    my ($self) = @_;
    return $self->{volume};
}
# Accessor: returns the value stored under the object's 'total_volumes' key.
sub total_volumes {
    my ($self) = @_;
    return $self->{total_volumes};
}
# Accessor: returns the value stored under the object's 'index_count' key.
# Note that the method name and the underlying key differ.
sub count {
    my ($self) = @_;
    return $self->{index_count};
}
# Accessor: returns the value stored under the object's 'meta' key.
# The sibling metadata accessors dereference this value as a hashref.
sub meta {
    my ($self) = @_;
    return $self->{meta};
}
# Returns the 'article_count' entry of the hashref returned by meta().
sub article_count {
    my ($self) = @_;
    return $self->meta->{article_count};
}
# Returns the 'article_count_is_volume_total' entry of the hashref
# returned by meta().
sub article_count_is_volume_total {
    my ($self) = @_;
    return $self->meta->{article_count_is_volume_total};
}
# Returns the 'index_language' entry of the hashref returned by meta().
sub index_language {
    my ($self) = @_;
    return $self->meta->{index_language};
}
# Returns the 'article_language' entry of the hashref returned by meta().
sub article_language {
    my ($self) = @_;
    return $self->meta->{article_language};
}
# Returns the 'title' entry of the hashref returned by meta().
sub title {
    my ($self) = @_;
    return $self->meta->{title};
}
# Returns the 'version' entry of the hashref returned by meta().
# This reads the metadata, not a 'version' key on the object itself.
sub version {
    my ($self) = @_;
    return $self->meta->{version};
}
# Returns the 'description' entry of the hashref returned by meta().
sub description {
    my ($self) = @_;
    return $self->meta->{description};
}
# Returns the 'copyright' entry of the hashref returned by meta().
sub copyright {
    my ($self) = @_;
    return $self->meta->{copyright};
}
# Returns the 'license' entry of the hashref returned by meta().
sub license {
    my ($self) = @_;
    return $self->meta->{license};
}
# Returns the 'source' entry of the hashref returned by meta().
sub source {
    my ($self) = @_;
    return $self->meta->{source};
}
79 my ($self, $index) = @_;
80 unless (exists $self->{key
}{$index}) {
81 my $part = $self->read_at($self->{index2_offset
} + $self->index1($index)->[0], 2);
82 my $len = unpack 'S>', $part;
83 read $self->{fh
}, $self->{key
}{$index}, $len;
89 my ($self, $index) = @_;
90 unless (exists $self->{article
}{$index}) {
91 my $part = $self->read_at($self->{article_offset
} + $self->index1($index)->[1], 4);
92 my $len = unpack 'L>', $part;
93 read $self->{fh
}, $part, $len;
94 $self->{article
}{$index} = decompress
$part
96 $self->{article
}{$index}
100 my ($self, $file) = @_;
101 open my $fh, '<', $file or die $!;
104 for (@
{HEADER_SPEC
()}) {
105 read $fh, my $part, $_->[2];
106 $header{$_->[0]} = unpack $_->[1], $part;
109 die 'Not a recognized aarddict dictionary file' if $header{signature
} ne 'aard';
110 die 'Unknown file format version' if $header{version
} != 1;
112 read $fh, my $meta, $header{meta_length
};
113 $meta = decode_json decompress
$meta;
119 index_format
=> ($header{index1_item_format
} eq '>LL' ?
'L>L>' : 'L>Q>'),
120 index_length
=> ($header{index1_item_format
} eq '>LL' ?
8 : 12),
122 $obj{index1_offset
} = $header_length + $obj{meta_length
};
123 $obj{index2_offset
} = $obj{index1_offset
} + $obj{index_count
} * $obj{index_length
};
132 Aard - Read aarddict dictionaries
137 my $dict = Aard->new('something.aar');
138 printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
139 printf "The tenth entry's key: %s\n", $dict->key(9);
140 printf "The tenth entry's value: %s\n", $dict->article(9);
144 Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.
148 =item B<new>(I<filename>)
150 Creates a new Aard object for the given file.
154 Returns the open filehandle to the dictionary.
158 Returns the number of entries in this dictionary.
160 =item B<key>(I<index>)
162 Returns the key of the I<index>th element. This method caches the keys.
164 =item B<article>(I<index>)
166 Returns the article of the I<index>th element. This method caches the articles.
170 Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.
174 Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.
178 Returns the volume number of this file.
180 =item B<total_volumes>
182 Returns the total number of volumes for this dictionary.
186 Returns the raw metadata as a hashref.
188 =item B<article_count>
190 Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).
192 =item B<article_count_is_volume_total>
Returns true if B<article_count> is the number of articles in this volume only, rather than in the whole dictionary. This is always true since aardtools 0.9.0.
196 =item B<index_language>
198 =item B<article_language>
216 L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>
220 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
222 =head1 COPYRIGHT AND LICENSE
224 Copyright (C) 2014 by Marius Gavrilescu
226 This library is free software; you can redistribute it and/or modify
227 it under the same terms as Perl itself, either Perl version 5.18.2 or,
228 at your option, any later version of Perl 5 you may have available.