| 1 | package Aard; |
| 2 | |
| 3 | use 5.014000; |
| 4 | use strict; |
| 5 | use warnings; |
| 6 | our $VERSION = '0.000_001'; |
| 7 | |
| 8 | use IO::Uncompress::Inflate qw/inflate/; |
| 9 | use IO::Uncompress::Bunzip2 qw/bunzip2/; |
| 10 | use List::Util qw/sum/; |
| 11 | |
| 12 | use JSON qw/decode_json/; |
| 13 | use UUID::Tiny qw/uuid_to_string/; |
| 14 | |
| 15 | use constant HEADER_SPEC => [ |
| 16 | [signature => 'Z4' , 4 ], |
| 17 | [sha1sum => 'Z40', 40], |
| 18 | [version => 'S>' , 2 ], |
| 19 | [uuid => 'Z16', 16], |
| 20 | [volume => 'S>' , 2 ], |
| 21 | [total_volumes => 'S>' , 2 ], |
| 22 | [meta_length => 'L>' , 4 ], |
| 23 | [index_count => 'L>' , 4 ], |
| 24 | [article_offset => 'L>' , 4 ], |
| 25 | [index1_item_format => 'Z4' , 4 ], |
| 26 | [key_length_format => 'Z2' , 2 ], |
| 27 | [article_length_format => 'Z2' , 2 ], |
| 28 | ]; |
| 29 | |
| 30 | my $header_length = sum map { $_->[2] } @{HEADER_SPEC()}; |
| 31 | |
| 32 | sub decompress { |
| 33 | my ($input) = @_; |
| 34 | my $output = $input; |
| 35 | inflate \$input => \$output; |
| 36 | bunzip2 \$input => \$output if $input =~ /^BZ/; |
| 37 | $output |
| 38 | } |
| 39 | |
| 40 | sub read_at { |
| 41 | my ($self, $offset, $length) = @_; |
| 42 | my $fh = $self->{fh}; |
| 43 | my $part; |
| 44 | seek $fh, $offset, 0; |
| 45 | read $fh, $part, $length; |
| 46 | $part |
| 47 | } |
| 48 | |
| 49 | sub index1 { |
| 50 | my ($self, $index) = @_; |
| 51 | unless (exists $self->{index1}{$index}) { |
| 52 | my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length}); |
| 53 | $self->{index1}{$index} = [unpack $self->{index_format}, $part] |
| 54 | } |
| 55 | $self->{index1}{$index} |
| 56 | } |
| 57 | |
| 58 | sub fh { shift->{fh} } |
| 59 | sub sha1sum { shift->{sha1sum} } |
| 60 | sub uuid { shift->{uuid} } |
| 61 | sub uuid_string { uuid_to_string shift->uuid } |
| 62 | sub volume { shift->{volume} } |
| 63 | sub total_volumes { shift->{total_volumes} } |
| 64 | sub count { shift->{index_count} } |
| 65 | |
| 66 | sub meta { shift->{meta} } |
| 67 | sub article_count { shift->meta->{article_count} } |
| 68 | sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} } |
| 69 | sub index_language { shift->meta->{index_language} } |
| 70 | sub article_language { shift->meta->{article_language} } |
| 71 | sub title { shift->meta->{title} } |
| 72 | sub version { shift->meta->{version} } |
| 73 | sub description { shift->meta->{description} } |
| 74 | sub copyright { shift->meta->{copyright} } |
| 75 | sub license { shift->meta->{license} } |
| 76 | sub source { shift->meta->{source} } |
| 77 | |
| 78 | sub key { |
| 79 | my ($self, $index) = @_; |
| 80 | unless (exists $self->{key}{$index}) { |
| 81 | my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2); |
| 82 | my $len = unpack 'S>', $part; |
| 83 | read $self->{fh}, $self->{key}{$index}, $len; |
| 84 | } |
| 85 | $self->{key}{$index} |
| 86 | } |
| 87 | |
| 88 | sub article { |
| 89 | my ($self, $index) = @_; |
| 90 | unless (exists $self->{article}{$index}) { |
| 91 | my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4); |
| 92 | my $len = unpack 'L>', $part; |
| 93 | read $self->{fh}, $part, $len; |
| 94 | $self->{article}{$index} = decompress $part |
| 95 | } |
| 96 | $self->{article}{$index} |
| 97 | } |
| 98 | |
| 99 | sub new { |
| 100 | my ($self, $file) = @_; |
| 101 | open my $fh, '<', $file or die $!; |
| 102 | binmode $fh; |
| 103 | my %header; |
| 104 | for (@{HEADER_SPEC()}) { |
| 105 | read $fh, my $part, $_->[2]; |
| 106 | $header{$_->[0]} = unpack $_->[1], $part; |
| 107 | } |
| 108 | |
| 109 | die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard'; |
| 110 | die 'Unknown file format version' if $header{version} != 1; |
| 111 | |
| 112 | read $fh, my $meta, $header{meta_length}; |
| 113 | $meta = decode_json decompress $meta; |
| 114 | |
| 115 | my %obj = ( |
| 116 | %header, |
| 117 | fh => $fh, |
| 118 | meta => $meta, |
| 119 | index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'), |
| 120 | index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12), |
| 121 | ); |
| 122 | $obj{index1_offset} = $header_length + $obj{meta_length}; |
| 123 | $obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length}; |
| 124 | bless \%obj, $self |
| 125 | } |
| 126 | |
| 127 | 1; |
| 128 | __END__ |
| 129 | |
| 130 | =head1 NAME |
| 131 | |
| 132 | Aard - Read aarddict dictionaries |
| 133 | |
| 134 | =head1 SYNOPSIS |
| 135 | |
| 136 | use Aard; |
| 137 | my $dict = Aard->new('something.aar'); |
| 138 | printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count; |
| 139 | printf "The tenth entry's key: %s\n", $dict->key(9); |
| 140 | printf "The tenth entry's value: %s\n", $dict->article(9); |
| 141 | |
| 142 | =head1 DESCRIPTION |
| 143 | |
| 144 | Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata. |
| 145 | |
| 146 | =over |
| 147 | |
| 148 | =item B<new>(I<filename>) |
| 149 | |
| 150 | Creates a new Aard object for the given file. |
| 151 | |
| 152 | =item B<fh> |
| 153 | |
| 154 | Returns the open filehandle to the dictionary. |
| 155 | |
| 156 | =item B<count> |
| 157 | |
| 158 | Returns the number of entries in this dictionary. |
| 159 | |
| 160 | =item B<key>(I<index>) |
| 161 | |
| 162 | Returns the key of the I<index>th element. This method caches the keys. |
| 163 | |
| 164 | =item B<article>(I<index>) |
| 165 | |
| 166 | Returns the article of the I<index>th element. This method caches the articles. |
| 167 | |
| 168 | =item B<uuid> |
| 169 | |
| 170 | Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary. |
| 171 | |
| 172 | =item B<uuid_string> |
| 173 | |
| 174 | Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary. |
| 175 | |
| 176 | =item B<volume> |
| 177 | |
| 178 | Returns the volume number of this file. |
| 179 | |
| 180 | =item B<total_volumes> |
| 181 | |
| 182 | Returns the total number of volumes for this dictionary. |
| 183 | |
| 184 | =item B<meta> |
| 185 | |
| 186 | Returns the raw metadata as a hashref. |
| 187 | |
| 188 | =item B<article_count> |
| 189 | |
| 190 | Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise). |
| 191 | |
| 192 | =item B<article_count_is_volume_total> |
| 193 | |
| 194 | Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0. |
| 195 | |
| 196 | =item B<index_language> |
| 197 | |
| 198 | Returns the dictionary's "from" language (two or three letter ISO code) |
| 199 | |
| 200 | =item B<article_language> |
| 201 | |
| 202 | Returns the dictionary's "to" language (two or three letter ISO code) |
| 203 | |
| 204 | =item B<title> |
| 205 | |
| 206 | Returns the dictionary title |
| 207 | |
| 208 | =item B<version> |
| 209 | |
| 210 | Returns the dictionary version |
| 211 | |
| 212 | =item B<description> |
| 213 | |
| 214 | Returns the dictionary description |
| 215 | |
| 216 | =item B<copyright> |
| 217 | |
| 218 | Returns the copyright notice |
| 219 | |
| 220 | =item B<license> |
| 221 | |
| 222 | Returns the full license text |
| 223 | |
| 224 | =item B<source> |
| 225 | |
| 226 | Returns the dictionary data source |
| 227 | |
| 228 | =back |
| 229 | |
| 230 | =head1 SEE ALSO |
| 231 | |
| 232 | L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html> |
| 233 | |
| 234 | =head1 AUTHOR |
| 235 | |
| 236 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> |
| 237 | |
| 238 | =head1 COPYRIGHT AND LICENSE |
| 239 | |
| 240 | Copyright (C) 2014 by Marius Gavrilescu |
| 241 | |
| 242 | This library is free software; you can redistribute it and/or modify |
| 243 | it under the same terms as Perl itself, either Perl version 5.18.2 or, |
| 244 | at your option, any later version of Perl 5 you may have available. |
| 245 | |
| 246 | |
| 247 | =cut |