[aard.git] / lib / Aard.pm

package Aard;

use 5.014000;
use strict;
use warnings;
our $VERSION = '0.000_001';

use IO::Uncompress::Inflate qw/inflate/;
use IO::Uncompress::Bunzip2 qw/bunzip2/;
use List::Util qw/sum/;

use JSON qw/decode_json/;
use UUID::Tiny qw/uuid_to_string/;

use constant HEADER_SPEC => [
	[signature             => 'Z4' , 4 ],
	[sha1sum               => 'Z40', 40],
	[version               => 'S>' , 2 ],
	[uuid                  => 'Z16', 16],
	[volume                => 'S>' , 2 ],
	[total_volumes         => 'S>' , 2 ],
	[meta_length           => 'L>' , 4 ],
	[index_count           => 'L>' , 4 ],
	[article_offset        => 'L>' , 4 ],
	[index1_item_format    => 'Z4' , 4 ],
	[key_length_format     => 'Z2' , 2 ],
	[article_length_format => 'Z2' , 2 ],
];

my $header_length = sum map { $_->[2] } @{HEADER_SPEC()};

sub decompress {
	my ($input) = @_;
	my $output = $input;
	inflate \$input => \$output;
	bunzip2 \$input => \$output if $input =~ /^BZ/;
	$output
}

sub read_at {
	my ($self, $offset, $length) = @_;
	my $fh = $self->{fh};
	my $part;
	seek $fh, $offset, 0;
	read $fh, $part, $length;
	$part
}

sub index1 {
	my ($self, $index) = @_;
	unless (exists $self->{index1}{$index}) {
		my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length});
		$self->{index1}{$index} = [unpack $self->{index_format}, $part]
	}
	$self->{index1}{$index}
}

sub fh            { shift->{fh} }
sub sha1sum       { shift->{sha1sum} }
sub uuid          { shift->{uuid} }
sub uuid_string   { uuid_to_string shift->uuid }
sub volume        { shift->{volume} }
sub total_volumes { shift->{total_volumes} }
sub count         { shift->{index_count} }

sub meta                          { shift->{meta} }
sub article_count                 { shift->meta->{article_count} }
sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} }
sub index_language                { shift->meta->{index_language} }
sub article_language              { shift->meta->{article_language} }
sub title                         { shift->meta->{title} }
sub version                       { shift->meta->{version} }
sub description                   { shift->meta->{description} }
sub copyright                     { shift->meta->{copyright} }
sub license                       { shift->meta->{license} }
sub source                        { shift->meta->{source} }

sub key {
	my ($self, $index) = @_;
	unless (exists $self->{key}{$index}) {
		my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2);
		my $len = unpack 'S>', $part;
		read $self->{fh}, $self->{key}{$index}, $len;
	}
	$self->{key}{$index}
}

sub article {
	my ($self, $index) = @_;
	unless (exists $self->{article}{$index}) {
		my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4);
		my $len = unpack 'L>', $part;
		read $self->{fh}, $part, $len;
		$self->{article}{$index} = decompress $part
	}
	$self->{article}{$index}
}

sub new {
	my ($self, $file) = @_;
	open my $fh, '<', $file or die $!;
	binmode $fh;
	my %header;
	for (@{HEADER_SPEC()}) {
		read $fh, my $part, $_->[2];
		$header{$_->[0]} = unpack $_->[1], $part;
	}

	die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard';
	die 'Unknown file format version' if $header{version} != 1;

	read $fh, my $meta, $header{meta_length};
	$meta = decode_json decompress $meta;

	my %obj = (
		%header,
		fh => $fh,
		meta => $meta,
		index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'),
		index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12),
	);
	$obj{index1_offset} = $header_length + $obj{meta_length};
	$obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length};
	bless \%obj, $self
}

1;
__END__

=head1 NAME

Aard - Read aarddict dictionaries

=head1 SYNOPSIS

  use Aard;
  my $dict = Aard->new('something.aar');
  printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
  printf "The tenth entry's key: %s\n", $dict->key(9);
  printf "The tenth entry's value: %s\n", $dict->article(9);

=head1 DESCRIPTION

Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.

=over

=item B<new>(I<filename>)

Creates a new Aard object for the given file.

=item B<fh>

Returns the open filehandle to the dictionary.

=item B<count>

Returns the number of entries in this dictionary.

=item B<key>(I<index>)

Returns the key of the I<index>th element. This method caches the keys.

=item B<article>(I<index>)

Returns the article of the I<index>th element. This method caches the articles.

=item B<uuid>

Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.

=item B<uuid_string>

Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.

=item B<volume>

Returns the volume number of this file.

=item B<total_volumes>

Returns the total number of volumes for this dictionary.

=item B<meta>

Returns the raw metadata as a hashref.

=item B<article_count>

Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).

=item B<article_count_is_volume_total>

Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0.

=item B<index_language>

Returns the dictionary's "from" language (two or three letter ISO code)

=item B<article_language>

Returns the dictionary's "to" language (two or three letter ISO code)

=item B<title>

Returns the dictionary title

=item B<version>

Returns the dictionary version

=item B<description>

Returns the dictionary description

=item B<copyright>

Returns the copyright notice

=item B<license>

Returns the full license text

=item B<source>

Returns the dictionary data source

=back

=head1 SEE ALSO

L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>

=head1 AUTHOR

Marius Gavrilescu, E<lt>marius@ieval.roE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2014 by Marius Gavrilescu

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.18.2 or,
at your option, any later version of Perl 5 you may have available.


=cut
Commit	Line	Data
	1	package Aard;
	2
	3	use 5.014000;
	4	use strict;
	5	use warnings;
	6	our $VERSION = '0.000_001';
	7
	8	use IO::Uncompress::Inflate qw/inflate/;
	9	use IO::Uncompress::Bunzip2 qw/bunzip2/;
	10	use List::Util qw/sum/;
	11
	12	use JSON qw/decode_json/;
	13	use UUID::Tiny qw/uuid_to_string/;
	14
	15	use constant HEADER_SPEC => [
	16	[signature => 'Z4' , 4 ],
	17	[sha1sum => 'Z40', 40],
	18	[version => 'S>' , 2 ],
	19	[uuid => 'Z16', 16],
	20	[volume => 'S>' , 2 ],
	21	[total_volumes => 'S>' , 2 ],
	22	[meta_length => 'L>' , 4 ],
	23	[index_count => 'L>' , 4 ],
	24	[article_offset => 'L>' , 4 ],
	25	[index1_item_format => 'Z4' , 4 ],
	26	[key_length_format => 'Z2' , 2 ],
	27	[article_length_format => 'Z2' , 2 ],
	28	];
	29
	30	my $header_length = sum map { $_->[2] } @{HEADER_SPEC()};
	31
	32	sub decompress {
	33	my ($input) = @_;
	34	my $output = $input;
	35	inflate \$input => \$output;
	36	bunzip2 \$input => \$output if $input =~ /^BZ/;
	37	$output
	38	}
	39
	40	sub read_at {
	41	my ($self, $offset, $length) = @_;
	42	my $fh = $self->{fh};
	43	my $part;
	44	seek $fh, $offset, 0;
	45	read $fh, $part, $length;
	46	$part
	47	}
	48
	49	sub index1 {
	50	my ($self, $index) = @_;
	51	unless (exists $self->{index1}{$index}) {
	52	my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length});
	53	$self->{index1}{$index} = [unpack $self->{index_format}, $part]
	54	}
	55	$self->{index1}{$index}
	56	}
	57
	58	sub fh { shift->{fh} }
	59	sub sha1sum { shift->{sha1sum} }
	60	sub uuid { shift->{uuid} }
	61	sub uuid_string { uuid_to_string shift->uuid }
	62	sub volume { shift->{volume} }
	63	sub total_volumes { shift->{total_volumes} }
	64	sub count { shift->{index_count} }
	65
	66	sub meta { shift->{meta} }
	67	sub article_count { shift->meta->{article_count} }
	68	sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} }
	69	sub index_language { shift->meta->{index_language} }
	70	sub article_language { shift->meta->{article_language} }
	71	sub title { shift->meta->{title} }
	72	sub version { shift->meta->{version} }
	73	sub description { shift->meta->{description} }
	74	sub copyright { shift->meta->{copyright} }
	75	sub license { shift->meta->{license} }
	76	sub source { shift->meta->{source} }
	77
	78	sub key {
	79	my ($self, $index) = @_;
	80	unless (exists $self->{key}{$index}) {
	81	my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2);
	82	my $len = unpack 'S>', $part;
	83	read $self->{fh}, $self->{key}{$index}, $len;
	84	}
	85	$self->{key}{$index}
	86	}
	87
	88	sub article {
	89	my ($self, $index) = @_;
	90	unless (exists $self->{article}{$index}) {
	91	my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4);
	92	my $len = unpack 'L>', $part;
	93	read $self->{fh}, $part, $len;
	94	$self->{article}{$index} = decompress $part
	95	}
	96	$self->{article}{$index}
	97	}
	98
	99	sub new {
	100	my ($self, $file) = @_;
	101	open my $fh, '<', $file or die $!;
	102	binmode $fh;
	103	my %header;
	104	for (@{HEADER_SPEC()}) {
	105	read $fh, my $part, $_->[2];
	106	$header{$_->[0]} = unpack $_->[1], $part;
	107	}
	108
	109	die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard';
	110	die 'Unknown file format version' if $header{version} != 1;
	111
	112	read $fh, my $meta, $header{meta_length};
	113	$meta = decode_json decompress $meta;
	114
	115	my %obj = (
	116	%header,
	117	fh => $fh,
	118	meta => $meta,
	119	index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'),
	120	index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12),
	121	);
	122	$obj{index1_offset} = $header_length + $obj{meta_length};
	123	$obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length};
	124	bless \%obj, $self
	125	}
	126
	127	1;
	128	__END__
	129
	130	=head1 NAME
	131
	132	Aard - Read aarddict dictionaries
	133
	134	=head1 SYNOPSIS
	135
	136	use Aard;
	137	my $dict = Aard->new('something.aar');
	138	printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
	139	printf "The tenth entry's key: %s\n", $dict->key(9);
	140	printf "The tenth entry's value: %s\n", $dict->article(9);
	141
	142	=head1 DESCRIPTION
	143
	144	Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.
	145
	146	=over
	147
	148	=item B<new>(I<filename>)
	149
	150	Creates a new Aard object for the given file.
	151
	152	=item B<fh>
	153
	154	Returns the open filehandle to the dictionary.
	155
	156	=item B<count>
	157
	158	Returns the number of entries in this dictionary.
	159
	160	=item B<key>(I<index>)
	161
	162	Returns the key of the I<index>th element. This method caches the keys.
	163
	164	=item B<article>(I<index>)
	165
	166	Returns the article of the I<index>th element. This method caches the articles.
	167
	168	=item B<uuid>
	169
	170	Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.
	171
	172	=item B<uuid_string>
	173
	174	Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.
	175
	176	=item B<volume>
	177
	178	Returns the volume number of this file.
	179
	180	=item B<total_volumes>
	181
	182	Returns the total number of volumes for this dictionary.
	183
	184	=item B<meta>
	185
	186	Returns the raw metadata as a hashref.
	187
	188	=item B<article_count>
	189
	190	Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).
	191
	192	=item B<article_count_is_volume_total>
	193
	194	Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0.
	195
	196	=item B<index_language>
	197
	198	Returns the dictionary's "from" language (two or three letter ISO code)
	199
	200	=item B<article_language>
	201
	202	Returns the dictionary's "to" language (two or three letter ISO code)
	203
	204	=item B<title>
	205
	206	Returns the dictionary title
	207
	208	=item B<version>
	209
	210	Returns the dictionary version
	211
	212	=item B<description>
	213
	214	Returns the dictionary description
	215
	216	=item B<copyright>
	217
	218	Returns the copyright notice
	219
	220	=item B<license>
	221
	222	Returns the full license text
	223
	224	=item B<source>
	225
	226	Returns the dictionary data source
	227
	228	=back
	229
	230	=head1 SEE ALSO
	231
	232	L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>
	233
	234	=head1 AUTHOR
	235
	236	Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
	237
	238	=head1 COPYRIGHT AND LICENSE
	239
	240	Copyright (C) 2014 by Marius Gavrilescu
	241
	242	This library is free software; you can redistribute it and/or modify
	243	it under the same terms as Perl itself, either Perl version 5.18.2 or,
	244	at your option, any later version of Perl 5 you may have available.
	245
	246
	247	=cut