Initial commit

author Marius Gavrilescu <marius@ieval.ro>

Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)

committer Marius Gavrilescu <marius@ieval.ro>

Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)
author Marius Gavrilescu <marius@ieval.ro>
Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)
committer Marius Gavrilescu <marius@ieval.ro>
Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)
diff --git a/Changes b/Changes

new file mode 100644 (file)

index 0000000..78da2f5
--- /dev/null
+++ b/Changes
@@ -0,0 +1,4 @@
+Revision history for Perl extension Aard.
+
+0.000_001 2014-05-01T02:06+03:00
+ - Initial release
diff --git a/MANIFEST b/MANIFEST

new file mode 100644 (file)

index 0000000..341b4ff
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,7 @@
+Changes
+Makefile.PL
+MANIFEST
+README
+t/Aard.t
+t/jargon-4.4.7-1.aar
+lib/Aard.pm
diff --git a/Makefile.PL b/Makefile.PL

new file mode 100644 (file)

index 0000000..097d872
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,26 @@
+use 5.014000;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+       NAME              => 'Aard',
+       VERSION_FROM      => 'lib/Aard.pm',
+       ABSTRACT_FROM     => 'lib/Aard.pm',
+       AUTHOR            => 'Marius Gavrilescu <marius@ieval.ro>',
+       MIN_PERL_VERSION  => '5.14.0',
+       LICENSE           => 'perl',
+       SIGN              => 1,
+       PREREQ_PM         => {
+               qw/IO::Uncompress::Inflate 0
+                  IO::Uncompress::Bunzip2 0
+                  List::Util 0
+
+                  JSON 0
+                  UUID::Tiny 0/,
+       },
+       META_MERGE        => {
+               dynamic_config => 0,
+               resources      => {
+                       repository => 'https://git.ieval.ro/?p=aard.git',
+               }
+       }
+);
diff --git a/README b/README

new file mode 100644 (file)

index 0000000..a05ed4a
--- /dev/null
+++ b/README
@@ -0,0 +1,30 @@
+Aard version 0.000_001
+======================
+
+Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of (key, article) pairs, with some associated metadata.
+
+INSTALLATION
+
+To install this module type the following:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+
+DEPENDENCIES
+
+This module requires these other modules and libraries:
+
+* JSON
+* UUID::Tiny
+
+COPYRIGHT AND LICENCE
+
+Copyright (C) 2014 by Marius Gavrilescu
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.18.2 or,
+at your option, any later version of Perl 5 you may have available.
+
+
diff --git a/lib/Aard.pm b/lib/Aard.pm

new file mode 100644 (file)

index 0000000..fdb570e
--- /dev/null
+++ b/lib/Aard.pm
@@ -0,0 +1,231 @@
+package Aard;
+
+use 5.014000;
+use strict;
+use warnings;
+our $VERSION = '0.000_001';
+
+use IO::Uncompress::Inflate qw/inflate/;
+use IO::Uncompress::Bunzip2 qw/bunzip2/;
+use List::Util qw/sum/;
+
+use JSON qw/decode_json/;
+use UUID::Tiny qw/uuid_to_string/;
+
+use constant HEADER_SPEC => [
+       [signature             => 'Z4' , 4 ],
+       [sha1sum               => 'Z40', 40],
+       [version               => 'S>' , 2 ],
+       [uuid                  => 'Z16', 16],
+       [volume                => 'S>' , 2 ],
+       [total_volumes         => 'S>' , 2 ],
+       [meta_length           => 'L>' , 4 ],
+       [index_count           => 'L>' , 4 ],
+       [article_offset        => 'L>' , 4 ],
+       [index1_item_format    => 'Z4' , 4 ],
+       [key_length_format     => 'Z2' , 2 ],
+       [article_length_format => 'Z2' , 2 ],
+];
+
+my $header_length = sum map { $_->[2] } @{HEADER_SPEC()};
+
+sub decompress {
+       my ($input) = @_;
+       my $output = $input;
+       inflate \$input => \$output;
+       bunzip2 \$input => \$output if $input =~ /^BZ/;
+       $output
+}
+
+sub read_at {
+       my ($self, $offset, $length) = @_;
+       my $fh = $self->{fh};
+       my $part;
+       seek $fh, $offset, 0;
+       read $fh, $part, $length;
+       $part
+}
+
+sub index1 {
+       my ($self, $index) = @_;
+       unless (exists $self->{index1}{$index}) {
+               my $part = $self->read_at($self->{index1_offset} + $index * $self->{index_length}, $self->{index_length});
+               $self->{index1}{$index} = [unpack $self->{index_format}, $part]
+       }
+       $self->{index1}{$index}
+}
+
+sub fh            { shift->{fh} }
+sub sha1sum       { shift->{sha1sum} }
+sub uuid          { shift->{uuid} }
+sub uuid_string   { uuid_to_string shift->uuid }
+sub volume        { shift->{volume} }
+sub total_volumes { shift->{total_volumes} }
+sub count         { shift->{index_count} }
+
+sub meta                          { shift->{meta} }
+sub article_count                 { shift->meta->{article_count} }
+sub article_count_is_volume_total { shift->meta->{article_count_is_volume_total} }
+sub index_language                { shift->meta->{index_language} }
+sub article_language              { shift->meta->{article_language} }
+sub title                         { shift->meta->{title} }
+sub version                       { shift->meta->{version} }
+sub description                   { shift->meta->{description} }
+sub copyright                     { shift->meta->{copyright} }
+sub license                       { shift->meta->{license} }
+sub source                        { shift->meta->{source} }
+
+sub key {
+       my ($self, $index) = @_;
+       unless (exists $self->{key}{$index}) {
+               my $part = $self->read_at($self->{index2_offset} + $self->index1($index)->[0], 2);
+               my $len = unpack 'S>', $part;
+               read $self->{fh}, $self->{key}{$index}, $len;
+       }
+       $self->{key}{$index}
+}
+
+sub article {
+       my ($self, $index) = @_;
+       unless (exists $self->{article}{$index}) {
+               my $part = $self->read_at($self->{article_offset} + $self->index1($index)->[1], 4);
+               my $len = unpack 'L>', $part;
+               read $self->{fh}, $part, $len;
+               $self->{article}{$index} = decompress $part
+       }
+       $self->{article}{$index}
+}
+
+sub new {
+       my ($self, $file) = @_;
+       open my $fh, '<', $file or die $!;
+       binmode $fh;
+       my %header;
+       for (@{HEADER_SPEC()}) {
+               read $fh, my $part, $_->[2];
+               $header{$_->[0]} = unpack $_->[1], $part;
+       }
+
+       die 'Not a recognized aarddict dictionary file' if $header{signature} ne 'aard';
+       die 'Unknown file format version' if $header{version} != 1;
+
+       read $fh, my $meta, $header{meta_length};
+       $meta = decode_json decompress $meta;
+
+       my %obj = (
+               %header,
+               fh => $fh,
+               meta => $meta,
+               index_format => ($header{index1_item_format} eq '>LL' ? 'L>L>' : 'L>Q>'),
+               index_length => ($header{index1_item_format} eq '>LL' ? 8 : 12),
+       );
+       $obj{index1_offset} = $header_length + $obj{meta_length};
+       $obj{index2_offset} = $obj{index1_offset} + $obj{index_count} * $obj{index_length};
+       bless \%obj, $self
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Aard - Read aarddict dictionaries
+
+=head1 SYNOPSIS
+
+  use Aard;
+  my $dict = Aard->new('something.aar');
+  printf "This dictionary (volume %d of %d) has %d entries\n", $dict->volume, $dict->total_volumes, $dict->count;
+  printf "The tenth entry's key: %s\n", $dict->key(9);
+  printf "The tenth entry's value: %s\n", $dict->article(9);
+
+=head1 DESCRIPTION
+
+Aard is a module for reading files in the Aard Dictionary format (.aar). A dictionary is an array of I<(key, article)> pairs, with some associated metadata.
+
+=over
+
+=item B<new>(I<filename>)
+
+Creates a new Aard object for the given file.
+
+=item B<fh>
+
+Returns the open filehandle to the dictionary.
+
+=item B<count>
+
+Returns the number of entries in this dictionary.
+
+=item B<key>(I<index>)
+
+Returns the key of the I<index>th element. This method caches the keys.
+
+=item B<article>(I<index>)
+
+Returns the article of the I<index>th element. This method caches the articles.
+
+=item B<uuid>
+
+Returns the UUID of this dictionary as a binary string. This is a value shared by all volumes of the same dictionary.
+
+=item B<uuid_string>
+
+Returns the UUID of this dictionary as a human-readable string. This is a value shared by all volumes of the same dictionary.
+
+=item B<volume>
+
+Returns the volume number of this file.
+
+=item B<total_volumes>
+
+Returns the total number of volumes for this dictionary.
+
+=item B<meta>
+
+Returns the raw metadata as a hashref.
+
+=item B<article_count>
+
+Returns the number of unique articles in this volume (if B<article_count_is_volume_total> is true) or in this dictionary (otherwise).
+
+=item B<article_count_is_volume_total>
+
+Returns true if B<article_count> means number of articles in this volume. This is always true since aardtools 0.9.0.
+
+=item B<index_language>
+
+=item B<article_language>
+
+=item B<title>
+
+=item B<version>
+
+=item B<description>
+
+=item B<copyright>
+
+=item B<license>
+
+=item B<source>
+
+=back
+
+=head1 SEE ALSO
+
+L<http://aarddict.org>, L<http://aarddict.org/aardtools/doc/aardformat.html>
+
+=head1 AUTHOR
+
+Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2014 by Marius Gavrilescu
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.18.2 or,
+at your option, any later version of Perl 5 you may have available.
+
+
+=cut
diff --git a/t/Aard.t b/t/Aard.t

new file mode 100644 (file)

index 0000000..9637357
--- /dev/null
+++ b/t/Aard.t
@@ -0,0 +1,19 @@
+#!/usr/bin/perl -w
+use v5.14;
+use strict;
+use warnings;
+
+use Test::More tests => 9;
+BEGIN { use_ok('Aard') };
+
+my $dict = Aard->new('t/jargon-4.4.7-1.aar');
+is lc $dict->uuid_string, '4e5c4639-9d1d-42ee-b27d-b552d6b7386d', 'uuid_string';
+is $dict->volume, 1, 'volume';
+is $dict->total_volumes, 1, 'total_volumes';
+is $dict->count, 2307, 'count';
+
+is $dict->title, 'The Jargon File', 'title';
+is $dict->index_language, 'ENG', 'index_language';
+
+is $dict->key(20), 'admin', 'key 20';
+like $dict->article(20), qr/administrator/, 'value 20';
diff --git a/t/jargon-4.4.7-1.aar b/t/jargon-4.4.7-1.aar

new file mode 100644 (file)

index 0000000..3355f4a

Binary files /dev/null and b/t/jargon-4.4.7-1.aar differ
author	Marius Gavrilescu <marius@ieval.ro>
	Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)
committer	Marius Gavrilescu <marius@ieval.ro>
	Wed, 30 Apr 2014 23:06:36 +0000 (02:06 +0300)
Changes	[new file with mode: 0644]	patch \| blob
MANIFEST	[new file with mode: 0644]	patch \| blob
Makefile.PL	[new file with mode: 0644]	patch \| blob
README	[new file with mode: 0644]	patch \| blob
lib/Aard.pm	[new file with mode: 0644]	patch \| blob
t/Aard.t	[new file with mode: 0644]	patch \| blob
t/jargon-4.4.7-1.aar	[new file with mode: 0644]	patch \| blob