Initial commit 0.000_001
authorMarius Gavrilescu <marius@ieval.ro>
Thu, 30 Nov 2017 21:45:48 +0000 (21:45 +0000)
committerMarius Gavrilescu <marius@ieval.ro>
Thu, 30 Nov 2017 21:45:48 +0000 (21:45 +0000)
Changes [new file with mode: 0644]
MANIFEST [new file with mode: 0644]
Makefile.PL [new file with mode: 0644]
README [new file with mode: 0644]
ex.pl [new file with mode: 0644]
lib/Slob.pm [new file with mode: 0644]
t/Slob.t [new file with mode: 0644]
t/freedict-01.slob [new file with mode: 0644]

diff --git a/Changes b/Changes
new file mode 100644 (file)
index 0000000..bbbd437
--- /dev/null
+++ b/Changes
@@ -0,0 +1,4 @@
+Revision history for Perl extension Slob.
+
+0.000_001 2017-11-30T21:45+00:00
+ - Initial release
diff --git a/MANIFEST b/MANIFEST
new file mode 100644 (file)
index 0000000..823ea14
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,7 @@
+Changes
+Makefile.PL
+MANIFEST
+README
+t/Slob.t
+t/freedict-01.slob
+lib/Slob.pm
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644 (file)
index 0000000..2c380c0
--- /dev/null
@@ -0,0 +1,19 @@
+use 5.014000;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+       NAME              => 'Slob',
+       VERSION_FROM      => 'lib/Slob.pm',
+       ABSTRACT_FROM     => 'lib/Slob.pm',
+       AUTHOR            => 'Marius Gavrilescu <marius@ieval.ro>',
+       MIN_PERL_VERSION  => '5.14.0',
+       LICENSE           => 'perl',
+       SIGN              => 1,
+       PREREQ_PM         => {},
+       META_ADD          => {
+               dynamic_config => 0,
+               resources      => {
+                       repository   => 'https://git.ieval.ro/?p=slob.git',
+               },
+       }
+);
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..9186721
--- /dev/null
+++ b/README
@@ -0,0 +1,25 @@
+Slob version 0.000_001
+======================
+
+INSTALLATION
+
+To install this module type the following:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+
+DEPENDENCIES
+
+This module requires no non-core modules and libraries.
+
+COPYRIGHT AND LICENCE
+
+Copyright (C) 2017 by Marius Gavrilescu
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.26.1 or,
+at your option, any later version of Perl 5 you may have available.
+
+
diff --git a/ex.pl b/ex.pl
new file mode 100644 (file)
index 0000000..16aa432
--- /dev/null
+++ b/ex.pl
@@ -0,0 +1,17 @@
+
+  use feature qw/:5.14/;
+  use Slob;
+  my $slob = Slob->new('t/freedict-01.slob');
+
+  my $nr_of_entries = $slob->ref_count; # if the same content has two
+                                        # keys pointing to it, this
+                                        # counts it twice
+
+  my $second_ref = $slob->seek_and_read_ref(4);
+  say "Entry is for $second_ref->{key}";
+  say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
+
+  my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
+  say "Bin has ", (scalar @{$bin->{positions}}), " entries";
+  say "Value at position $second_ref->{item_index} is ",
+    $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
diff --git a/lib/Slob.pm b/lib/Slob.pm
new file mode 100644 (file)
index 0000000..b1eb4f8
--- /dev/null
@@ -0,0 +1,245 @@
+package Slob;
+
+use 5.014000;
+use strict;
+use warnings;
+our $VERSION = '0.000_001';
+
+use constant MAGIC => "!-1SLOB\x1F";
+
+use Carp qw/croak verbose/;
+use Encode;
+
+sub new {
+       my ($class, $path) = @_;
+       open my $fh, '<', $path or croak "Cannot open \"$path\": $!";
+       my $self = bless {path => $path, fh => $fh}, $class;
+       $self->{header} = $self->read_header;
+       $self
+}
+
+sub read_data {
+       my ($self, $len) = @_;
+       my $data;
+       my $result = read $self->{fh}, $data, $len;
+       if (!defined $result) {
+               croak "Failed to read from $self->{path}: $!"
+       } elsif ($result == $len) {
+               $data
+       } elsif ($result == 0) {
+               croak "$self->{path} is at end of file"
+       } elsif ($result < $len) {
+               croak "Only read $result bytes of $self->{path} before reaching EOF"
+       }
+}
+
+sub read_formatted {
+       my ($self, $len_of_format, $format) = @_;
+       unpack $format, $self->read_data($len_of_format);
+}
+
+sub read_char  { shift->read_formatted(1, 'C') }
+sub read_short { shift->read_formatted(2, 'n') }
+sub read_int   { shift->read_formatted(4, 'N') }
+sub read_long  { shift->read_formatted(8, 'Q>') }
+
+sub read_tiny_text {
+       my ($self, $encoding) = @_;
+       my $data = $self->read_data($self->read_char);
+       if (length $data == 255) {
+               $data = unpack 'Z*', $data;
+       }
+       $encoding //= $self->{encoding};
+       decode $encoding, $data;
+}
+
+sub read_text {
+       my ($self, $encoding) = @_;
+       my $data = $self->read_data($self->read_short);
+       $encoding //= $self->{encoding};
+       decode $encoding, $data;
+}
+
+sub read_large_byte_string {
+       my ($self) = @_;
+       $self->read_data($self->read_short)
+}
+
+sub read_tag {
+       my ($self) = @_;
+       my $name  = $self->read_tiny_text;
+       my $value = $self->read_tiny_text;
+       ($name, $value)
+}
+
+sub read_tags {
+       my ($self) = @_;
+       my $tag_count = $self->read_char;
+       map { $self->read_tag } 1..$tag_count
+}
+
+sub read_content_types {
+       my ($self) = @_;
+       my $content_type_count = $self->read_char;
+       map { $self->read_text } 1..$content_type_count
+}
+
+sub read_positions {
+       my ($self) = @_;
+       my $count = $self->read_int;
+       my @positions = map { $self->read_long } 1..$count;
+       my $relative_to = $self->ftell;
+       map { $relative_to + $_ } @positions
+}
+
+sub fseek {
+       my ($self, $position) = @_;
+       seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
+}
+
+sub ftell {
+       my ($self) = @_;
+       my $result = tell $self->{fh};
+       croak "Failed to tell position in file" if $result == -1;
+       $result
+}
+
+sub uncompress {
+       my ($self, $data) = @_;
+       $data
+}
+
+sub read_header {
+       my ($self) = @_;
+       my $magic = $self->read_data(length MAGIC);
+       croak "Not a SLOB dictionary" unless MAGIC eq $magic;
+       my $uuid = $self->read_data(16);
+
+       my $encoding = $self->read_tiny_text('UTF-8');
+       $self->{encoding} = $encoding;
+
+       my $compression = $self->read_tiny_text;
+       die "Compression not yet supported" if $compression;
+       my %tags = $self->read_tags;
+       my @content_types = $self->read_content_types;
+       my $blob_count = $self->read_int;
+       my $store_offset = $self->read_long;
+       my $size = $self->read_long;
+       my @refs = $self->read_positions;
+
+       $self->fseek($store_offset);
+       my @storage_bins = $self->read_positions;
+
+       +{
+               uuid => $uuid,
+               encoding => $encoding,
+               compression => $compression,
+               tags => \%tags,
+               content_types => \@content_types,
+               blob_count => $blob_count,
+               store_offset => $store_offset,
+               size => $size,
+               refs => \@refs,
+               storage_bins => \@storage_bins,
+       }
+}
+
+sub read_ref {
+       my ($self) = @_;
+       my $key = $self->read_text;
+       my $bin_index = $self->read_int;
+       my $item_index = $self->read_short;
+       my $fragment = $self->read_tiny_text;
+       +{
+               key => $key,
+               bin_index => $bin_index,
+               item_index => $item_index,
+               fragment => $fragment,
+       }
+}
+
+sub read_storage_bin {
+       my ($self) = @_;
+       my $count = $self->read_int;
+       my @content_types = map { $self->read_char } 1..$count;
+       my $compressed_size = $self->read_int;
+       my $compressed_data = $self->read_data($compressed_size);
+       my $uncompressed_data = $self->uncompress($compressed_data);
+
+       my @positions = unpack "N$count", $uncompressed_data;
+       my $data = substr $uncompressed_data, $count * 4;
+       +{
+               positions => \@positions,
+               data => $data
+       }
+}
+
+sub ref_count { shift @{shift->{header}{refs}} }
+
+sub seek_and_read_ref {
+       my ($self, $index) = @_;
+       croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
+       $self->fseek($self->{header}{refs}[$index]);
+       $self->read_ref
+}
+
+sub seek_and_read_storage_bin {
+       my ($self, $index) = @_;
+       croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
+       $self->fseek($self->{header}{storage_bins}[$index]);
+       $self->read_storage_bin
+}
+
+sub get_entry_of_storage_bin {
+       my ($self, $storage_bin, $index) = @_;
+       my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
+       my $length = unpack 'N', $start_of_data;
+       substr $start_of_data, 4, $length;
+}
+
+1;
+__END__
+
+=encoding utf-8
+
+=head1 NAME
+
+Slob - Read .slob dictionaries (as used by Aard 2)
+
+=head1 SYNOPSIS
+
+  use feature qw/:5.14/;
+  use Slob;
+  my $slob = Slob->new('path/to/dict.slob');
+
+  my $nr_of_entries = $slob->ref_count; # if the same content has two
+                                        # keys pointing to it, this
+                                        # counts it twice
+
+  my $second_ref = $slob->seek_and_read_ref(4);
+  say "Entry is for $second_ref->{key}";
+  say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
+
+  my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
+  say "Bin has ", (scalar @{$bin->{positions}}), " entries";
+  say "Value at position $second_ref->{item_index} is ",
+    $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
+
+=head1 DESCRIPTION
+
+No documentation yet, see SYNOPSIS.
+
+=head1 AUTHOR
+
+Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2017 by Marius Gavrilescu
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.26.1 or,
+at your option, any later version of Perl 5 you may have available.
+
+
+=cut
diff --git a/t/Slob.t b/t/Slob.t
new file mode 100644 (file)
index 0000000..b0ab28f
--- /dev/null
+++ b/t/Slob.t
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+use Test::More tests => 6;
+BEGIN { use_ok('Slob') };
+
+my $slob = Slob->new('t/freedict-01.slob');
+
+my $nr_of_entries = $slob->ref_count;
+
+my $second_ref = $slob->seek_and_read_ref(4);
+my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
+
+is $second_ref->{key}, 'abacus';
+is $second_ref->{bin_index}, 0;
+is $second_ref->{item_index}, 161;
+my $count = scalar @{$bin->{positions}};
+is $count, 637;
+
+my $expected = <<'EOF';
+<html><head><link href="~/css/default.css" rel="stylesheet" type="text/css"><link href="~/css/night.css" rel="alternate stylesheet" title="Night" type="text/css"></head><script src="~/js/styleswitcher.js"></script><body><div class="form">
+          <div class="orth">abacus</div><div class="pron">æbəkəs</div></div><ol class="sense single"><li class="sense">
+          <ol class="cit single"><li class="cit trans">
+            <ol class="quote single"><li class="quote">Rechenbrett</li></ol><div class="gramGrp">
+              <div class="gen">m</div></div></li></ol></li></ol></body></html>
+EOF
+chomp $expected;
+is $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}), $expected;
diff --git a/t/freedict-01.slob b/t/freedict-01.slob
new file mode 100644 (file)
index 0000000..c836579
Binary files /dev/null and b/t/freedict-01.slob differ
This page took 0.020293 seconds and 4 git commands to generate.