Skip to content

Commit

Permalink
Preparing a driver for UniMorph.
Browse files Browse the repository at this point in the history
  • Loading branch information
dan-zeman committed May 11, 2023
1 parent 2d1a873 commit 9dbb6c4
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 2 deletions.
5 changes: 5 additions & 0 deletions Changes.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
{{$NEXT}}

Drivers:
* MUL::Unimorph



3.015 2022-03-05 11:53:12+01:00 Europe/Prague

Features:
Expand Down
109 changes: 109 additions & 0 deletions lib/Lingua/Interset/Tagset/MUL/Unimorph.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# ABSTRACT: Driver for the UniMorph features.
# https://unimorph.github.io/
# Copyright © 2023 Dan Zeman <[email protected]>

package Lingua::Interset::Tagset::MUL::Unimorph;
use strict;
use warnings;
our $VERSION = '3.016';

use utf8;
use open ':utf8';
use namespace::autoclean;
use Moose;
use Lingua::Interset::FeatureStructure;
extends 'Lingua::Interset::Tagset';



#------------------------------------------------------------------------------
# Provides the identifier of this tagset, i.e. the value that belongs in the
# 'tagset' feature of a decoded feature structure. Each driver class has to
# supply its own version of this method. By convention the identifier mirrors
# the last two components of the package name in lowercase: an ISO 639-2
# language code, then '::', then a tagset id specific to that language (for
# instance 'cs::multext').
#------------------------------------------------------------------------------
sub get_tagset_id
{
    my $tagset_id = 'mul::unimorph';
    return $tagset_id;
}



#------------------------------------------------------------------------------
# Decodes a physical tag (string) and returns the corresponding feature
# structure.
#
# Parameters:
#   $self ... the driver object
#   $tag .... UniMorph feature string, e.g. "N;MASC;SG;NOM"
# Returns:
#   A Lingua::Interset::FeatureStructure object whose tagset is set to
#   'mul::unimorph'. Decoding of the individual features is not implemented
#   yet, so no other feature is set at present.
#------------------------------------------------------------------------------
sub decode
{
    my $self = shift;
    my $tag = shift;
    # The result must be a lexical declared here; without the declaration the
    # module does not compile under 'use strict'.
    my $fs = Lingua::Interset::FeatureStructure->new();
    $fs->set_tagset('mul::unimorph');
    # There is a string of feature values, separated by semicolons. The order of
    # the features is not significant, except that the main part of speech always
    # comes first. An undefined tag is treated as an empty string so that split()
    # does not emit an 'uninitialized value' warning.
    my @features = split(/;/, defined($tag) ? $tag : '');
    ###!!!
    # DECODE THE FEATURES HERE.
    ###!!!
    return $fs;
}



#------------------------------------------------------------------------------
# Takes feature structure and returns the corresponding physical tag (string).
#
# Parameters:
#   $self ... the driver object
#   $fs ..... Lingua::Interset::FeatureStructure to be encoded
# Returns:
#   The UniMorph feature values joined by semicolons. Encoding of the
#   individual features is not implemented yet, so the result is currently
#   always the empty string.
#------------------------------------------------------------------------------
sub encode
{
    my $self = shift;
    my $fs = shift; # Lingua::Interset::FeatureStructure
    # The list of output feature values must be declared before it is joined;
    # without the declaration the module does not compile under 'use strict'.
    my @features = ();
    ###!!!
    # ENCODE THE FEATURES HERE.
    ###!!!
    my $tag = join(';', @features);
    return $tag;
}



#------------------------------------------------------------------------------
# Supplies the list of known tags as an array reference. The list is empty for
# now.
###!!! OPEN QUESTION: WHAT SHOULD BE LISTED FOR UNIMORPH? THERE ARE TOO MANY
###!!! POSSIBLE COMBINATIONS OF FEATURE VALUES.
#------------------------------------------------------------------------------
sub list
{
    my $self = shift;
    # A new anonymous array is created on every call.
    return [];
}



1;

=head1 SYNOPSIS

  use Lingua::Interset::Tagset::MUL::Unimorph;
  my $driver = Lingua::Interset::Tagset::MUL::Unimorph->new();
  my $fs = $driver->decode("N;MASC;SG;NOM");

or

  use Lingua::Interset qw(decode);
  my $fs = decode('mul::unimorph', "N;MASC;SG;NOM");

=head1 DESCRIPTION

Interset driver for UniMorph 4.0 feature strings,
see L<https://unimorph.github.io/>.

=head1 SEE ALSO

L<Lingua::Interset>,
L<Lingua::Interset::Tagset>,
L<Lingua::Interset::Tagset::MUL::Uposf>,
L<Lingua::Interset::Atom>,
L<Lingua::Interset::FeatureStructure>

=cut
4 changes: 2 additions & 2 deletions xt/author/test.t
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# We must declare in advance how many tests we are going to perform.
# There are currently three tests per tagset driver.
use Test::More tests => 63*3;
use Test::More tests => 64*3;
use Lingua::Interset qw(get_driver_object);
use Lingua::Interset::Tagset;

Expand Down Expand Up @@ -43,7 +43,7 @@ my @tagsets =
'la::conll', 'la::it', 'la::itconll',
'lt::jablonskis', 'lt::multext',
'mt::mlss',
'mul::google', 'mul::upos',
'mul::google', 'mul::upos', 'mul::unimorph',
'nl::cgn', 'nl::conll',
'no::conll',
'pl::ipipan',
Expand Down

0 comments on commit 9dbb6c4

Please sign in to comment.