From 9dbb6c4f6cc2bbc5824ac268a77141eacd12ca35 Mon Sep 17 00:00:00 2001
From: Dan Zeman <zeman@ufal.mff.cuni.cz>
Date: Fri, 12 May 2023 00:18:11 +0200
Subject: [PATCH] Preparing a driver for UniMorph.

---
 Changes.txt                                |   5 +
 lib/Lingua/Interset/Tagset/MUL/Unimorph.pm | 109 +++++++++++++++++++++
 xt/author/test.t                           |   4 +-
 3 files changed, 116 insertions(+), 2 deletions(-)
 create mode 100644 lib/Lingua/Interset/Tagset/MUL/Unimorph.pm

diff --git a/Changes.txt b/Changes.txt
index d40667a..0a79437 100644
--- a/Changes.txt
+++ b/Changes.txt
@@ -1,5 +1,10 @@
 {{$NEXT}}
 
+Drivers:
+* MUL::Unimorph
+
+
+
 3.015     2022-03-05 11:53:12+01:00 Europe/Prague
 
 Features:
diff --git a/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm b/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm
new file mode 100644
index 0000000..13bdb7c
--- /dev/null
+++ b/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm
@@ -0,0 +1,109 @@
+# ABSTRACT: Driver for the UniMorph features.
+# https://unimorph.github.io/
+# Copyright © 2023 Dan Zeman <zeman@ufal.mff.cuni.cz>
+
+package Lingua::Interset::Tagset::MUL::Unimorph;
+use strict;
+use warnings;
+our $VERSION = '3.016';
+
+use utf8;
+use open ':utf8';
+use namespace::autoclean;
+use Moose;
+use Lingua::Interset::FeatureStructure;
+extends 'Lingua::Interset::Tagset';
+
+
+
+#------------------------------------------------------------------------------
+# Tells which tagset id decoding should store in the 'tagset' feature. Every
+# derived class must (re)define this method! The id mirrors the last two parts
+# of the package name, lowercased: the ISO 639-2 language code, then '::', then
+# a language-specific tagset id (for example 'cs::multext').
+#------------------------------------------------------------------------------
+sub get_tagset_id
+{
+    my $tagset_id = 'mul::unimorph';
+    return $tagset_id;
+}
+
+
+
+#------------------------------------------------------------------------------
+# Decodes a physical tag (string, e.g. "N;MASC;SG;NOM") and returns the
+# corresponding feature structure (Lingua::Interset::FeatureStructure).
+#------------------------------------------------------------------------------
+sub decode
+{
+    my $self = shift;
+    my $tag = shift;
+    my $fs = Lingua::Interset::FeatureStructure->new();
+    $fs->set_tagset('mul::unimorph');
+    # Feature values are separated by semicolons. Their order is not significant,
+    # except that the main part of speech always comes first.
+    my @features = split(/;/, $tag);
+    ###!!!
+    # DECODE THE FEATURES HERE.
+    ###!!!
+    return $fs;
+}
+
+
+
+#------------------------------------------------------------------------------
+# Takes feature structure and returns the corresponding physical tag (string).
+#------------------------------------------------------------------------------
+sub encode
+{
+    my $self = shift;
+    my $fs = shift; # Lingua::Interset::FeatureStructure
+    my @features = (); # UniMorph feature values; nothing fills this list yet
+    ###!!!
+    # ENCODE THE FEATURES HERE.
+    ###!!!
+    return join(';', @features);
+}
+
+
+
+#------------------------------------------------------------------------------
+# Provides the list of known tags as an array reference; it is empty for now.
+###!!! WHAT DO WE DO FOR UNIMORPH HERE? THERE ARE TOO MANY POSSIBLE COMBINATIONS.
+#------------------------------------------------------------------------------
+sub list
+{
+    my ($self) = @_;
+    # A new anonymous array is built on every call, exactly one per invocation.
+    return [];
+}
+
+
+
+1;
+
+=head1 SYNOPSIS
+
+  use Lingua::Interset::Tagset::MUL::Unimorph;
+  my $driver = Lingua::Interset::Tagset::MUL::Unimorph->new();
+  my $fs = $driver->decode("N;MASC;SG;NOM");
+
+or
+
+  use Lingua::Interset qw(decode);
+  my $fs = decode('mul::unimorph', "N;MASC;SG;NOM");
+
+=head1 DESCRIPTION
+
+Interset driver for UniMorph 4.0 feature strings,
+see L<https://unimorph.github.io/>.
+
+=head1 SEE ALSO
+
+L<Lingua::Interset>,
+L<Lingua::Interset::Tagset>,
+L<Lingua::Interset::Tagset::MUL::Uposf>,
+L<Lingua::Interset::Atom>,
+L<Lingua::Interset::FeatureStructure>
+
+=cut
diff --git a/xt/author/test.t b/xt/author/test.t
index 6e14b5d..140b7df 100644
--- a/xt/author/test.t
+++ b/xt/author/test.t
@@ -10,7 +10,7 @@ binmode(STDOUT, ':utf8');
 binmode(STDERR, ':utf8');
 # We must declare in advance how many tests we are going to perform.
 # There are currently three tests per tagset driver.
-use Test::More tests => 63*3;
+use Test::More tests => 64*3;
 use Lingua::Interset qw(get_driver_object);
 use Lingua::Interset::Tagset;
 
@@ -43,7 +43,7 @@ my @tagsets =
     'la::conll', 'la::it', 'la::itconll',
     'lt::jablonskis', 'lt::multext',
     'mt::mlss',
-    'mul::google', 'mul::upos',
+    'mul::google', 'mul::upos', 'mul::unimorph',
     'nl::cgn', 'nl::conll',
     'no::conll',
     'pl::ipipan',