-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus.h
118 lines (74 loc) · 2.91 KB
/
corpus.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
//
// corpus.h
// Perceptron GLM NLP Tasks
//
// Created by husnu sensoy on 13/01/14.
// Copyright (c) 2014 husnu sensoy. All rights reserved.
//
#ifndef Perceptron_GLM_NLP_Tasks_corpus_h
#define Perceptron_GLM_NLP_Tasks_corpus_h
#include "darray.h"
#include "hashmap.h"
#include "datastructure.h"
#include "vector.h"
/**
 * Default directory of the CoNLL-formatted treebank.
 *
 * The fallback value is a machine-specific absolute path, so it is only used
 * when the build has not already supplied its own definition, e.g.
 *
 *     cc -DEXAMPLE_CONLL_DIR='"/data/treebank/conll"' ...
 */
#ifndef EXAMPLE_CONLL_DIR
#define EXAMPLE_CONLL_DIR "/Users/husnusensoy/uparse/data/nlp/treebank/treebank-2.0/combined/conll"
#endif

/* Marker strings. NOTE(review): presumably the sentence-end and
 * sentence-start symbols used when building features -- confirm at the
 * call sites, which are not visible in this header. */
#define STOP "<STOP>"
#define START "*"
//static const char* ROOT = "root";

/**
 * Validates a dependency arc `from -> to` for a sentence of `length` words.
 *
 * The condition handed to check() holds when all of the following are true:
 *   - from != to
 *   - 0 <= from <= length
 *   - 1 <= to   <= length
 *
 * check() is not defined in this header; it has to come from one of the
 * included headers or from the including file.
 * NOTE(review): presumably a dbg.h-style macro that logs the message and
 * jumps to an `error:` label when the condition is false -- confirm.
 *
 * Every argument is expanded more than once, so pass only side-effect-free
 * expressions. The arguments are formatted with %d and must therefore be int.
 */
#define IS_ARC_VALID(from, to, length)                                     \
    check((from) != (to) && (from) <= (length) && (from) >= 0 &&           \
              (to) >= 1 && (to) <= (length),                               \
          "Arc between suspicious words %d to %d for sentence length %d",  \
          (from), (to), (length))

/* Sentence-length limit. NOTE(review): how and where the limit is applied
 * is not visible in this header. */
#define MAX_SENT_LENGTH 20
/**
 * A single word (token) record.
 *
 * NOTE(review): the field meanings below are inferred from the field names
 * and from the CoNLL naming used throughout this header -- confirm against
 * the parser that fills the structure.
 */
struct Word {
    int id;                 /* word identifier; presumably its index within the sentence */
    int parent;             /* presumably the index of the reference (gold) head word */
    int predicted_parent;   /* Parent predicted by the model. */
    char *form;             /* presumably the surface form of the word */
    char *postag;           /* presumably the part-of-speech tag */
    DArray *conll_piece;    /* presumably the raw columns of the CoNLL line */
    vector embedding;       /* embedding associated with the word */
};

/* Handle type: `Word` is a POINTER to `struct Word`, not the struct itself. */
typedef struct Word *Word;
//Word parse_word( char* line, bool read_vector );
/**
 * Creates a FeatureMatrix (declaration only; the definition lives elsewhere).
 *
 * @param sent_length            sentence length the matrix is created for
 * @param embedding_length       length of the embedding part
 *                               (NOTE(review): unit/meaning not visible here)
 * @param has_discrete_features  whether discrete features are used
 * @return the created FeatureMatrix
 */
FeatureMatrix FeatureMatrix_create(int sent_length,
                                   uint32_t embedding_length,
                                   bool has_discrete_features);
/**
 * Selector for the transformation applied to embeddings.
 *
 * The numeric values are spelled out; they are the same values the compiler
 * assigns implicitly (0, 1, 2). The tag keeps its existing spelling
 * ("Tranformation") because other code refers to it by that name.
 * NOTE(review): the names suggest cubic / quadratic / linear expansions --
 * confirm where the selector is consumed.
 */
enum EmbeddingTranformation {
    CUBIC = 0,
    QUADRATIC = 1,
    LINEAR = 2
};
/**
 * State of a CoNLL corpus.
 *
 * NOTE(review): the field meanings below are inferred from the field names;
 * confirm against the functions that build and read the corpus.
 */
struct CoNLLCorpus {
    const char *base_dir;                 /* base directory of the corpus files */
    DArray *sections;                     /* presumably the sections to be read */
    DArray *sentences;                    /* presumably the sentences read so far */
    bool hasembeddings;                   /* whether embeddings are available */
    DArray *disrete_patterns_parts;       /* discrete pattern parts (field name spelling kept as is) */
    Word Root;                            /* presumably the artificial root word */
    size_t word_embedding_dimension;      /* dimension of a single word embedding */
    size_t transformed_embedding_length;  /* embedding length after transformation */
};

/* Handle type: `CoNLLCorpus` is a POINTER to `struct CoNLLCorpus`. */
typedef struct CoNLLCorpus *CoNLLCorpus;
/**
 * Description of one embedding pattern.
 *
 * NOTE(review): the legal values of `node` / `subnode` and the reference
 * point of `offset` are not visible in this header -- confirm at the place
 * where patterns are parsed.
 */
struct EmbeddingPattern {
    int offset;     /* offset component of the pattern */
    char node;      /* node selector character */
    char subnode;   /* sub-node selector character */
};

/* Handle type: `EmbeddingPattern` is a POINTER to `struct EmbeddingPattern`. */
typedef struct EmbeddingPattern *EmbeddingPattern;
/**
 * Creates a corpus handle (declaration only; the definition lives elsewhere).
 *
 * @param base_dir             base directory of the corpus files
 * @param sections             sections of the corpus
 * @param embedding_dimension  dimension of the word embeddings
 * @param discrete_patterns    discrete feature patterns
 * @return the created corpus handle
 */
CoNLLCorpus create_CoNLLCorpus(const char *base_dir,
                               DArray *sections,
                               int embedding_dimension,
                               DArray *discrete_patterns);

/**
 * Reads the corpus.
 *
 * @param coprus                the corpus to read into (parameter name kept as is)
 * @param build_feature_matrix  whether feature matrices are built while reading
 */
void read_corpus(CoNLLCorpus coprus, bool build_feature_matrix);

/**
 * Releases a corpus.
 *
 * @param corpus               the corpus to release
 * @param free_feature_matrix  whether feature matrices are released as well
 */
void free_CoNLLCorpus(CoNLLCorpus corpus, bool free_feature_matrix);
/**
 * Adds a word to a sentence.
 *
 * @param sentence  sentence that receives the word
 * @param word      word to add
 */
void add_word(FeaturedSentence sentence, Word word);

/**
 * Creates a FeaturedSentence.
 *
 * Declared with an explicit `(void)` parameter list: before C23 an empty
 * `()` declares a function with unspecified parameters, so the compiler
 * would not diagnose a call that passes arguments by mistake.
 *
 * @return the created sentence
 */
FeaturedSentence FeatureSentence_create(void);

/**
 * Releases a FeaturedSentence.
 *
 * @param sent        sentence to release
 * @param free_words  whether the words of the sentence are released as well
 */
void FeatureSentence_free(FeaturedSentence sent, bool free_words);
/**
 * @brief Builds the feature matrix of one sentence of the corpus.
 *
 * @param featuremap    String-to-integer map for discrete features.
 * @param corpus        CoNLLCorpus object.
 * @param sentence_idx  Index of the sentence for which the feature matrix
 *                      is built.
 */
void set_FeatureMatrix(Hashmap *featuremap,
                       CoNLLCorpus corpus,
                       int sentence_idx);

/**
 * @brief Releases a feature matrix.
 *
 * @param matrix  Feature matrix to release.
 */
void free_featureMatrix(FeatureMatrix matrix);

/**
 * @brief Releases the feature matrix that belongs to one sentence.
 *
 * @param corpus        CoNLLCorpus object.
 * @param sentence_idx  Index of the sentence whose feature matrix is released.
 */
void free_feature_matrix(CoNLLCorpus corpus, int sentence_idx);
/**
 * Builds the adjacency matrix of one sentence from weight vectors.
 *
 * @param corpus        corpus that holds the sentence
 * @param sentence_idx  index of the sentence
 * @param embeddings_w  weights for the embedding features
 * @param discrete_w    weights for the discrete features
 */
void build_adjacency_matrix(CoNLLCorpus corpus,
                            int sentence_idx,
                            vector embeddings_w,
                            vector discrete_w);

/**
 * Sets the adjacency matrix of one sentence using a kernel perceptron.
 *
 * @param corpus        corpus that holds the sentence
 * @param sentence_idx  index of the sentence
 * @param kp            kernel perceptron model
 */
void set_adjacency_matrix(CoNLLCorpus corpus,
                          int sentence_idx,
                          KernelPerceptron kp);

/**
 * Variant of set_adjacency_matrix with an extra switch.
 * NOTE(review): what makes it "fast" is not visible in this header.
 *
 * @param corpus         corpus that holds the sentence
 * @param sentence_idx   index of the sentence
 * @param kp             kernel perceptron model
 * @param use_avg_alpha  presumably selects averaged alpha values -- confirm
 */
void set_adjacency_matrix_fast(CoNLLCorpus corpus,
                               int sentence_idx,
                               KernelPerceptron kp,
                               bool use_avg_alpha);

/**
 * Releases one sentence of the corpus.
 *
 * @param corpus        corpus that holds the sentence
 * @param sentence_idx  index of the sentence to release
 */
void free_FeaturedSentence(CoNLLCorpus corpus, int sentence_idx);

/**
 * Produces the embedding feature vector for the word pair (from, to).
 *
 * @param sent    sentence that contains both words
 * @param from    index of the first word
 * @param to      index of the second word
 * @param target  NOTE(review): presumably an output vector to fill or reuse
 *                -- confirm against the definition
 * @return the embedding feature vector
 */
vector embedding_feature(FeaturedSentence sent, int from, int to, vector target);
#endif