-
Notifications
You must be signed in to change notification settings - Fork 1
/
flair_stats.py
41 lines (34 loc) · 1.19 KB
/
flair_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from flair.datasets import ColumnCorpus
column_format = {0: "token", 1: "ner"}
train_file = "train.txt"
dev_file = "dev.txt"
test_file = "test.txt"
gold_stats = {
"fr": {
"train": 7936,
"dev": 992,
"test": 992
},
"nl": {
"train": 5777,
"dev": 722,
"test": 723
}
}
def check_sentences(reference: int, actual: int, split_name: str):
assert actual == reference, f"Mismatch in number of sentences for {split_name} split"
for language in ["fr", "nl"]:
data_folder = f"./data/{language}"
corpus = ColumnCorpus(data_folder=data_folder,
column_format=column_format,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
comment_symbol="# ",
column_delimiter="\t"
)
print(f"Stats for language: {language}:")
print(corpus)
check_sentences(len(corpus.train), gold_stats[language]["train"], "train")
check_sentences(len(corpus.dev), gold_stats[language]["dev"], "dev")
check_sentences(len(corpus.test), gold_stats[language]["test"], "test")