diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 3842f5d25..1ffd396e0 100755
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -135,6 +135,7 @@
 from .treebanks import UD_OLD_FRENCH
 from .treebanks import UD_GOTHIC
 from .treebanks import UD_WOLOF
+from .treebanks import UD_BELARUSIAN
 from .treebanks import UD_OLD_CHURCH_SLAVONIC
 from .treebanks import UD_COPTIC
 
diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py
index acd138fa6..8c9a7dc42 100755
--- a/flair/datasets/treebanks.py
+++ b/flair/datasets/treebanks.py
@@ -1510,6 +1510,39 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s
         super(UD_WOLOF, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
 
 
+class UD_BELARUSIAN(UniversalDependenciesCorpus):
+    """Corpus for the Belarusian-HSE Universal Dependencies treebank (UD_Belarusian-HSE)."""
+
+    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
+        """Request the dev, test and train CoNLL-U files through ``cached_path`` and set up the corpus.
+
+        :param base_path: Folder under which the dataset folder (the lower-cased class name) is
+            placed; when omitted or empty, ``flair.cache_root / "datasets"`` is used.
+        :param in_memory: Forwarded unchanged to ``UniversalDependenciesCorpus``.
+        :param split_multiwords: Forwarded unchanged to ``UniversalDependenciesCorpus``.
+        """
+        if isinstance(base_path, str):
+            base_path = Path(base_path)
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        # NOTE(review): the target folder passed to cached_path does not include base_path;
+        # presumably it is resolved against flair.cache_root - confirm for a custom base_path.
+        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Belarusian-HSE/master"
+        cached_path(f"{web_path}/be_hse-ud-dev.conllu", Path("datasets") / dataset_name)
+        cached_path(f"{web_path}/be_hse-ud-test.conllu", Path("datasets") / dataset_name)
+        cached_path(f"{web_path}/be_hse-ud-train.conllu", Path("datasets") / dataset_name)
+
+        super(UD_BELARUSIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
+
+
 class UD_COPTIC(UniversalDependenciesCorpus):
     def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
 