Skip to content

Commit

Permalink
Use lark grammar for nd file parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
imagejan committed Aug 27, 2024
1 parent 225b920 commit 706ee0d
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 79 deletions.
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"lark",
"pandas",
"pydantic",
]

[project.urls]
Expand Down Expand Up @@ -82,3 +84,6 @@ exclude_lines = [
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]

[tool.ruff.lint.flake8-type-checking]
runtime-evaluated-base-classes = ["pydantic.BaseModel"]
36 changes: 36 additions & 0 deletions src/metamorph_mda_parser/lark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from lark import Lark, Transformer


class NDInfoTransformer(Transformer):
    """Turn the parsed lines of an ``.nd`` file into a single dict.

    Numbered entries (``WaveName<n>``, ``WaveDoZ<n>``, ``Stage<n>``) are
    collected into lists, in the order they are encountered, and returned
    under the keys ``"WaveNames"``, ``"WaveDoZ"`` and ``"StagePositions"``.
    Every other line becomes a plain ``key: value`` entry of the result.
    """

    # Key prefix of a numbered entry -> name of the list attribute collecting it.
    # The prefixes are mutually exclusive, so the lookup order is irrelevant.
    _LIST_PREFIXES = (
        ("WaveName", "wave_names"),
        ("Stage", "stage_positions"),
        ("WaveDoZ", "wave_do_z"),
    )

    def __init__(self):
        # Let the Lark base class set up its own state as well.
        super().__init__()
        self._reset_collected()

    def _reset_collected(self):
        """Start with empty lists for the numbered entries."""
        self.wave_names = []
        self.wave_do_z = []
        self.stage_positions = []

    def start(self, items):
        """Build the final dict from the per-line results.

        ``items`` holds what ``line`` returned for each line: a
        ``(key, value)`` tuple, or ``None`` for entries that were collected
        into one of the lists instead.
        """
        result = dict(item for item in items if item is not None)
        result["WaveNames"] = self.wave_names
        result["WaveDoZ"] = self.wave_do_z
        result["StagePositions"] = self.stage_positions
        # Hand the lists over to the result and begin afresh, so that reusing
        # this transformer for another file does not leak earlier entries.
        self._reset_collected()
        return result

    def line(self, key_value):
        """Handle one ``key, value`` line.

        Returns the ``(key, value)`` tuple for ordinary entries and ``None``
        for numbered entries, which are appended to their list instead.
        """
        key, value = key_value
        for prefix, attribute in self._LIST_PREFIXES:
            if key.startswith(prefix):
                getattr(self, attribute).append(value)
                return None  # collected separately, not part of the key/value pairs
        return (key, value)

    def boolean_value(self, b):
        """Convert a ``TRUE``/``FALSE`` token into a Python ``bool``."""
        return b[0].value == "TRUE"


def parse(content):
    """Parse the text of an ``.nd`` file.

    The grammar is read from ``nd_grammar.lark`` located next to this module,
    and a fresh ``NDInfoTransformer`` is applied by the parser itself, so the
    return value is whatever that transformer produces for the start rule.
    """
    nd_parser = Lark.open(
        "nd_grammar.lark",
        rel_to=__file__,
        parser="lalr",
        transformer=NDInfoTransformer(),
    )
    parsed = nd_parser.parse(content)
    return parsed
105 changes: 32 additions & 73 deletions src/metamorph_mda_parser/nd.py
Original file line number Diff line number Diff line change
@@ -1,89 +1,48 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Callable, Literal

if TYPE_CHECKING:
from pathlib import Path
from pathlib import Path
from typing import Literal

import pandas as pd
from pydantic import AliasGenerator, BaseModel, ConfigDict
from pydantic.alias_generators import to_pascal

from metamorph_mda_parser.lark import parse


class NdInfo:
class NdInfo(BaseModel):
model_config = ConfigDict(
alias_generator=AliasGenerator(
validation_alias=to_pascal,
),
)

path: Path
name: str
version: Literal["1.0", "2.0"]
description: str
do_timelapse: bool
do_stage: bool
do_wave: bool
do_z: bool
stage_positions: list[str]
wave_names: list[str]
wave_do_z: list[bool]
n_timepoints: int
n_z_steps: int
z_step_size: float
do_z_series: bool
stage_positions: list[str] = []
wave_names: list[str] = []
wave_do_z: list[bool] = []
n_stage_positions: int = 1
n_time_points: int = 1
n_z_steps: int = 1
z_step_size: float | None = None
wave_in_file_name: bool

def __init__(self, path: Path):
self.path = path
self.name = path.stem
self._parse_nd()

def _parse_nd(self) -> None:
with open(self.path) as nd:
# Version
self.version = self._parse_line(nd.readline(), "NDInfoFile", self._extract_version)
self.description = self._parse_line(nd.readline(), "Description", str)
self.start_time = self._parse_line(nd.readline(), "StartTime1", str)
# Time lapse
self.do_timelapse = self._parse_line(nd.readline(), "DoTimelapse", self._parse_bool)
if self.do_timelapse:
self.n_timepoints = self._parse_line(nd.readline(), "NTimePoints", int)

# Stage positions
self.do_stage = self._parse_line(nd.readline(), "DoStage", self._parse_bool)
if self.do_stage:
n_stage_positions = self._parse_line(nd.readline(), "NStagePositions", int)
self.stage_positions = []
for s in range(n_stage_positions):
self.stage_positions.append(self._parse_line(nd.readline(), f"Stage{s+1}", str))

# Wavelengths
self.do_wave = self._parse_line(nd.readline(), "DoWave", self._parse_bool)
if self.do_wave:
n_wavelengths = self._parse_line(nd.readline(), "NWavelengths", int)
self.wave_names = []
self.wave_do_z = []
for w in range(n_wavelengths):
self.wave_names.append(self._parse_line(nd.readline(), f"WaveName{w+1}", str))
self.wave_do_z.append(self._parse_line(nd.readline(), f"WaveDoZ{w+1}", self._parse_bool))

# Z steps
self.do_z = self._parse_line(nd.readline(), "DoZSeries", self._parse_bool)
self.n_z_steps = self._parse_line(nd.readline(), "NZSteps", int)
self.z_step_size = self._parse_line(nd.readline(), "ZStepSize", float)

self.wave_in_file_name = self._parse_line(nd.readline(), "WaveInFileName", self._parse_bool)

# End of file
last_line = nd.readline()
if last_line.strip(' "\n') != "EndFile":
message = f"Expected end of file, got: {last_line}"
raise ValueError(message)

def _parse_line(self, line: str, key: str, value_function: Callable):
tokens = line.split(",")
if tokens[0].strip(' "') != key:
message = f"Invalid nd file contents.\n\texpected: {key}\n\tgot: {line}"
raise ValueError(message)
return value_function(tokens[1].strip(' "\n'))

def _extract_version(self, value: str) -> str:
return value[8:]

def _parse_bool(self, value: str) -> bool:
return value.lower() == "true"
@staticmethod
def from_path(path: Path):
with open(path) as f:
content = f.read()
result = parse(content)
result["Path"] = path
result["Name"] = path.stem
result["Version"] = "1.0" # HACK
return NdInfo(**result)

def _wavelengths(self):
for i, w in enumerate(self.wave_names):
Expand All @@ -92,7 +51,7 @@ def _wavelengths(self):
i,
w,
f"_w{i+1}{w}" if self.wave_in_file_name else "",
self.wave_do_z[i],
self.wave_do_z[i] if self.wave_do_z else False,
)

def _stage_positions(self):
Expand All @@ -102,7 +61,7 @@ def _stage_positions(self):

def _timepoints(self):
if self.do_timelapse:
for t in range(self.n_timepoints):
for t in range(self.n_time_points):
yield t, f"_t{t+1}"

def _get_path_channel_position_time(self):
Expand Down
40 changes: 40 additions & 0 deletions src/metamorph_mda_parser/nd_grammar.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Lark grammar for nd files (loaded as "nd_grammar.lark" by lark.py).
// A file is a sequence of `"Key", value` lines terminated by "EndFile".
// Rules and terminals whose name starts with an underscore do not show up as
// separate nodes/tokens in the result (Lark inlines/filters them).
start: line+ "\"EndFile\""

// One alternative per value type; the key decides which value form is accepted.
line: _QUOTE _special_key _QUOTE "," _special_value
| _QUOTE _string_key _QUOTE "," _QUOTE _string_value _QUOTE
| _QUOTE _boolean_key _QUOTE "," boolean_value
| _QUOTE _integer_key _QUOTE "," _integer_value
| _QUOTE _float_key _QUOTE "," _float_value

// Keys whose value is free-form, unquoted text matched by /.+/.
_special_key: /NDInfoFile/
| /Description/
| /StartTime[0-9]+/
_special_value: /.+/

// Numbered keys whose value is a quoted string (no embedded double quotes).
_string_key: /WaveName[0-9]+/
| /Stage[0-9]+/
_string_value: /[^"]+/

// Keys whose value is TRUE or FALSE; `boolean_value` is a named rule so the
// transformer can convert it.
_boolean_key: /DoTimelapse/
| /DoStage/
| /DoWave/
| /DoZSeries/
| /WaveInFileName/
| /WaveDoZ[0-9]+/
boolean_value: BOOLEAN

// Keys whose value is an integer.
_integer_key: /NWavelengths/
| /NStagePositions/
| /NTimePoints/
| /NZSteps/
_integer_value: INT

// The step size may be written as a decimal or as a plain integer.
_float_key: /ZStepSize/
_float_value: DECIMAL | INT

// Whitespace between tokens is ignored.
%import common.INT
%import common.DECIMAL
%import common.WS
%ignore WS

// Terminals.
BOOLEAN: "TRUE" | "FALSE"
_QUOTE: "\""
31 changes: 25 additions & 6 deletions tests/test_nd.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def sample_4ch_1pos_1z():


def test_sample_4ch_4pos(sample_4ch_4pos):
nd_info = NdInfo(sample_4ch_4pos)
nd_info = NdInfo.from_path(sample_4ch_4pos)

assert nd_info.version == "1.0"
assert nd_info.description == "File recreated from images."
Expand All @@ -30,7 +30,7 @@ def test_sample_4ch_4pos(sample_4ch_4pos):
assert len(nd_info.stage_positions) == 4
assert nd_info.do_wave
assert len(nd_info.wave_names) == 4
assert nd_info.do_z
assert nd_info.do_z_series
assert nd_info.n_z_steps == 42
assert nd_info.z_step_size == 3.0
assert nd_info.wave_in_file_name
Expand All @@ -52,17 +52,17 @@ def test_sample_4ch_4pos(sample_4ch_4pos):


def test_sample_2ch_75pos_361t(sample_2ch_75pos_361t):
nd_info = NdInfo(sample_2ch_75pos_361t)
nd_info = NdInfo.from_path(sample_2ch_75pos_361t)

assert nd_info.version == "1.0"
assert nd_info.description == "File recreated from images."
assert nd_info.do_timelapse
assert nd_info.n_timepoints == 361
assert nd_info.n_time_points == 361
assert nd_info.do_stage
assert len(nd_info.stage_positions) == 75
assert nd_info.do_wave
assert len(nd_info.wave_names) == 2
assert nd_info.do_z
assert nd_info.do_z_series
assert nd_info.n_z_steps == 25
assert nd_info.z_step_size == 2.0
assert nd_info.wave_in_file_name
Expand All @@ -74,4 +74,23 @@ def test_sample_2ch_75pos_361t(sample_2ch_75pos_361t):


def test_sample_4ch_1pos_1z(sample_4ch_1pos_1z):
nd_info = NdInfo(sample_4ch_1pos_1z)
nd_info = NdInfo.from_path(sample_4ch_1pos_1z)

assert nd_info.version == "1.0"
assert nd_info.description == "File recreated from images."
assert not nd_info.do_timelapse
assert nd_info.n_time_points == 1
assert not nd_info.do_stage
assert len(nd_info.stage_positions) == 0
assert nd_info.do_wave
assert len(nd_info.wave_names) == 4
assert not nd_info.do_z_series
assert nd_info.n_z_steps == 1
assert nd_info.z_step_size is None
assert nd_info.wave_in_file_name
assert nd_info.wave_names == ["confDAPI", "confGFP", "confmCherry", "confCy5"]

files = nd_info.get_files()

assert len(files) == 4
assert all(p.suffix == ".tif" for p in files["path"])

0 comments on commit 706ee0d

Please sign in to comment.