Skip to content

Commit 9a561d6

Browse files
authored
Prevent truncation of cell method descriptions with nested brackets (#4436)
* Add failing test for brackets within comments * Fix test * Separate private function now splits the cell method before it's parsed * Add additional failing test for multiple axis methods * Handle multiple axis cell methods * Update docstring to reflect eventual choices * Add warnings to function, and test them * Significantly simpler warning checks
1 parent b1b77b9 commit 9a561d6

2 files changed

Lines changed: 126 additions & 2 deletions

File tree

lib/iris/fileformats/netcdf.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import os.path
2020
import re
2121
import string
22+
from typing import List
2223
import warnings
2324

2425
import cf_units
@@ -185,13 +186,14 @@
185186
_CM_INTERVAL = "interval"
186187
_CM_METHOD = "method"
187188
_CM_NAME = "name"
189+
_CM_PARSE_NAME = re.compile(r"([\w_]+\s*?:\s+)+")
188190
_CM_PARSE = re.compile(
189191
r"""
190192
(?P<name>([\w_]+\s*?:\s+)+)
191193
(?P<method>[\w_\s]+(?![\w_]*\s*?:))\s*
192194
(?:
193195
\(\s*
194-
(?P<extra>[^\)]+)
196+
(?P<extra>.+)
195197
\)\s*
196198
)?
197199
""",
@@ -203,6 +205,69 @@ class UnknownCellMethodWarning(Warning):
203205
pass
204206

205207

208+
def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]:
    """
    Split a CF cell_methods attribute string into a list of zero or more cell
    methods, each of which is then parsed with a regex to return a list of match
    objects.

    Args:

    * nc_cell_methods: The value of the cell methods attribute to be split.

    Returns:

    * nc_cell_methods_matches: A list of the re.Match objects associated with
      each parsed cell method. The list is empty when no cell method name is
      found outside of brackets.

    Splitting is done based on words followed by colons outside of any brackets.
    Validation of anything other than being laid out in the expected format is
    left to the calling function.

    A UserWarning is issued when a closing bracket has no matching opening
    bracket, and when a (non-blank) portion of the string cannot be parsed
    as a cell method.

    """

    # Find name candidates: start offsets of every "word:" run in the string.
    candidate_inds = [
        m.start() for m in _CM_PARSE_NAME.finditer(nc_cell_methods)
    ]
    candidate_lookup = set(candidate_inds)

    # Find the candidates that fall inside brackets; those are keys within a
    # method's "extra" section (e.g. "interval:", "comment:"), not new names.
    bracketed_inds = set()
    bracket_depth = 0
    for ind, cha in enumerate(nc_cell_methods):
        if cha == "(":
            bracket_depth += 1
        elif cha == ")":
            bracket_depth -= 1
            if bracket_depth < 0:
                msg = (
                    "Cell methods may be incorrectly parsed due to mismatched "
                    "brackets"
                )
                warnings.warn(msg, UserWarning, stacklevel=2)
        if bracket_depth > 0 and ind in candidate_lookup:
            bracketed_inds.add(ind)
    name_start_inds = [
        ind for ind in candidate_inds if ind not in bracketed_inds
    ]

    # Without any name there is nothing to split. Return early rather than
    # indexing into an empty list, and flag non-blank input as unparsed.
    if not name_start_inds:
        if nc_cell_methods.strip():
            msg = (
                f"Failed to fully parse cell method string: {nc_cell_methods}"
            )
            warnings.warn(msg, UserWarning, stacklevel=2)
        return []

    # Each cell method runs from its own name up to the next name, with the
    # last one running to the end of the string.
    name_end_inds = name_start_inds[1:] + [len(nc_cell_methods)]

    # Index the string and match against each substring
    nc_cell_methods_matches = []
    for start_ind, end_ind in zip(name_start_inds, name_end_inds):
        nc_cell_method_str = nc_cell_methods[start_ind:end_ind]
        nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip())
        if not nc_cell_method_match:
            msg = (
                f"Failed to fully parse cell method string: {nc_cell_methods}"
            )
            warnings.warn(msg, UserWarning, stacklevel=2)
            continue
        nc_cell_methods_matches.append(nc_cell_method_match)

    return nc_cell_methods_matches
269+
270+
206271
def parse_cell_methods(nc_cell_methods):
207272
"""
208273
Parse a CF cell_methods attribute string into a tuple of zero or
@@ -226,7 +291,7 @@ def parse_cell_methods(nc_cell_methods):
226291

227292
cell_methods = []
228293
if nc_cell_methods is not None:
229-
for m in _CM_PARSE.finditer(nc_cell_methods):
294+
for m in _split_cell_methods(nc_cell_methods):
230295
d = m.groupdict()
231296
method = d[_CM_METHOD]
232297
method = method.strip()

lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,20 @@ def test_with_interval(self):
4141
res = parse_cell_methods(cell_method_str)
4242
self.assertEqual(res, expected)
4343

44+
def test_multiple_axes(self):
    """Every spacing variant of a two-axis name gives one two-coord method."""
    expected = (
        CellMethod(method="standard_deviation", coords=["lat", "lon"]),
    )
    # The variants differ only in the whitespace before each colon.
    spacing_variants = (
        "lat: lon: standard_deviation",
        "lat: lon : standard_deviation",
        "lat : lon: standard_deviation",
        "lat : lon : standard_deviation",
    )
    for variant in spacing_variants:
        parsed = parse_cell_methods(variant)
        self.assertEqual(parsed, expected)
57+
4458
def test_multiple(self):
4559
cell_method_strings = [
4660
"time: maximum (interval: 1 hr) time: mean (interval: 1 day)",
@@ -85,6 +99,51 @@ def test_comment(self):
8599
res = parse_cell_methods(cell_method_str)
86100
self.assertEqual(res, expected)
87101

102+
def test_comment_brackets(self):
    """Nested brackets inside a comment are kept in the parsed comment."""
    expected_method = CellMethod(
        method="minimum within days",
        coords="time",
        intervals=None,
        comments="18h(day-1)-18h",
    )
    expected = (expected_method,)
    # Same cell method, with and without a space before the name's colon.
    variants = (
        "time: minimum within days (comment: 18h(day-1)-18h)",
        "time : minimum within days (comment: 18h(day-1)-18h)",
    )
    for variant in variants:
        parsed = parse_cell_methods(variant)
        self.assertEqual(parsed, expected)
118+
119+
def test_comment_bracket_mismatch_warning(self):
    """An unmatched closing bracket raises the mismatched-brackets warning."""
    cell_method_strings = [
        "time: minimum within days (comment: 18h day-1)-18h)",
        "time : minimum within days (comment: 18h day-1)-18h)",
    ]
    # assertWarns only uses ``msg`` as the failure message and never compares
    # it with the warning text, so assertWarnsRegex is needed to check the
    # wording of the warning itself.
    expected_msg = (
        "Cell methods may be incorrectly parsed due to mismatched brackets"
    )
    for cell_method_str in cell_method_strings:
        with self.assertWarnsRegex(UserWarning, expected_msg):
            _ = parse_cell_methods(cell_method_str)
130+
131+
def test_badly_formatted_warning(self):
    """A cell method with a name but no method raises the parse warning."""
    # Imported locally because the expected text contains brackets and has
    # to be escaped before being used as a regex.
    import re

    cell_method_strings = [
        # NOTE(review): the unclosed-bracket case below is disabled,
        # presumably because it does not currently trigger the warning -
        # confirm before re-enabling or removing it.
        # "time: maximum (interval: 1 hr comment: first bit "
        # "time: mean (interval: 1 day comment: second bit)",
        "time: (interval: 1 hr comment: first bit) "
        "time: mean (interval: 1 day comment: second bit)",
        "time: maximum (interval: 1 hr comment: first bit) "
        "time: (interval: 1 day comment: second bit)",
    ]
    for cell_method_str in cell_method_strings:
        # assertWarns only uses ``msg`` as the failure message; the warning
        # text itself is only checked by assertWarnsRegex.
        expected_msg = (
            f"Failed to fully parse cell method string: {cell_method_str}"
        )
        with self.assertWarnsRegex(UserWarning, re.escape(expected_msg)):
            _ = parse_cell_methods(cell_method_str)
146+
88147
def test_portions_of_cells(self):
89148
cell_method_strings = [
90149
"area: mean where sea_ice over sea",

0 commit comments

Comments
 (0)