ENH(+RF): add acq_time_precise after acq_time to _scans

yarikoptic · yarikoptic · commit d56ecf00abf2 · 2020-05-12T21:27:07.000-04:00
Unfortunately, BIDS at the moment does not provide for adding .microseconds
to acq_time within _scans files; it explicitly stops at "seconds".  So, to
avoid breaking existing tools supporting BIDS and to stay compliant, we
introduce sub-second precision by adding an ad-hoc field (since ad-hoc
fields are allowed).

Note: in this commit I have not taken advantage of the prior commit, which
introduced the microseconds keyword argument to get_datetime, so as not to
waste CPU cycles on re-parsing and re-conversion.

While at it, also centralized the definition of which column headers to
expect.
diff --git a/heudiconv/bids.py b/heudiconv/bids.py
@@ -26,6 +26,23 @@
 
 lgr = logging.getLogger(__name__)
 
+# Fields to be populated in _scans files. Order matters: it sets the column order
+SCANS_FILE_FIELDS = OrderedDict([
+    ("filename", OrderedDict([
+        ("Description", "Name of the nifti file")])),
+    ("acq_time", OrderedDict([
+        ("LongName", "Acquisition time"),
+        ("Description", "Acquisition time of the particular scan")])),
+    ("acq_time_precise", OrderedDict([
+        ("LongName", "Acquisition time precise"),
+        ("Description", "Acquisition time of the particular scan "
+                        "(with microseconds if available)")])),
+    ("operator", OrderedDict([
+        ("Description", "Name of the operator")])),
+    ("randstr", OrderedDict([
+        ("LongName", "Random string"),
+        ("Description", "md5 hash of UIDs")])),
+])
 
 class BIDSError(Exception):
     pass
@@ -360,22 +377,9 @@ def add_rows_to_scans_keys_file(fn, newrows):
         # _scans.tsv). This auto generation will make BIDS-validator happy.
         scans_json = '.'.join(fn.split('.')[:-1] + ['json'])
         if not op.lexists(scans_json):
-            save_json(scans_json,
-                OrderedDict([
-                    ("filename", OrderedDict([
-                        ("Description", "Name of the nifti file")])),
-                    ("acq_time", OrderedDict([
-                        ("LongName", "Acquisition time"),
-                        ("Description", "Acquisition time of the particular scan")])),
-                    ("operator", OrderedDict([
-                        ("Description", "Name of the operator")])),
-                    ("randstr", OrderedDict([
-                        ("LongName", "Random string"),
-                        ("Description", "md5 hash of UIDs")])),
-                ]),
-                sort_keys=False)
+            save_json(scans_json, SCANS_FILE_FIELDS, sort_keys=False)
 
-    header = ['filename', 'acq_time', 'operator', 'randstr']
+    header = SCANS_FILE_FIELDS
     # prepare all the data rows
     data_rows = [[k] + v for k, v in fnames2info.items()]
     # sort by the date/filename
@@ -399,7 +403,7 @@ def get_formatted_scans_key_row(dcm_fn):
     Returns
     -------
     row: list
-        [ISO acquisition time, performing physician name, random string]
+        [date time, date time (with microseconds), performing physician name, random string]
 
     """
     dcm_data = dcm.read_file(dcm_fn, stop_before_pixels=True, force=True)
@@ -424,7 +428,7 @@ def get_formatted_scans_key_row(dcm_fn):
         perfphys = dcm_data.PerformingPhysicianName
     except AttributeError:
         perfphys = ''
-    row = [acq_time, perfphys, randstr]
+    row = [acq_time.split('.')[0], acq_time, perfphys, randstr]
     # empty entries should be 'n/a'
     # https://github.com/dartmouth-pbs/heudiconv/issues/32
     row = ['n/a' if not str(e) else e for e in row]
diff --git a/heudiconv/tests/test_heuristics.py b/heudiconv/tests/test_heuristics.py
@@ -111,13 +111,15 @@ def test_scans_keys_reproin(tmpdir, invocation):
         reader = csv.reader(f, delimiter='\t')
         for i, row in enumerate(reader):
             if i == 0:
-                assert(row == ['filename', 'acq_time', 'operator', 'randstr'])
-            assert(len(row) == 4)
+                assert(row == ['filename', 'acq_time', 'acq_time_precise', 'operator', 'randstr'])
+            assert(len(row) == 5)
             if i != 0:
                 assert(os.path.exists(pjoin(dirname(scans_keys[0]), row[0])))
-                assert(re.match(
-                    '^[\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}.[\d]{6}$',
-                    row[1]))
+                bidsdatetime_regex = '^[\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}$'
+                # regular acq_time without .microseconds since BIDS did not envision it
+                assert(re.match(bidsdatetime_regex, row[1]))
+                # acq_time_precise
+                assert(re.match(bidsdatetime_regex.replace("$", ".[\d]{6}$"), row[2]))
 
 
 @patch('sys.stdout', new_callable=StringIO)
diff --git a/heudiconv/tests/test_main.py b/heudiconv/tests/test_main.py
@@ -115,7 +115,6 @@ def test_prepare_for_datalad(tmpdir):
 
     from datalad.api import Dataset
     superds = Dataset(str(tmpdir))
-
     assert superds.is_installed()
     assert not superds.repo.dirty
     subdss = superds.subdatasets(recursive=True, result_xfm='relpaths')
@@ -172,14 +171,15 @@ def test_get_formatted_scans_key_row():
          % TESTS_DATA_PATH
 
     row1 = get_formatted_scans_key_row(dcm_fn)
-    assert len(row1) == 3
-    assert row1[0] == '2016-10-14T09:26:36.693000'
-    assert row1[1] == 'n/a'
-    prandstr1 = row1[2]
+    assert len(row1) == 4
+    assert row1[0] == '2016-10-14T09:26:36'
+    assert row1[1] == '2016-10-14T09:26:36.693000'
+    assert row1[2] == 'n/a'
+    prandstr1 = row1[3]
 
     # if we rerun - should be identical!
     row2 = get_formatted_scans_key_row(dcm_fn)
-    prandstr2 = row2[2]
+    prandstr2 = row2[3]
     assert(prandstr1 == prandstr2)
     assert(row1 == row2)
     # So it is consistent across pythons etc, we use explicit value here
@@ -189,7 +189,7 @@ def test_get_formatted_scans_key_row():
     row3 = get_formatted_scans_key_row(
         "%s/01-anat-scout/0001.dcm" % TESTS_DATA_PATH)
     assert(row3 != row1)
-    prandstr3 = row3[2]
+    prandstr3 = row3[3]
     assert(prandstr1 != prandstr3)
     assert(prandstr3 == "fae3befb")
 
@@ -211,7 +211,7 @@ def _check_rows(fn, rows):
                 rows_loaded.append(row)
         for i, row_ in enumerate(rows_loaded):
             if i == 0:
-                assert(row_ == ['filename', 'acq_time', 'operator', 'randstr'])
+                assert(row_ == ['filename', 'acq_time', 'acq_time_precise', 'operator', 'randstr'])
             else:
                 assert(rows[row_[0]] == row_[1:])
         # dates, filename should be sorted (date "first", filename "second")