pygmt.x2sys_cross: Refactor to use virtualfiles for output tables [BREAKING CHANGE: Dummy times in 3rd and 4th columns now have np.timedelta64 type] #3182
Changes from 5 commits
pygmt/src/x2sys_cross.py

```diff
@@ -5,19 +5,19 @@
 import contextlib
 import os
 from pathlib import Path
+from typing import Any, Literal
 
 import pandas as pd
-from packaging.version import Version
 from pygmt.clib import Session
 from pygmt.exceptions import GMTInvalidInput
 from pygmt.helpers import (
-    GMTTempFile,
     build_arg_list,
     data_kind,
     fmt_docstring,
     kwargs_to_strings,
     unique_name,
     use_alias,
+    validate_output_table_type,
 )
@@ -71,7 +71,12 @@ def tempfile_from_dftrack(track, suffix):
     Z="trackvalues",
 )
 @kwargs_to_strings(R="sequence")
-def x2sys_cross(tracks=None, outfile=None, **kwargs):
+def x2sys_cross(
+    tracks=None,
+    output_type: Literal["pandas", "numpy", "file"] = "pandas",
+    outfile: str | None = None,
+    **kwargs,
+):
     r"""
     Calculate crossovers between track data files.
@@ -102,11 +107,8 @@ def x2sys_cross(tracks=None, outfile=None, **kwargs):
         set it will default to $GMT_SHAREDIR/x2sys]. (**Note**: MGD77 files
         will also be looked for via $MGD77_HOME/mgd77_paths.txt and .gmt
         files will be searched for via $GMT_SHAREDIR/mgg/gmtfile_paths).
-
-    outfile : str
-        Optional. The file name for the output ASCII txt file to store the
-        table in.
-
+    {output_type}
+    {outfile}
     tag : str
         Specify the x2sys TAG which identifies the attributes of this data
         type.
@@ -183,68 +185,56 @@ def x2sys_cross(tracks=None, outfile=None, **kwargs):
     Returns
     -------
-    crossover_errors : :class:`pandas.DataFrame` or None
-        Table containing crossover error information.
-        Return type depends on whether the ``outfile`` parameter is set:
-
-        - :class:`pandas.DataFrame` with (x, y, ..., etc) if ``outfile`` is not
-          set
-        - None if ``outfile`` is set (track output will be stored in the set in
-          ``outfile``)
+    crossover_errors
+        Table containing crossover error information. Return type depends on ``outfile``
+        and ``output_type``:
+
+        - None if ``outfile`` is set (output will be stored in file set by ``outfile``)
+        - :class:`pandas.DataFrame` or :class:`numpy.ndarray` if ``outfile`` is not set
+          (depends on ``output_type``)
     """
-    with Session() as lib:
-        file_contexts = []
-        for track in tracks:
-            kind = data_kind(track)
-            if kind == "file":
+    output_type = validate_output_table_type(output_type, outfile=outfile)
+
+    file_contexts: list[contextlib.AbstractContextManager[Any]] = []
+    for track in tracks:
+        match data_kind(track):
+            case "file":
                 file_contexts.append(contextlib.nullcontext(track))
-            elif kind == "matrix":
+            case "matrix":
                 # find suffix (-E) of trackfiles used (e.g. xyz, csv, etc) from
                 # $X2SYS_HOME/TAGNAME/TAGNAME.tag file
-                lastline = (
-                    Path(os.environ["X2SYS_HOME"], kwargs["T"], f"{kwargs['T']}.tag")
-                    .read_text(encoding="utf8")
-                    .strip()
-                    .split("\n")[-1]
-                )  # e.g. "-Dxyz -Etsv -I1/1"
+                tagfile = Path(
+                    os.environ["X2SYS_HOME"], kwargs["T"], f"{kwargs['T']}.tag"
+                )
+                # Last line is like "-Dxyz -Etsv -I1/1"
+                lastline = tagfile.read_text().splitlines()[-1]
```
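For readers skimming the diff, a hypothetical call against the new signature could look like the sketch below; the tag name and track file names are placeholders, and `x2sys_init` is assumed to have been run for the tag beforehand.

```python
import pygmt

# Hypothetical example: "MYTAG" and the track files are made up.
crossovers = pygmt.x2sys_cross(
    tracks=["track_1.xyz", "track_2.xyz"],
    tag="MYTAG",
    output_type="pandas",  # default; set outfile=... to write the table to disk instead
)
print(type(crossovers))  # expected: pandas.DataFrame
```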
Suggested change:

```diff
-                lastline = tagfile.read_text().splitlines()[-1]
+                lastline = tagfile.read_text(encoding="utf8").splitlines()[-1]
```
Added back.
Note that `x2sys_cross` can output multi-segment files (each segment is separated by `IO_SEGMENT_MARKER`, which is `>` by default, see https://docs.generic-mapping-tools.org/6.5/reference/file-formats.html#optional-segment-header-records). If I'm not mistaken, the current `virtualfile_to_dataset` method does not implement multi-segment file handling yet? To be fair though, the current implementation in `x2sys_cross` simply merges all segments into one, since we skip rows starting with `>`, but we need to check that `virtualfile_to_dataset` will return all segments in a multi-segment file instead of just the first one.
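For context, here is a minimal sketch of that merge-all-segments behaviour (toy data and column names, not the actual pygmt code path): treating `>` as a comment character drops the segment headers, so everything collapses into one flat table.

```python
import io

import pandas as pd

# Toy multi-segment table in the x2sys_cross style: segment headers start with ">".
text = "> track_A track_B\n1.0\t2.0\t0.0\t3.0\n> track_A track_C\n1.5\t2.5\t1.0\t4.0\n"

# Rows starting with ">" are skipped entirely, merging both segments into one DataFrame.
df = pd.read_csv(
    io.StringIO(text),
    sep="\t",
    comment=">",
    names=["x", "y", "i_1", "i_2"],
)
print(len(df))  # 2 rows, one per data record, with no segment information left
```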
> If I'm not mistaken, the current `virtualfile_to_dataset` method does not implement multi-segment file handling yet? To be fair though, the current implementation in `x2sys_cross` simply merges all segments into one, since we skip rows starting with `>`, but we need to check that `virtualfile_to_dataset` will return all segments in a multi-segment file instead of just the first one.

Yes. The main problem is that, as far as I know, there is no equivalent way to represent a multi-segment file in pandas. The multi-segment support was also mentioned in #2729 (comment).

If we can have a general way to represent multi-segment data in pandas, then it should be straightforward to output multi-segments from `_GMT_DATASET` to the desired data structure.
> To be fair though, the current implementation in `x2sys_cross` simply merges all segments into one, since we skip rows starting with `>`, but we need to check that `virtualfile_to_dataset` will return all segments in a multi-segment file instead of just the first one.

It's already tested in the `_GMT_DATASET` docstrings (`pygmt/datatypes/dataset.py`, line 215 at 466c8b6): `>>> with GMTTempFile(suffix=".txt") as tmpfile:`

For `x2sys_cross`, we also test the shape of the output `pd.DataFrame`.
Am I understanding the output correctly?
I've never used x2sys, but here is my understanding of the C code and the output:

- The 3rd and 4th columns are datetimes. They can be either absolute datetimes (e.g., `2023-01-01T01:23:45.678`) or dummy datetimes (i.e., double-precision numbers), depending on whether the input tracks contain datetimes.
- Internally, absolute datetimes are also represented as double-precision numbers in GMT, so absolute datetimes and dummy datetimes are the same internally.
- When outputting to a file, GMT will convert the double-precision numbers into absolute datetimes, since GMT knows whether the column holds dummy datetimes or not.
- A `GMT_DATASET` container can only hold double-precision numbers and text strings, so when outputting to a virtual file, the 3rd and 4th columns always contain double-precision numbers. If the column names are `t_1`/`t_2`, then we know they're absolute datetimes and should be converted; otherwise, they are just dummy datetimes and should not be converted. A sketch of this name-based conversion follows below.
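If that reading is correct, the conversion could look roughly like this sketch; it is only an illustration, with made-up values, and it assumes the doubles in `t_1`/`t_2` count seconds since the Unix epoch (GMT's default time epoch).

```python
import pandas as pd

# Toy table as it might arrive from a GMT_DATASET virtual file: the time columns
# are plain doubles whether or not they represent real datetimes.
df = pd.DataFrame({"x": [251.0], "t_1": [1.6725e9], "t_2": [1.6726e9]})

# Convert only when the header names the columns t_1/t_2 (absolute times);
# i_1/i_2 columns would stay as plain numbers.
if {"t_1", "t_2"}.issubset(df.columns):
    df[["t_1", "t_2"]] = df[["t_1", "t_2"]].apply(pd.to_datetime, unit="s")

print(df.dtypes)  # t_1/t_2 become datetime64[ns]
```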
I'm a little unsure if `i_1`/`i_2` are actually dummy datetimes. This is a sample output from x2sys_cross:

```
# Tag: X2SYS4ivlhlo4
# Command: x2sys_cross @tut_ship.xyz -Qi -TX2SYS4ivlhlo4 ->/tmp/lala.txt
# x y i_1 i_2 dist_1 dist_2 head_1 head_2 vel_1 vel_2 z_X z_M
> @tut_ship 0 @tut_ship 0 NaN/NaN/1357.17 NaN/NaN/1357.17
251.004840022 20.000079064 18053.5647431 13446.6562433 333.339586673 229.636557499 269.996783034 270.023614846 NaN NaN 192.232797243 -2957.22757183
251.004840022 20.000079064 18053.5647431 71783.6562433 333.339586673 1148.20975878 269.996783034 270.023614846 NaN NaN 192.232797243 -2957.22757183
250.534946327 20.0000526811 18053.3762934 66989.0210846 332.869692978 1022.68273972 269.996783034 269.360150109 NaN NaN -57.6485957585 -2686.4268008
250.532033147 20.0000525175 18053.3751251 66988.9936489 332.866779797 1022.67977813 269.996783034 22.0133296951 NaN NaN -64.5973890802 -2682.04812157
252.068705 20.000075 13447.5 71784.5 230.700422496 1149.27362378 269.995072235 269.995072235 NaN NaN 0 -3206.5
```

It seems like the `i_1`/`i_2` values vary between rows, but I can't quite remember what they represent... maybe an index of some sort? I might need to inspect the C code to see what's going on. Can you point me to where these `i_1`/`i_2` columns are being output?
Dummy times are just double-precision indexes from 0 to n (xref: https://github.com/GenericMappingTools/gmt/blob/b56be20bee0b8de22a682fdcd458f9b9eeb76f64/src/x2sys/x2sys.c#L533).

The column name (`i_1` or `t_1`) is controlled by the variable `t_or_i` in the C code (https://github.com/GenericMappingTools/gmt/blob/b56be20bee0b8de22a682fdcd458f9b9eeb76f64/src/x2sys/x2sys_cross.c#L998). From https://github.com/GenericMappingTools/gmt/blob/b56be20bee0b8de22a682fdcd458f9b9eeb76f64/src/x2sys/x2sys_cross.c#L568, it's clear that if `got_time` is true, the column is absolute time (`GMT_IS_ABSTIME`); otherwise it holds double-precision numbers (`GMT_IS_FLOAT`).

We can keep the dummy times as double-precision numbers, or treat them as seconds since the Unix epoch and then convert them to absolute times.
> We can keep the dummy times as double-precision numbers, or treat them as seconds since the Unix epoch and then convert them to absolute times.

Maybe convert the relative times to `pandas.Timedelta` or `numpy.timedelta64`? Xref #2848.
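Something along those lines could be as simple as the sketch below (made-up values; the column names follow the `i_1`/`i_2` sample above).

```python
import pandas as pd

# Dummy times arrive as double-precision indexes/record counters.
df = pd.DataFrame({"i_1": [18053.56, 13447.5], "i_2": [13446.66, 71784.5]})

# Store them as relative durations rather than raw floats, so they read as
# time-like values without pretending to be absolute datetimes.
df[["i_1", "i_2"]] = df[["i_1", "i_2"]].apply(pd.to_timedelta, unit="s")
print(df.dtypes)  # both columns become timedelta64[ns]
```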
Sounds good. Done in 9d12ae1.
Honestly, I'm not sure if we should support the `numpy` output type for `x2sys_cross`, because all "columns" will need to be the same dtype in a `np.ndarray`. If there are datetime values in the columns, they will get converted to floating point (?), which makes it more difficult to use later. Try adding a unit test for the `numpy` output_type and see if it makes sense.
You're right. Datetimes are converted to floating points by `df.to_numpy()`. Will remove the `numpy` output type.
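As a rough illustration of the dtype problem (a toy table, not actual pygmt output; the exact promotion depends on pandas' casting rules for the column mix):

```python
import pandas as pd

# A crossover-like toy table with a datetime column next to float columns.
df = pd.DataFrame(
    {
        "x": [251.0],
        "t_1": pd.to_datetime(["2023-01-01T01:23:45.678"]),
        "z_X": [192.23],
    }
)

arr = df.to_numpy()
# Mixed float/datetime columns are promoted to a single common dtype (object
# for this mix), so the per-column dtypes are lost in the resulting ndarray.
print(arr.dtype)
```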