-
Notifications
You must be signed in to change notification settings - Fork 235
GMT_DATASET.to_dataframe: Return an empty DataFrame if a file contains no data #3131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
175ba3c
2e6e277
7482b25
3246e5c
ec59f9c
a2c48d5
1281ec0
b817e91
71cc9b7
065ec12
dbfc2ae
06790e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801 | |
| """ | ||
| GMT dataset structure for holding multiple tables (files). | ||
|
|
||
| This class is only meant for internal use by PyGMT and is not exposed to users. | ||
| See the GMT source code gmt_resources.h for the original C struct definitions. | ||
| This class is only meant for internal use and is not exposed to users. See the GMT | ||
| source code ``gmt_resources.h`` for the original C struct definitions. | ||
|
|
||
| Examples | ||
| -------- | ||
|
|
@@ -156,6 +156,8 @@ def to_dataframe( | |
| the same. The same column in all segments of all tables are concatenated. The | ||
| trailing text column is also concatenated as a single string column. | ||
|
|
||
| If the object contains no data, an empty DataFrame will be returned. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| column_names | ||
|
|
@@ -200,8 +202,8 @@ def to_dataframe( | |
| >>> df.dtypes.to_list() | ||
| [dtype('float64'), dtype('float64'), dtype('float64'), string[python]] | ||
| """ | ||
| # Deal with numeric columns | ||
| vectors = [] | ||
| # Deal with numeric columns | ||
| for icol in range(self.n_columns): | ||
| colvector = [] | ||
| for itbl in range(self.n_tables): | ||
|
|
@@ -226,8 +228,13 @@ def to_dataframe( | |
| pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype()) | ||
| ) | ||
|
|
||
| # Return an empty DataFrame if no columns are found. | ||
| if len(vectors) == 0: | ||
seisman marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return pd.DataFrame() | ||
|
||
|
|
||
| # Create a DataFrame object by concatenating multiple columns | ||
| df = pd.concat(objs=vectors, axis="columns") | ||
seisman marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if column_names is not None: # Assign column names | ||
| if column_names is not None: | ||
| df.columns = column_names | ||
| if dtype is not None: | ||
| df = df.astype(dtype) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| """ | ||
| Tests for GMT_DATASET data type. | ||
| """ | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import pandas as pd | ||
| import pytest | ||
| from pygmt.clib import Session | ||
| from pygmt.helpers import GMTTempFile | ||
|
|
||
|
|
||
| def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None): | ||
| """ | ||
| Read tabular data as pandas.DataFrame object using pandas.read_csv(). | ||
|
|
||
| The parameters have the same meaning as in ``pandas.read_csv()``. | ||
| """ | ||
| try: | ||
| df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header) | ||
| except pd.errors.EmptyDataError: | ||
| # Return an empty DataFrame if the file contains no data | ||
| return pd.DataFrame() | ||
|
|
||
| # By default, pandas reads text strings with whitespaces as multiple columns, but | ||
| # GMT concatenates all trailing text as a single string column. Need to find all | ||
| # string columns (with dtype="object") and combine them into a single string column. | ||
| string_columns = df.select_dtypes(include=["object"]).columns | ||
| if len(string_columns) > 1: | ||
| df[string_columns[0]] = df[string_columns].apply(lambda x: " ".join(x), axis=1) | ||
| df = df.drop(string_columns[1:], axis=1) | ||
| # Convert 'object' to 'string' type | ||
| df = df.convert_dtypes( | ||
| convert_string=True, | ||
| convert_integer=False, | ||
| convert_boolean=False, | ||
| convert_floating=False, | ||
| ) | ||
| return df | ||
|
|
||
|
|
||
| def dataframe_from_gmt(fname): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For reference, GMT provides two special/undocumented modules |
||
| """ | ||
| Read tabular data as pandas.DataFrame using a GMT virtual file. | ||
| """ | ||
| with Session() as lib: | ||
| with lib.virtualfile_out(kind="dataset") as vouttbl: | ||
| lib.call_module("read", f"{fname} {vouttbl} -Td") | ||
| df = lib.virtualfile_to_dataset(vfname=vouttbl) | ||
| return df | ||
|
|
||
|
|
||
| @pytest.mark.benchmark | ||
| def test_dataset(): | ||
| """ | ||
| Test the basic functionality of GMT_DATASET. | ||
| """ | ||
| with GMTTempFile(suffix=".txt") as tmpfile: | ||
| with Path(tmpfile.name).open(mode="w") as fp: | ||
| print(">", file=fp) | ||
| print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp) | ||
| print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp) | ||
| print(">", file=fp) | ||
| print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp) | ||
| print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp) | ||
|
|
||
| df = dataframe_from_gmt(tmpfile.name) | ||
| expected_df = dataframe_from_pandas(tmpfile.name, comment=">") | ||
| pd.testing.assert_frame_equal(df, expected_df) | ||
|
|
||
|
|
||
| def test_dataset_empty(): | ||
| """ | ||
| Make sure that an empty DataFrame is returned if a file contains no data. | ||
| """ | ||
| with GMTTempFile(suffix=".txt") as tmpfile: | ||
| with Path(tmpfile.name).open(mode="w") as fp: | ||
| print("# This is a comment line.", file=fp) | ||
|
|
||
| df = dataframe_from_gmt(tmpfile.name) | ||
| assert df.empty # Empty DataFrame | ||
| expected_df = dataframe_from_pandas(tmpfile.name) | ||
| pd.testing.assert_frame_equal(df, expected_df) | ||
Uh oh!
There was an error while loading. Please reload this page.