117 changes: 98 additions & 19 deletions echopop/nwfsc_feat/ingest_nasc.py
@@ -1247,9 +1247,6 @@ def filter_transect_intervals(
):
transect_filter_df.rename(columns={"transect": "transect_num"}, inplace=True)

# Store original columns
original_columns = nasc_df.columns.tolist()

# Apply a filter, if needed
if subset_filter is not None:
# Extract tokens from string
@@ -1272,23 +1269,105 @@ def filter_transect_intervals(
if missing:
raise ValueError(f"Invalid column(s): {', '.join(missing)}")
else:
transect_filter_df = transect_filter_df.query(subset_filter)

# Perform a join to pair each row in nasc_df with matching rows in transect_filter_df
expanded_df = nasc_df.merge(
transect_filter_df[["transect_num", "log_start", "log_end"]], on="transect_num", how="left"
)

# Check for overlap between the distance range and log range
mask = (
(expanded_df["distance_e"] >= expanded_df["log_start"])
& (expanded_df["distance_s"] <= expanded_df["log_end"])
) | (expanded_df["log_start"].isna() | expanded_df["log_end"].isna())

# Apply mask and keep only original columns
filtered_df = expanded_df[mask].filter(original_columns).reset_index(drop=True)
transect_filter_df = transect_filter_df.query(subset_filter).sort_values(
["transect_num"]
)

return filtered_df
# Sort transect filter by vessel log distance start values
transect_filter_df = transect_filter_df.sort_values("log_start").reset_index(drop=True)

# Get arrays for easier processing
filter_transect_nums = transect_filter_df["transect_num"].values
filter_log_starts = transect_filter_df["log_start"].values
filter_log_ends = transect_filter_df["log_end"].values
unique_filter_transects = np.unique(filter_transect_nums)

# Get NASC data arrays
nasc_transect_nums = nasc_df["transect_num"].values
nasc_distance_starts = nasc_df["distance_s"].values
nasc_distance_ends = nasc_df["distance_e"].values

# Initialize removal list
indices_to_remove = []

# Process each unique transect that has filter intervals
for transect in unique_filter_transects:
# ---- Find filter interval indices for this transect
filter_interval_indices = np.where(filter_transect_nums == transect)[0]
num_intervals = len(filter_interval_indices)
# ---- Find NASC data indices for this transect
nasc_data_indices = np.where(nasc_transect_nums == transect)[0]
# ---- Skip if empty
if len(nasc_data_indices) == 0:
continue
# ---- Initialize current transect removal indices
current_removal_indices = []
# ---- Iterate through gaps
if num_intervals > 1:
# ---- Multiple intervals case
for j in range(num_intervals):
if j == 0:
# ---- Case 1: Remove data before first interval
condition_indices = np.where(
nasc_distance_ends[nasc_data_indices]
< filter_log_starts[filter_interval_indices[0]]
)[0]
elif j == num_intervals - 1:
# ---- Case 2: Remove data after last interval OR between last two intervals
condition_indices = np.where(
(
nasc_distance_starts[nasc_data_indices]
> filter_log_ends[filter_interval_indices[num_intervals - 1]]
)
| (
(
nasc_distance_starts[nasc_data_indices]
> filter_log_ends[filter_interval_indices[j - 1]]
)
& (
nasc_distance_ends[nasc_data_indices]
< filter_log_starts[filter_interval_indices[j]]
)
)
)[0]
else:
# ---- Case 3: Remove data between intervals j-1 and j
condition_indices = np.where(
(
nasc_distance_starts[nasc_data_indices]
> filter_log_ends[filter_interval_indices[j - 1]]
)
& (
nasc_distance_ends[nasc_data_indices]
< filter_log_starts[filter_interval_indices[j]]
)
)[0]
# ---- Add matching NASC indices to removal list
if len(condition_indices) > 0:
current_removal_indices.extend(nasc_data_indices[condition_indices])
else:
# ---- Single interval case: Remove data before start OR after end
condition_indices = np.where(
(
nasc_distance_ends[nasc_data_indices]
< filter_log_starts[filter_interval_indices[0]]
)
| (
nasc_distance_starts[nasc_data_indices]
> filter_log_ends[filter_interval_indices[0]]
)
)[0]
if len(condition_indices) > 0:
current_removal_indices.extend(nasc_data_indices[condition_indices])
# ---- Add all removal indices for this transect
indices_to_remove.extend(current_removal_indices)

# Create boolean mask for rows to keep
keep_mask = np.ones(len(nasc_df), dtype=bool)
keep_mask[indices_to_remove] = False

# Return the filtered dataframe
return nasc_df[keep_mask].reset_index(drop=True)


def convert_afsc_nasc_to_feat(
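For reviewers who want to exercise the rewritten gap-removal logic locally, a minimal sketch follows. The keyword names (nasc_df, transect_filter_df, subset_filter) mirror the variables used inside the diff and are assumptions about the actual signature of filter_transect_intervals, which is not fully visible in this hunk.

    import pandas as pd

    from echopop.nwfsc_feat.ingest_nasc import filter_transect_intervals

    # Toy NASC intervals on transect 1 spanning vessel-log distances 0-10
    nasc_df = pd.DataFrame(
        {
            "transect_num": [1, 1, 1, 1, 1],
            "distance_s": [0.0, 2.0, 4.5, 6.0, 8.0],
            "distance_e": [2.0, 4.0, 5.5, 8.0, 10.0],
        }
    )

    # Two retained log intervals; the row spanning 4.5-5.5 sits entirely in the gap
    transect_filter_df = pd.DataFrame(
        {
            "transect_num": [1, 1],
            "log_start": [0.0, 6.0],
            "log_end": [4.0, 10.0],
        }
    )

    filtered = filter_transect_intervals(
        nasc_df=nasc_df,
        transect_filter_df=transect_filter_df,
        subset_filter=None,  # assumed keyword; no additional row filter applied
    )
    # Expected under the new logic: the 4.5-5.5 row is dropped, the other four are kept
    print(filtered)

This exercises the between-interval removal branch of the new loop (a NASC interval lying strictly inside the gap between two retained log intervals on the same transect).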
1 change: 1 addition & 0 deletions echopop/validators/kriging.py
@@ -17,6 +17,7 @@ class KrigingParameters(
search_radius: float = Field(gt=0.0, allow_inf_nan=False)

@model_validator(mode="after")
@classmethod
def validate_k_interval(cls, values):
# Get `k_min` and `k_max`
k_min = getattr(values, "k_min", 5)
1 change: 1 addition & 0 deletions echopop/validators/spatial.py
@@ -111,6 +111,7 @@ def validate_init(cls, v):
return v

@model_validator(mode="after")
@classmethod
def validate_coordinate_overlap(cls, values):
# Get the mesh and transects DataFrames
mesh = getattr(values, "mesh")
1 change: 1 addition & 0 deletions echopop/validators/variogram.py
@@ -93,6 +93,7 @@ def validate_transects(cls, v):
return spatial.TransectsDF.validate(v)

@model_validator(mode="after")
@classmethod
def validate_df_columns(cls, values):
# Get the mesh and transects DataFrames
coords = values.coordinate_names
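All three validator files get the same tweak: @classmethod is stacked under @model_validator(mode="after"). Below is a self-contained toy sketch of that decorator arrangement (an illustrative model, not echopop's actual KrigingParameters); the getattr-style access to the validated instance mirrors what the existing validators do, and the exact binding semantics depend on the Pydantic v2 release in use.

    from pydantic import BaseModel, Field, model_validator


    class ToyKrigingParams(BaseModel):
        # Hypothetical bounds, loosely modeled on the k_min/k_max check in kriging.py
        k_min: int = Field(default=5, ge=1)
        k_max: int = Field(default=10, ge=1)

        @model_validator(mode="after")
        @classmethod
        def validate_k_interval(cls, values):
            # `values` holds the validated model; compare the two bounds
            k_min = getattr(values, "k_min", 5)
            k_max = getattr(values, "k_max", 10)
            if k_min > k_max:
                raise ValueError("`k_min` must be less than or equal to `k_max`")
            return values


    ToyKrigingParams(k_min=3, k_max=8)     # validates
    # ToyKrigingParams(k_min=12, k_max=8)  # would raise a ValidationError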