93 changes: 93 additions & 0 deletions EchoPro/computation/biology.py
@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from ..computation.spatial import correct_transect_intervals
from ..computation.operations import group_merge

def index_sex_weight_proportions( biology_dict: dict ):
"""
Generate dataframe containing sex-stratified weight proportions

Parameters
----------
biology_dict: dict
Biology data attribute dictionary
"""

# Age-stratified weight proportions
age_stratified_proportions = biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_included' ][ 'weight_proportions' ]

# Age-stratified & sex-indexed weight proportions
age_sex_stratified_proportions = biology_dict[ 'weight' ][ 'sex_stratified' ][ 'weight_proportions' ]

# Concatenate the two to add a 'total' category
return (
pd.concat(
[ ( age_sex_stratified_proportions
.rename( columns = { 'weight_sex_stratum_proportion': 'weight_proportion' } ) ) ,
( age_stratified_proportions.assign( sex = 'total' )
.rename( columns = { 'weight_stratum_proportion': 'weight_proportion' } ) ) ]
)
)
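
An illustrative usage sketch: the nested dictionary keys follow the structure accessed above, while the strata and proportion values below are hypothetical toy data, and the import path is assumed from the file path in this diff.

import pandas as pd
from EchoPro.computation.biology import index_sex_weight_proportions

biology_dict = {
    'weight': {
        'age_stratified': {
            'age_1_included': {
                'weight_proportions': pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                                      'weight_stratum_proportion': [ 0.4 , 0.6 ] } )
            }
        } ,
        'sex_stratified': {
            'weight_proportions': pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 , 2 ] ,
                                                  'sex': [ 'male' , 'female' , 'male' , 'female' ] ,
                                                  'weight_sex_stratum_proportion': [ 0.2 , 0.2 , 0.3 , 0.3 ] } )
        }
    }
}

# Sexed rows keep their 'sex' label; the age-stratified rows are appended under sex = 'total',
# with both proportion columns renamed to 'weight_proportion'
proportions = index_sex_weight_proportions( biology_dict )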

def index_transect_age_sex_proportions( acoustics_dict: dict ,
biology_dict: dict ,
info_strata: pd.DataFrame ):
"""
Prepares the age- and sex-stratified dataframe for biomass calculation

Parameters
----------
acoustics_dict: dict
Acoustic data attribute dictionary
biology_dict: dict
Biology data attribute dictionary
info_strata: pd.DataFrame
Dataframe containing strata definitions
"""

### Prepare initial dataframes used for calculating population statistics
# Construct georeferenced dataframe containing NASC data
nasc_interval_df = correct_transect_intervals( acoustics_dict[ 'nasc' ][ 'nasc_df' ] )

### Call additional dataframes needed to merge with the NASC data and subsequently calculate
### population-level metrics (and later statistics)
# Sex-stratum-indexed proportions and average weight
weight_sex_strata = biology_dict[ 'weight' ][ 'weight_strata_df' ]

# Stratum-averaged sigma_bs
sigma_bs_strata = acoustics_dict[ 'sigma_bs' ][ 'strata_mean' ]

# Adult NASC proportions for each stratum (number)
# !!! TODO: Currently only uses 'age_1_excluded' -- this should become an argument that
## toggles between excluding and including age-1 fish.
## This is not a major issue since both 'NASC_*' columns can be pivoted to create a single
## NASC column, so the column name does not have to be hard-coded. This could then correspond
## to the configuration settings in some way, or this may be where the argument comes into
## play, where the dataframe can simply be filtered based on the input/selection.
nasc_adult_number_proportions = (
biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_excluded' ][ 'number_proportions' ]
.rename( columns = { 'number_proportion': 'adult_number_proportion' } )
)

# Adult NASC proportions for each stratum (weight)
nasc_adult_weight_proportions = (
biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_excluded' ][ 'weight_proportions' ]
.rename( columns = { 'weight_proportion': 'adult_weight_proportion' } )
)

### Consolidate dataframes that will be added into a list
dataframes_to_add = [ nasc_interval_df , sigma_bs_strata , weight_sex_strata , nasc_adult_number_proportions ,
nasc_adult_weight_proportions ]

## Merge the relevant dataframes
return (
nasc_interval_df
# Merge stratum information ( how = 'outer' since missing values will be filled later on)
.merge( info_strata , on = [ 'stratum_num' , 'haul_num' ] , how = 'outer' )
# Drop unused hauls
.dropna( subset = 'transect_num' )
# Fill NaN w/ 0's for 'fraction_hake'
.assign( fraction_hake = lambda x: x[ 'fraction_hake' ].fillna( 0 ) )
# Group merge
.group_merge( dataframes_to_add = dataframes_to_add , on = 'stratum_num' )
)
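
The merge/imputation pattern at the end of this function can be illustrated in isolation; the toy frames and values below are hypothetical, and only the plain-pandas portion of the chain is reproduced.

import numpy as np
import pandas as pd

nasc_interval_df = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] ,
                                   'haul_num': [ 10 , 11 , 12 ] ,
                                   'transect_num': [ 1.0 , 1.0 , np.nan ] ,
                                   'fraction_hake': [ 0.8 , np.nan , 0.5 ] } )
info_strata = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] ,
                              'haul_num': [ 10 , 11 , 12 ] } )

prepared = (
    nasc_interval_df
    # Outer merge keeps hauls that only appear in the strata definitions
    .merge( info_strata , on = [ 'stratum_num' , 'haul_num' ] , how = 'outer' )
    # Hauls without an associated transect are dropped
    .dropna( subset = 'transect_num' )
    # Missing 'fraction_hake' values default to 0
    .assign( fraction_hake = lambda x: x[ 'fraction_hake' ].fillna( 0 ) )
)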
83 changes: 82 additions & 1 deletion EchoPro/computation/operations.py
@@ -3,6 +3,7 @@
from ..utils.monkey_patch_dataframe import patch_method_to_DataFrame
from typing import Union , List
from typing import Callable
from functools import reduce

@patch_method_to_DataFrame( pd.DataFrame )
def bin_variable( dataframe: pd.DataFrame ,
@@ -154,4 +155,84 @@ def meld( specimen_dataframe: pd.DataFrame ,
return pd.concat( [ specimen_stacked ,
length_dataframe ] ,
join = 'inner' )


@patch_method_to_DataFrame( pd.DataFrame )
def stretch( dataframe ,
variable ,
variable_contrast = 'sex' ,
index_variables = [ 'transect_num' , 'latitude' , 'longitude' , 'stratum_num' ] ,
sep = "_" ,
suffix = "\\w+" ):
"""
Melts dataframe into a parseable format

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
variable: str
Data variable name
variable_contrast: str
The name of the column that will be used to index the data variable
index_variables: str or List
A list or string of additional indexing/metadata variables that will be joined to the
data-of-interest
sep: str
A character indicating the separation of the variable names in the wide format, to be stripped
from the names in the long format
suffix: str
A regular expression capturing the wanted suffixes
"""
### Ensure variables are a list in case input is just a str
idx_lst = [ index_variables ] if isinstance( index_variables , str) else index_variables

### Prepare the dataframe for pivoting from wide to long
# Filter out the dataframe columns with the target index variables
dataframe_reduced = (
# Select the index variables
dataframe.filter( items = idx_lst )
# Join with the target variable
.join( dataframe.filter( regex = variable ) )
)

### Pivot from wide to long
return (
pd.wide_to_long( df = dataframe_reduced ,
stubnames = variable ,
i = idx_lst ,
j = variable_contrast ,
sep = sep ,
suffix = suffix )
.reset_index( )
)
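
A hypothetical call, assuming that importing the operations module applies the patch_method_to_DataFrame decorator; the 'biomass_*' columns and coordinate values are illustrative only.

import pandas as pd
import EchoPro.computation.operations  # assumed to apply the DataFrame monkey patches on import

wide = pd.DataFrame( { 'transect_num': [ 1 , 2 ] ,
                       'latitude': [ 47.1 , 47.3 ] ,
                       'longitude': [ -124.5 , -124.6 ] ,
                       'stratum_num': [ 1 , 1 ] ,
                       'biomass_male': [ 10.0 , 12.0 ] ,
                       'biomass_female': [ 11.0 , 13.0 ] } )

# 'biomass_male'/'biomass_female' collapse into a single 'biomass' column indexed by 'sex'
long = wide.stretch( variable = 'biomass' , variable_contrast = 'sex' )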

@patch_method_to_DataFrame( pd.DataFrame )
def group_merge( dataframe ,
dataframes_to_add ,
on ,
how = 'outer' ,
drop_na = True ):
"""
Merges a list of dataframes into a single dataframe

Parameters
----------
dataframe: pd.DataFrame
DataFrame that the other dataframes are merged into
dataframes_to_add: pd.DataFrame or List
Dataframe(s) that will be merged into `dataframe`
on: str or List
Column name(s) used for joining
how: str
Merge method passed to `pd.merge` (default: 'outer')
drop_na: bool
If True, rows containing NaN values are dropped before the final merge
"""

### Ensure that both the 'dataframes' and 'on' arguments are lists
# dataframes
df_lst = [ dataframes_to_add ] if isinstance( dataframes_to_add , pd.DataFrame ) else dataframes_to_add

# on
on_lst = [ on ] if isinstance( on , str) else on

### Merge the dataframes that will be joined to the original dataframe
frames_to_add = reduce( lambda left, right: pd.merge( left , right , on = on_lst , how = how ) , df_lst )

### Find union of column names that will be used to join everything
union_lst = dataframe.filter( items = frames_to_add.columns ).columns.tolist()

### Merge together and remove NaN values depending on argument 'drop_na'
if drop_na:
merged_frame = dataframe.dropna().merge( frames_to_add.dropna() ,
on = union_lst , how = 'outer' )
else:
merged_frame = dataframe.merge( frames_to_add,
on = union_lst , how = 'outer' )

### Return the merged dataframe
return merged_frame
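
A hypothetical usage sketch: the stratum summary frames and their column names ('sigma_bs_mean', 'average_weight') are invented for illustration, and the patched DataFrame method is again assumed to be available after importing the operations module.

import pandas as pd
import EchoPro.computation.operations  # assumed to apply the DataFrame monkey patches on import

transects = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                            'transect_num': [ 1 , 2 ] ,
                            'NASC_no_age1': [ 100.0 , 250.0 ] } )
sigma_bs_strata = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                  'sigma_bs_mean': [ 1e-5 , 2e-5 ] } )
weight_strata = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                'average_weight': [ 0.5 , 0.6 ] } )

# The two stratum summaries are first merged together, then joined onto 'transects' by 'stratum_num'
merged = transects.group_merge( dataframes_to_add = [ sigma_bs_strata , weight_strata ] ,
                                on = 'stratum_num' )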
99 changes: 99 additions & 0 deletions EchoPro/computation/spatial.py
@@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
import geopy.distance

def correct_transect_intervals( dataframe: pd.DataFrame ,
threshold: np.float64 = 0.05 ):
"""
Calculate along-transect intervals and impute erroneous values

Parameters
----------
dataframe: pd.DataFrame
Georeferenced NASC dataframe containing 'vessel_log_start', 'vessel_log_end', and 'transect_spacing' columns
threshold: np.float64
Maximum absolute deviation of an interval length from the median interval before it is replaced (default: 0.05)

Notes
-----
This function calculates the along-track transect interval length and areas.
It then 'searches' for possible erroneous values at the end of each line
and replaces/imputes with alternative lengths/areas.
"""
return (
dataframe
# Calculate along-transect interval distances
.pipe( lambda df: df.assign( interval = df[ 'vessel_log_start' ].diff( periods = -1 ).abs( ) )
.replace( np.nan , df[ 'vessel_log_end' ].iloc[ -1 ] - df[ 'vessel_log_start' ].iloc[ -1 ] ) )
# Replace likely erroneous interval lengths associated with the edges of each transect
.pipe( lambda df: df.assign( median_interval = np.median( df[ 'interval' ] ) )
.assign( interval = lambda x: np.where( np.abs( x[ 'interval' ] - x[ 'median_interval' ] ) > threshold ,
x.vessel_log_end - x.vessel_log_start ,
x.interval ) ) )
# Calculate interval area
.pipe( lambda df: df.assign( interval_area = df[ 'interval' ] * df[ 'transect_spacing' ] ) )
# Keep dataframe tidy by only retaining the necessary columns/variables
.loc[ : , [ 'latitude' , 'longitude' , 'transect_num' , 'stratum_num' , 'haul_num' ,
'interval' , 'interval_area' , 'NASC_all_ages' , 'NASC_no_age1' ] ]
)
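
A toy example of the interval correction (all values hypothetical): successive 'vessel_log_start' differences become interval lengths, the final row falls back to its own start/end difference, and intervals deviating from the median by more than 'threshold' are imputed.

import pandas as pd
from EchoPro.computation.spatial import correct_transect_intervals

nasc_df = pd.DataFrame( { 'transect_num': [ 1 , 1 , 1 ] ,
                          'stratum_num': [ 1 , 1 , 1 ] ,
                          'haul_num': [ 10 , 10 , 10 ] ,
                          'latitude': [ 47.00 , 47.01 , 47.02 ] ,
                          'longitude': [ -124.5 , -124.5 , -124.5 ] ,
                          'vessel_log_start': [ 0.0 , 0.5 , 1.0 ] ,
                          'vessel_log_end': [ 0.5 , 1.0 , 1.5 ] ,
                          'transect_spacing': [ 10.0 , 10.0 , 10.0 ] ,
                          'NASC_all_ages': [ 120.0 , 80.0 , 60.0 ] ,
                          'NASC_no_age1': [ 100.0 , 70.0 , 50.0 ] } )

# Each 'interval' is ~0.5 nmi, so each 'interval_area' is ~5.0 nmi^2
intervals = correct_transect_intervals( nasc_df )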


def calculate_start_end_coordinates( group ,
contrast ):
"""
Calculates the longitudinal extent and mean latitude for each group

Parameters
----------
group: pd.DataFrame
DataFrame containing latitude and longitude coordinates
contrast: List
Target contrast for grouping

Notes
-----
This function calculates the minimum/maximum longitude and the mean ('center') latitude
for each grouped value (e.g. transect)
"""

return (
group
.groupby( contrast )
.apply( lambda x: pd.Series( { 'minimum_longitude': x['longitude'].min() ,
'maximum_longitude': x['longitude'].max() ,
'center_latitude': x[ 'latitude' ].mean() } ) )
.reset_index( )
)
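
A brief illustration with hypothetical coordinates, producing one summary row per transect.

import pandas as pd
from EchoPro.computation.spatial import calculate_start_end_coordinates

positions = pd.DataFrame( { 'transect_num': [ 1 , 1 , 2 , 2 ] ,
                            'latitude': [ 47.0 , 47.1 , 47.5 , 47.6 ] ,
                            'longitude': [ -124.9 , -124.2 , -125.0 , -124.1 ] } )

# Returns 'minimum_longitude', 'maximum_longitude', and 'center_latitude' per transect
bounds = calculate_start_end_coordinates( positions , [ 'transect_num' ] )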

def calculate_transect_distance( dataframe ,
contrast = 'transect_num' ):
"""
Calculates spatial features of each transect

Parameters
----------
dataframe: pd.DataFrame
DataFrame containing latitude, longitude, and 'transect_spacing' columns
contrast: str or List
Target contrast for grouping

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each transect and stratum, the average spacing between transects, approximate areas
relative to each transect, and the distance for each transect
"""

### Calculate mean transect spacing
transect_spacing = dataframe.groupby( contrast )[ 'transect_spacing' ].mean().reset_index()

### Calculate minimum/maximum longitude, mean latitude, spacing, and area
return (
dataframe
.pipe( lambda df: calculate_start_end_coordinates( df , [ contrast ] ) )
.assign(transect_distance=lambda x: x.apply( lambda row: geopy.distance.distance(
( row[ 'center_latitude' ] , row[ 'minimum_longitude' ] ) ,
( row[ 'center_latitude' ] , row[ 'maximum_longitude' ] ) ).nm ,
axis=1 ) )
.merge( transect_spacing , on = [ contrast ] )
.assign( transect_area = lambda x: x.transect_distance * x.transect_spacing )
)
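
A hypothetical end-to-end call with toy coordinates: the transect distance is the east-west great-circle distance at the transect's mean latitude, in nautical miles, and the area is that distance multiplied by the mean spacing.

import pandas as pd
from EchoPro.computation.spatial import calculate_transect_distance

survey = pd.DataFrame( { 'transect_num': [ 1 , 1 , 2 , 2 ] ,
                         'latitude': [ 47.0 , 47.0 , 47.2 , 47.2 ] ,
                         'longitude': [ -125.0 , -124.0 , -125.0 , -124.0 ] ,
                         'transect_spacing': [ 10.0 , 10.0 , 10.0 , 10.0 ] } )

# Adds 'transect_distance' (nmi) and 'transect_area' alongside the coordinate summaries
transect_summary = calculate_transect_distance( survey )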