93 changes: 93 additions & 0 deletions EchoPro/computation/biology.py
@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from ..computation.spatial import correct_transect_intervals
from ..computation.operations import group_merge

def index_sex_weight_proportions( biology_dict: dict ):
"""
Generate dataframe containing sex-stratified weight proportions

Parameters
----------
biology_dict: dict
Biology data attribute dictionary
"""

# Age-stratified weight proportions
age_stratified_proportions = biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_included' ][ 'weight_proportions' ]

# Age-stratified & sex-indexed weight proportions
age_sex_stratified_proportions = biology_dict[ 'weight' ][ 'sex_stratified' ][ 'weight_proportions' ]

# Concatenate the two to add a 'total' category
return (
pd.concat(
[ ( age_sex_stratified_proportions
.rename( columns = { 'weight_sex_stratum_proportion': 'weight_proportion' } ) ) ,
( age_stratified_proportions.assign( sex = 'total' )
.rename( columns = { 'weight_stratum_proportion': 'weight_proportion' } ) ) ]
)
)
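
An illustrative usage sketch: the nested dictionary keys follow the structure accessed above, while the strata and proportion values below are hypothetical toy data, and the import path is assumed from the file path in this diff.

import pandas as pd
from EchoPro.computation.biology import index_sex_weight_proportions

biology_dict = {
    'weight': {
        'age_stratified': {
            'age_1_included': {
                'weight_proportions': pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                                      'weight_stratum_proportion': [ 0.4 , 0.6 ] } )
            }
        } ,
        'sex_stratified': {
            'weight_proportions': pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 , 2 ] ,
                                                  'sex': [ 'male' , 'female' , 'male' , 'female' ] ,
                                                  'weight_sex_stratum_proportion': [ 0.2 , 0.2 , 0.3 , 0.3 ] } )
        }
    }
}

# Sexed rows keep their 'sex' label; the age-stratified rows are appended under sex = 'total',
# with both proportion columns renamed to 'weight_proportion'
proportions = index_sex_weight_proportions( biology_dict )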

def index_transect_age_sex_proportions( acoustics_dict: dict ,
biology_dict: dict ,
info_strata: pd.DataFrame ):
"""
Prepares the age- and sex-stratified dataframe for biomass calculation

Parameters
----------
acoustics_dict: dict
Acoustic data attribute dictionary
biology_dict: dict
Biology data attribute dictionary
info_strata: pd.DataFrame
Dataframe containing strata definitions
"""

### Prepare initial dataframes used for calculating population statistics
# Construct georeferenced dataframe containing NASC data
nasc_interval_df = correct_transect_intervals( acoustics_dict[ 'nasc' ][ 'nasc_df' ] )

### Call additional dataframes needed to merge with the NASC data and subsequently calculate
### population-level metrics (and later statistics)
# Sex-stratum-indexed proportions and average weight
weight_sex_strata = biology_dict[ 'weight' ][ 'weight_strata_df' ]

# Stratum-averaged sigma_bs
sigma_bs_strata = acoustics_dict[ 'sigma_bs' ][ 'strata_mean' ]

# Adult NASC proportions for each stratum (number)
# !!! TODO: Currently only uses 'age_1_excluded' -- this should become an argument that
## toggles between excluding and including age-1 fish.
## This is not a major issue since both 'NASC_*' columns can be pivoted to create a single
## NASC column, so the column name does not have to be hard-coded. This could then correspond
## to the configuration settings in some way, or this may be where the argument comes into
## play, where the dataframe can simply be filtered based on the input/selection.
nasc_adult_number_proportions = (
biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_excluded' ][ 'number_proportions' ]
.rename( columns = { 'number_proportion': 'adult_number_proportion' } )
)

# Adult NASC proportions for each stratum (weight)
nasc_adult_weight_proportions = (
biology_dict[ 'weight' ][ 'age_stratified' ][ 'age_1_excluded' ][ 'weight_proportions' ]
.rename( columns = { 'weight_proportion': 'adult_weight_proportion' } )
)

### Consolidate dataframes that will be added into a list
dataframes_to_add = [ nasc_interval_df , sigma_bs_strata , weight_sex_strata , nasc_adult_number_proportions ,
nasc_adult_weight_proportions ]

## Merge the relevant dataframes
return (
nasc_interval_df
# Merge stratum information ( how = 'outer' since missing values will be filled later on)
.merge( info_strata , on = [ 'stratum_num' , 'haul_num' ] , how = 'outer' )
# Drop unused hauls
.dropna( subset = 'transect_num' )
# Fill NaN w/ 0's for 'fraction_hake'
.assign( fraction_hake = lambda x: x[ 'fraction_hake' ].fillna( 0 ) )
# Group merge
.group_merge( dataframes_to_add = dataframes_to_add , on = 'stratum_num' )
)
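
The merge/imputation pattern at the end of this function can be illustrated in isolation; the toy frames and values below are hypothetical, and only the plain-pandas portion of the chain is reproduced.

import numpy as np
import pandas as pd

nasc_interval_df = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] ,
                                   'haul_num': [ 10 , 11 , 12 ] ,
                                   'transect_num': [ 1.0 , 1.0 , np.nan ] ,
                                   'fraction_hake': [ 0.8 , np.nan , 0.5 ] } )
info_strata = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] ,
                              'haul_num': [ 10 , 11 , 12 ] } )

prepared = (
    nasc_interval_df
    # Outer merge keeps hauls that only appear in the strata definitions
    .merge( info_strata , on = [ 'stratum_num' , 'haul_num' ] , how = 'outer' )
    # Hauls without an associated transect are dropped
    .dropna( subset = 'transect_num' )
    # Missing 'fraction_hake' values default to 0
    .assign( fraction_hake = lambda x: x[ 'fraction_hake' ].fillna( 0 ) )
)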
83 changes: 82 additions & 1 deletion EchoPro/computation/operations.py
@@ -3,6 +3,7 @@
from ..utils.monkey_patch_dataframe import patch_method_to_DataFrame
from typing import Union , List
from typing import Callable
from functools import reduce

@patch_method_to_DataFrame( pd.DataFrame )
def bin_variable( dataframe: pd.DataFrame ,
@@ -154,4 +155,84 @@ def meld( specimen_dataframe: pd.DataFrame ,
return pd.concat( [ specimen_stacked ,
length_dataframe ] ,
join = 'inner' )


@patch_method_to_DataFrame( pd.DataFrame )
def stretch( dataframe ,
variable ,
variable_contrast = 'sex' ,
index_variables = [ 'transect_num' , 'latitude' , 'longitude' , 'stratum_num' ] ,
sep = "_" ,
suffix = "\\w+" ):
"""
Melts dataframe into a parseable format

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
variable: str
Data variable name
variable_contrast: str
The name of the column that will be used to index the data variable
index_variables: str or List
A list or string of additional indexing/metadata variables that will be joined to the
data-of-interest
sep: str
A character indicating the separation of the variable names in the wide format, to be stripped
from the names in the long format
suffix: str
A regular expression capturing the wanted suffixes
"""
### Ensure variables are a list in case input is just a str
idx_lst = [ index_variables ] if isinstance( index_variables , str) else index_variables

### Prepare the dataframe for pivoting from wide to long
# Filter out the dataframe columns with the target index variables
dataframe_reduced = (
# Select the index variables
dataframe.filter( items = idx_lst )
# Join with the target variable
.join( dataframe.filter( regex = variable ) )
)

### Pivot from wide to long
return (
pd.wide_to_long( df = dataframe_reduced ,
stubnames = variable ,
i = idx_lst ,
j = variable_contrast ,
sep = sep ,
suffix = suffix )
.reset_index( )
)
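
A hypothetical call, assuming that importing the operations module applies the patch_method_to_DataFrame decorator; the 'biomass_*' columns and coordinate values are illustrative only.

import pandas as pd
import EchoPro.computation.operations  # assumed to apply the DataFrame monkey patches on import

wide = pd.DataFrame( { 'transect_num': [ 1 , 2 ] ,
                       'latitude': [ 47.1 , 47.3 ] ,
                       'longitude': [ -124.5 , -124.6 ] ,
                       'stratum_num': [ 1 , 1 ] ,
                       'biomass_male': [ 10.0 , 12.0 ] ,
                       'biomass_female': [ 11.0 , 13.0 ] } )

# 'biomass_male'/'biomass_female' collapse into a single 'biomass' column indexed by 'sex'
long = wide.stretch( variable = 'biomass' , variable_contrast = 'sex' )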

@patch_method_to_DataFrame( pd.DataFrame )
def group_merge( dataframe ,
dataframes_to_add ,
on ,
how = 'outer' ,
drop_na = True ):
"""
Merges a list of dataframes into a single dataframe

Parameters
----------
dataframe: pd.DataFrame
DataFrame that the other dataframes are merged into
dataframes_to_add: pd.DataFrame or List
Dataframe(s) that will be merged into `dataframe`
on: str or List
Column name(s) used for joining
how: str
Merge method passed to `pd.merge` (default: 'outer')
drop_na: bool
If True, rows containing NaN values are dropped before the final merge
"""

### Ensure that both the 'dataframes' and 'on' arguments are lists
# dataframes
df_lst = [ dataframes_to_add ] if isinstance( dataframes_to_add , pd.DataFrame ) else dataframes_to_add

# on
on_lst = [ on ] if isinstance( on , str) else on

### Merge the dataframes that will be joined to the original dataframe
frames_to_add = reduce( lambda left, right: pd.merge( left , right , on = on_lst , how = how ) , df_lst )

### Find union of column names that will be used to join everything
union_lst = dataframe.filter( items = frames_to_add.columns ).columns.tolist()

### Merge together and remove NaN values depending on argument 'drop_na'
if drop_na:
merged_frame = dataframe.dropna().merge( frames_to_add.dropna() ,
on = union_lst , how = 'outer' )
else:
merged_frame = dataframe.merge( frames_to_add,
on = union_lst , how = 'outer' )

### Return the merged dataframe
return merged_frame
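
A hypothetical usage sketch: the stratum summary frames and their column names ('sigma_bs_mean', 'average_weight') are invented for illustration, and the patched DataFrame method is again assumed to be available after importing the operations module.

import pandas as pd
import EchoPro.computation.operations  # assumed to apply the DataFrame monkey patches on import

transects = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                            'transect_num': [ 1 , 2 ] ,
                            'NASC_no_age1': [ 100.0 , 250.0 ] } )
sigma_bs_strata = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                  'sigma_bs_mean': [ 1e-5 , 2e-5 ] } )
weight_strata = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] ,
                                'average_weight': [ 0.5 , 0.6 ] } )

# The two stratum summaries are first merged together, then joined onto 'transects' by 'stratum_num'
merged = transects.group_merge( dataframes_to_add = [ sigma_bs_strata , weight_strata ] ,
                                on = 'stratum_num' )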
99 changes: 99 additions & 0 deletions EchoPro/computation/spatial.py
@@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
import geopy.distance

def correct_transect_intervals( dataframe: pd.DataFrame ,
threshold: np.float64 = 0.05 ):
"""
Calculate along-transect intervals and impute erroneous values

Parameters
----------
dataframe: pd.DataFrame
Georeferenced NASC dataframe containing 'vessel_log_start', 'vessel_log_end', and 'transect_spacing' columns
threshold: np.float64
Maximum absolute deviation of an interval length from the median interval before it is replaced (default: 0.05)

Notes
-----
This function calculates the along-track transect interval length and areas.
It then 'searches' for possible erroneous values at the end of each line
and replaces/imputes with alternative lengths/areas.
"""
return (
dataframe
# Calculate along-transect interval distances
.pipe( lambda df: df.assign( interval = df[ 'vessel_log_start' ].diff( periods = -1 ).abs( ) )
.replace( np.nan , df[ 'vessel_log_end' ].iloc[ -1 ] - df[ 'vessel_log_start' ].iloc[ -1 ] ) )
# Replace likely erroneous interval lengths associated with the edges of each transect
.pipe( lambda df: df.assign( median_interval = np.median( df[ 'interval' ] ) )
.assign( interval = lambda x: np.where( np.abs( x[ 'interval' ] - x[ 'median_interval' ] ) > threshold ,
x.vessel_log_end - x.vessel_log_start ,
x.interval ) ) )
# Calculate interval area
.pipe( lambda df: df.assign( interval_area = df[ 'interval' ] * df[ 'transect_spacing' ] ) )
# Keep dataframe tidy by only retaining the necessary columns/variables
.loc[ : , [ 'latitude' , 'longitude' , 'transect_num' , 'stratum_num' , 'haul_num' ,
'interval' , 'interval_area' , 'NASC_all_ages' , 'NASC_no_age1' ] ]
)
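
A toy example of the interval correction (all values hypothetical): successive 'vessel_log_start' differences become interval lengths, the final row falls back to its own start/end difference, and intervals deviating from the median by more than 'threshold' are imputed.

import pandas as pd
from EchoPro.computation.spatial import correct_transect_intervals

nasc_df = pd.DataFrame( { 'transect_num': [ 1 , 1 , 1 ] ,
                          'stratum_num': [ 1 , 1 , 1 ] ,
                          'haul_num': [ 10 , 10 , 10 ] ,
                          'latitude': [ 47.00 , 47.01 , 47.02 ] ,
                          'longitude': [ -124.5 , -124.5 , -124.5 ] ,
                          'vessel_log_start': [ 0.0 , 0.5 , 1.0 ] ,
                          'vessel_log_end': [ 0.5 , 1.0 , 1.5 ] ,
                          'transect_spacing': [ 10.0 , 10.0 , 10.0 ] ,
                          'NASC_all_ages': [ 120.0 , 80.0 , 60.0 ] ,
                          'NASC_no_age1': [ 100.0 , 70.0 , 50.0 ] } )

# Each 'interval' is ~0.5 nmi, so each 'interval_area' is ~5.0 nmi^2
intervals = correct_transect_intervals( nasc_df )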


def calculate_start_end_coordinates( group ,
contrast ):
"""
Calculates the longitudinal extent and mean latitude for each group

Parameters
----------
group: pd.DataFrame
DataFrame containing latitude and longitude coordinates
contrast: List
Target contrast for grouping

Notes
-----
This function calculates the minimum/maximum longitude and the mean ('center') latitude
for each grouped value (e.g. transect)
"""

return (
group
.groupby( contrast )
.apply( lambda x: pd.Series( { 'minimum_longitude': x['longitude'].min() ,
'maximum_longitude': x['longitude'].max() ,
'center_latitude': x[ 'latitude' ].mean() } ) )
.reset_index( )
)
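
A brief illustration with hypothetical coordinates, producing one summary row per transect.

import pandas as pd
from EchoPro.computation.spatial import calculate_start_end_coordinates

positions = pd.DataFrame( { 'transect_num': [ 1 , 1 , 2 , 2 ] ,
                            'latitude': [ 47.0 , 47.1 , 47.5 , 47.6 ] ,
                            'longitude': [ -124.9 , -124.2 , -125.0 , -124.1 ] } )

# Returns 'minimum_longitude', 'maximum_longitude', and 'center_latitude' per transect
bounds = calculate_start_end_coordinates( positions , [ 'transect_num' ] )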

def calculate_transect_distance( dataframe ,
contrast = 'transect_num' ):
"""
Calculates spatial features of each transect

Parameters
----------
dataframe: pd.DataFrame
DataFrame containing latitude, longitude, and 'transect_spacing' columns
contrast: str or List
Target contrast for grouping

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each transect and stratum, the average spacing between transects, approximate areas
relative to each transect, and the distance for each transect
"""

### Calculate mean transect spacing
transect_spacing = dataframe.groupby( contrast )[ 'transect_spacing' ].mean().reset_index()

### Calculate minimum/maximum longitude, mean latitude, spacing, and area
return (
dataframe
.pipe( lambda df: calculate_start_end_coordinates( df , [ contrast ] ) )
.assign(transect_distance=lambda x: x.apply( lambda row: geopy.distance.distance(
( row[ 'center_latitude' ] , row[ 'minimum_longitude' ] ) ,
( row[ 'center_latitude' ] , row[ 'maximum_longitude' ] ) ).nm ,
axis=1 ) )
.merge( transect_spacing , on = [ contrast ] )
.assign( transect_area = lambda x: x.transect_distance * x.transect_spacing )
)
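
A hypothetical end-to-end call with toy coordinates: the transect distance is the east-west great-circle distance at the transect's mean latitude, in nautical miles, and the area is that distance multiplied by the mean spacing.

import pandas as pd
from EchoPro.computation.spatial import calculate_transect_distance

survey = pd.DataFrame( { 'transect_num': [ 1 , 1 , 2 , 2 ] ,
                         'latitude': [ 47.0 , 47.0 , 47.2 , 47.2 ] ,
                         'longitude': [ -125.0 , -124.0 , -125.0 , -124.0 ] ,
                         'transect_spacing': [ 10.0 , 10.0 , 10.0 , 10.0 ] } )

# Adds 'transect_distance' (nmi) and 'transect_area' alongside the coordinate summaries
transect_summary = calculate_transect_distance( survey )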