83 changes: 82 additions & 1 deletion EchoPro/computation/operations.py
@@ -3,6 +3,7 @@
from ..utils.monkey_patch_dataframe import patch_method_to_DataFrame
from typing import Union , List
from typing import Callable
from functools import reduce

@patch_method_to_DataFrame( pd.DataFrame )
def bin_variable( dataframe: pd.DataFrame ,
@@ -154,4 +155,84 @@ def meld( specimen_dataframe: pd.DataFrame ,
return pd.concat( [ specimen_stacked ,
length_dataframe ] ,
join = 'inner' )


@patch_method_to_DataFrame( pd.DataFrame )
def stretch( dataframe ,
variable ,
variable_contrast = 'sex' ,
index_variables = [ 'transect_num' , 'latitude' , 'longitude' , 'stratum_num' ] ,
sep = "_" ,
suffix = "\\w+" ):
"""
Melts the dataframe from wide format into a parseable long format

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
variable: str
Data variable name
variable_contrast: str
The name of the column that will be used to index the data variable
index_variables: str or List
A list or string of additional indexing/metadata variables that will be joined to the
data-of-interest
sep: str
A character indicating the separation of the variable names in the wide format, to be stripped
from the names in the long format
suffix: str
A regular expression capturing the wanted suffixes
"""
### Ensure the index variables are a list in case the input is a single str
idx_lst = [ index_variables ] if isinstance( index_variables , str) else index_variables

### Prepare the dataframe for pivoting from wide to long
# Filter out the dataframe columns with the target index variables
dataframe_reduced = (
# Select the index variables
dataframe.filter( items = idx_lst )
# Join with the target variable
.join( dataframe.filter( regex = variable ) )
)

### Pivot from wide to long
return (
pd.wide_to_long( df = dataframe_reduced ,
stubnames = variable ,
i = idx_lst ,
j = variable_contrast ,
sep = sep ,
suffix = suffix )
.reset_index( )
)
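
A hedged usage sketch (frame contents and column names are illustrative, not from the PR): a wide frame with 'biomass_male' / 'biomass_female' columns pivots into a long frame with a 'sex' contrast column and a single 'biomass' value column, assuming the patch decorator has registered stretch on pd.DataFrame when the module is imported.

import pandas as pd

wide = pd.DataFrame( {
    'transect_num': [ 1 , 2 ] ,
    'latitude': [ 48.5 , 48.6 ] ,
    'longitude': [ -125.0 , -125.1 ] ,
    'stratum_num': [ 1 , 1 ] ,
    'biomass_male': [ 10.0 , 12.0 ] ,
    'biomass_female': [ 11.0 , 13.0 ] ,
} )

long_df = wide.stretch( variable = 'biomass' )
# long_df columns: transect_num, latitude, longitude, stratum_num, sex, biomass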

@patch_method_to_DataFrame( pd.DataFrame )
def group_merge( dataframe ,
dataframes_to_add ,
on ,
how = 'outer' ,
drop_na = True ):

"""
Merges multiple dataframes into a single dataframe

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
dataframes_to_add: pd.DataFrame or List
A DataFrame or list of DataFrames that will be merged with the original dataframe
on: str or List
Column name(s) that the dataframes are joined on
how: str
Merge method (defaults to 'outer')
drop_na: bool
Flag determining whether rows containing NaN values are dropped before the final merge
"""
### Ensure that both the 'dataframes_to_add' and 'on' arguments are lists
# dataframes_to_add: wrap a single DataFrame in a list
df_lst = [ dataframes_to_add ] if isinstance( dataframes_to_add , pd.DataFrame ) else dataframes_to_add

# on: wrap a single column name in a list
on_lst = [ on ] if isinstance( on , str ) else on

### Merge the dataframes that will be joined to the original dataframe
frames_to_add = reduce( lambda left, right: pd.merge( left , right , on = on_lst , how = how ) , df_lst )

### Find union of column names that will be used to join everything
union_lst = dataframe.filter( items = frames_to_add.columns ).columns.tolist()

### Merge together and remove NaN values depending on argument 'drop_na'
if drop_na:
merged_frame = dataframe.dropna().merge( frames_to_add.dropna() ,
on = union_lst , how = 'outer' )
else:
merged_frame = dataframe.merge( frames_to_add,
on = union_lst , how = 'outer' )

### Return the merged dataframe
return merged_frame
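
A minimal sketch of group_merge (frames are illustrative): two per-stratum summary frames are first merged with each other on 'stratum_num', then joined to the base frame on the shared key.

import pandas as pd

base_df = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] , 'NASC': [ 200.0 , 250.0 , 300.0 ] } )
summary_a = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] , 'mean_length': [ 24.1 , 25.3 ] } )
summary_b = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] , 'mean_weight': [ 0.8 , 0.9 ] } )

merged = base_df.group_merge( [ summary_a , summary_b ] , on = 'stratum_num' )
# merged columns: stratum_num, NASC, mean_length, mean_weight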
61 changes: 61 additions & 0 deletions EchoPro/computation/spatial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
import geopy.distance

def calculate_bounds( group ,
contrast ):
"""
Calculates latitude/longitude boundary box

Parameters
----------
group: pd.DataFrame
DataFrame to be grouped
contrast: List
Target contrast for grouping

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each grouped value (e.g. transect)
"""

return (
group
.groupby( contrast )
.apply( lambda x: pd.Series( { 'minimum_longitude': x['longitude'].min() ,
'maximum_longitude': x['longitude'].max() ,
'center_latitude': x[ 'latitude' ].mean() } ) )
.reset_index( )
)
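
A hedged sketch of calculate_bounds (coordinates are illustrative): each transect collapses to its east-west longitude extremes and mean latitude.

import pandas as pd

pts = pd.DataFrame( {
    'transect_num': [ 1 , 1 , 2 , 2 ] ,
    'latitude': [ 48.0 , 48.0 , 48.1 , 48.1 ] ,
    'longitude': [ -125.0 , -124.5 , -125.0 , -124.4 ] ,
} )

bounds = calculate_bounds( pts , [ 'transect_num' ] )
# bounds columns: transect_num, minimum_longitude, maximum_longitude, center_latitude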

def calculate_transect_distance( dataframe ,
contrast = 'transect_num' ):
"""
Calculates spatial features of each transect

Parameters
----------
dataframe: pd.DataFrame
DataFrame containing georeferenced transect data
contrast: str
Target contrast for grouping (defaults to 'transect_num')

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each transect and stratum, the average spacing between transects, approximate areas
relative to each transect, and the distance for each transect
"""

transect_spacing_area = dataframe.groupby( contrast )[ 'transect_spacing' ].mean().reset_index()

return (
dataframe
.pipe( lambda df: calculate_bounds( df , [ contrast ] ) )
.assign(transect_distance=lambda x: x.apply( lambda row: geopy.distance.distance(
( row[ 'center_latitude' ] , row[ 'minimum_longitude' ] ) ,
( row[ 'center_latitude' ] , row[ 'maximum_longitude' ] ) ).nm ,
axis=1 ) )
.merge( transect_spacing_area , on = [ contrast ] )
.assign( transect_area = lambda x: x.transect_distance * x.transect_spacing )
)
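
Building on the same kind of input (values illustrative, and the import path is assumed from the PR's file layout), calculate_transect_distance converts each transect's east-west extent at its center latitude into nautical miles via geopy and multiplies by the mean transect spacing to approximate a per-transect area.

import pandas as pd
from EchoPro.computation.spatial import calculate_transect_distance  # assumed import path

nasc_df = pd.DataFrame( {
    'transect_num': [ 1 , 1 , 2 , 2 ] ,
    'latitude': [ 48.0 , 48.0 , 48.1 , 48.1 ] ,
    'longitude': [ -125.0 , -124.5 , -125.0 , -124.4 ] ,
    'transect_spacing': [ 10.0 , 10.0 , 10.0 , 10.0 ] ,
} )

spatial_df = calculate_transect_distance( nasc_df )
# spatial_df columns: transect_num, minimum_longitude, maximum_longitude,
# center_latitude, transect_distance (nmi), transect_spacing, transect_area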
128 changes: 128 additions & 0 deletions EchoPro/computation/statistics.py
@@ -0,0 +1,128 @@
import numpy as np
import scipy.stats as st

def stratified_transect_statistic( transects ,
strata ,
sample_fraction ,
replicates ):

"""
Calculates stratified mean statistics for a set of transects

Parameters
----------
transects: pd.DataFrame
DataFrame comprising a variety of spatial metrics for transect data
strata: pd.DataFrame
DataFrame comprising summary features of latitude-delimited (INPFC) strata
sample_fraction: np.float64
Value representing the proportion of transects that are resampled from the
overall dataset within each stratum
replicates: int
The number of iterations/realizations used for bootstrapping

Notes
-----
This function calculates the stratified summary statistics for biomass within
`EchoPro.survey.stratified_summary()`.
"""

### Convert specific DataFrame columns to arrays for speed
distance = transects[ 'transect_distance' ].values
value = transects[ 'B_adult' ].values
num_transects = strata[ 'num_transects' ].values
total_transect_area = strata.set_index( 'stratum_inpfc' )[ 'total_transect_area' ]

### Calculate the number of transects within each stratum based on the
### sampling fraction defined in the configuration file
# Number of transects
num_transects_to_sample = np.round( sample_fraction * num_transects ).astype( int )

# Offset term used for later variance calculation
sample_offset = np.where( num_transects_to_sample == 1 , 0 , 1 )

# Calculate effective sample size/degrees of freedom for variance calculation
sample_dof = num_transects_to_sample * ( num_transects_to_sample - sample_offset )

### Pre-allocate result arrays and pre-compute the cumulative sum of transect counts per stratum
# Transect indices
cum_num_transects = np.concatenate( ( [ 0 ] , np.cumsum( num_transects ) ) )

# Stratified statistics
mean_arr = np.empty( replicates )
variance_arr = np.empty( replicates )

### Iterate across all replicate iterations/realizations
for i in range( replicates ):

# Pre-allocate the stratum-specific means and variances
rho_j = np.empty_like( total_transect_area ) # mean
var_j = np.empty_like( total_transect_area ) # variance

# Iterate across all strata
for j in strata.stratum_inpfc - 1:

### Resample (without replacement) based on binned indices
# Define start and end transects within each stratum
start , end = cum_num_transects[ j ] , cum_num_transects[ j + 1 ]

# Resample without replacement
sel_inds = np.random.choice( np.arange( start , end ) ,
num_transects_to_sample[ j ] ,
replace=False )

### Define stratified weights
stratified_weights = distance[ sel_inds ] / np.mean( distance[ sel_inds ] )

### Weighted value (e.g. biomass)
value_distance_density = value[ sel_inds ] / distance[ sel_inds ]

### Compute mean and variance
rho_j[ j ] = np.nansum( value[ sel_inds ] * stratified_weights ) / np.nansum( stratified_weights )
var_j[ j ] = np.nansum( ( stratified_weights ** 2 * ( value_distance_density - rho_j[ j ] ) ** 2 ) ) / sample_dof[ j ]

### Calculate the overall weighted means and variances for later calculations
# Mean
mean_arr[ i ] = np.nansum( strata.total_transect_area * rho_j )

# Variance (stored as its square root, i.e. the standard deviation of the stratified total)
variance_arr[ i ] = np.sqrt( np.nansum( var_j * strata.total_transect_area ** 2 ) )

### Calculate the summary statistics
stratified_results = {
'biomass': {
'mean': {
'estimate': np.mean( mean_arr ) ,
'confidence_interval': confidence_interval( mean_arr ) ,
} ,
'variance': {
'estimate': np.mean( variance_arr ) ,
'confidence_interval': confidence_interval( variance_arr ) ,
} ,
'CV': {
'estimate': np.mean( variance_arr / mean_arr ) ,
'confidence_interval': confidence_interval( variance_arr / mean_arr ) ,
}
}
}

### Return the summary statistics
return stratified_results
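
A minimal invocation sketch (all values illustrative). Note the function implicitly assumes the rows of transects are ordered by stratum, so that the cumulative counts in strata[ 'num_transects' ] index contiguous row blocks.

import numpy as np
import pandas as pd

transects = pd.DataFrame( {
    'transect_distance': np.repeat( 10.0 , 6 ) ,
    'B_adult': [ 5.0 , 6.0 , 7.0 , 8.0 , 9.0 , 10.0 ] ,
} )
strata = pd.DataFrame( {
    'stratum_inpfc': [ 1 , 2 ] ,
    'num_transects': [ 3 , 3 ] ,
    'total_transect_area': [ 100.0 , 150.0 ] ,
} )

results = stratified_transect_statistic( transects , strata , sample_fraction = 0.75 , replicates = 100 )
# results[ 'biomass' ][ 'mean' ][ 'estimate' ] -> bootstrapped mean estimate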

def confidence_interval( values ):
"""
Calculates the 95% confidence interval (Normal) for a given array

Parameters
----------
values: np.array
An array of values

Notes
-----
This function calculates the 95% confidence interval (assuming a Normal distribution)
for the bootstrapped stratified samples. This is done as opposed to using the percentile
method for estimating the intervals.
"""
return np.mean( values ) + np.array( [ -1 , 1 ] ) * st.norm.ppf( 0.975 ) * np.std( values )
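
A quick sanity-check sketch: for draws from N(5, 2) the returned pair should bracket the sample mean at roughly mean ± 1.96 × std.

import numpy as np

rng = np.random.default_rng( 42 )
draws = rng.normal( loc = 5.0 , scale = 2.0 , size = 1000 )
lower , upper = confidence_interval( draws )
# lower ≈ 5.0 - 1.96 * 2.0 , upper ≈ 5.0 + 1.96 * 2.0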
9 changes: 8 additions & 1 deletion EchoPro/core.py
@@ -56,6 +56,12 @@
'stratum_num': int ,
'northlimit_latitude': np.float64 ,
} ,
'inpfc_strata': {
'stratum_num': int ,
'northlimit_latitude': np.float64 ,
'haul_start': int ,
'haul_end': int ,
} ,
} ,
'NASC': {
# ACOUSTIC DATASET -- NASC EXCLUDING AGE-1
@@ -185,11 +191,12 @@
} ,
'stratification': {
'name': 'spatial' ,
'data': ['strata' , 'geo_strata'] ,
'data': [ 'strata' , 'geo_strata' , 'inpfc_strata' ] ,
'superlayer': [] ,
'data_tree': {
'strata_df': pd.DataFrame() ,
'geo_strata_df': pd.DataFrame() ,
'inpfc_strata_df': pd.DataFrame() ,
} ,
} ,
'NASC': {