83 changes: 82 additions & 1 deletion EchoPro/computation/operations.py
@@ -3,6 +3,7 @@
from ..utils.monkey_patch_dataframe import patch_method_to_DataFrame
from typing import Union , List
from typing import Callable
from functools import reduce

@patch_method_to_DataFrame( pd.DataFrame )
def bin_variable( dataframe: pd.DataFrame ,
@@ -154,4 +155,84 @@ def meld( specimen_dataframe: pd.DataFrame ,
return pd.concat( [ specimen_stacked ,
length_dataframe ] ,
join = 'inner' )


@patch_method_to_DataFrame( pd.DataFrame )
def stretch( dataframe ,
variable ,
variable_contrast = 'sex' ,
index_variables = [ 'transect_num' , 'latitude' , 'longitude' , 'stratum_num' ] ,
sep = "_" ,
suffix = "\\w+" ):
"""
Melts the dataframe from wide format into a parseable long format

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
variable: str
Data variable name
variable_contrast: str
The name of the column that will be used to index the data variable
index_variables: str or List
A list or string of additional indexing/metadata variables that will be joined to the
data-of-interest
sep: str
A character indicating the separation of the variable names in the wide format, to be stripped
from the names in the long format
suffix: str
A regular expression capturing the wanted suffixes
"""
### Ensure the index variables are a list in case the input is a single str
idx_lst = [ index_variables ] if isinstance( index_variables , str) else index_variables

### Prepare the dataframe for pivoting from wide to long
# Filter out the dataframe columns with the target index variables
dataframe_reduced = (
# Select the index variables
dataframe.filter( items = idx_lst )
# Join with the target variable
.join( dataframe.filter( regex = variable ) )
)

### Pivot from wide to long
return (
pd.wide_to_long( df = dataframe_reduced ,
stubnames = variable ,
i = idx_lst ,
j = variable_contrast ,
sep = sep ,
suffix = suffix )
.reset_index( )
)
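
A hedged usage sketch (frame contents and column names are illustrative, not from the PR): a wide frame with 'biomass_male' / 'biomass_female' columns pivots into a long frame with a 'sex' contrast column and a single 'biomass' value column, assuming the patch decorator has registered stretch on pd.DataFrame when the module is imported.

import pandas as pd

wide = pd.DataFrame( {
    'transect_num': [ 1 , 2 ] ,
    'latitude': [ 48.5 , 48.6 ] ,
    'longitude': [ -125.0 , -125.1 ] ,
    'stratum_num': [ 1 , 1 ] ,
    'biomass_male': [ 10.0 , 12.0 ] ,
    'biomass_female': [ 11.0 , 13.0 ] ,
} )

long_df = wide.stretch( variable = 'biomass' )
# long_df columns: transect_num, latitude, longitude, stratum_num, sex, biomass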

@patch_method_to_DataFrame( pd.DataFrame )
def group_merge( dataframe ,
dataframes_to_add ,
on ,
how = 'outer' ,
drop_na = True ):

"""
Merges multiple dataframes into a single dataframe

Parameters
----------
dataframe: pd.DataFrame
A DataFrame object containing pertinent biological data
dataframes_to_add: pd.DataFrame or List
A DataFrame or list of DataFrames that will be merged with the original dataframe
on: str or List
Column name(s) that the dataframes are joined on
how: str
Merge method (defaults to 'outer')
drop_na: bool
Flag determining whether rows containing NaN values are dropped before the final merge
"""
### Ensure that both the 'dataframes_to_add' and 'on' arguments are lists
# dataframes_to_add: wrap a single DataFrame in a list
df_lst = [ dataframes_to_add ] if isinstance( dataframes_to_add , pd.DataFrame ) else dataframes_to_add

# on: wrap a single column name in a list
on_lst = [ on ] if isinstance( on , str ) else on

### Merge the dataframes that will be joined to the original dataframe
frames_to_add = reduce( lambda left, right: pd.merge( left , right , on = on_lst , how = how ) , df_lst )

### Find union of column names that will be used to join everything
union_lst = dataframe.filter( items = frames_to_add.columns ).columns.tolist()

### Merge together and remove NaN values depending on argument 'drop_na'
if drop_na:
merged_frame = dataframe.dropna().merge( frames_to_add.dropna() ,
on = union_lst , how = 'outer' )
else:
merged_frame = dataframe.merge( frames_to_add,
on = union_lst , how = 'outer' )

### Return the merged dataframe
return merged_frame
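
A minimal sketch of group_merge (frames are illustrative): two per-stratum summary frames are first merged with each other on 'stratum_num', then joined to the base frame on the shared key.

import pandas as pd

base_df = pd.DataFrame( { 'stratum_num': [ 1 , 1 , 2 ] , 'NASC': [ 200.0 , 250.0 , 300.0 ] } )
summary_a = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] , 'mean_length': [ 24.1 , 25.3 ] } )
summary_b = pd.DataFrame( { 'stratum_num': [ 1 , 2 ] , 'mean_weight': [ 0.8 , 0.9 ] } )

merged = base_df.group_merge( [ summary_a , summary_b ] , on = 'stratum_num' )
# merged columns: stratum_num, NASC, mean_length, mean_weight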
61 changes: 61 additions & 0 deletions EchoPro/computation/spatial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
import geopy.distance

def calculate_bounds( group ,
contrast ):
"""
Calculates latitude/longitude boundary box

Parameters
----------
group: pd.DataFrame
DataFrame to be grouped
contrast: List
Target contrast for grouping

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each grouped value (e.g. transect)
"""

return (
group
.groupby( contrast )
.apply( lambda x: pd.Series( { 'minimum_longitude': x['longitude'].min() ,
'maximum_longitude': x['longitude'].max() ,
'center_latitude': x[ 'latitude' ].mean() } ) )
.reset_index( )
)
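
A hedged sketch of calculate_bounds (coordinates are illustrative): each transect collapses to its east-west longitude extremes and mean latitude.

import pandas as pd

pts = pd.DataFrame( {
    'transect_num': [ 1 , 1 , 2 , 2 ] ,
    'latitude': [ 48.0 , 48.0 , 48.1 , 48.1 ] ,
    'longitude': [ -125.0 , -124.5 , -125.0 , -124.4 ] ,
} )

bounds = calculate_bounds( pts , [ 'transect_num' ] )
# bounds columns: transect_num, minimum_longitude, maximum_longitude, center_latitude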

def calculate_transect_distance( dataframe ,
contrast = 'transect_num' ):
"""
Calculates spatial features of each transect

Parameters
----------
dataframe: pd.DataFrame
DataFrame containing georeferenced transect data
contrast: str
Target contrast for grouping (defaults to 'transect_num')

Notes
-----
This function calculates the bounding rectangle surrounding the latitude/longitude values
for each transect and stratum, the average spacing between transects, approximate areas
relative to each transect, and the distance for each transect
"""

transect_spacing_area = dataframe.groupby( contrast )[ 'transect_spacing' ].mean().reset_index()

return (
dataframe
.pipe( lambda df: calculate_bounds( df , [ contrast ] ) )
.assign(transect_distance=lambda x: x.apply( lambda row: geopy.distance.distance(
( row[ 'center_latitude' ] , row[ 'minimum_longitude' ] ) ,
( row[ 'center_latitude' ] , row[ 'maximum_longitude' ] ) ).nm ,
axis=1 ) )
.merge( transect_spacing_area , on = [ contrast ] )
.assign( transect_area = lambda x: x.transect_distance * x.transect_spacing )
)
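
Building on the same kind of input (values illustrative, and the import path is assumed from the PR's file layout), calculate_transect_distance converts each transect's east-west extent at its center latitude into nautical miles via geopy and multiplies by the mean transect spacing to approximate a per-transect area.

import pandas as pd
from EchoPro.computation.spatial import calculate_transect_distance  # assumed import path

nasc_df = pd.DataFrame( {
    'transect_num': [ 1 , 1 , 2 , 2 ] ,
    'latitude': [ 48.0 , 48.0 , 48.1 , 48.1 ] ,
    'longitude': [ -125.0 , -124.5 , -125.0 , -124.4 ] ,
    'transect_spacing': [ 10.0 , 10.0 , 10.0 , 10.0 ] ,
} )

spatial_df = calculate_transect_distance( nasc_df )
# spatial_df columns: transect_num, minimum_longitude, maximum_longitude,
# center_latitude, transect_distance (nmi), transect_spacing, transect_area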
128 changes: 128 additions & 0 deletions EchoPro/computation/statistics.py
@@ -0,0 +1,128 @@
import numpy as np
import scipy.stats as st

def stratified_transect_statistic( transects ,
strata ,
sample_fraction ,
replicates ):

"""
Calculates stratified mean statistics for a set of transects

Parameters
----------
transects: pd.DataFrame
DataFrame comprising a variety of spatial metrics for transect data
strata: pd.DataFrame
DataFrame comprising summary features of latitude-delimited (INPFC) strata
sample_fraction: np.float64
Value representing the proportion of transects that are resampled from the
overall dataset within each stratum
replicates: int
The number of iterations/realizations used for bootstrapping

Notes
-----
This function calculates the stratified summary statistics for biomass within
`EchoPro.survey.stratified_summary()`.
"""

### Convert specific DataFrame columns to arrays for speed
distance = transects[ 'transect_distance' ].values
value = transects[ 'B_adult' ].values
num_transects = strata[ 'num_transects' ].values
total_transect_area = strata.set_index( 'stratum_inpfc' )[ 'total_transect_area' ]

### Calculate the number of transects within each stratum based on the
### sampling fraction defined in the configuration file
# Number of transects
num_transects_to_sample = np.round( sample_fraction * num_transects ).astype( int )

# Offset term used for later variance calculation
sample_offset = np.where( num_transects_to_sample == 1 , 0 , 1 )

# Calculate effective sample size/degrees of freedom for variance calculation
sample_dof = num_transects_to_sample * ( num_transects_to_sample - sample_offset )

### Pre-allocate result arrays and pre-compute the cumulative sum of transect counts per stratum
# Transect indices
cum_num_transects = np.concatenate( ( [ 0 ] , np.cumsum( num_transects ) ) )

# Stratified statistics
mean_arr = np.empty( replicates )
variance_arr = np.empty( replicates )

### Iterate across all replicate iterations/realizations
for i in range( replicates ):

# Pre-allocate the stratum-specific means and variances
rho_j = np.empty_like( total_transect_area ) # mean
var_j = np.empty_like( total_transect_area ) # variance

# Iterate across all strata
for j in strata.stratum_inpfc - 1:

### Resample (without replacement) based on binned indices
# Define start and end transects within each stratum
start , end = cum_num_transects[ j ] , cum_num_transects[ j + 1 ]

# Resample without replacement
sel_inds = np.random.choice( np.arange( start , end ) ,
num_transects_to_sample[ j ] ,
replace=False )

### Define stratified weights
stratified_weights = distance[ sel_inds ] / np.mean( distance[ sel_inds ] )

### Weighted value (e.g. biomass)
value_distance_density = value[ sel_inds ] / distance[ sel_inds ]

### Compute mean and variance
rho_j[ j ] = np.nansum( value[ sel_inds ] * stratified_weights ) / np.nansum( stratified_weights )
var_j[ j ] = np.nansum( ( stratified_weights ** 2 * ( value_distance_density - rho_j[ j ] ) ** 2 ) ) / sample_dof[ j ]

### Calculate the overall weighted means and variances for later calculations
# Mean
mean_arr[ i ] = np.nansum( strata.total_transect_area * rho_j )

# Variance (stored as its square root, i.e. the standard deviation of the stratified total)
variance_arr[ i ] = np.sqrt( np.nansum( var_j * strata.total_transect_area ** 2 ) )

### Calculate the summary statistics
stratified_results = {
'biomass': {
'mean': {
'estimate': np.mean( mean_arr ) ,
'confidence_interval': confidence_interval( mean_arr ) ,
} ,
'variance': {
'estimate': np.mean( variance_arr ) ,
'confidence_interval': confidence_interval( variance_arr ) ,
} ,
'CV': {
'estimate': np.mean( variance_arr / mean_arr ) ,
'confidence_interval': confidence_interval( variance_arr / mean_arr ) ,
}
}
}

### Return the summary statistics
return stratified_results
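
A minimal invocation sketch (all values illustrative). Note the function implicitly assumes the rows of transects are ordered by stratum, so that the cumulative counts in strata[ 'num_transects' ] index contiguous row blocks.

import numpy as np
import pandas as pd

transects = pd.DataFrame( {
    'transect_distance': np.repeat( 10.0 , 6 ) ,
    'B_adult': [ 5.0 , 6.0 , 7.0 , 8.0 , 9.0 , 10.0 ] ,
} )
strata = pd.DataFrame( {
    'stratum_inpfc': [ 1 , 2 ] ,
    'num_transects': [ 3 , 3 ] ,
    'total_transect_area': [ 100.0 , 150.0 ] ,
} )

results = stratified_transect_statistic( transects , strata , sample_fraction = 0.75 , replicates = 100 )
# results[ 'biomass' ][ 'mean' ][ 'estimate' ] -> bootstrapped mean estimate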

def confidence_interval( values ):
"""
Calculates the 95% confidence interval (Normal) for a given array

Parameters
----------
values: np.array
An array of values

Notes
-----
This function calculates the 95% confidence interval (assuming a Normal distribution)
for the bootstrapped stratified samples. This is done as opposed to using the percentile
method for estimating the intervals.
"""
return np.mean( values ) + np.array( [ -1 , 1 ] ) * st.norm.ppf( 0.975 ) * np.std( values )
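
A quick sanity-check sketch: for draws from N(5, 2) the returned pair should bracket the sample mean at roughly mean ± 1.96 × std.

import numpy as np

rng = np.random.default_rng( 42 )
draws = rng.normal( loc = 5.0 , scale = 2.0 , size = 1000 )
lower , upper = confidence_interval( draws )
# lower ≈ 5.0 - 1.96 * 2.0 , upper ≈ 5.0 + 1.96 * 2.0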
9 changes: 8 additions & 1 deletion EchoPro/core.py
@@ -56,6 +56,12 @@
'stratum_num': int ,
'northlimit_latitude': np.float64 ,
} ,
'inpfc_strata': {
'stratum_num': int ,
'northlimit_latitude': np.float64 ,
'haul_start': int ,
'haul_end': int ,
} ,
} ,
'NASC': {
# ACOUSTIC DATASET -- NASC EXCLUDING AGE-1
@@ -185,11 +191,12 @@
} ,
'stratification': {
'name': 'spatial' ,
'data': ['strata' , 'geo_strata'] ,
'data': [ 'strata' , 'geo_strata' , 'inpfc_strata' ] ,
'superlayer': [] ,
'data_tree': {
'strata_df': pd.DataFrame() ,
'geo_strata_df': pd.DataFrame() ,
'inpfc_strata_df': pd.DataFrame() ,
} ,
} ,
'NASC': {