Source code for adler.utilities.science_utilities

import numpy as np
import pandas as pd
import subprocess
from astropy.stats import sigma_clip as astropy_sigma_clip



[docs]
def outlier_diff(new_res, diff_cut=1.0):
    """Test whether new data point(s) is an outlier compared to the model by considering the residual difference.

    Parameters
    -----------

    new_res: array
        The residuals of the new data points compared to the model
    diff_cut: float
        The threshold difference value for outlier detection.

    Returns
    -----------
    outlier_flag : array
       Array of flag indicating if data point is an outlier (True)

    """

    if not isinstance(new_res, np.ndarray):
        new_res = np.array(new_res, ndmin=1)

    outlier_flag = np.array([False] * len(new_res))
    outlier_flag[np.abs(new_res) >= diff_cut] = True

    return outlier_flag




[docs]
def outlier_std(new_res, data_res, std_cut=3.0):
    """Test whether new data point(s) is an outlier compared to the model by considering the standard deviation of the residuals.

    Parameters
    -----------

    new_res: array
        The residuals of the new data point(s) compared to the model
    data_res: array
        The residuals of the data compared to the model.
    std_cut: float
       The threshold standard deviation for outlier detection.

    Returns
    -----------
    outlier_flag : array
       Array of flag indicating if data point is an outlier (True)

    """

    # calculate the standard deviation of the data - model residuals
    data_std = np.std(data_res)

    if not isinstance(new_res, np.ndarray):
        new_res = np.array(new_res, ndmin=1)

    outlier_flag = np.array([False] * len(new_res))
    outlier_flag[np.abs(new_res) >= (data_std * std_cut)] = True

    return outlier_flag




[docs]
def zero_func(x, axis=None):
    """Dummy function to return a zero.
    Can be used as the centre function in astropy.stats.sigma_clip to get std relative to zero rather than median/mean value.

    Parameters
    -----------

    x:
        Dummy variable
    axis:
        required to match the syntax of numpy functions such as np.median
    """
    return 0




[docs]
def sigma_clip(data_res, **kwargs):
    """Wrapper function for astropy.stats.sigma_clip, here we define the default centre of the data (the data - model residuals) to be zero

    Parameters
    -----------

    data_res: array
        The residuals of the data compared to the model.
    kwargs: dict
        Dictionary of keyword arguments from astropy.stats.sigma_clip, namely:
        sigma : default 3
        maxiters: default 5
        cenfunc: default 'median'


    Returns
    -----------
    sig_clip_mask: array
        returns only the mask from astropy.stats.sigma_clip
    """

    sig_clip = astropy_sigma_clip(data_res, **kwargs)
    sig_clip_mask = sig_clip.mask

    return sig_clip_mask




[docs]
def outlier_sigma_diff(data_res, data_sigma, std_sigma=1):
    """Function to identify outliers by comparing the uncertainty of measurements to their residuals

    Parameters
    -----------

    data_res: array
        The residuals of the data compared to the model.
    data_sigma: array
        The uncertainties of the data points
    std_sigma: float
        Number of standard deviations to identify outliers, assuming the data uncertainties represent one standard deviation

    Returns
    -----------
    outlier_flag : array
       Array of flag indicating if data point is an outlier (True)
    """

    outlier_flag = np.abs(data_res) > (std_sigma * data_sigma)

    return outlier_flag




[docs]
def apparition_gap_finder(x, dx=100.0):
    """Function to find gaps in a data series. E.g. given an array of observation times, find the different apparitions of an asteroid from gaps between observations larger than a given value.

    Parameters
    -----------

    x: array
        The SORTED data array to search for gaps
    dx: float
        The size of gap to identify in data series

    Returns
    -----------
    x_gaps: array
        Values of x which define the groups in the data, where each group is x_gaps[i] <= x < x_gaps[i+1]
    """

    ## TODO: check that x is sorted?

    # find the difference between each data point
    x_diff = np.diff(x)

    # select only differences greater than value dx
    x_diff_mask = x_diff > dx

    # find the values in x that correspond to the start of each group
    x_gaps = x[1:][x_diff_mask]

    # add the very first observation to the array
    x_gaps = np.insert(x_gaps, 0, x[0])

    # add the very last observation to the array, including a shift of dx to ensure the boundary inequalities work
    x_gaps = np.insert(x_gaps, len(x_gaps), x[-1] + dx)

    return x_gaps




[docs]
def get_df_obs_filt(planetoid, filt, x_col="midPointMjdTai", x1=None, x2=None, col_list=None, pc_model=None):
    """Retrieve a dataframe of observations in a given filter. Has the option to limit the observations to a range of values, e.g. times/phase angles, if required.

    Parameters
    -----------

    planetoid: object
        Adler planetoid object containging the observations
    filt: str
        The filter to query
    x_col: str
        Column name to use for ordering values and limiting observations to a range: x1 <= df_obs[x_col] <= x2
    x1: float
        Lower limit value for x_col
    x2: float
        Upper limit value for x_col
    col_list: list
        List of column names to retrieve in addition to x_col, otherwise all columns are retrieved.
        N.B. if AbsMag is included in col_list then the absolute magnitude is calculated for the phase curve model pc_model
    pc_model: object
        Adler PhaseCurve model used to calculate AbsMag if required

    Returns
    -----------
    df_obs: DataFrame
        DataFrame of observations in the requested filter, ordered by x_col and with any x_col limits applied
    """

    # get observations in filter as a dataframe
    obs = planetoid.observations_in_filter(filt)
    # TODO: add option to get all filters? calculating AbsMag gets complicated...
    # TODO: split the dataframe functions into a separate function?
    df_obs = pd.DataFrame(obs.__dict__)

    # if no list of columns is provided, retrieve all columns
    if col_list is None:
        col_list = list(df_obs)
    else:
        col_list = [x_col] + col_list

    # calculate the phase curve absolute magnitudes (i.e. remove phase curve variation from reduced_mag)
    if "AbsMag" in col_list:
        # calculate the model absolute magnitude
        # TODO: add robustness to the units, phaseAngle and reduced_mag must match pc_model
        # For now we must assume that there are no units, and that degrees have been passed...
        # df_obs["AbsMag"] = pc_model.AbsMag(obs.phaseAngle * u.deg, obs.reduced_mag * u.mag).value
        df_obs["AbsMag"] = pc_model.AbsMag(np.radians(obs.phaseAngle), obs.reduced_mag)

    # select only the required columns
    df_obs = df_obs[col_list]

    # sort into x_col order
    df_obs = df_obs.sort_values(x_col)
    # limit to range in x_col if limits x1, x2 are supplied
    if (x1 is not None) & (x2 is not None):  # TODO: add functionality to set only upper/lower limit
        x_mask = (df_obs[x_col] >= x1) & (df_obs[x_col] <= x2)
        df_obs = df_obs[x_mask]
    # reset the dataframe index
    df_obs = df_obs.reset_index(drop=True)

    return df_obs




[docs]
def large_magErr_mask(df_obs, magErr_percentile_cut=95):
    """
    # TODO docstring
    """
    magErr_mask = df_obs.magErr <= np.nanpercentile(df_obs.magErr, q=magErr_percentile_cut)
    return magErr_mask




[docs]
def split_obs(df_obs, process_mjd, n_new_nights=3):
    """
    # TODO docstring
    """
    mask = df_obs["midPointMjdTai"] < process_mjd - n_new_nights

    df_obs_old = df_obs[mask].copy()
    df_obs_new = df_obs[~mask].copy()

    return df_obs_old, df_obs_new, mask




[docs]
def running_stats(N, sum_x, sum_x2):
    """
    Function to calculate the running mean and std statistics from the number, sum and sum of the squares of a dataset.
    This function allows us to calculate stats for a dataset where we do not record every value, we record only the three summary values:
    https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods

    Parameters
    -----------

    N: int
        Number of data points, x
    sum_x: float
        Sum of all data points, x
    sum_x2: float
        Sum of the square of each data point, x**2

    Returns
    -----------
    mean_x, std_x: float, float
        Then mean and std of the data, x

    """
    mean_x = sum_x / N
    std_x = np.sqrt((N * sum_x2) - (sum_x**2.0)) / N
    return mean_x, std_x




[docs]
def execute_subprocess(cmd):
    """
    Wrapper function to execute a terminal command using the python subprocess module

    Parameters
    -----------

    cmd: str
        Command to be executed

    Returns
    -----------
    out, err: str, str
        The output and any error messages returned by subprocess

    """

    # run the command
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    # TODO: log the commands that were run

    # return the output and errors from the terminal
    out = result.stdout
    err = result.stderr

    return out, err