Module w4h.clean

The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)

Expand source code
"""The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)
"""

import inspect

import numpy as np
import pandas as pd

from w4h import logger_function, verbose_print

# This function removes all data from the downholeData table where there is no location information (in the headerData table). This includes elevation info too
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells and well intervals where there is no location information

    Parameters
    ----------
    df_with_locations : pandas.DataFrame
        Pandas dataframe containing the well records and their location columns.
        This dataframe is modified in place (no-data values replaced, rows dropped).
    xcol : str, default='LONGITUDE'
        Name of the column containing the x (e.g., longitude) location value
    ycol : str, default='LATITUDE'
        Name of the column containing the y (e.g., latitude) location value
    no_data_val_table : any, default=''
        Value in the location columns that indicates no data is present (replaced with np.nan before dropping)
    verbose : bool, default = False
        Whether to print a summary of the removed records to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_locations : pandas.DataFrame
        The same dataframe object that was passed in, containing only rows with both x and y location values
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations'])

    before = df_with_locations.shape[0] #Extract length of data before this process

    # Assign the cleaned column back to the dataframe instead of calling
    # replace(inplace=True) on the selected column: that chained form operates on a
    # temporary Series under pandas Copy-on-Write and leaves the dataframe untouched.
    location_cols = [xcol, ycol]
    for col in location_cols:
        df_with_locations[col] = df_with_locations[col].replace(no_data_val_table, np.nan)

    # A list for `subset` works on all pandas versions (a bare string only on newer ones);
    # a row is dropped if either coordinate is missing.
    df_with_locations.dropna(subset=location_cols, inplace=True)

    if verbose:
        after = df_with_locations.shape[0]
        print('Removed well records with no location information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without location information".format(before-after))

    return df_with_locations

# Function to remove data (intended for headerData) without surface topography information
# THIS ASSUMES AND SHOULD ONLY BE RUN AFTER ALL DESIRED SURFACE TOPO DATASETS HAVE BEEN MERGED/ADDED
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells that do not have topography data (needed for layer selection later).

    This function is intended to be run on the metadata table after an attempt has been made to add elevations.

    Parameters
    ----------
    df_with_topo : pandas.DataFrame
        Pandas dataframe containing elevation information.
        This dataframe is modified in place (no-data values replaced, rows dropped).
    zcol : str, default='ELEVATION'
        Name of elevation column
    no_data_val_table : any, default=''
        Value in dataset that indicates no data is present (replaced with np.nan)
    verbose : bool, optional
        Whether to print outputs, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in, with rows with no topography removed.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo'])

    before = df_with_topo.shape[0]

    # Assign the cleaned column back to the dataframe instead of calling
    # replace(inplace=True) on the selected column: that chained form operates on a
    # temporary Series under pandas Copy-on-Write and leaves the dataframe untouched.
    df_with_topo[zcol] = df_with_topo[zcol].replace(no_data_val_table, np.nan)
    df_with_topo.dropna(subset=[zcol], inplace=True)

    if verbose:
        after = df_with_topo.shape[0]
        print('Removed well records with no surface elevation information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without surface elevation information".format(before-after))

    return df_with_topo

# This function drops all records in the downholedata with no depth information (either top or bottom depth of well interval)
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Drop well intervals that are missing a top or a bottom depth.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions. Its top and bottom columns are
        overwritten with the no-data value replaced by np.nan.
    top_col : str, optional
        Name of column containing information on the top of the well intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        Value marking missing depth data in the input; it is replaced by np.nan before rows are dropped, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        New dataframe (index reset) holding only the intervals with both depths present
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Convert the no-data marker to nan in both depth columns so dropna can find it
    depth_cols = [top_col, bottom_col]
    for col in depth_cols:
        df_with_depth[col] = df_with_depth[col].replace(no_data_val_table, np.nan)

    n_before = len(df_with_depth)

    # A record is kept only when both its top and its bottom depth are present
    cleaned = df_with_depth.dropna(subset=depth_cols).reset_index(drop=True)

    if verbose:
        n_after = len(cleaned)
        print('Removed well records with no depth information. ')
        print(f"\tNumber of records before removing: {n_before}")
        print(f"\tNumber of records after removing: {n_after}")
        print(f"\t\t{n_before - n_after} well records removed without depth information")

    return cleaned

# This function drops all records in downholeData with bad depth information (where the bottom of a record is nearer to the surface than the top)
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Pandas dataframe containing the well records and descriptions for each interval.
        A 'THICKNESS' column is added to (or overwritten in) this dataframe.
    top_col : str, default='TOP'
        The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
    bottom_col : str, default='BOTTOM'
        The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
    depth_type : str, {'depth', 'elevation', 'elev'}
        Whether the table is organized by depth or elevation (case-insensitive). If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
    verbose : bool, default = False
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe (index reset) with the records removed where the top is indicated to be below the bottom.

    Raises
    ------
    ValueError
        If depth_type is not one of 'depth', 'elevation', or 'elev'.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    # Normalize once so every accepted spelling (including 'elev') is case-insensitive
    depth_kind = depth_type.lower()
    if depth_kind == 'depth':
        df_with_depth['THICKNESS'] = df_with_depth[bottom_col] - df_with_depth[top_col] #Calculate interval thickness
    elif depth_kind in ('elevation', 'elev'):
        df_with_depth['THICKNESS'] = df_with_depth[top_col] - df_with_depth[bottom_col] #Calculate interval thickness
    else:
        # Fail early with a clear message rather than hitting a KeyError on 'THICKNESS'
        # below (or silently filtering on a stale 'THICKNESS' column already in the table).
        raise ValueError("depth_type must be 'depth' or 'elevation', got {!r}".format(depth_type))

    before = df_with_depth.shape[0] #Calculate number of rows before dropping

    # Keep rows whose thickness is zero or positive (bottom is not above top);
    # rows with a missing (nan) thickness fail the comparison and are dropped as well.
    df_with_depth = df_with_depth[df_with_depth['THICKNESS'] >= 0].reset_index(drop=True)

    if verbose:
        after = df_with_depth.shape[0]
        print('Removed well records with obviously bad depth information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} well records removed with bad depth information".format(before-after))

    return df_with_depth

# This function drops all records in downholeData with no information in the description field
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Drop every well record whose description column is empty.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Pandas dataframe containing the well records with their individual descriptions.
        Its description column is overwritten with the no-data value replaced by np.nan.
    description_col : str, optional
        Name of the column containing the geologic description of each interval, by default 'FORMATION'
    no_data_val_table : str, optional
        Value found in the column when there is no description; it is replaced by np.nan before rows are dropped, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe (index reset) holding only the records that have a description.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Convert the no-data marker to nan so dropna can find the empty descriptions
    descriptions = df_with_descriptions[description_col]
    df_with_descriptions[description_col] = descriptions.replace(no_data_val_table, np.nan)

    n_before = len(df_with_descriptions)

    # Keep only the records that still have a description, renumbering the index from 0
    described = df_with_descriptions.dropna(subset=[description_col]).reset_index(drop=True)

    if verbose:
        n_after = len(described)
        print('Removed well records without geologic descriptions. ')
        print(f"\tNumber of records before removing: {n_before}")
        print(f"\tNumber of records after removing: {n_after}")
        print(f"\t\t{n_before - n_after} well records removed without geologic descriptions")

    return described

Functions

def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False)

Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)

Parameters

df_with_depth : pandas.DataFrame
Pandas dataframe containing the well records and descriptions for each interval
top_col : str, default='TOP'
The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
bottom_col : str, default='BOTTOM'
The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
depth_type : str, {'depth', 'elevation'}
Whether the table is organized by depth or elevation. If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
verbose : bool, default = False
Whether to print results to the terminal, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.DataFrame
Pandas dataframe with the records removed where the top is indicated to be below the bottom.
Expand source code
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Pandas dataframe containing the well records and descriptions for each interval.
        A 'THICKNESS' column is added to (or overwritten in) this dataframe.
    top_col : str, default='TOP'
        The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
    bottom_col : str, default='BOTTOM'
        The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
    depth_type : str, {'depth', 'elevation', 'elev'}
        Whether the table is organized by depth or elevation (case-insensitive). If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
    verbose : bool, default = False
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe (index reset) with the records removed where the top is indicated to be below the bottom.

    Raises
    ------
    ValueError
        If depth_type is not one of 'depth', 'elevation', or 'elev'.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    # Normalize once so every accepted spelling (including 'elev') is case-insensitive
    depth_kind = depth_type.lower()
    if depth_kind == 'depth':
        df_with_depth['THICKNESS'] = df_with_depth[bottom_col] - df_with_depth[top_col] #Calculate interval thickness
    elif depth_kind in ('elevation', 'elev'):
        df_with_depth['THICKNESS'] = df_with_depth[top_col] - df_with_depth[bottom_col] #Calculate interval thickness
    else:
        # Fail early with a clear message rather than hitting a KeyError on 'THICKNESS'
        # below (or silently filtering on a stale 'THICKNESS' column already in the table).
        raise ValueError("depth_type must be 'depth' or 'elevation', got {!r}".format(depth_type))

    before = df_with_depth.shape[0] #Calculate number of rows before dropping

    # Keep rows whose thickness is zero or positive (bottom is not above top);
    # rows with a missing (nan) thickness fail the comparison and are dropped as well.
    df_with_depth = df_with_depth[df_with_depth['THICKNESS'] >= 0].reset_index(drop=True)

    if verbose:
        after = df_with_depth.shape[0]
        print('Removed well records with obviously bad depth information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} well records removed with bad depth information".format(before-after))

    return df_with_depth
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False)

Function to remove well intervals with no depth information

Parameters

df_with_depth : pandas.DataFrame
Dataframe containing well descriptions
top_col : str, optional
Name of column containing information on the top of the well intervals, by default 'TOP'
bottom_col : str, optional
Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
no_data_val_table : any, optional
No data value in the input data, used by this function to indicate that depth data is not there, to be replaced by np.nan, by default ''
verbose : bool, optional
Whether to print results to console, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

df_with_depth : pandas.DataFrame
Dataframe with depths dropped
Expand source code
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Drop well intervals that are missing a top or a bottom depth.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions. Its top and bottom columns are
        overwritten with the no-data value replaced by np.nan.
    top_col : str, optional
        Name of column containing information on the top of the well intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        Value marking missing depth data in the input; it is replaced by np.nan before rows are dropped, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        New dataframe (index reset) holding only the intervals with both depths present
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Convert the no-data marker to nan in both depth columns so dropna can find it
    depth_cols = [top_col, bottom_col]
    for col in depth_cols:
        df_with_depth[col] = df_with_depth[col].replace(no_data_val_table, np.nan)

    n_before = len(df_with_depth)

    # A record is kept only when both its top and its bottom depth are present
    cleaned = df_with_depth.dropna(subset=depth_cols).reset_index(drop=True)

    if verbose:
        n_after = len(cleaned)
        print('Removed well records with no depth information. ')
        print(f"\tNumber of records before removing: {n_before}")
        print(f"\tNumber of records after removing: {n_after}")
        print(f"\t\t{n_before - n_after} well records removed without depth information")

    return cleaned
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False)

Function that removes all records in the dataframe containing the well descriptions where no description is given.

Parameters

df_with_descriptions : pandas.DataFrame
Pandas dataframe containing the well records with their individual descriptions
description_col : str, optional
Name of the column containing the geologic description of each interval, by default 'FORMATION'
no_data_val_table : str, optional
The value expected if the column is empty or there is no data. These will be replaced by np.nan before being removed, by default ''
verbose : bool, optional
Whether to print the results of this step to the terminal, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.DataFrame
Pandas dataframe with records with no description removed.
Expand source code
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Drop every well record whose description column is empty.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Pandas dataframe containing the well records with their individual descriptions.
        Its description column is overwritten with the no-data value replaced by np.nan.
    description_col : str, optional
        Name of the column containing the geologic description of each interval, by default 'FORMATION'
    no_data_val_table : str, optional
        Value found in the column when there is no description; it is replaced by np.nan before rows are dropped, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe (index reset) holding only the records that have a description.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Convert the no-data marker to nan so dropna can find the empty descriptions
    descriptions = df_with_descriptions[description_col]
    df_with_descriptions[description_col] = descriptions.replace(no_data_val_table, np.nan)

    n_before = len(df_with_descriptions)

    # Keep only the records that still have a description, renumbering the index from 0
    described = df_with_descriptions.dropna(subset=[description_col]).reset_index(drop=True)

    if verbose:
        n_after = len(described)
        print('Removed well records without geologic descriptions. ')
        print(f"\tNumber of records before removing: {n_before}")
        print(f"\tNumber of records after removing: {n_after}")
        print(f"\t\t{n_before - n_after} well records removed without geologic descriptions")

    return described
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False)

Function to remove wells that do not have topography data (needed for layer selection later).

This function is intended to be run on the metadata table after an attempt has been made to add elevations.

Parameters

df_with_topo : pandas.DataFrame
Pandas dataframe containing elevation information.
zcol : str
Name of elevation column
no_data_val_table : any
Value in dataset that indicates no data is present (replaced with np.nan)
verbose : bool, optional
Whether to print outputs, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.DataFrame
Pandas dataframe with intervals with no topography removed.
Expand source code
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells that do not have topography data (needed for layer selection later).

    This function is intended to be run on the metadata table after an attempt has been made to add elevations.

    Parameters
    ----------
    df_with_topo : pandas.DataFrame
        Pandas dataframe containing elevation information.
        This dataframe is modified in place (no-data values replaced, rows dropped).
    zcol : str, default='ELEVATION'
        Name of elevation column
    no_data_val_table : any, default=''
        Value in dataset that indicates no data is present (replaced with np.nan)
    verbose : bool, optional
        Whether to print outputs, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in, with rows with no topography removed.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo'])

    before = df_with_topo.shape[0]

    # Assign the cleaned column back to the dataframe instead of calling
    # replace(inplace=True) on the selected column: that chained form operates on a
    # temporary Series under pandas Copy-on-Write and leaves the dataframe untouched.
    df_with_topo[zcol] = df_with_topo[zcol].replace(no_data_val_table, np.nan)
    df_with_topo.dropna(subset=[zcol], inplace=True)

    if verbose:
        after = df_with_topo.shape[0]
        print('Removed well records with no surface elevation information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without surface elevation information".format(before-after))

    return df_with_topo
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False)

Function to remove wells and well intervals where there is no location information

Parameters

df_with_locations : pandas.DataFrame
Pandas dataframe containing well descriptions
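xcol : str, default='LONGITUDE'
Name of the column containing the x (e.g., longitude) location value
ycol : str, default='LATITUDE'
Name of the column containing the y (e.g., latitude) location value
no_data_val_table : any, default=''
Value in the location columns that indicates no data is present (replaced with np.nan before dropping)
verbose : bool, default = False
Whether to print a summary of the removed records to the terminal, by default False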
log : bool, default = False
Whether to log results to log file, by default False

Returns

df_with_locations : pandas.DataFrame
Pandas dataframe containing only data with location information
Expand source code
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells and well intervals where there is no location information

    Parameters
    ----------
    df_with_locations : pandas.DataFrame
        Pandas dataframe containing the well records and their location columns.
        This dataframe is modified in place (no-data values replaced, rows dropped).
    xcol : str, default='LONGITUDE'
        Name of the column containing the x (e.g., longitude) location value
    ycol : str, default='LATITUDE'
        Name of the column containing the y (e.g., latitude) location value
    no_data_val_table : any, default=''
        Value in the location columns that indicates no data is present (replaced with np.nan before dropping)
    verbose : bool, default = False
        Whether to print a summary of the removed records to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_locations : pandas.DataFrame
        The same dataframe object that was passed in, containing only rows with both x and y location values
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations'])

    before = df_with_locations.shape[0] #Extract length of data before this process

    # Assign the cleaned column back to the dataframe instead of calling
    # replace(inplace=True) on the selected column: that chained form operates on a
    # temporary Series under pandas Copy-on-Write and leaves the dataframe untouched.
    location_cols = [xcol, ycol]
    for col in location_cols:
        df_with_locations[col] = df_with_locations[col].replace(no_data_val_table, np.nan)

    # A list for `subset` works on all pandas versions (a bare string only on newer ones);
    # a row is dropped if either coordinate is missing.
    df_with_locations.dropna(subset=location_cols, inplace=True)

    if verbose:
        after = df_with_locations.shape[0]
        print('Removed well records with no location information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without location information".format(before-after))

    return df_with_locations