Module `w4h.classify`

The Classify module contains functions for defining geological intervals into a preset subset of geologic interpretations.

Expand source code

"""The Classify module contains functions for defining geological intervals into a preset subset of geologic interpretations.
"""

import datetime
import inspect

import pandas as pd
import numpy as np

from w4h import logger_function, verbose_print
#The following flags are used to mark the classification method:
#- 0: Not classified
#- 1: Specific Search Term Match
#- 2: wPermits bedrock top pick
#- 3: Intervals >550' below ground surface
#- 4: Wildcard match (startTerm) - no context
#- 5: Wildcard match (any substring) - more liberal
#- Top of well?

#Define records with full search term
def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify terms that have been specifically defined in the terms_df.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe with unclassified well descriptions.
    terms_df : pandas.DataFrame
        Dataframe containing the classifications
    description_col : str, default='FORMATION'
        Column name in df containing the well descriptions, by default 'FORMATION'.
    terms_col : str, default='DESCRIPTION'
        Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
    verbose : bool, default=False
        Whether to print up results, by default False.

    Returns
    -------
    df_Interps : pandas.DataFrame
        Dataframe containing the well descriptions and their matched classifications.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(specific_define, locals(), exclude_params=['df', 'terms_df'])

    if description_col != terms_col:
        terms_df.rename(columns={terms_col:description_col}, inplace=True)
        terms_col = description_col

    df[description_col] = df[description_col].astype(str)
    terms_df[terms_col] = terms_df[terms_col].astype(str)

    df[description_col] = df[description_col].str.casefold()
    terms_df[terms_col] = terms_df[terms_col].str.casefold()
    #df['FORMATION'] = df['FORMATION'].str.strip(['.,:?\t\s'])
    #terms_df['FORMATION'] = terms_df['FORMATION'].str.strip(['.,:?\t\s'])

    terms_df.drop_duplicates(subset=terms_col, keep='last', inplace=True)
    terms_df.reset_index(drop=True, inplace=True)
    
    df_Interps = pd.merge(left=df, right=terms_df.set_index(terms_col), on=description_col, how='left')
    df_Interps.rename(columns={description_col:'FORMATION'}, inplace=True)
    df_Interps['BEDROCK_FLAG'] = df_Interps['LITHOLOGY'] == 'BEDROCK'
    
    if verbose:
        print('Classified well records using exact matches')
        numRecsClass = int(df_Interps[df_Interps['CLASS_FLAG']==1]['CLASS_FLAG'].sum())
        recsRemainig = int(df_Interps.shape[0]-numRecsClass)
        percRecsClass =round((df_Interps[df_Interps['CLASS_FLAG']==1]['CLASS_FLAG'].sum()/df_Interps.shape[0])*100,2)
        print("\t{} records classified using exact matches ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemainig, 1-percRecsClass))

    return df_Interps

def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False):
    """Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    classification_col : str, default = 'CLASS_FLAG'
        Name of column containing the classification flag, by default 'CLASS_FLAG'
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    Two-item tuple of pandas.Dataframe
        tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    classifedDF= df[df[classification_col].notna()] #Already-classifed data
    searchDF = df[df[classification_col].isna()] #Unclassified data
        
    return classifedDF, searchDF

#Classify downhole data by the initial substring
def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to starting substring. 

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'FORMATION'
        Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(start_define, locals(), exclude_params=['df', 'terms_df'])
    #if verbose:
    #    #Estimate when it will end, based on test run
    #    estTime = df.shape[0]/3054409 * 6 #It took about 6 minutes to classify data with entire dataframe. This estimates the fraction of that it will take
    #    nowTime = datetime.datetime.now()
    #    endTime = nowTime+datetime.timedelta(minutes=estTime)
    #    print("Start Term process should be done by {:d}:{:02d}".format(endTime.hour, endTime.minute))

    #First, for each startterm, find all results in df that start with, add classification flag, and add interpretation.
    for i,s in enumerate(terms_df[terms_col]):
        df['CLASS_FLAG'].where(~df[description_col].str.startswith(s,na=False),4,inplace=True)
        df['LITHOLOGY'].where(~df[description_col].str.startswith(s,na=False),terms_df.loc[i,'LITHOLOGY'],inplace=True)
    df['BEDROCK_FLAG'].loc[df["LITHOLOGY"] == 'BEDROCK']
    
    if verbose:
        numRecsClass = int(df[df['CLASS_FLAG']==4]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==4]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified well records using initial substring matches')
        print("\t{} records classified using initial substring matches ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
    return df

#Classify downhole data by any substring
def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to any substring. 

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'FORMATION'
        Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(wildcard_define, locals(), exclude_params=['df', 'terms_df'])
    #if verbose:
    #    #Estimate when it will end, based on test run
    #    estTime = df.shape[0]/3054409 * 6 #It took about 6 minutes to classify data with entire dataframe. This estimates the fraction of that it will take
    #    nowTime = datetime.datetime.now()
    #    endTime = nowTime+datetime.timedelta(minutes=estTime)
    #    print("Wildcard Term process should be done by (?) {:d}:{:02d}".format(endTime.hour, endTime.minute))

    #First, for each startterm, find all results in df that start with, add classification flag, and add interpretation.
    for i,s in enumerate(terms_df[terms_col]):
        df['CLASS_FLAG'].where(~df[description_col].str.contains(s, case=False, regex=False, na=False), 5, inplace=True)
        df['LITHOLOGY'].where(~df[description_col].str.contains(s, case=False, regex=False, na=False),terms_df.loc[i,'LITHOLOGY'],inplace=True)
    df['BEDROCK_FLAG'].loc[df["LITHOLOGY"] == 'BEDROCK']
    
    if verbose:
        numRecsClass = int(df[df['CLASS_FLAG']==5]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==5]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified well records using any substring (wildcard) match')
        print("\t{} records classified using any substring match ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
    return df

#Merge data back together
def remerge_data(classifieddf, searchdf):
    """Function to merge newly-classified (or not) and previously classified data

    Parameters
    ----------
    classifieddf : pandas.DataFrame
        Dataframe that had already been classified previously
    searchdf : pandas.DataFrame
        Dataframe with new classifications

    Returns
    -------
    remergeDF : pandas.DataFrame
        Dataframe containing all the data, merged back together
    """
    remergeDF = pd.concat([classifieddf,searchdf], join='inner').sort_index()
    return remergeDF

#Define well intervals by depth
def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False):
    """Function to define all intervals lower than thresh as bedrock

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to classify
    top_col : str, default = 'TOP'
        Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
    thresh : float, default = 550.0
        Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing intervals classified as bedrock due to depth
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(depth_define, locals(), exclude_params=['df'])
    df = df.copy()
    df['CLASS_FLAG'].mask(df[top_col]>thresh, 3 ,inplace=True) #Add a Classification Flag of 3 (bedrock b/c it's deepter than 550') to all records where the top of the interval is >550'
    df['BEDROCK_FLAG'].mask(df[top_col]>thresh, True, inplace=True)

    if verbose:
        if df.CLASS_FLAG.notnull().sum() == 0:
            brDepthClass = 0
        else:
            brDepthClass = df['CLASS_FLAG'].value_counts()[3.0]
        total = df.shape[0]

        numRecsClass = int(df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified bedrock well records using depth threshold at depth of {}'.format(thresh))
        print("\t{} records classified using bedrock threshold depth ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
        
    return df

#Output data that still needs to be defined
def export_undefined(df, outdir):
    """Function to export terms that still need to be defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing at least some unclassified data
    outdir : str or pathlib.Path
        Directory to save file. Filename will be generated automatically based on today's date.

    Returns
    -------
    stillNeededDF : pandas.DataFrame
        Dataframe containing only unclassified terms, and the number of times they occur
    """
    import pathlib
    
    
    if isinstance(outdir, pathlib.PurePath):
        if not outdir.is_dir() or not outdir.exists():
            print('Please specify a valid directory for export. Filename is generated automatically.')
            return
        outdir = outdir.as_posix()
    else:
        outdir.replace('\\','/')
        outdir.replace('\\'[-1], '/')

    #Get directory path correct        
    if outdir[-1] != '/':
        outdir = outdir+'/'

    todayDate = datetime.date.today()
    todayDateStr = str(todayDate)
    searchDF = df[df['CLASS_FLAG'].isna()]
    
    stillNeededDF=searchDF['FORMATION'].value_counts()
    stillNeededDF.to_csv(outdir+'Undefined_'+todayDateStr+'.csv')
    return stillNeededDF

#Fill in unclassified rows' flags with 0
def fill_unclassified(df, classification_col='CLASS_FLAG'):
    """Fills unclassified rows in 'CLASS_FLAG' column with np.nan

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe on which to perform operation

    Returns
    -------
    df : pandas.DataFrame
        Dataframe on which operation has been performed
    """
    df[classification_col].fillna(0, inplace=True)
    return df

#Merge lithologies to main df based on classifications
def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool'):
    """Function to merge lithologies and target booleans based on classifications
    
    Parameters
    ----------
    well_data_df : pandas.DataFrame
        Dataframe containing classified well data
    targinterps_df : pandas.DataFrame
        Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
    target_col : str, default = 'TARGET'
        Name of column in targinterps_df containing the target interpretations
    target_class, default = 'bool'
        Whether the input column is using boolean values as its target indicator
        
    Returns
    -------
    df_targ : pandas.DataFrame
        Dataframe containing merged lithologies/targets
    
    """    
    
    #by default, use the boolean input 
    if target_class=='bool':
        targinterps_df[target_col] = targinterps_df[target_col].where(targinterps_df[target_col]=='1', other='0').astype(int)
        targinterps_df[target_col].fillna(value=0, inplace=True)
    else:
        targinterps_df[target_col].replace('DoNotUse', value=-1, inplace=True)
        targinterps_df[target_col].fillna(value=-2, inplace=True)
        targinterps_df[target_col].astype(np.int8)

    df_targ = pd.merge(well_data_df, targinterps_df.set_index(interp_col), right_on=interp_col, left_on='LITHOLOGY', how='left')
    
    return df_targ

#Function to get unique wells
def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False):
    """Gets unique wells as a dataframe based on a given column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all wells and/or well intervals of interest
    wellid_col : str, default="API_NUMBER"
        Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    wellsDF
        DataFrame containing only the unique well IDs
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_unique_wells, locals(), exclude_params=['df'])
    #Get Unique well APIs
    uniqueWells = df[wellid_col].unique()
    wellsDF = pd.DataFrame(uniqueWells)
    if verbose:
        print('Number of unique wells: '+str(wellsDF.shape[0]))
    wellsDF.columns = ['UNIQUE_ID']
    
    return wellsDF

#Quickly sort dataframe
def sort_dataframe(df, sort_cols=['API_NUMBER','TOP'], remove_nans=True):
    """Function to sort dataframe by one or more columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be sorted
    sort_cols : str or list of str, default = ['API_NUMBER','TOP']
        Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP']
    remove_nans : bool, default = True
        Whether or not to remove nans in the process, by default True

    Returns
    -------
    df_sorted : pandas.DataFrame
        Sorted dataframe
    """
    #Sort columns for better processing later
    df_sorted = df.sort_values(sort_cols)
    df_sorted.reset_index(inplace=True, drop=True)
    if remove_nans:
        df_sorted = df_sorted[pd.notna(df_sorted["LITHOLOGY"])]
    return df_sorted

Functions

def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False)

Function to define all intervals lower than thresh as bedrock

Parameters

df : pandas.DataFrame: Dataframe to classify
top_col : str, default = 'TOP': Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
thresh : float, default = 550.0: Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
verbose : bool, default = False: Whether to print results, by default False
log : bool, default = True: Whether to log results to log file

Returns

df : pandas.DataFrame: Dataframe containing intervals classified as bedrock due to depth

Expand source code

def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False):
    """Function to define all intervals lower than thresh as bedrock

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to classify
    top_col : str, default = 'TOP'
        Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
    thresh : float, default = 550.0
        Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing intervals classified as bedrock due to depth
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(depth_define, locals(), exclude_params=['df'])
    df = df.copy()
    df['CLASS_FLAG'].mask(df[top_col]>thresh, 3 ,inplace=True) #Add a Classification Flag of 3 (bedrock b/c it's deepter than 550') to all records where the top of the interval is >550'
    df['BEDROCK_FLAG'].mask(df[top_col]>thresh, True, inplace=True)

    if verbose:
        if df.CLASS_FLAG.notnull().sum() == 0:
            brDepthClass = 0
        else:
            brDepthClass = df['CLASS_FLAG'].value_counts()[3.0]
        total = df.shape[0]

        numRecsClass = int(df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified bedrock well records using depth threshold at depth of {}'.format(thresh))
        print("\t{} records classified using bedrock threshold depth ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
        
    return df

def export_undefined(df, outdir)

Function to export terms that still need to be defined.

Parameters

df : pandas.DataFrame: Dataframe containing at least some unclassified data
outdir : str or pathlib.Path: Directory to save file. Filename will be generated automatically based on today's date.

Returns

stillNeededDF : pandas.DataFrame: Dataframe containing only unclassified terms, and the number of times they occur

Expand source code

def export_undefined(df, outdir):
    """Function to export terms that still need to be defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing at least some unclassified data
    outdir : str or pathlib.Path
        Directory to save file. Filename will be generated automatically based on today's date.

    Returns
    -------
    stillNeededDF : pandas.DataFrame
        Dataframe containing only unclassified terms, and the number of times they occur
    """
    import pathlib
    
    
    if isinstance(outdir, pathlib.PurePath):
        if not outdir.is_dir() or not outdir.exists():
            print('Please specify a valid directory for export. Filename is generated automatically.')
            return
        outdir = outdir.as_posix()
    else:
        outdir.replace('\\','/')
        outdir.replace('\\'[-1], '/')

    #Get directory path correct        
    if outdir[-1] != '/':
        outdir = outdir+'/'

    todayDate = datetime.date.today()
    todayDateStr = str(todayDate)
    searchDF = df[df['CLASS_FLAG'].isna()]
    
    stillNeededDF=searchDF['FORMATION'].value_counts()
    stillNeededDF.to_csv(outdir+'Undefined_'+todayDateStr+'.csv')
    return stillNeededDF

def fill_unclassified(df, classification_col='CLASS_FLAG')

Fills unclassified rows in 'CLASS_FLAG' column with np.nan

Parameters

df : pandas.DataFrame: Dataframe on which to perform operation

Returns

df : pandas.DataFrame: Dataframe on which operation has been performed

Expand source code

def fill_unclassified(df, classification_col='CLASS_FLAG'):
    """Fills unclassified rows in 'CLASS_FLAG' column with np.nan

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe on which to perform operation

    Returns
    -------
    df : pandas.DataFrame
        Dataframe on which operation has been performed
    """
    df[classification_col].fillna(0, inplace=True)
    return df

def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False)

Gets unique wells as a dataframe based on a given column name.

Parameters

df : pandas.DataFrame: Dataframe containing all wells and/or well intervals of interest
wellid_col : str, default="API_NUMBER": Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
log : bool, default = False: Whether to log results to log file

Returns

wellsDF: DataFrame containing only the unique well IDs

Expand source code

def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False):
    """Gets unique wells as a dataframe based on a given column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all wells and/or well intervals of interest
    wellid_col : str, default="API_NUMBER"
        Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    wellsDF
        DataFrame containing only the unique well IDs
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_unique_wells, locals(), exclude_params=['df'])
    #Get Unique well APIs
    uniqueWells = df[wellid_col].unique()
    wellsDF = pd.DataFrame(uniqueWells)
    if verbose:
        print('Number of unique wells: '+str(wellsDF.shape[0]))
    wellsDF.columns = ['UNIQUE_ID']
    
    return wellsDF

def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool')

Function to merge lithologies and target booleans based on classifications

Parameters

well_data_df : pandas.DataFrame: Dataframe containing classified well data
targinterps_df : pandas.DataFrame: Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
target_col : str, default = 'TARGET': Name of column in targinterps_df containing the target interpretations

target_class, default = 'bool' Whether the input column is using boolean values as its target indicator

Returns

df_targ : pandas.DataFrame: Dataframe containing merged lithologies/targets

Expand source code

def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool'):
    """Function to merge lithologies and target booleans based on classifications
    
    Parameters
    ----------
    well_data_df : pandas.DataFrame
        Dataframe containing classified well data
    targinterps_df : pandas.DataFrame
        Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
    target_col : str, default = 'TARGET'
        Name of column in targinterps_df containing the target interpretations
    target_class, default = 'bool'
        Whether the input column is using boolean values as its target indicator
        
    Returns
    -------
    df_targ : pandas.DataFrame
        Dataframe containing merged lithologies/targets
    
    """    
    
    #by default, use the boolean input 
    if target_class=='bool':
        targinterps_df[target_col] = targinterps_df[target_col].where(targinterps_df[target_col]=='1', other='0').astype(int)
        targinterps_df[target_col].fillna(value=0, inplace=True)
    else:
        targinterps_df[target_col].replace('DoNotUse', value=-1, inplace=True)
        targinterps_df[target_col].fillna(value=-2, inplace=True)
        targinterps_df[target_col].astype(np.int8)

    df_targ = pd.merge(well_data_df, targinterps_df.set_index(interp_col), right_on=interp_col, left_on='LITHOLOGY', how='left')
    
    return df_targ

def remerge_data(classifieddf, searchdf)

Function to merge newly-classified (or not) and previously classified data

Parameters

classifieddf : pandas.DataFrame: Dataframe that had already been classified previously
searchdf : pandas.DataFrame: Dataframe with new classifications

Returns

remergeDF : pandas.DataFrame: Dataframe containing all the data, merged back together

Expand source code

def remerge_data(classifieddf, searchdf):
    """Function to merge newly-classified (or not) and previously classified data

    Parameters
    ----------
    classifieddf : pandas.DataFrame
        Dataframe that had already been classified previously
    searchdf : pandas.DataFrame
        Dataframe with new classifications

    Returns
    -------
    remergeDF : pandas.DataFrame
        Dataframe containing all the data, merged back together
    """
    remergeDF = pd.concat([classifieddf,searchdf], join='inner').sort_index()
    return remergeDF

def sort_dataframe(df, sort_cols=['API_NUMBER', 'TOP'], remove_nans=True)

Function to sort dataframe by one or more columns.

Parameters

df : pandas.DataFrame: Dataframe to be sorted
sort_cols : str or list of str, default = ['API_NUMBER','TOP']: Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP']
remove_nans : bool, default = True: Whether or not to remove nans in the process, by default True

Returns

df_sorted : pandas.DataFrame: Sorted dataframe

Expand source code

def sort_dataframe(df, sort_cols=['API_NUMBER','TOP'], remove_nans=True):
    """Function to sort dataframe by one or more columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be sorted
    sort_cols : str or list of str, default = ['API_NUMBER','TOP']
        Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP']
    remove_nans : bool, default = True
        Whether or not to remove nans in the process, by default True

    Returns
    -------
    df_sorted : pandas.DataFrame
        Sorted dataframe
    """
    #Sort columns for better processing later
    df_sorted = df.sort_values(sort_cols)
    df_sorted.reset_index(inplace=True, drop=True)
    if remove_nans:
        df_sorted = df_sorted[pd.notna(df_sorted["LITHOLOGY"])]
    return df_sorted

def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify terms that have been specifically defined in the terms_df.

Parameters

df : pandas.DataFrame: Input dataframe with unclassified well descriptions.
terms_df : pandas.DataFrame: Dataframe containing the classifications
description_col : str, default='FORMATION': Column name in df containing the well descriptions, by default 'FORMATION'.
terms_col : str, default='DESCRIPTION': Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
verbose : bool, default=False: Whether to print up results, by default False.

Returns

df_Interps : pandas.DataFrame: Dataframe containing the well descriptions and their matched classifications.

Expand source code

def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify terms that have been specifically defined in the terms_df.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe with unclassified well descriptions.
    terms_df : pandas.DataFrame
        Dataframe containing the classifications
    description_col : str, default='FORMATION'
        Column name in df containing the well descriptions, by default 'FORMATION'.
    terms_col : str, default='DESCRIPTION'
        Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
    verbose : bool, default=False
        Whether to print up results, by default False.

    Returns
    -------
    df_Interps : pandas.DataFrame
        Dataframe containing the well descriptions and their matched classifications.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(specific_define, locals(), exclude_params=['df', 'terms_df'])

    if description_col != terms_col:
        terms_df.rename(columns={terms_col:description_col}, inplace=True)
        terms_col = description_col

    df[description_col] = df[description_col].astype(str)
    terms_df[terms_col] = terms_df[terms_col].astype(str)

    df[description_col] = df[description_col].str.casefold()
    terms_df[terms_col] = terms_df[terms_col].str.casefold()
    #df['FORMATION'] = df['FORMATION'].str.strip(['.,:?\t\s'])
    #terms_df['FORMATION'] = terms_df['FORMATION'].str.strip(['.,:?\t\s'])

    terms_df.drop_duplicates(subset=terms_col, keep='last', inplace=True)
    terms_df.reset_index(drop=True, inplace=True)
    
    df_Interps = pd.merge(left=df, right=terms_df.set_index(terms_col), on=description_col, how='left')
    df_Interps.rename(columns={description_col:'FORMATION'}, inplace=True)
    df_Interps['BEDROCK_FLAG'] = df_Interps['LITHOLOGY'] == 'BEDROCK'
    
    if verbose:
        print('Classified well records using exact matches')
        numRecsClass = int(df_Interps[df_Interps['CLASS_FLAG']==1]['CLASS_FLAG'].sum())
        recsRemainig = int(df_Interps.shape[0]-numRecsClass)
        percRecsClass =round((df_Interps[df_Interps['CLASS_FLAG']==1]['CLASS_FLAG'].sum()/df_Interps.shape[0])*100,2)
        print("\t{} records classified using exact matches ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemainig, 1-percRecsClass))

    return df_Interps

def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False)

Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified.

Parameters

df : pandas.DataFrame: Dataframe containing all the well descriptions
classification_col : str, default = 'CLASS_FLAG': Name of column containing the classification flag, by default 'CLASS_FLAG'
verbose : bool, default = False: Whether to print results, by default False
log : bool, default = False: Whether to log results to log file

Returns

Two-item tuple of pandas.Dataframe: tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.

Expand source code

def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False):
    """Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    classification_col : str, default = 'CLASS_FLAG'
        Name of column containing the classification flag, by default 'CLASS_FLAG'
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    Two-item tuple of pandas.Dataframe
        tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    classifedDF= df[df[classification_col].notna()] #Already-classifed data
    searchDF = df[df[classification_col].isna()] #Unclassified data
        
    return classifedDF, searchDF

def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify descriptions according to starting substring.

Parameters

df : pandas.DataFrame: Dataframe containing all the well descriptions
terms_df : pandas.DataFrame: Dataframe containing all the startswith substrings to use for searching
description_col : str, default = 'FORMATION': Name of column in df containing descriptions, by default 'FORMATION'
terms_col : str, default = 'FORMATION': Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
verbose : bool, default = False: Whether to print out results, by default False
log : bool, default = True: Whether to log results to log file

Returns

df : pandas.DataFrame: Dataframe containing the original data and new classifications

Expand source code

def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to starting substring. 

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'FORMATION'
        Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(start_define, locals(), exclude_params=['df', 'terms_df'])
    #if verbose:
    #    #Estimate when it will end, based on test run
    #    estTime = df.shape[0]/3054409 * 6 #It took about 6 minutes to classify data with entire dataframe. This estimates the fraction of that it will take
    #    nowTime = datetime.datetime.now()
    #    endTime = nowTime+datetime.timedelta(minutes=estTime)
    #    print("Start Term process should be done by {:d}:{:02d}".format(endTime.hour, endTime.minute))

    #First, for each startterm, find all results in df that start with, add classification flag, and add interpretation.
    for i,s in enumerate(terms_df[terms_col]):
        df['CLASS_FLAG'].where(~df[description_col].str.startswith(s,na=False),4,inplace=True)
        df['LITHOLOGY'].where(~df[description_col].str.startswith(s,na=False),terms_df.loc[i,'LITHOLOGY'],inplace=True)
    df['BEDROCK_FLAG'].loc[df["LITHOLOGY"] == 'BEDROCK']
    
    if verbose:
        numRecsClass = int(df[df['CLASS_FLAG']==4]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==4]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified well records using initial substring matches')
        print("\t{} records classified using initial substring matches ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
    return df

def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify descriptions according to any substring.

Parameters

df : pandas.DataFrame: Dataframe containing all the well descriptions
terms_df : pandas.DataFrame: Dataframe containing all the startswith substrings to use for searching
description_col : str, default = 'FORMATION': Name of column in df containing descriptions, by default 'FORMATION'
terms_col : str, default = 'FORMATION': Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
verbose : bool, default = False: Whether to print out results, by default False
log : bool, default = True: Whether to log results to log file

Returns

df : pandas.DataFrame: Dataframe containing the original data and new classifications

Expand source code

def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to any substring. 

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'FORMATION'
        Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = True
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(wildcard_define, locals(), exclude_params=['df', 'terms_df'])
    #if verbose:
    #    #Estimate when it will end, based on test run
    #    estTime = df.shape[0]/3054409 * 6 #It took about 6 minutes to classify data with entire dataframe. This estimates the fraction of that it will take
    #    nowTime = datetime.datetime.now()
    #    endTime = nowTime+datetime.timedelta(minutes=estTime)
    #    print("Wildcard Term process should be done by (?) {:d}:{:02d}".format(endTime.hour, endTime.minute))

    #First, for each startterm, find all results in df that start with, add classification flag, and add interpretation.
    for i,s in enumerate(terms_df[terms_col]):
        df['CLASS_FLAG'].where(~df[description_col].str.contains(s, case=False, regex=False, na=False), 5, inplace=True)
        df['LITHOLOGY'].where(~df[description_col].str.contains(s, case=False, regex=False, na=False),terms_df.loc[i,'LITHOLOGY'],inplace=True)
    df['BEDROCK_FLAG'].loc[df["LITHOLOGY"] == 'BEDROCK']
    
    if verbose:
        numRecsClass = int(df[df['CLASS_FLAG']==5]['CLASS_FLAG'].sum())
        percRecsClass= round((df[df['CLASS_FLAG']==5]['CLASS_FLAG'].sum()/df.shape[0])*100,2)
        recsRemainig = int(df.shape[0]-numRecsClass)

        print('Classified well records using any substring (wildcard) match')
        print("\t{} records classified using any substring match ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, 1-percRecsClass))
    return df