Package w4h

This is the wells4hydrogeology package.

It contains the functions needed to convert raw well descriptions into usable (hydro)geologic data.

Expand source code
#__init__.py
"""
   This is the wells4hydrogeology package. 
   
   It contains the functions needed to convert raw well descriptions into usable (hydro)geologic data.

"""

from w4h.core import(logger_function,
                        verbose_print,
                        run,
                        get_resources)

from w4h.classify import (specific_define,
                          split_defined,
                          start_define,
                          wildcard_define,
                          remerge_data,
                          depth_define,
                          export_undefined,
                          fill_unclassified,
                          merge_lithologies,
                          get_unique_wells,
                          sort_dataframe)

from w4h.clean import (remove_nonlocated,
                       remove_no_topo,
                       remove_no_depth,
                       remove_bad_depth,
                       remove_no_description)

from w4h.export import (export_dataframe,
                        export_grids)

from w4h.layers import (get_layer_depths,
                        merge_metadata,
                        layer_target_thick,
                        layer_interp,
                        combine_dataset)

from w4h.mapping import (read_study_area,
                         coords2geometry,
                         clip_gdf2study_area,
                         sample_raster_points,
                         xyz_metadata_merge,
                         read_wms,
                         read_wcs,
                         grid2study_area,
                         read_model_grid,
                         read_grid,
                         align_rasters,
                         get_drift_thick)

from w4h.read import (get_current_date,
                      get_most_recent,
                      file_setup,
                      read_raw_csv,
                      read_xyz,
                      read_dict,
                      define_dtypes,
                      get_search_terms,
                      read_dictionary_terms,
                      read_lithologies,
                      add_control_points)


# Explicit public API of the w4h package: every name re-exported above from
# the submodules, grouped in the same order as the import statements
# (core, classify, clean, export, layers, mapping, read).
__all__=('logger_function','verbose_print', 'run', 'get_resources',
        'specific_define',
        'split_defined',
        'start_define',
        'wildcard_define',
        'remerge_data',
        'depth_define',
        'export_undefined',
        'fill_unclassified',
        'merge_lithologies',
        'get_unique_wells',
        'sort_dataframe',
         'remove_nonlocated',
         'remove_no_topo',
         'remove_no_depth',
         'remove_bad_depth',
         'remove_no_description',
        'export_dataframe',
        'export_grids',
         'get_layer_depths',
         'merge_metadata',
         'layer_target_thick',
         'layer_interp',
         'combine_dataset',
        'read_study_area',
        'coords2geometry',
        'clip_gdf2study_area',
        'sample_raster_points',
        'xyz_metadata_merge',
        'read_wms',
        'read_wcs',
        'grid2study_area',
        'read_model_grid',
        'read_grid',
        'align_rasters',
        'get_drift_thick',
         'get_current_date',
         'get_most_recent',
         'file_setup',
         'read_raw_csv',
         'read_xyz',
         'read_dict',
         'define_dtypes',
         'get_search_terms',
         'read_dictionary_terms',
         'read_lithologies',
         'add_control_points'
         )

# Update the w4h.run() help() return to actually be helpful
# NOTE(review): `core` is never imported by name in this file. It resolves here
# only because `from w4h.core import ...` above imports the submodule, and the
# import system binds it as the `core` attribute of this package (whose
# namespace is this module). An explicit `from w4h import core` would make
# this less surprising — confirm before relying on it elsewhere.
run.__doc__ = core._run_docstring()

__author__='Riley Balikian, Joe Franke, Allan Jones, Mike Krasowski'

Sub-modules

w4h.classify

The Classify module contains functions for defining geological intervals into a preset subset of geologic interpretations.

w4h.clean

The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)

w4h.core

The Core module contains core functions of the package used in other modules or as primary functions in the package. This includes the main run() …

w4h.export

The Export module contains functions for exporting processed data.

w4h.layers

The Layers module contains functions for splitting data into a layered model and for interpolating data within the layers

w4h.mapping

The Mapping module contains the functions used for geospatial analysis throughout the package. This includes some input/output as well as functions to …

w4h.read

The Read module contains functions primarily for the input of data through the reading of data files, as well as support functions to carry out this task

Functions

def add_control_points(df_without_control, df_control=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', controlpoints_crs='EPSG:4269', output_crs='EPSG:4269', description_col='FORMATION', interp_col='INTERPRETATION', target_col='TARGET', verbose=False, log=False, **kwargs)

Function to add control points, primarily to aid in interpolation. This may be useful when conditions are known but do not exist in input well database

Parameters

df_without_control : pandas.DataFrame
Dataframe with current working data
df_control : str, pathlib.Purepath, or pandas.DataFrame
Pandas dataframe with control points
well_key : str, optional
The column containing the "key" (unique identifier) for each well, by default 'API_NUMBER' (note: this parameter does not appear in the function signature above — verify before use)
xcol : str, optional
The column in df_control containing the x coordinates for each control point, by default 'LONGITUDE'
ycol : str, optional
The column in df_control containing the y coordinates for each control point, by default 'LATITUDE'
zcol : str, optional
The column in df_control containing the z coordinates for each control point, by default 'ELEV_FT'
controlpoints_crs : str, optional
The column in df_control containing the crs of points, by default 'EPSG:4269'
output_crs : str, optional
The output coordinate system, by default 'EPSG:4269'
description_col : str, optional
The column in df_control with the description (if this is used), by default 'FORMATION'
interp_col : str, optional
The column in df_control with the interpretation (if this is used), by default 'INTERPRETATION'
target_col : str, optional
The column in df_control with the target code (if this is used), by default 'TARGET'
verbose : bool, optional
Whether to print information to terminal, by default False
log : bool, optional
Whether to log information in log file, by default False
**kwargs
Keyword arguments of pandas.concat() or pandas.read_csv that will be passed to that function, except for objs, which are df and df_control

Returns

pandas.DataFrame
Pandas DataFrame with original data and control points formatted the same way and concatenated together
Expand source code
def add_control_points(df_without_control, df_control=None,  xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', controlpoints_crs='EPSG:4269', output_crs='EPSG:4269', description_col='FORMATION', interp_col='INTERPRETATION', target_col='TARGET', verbose=False, log=False, **kwargs):
    """Function to add control points, primarily to aid in interpolation. This may be useful when conditions are known but do not exist in input well database

    Parameters
    ----------
    df_without_control : pandas.DataFrame
        Dataframe with current working data
    df_control : str, pathlib.Purepath, or pandas.DataFrame, default=None
        Control points: a pandas DataFrame/GeoDataFrame, or a filepath to be
        read with pandas.read_csv(). If None, df_without_control is returned
        unchanged.
    xcol : str, optional
        The column in df_control containing the x coordinates for each control point, by default 'LONGITUDE'
    ycol : str, optional
        The column in df_control containing the y coordinates for each control point, by default 'LATITUDE'
    zcol : str, optional
        The column in df_control containing the z coordinates for each control point, by default 'ELEV_FT'
    controlpoints_crs : str, optional
        The crs of the control points, by default 'EPSG:4269'
    output_crs : str, optional
        The output coordinate system, by default 'EPSG:4269'
    description_col : str, optional
        The column in df_control with the description (if this is used), by default 'FORMATION'
    interp_col : str, optional
        The column in df_control with the interpretation (if this is used), by default 'INTERPRETATION'
    target_col : str, optional
        The column in df_control with the target code (if this is used), by default 'TARGET'
    verbose : bool, optional
        Whether to print information to terminal, by default False
    log : bool, optional
        Whether to log information in log file, by default False
    **kwargs
        Keyword arguments of pandas.concat() or pandas.read_csv that will be passed to that function, except for objs, which are df and df_control

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame with original data and control points formatted the same way and concatenated together
    """
    if verbose:
        verbose_print(add_control_points, locals(), exclude_params=['df_without_control', 'df_control'])

    if df_control is None:
        return df_without_control

    if not (isinstance(df_control, pd.DataFrame) or isinstance(df_control, gpd.GeoDataFrame)):
        # df_control is a filepath; forward only the kwargs read_csv accepts
        read_csv_kwargs = {k: v for k, v in kwargs.items() if k in inspect.signature(pd.read_csv).parameters}
        df_control = pd.read_csv(df_control, **read_csv_kwargs)

    # Drop columns from the control points that the working data does not carry.
    # (Reassign instead of inplace drop so the caller's df_control is not mutated.)
    for col in (target_col, interp_col, description_col):
        if col not in df_without_control.columns and col in df_control.columns:
            df_control = df_control.drop(columns=[col])

    # If the main df is a GeoDataFrame, give the control points geometry too.
    # BUG FIX: gdf_control was previously unbound (NameError at concat) whenever
    # df_without_control was a plain DataFrame.
    if isinstance(df_without_control, gpd.GeoDataFrame):
        from w4h import coords2geometry
        gdf_control = coords2geometry(df_no_geometry=df_control, xcol=xcol, ycol=ycol, zcol=zcol, input_coords_crs=controlpoints_crs, log=log)
    else:
        gdf_control = df_control

    # Forward only the kwargs pd.concat accepts; default ignore_index/join
    concat_kwargs = {k: v for k, v in kwargs.items() if k in inspect.signature(pd.concat).parameters}
    concat_kwargs.setdefault('ignore_index', True)
    concat_kwargs.setdefault('join', 'outer')

    gdf = pd.concat([df_without_control, gdf_control], **concat_kwargs)

    # Only a GeoDataFrame can be reprojected; skip to_crs for plain DataFrames
    if controlpoints_crs != output_crs and isinstance(gdf, gpd.GeoDataFrame):
        gdf = gdf.to_crs(output_crs)

    return gdf
def align_rasters(grids_unaligned=None, model_grid=None, no_data_val_grid=0, verbose=False, log=False)

Reprojects two rasters and aligns their pixels

Parameters

grids_unaligned : list or xarray.DataArray
Contains a list of grids or one unaligned grid
model_grid : xarray.DataArray
Contains model grid
no_data_val_grid : int, default=0
Sets value of no data pixels
log : bool, default = False
Whether to log results to log file, by default False

Returns

alignedGrids : list or xarray.DataArray
Contains aligned grids
Expand source code
def align_rasters(grids_unaligned=None, model_grid=None,
                  no_data_val_grid=0, verbose=False, log=False):
    """Reprojects raster(s) to match a model grid and aligns their pixels.

    Parameters
    ----------
    grids_unaligned : list or xarray.DataArray, default=None
        Contains a list of grids or one unaligned grid. If None, the packaged
        surface-elevation and bedrock-elevation resources are used.
    model_grid : xarray.DataArray, default=None
        Contains model grid. If None, the packaged model grid resource is used.
    no_data_val_grid : int, default=0
        Sets value of no data pixels; used only when a grid does not carry its
        own '_FillValue' attribute.
    verbose : bool, default=False
        Whether to print parameter information to the terminal.
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    alignedGrids : list or xarray.DataArray
        Contains aligned grid(s) with no-data pixels replaced by NaN.
    """
    if grids_unaligned is None:
        grids_unaligned = [w4h.get_resources()['surf_elev'], w4h.get_resources()['bedrock_elev']]
    if model_grid is None:
        model_grid = w4h.get_resources()['model_grid']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(align_rasters, locals(), exclude_params=['grids_unaligned', 'model_grid'])

    def _align_one(grid):
        # Reproject onto the model grid, then mask no-data pixels with NaN.
        # Prefer the _FillValue carried by the grid itself; fall back to the
        # no_data_val_grid parameter when the attribute is absent.
        aligned = grid.rio.reproject_match(model_grid)
        fill_val = aligned.attrs.get('_FillValue', no_data_val_grid)
        return aligned.where(aligned != fill_val)

    if isinstance(grids_unaligned, (tuple, list)):
        # BUG FIX: previously a grid's _FillValue overwrote no_data_val_grid
        # for all subsequent grids in the list; each grid now masks with its
        # own fill value (or the shared default).
        alignedGrids = [_align_one(g) for g in grids_unaligned]
    else:
        # BUG FIX: noDataVal was previously unbound (NameError) when a single
        # grid had no '_FillValue' attribute.
        alignedGrids = _align_one(grids_unaligned)

    return alignedGrids
def clip_gdf2study_area(study_area, gdf, log=False, verbose=False)

Clips dataframe to only include things within study area.

Parameters

study_area : geopandas.GeoDataFrame
Inputs study area polygon
gdf : geopandas.GeoDataFrame
Inputs point data
log : bool, default = False
Whether to log results to log file, by default False

Returns

gdfClip : geopandas.GeoDataFrame
Contains only points within the study area
Expand source code
def clip_gdf2study_area(study_area, gdf, log=False, verbose=False):
    """Clips dataframe to only include things within study area.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        Inputs study area polygon. If None, gdf is returned unclipped.
    gdf : geopandas.GeoDataFrame
        Inputs point data
    log : bool, default = False
        Whether to log results to log file, by default False
    verbose : bool, default = False
        Whether to print parameter information to the terminal.

    Returns
    -------
    gdfClip : geopandas.GeoDataFrame
        Contains only points within the study area

    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(clip_gdf2study_area, locals(), exclude_params=['study_area', 'gdf'])

    # Nothing to clip against: hand the data back untouched
    if study_area is None:
        return gdf

    # Reproject the (typically smaller) study area rather than the point data,
    # to ensure the two layers share a CRS before clipping
    study_area_proj = study_area.to_crs(gdf.crs).copy()
    clipped = gpd.clip(gdf, study_area_proj)
    return clipped.reset_index(drop=True)
def combine_dataset(layer_dataset, surface_elev, bedrock_elev, layer_thick, log=False)

Function to combine xarray datasets or dataarrays into a single xr.Dataset. Useful to add surface, bedrock, layer thick, and layer datasets all into one variable, for pickling, for example.

Parameters

layer_dataset : xr.DataArray
DataArray containing all the interpolated layer information.
surface_elev : xr.DataArray
DataArray containing surface elevation data
bedrock_elev : xr.DataArray
DataArray containing bedrock elevation data
layer_thick : xr.DataArray
DataArray containing the layer thickness at each point in the model grid
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

xr.Dataset
Dataset with all input arrays set to different variables within the dataset.
Expand source code
def combine_dataset(layer_dataset, surface_elev, bedrock_elev, layer_thick, log=False):
    """Function to combine xarray datasets or dataarrays into a single xr.Dataset. Useful to add surface, bedrock, layer thick, and layer datasets all into one variable, for pickling, for example.

    Parameters
    ----------
    layer_dataset : xr.DataArray
        DataArray containing all the interpolated layer information.
    surface_elev : xr.DataArray
        DataArray containing surface elevation data
    bedrock_elev : xr.DataArray
        DataArray containing bedrock elevation data
    layer_thick : xr.DataArray
        DataArray containing the layer thickness at each point in the model grid
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    xr.Dataset
        Dataset with all input arrays set to different variables within the dataset.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Each input array becomes one named variable of the combined dataset
    return xr.Dataset({'Layers': layer_dataset,
                       'Surface_Elev': surface_elev,
                       'Bedrock_Elev': bedrock_elev,
                       'Layer_Thickness': layer_thick})
def coords2geometry(df_no_geometry, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', input_coords_crs='EPSG:4269', use_z=False, wkt_col='WKT', geometry_source='coords', verbose=False, log=False)

Adds geometry to points with xy coordinates in the specified coordinate reference system.

Parameters

df_no_geometry : pandas.Dataframe
a Pandas dataframe containing points
xcol : str, default='LONGITUDE'
Name of column holding x coordinate data in df_no_geometry
ycol : str, default='LATITUDE'
Name of column holding y coordinate data in df_no_geometry
zcol : str, default='ELEV_FT'
Name of column holding z coordinate data in df_no_geometry
input_coords_crs : str, default='EPSG:4269'
Name of crs used for geometry
use_z : bool, default=False
Whether to use z column in calculation
geometry_source : str {'coords', 'wkt', 'geometry'}, default='coords'
Where geometry comes from: the x/y (and optionally z) coordinate columns ('coords'), a well-known-text string column ('wkt'), or an existing geometry column ('geometry')
log : bool, default = False
Whether to log results to log file, by default False

Returns

gdf : geopandas.GeoDataFrame
Geopandas dataframe with points and their geometry values
Expand source code
def coords2geometry(df_no_geometry, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', input_coords_crs='EPSG:4269', use_z=False, wkt_col='WKT', geometry_source='coords', verbose=False, log=False):
    """Adds geometry to points with xy coordinates in the specified coordinate reference system.

    Parameters
    ----------
    df_no_geometry : pandas.Dataframe
        a Pandas dataframe containing points. If already a GeoDataFrame, it is
        returned as-is. NOTE: the input dataframe is modified in place (a
        'geometry' column is added; the wkt column is dropped if used).
    xcol : str, default='LONGITUDE'
        Name of column holding x coordinate data in df_no_geometry
    ycol : str, default='LATITUDE'
        Name of column holding y coordinate data in df_no_geometry
    zcol : str, default='ELEV_FT'
        Name of column holding z coordinate data in df_no_geometry
    input_coords_crs : str, default='EPSG:4269'
        Name of crs used for geometry
    use_z : bool, default=False
        Whether to use z column in calculation
    wkt_col : str, default='WKT'
        Name of column holding well-known-text strings (used when geometry_source='wkt')
    geometry_source : str {'coords', 'wkt', 'geometry'}, default='coords'
        Where geometry comes from: coordinate columns ('coords'), a WKT string
        column ('wkt'), or an existing geometry column ('geometry')
    verbose : bool, default=False
        Whether to print parameter information to the terminal.
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        Geopandas dataframe with points and their geometry values

    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(coords2geometry, locals(), exclude_params=['df_no_geometry'])

    # Already has geometry — nothing to build
    if isinstance(df_no_geometry, gpd.GeoDataFrame):
        return df_no_geometry

    # Accepted aliases for each geometry source
    wkt_aliases = ['wkt', 'well known text', 'wellknowntext', 'well_known_text', 'w']
    coord_aliases = ['coords', 'coordinates', 'xycoords', 'xy', 'xyz', 'xyzcoords', 'xycoordinates', 'xyzcoordinates', 'c']
    geom_aliases = ['geometry', 'geom', 'geo', 'g']

    source = geometry_source.lower()
    if source in wkt_aliases:
        # Parse shapely geometries out of the well-known-text column
        from shapely import wkt
        df_no_geometry['geometry'] = df_no_geometry[wkt_col].apply(wkt.loads)
        df_no_geometry.drop(wkt_col, axis=1, inplace=True)
    elif source in coord_aliases:
        # Build point geometries from the coordinate columns
        x_vals = df_no_geometry[xcol].to_numpy()
        y_vals = df_no_geometry[ycol].to_numpy()
        if use_z:
            z_vals = df_no_geometry[zcol].to_numpy()
            df_no_geometry["geometry"] = gpd.points_from_xy(x_vals, y_vals, z=z_vals, crs=input_coords_crs)
        else:
            df_no_geometry["geometry"] = gpd.points_from_xy(x_vals, y_vals, crs=input_coords_crs)
    elif source in geom_aliases:
        # A 'geometry' column is assumed to already hold shapely objects
        pass
    else:
        warnings.warn(message=f"""The parameter geometry_source={geometry_source} is not recognized.
                    Should be one of 'coords' (if x, y (and/or z) columns with coordintes used), 
                    'wkt' (if column with wkt string used), or 
                    'geometry' (if column with shapely geometry objects used, as with a GeoDataFrame)""")

    return gpd.GeoDataFrame(df_no_geometry, geometry='geometry', crs=input_coords_crs)
def define_dtypes(undefined_df, datatypes=None, verbose=False, log=False)

Function to define datatypes of a dataframe, especially with file-indicated datatypes

Parameters

undefined_df : pd.DataFrame
Pandas dataframe with columns whose datatypes need to be (re)defined
datatypes : dict, str, pathlib.PurePath() object, or None, default = None
Dictionary containing datatypes, to be used in pandas.DataFrame.astype() function. If None, will read from file indicated by dtype_file (which must be defined, along with dtype_dir), by default None
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

dfout : pandas.DataFrame
Pandas dataframe containing redefined columns
Expand source code
def define_dtypes(undefined_df, datatypes=None, verbose=False, log=False):
    """Function to define datatypes of a dataframe, especially with file-indicated datatypes

    Parameters
    ----------
    undefined_df : pd.DataFrame
        Pandas dataframe with columns whose datatypes need to be (re)defined.
        If None, None is returned.
    datatypes : dict, str, pathlib.PurePath() object, or None, default = None
        Dictionary containing datatypes, to be used in pandas.DataFrame.astype() function,
        or a filepath to a file readable by read_dict(). If None (or any other
        unsupported type), the dataframe copy is returned with inferred dtypes.
    verbose : bool, default = False
        Whether to print information (including errors) to the terminal.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    dfout : pandas.DataFrame
        Pandas dataframe containing redefined columns
    """
    if verbose:
        verbose_print(define_dtypes, locals(), exclude_params=['undefined_df'])
    if undefined_df is None:
        return None

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    dfout = undefined_df.copy()

    if isinstance(datatypes, (pathlib.PurePath, str)):
        datatypes = pathlib.Path(datatypes)

        if not datatypes.exists():
            if verbose:
                print("ERROR: datatypes file '{}' does not exist, using inferred datatypes.".format(datatypes))
            return dfout
        if datatypes.is_dir():
            if verbose:
                print('ERROR: datatypes must be either dict or filepath (path to directories not allowed)')
            return dfout

        datatypes = read_dict(file=datatypes)
        dfout = dfout.astype(datatypes)
    elif isinstance(datatypes, dict):
        if verbose:
            # BUG FIX: this branch previously printed 'datatypes is None, not
            # updating datatypes' even though a dict WAS being applied
            print('Applying provided datatypes dictionary to dataframe columns')
        dfout = dfout.astype(datatypes)
    else:
        if verbose:
            print('ERROR: datatypes must be either dict or a filepath, not {}'.format(type(datatypes)))
        return dfout

    # Re-cast each mapped column from the original data (likely redundant with
    # astype above). BUG FIX: previously raised KeyError when the datatypes
    # mapping did not cover every column; now only mapped columns are recast.
    for i, col in enumerate(dfout.columns):
        if col in datatypes:
            dfout.iloc[:, i] = undefined_df.iloc[:, i].astype(datatypes[col])

    return dfout
def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False)

Function to define all intervals lower than thresh as bedrock

Parameters

df : pandas.DataFrame
Dataframe to classify
top_col : str, default = 'TOP'
Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
thresh : float, default = 550.0
Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
verbose : bool, default = False
Whether to print results, by default False
log : bool, default = True
Whether to log results to log file

Returns

df : pandas.DataFrame
Dataframe containing intervals classified as bedrock due to depth
Expand source code
def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False):
    """Function to define all intervals lower than thresh as bedrock

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to classify
    top_col : str, default = 'TOP'
        Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
    thresh : float, default = 550.0
        Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing intervals classified as bedrock due to depth
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(depth_define, locals(), exclude_params=['df'])
    df = df.copy()

    # Classification flag 3 = bedrock because the interval top is deeper than thresh
    deep = df[top_col] > thresh
    df.loc[deep, 'CLASS_FLAG'] = 3
    df.loc[deep, 'BEDROCK_FLAG'] = True

    if verbose:
        total = df.shape[0]
        # BUG FIX: previously summed the flag values (3 per record) instead of
        # counting the records, inflating the reported count/percentage 3x
        numRecsClass = int((df['CLASS_FLAG'] == 3).sum())
        percRecsClass = round((numRecsClass / total) * 100, 2) if total else 0.0
        recsRemaining = total - numRecsClass

        print('Classified bedrock well records using depth threshold at depth of {}'.format(thresh))
        print("\t{} records classified using bedrock threshold depth ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        # BUG FIX: remaining percentage was computed as 1-perc instead of 100-perc
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemaining, round(100 - percRecsClass, 2)))

    return df
def export_dataframe(df, out_dir, filename, date_stamp=True, log=False)

Function to export dataframes

Parameters

df : pandas dataframe, or list of pandas dataframes
Data frame or list of dataframes to be exported
out_dir : string or pathlib.Path object
Directory to which to export dataframe object(s) as .csv
filename : str or list of strings
Filename(s) of output files
date_stamp : bool, default=True
Whether to include a datestamp in the filename. If true, file ends with _yyyy-mm-dd.csv of current date, by default True.
log : bool, default = True
Whether to log inputs and outputs to log file.
Expand source code
def export_dataframe(df, out_dir, filename, date_stamp=True, log=False):
    """Function to export dataframes

    Parameters
    ----------
    df : pandas dataframe, or list of pandas dataframes
        Data frame or list of dataframes to be exported
    out_dir : string or pathlib.Path object
        Directory to which to export dataframe object(s) as .csv
    filename : str or list of strings
        Filename(s) of output files. If a list, df must be a list of the same length.
    date_stamp : bool, default=True
        Whether to include a datestamp in the filename. If true, file ends with _yyyy-mm-dd.csv of current date, by default True.
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if date_stamp:
        # e.g. '_2023-04-18_12-30-05' (colons/spaces not allowed in filenames)
        nowTime = datetime.datetime.now()
        nowTime = str(nowTime).replace(':', '-').replace(' ', '_').split('.')[0]
        nowTimeStr = '_' + nowTime
    else:
        nowTimeStr = ''

    if isinstance(out_dir, (str, pathlib.PurePath)):
        # Normalize to forward slashes and ensure a trailing separator.
        # (BUG FIX: the former second .replace('\\'[-1], '/') was an exact
        # no-op duplicate of the first replace and has been removed.)
        out_dir = str(out_dir).replace('\\', '/')
        if not out_dir.endswith('/'):
            out_dir = out_dir + '/'
    else:
        print('Please input string or pathlib object for out_dir parameters')
        return

    if isinstance(filename, str):
        dfOutFile = out_dir + filename + nowTimeStr + '.csv'
        df.to_csv(dfOutFile, index_label='ID')
        print('Exported ' + filename + nowTimeStr + '.csv')
    elif isinstance(filename, list) and isinstance(df, list) and len(df) == len(filename):
        for frame, fname in zip(df, filename):
            dfOutFile = out_dir + fname + nowTimeStr + '.csv'
            frame.to_csv(dfOutFile, index_label='ID')
            print('Exported ' + fname + nowTimeStr + '.csv')
    else:
        # BUG FIX: a filename/df type or length mismatch previously exported
        # nothing, silently; now the caller is told why
        print('Could not export: filename and df must both be a str/DataFrame pair or lists of equal length')
def export_grids(grid_data, out_path, file_id='', filetype='tif', variable_sep=True, date_stamp=True, verbose=False, log=False)

Function to export grids to files.

Parameters

grid_data : xarray DataArray or xarray Dataset
Dataset or dataarray to be exported
out_path : str or pathlib.Path object
Output location for data export. If variable_sep=True, this should be a directory. Otherwise, this should also include the filename. The file extension should not be included here.
file_id : str, optional
If specified, will add this after 'LayerXX' or 'AllLayers' in the filename, just before datestamp, if used. Example filename for file_id='Coarse': Layer1_Coarse_2023-04-18.tif.
filetype : str, optional
Output filetype. Can either be pickle or any file extension supported by rioxarray.rio.to_raster(). Can either include period or not., by default 'tif'
variable_sep : bool, optional
If grid_data is an xarray Dataset, this will export each variable in the dataset as a separate file, including the variable name in the filename, by default False
date_stamp : bool, optional
Whether to include a date stamp in the file name., by default True
log : bool, default = True
Whether to log inputs and outputs to log file.
Expand source code
def export_grids(grid_data, out_path, file_id='', filetype='tif', variable_sep=True, date_stamp=True, verbose=False, log=False):
    """Function to export grids to files.

    Parameters
    ----------
    grid_data : xarray DataArray or xarray Dataset
        Dataset or dataarray to be exported
    out_path : str or pathlib.Path object
        Output location for data export. If variable_sep=True, this should be a directory. Otherwise, this should also include the filename. The file extension should not be included here.
    file_id : str, optional
        If specified, will add this after 'LayerXX' or 'AllLayers' in the filename, just before datestamp, if used. Example filename for file_id='Coarse': Layer1_Coarse_2023-04-18.tif.
    filetype : str, optional
        Output filetype. Can either be pickle or any file extension supported by rioxarray.rio.to_raster(). Can either include period or not., by default 'tif'
    variable_sep : bool, optional
        If grid_data is an xarray Dataset, this will export each variable in the dataset as a separate file, including the variable name in the filename, by default True
    date_stamp : bool, optional
        Whether to include a date stamp in the file name., by default True
    verbose : bool, default = False
        Whether to print the export filepath(s) to the terminal.
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(export_grids, locals(), exclude_params=['grid_data'])

    #Initialize lists to determine which filetype will be used for export
    ncdfList = ['netcdf', 'ncdf', 'n']
    tifList = ['tif', 'tiff', 'geotiff', 'geotif', 't']
    pickleList = ['pickle', 'pkl', 'p']

    # BUGFIX: remember the bare filetype key (no leading '.') now. Previously
    # the pickle check below compared filetype AFTER a '.' was prepended, so
    # '.pickle' never matched pickleList and pickle export was unreachable.
    filetype_key = filetype.lower().lstrip('.')

    filenames = []

    #Format output filepath
    if isinstance(out_path, (pathlib.PurePath, str)):
        if not isinstance(out_path, pathlib.PurePath):
            out_path = pathlib.Path(out_path)

        if not out_path.parent.exists():
            print('Directory does not exist. Please enter a different value for the out_path parameter.')
            return

        if out_path.is_dir():
            if isinstance(grid_data, xr.DataArray):
                if variable_sep:
                    # One file per model layer: Layer1, Layer2, ...
                    lyrs = grid_data.coords['Layer'].values
                    filenames = ['Layer' + str(l) for l in lyrs]
                else:
                    filenames = ['AllLayers']
            if isinstance(grid_data, xr.Dataset):
                if variable_sep:
                    # One file per dataset variable
                    filenames = [var for var in grid_data]
                else:
                    filenames = ['AllLayers']
        else:
            # out_path includes a filename; split it off from its directory
            filenames = [out_path.stem]
            out_path = out_path.parent
    else:
        print('No output path specified (out_path). Please input string or pathlib object for out_path parameters')
        return

    #Format datestamp, if desired in output filename
    if date_stamp:
        nowTime = datetime.datetime.now()
        nowTime = str(nowTime).replace(':', '-').replace(' ', '_').split('.')[0]
        nowTimeStr = '_' + str(nowTime)
    else:
        nowTimeStr = ''

    #Ensure the file suffix includes a leading period
    if not filetype.startswith('.'):
        filetype = '.' + filetype

    if file_id != '':
        file_id = '_' + file_id

    out_path = out_path.as_posix() + '/'

    if verbose:
        print('Export filepath(s):')

    outPaths = []
    for fname in filenames:
        currOutPath = out_path + fname + file_id + nowTimeStr + filetype
        outPaths.append(currOutPath)
        if verbose:
            print('\t {}'.format(currOutPath))

    #Do export
    if filetype_key in pickleList:
        import pickle
        for op in outPaths:
            try:
                with open(op, 'wb') as f:
                    pickle.dump(grid_data, f)
            except Exception:
                print('An error occured during export.')
                print(op, 'could not be exported as a pickle object.')
                print('Try again using different parameters.')
    else:
        import rioxarray as rxr
        try:
            if isinstance(grid_data, xr.Dataset):
                if variable_sep:
                    for i, var in enumerate(grid_data.data_vars):
                        grid_data[var].rio.to_raster(outPaths[i])
                else:
                    grid_data.rio.to_raster(outPaths[0])
            elif isinstance(grid_data, xr.DataArray):
                if variable_sep:
                    # Export each model layer as its own raster file
                    lyrs = grid_data.coords['Layer'].values
                    for i, l in enumerate(lyrs):
                        out_grid = grid_data.sel(Layer=l).copy()
                        out_grid.rio.to_raster(outPaths[i])
                else:
                    grid_data.rio.to_raster(outPaths[0])
            else:
                grid_data.rio.to_raster(outPaths[0])
        except Exception:
            print('An error occured during export.')
            print('{} could not be exported as {} file.'.format(outPaths, filetype))
            print('Try again using different parameters.')

    return
def export_undefined(df, outdir)

Function to export terms that still need to be defined.

Parameters

df : pandas.DataFrame
Dataframe containing at least some unclassified data
outdir : str or pathlib.Path
Directory to save file. Filename will be generated automatically based on today's date.

Returns

stillNeededDF : pandas.DataFrame
Dataframe containing only unclassified terms, and the number of times they occur
Expand source code
def export_undefined(df, outdir):
    """Function to export terms that still need to be defined.

    Writes a .csv of the unique FORMATION descriptions whose CLASS_FLAG is
    still null, with the number of times each occurs, and returns the counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing at least some unclassified data
    outdir : str or pathlib.Path
        Directory to save file. Filename will be generated automatically based on today's date.

    Returns
    -------
    stillNeededDF : pandas.DataFrame
        Dataframe containing only unclassified terms, and the number of times they occur
    """
    import pathlib

    if isinstance(outdir, pathlib.PurePath):
        if not outdir.is_dir() or not outdir.exists():
            print('Please specify a valid directory for export. Filename is generated automatically.')
            return
        outdir = outdir.as_posix()
    else:
        # BUGFIX: str.replace returns a new string (strings are immutable);
        # the previous code discarded the result, so backslashes in string
        # paths were never normalized to forward slashes.
        outdir = outdir.replace('\\', '/')

    #Get directory path correct (ensure trailing separator)
    if outdir[-1] != '/':
        outdir = outdir + '/'

    todayDate = datetime.date.today()
    todayDateStr = str(todayDate)

    # Rows that never received a classification flag
    searchDF = df[df['CLASS_FLAG'].isna()]

    stillNeededDF = searchDF['FORMATION'].value_counts()
    stillNeededDF.to_csv(outdir + 'Undefined_' + todayDateStr + '.csv')
    return stillNeededDF
def file_setup(well_data, metadata=None, data_filename='*ISGS_DOWNHOLE_DATA*.txt', metadata_filename='*ISGS_HEADER*.txt', log_dir=None, verbose=False, log=False)

Function to setup files, assuming data, metadata, and elevation/location are in separate files (there should be one "key"/identifying column consistent across all files to join/merge them later)

This function may not be useful if files are organized differently than this structure. If that is the case, it is recommended to use the get_most_recent() function for each individual file if needed. It may also be of use to simply skip this function altogether and directly define each filepath in a manner that can be used by pandas.read_csv()

Parameters

well_data : str or pathlib.Path object
Str or pathlib.Path to directory containing input files, by default str(repoDir)+'/resources'
metadata : str or pathlib.Path object, optional
Str or pathlib.Path to directory containing input metadata files, by default str(repoDir)+'/resources'
data_filename : str, optional
Pattern used by pathlib.glob() to get the most recent data file, by default 'ISGS_DOWNHOLE_DATA.txt'
metadata_filename : str, optional
Pattern used by pathlib.glob() to get the most recent metadata file, by default 'ISGS_HEADER.txt'
log_dir : str or pathlib.PurePath() or None, default=None
Directory to place log file in. This is not read directly, but is used indirectly by w4h.logger_function()
verbose : bool, default = False
Whether to print name of files to terminal.
log : bool, default = True
Whether to log inputs and outputs to log file.

Returns

tuple
Tuple with paths to (well_data, metadata)
Expand source code
def file_setup(well_data, metadata=None, data_filename='*ISGS_DOWNHOLE_DATA*.txt', metadata_filename='*ISGS_HEADER*.txt', log_dir=None, verbose=False, log=False):
    """Function to setup files, assuming data, metadata, and elevation/location are in separate files (there should be one "key"/identifying column consistent across all files to join/merge them later)

    This function may not be useful if files are organized differently than this structure. 
    If that is the case, it is recommended to use the get_most_recent() function for each individual file if needed.
    It may also be of use to simply skip this function altogether and directly define each filepath in a manner that can be used by pandas.read_csv()

    Parameters
    ----------
    well_data : str or pathlib.Path object
        Str or pathlib.Path to the data file itself, or to a directory to search using data_filename.
    metadata : str or pathlib.Path object, optional
        Str or pathlib.Path to the metadata file itself, or to a directory to search using metadata_filename. If None, well_data is used for metadata as well.
    data_filename : str, optional
        Pattern used by pathlib.glob() to get the most recent data file, by default '*ISGS_DOWNHOLE_DATA*.txt'
    metadata_filename : str, optional
        Pattern used by pathlib.glob() to get the most recent metadata file, by default '*ISGS_HEADER*.txt'
    log_dir : str or pathlib.PurePath() or None, default=None
        Directory to place log file in. This is not read directly, but is used indirectly by w4h.logger_function()
    verbose : bool, default = False
        Whether to print name of files to terminal.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    tuple
        Tuple with paths to (well_data, metadata)
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(file_setup, locals(), exclude_params=['well_data'])

    #Define filepath variables to be used later for reading/writing files
    data_path = pathlib.Path(well_data)
    if metadata is None:
        origMetaPath = None
        metadata = data_path
    else:
        origMetaPath = metadata
        metadata = pathlib.Path(metadata)

    #If input path is a directory, find most recent version of the file. If file, just read the file
    if data_path.is_dir():
        downholeDataFILE = get_most_recent(data_path, data_filename, verbose=verbose)
    else:
        downholeDataFILE = data_path

    if metadata.is_dir():
        headerDataFILE = get_most_recent(metadata, metadata_filename, verbose=verbose)
        # BUGFIX: get_most_recent() returns an empty pathlib.Path() (not [])
        # when nothing matches the pattern, so the old `== []` comparison was
        # always False and the fallback to the data file never fired.
        if headerDataFILE == [] or headerDataFILE == pathlib.Path():
            headerDataFILE = downholeDataFILE
    else:
        if origMetaPath is None:
            headerDataFILE = downholeDataFILE
        else:
            headerDataFILE = metadata

    #Set all input as pathlib.Path objects (may be redundant, but just in case)
    downholeDataPATH = pathlib.Path(downholeDataFILE)
    headerDataPATH = pathlib.Path(headerDataFILE)

    if verbose:
        print('Using the following file(s):')
        print('\t', downholeDataFILE)
        if headerDataFILE != downholeDataFILE:
            print('\t', headerDataFILE)

    return downholeDataPATH, headerDataPATH
def fill_unclassified(df, classification_col='CLASS_FLAG')

Fills unclassified rows in 'CLASS_FLAG' column with np.nan

Parameters

df : pandas.DataFrame
Dataframe on which to perform operation

Returns

df : pandas.DataFrame
Dataframe on which operation has been performed
Expand source code
def fill_unclassified(df, classification_col='CLASS_FLAG'):
    """Fills unclassified (null) rows in the classification column with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe on which to perform operation
    classification_col : str, default='CLASS_FLAG'
        Name of the column containing the classification flags.

    Returns
    -------
    df : pandas.DataFrame
        Dataframe on which operation has been performed (modified in place and returned)
    """
    # Assign back instead of calling fillna(inplace=True) on a column
    # selection, which is a chained-assignment pattern deprecated in
    # modern pandas. Behavior is the same: NaN flags become 0.
    df[classification_col] = df[classification_col].fillna(0)
    return df
def get_current_date()

Gets The Current Date To Help With Finding The Most Recent File

Parameters

None


Returns

todayDate
datetime object with today's date
dateSuffix
str to use for naming output files
Expand source code
def get_current_date():
    """Return today's date and a matching suffix for output filenames.

    Parameters
    ----------
    None

    Returns
    -------
    todayDate : datetime.date
        Today's date.
    dateSuffix : str
        Underscore-prefixed ISO date string (e.g. '_2023-04-18') for naming output files.
    """
    current = datetime.date.today()
    suffix = '_{}'.format(current)
    return current, suffix
def get_drift_thick(surface_elev=None, bedrock_elev=None, layers=9, plot=False, verbose=False, log=False)

Finds the distance from surface_elev to bedrock_elev and then divides by number of layers to get layer thickness.

Parameters

surface_elev : rioxarray.DataArray
array holding surface elevation
bedrock_elev : rioxarray.DataArray
array holding bedrock elevation
layers : int, default=9
number of layers needed to calculate thickness for
plot : bool, default=False
tells function to either plot the data or not

Returns

driftThick : rioxarray.DataArray
Contains data array containing depth to bedrock at each point
layerThick : rioxarray.DataArray
Contains data array with layer thickness at each point
Expand source code
def get_drift_thick(surface_elev=None, bedrock_elev=None, layers=9, plot=False, verbose=False, log=False):
    """Finds the distance from surface_elev to bedrock_elev and then divides by number of layers to get layer thickness.

    Parameters
    ----------
    surface_elev : rioxarray.DataArray, optional
        Array holding surface elevation. If None, the packaged sample surface elevation raster is used.
    bedrock_elev : rioxarray.DataArray, optional
        Array holding bedrock elevation. If None, the packaged sample bedrock elevation raster is used.
    layers : int, default=9
        number of layers needed to calculate thickness for
    plot : bool, default=False
        tells function to either plot the data or not
    verbose : bool, default=False
        Whether to print parameter information to the terminal.
    log : bool, default=False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    driftThick : rioxarray.DataArray
        Contains data array containing depth to bedrock at each point
    layerThick : rioxarray.DataArray
        Contains data array with layer thickness at each point

    """
    # Fall back to the packaged sample rasters when none are provided
    if surface_elev is None:
        surface_elev = w4h.get_resources()['surf_elev']
    if bedrock_elev is None:
        bedrock_elev = w4h.get_resources()['bedrock_elev']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_drift_thick, locals(), exclude_params=['surface_elev', 'bedrock_elev'])
    xr.set_options(keep_attrs=True)

    # Drift thickness = distance from land surface down to bedrock,
    # clipped to a plausible 0-5000 range
    driftThick = surface_elev - bedrock_elev
    driftThick = driftThick.clip(0, max=5000, keep_attrs=True)

    if plot:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        maxThick = np.nanmax(driftThick)
        if maxThick > 550:
            maxThick = 550  # Cap the colorbar so outliers don't wash out the map
        dtPlot = driftThick.plot(vmin=0, vmax=maxThick, ax=ax)
        ax.set_title("Drift Thickness")

    # NOTE: a previous version read driftThick.attrs['_FillValue'] here
    # (with a bare except fallback) but never used the value; the no-data
    # replacement below uses fixed +/-100000 thresholds instead.
    driftThick = driftThick.where(driftThick < 100000, other=np.nan)  #Replace no data values with NaNs
    driftThick = driftThick.where(driftThick > -100000, other=np.nan)  #Replace no data values with NaNs

    layerThick = driftThick / layers

    xr.set_options(keep_attrs='default')

    return driftThick, layerThick
def get_layer_depths(df_with_depths, surface_elev_col='SURFACE_ELEV', layer_thick_col='LAYER_THICK', layers=9, log=False)

Function to calculate depths and elevations of each model layer at each well based on surface elevation, bedrock elevation, and number of layers/layer thickness

Parameters

df_with_depths : pandas.DataFrame
Dataframe containing well metdata
layers : int, default=9
Number of layers. This should correlate with get_drift_thick() input parameter, if drift thickness was calculated using that function, by default 9.
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

pandas.Dataframe
Dataframe containing new columns for depth to layers and elevation of layers.
Expand source code
def get_layer_depths(df_with_depths, surface_elev_col='SURFACE_ELEV', layer_thick_col='LAYER_THICK', layers=9, log=False):
    """Calculate depth to and elevation of each model layer at each well.

    Uses the surface elevation and per-well layer thickness columns to add
    DEPTH_LAYERn and ELEV_LAYERn columns (n = 1..layers) to the dataframe.

    Parameters
    ----------
    df_with_depths : pandas.DataFrame
        Dataframe containing well metadata, including the surface elevation and layer thickness columns.
    surface_elev_col : str, default='SURFACE_ELEV'
        Name of the column holding surface elevation at each well.
    layer_thick_col : str, default='LAYER_THICK'
        Name of the column holding the model layer thickness at each well.
    layers : int, default=9
        Number of layers. This should correlate with get_drift_thick() input parameter, if drift thickness was calculated using that function, by default 9.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.Dataframe
        Dataframe containing new columns for depth to layers and elevation of layers.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Depth to the top of layer n (1-based) is (n-1) layer thicknesses below the surface
    for lyr_index in range(layers):
        df_with_depths[f'DEPTH_LAYER{lyr_index + 1}'] = df_with_depths[layer_thick_col] * lyr_index

    # Elevation of the top of layer n is the surface elevation minus that depth
    for lyr_index in range(layers):
        df_with_depths[f'ELEV_LAYER{lyr_index + 1}'] = (
            df_with_depths[surface_elev_col] - df_with_depths[layer_thick_col] * lyr_index
        )

    return df_with_depths
def get_most_recent(dir=resource_dir, glob_pattern='*', verbose=True)

Function to find the most recent file with the indicated pattern, using pathlib.glob function.

Parameters

dir : str or pathlib.Path object, optional
Directory in which to find the most recent file, by default str(repoDir)+'/resources'
glob_pattern : str, optional
String used by the pathlib.glob() function/method for searching, by default '*'

Returns

pathlib.Path object
Pathlib Path object of the most recent file fitting the glob pattern indicated in the glob_pattern parameter.
Expand source code
def get_most_recent(dir=resource_dir, glob_pattern='*', verbose=True):
    """Function to find the most recent file with the indicated pattern, using pathlib.glob function.

    Parameters
    ----------
    dir : str or pathlib.Path object, optional
        Directory in which to find the most recent file, by default the package resource directory.
    glob_pattern : str, optional
        String used by the pathlib.glob() function/method for searching, by default '*'
    verbose : bool, default=True
        Whether to print the matched filename (or a no-match message) to the terminal.

    Returns
    -------
    pathlib.Path object
        Pathlib Path object of the most recent file fitting the glob pattern indicated in the glob_pattern parameter. An empty pathlib.Path() is returned if no file matches.
    """
    if verbose:
        verbose_print(get_most_recent, locals())

    # Materialize the recursive glob ONCE. The previous version iterated the
    # glob twice (rebuilding it) and used numpy datetime arithmetic with
    # argmin to find the newest file; a single max() over modification time
    # selects the same file in one pass.
    matchingFiles = list(pathlib.Path(dir).rglob(glob_pattern))

    if not matchingFiles:
        #If no files found that match pattern, return an empty pathlib.Path()
        if verbose:
            print('No file found in {} matching {} pattern'.format(dir, glob_pattern))
        return pathlib.Path()

    # Most recent file = the one with the largest modification timestamp
    mostRecentFile = max(matchingFiles, key=os.path.getmtime)

    if verbose:
        print('Most Recent version of file fitting {} pattern is: {}'.format(glob_pattern, mostRecentFile.name))

    return mostRecentFile
def get_resources(resource_type='filepaths', scope='local', verbose=False)

Function to get filepaths for resources included with package

Parameters

resource_type : str, {'filepaths', 'data'}
If filepaths, will return dictionary with filepaths to sample data. If data, returns dictionary with data objects.
scope : str, {'local', 'statewide'}
If 'local', will read in sample data for a local (around county sized) project. If 'state', will read in sample data for a statewide project (Illinois)
verbose : bool, optional
Whether to print results to terminal, by default False

Returns

resources_dict : dict
Dictionary containing key, value pairs with filepaths to resources that may be of interest.
Expand source code
def get_resources(resource_type='filepaths', scope='local', verbose=False):
    """Function to get filepaths for resources included with package

    Parameters
    ----------
    resource_type : str, {'filepaths', 'data'}
        If filepaths, will return dictionary with filepaths to sample data. If data, returns dictionary with data objects (dataframes, dicts from JSON, geodataframes, and opened rasters).
    scope : str, {'local', 'statewide'}
        If 'local', will read in sample data for a local (around county sized) project. If 'state', will read in sample data for a statewide project (Illinois)
    verbose : bool, optional
        Whether to print results to terminal, by default False

    Returns
    -------
    resources_dict : dict
        Dictionary containing key, value pairs with filepaths to resources that may be of interest.
    """
    resources_dict = {}
    # resource_dir is a module-level path to the package's bundled resources
    sample_data_dir = resource_dir.joinpath('sample_data')

    #Get sample data
    #Get lithology dictionaries' filepaths
    sample_dictionary_dir = sample_data_dir.joinpath('DictionaryTerms')
    resources_dict['LithologyDict_Exact'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*DICTIONARY_SearchTerms*', verbose=verbose)
    resources_dict['LithologyDict_Start'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*SearchTerms-Start*', verbose=verbose)
    resources_dict['LithologyDict_Wildcard'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*SearchTerms-Wildcard*', verbose=verbose)

    #Get Lithology Interpretation filepaths
    lith_interp_dir = sample_data_dir.joinpath('LithologyInterpretations')
    resources_dict['LithInterps_FineCoarse'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*FineCoarse*', verbose=verbose)
    resources_dict['LithInterps_Clay'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Clay*', verbose=verbose)
    resources_dict['LithInterps_Silt'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Silt*', verbose=verbose)    
    resources_dict['LithInterps_Sand'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Sand*', verbose=verbose)    
    resources_dict['LithInterps_Gravel'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Gravel*', verbose=verbose)    

    #Get other resource filepaths (dtype definitions, CRS definition, model grid raster)
    resources_dict['well_data_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*downholeDataTypes*', verbose=verbose)
    resources_dict['metadata_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*headerDataTypes*', verbose=verbose)
    resources_dict['ISWS_CRS'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='isws_crs.json', verbose=verbose)
    resources_dict['xyz_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='xyzDataTypes.json', verbose=verbose)

    resources_dict['model_grid'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='grid_625_raster.tif', verbose=verbose)

    # The scope selects which sample rasters/boundaries/well data are returned:
    # a statewide (Illinois) set or a smaller local-project set
    statewideSampleDir = sample_data_dir.joinpath('statewide_sample_data')
    statewideList = ['statewide', 'state', 'regional', 'region', 's', 'r']
    if scope.lower() in statewideList:
        resources_dict['well_data'] = statewideSampleDir.joinpath("IL_Statewide_WellData_XYz_2023-07-20_cleaned.zip")

        resources_dict['surf_elev'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_Surface_Elev_ft_625ft_Lambert_GridAlign*', verbose=verbose)
        resources_dict['bedrock_elev'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_Bedrock_Elev_2023_ft_625ft_Lambert_GridAlign*', verbose=verbose)
        resources_dict['study_area'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_boundary*', verbose=verbose)
    else:
        resources_dict['study_area'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*sample_studyArea*', verbose=verbose)
        resources_dict['surf_elev'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*sample_surface_bedrock_lidarresampled100ft*', verbose=verbose)
        resources_dict['bedrock_elev'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*LocalSample_Bedrock_elev_EStLGrimleyPhillips*', verbose=verbose)

        resources_dict['well_data'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='sample_well_data*', verbose=verbose)

    # Get data objects if specified: replace each filepath value in
    # resources_dict with the loaded object (dataframe, dict, raster, etc.)
    dataObjList = ['data', 'objects', 'do', 'data objects', 'dataobjects']
    if resource_type.lower() in dataObjList:
        resources_dict['LithologyDict_Exact'] = pd.read_csv(resources_dict['LithologyDict_Exact'], 
                                                            dtype={"ID":int, "DESCRIPTION":str, "LITHOLOGY":str,
                                                            "COLOR":str, "CONSISTENCY":str, "MOD1":str, "MOD2":str,
                                                            "INTERPRETED":str, "COMPLETED":str, "ORIGIN_INDIANA":str},
                                                            index_col='ID')
        resources_dict['LithologyDict_Start'] = pd.read_csv(resources_dict['LithologyDict_Start'])
        resources_dict['LithologyDict_Wildcard'] = pd.read_csv(resources_dict['LithologyDict_Wildcard'])

        resources_dict['LithInterps_FineCoarse'] = pd.read_csv(resources_dict['LithInterps_FineCoarse'])
        resources_dict['LithInterps_Clay'] = pd.read_csv(resources_dict['LithInterps_Clay'])
        resources_dict['LithInterps_Silt'] = pd.read_csv(resources_dict['LithInterps_Silt'])
        resources_dict['LithInterps_Sand'] = pd.read_csv(resources_dict['LithInterps_Sand'])
        resources_dict['LithInterps_Gravel'] = pd.read_csv(resources_dict['LithInterps_Gravel'])

        # The dtype and CRS resources are JSON files; load them into dicts
        with open(resources_dict['well_data_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['well_data_dtypes'] = json.load(f)

        with open(resources_dict['metadata_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['metadata_dtypes'] = json.load(f)            

        with open(resources_dict['ISWS_CRS'], 'r', encoding='utf-8') as f:
            resources_dict['ISWS_CRS'] = json.load(f)
        
        with open(resources_dict['xyz_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['xyz_dtypes'] = json.load(f)


        if scope.lower() in statewideList:
            # Statewide well data ships as a zip holding a HEADER csv and the
            # data csv; build point geometries from the LONGITUDE/LATITUDE columns
            sacrs = resources_dict['ISWS_CRS']
            with zipfile.ZipFile(resources_dict['well_data'].as_posix(), 'r') as archive:
                for file_name in archive.namelist():
                    with archive.open(file_name) as file:
                        if 'HEADER' in file_name:
                            metaDF = pd.read_csv(file)
                        else:
                            resources_dict['well_data'] = pd.read_csv(file)
            geometry = [Point(xy) for xy in zip(resources_dict['well_data']['LONGITUDE'], resources_dict['well_data']['LATITUDE'])]
            resources_dict['well_data'] = gpd.GeoDataFrame(resources_dict['well_data'], geometry=geometry, crs='EPSG:4269')
            
        else:
            # Local sample stores geometry as WKT text; parse into shapely geometries
            sacrs = 'EPSG:4269'
            df = pd.read_csv(resources_dict['well_data'])
            df['geometry'] = df['geometry'].apply(wkt.loads)
            resources_dict['well_data'] = gpd.GeoDataFrame(df, geometry='geometry')


        # NOTE(review): gpd.read_file forwards extra kwargs to the underlying
        # engine; the effect of 'geometry'/'crs' here is engine-dependent — confirm
        resources_dict['study_area'] = gpd.read_file(resources_dict['study_area'], geometry='geometry', crs=sacrs)

        resources_dict['model_grid'] = rxr.open_rasterio(resources_dict['model_grid'])
        resources_dict['surf_elev'] = rxr.open_rasterio(resources_dict['surf_elev'])
        #resources_dict['surf_elev'] = resources_dict['surf_elev'].sel(band=1)
        resources_dict['bedrock_elev'] = rxr.open_rasterio(resources_dict['bedrock_elev'])
        #resources_dict['bedrock_elev'] = resources_dict['bedrock_elev'].sel(band=1)

    return resources_dict
def get_search_terms(spec_path=str(repoDir)+'/resources/', spec_glob_pattern='*SearchTerms-Specific*', start_path=None, start_glob_pattern='*SearchTerms-Start*', wildcard_path=None, wildcard_glob_pattern='*SearchTerms-Wildcard', verbose=False, log=False)

Read in dictionary files for downhole data

Parameters

spec_path : str or pathlib.Path, optional
Directory where the file containing the specific search terms is located, by default str(repoDir)+'/resources/'
spec_glob_pattern : str, optional
Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Specific'
start_path : str or None, optional
Directory where the file containing the start search terms is located, by default None
start_glob_pattern : str, optional
Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Start'
wildcard_path : str or pathlib.Path, default = None
Directory where the file containing the wildcard search terms is located, by default None
wildcard_glob_pattern : str, default = '*SearchTerms-Wildcard'
Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Wildcard'
log : bool, default = True
Whether to log inputs and outputs to log file.

Returns

(specTermsPath, startTermsPath, wilcardTermsPath) : tuple Tuple containing the pandas dataframes with specific search terms, with start search terms, and with wildcard search terms

Expand source code
def get_search_terms(spec_path=str(repoDir)+'/resources/', spec_glob_pattern='*SearchTerms-Specific*', 
                     start_path=None, start_glob_pattern = '*SearchTerms-Start*', 
                     wildcard_path=None, wildcard_glob_pattern='*SearchTerms-Wildcard',
                     verbose=False, log=False):
    """Read in dictionary files for downhole data

    Parameters
    ----------
    spec_path : str or pathlib.Path, optional
        Directory where the file containing the specific search terms is located, by default str(repoDir)+'/resources/'
    spec_glob_pattern : str, optional
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Specific*'
    start_path : str or None, optional
        Directory where the file containing the start search terms is located, by default None
    start_glob_pattern : str, optional
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Start*'
    wildcard_path : str or pathlib.Path, default = None
        Directory where the file containing the wildcard search terms is located, by default None    
    wildcard_glob_pattern : str, default = '*SearchTerms-Wildcard'
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Wildcard'
    verbose : bool, default = False
        Whether to print extra information about the parameters used, by default False
    log : bool, default = False
        Whether to log inputs and outputs to log file.        

    Returns
    -------
    (specTermsPath, startTermsPath, wildcardTermsPath) : tuple
        Tuple containing the file paths (or None where the corresponding *_path argument was None)
        for the specific search terms, the start search terms, and the wildcard search terms
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_search_terms, locals())    

    def _resolve_terms_path(terms_path, glob_pattern):
        # None passes through unchanged; a directory is searched for its most
        # recent file matching glob_pattern (via get_most_recent()); any other
        # path is assumed to point directly at the terms file.
        if terms_path is None:
            return None
        terms_path = pathlib.Path(terms_path)
        if terms_path.is_dir():
            return get_most_recent(terms_path, glob_pattern)
        return terms_path

    #Exact match path
    specTermsPath = _resolve_terms_path(spec_path, spec_glob_pattern)
    #Startswith path
    startTermsPath = _resolve_terms_path(start_path, start_glob_pattern)
    #Wildcard path (typo 'wilcardTermsPath' fixed; tuple contents unchanged)
    wildcardTermsPath = _resolve_terms_path(wildcard_path, wildcard_glob_pattern)

    return specTermsPath, startTermsPath, wildcardTermsPath
def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False)

Gets unique wells as a dataframe based on a given column name.

Parameters

df : pandas.DataFrame
Dataframe containing all wells and/or well intervals of interest
wellid_col : str, default="API_NUMBER"
Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
log : bool, default = False
Whether to log results to log file

Returns

wellsDF
DataFrame containing only the unique well IDs
Expand source code
def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False):
    """Gets unique wells as a dataframe based on a given column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all wells and/or well intervals of interest
    wellid_col : str, default="API_NUMBER"
        Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    wellsDF
        DataFrame containing only the unique well IDs
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_unique_wells, locals(), exclude_params=['df'])

    # Collect the distinct well identifiers and wrap them in a one-column frame
    wellsDF = pd.DataFrame(df[wellid_col].unique())
    if verbose:
        print('Number of unique wells: ' + str(wellsDF.shape[0]))

    # Single column is always labeled UNIQUE_ID regardless of the source column name
    wellsDF.columns = ['UNIQUE_ID']
    return wellsDF
def grid2study_area(study_area, grid, output_crs='EPSG:4269', verbose=False, log=False)

Clips grid to study area.

Parameters

study_area : geopandas.GeoDataFrame
inputs study area polygon
grid : xarray.DataArray
inputs grid array
output_crs : str, default='EPSG:4269'
inputs the coordinate reference system for the study area
log : bool, default = False
Whether to log results to log file, by default False

Returns

grid : xarray.DataArray
returns xarray containing grid clipped only to area within study area
Expand source code
def grid2study_area(study_area, grid, output_crs='EPSG:4269', verbose=False, log=False):
    """Clips grid to study area.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        inputs study area polygon
    grid : xarray.DataArray
        inputs grid array
    output_crs : str, default='EPSG:4269'
        inputs the coordinate reference system for the study area
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    grid : xarray.DataArray
        returns xarray containing grid clipped only to area within study area

    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(grid2study_area, locals(), exclude_params=['study_area', 'grid'])

    # Empty string means "use whatever CRS the study area already has"
    if output_crs == '':
        output_crs = study_area.crs

    study_area_crs = study_area.crs

    # Reproject the raster grid to the output CRS if it differs
    raster_crs = pyproj.CRS.from_wkt(grid.rio.crs.to_wkt())
    if output_crs != raster_crs:
        grid = grid.rio.reproject(output_crs)
    raster_crs = pyproj.CRS.from_wkt(grid.rio.crs.to_wkt())

    # Bring the study area into the (possibly reprojected) grid's CRS
    if raster_crs != study_area_crs:
        study_area = study_area.to_crs(raster_crs)

    # Clip to the outer bounds of the study area: (minx, miny, maxx, maxy)
    bounds = study_area.total_bounds

    # slice() endpoints must follow the coordinate's storage order,
    # so flip the bounds when a coordinate axis is descending
    y_ascending = grid['y'][-1].values - grid['y'][0].values > 0
    x_ascending = grid['x'][-1].values - grid['x'][0].values > 0
    y_slice = slice(bounds[1], bounds[3]) if y_ascending else slice(bounds[3], bounds[1])
    x_slice = slice(bounds[0], bounds[2]) if x_ascending else slice(bounds[2], bounds[0])

    grid = grid.sel(x=x_slice, y=y_slice)

    # Collapse a single-band raster to 2D
    if 'band' in grid.dims:
        grid = grid.sel(band=1)

    return grid
def layer_interp(points, grid, layers=None, interp_kind='nearest', return_type='dataarray', export_dir=None, target_col='TARG_THICK_PER', layer_col='LAYER', xcol=None, ycol=None, xcoord='x', ycoord='y', log=False, verbose=False, **kwargs)

Function to interpolate results, going from points to grid data. Uses scipy.interpolate module.

Parameters

points : list
List containing pandas dataframes or geopandas geoadataframes containing the point data. Should be resDF_list output from layer_target_thick().
grid : xr.DataArray or xr.Dataset
Xarray DataArray or DataSet with the coordinates/spatial reference of the output grid to interpolate to
layers : int, default=None
Number of layers for interpolation. If None, uses the length of the points list to determine number of layers. By default None.
interp_kind : str, {'nearest', 'interp2d','linear', 'cloughtocher', 'radial basis function'}
Type of interpolation to use. See scipy.interpolate N-D scattered. Values can be any of the following (also shown in "kind" column of N-D scattered section of table here: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html). By default 'nearest'
return_type : str, {'dataarray', 'dataset'}
Type of xarray object to return, either xr.DataArray or xr.Dataset, by default 'dataarray.'
export_dir : str or pathlib.Path, default=None
Export directory for interpolated grids, using w4h.export_grids(). If None, does not export, by default None.
target_col : str, default = 'TARG_THICK_PER'
Name of column in points containing data to be interpolated, by default 'TARG_THICK_PER'.
layer_col : str, default = 'Layer'
Name of column containing layer number. Not currently used, by default 'LAYER'
xcol : str, default = 'None'
Name of column containing x coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
ycol : str, default = 'None'
Name of column containing y coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
xcoord : str, default='x'
Name of x coordinate in grid, used to extract x values of grid, by default 'x'
ycoord : str, default='y'
Name of y coordinate in grid, used to extract x values of grid, by default 'y'
log : bool, default = True
Whether to log inputs and outputs to log file.
**kwargs
Keyword arguments to be read directly into whichever scipy.interpolate function is designated by the interp_kind parameter.

Returns

interp_data : xr.DataArray or xr.Dataset, depending on return_type
By default, returns an xr.DataArray object with the layers added as a new dimension called Layer. Can also specify return_type='dataset' to return an xr.Dataset with each layer as a separate variable.
Expand source code
def layer_interp(points, grid, layers=None, interp_kind='nearest', return_type='dataarray', export_dir=None, target_col='TARG_THICK_PER', layer_col='LAYER', xcol=None, ycol=None, xcoord='x', ycoord='y', log=False, verbose=False, **kwargs):
    """Function to interpolate results, going from points to grid data. Uses scipy.interpolate module.

    Parameters
    ----------
    points : list
        List containing pandas dataframes or geopandas geoadataframes containing the point data. Should be resDF_list output from layer_target_thick().
    grid : xr.DataArray or xr.Dataset
        Xarray DataArray or DataSet with the coordinates/spatial reference of the output grid to interpolate to
    layers : int, default=None
        Number of layers for interpolation. If None, uses the length of the points list to determine number of layers. By default None.
    interp_kind : str, {'nearest', 'interp2d','linear', 'cloughtocher', 'radial basis function'}
        Type of interpolation to use. See scipy.interpolate N-D scattered. Values can be any of the following (also shown in "kind" column of N-D scattered section of table here: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html). By default 'nearest'
    return_type : str, {'dataarray', 'dataset'}
        Type of xarray object to return, either xr.DataArray or xr.Dataset, by default 'dataarray.'
    export_dir : str or pathlib.Path, default=None
        Export directory for interpolated grids, using w4h.export_grids(). If None, does not export, by default None.
    target_col : str, default = 'TARG_THICK_PER'
        Name of column in points containing data to be interpolated, by default 'TARG_THICK_PER'.
    layer_col : str, default = 'Layer'
        Name of column containing layer number. Not currently used, by default 'LAYER'
    xcol : str, default = 'None'
        Name of column containing x coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
    ycol : str, default = 'None'
        Name of column containing y coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
    xcoord : str, default='x'
        Name of x coordinate in grid, used to extract x values of grid, by default 'x'
    ycoord : str, default='y'
        Name of y coordinate in grid, used to extract y values of grid, by default 'y'
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    **kwargs
        Keyword arguments to be read directly into whichever scipy.interpolate function is designated by the interp_kind parameter.

    Returns
    -------
    interp_data : xr.DataArray or xr.Dataset, depending on return_type
        By default, returns an xr.DataArray object with the layers added as a new dimension called Layer. Can also specify return_type='dataset' to return an xr.Dataset with each layer as a separate variable.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(layer_interp, locals(), exclude_params=['points', 'grid'])

    # Accepted aliases for each interpolation kind
    nnList = ['nearest', 'nearest neighbor', 'nearestneighbor','neighbor',  'nn','n']
    splineList = ['interp2d', 'interp2', 'interp', 'spline', 'spl', 'sp', 's']
    linList = ['linear', 'lin', 'l']
    ctList = ['clough tocher', 'clough', 'cloughtocher', 'ct', 'c']
    rbfList = ['rbf', 'radial basis', 'radial basis function', 'r', 'radial']
    #k-nearest neighbors from scikit-learn?
    #kriging? (from pykrige or maybe also from scikit-learn)

    # 1D output-grid coordinates. These must STAY 1D across iterations;
    # the previous version overwrote them with meshgrid output inside the
    # loop, breaking interpolation for every layer after the first.
    X = np.round(grid[xcoord].values, 5) #Extract xcoords from grid
    Y = np.round(grid[ycoord].values, 5) #Extract ycoords from grid

    if layers is None and isinstance(points, (list, dict)):
        layers = len(points)

    if len(points) != layers:
        print('You have specified a different number of layers than what is iterable in the points argument. This may not work properly.')

    if verbose:
        print('Interpolating target lithology at each layer:')
    daDict = {}
    for lyr in range(1, layers+1):

        # One dataframe per layer when points is a sequence; otherwise reuse
        if isinstance(points, (list, dict)):
            pts = points[lyr-1]
        else:
            pts = points

        # Resolve x coordinates of the data points (explicit column or geometry)
        if xcol is None:
            if 'geometry' in pts.columns:
                dataX = pts['geometry'].x
            else:
                print('xcol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)' )
                return
        else:
            dataX = pts[xcol]

        # Resolve y coordinates of the data points
        if ycol is None:
            if 'geometry' in pts.columns:
                dataY = pts['geometry'].y
            else:
                print('ycol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)' )
                return
        else:
            dataY = pts[ycol]

        interpVal = pts[target_col]

        # Drop NaNs and keep x, y, and value series aligned on a common index
        dataX.dropna(inplace=True)
        dataY = dataY.loc[dataX.index]
        dataY.dropna(inplace=True)
        interpVal = interpVal.loc[dataY.index]
        interpVal.dropna(inplace=True)

        dataX = dataX.loc[interpVal.index]
        dataY = dataY.loc[interpVal.index]

        dataX = dataX.reset_index(drop=True)
        dataY = dataY.reset_index(drop=True)
        interpVal = interpVal.reset_index(drop=True)

        interp = None  # spline branch builds no interpolator object
        if interp_kind.lower() in nnList:
            interpType = 'Nearest Neighbor'
            gridX, gridY = np.meshgrid(X, Y, sparse=True) #2D grid for interpolation
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.NearestNDInterpolator(dataPoints, interpVal, **kwargs)
            Z = interp(gridX, gridY)
        elif interp_kind.lower() in linList:
            interpType = 'Linear' 
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.LinearNDInterpolator(dataPoints, interpVal, **kwargs)
            gridX, gridY = np.meshgrid(X, Y, sparse=True) #2D grid for interpolation
            Z = interp(gridX, gridY)
        elif interp_kind.lower() in ctList:
            interpType = 'Clough-Toucher'
            gridX, gridY = np.meshgrid(X, Y, sparse=True) #2D grid for interpolation
            if 'tol' not in kwargs:
                # NOTE(review): 1e10 is an unusually large tolerance; possibly meant 1e-10 — confirm
                kwargs['tol'] = 1e10
            interp = interpolate.CloughTocher2DInterpolator(list(zip(dataX, dataY)), interpVal, **kwargs)
            Z = interp(gridX, gridY) 
        elif interp_kind.lower() in rbfList:
            interpType = 'Radial Basis'
            dataXY = np.column_stack((dataX, dataY))
            interp = interpolate.RBFInterpolator(dataXY, interpVal, **kwargs)
            print("Radial Basis Function does not work well with many well-based datasets. Consider instead specifying 'nearest', 'linear', 'spline', or 'clough tocher' for interpolation interp_kind.")
            # Dense (non-sparse) meshgrid required: the raveled x and y arrays
            # must have equal length to form the (n_points, 2) query array.
            gridX, gridY = np.meshgrid(X, Y)
            Z = interp(np.column_stack((gridX.ravel(), gridY.ravel()))).reshape(gridX.shape)
        elif interp_kind.lower() in splineList:
            interpType = 'Spline Interpolation'
            # NOTE(review): bisplrep returns a spline representation (tck tuple),
            # not gridded values; this branch likely needs bisplev to evaluate — confirm
            Z = interpolate.bisplrep(dataX, dataY, interpVal, **kwargs)
        else:
            if verbose:
                print('Specified interpolation interp_kind not recognized, using nearest neighbor.')
            interpType = 'Nearest Neighbor'
            gridX, gridY = np.meshgrid(X, Y, sparse=True) #2D grid for interpolation
            interp = interpolate.NearestNDInterpolator(list(zip(dataX, dataY)), interpVal, **kwargs)
            Z = interp(gridX, gridY)

        # New DataArray with interpolated values but the grid's dims/coords
        interp_grid = xr.DataArray(
                    data=Z,
                    dims=grid.dims,
                    coords=grid.coords)

        if 'band' in interp_grid.coords:
            interp_grid = interp_grid.drop_vars('band')
        # Interpolated values represent proportions; keep them in [0, 1]
        interp_grid = interp_grid.clip(min=0, max=1, keep_attrs=True)

        interp_grid = interp_grid.expand_dims(dim='Layer')
        interp_grid = interp_grid.assign_coords(Layer=[lyr])

        # Free per-layer intermediates before the next iteration
        del Z
        del dataX
        del dataY
        del interpVal
        del interp

        zFillDigs = len(str(layers))
        daDict['Layer'+str(lyr).zfill(zFillDigs)] = interp_grid
        del interp_grid
        if verbose:
            print('\tCompleted {} interpolation for Layer {}'.format(str(interpType).lower(), str(lyr).zfill(zFillDigs)))

    dataAList = ['dataarray', 'da', 'a', 'array']
    dataSList = ['dataset', 'ds', 'set']

    if return_type.lower() in dataAList:
        interp_data = xr.concat(daDict.values(), dim='Layer')
        # Label layers 1..layers. (Was hard-coded np.arange(1, 10), which only
        # worked when there were exactly 9 layers.)
        interp_data = interp_data.assign_coords(Layer=np.arange(1, layers+1))
    elif return_type.lower() in dataSList:
        interp_data = xr.Dataset(daDict)
        if verbose:
            print('Done with interpolation, getting global attrs')

        # Keep only the attributes shared (with equal values) by all layers
        # and promote them to dataset-level "global" attributes
        common_attrs = {}
        for i, (var_name, data_array) in enumerate(interp_data.data_vars.items()):
            if i == 0:
                common_attrs = data_array.attrs
            else:
                common_attrs = {k: v for k, v in common_attrs.items() if k in data_array.attrs and data_array.attrs[k] == v}
        interp_data.attrs.update(common_attrs)
    else:
        print(f"{return_type} is not a valid input for return_type. Please set return_type to either 'dataarray' or 'dataset'")
        return

    if verbose:
        # Quick-look plots of the input points colored by the interpolated column
        for i, layer_data in enumerate(interp_data):
            pts = points[i]
            pts.plot(c=pts[target_col])

    if export_dir is not None:
        w4h.export_grids(grid_data=interp_data, out_path=export_dir, file_id='',filetype='tif', variable_sep=True, date_stamp=True)
        print('Exported to {}'.format(export_dir))

    return interp_data
def layer_target_thick(df, layers=9, return_all=False, export_dir=None, outfile_prefix=None, depth_top_col='TOP', depth_bot_col='BOTTOM', log=False)

Function to calculate thickness of target material in each layer at each well point

Parameters

df : geopandas.geodataframe
Geodataframe containing classified data, surface elevation, bedrock elevation, layer depths, geometry.
layers : int, default=9
Number of layers in model, by default 9
return_all : bool, default=False
If True, return list of original geodataframes with extra column added for target thick for each layer. If False, return list of geopandas.geodataframes with only essential information for each layer.
export_dir : str or pathlib.Path, default=None
If str or pathlib.Path, should be directory to which to export dataframes built in function.
outfile_prefix : str, default=None
Only used if export_dir is set. Will be used at the start of the exported filenames
depth_top_col : str, default='TOP'
Name of column containing data for depth to top of described well intervals
depth_bot_col : str, default='BOTTOM'
Name of column containing data for depth to bottom of described well intervals
log : bool, default = True
Whether to log inputs and outputs to log file.

Returns

res_df or res : geopandas.geodataframe
Geopandas geodataframe containing only important information needed for next stage of analysis.
Expand source code
def layer_target_thick(df, layers=9, return_all=False, export_dir=None, outfile_prefix=None, depth_top_col='TOP', depth_bot_col='BOTTOM', log=False):
    """Function to calculate thickness of target material in each layer at each well point

    Parameters
    ----------
    df : geopandas.geodataframe
        Geodataframe containing classified data, surface elevation, bedrock elevation, layer depths, geometry.
    layers : int, default=9
        Number of layers in model, by default 9
    return_all : bool, default=False
        If True, return list of original geodataframes with extra column added for target thick for each layer.
        If False, return list of geopandas.geodataframes with only essential information for each layer.
    export_dir : str or pathlib.Path, default=None
        If str or pathlib.Path, should be directory to which to export dataframes built in function.
    outfile_prefix : str, default=None
        Only used if export_dir is set. Will be used at the start of the exported filenames
    depth_top_col : str, default='TOP'
        Name of column containing data for depth to top of described well intervals
    depth_bot_col : str, default='BOTTOM'
        Name of column containing data for depth to bottom of described well intervals
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    
    Returns
    -------
    res_df or res : geopandas.geodataframe
        Geopandas geodataframe containing only important information needed for next stage of analysis.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Convert interval depths (below ground surface) to elevations
    df['TOP_ELEV'] = df['SURFACE_ELEV'] - df[depth_top_col]
    df['BOT_ELEV'] = df['SURFACE_ELEV'] - df[depth_bot_col]

    layerList = range(1, layers+1)
    res_list = []
    resdf_list = []

    #Generate column names based on (looped) integers
    for layer in layerList:
        zStr = 'ELEV'
        zColT = 'TOP_ELEV'
        zColB = 'BOT_ELEV'
        topCol = zStr+'_LAYER'+str(layer)
        # Bottom of this layer is the top of the next layer, except for the
        # bottom layer, which is bounded by bedrock. (Was hard-coded to 9,
        # which broke any model with a different number of layers.)
        if layer != layers:
            botCol = zStr+'_LAYER'+str(layer+1)
        else:
            botCol = "BEDROCK_"+zStr

        #Divide records into 4 categories for ease of calculation, to be joined back together later  
            #Category 1: Well interval starts above layer top, ends within model layer
            #Category 2: Well interval is entirely contained within model layer
            #Category 3: Well interval starts within model layer, continues through bottom of model layer
            #Category 4: Well interval begins and ends on either side of model layer (model layer is contained within well interval)

        #records1 = intervals that go through the top of the layer and bottom is within layer
        records1 = df.loc[(df[zColT] >= df[topCol]) & #Top of the well interval is above or equal to the top of the layer
                        (df[zColB] <= df[topCol]) & #Bottom is below the top of the layer
                        (df[zColB] <= df[zColT])].copy() #Bottom is deeper than top (should already be the case)
        records1['TARG_THICK'] = np.round((records1.loc[:, topCol] - records1.loc[:, zColB]) * records1['TARGET'], 3) #Multiply "target" (1 or 0) by length within layer

        #records2 = entire interval is within layer
        records2 = df.loc[(df[zColT] <= df[topCol]) & #Top of the well is lower than top of the layer 
                    (df[zColB] >= df[botCol]) & #Bottom of the well is above bottom of the layer 
                    (df[zColB] <= df[zColT])].copy() #Bottom of the well is deeper than or equal to top (should already be the case)
        records2['TARG_THICK'] = np.round((records2.loc[:, zColT] - records2.loc[:, zColB]) * records2['TARGET'], 3)

        #records3 = intervals with top within layer and bottom of interval going through bottom of layer
        records3 = df.loc[(df[zColT] > df[botCol]) & #Top of the well is above bottom of layer
                    (df[zColB] < df[botCol]) & #Bottom of the well is below bottom of layer
                    (df[zColT] <= df[topCol]) & #Top of well is below top of layer
                    (df[zColB] <= df[zColT])].copy() #Bottom is deeper than top (should already be the case)
        records3['TARG_THICK'] = np.round((records3.loc[:, zColT] - records3.loc[:, botCol]) * records3['TARGET'], 3)

        #records4 = interval goes through entire layer
        records4 = df.loc[(df[zColT] >= df[topCol]) & #Top of well is above top of layer
                    (df[zColB] < df[botCol]) & #Bottom of well is below bottom of layer
                    (df[zColB] <= df[zColT])].copy() #Bottom of well is below top of well
        records4['TARG_THICK'] = np.round((records4.loc[:, topCol] - records4.loc[:, botCol]) * records4['TARGET'], 3)

        #Put the four calculated record categories back together into single dataframe
        res = pd.concat([records1, records2, records3, records4])
        #The sign may be reversed if using depth rather than elevation
        if (res['TARG_THICK'] < 0).all():
            res['TARG_THICK'] = res['TARG_THICK'] * -1

        #Cannot have negative thicknesses. Assign back rather than using
        #inplace clipping on a column view, which silently no-ops under
        #pandas copy-on-write.
        res['TARG_THICK'] = res['TARG_THICK'].clip(lower=0)
        res['LAYER_THICK'] = res['LAYER_THICK'].clip(lower=0)

        #Get geometries for each unique API/well
        res.reset_index(drop=True, inplace=True)
        res_df = res.groupby(by=['API_NUMBER','LATITUDE','LONGITUDE'], as_index=False).sum(numeric_only=True) #Calculate thickness for each well interval in the layer indicated (e.g., if there are two well intervals from same well in one model layer)
        uniqInd = pd.DataFrame([v.values[0] for k, v in res.groupby(by=['API_NUMBER','LATITUDE','LONGITUDE']).groups.items()]).loc[:,0]
        geomCol = res.loc[uniqInd, 'geometry']
        geomCol = pd.DataFrame(geomCol[~geomCol.index.duplicated(keep='first')]).reset_index()

        #Calculate thickness as percent of total layer thickness
        res_df['TARG_THICK_PER'] = np.round(res_df['TARG_THICK'] / res_df['LAYER_THICK'], 3)
        #Replace inf (zero-thickness layers) and NaN with 0. (The previous
        #`.where(x != np.nan, ...)` was always True and never replaced NaNs.)
        res_df['TARG_THICK_PER'] = res_df['TARG_THICK_PER'].replace([np.inf, -np.inf], 0).fillna(0)

        res_df["LAYER"] = layer #Just to have as part of the output file, include the present layer in the file itself as a separate column
        res_df = res_df[['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'LATITUDE_PROJ', 'LONGITUDE_PROJ','TOP', 'BOTTOM', 'TOP_ELEV', 'BOT_ELEV', 'SURFACE_ELEV', topCol, botCol,'LAYER_THICK','TARG_THICK', 'TARG_THICK_PER', 'LAYER']].copy() #Format dataframe for output
        res_df = gpd.GeoDataFrame(res_df, geometry=geomCol.loc[:,'geometry'])
        resdf_list.append(res_df)
        res_list.append(res)

        if isinstance(export_dir, (pathlib.PurePath, str)):
            export_dir = pathlib.Path(export_dir)
            if not export_dir.is_dir():
                try:
                    os.mkdir(export_dir)
                except OSError:
                    print('Specified export directory does not exist and cannot be created. Function will continue run, but data will not be exported.')

            #Format and build export filepath
            export_dir = str(export_dir).replace('\\', '/')
            zFillDigs = len(str(len(layerList)))
            if outfile_prefix is None:
                outfile_prefix = ''
            elif str(outfile_prefix).endswith('_'):
                outfile_prefix = outfile_prefix[:-1]

            if export_dir[-1] == '/':
                export_dir = export_dir[:-1]
            nowStr = str(datetime.datetime.today().date())+'_'+str(datetime.datetime.today().hour)+'-'+str(datetime.datetime.today().minute)+'-'+str(datetime.datetime.today().second)
            outPath = export_dir+'/'+outfile_prefix+'_Lyr'+str(layer).zfill(zFillDigs)+'_'+nowStr+'.csv'

            if return_all:
                #Export the full records to a separate file so the per-well
                #summary does not overwrite it (both previously went to the
                #same path, clobbering the first)
                allPath = export_dir+'/'+outfile_prefix+'_Lyr'+str(layer).zfill(zFillDigs)+'_AllRecords_'+nowStr+'.csv'
                res.to_csv(allPath, sep=',', na_rep=np.nan, index_label='ID')
            res_df.to_csv(outPath, sep=',', na_rep=np.nan, index_label='ID')

    if return_all:
        return res_list, resdf_list
    return resdf_list
def logger_function(logtocommence, parameters, func_name)

Function to log other functions, to be called from within other functions

Parameters

logtocommence : bool
Whether to perform logging steps
parameters : dict
Dictionary containing parameters and their values, from function
func_name : str
Name of function within which this is called
Expand source code
def logger_function(logtocommence, parameters, func_name):
    """Function to log other functions, to be called from within other functions

    Parameters
    ----------
    logtocommence : bool
        Whether to perform logging steps
    parameters : dict
        Dictionary containing parameters and their values, from function
    func_name : str
        Name of function within which this is called
    """
    if not logtocommence:
        #Don't log if log=False
        return

    global log_filename
    #log parameter should be false by default on all. If true, will show up in kwargs

    #Get the log parameter value (None if the caller did not set it)
    log_file = parameters.pop('log', None)

    #Get current time and set up format for log messages
    curr_time = datetime.datetime.now()
    FORMAT = '%(asctime)s  %(message)s'

    #Check if we are starting a new logfile (only done during run of file_setup() or (currently non-existent) new_logfile() functions)
    if log_file == True and (func_name == 'file_setup' or func_name == 'new_logfile'):

        #Get the log_dir variable set as a file_setup() parameter, or default to None if not specified
        out_dir = parameters.pop('log_dir', None)
        if out_dir is None:
            #If output directory not specified, default to the input directory
            out_dir = parameters['well_data']

        #Timestamp for the filename (won't change; represents the start of logging)
        timestamp = curr_time.strftime('%Y-%m-%d_%H-%M-%S')
        log_filename = pathlib.Path(out_dir).joinpath(f"log_{timestamp}.txt")
        if 'verbose' in parameters.keys():
            print('Logging data to', log_filename)

        #Set up logging stream using logging module
        logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT, filemode='w')

        logging.info(f"{func_name} CALLED WITH PARAMETERS:\n\t {parameters}")
    elif log_file == True:
        #Run this for functions that aren't setting up the logging file.
        #Guard with globals(): log_filename is unbound until a setup function
        #runs, and the previous bare truthiness check raised NameError then.
        if 'log_filename' in globals() and log_filename:
            #Get the log stream and log this function's call with parameters
            logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT)
            logging.info(f"{func_name} CALLED WITH PARAMETERS: \n\t{parameters}")
        else:
            #If log file has not already been set up, set it up in the cwd
            timestamp = curr_time.strftime('%Y-%m-%d_%H-%M-%S')
            log_filename = f"log_{timestamp}.txt"

            #Now, get the log stream and log this function's call with parameters
            logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT)
            logging.info(f"{func_name} CALLED WITH PARAMETERS: \n\t{parameters}")
    return
def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool')

Function to merge lithologies and target booleans based on classifications

Parameters

well_data_df : pandas.DataFrame
Dataframe containing classified well data
targinterps_df : pandas.DataFrame
Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
target_col : str, default = 'TARGET'
Name of column in targinterps_df containing the target interpretations

target_class, default = 'bool' Whether the input column is using boolean values as its target indicator

Returns

df_targ : pandas.DataFrame
Dataframe containing merged lithologies/targets
Expand source code
def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool'):
    """Function to merge lithologies and target booleans based on classifications
    
    Parameters
    ----------
    well_data_df : pandas.DataFrame
        Dataframe containing classified well data
    targinterps_df : pandas.DataFrame
        Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
    interp_col : str, default = 'INTERPRETATION'
        Name of column in targinterps_df containing the lithology interpretations to merge on
    target_col : str, default = 'TARGET'
        Name of column in targinterps_df containing the target interpretations
    target_class, default = 'bool'
        Whether the input column is using boolean values as its target indicator
        
    Returns
    -------
    df_targ : pandas.DataFrame
        Dataframe containing merged lithologies/targets
    
    """    
    #Work on a copy so the caller's dataframe is not mutated as a side effect
    targinterps_df = targinterps_df.copy()

    #by default, use the boolean input
    if target_class == 'bool':
        #Anything that is not the string '1' (including NaN) becomes 0
        targinterps_df[target_col] = targinterps_df[target_col].where(targinterps_df[target_col] == '1', other='0').astype(int)
    else:
        #Sentinel codes: -1 = explicitly excluded ('DoNotUse'), -2 = no interpretation available
        targinterps_df[target_col] = targinterps_df[target_col].replace('DoNotUse', value=-1)
        targinterps_df[target_col] = targinterps_df[target_col].fillna(value=-2)
        #BUGFIX: the astype result was previously discarded; assign it so the dtype actually changes
        targinterps_df[target_col] = targinterps_df[target_col].astype(np.int8)

    #Left merge keeps every well record, matched on its LITHOLOGY classification
    df_targ = pd.merge(well_data_df, targinterps_df.set_index(interp_col), right_on=interp_col, left_on='LITHOLOGY', how='left')
    
    return df_targ
def merge_metadata(data_df, header_df, data_cols=None, header_cols=None, auto_pick_cols=False, drop_duplicate_cols=True, log=False, verbose=False, **kwargs)

Function to merge tables, intended for merging metadata table with data table

Parameters

data_df : pandas.DataFrame
"Left" dataframe, intended for this purpose to be dataframe with main data, but can be anything
header_df : pandas.DataFrame
"Right" dataframe, intended for this purpose to be dataframe with metadata, but can be anything
data_cols : list, optional
List of strings of column names, for columns to be included after join from "left" table (data table). If None, all columns are kept, by default None
header_cols : list, optional
List of strings of columns names, for columns to be included in merged table after merge from "right" table (metadata). If None, all columns are kept, by default None
auto_pick_cols : bool, default = False
Whether to autopick the columns from the metadata table. If True, the following column names are kept:['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK'], by default False
drop_duplicate_cols : bool, optional
If True, drops duplicate columns from the tables so that columns do not get renamed upon merge, by default True
log : bool, default = False
Whether to log inputs and outputs to log file.
**kwargs
kwargs that are passed directly to pd.merge(). By default, the 'on' and 'how' parameters are defined as on='API_NUMBER' and how='inner'

Returns

mergedTable : pandas.DataFrame
Merged dataframe
Expand source code
def merge_metadata(data_df, header_df, data_cols=None, header_cols=None, auto_pick_cols=False, drop_duplicate_cols=True, log=False, verbose=False, **kwargs):
    """Function to merge tables, intended for merging metadata table with data table

    Parameters
    ----------
    data_df : pandas.DataFrame
        "Left" dataframe, intended for this purpose to be dataframe with main data, but can be anything
    header_df : pandas.DataFrame
        "Right" dataframe, intended for this purpose to be dataframe with metadata, but can be anything
    data_cols : list, optional
        List of strings of column names, for columns to be included after join from "left" table (data table). If None, all columns are kept, by default None
    header_cols : list, optional
        List of strings of columns names, for columns to be included in merged table after merge from "right" table (metadata). If None, all columns are kept, by default None
    auto_pick_cols : bool, default = False
        Whether to autopick the columns from the metadata table. If True, the following column names are kept:['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK'], by default False
    drop_duplicate_cols : bool, optional
        If True, drops duplicate columns from the tables so that columns do not get renamed upon merge, by default True
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    verbose : bool, default = False
        Whether to print status information
    **kwargs
        kwargs that are passed directly to pd.merge(). By default, the 'on' and 'how' parameters are defined as on='API_NUMBER' and how='inner'

    Returns
    -------
    mergedTable : pandas.DataFrame
        Merged dataframe
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(merge_metadata, locals(), exclude_params=['data_df', 'header_df'])

    if header_df is None:
        #No metadata table: just subset (or pass through) the data table
        if data_cols is None:
            #If not specified, get all the cols
            data_cols = list(data_df.columns)
        elif header_cols is not None:
            data_cols = list(data_cols) + list(header_cols)

        mergedTable = data_df[data_cols]
    else:
        if auto_pick_cols:
            header_cols = ['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK']
            for c in header_df.columns:
                if c.startswith(('ELEV_', 'DEPTH')):
                    header_cols.append(c)
                if '_PROJ' in c:
                    header_cols.append(c)
            header_cols.append('geometry')
        elif header_cols is None:
            #BUGFIX: coerce to list; header_df.columns is a pandas Index, which
            #has no .pop() and would crash in the dedup loop below
            header_cols = list(header_df.columns)
        else:
            header_cols = list(header_cols)

        #If not specified, get all the cols
        if data_cols is None:
            data_cols = list(data_df.columns)

        #Defaults for on and how
        kwargs.setdefault('on', 'API_NUMBER')
        kwargs.setdefault('how', 'inner')

        #Drop duplicate columns so they don't get suffixed on merge (always keep the join key)
        if drop_duplicate_cols:
            keptCols = []
            for c in header_cols:
                if c in data_cols and c != kwargs['on']:
                    if verbose:
                        print('Removing {} (duplicate columns) from data.'.format(c))
                else:
                    keptCols.append(c)
            header_cols = keptCols

        leftTable_join = data_df[data_cols]
        rightTable_join = header_df[header_cols]

        mergedTable = pd.merge(left=leftTable_join, right=rightTable_join, **kwargs)
    return mergedTable
def read_dict(file, keytype='np')

Function to read a text file with a dictionary in it into a python dictionary

Parameters

file : str or pathlib.Path object
Filepath to the file of interest containing the dictionary text
keytype : str, optional
String indicating the datatypes used in the text, currently only 'np' is implemented, by default 'np'

Returns

dict
Dictionary translated from text file.
Expand source code
def read_dict(file, keytype='np'):
    """Function to read a text file with a dictionary in it into a python dictionary

    Parameters
    ----------
    file : str or pathlib.Path object
        Filepath to the file of interest containing the dictionary text
    keytype : str, optional
        String indicating the datatypes used in the text, currently only 'np' is implemented, by default 'np'

    Returns
    -------
    dict
        Dictionary translated from text file.
    """
    #Files with a .json extension are parsed and returned as-is, with no datatype conversion
    if pathlib.Path(file).suffix == '.json':
        with open(file, 'r') as jf:
            return json.load(jf)

    #Otherwise, read the raw text and parse it as JSON
    with open(file, 'r') as tf:
        parsed = json.loads(tf.read())

    #Convert string values (e.g. 'float64') into the numpy types they name
    if keytype == 'np':
        for key in parsed:
            parsed[key] = getattr(np, parsed[key])

    return parsed
def read_dictionary_terms(dict_file=None, id_col='ID', search_col='DESCRIPTION', definition_col='LITHOLOGY', class_flag_col='CLASS_FLAG', dictionary_type=None, class_flag=6, rem_extra_cols=True, verbose=False, log=False)

Function to read dictionary terms from file into pandas dataframe

Parameters

dict_file : str or pathlib.Path object, or list of these
File or list of files to be read
search_col : str, default = 'DESCRIPTION'
Name of column containing search terms (geologic formations)
definition_col : str, default = 'LITHOLOGY'
Name of column containing interpretations of search terms (lithologies)
dictionary_type : str or None, {None, 'exact', 'start', 'wildcard',}
Indicator of which kind of dictionary terms to be read in: None, 'exact', 'start', or 'wildcard' by default None. - If None, uses name of file to try to determine. If it cannot, it will default to using the classification flag from class_flag - If 'exact', will be used to search for exact matches to geologic descriptions - If 'start', will be used as with the .startswith() string method to find inexact matches to geologic descriptions - If 'wildcard', will be used to find any matching substring for inexact geologic matches
class_flag : int, default = 6
Classification flag to be used if dictionary_type is None and cannot be otherwise determined, by default 6
rem_extra_cols : bool, default = True
Whether to remove the extra columns from the input file after it is read in as a pandas dataframe, by default True
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

dict_terms : pandas.DataFrame
Pandas dataframe with formatting ready to be used in the classification steps of this package
Expand source code
def read_dictionary_terms(dict_file=None, id_col='ID', search_col='DESCRIPTION', definition_col='LITHOLOGY', class_flag_col='CLASS_FLAG', dictionary_type=None, class_flag=6, rem_extra_cols=True, verbose=False, log=False):
    """Function to read dictionary terms from file into pandas dataframe

    Parameters
    ----------
    dict_file : str or pathlib.Path object, or list of these
        File or list of files to be read. If None, the packaged 'LithologyDict_Exact' resource is used.
    id_col : str, default = 'ID'
        Name of column to use as the dataframe index, if present
    search_col : str, default = 'DESCRIPTION'
        Name of column containing search terms (geologic formations)
    definition_col : str, default = 'LITHOLOGY'
        Name of column containing interpretations of search terms (lithologies)
    class_flag_col : str, default = 'CLASS_FLAG'
        Name of column containing the classification flag
    dictionary_type : str or None, {None, 'exact', 'start', 'wildcard',}
        Indicator of which kind of dictionary terms to be read in: None, 'exact', 'start', or 'wildcard' by default None.
            - If None, uses name of file to try to determine. If it cannot, it will default to using the classification flag from class_flag
            - If 'exact', will be used to search for exact matches to geologic descriptions
            - If 'start', will be used as with the .startswith() string method to find inexact matches to geologic descriptions
            - If 'wildcard', will be used to find any matching substring for inexact geologic matches
    class_flag : int, default = 6
        Classification flag to be used if dictionary_type is None and cannot be otherwise determined, by default 6
    rem_extra_cols : bool, default = True
        Whether to remove the extra columns from the input file after it is read in as a pandas dataframe, by default True
    verbose : bool, default = False
        Whether to print status information
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    dict_terms : pandas.DataFrame or list of pandas.DataFrame
        Pandas dataframe (or list of them, if multiple files were given) with
        formatting ready to be used in the classification steps of this package
    """
    if dict_file is None:
        dict_file = w4h.get_resources()['LithologyDict_Exact']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_dictionary_terms, locals())

    #Read files into pandas dataframes
    dict_terms = []
    if isinstance(dict_file, (tuple, list)):
        #Coerce entries to pathlib.Path so .exists() also works for plain strings
        dict_file = [pathlib.Path(f) for f in dict_file]
        for i, f in enumerate(dict_file):
            if not f.exists():
                #Missing file: stand in an empty dataframe so indices stay aligned
                dict_terms.append(pd.DataFrame(columns=['ID', 'DESCRIPTION', 'LITHOLOGY', 'CLASS_FLAGS']))
            else:
                dict_terms.append(pd.read_csv(f))

            if id_col in dict_terms[i].columns:
                dict_terms[i].set_index(id_col, drop=True, inplace=True)
    else:
        dict_file = pathlib.Path(dict_file)
        if dict_file.exists() and dict_file.is_file():
            dict_terms.append(pd.read_csv(dict_file, low_memory=False))
            if id_col in dict_terms[-1].columns:
                dict_terms[-1].set_index(id_col, drop=True, inplace=True)
            dict_file = [dict_file]
        else:
            print(f'ERROR: dict_file ({dict_file}) does not exist.')
            #Create empty dataframe to return
            dict_terms = pd.DataFrame(columns=['ID', 'DESCRIPTION', 'LITHOLOGY', "CLASS_FLAGS"])
            return dict_terms

    #Keywords used to recognize each dictionary type from dictionary_type or the filename
    searchTermList = ['searchterm', 'search', 'exact']
    startTermList = ['startterm', 'start', 'startswith']
    wildcardTermList = ['wildcard', 'substring', ]

    #Recast all columns to datatypes of headerData to defined types
    dict_termDtypes = {search_col: str, definition_col: str, class_flag_col: np.uint8}

    if dictionary_type is None:
        dictionary_type = ''  #Allow string methods on this variable

    #Iterating, to allow reading of multiple dict files at once (also works with just one at a time)
    for i, d in enumerate(dict_terms):
        fname = str(dict_file[i]).lower()
        #1: exact classification match, 2: (not defined...ML?), 3: bedrock classification for obvious bedrock, 4: start term, 5: wildcard/substring, 6: Undefined
        if dictionary_type.lower() in searchTermList or (dictionary_type == '' and 'spec' in fname):
            d[class_flag_col] = 1
        elif dictionary_type.lower() in startTermList or (dictionary_type == '' and 'start' in fname):
            d[class_flag_col] = 4  #Start term classification flag
        elif dictionary_type.lower() in wildcardTermList or (dictionary_type == '' and 'wildcard' in fname):
            d[class_flag_col] = 5  #Wildcard term classification flag
        else:
            d[class_flag_col] = class_flag  #Custom classification flag, defined as argument

        #Rename columns so it is consistent through rest of code
        colRenames = {search_col: 'DESCRIPTION', definition_col: 'LITHOLOGY', class_flag_col: 'CLASS_FLAG'}
        d.rename(columns={k: v for k, v in colRenames.items() if k != v}, inplace=True)

        #Cast columns named in dict_termDtypes to their defined types
        #(distinct loop variable: the original shadowed the outer 'i' here)
        for col_idx in range(0, np.shape(d)[1]):
            if d.iloc[:, col_idx].name in dict_termDtypes:
                d.iloc[:, col_idx] = d.iloc[:, col_idx].astype(dict_termDtypes[d.iloc[:, col_idx].name])

        #Delete duplicate definitions (there can be duplicate definitions, which need to be deleted first)
        d.drop_duplicates(subset='DESCRIPTION', inplace=True)
        d.reset_index(inplace=True, drop=True)

        #Whether to remove extra columns that aren't needed from dataframe
        #BUGFIX: assign back into the list; rebinding the loop variable alone had no effect
        if rem_extra_cols:
            dict_terms[i] = d[['DESCRIPTION', 'LITHOLOGY', 'CLASS_FLAG']]

    #If only one file designated, convert it so it's no longer a list, but just a dataframe
    if len(dict_terms) == 1:
        dict_terms = dict_terms[0]

    return dict_terms
def read_grid(grid_path=None, grid_type='model', no_data_val_grid=0, use_service=False, study_area=None, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False, **kwargs)

Reads in grid

Parameters

grid_path : str or pathlib.Path, default=None
Path to a grid file
grid_type : str, default='model'
Sets what type of grid to load in
no_data_val_grid : int, default=0
Sets the no data value of the grid
use_service : str, default=False
Sets which service the function uses
study_area : geopandas.GeoDataFrame, default=None
Dataframe containing study area polygon
grid_crs : str, default=None
Sets crs to use if clipping to study area
log : bool, default = False
Whether to log results to log file, by default False

Returns

gridIN : xarray.DataArray
Returns grid
Expand source code
def read_grid(grid_path=None, grid_type='model', no_data_val_grid=0, use_service=False, study_area=None,  grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False, **kwargs):
    """Reads in grid

    Parameters
    ----------
    grid_path : str or pathlib.Path, default=None
        Path to a grid file
    grid_type : str, default='model'
        Sets what type of grid to load in
    no_data_val_grid : int, default=0
        Sets the no data value of the grid
    use_service : str or bool, default=False
        Which web service the function uses ('wcs' or 'wms'); if False, reads from grid_path instead
    study_area : geopandas.GeoDataFrame, default=None
        Dataframe containing study area polygon
    grid_crs : str, default=None
        Sets crs to use if clipping to study area
    output_crs : str, default='EPSG:4269'
        CRS the grid is reprojected into before being returned
    verbose : bool, default=False
        Whether to print status information
    log : bool, default = False
        Whether to log results to log file, by default False
    **kwargs
        Additional keyword arguments passed to the underlying read functions

    Returns
    -------
    gridIN : xarray.DataArray
        Returns grid
    
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_grid, locals(), exclude_params=['study_area'])

    if grid_type == 'model':
        rgrid = kwargs.get('read_grid', True)
        #BUGFIX: pass the no_data_val_grid argument through rather than hard-coding 0
        gridIN = read_model_grid(model_grid_path=grid_path, study_area=study_area, no_data_val_grid=no_data_val_grid, read_grid=rgrid, grid_crs=grid_crs, output_crs=output_crs, verbose=verbose)
    else:
        #Guard with isinstance so use_service=True doesn't crash on .lower()
        if not use_service:
            gridIN = rxr.open_rasterio(grid_path)
        elif isinstance(use_service, str) and use_service.lower() == 'wcs':
            #lidarURL: module-level service URL -- TODO confirm it is defined at import time
            gridIN = read_wcs(study_area, wcs_url=lidarURL, **kwargs)
        else:
            #'wms' or any other truthy value defaults to the WMS service
            gridIN = read_wms(study_area, wcs_url=lidarURL, **kwargs)

        if study_area is not None:
            if grid_crs is None:
                try:
                    grid_crs = pyproj.CRS.from_wkt(gridIN.rio.crs.to_wkt())
                except Exception:
                    #Fall back to the packaged ISWS CRS definition
                    iswsCRS = pyproj.CRS.from_json_dict(w4h.read_dict(w4h.get_resources()['ISWS_CRS']))
                    #rio.write_crs returns a new object; assign it (result was previously discarded)
                    gridIN = gridIN.rio.write_crs(iswsCRS)
            elif isinstance(grid_crs, str) and grid_crs.lower() == 'isws':
                iswsCRS = pyproj.CRS.from_json_dict(w4h.read_dict(w4h.get_resources()['ISWS_CRS']))
                gridIN = gridIN.rio.write_crs(iswsCRS)

            #Reproject both to the output CRS, then clip the grid to the study area
            study_area = study_area.to_crs(output_crs)
            gridIN = gridIN.rio.reproject(output_crs)
            gridIN = grid2study_area(study_area=study_area, grid=gridIN, output_crs=output_crs,)
        else:
            gridIN = gridIN.rio.reproject(output_crs)
            if 'band' in gridIN.dims:
                gridIN = gridIN.sel(band=1)

        try:
            no_data_val_grid = gridIN.attrs['_FillValue']  #Extract from dataset itself
        except KeyError:
            pass  #No fill value recorded in the dataset; keep the argument value

        gridIN = gridIN.where(gridIN != no_data_val_grid, other=np.nan)  #Replace no data values with NaNs

    return gridIN
def read_lithologies(lith_file=None, interp_col='LITHOLOGY', target_col='CODE', use_cols=None, verbose=False, log=False)

Function to read lithology file into pandas dataframe

Parameters

lith_file : str or pathlib.Path object, default = None
Filename of lithology file. If None, default is contained within repository, by default None
interp_col : str, default = 'LITHOLOGY'
Column to used to match interpretations
target_col : str, default = 'CODE'
Column to be used as target code
use_cols : list, default = None
Which columns to use when reading in dataframe. If None, defaults to ['LITHOLOGY', 'CODE'].
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

pandas.DataFrame
Pandas dataframe with lithology information
Expand source code
def read_lithologies(lith_file=None, interp_col='LITHOLOGY', target_col='CODE', use_cols=None, verbose=False,  log=False):
    """Function to read lithology file into pandas dataframe

    Parameters
    ----------
    lith_file : str or pathlib.Path object, default = None
        Filename of lithology file. If None, default is contained within repository, by default None
    interp_col : str, default = 'LITHOLOGY'
        Column to used to match interpretations
    target_col : str, default = 'CODE'
        Column to be used as target code
    use_cols : list, default = None
        Which columns to use when reading in dataframe. If None, defaults to [interp_col, target_col].
    verbose : bool, default = False
        Whether to print status information
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe with lithology information; columns renamed to
        'INTERPRETATION' and 'TARGET' for use downstream (e.g. merge_lithologies)
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_lithologies, locals())

    if lith_file is None:
        #Fall back to the lithology interpretation file packaged with w4h
        import w4h
        lith_file = w4h.get_resources()['LithInterps_FineCoarse']

    if not isinstance(lith_file, pathlib.PurePath):
        lith_file = pathlib.Path(lith_file)

    if use_cols is None:
        use_cols = [interp_col, target_col]

    lithoDF = pd.read_csv(lith_file, usecols=use_cols)

    #Standardize the column names expected by the rest of the package
    lithoDF = lithoDF.rename(columns={interp_col: 'INTERPRETATION', target_col: 'TARGET'})

    return lithoDF
def read_model_grid(model_grid_path, study_area=None, no_data_val_grid=0, read_grid=True, node_byspace=True, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False)

Reads in model grid to xarray data array

Parameters

grid_path : str
Path to model grid file
study_area : geopandas.GeoDataFrame, default=None
Dataframe containing study area polygon
no_data_val_grid : int, default=0
value assigned to areas with no data
read_grid : bool, default=True
Whether to read an existing grid (True) or create a new grid over the study area (False)
node_byspace : bool, default=True
If True, the created grid uses a fixed node spacing; otherwise a fixed node count
output_crs : str, default='EPSG:4269'
Inputs study area crs
grid_crs : str, default=None
Inputs grid crs
log : bool, default = False
Whether to log results to log file, by default False

Returns

modelGrid : xarray.DataArray
Data array containing model grid
Expand source code
def read_model_grid(model_grid_path, study_area=None, no_data_val_grid=0, read_grid=True, node_byspace=True, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False):
    """Reads in model grid to xarray data array

    Parameters
    ----------
    model_grid_path : str
        Path to model grid file
    study_area : geopandas.GeoDataFrame, default=None
        Dataframe containing study area polygon
    no_data_val_grid : int, default=0
        value assigned to areas with no data
    read_grid : bool, default=True
        Whether to read an existing grid (True) or create a new grid over the study area (False)
    node_byspace : bool, default=True
        If True, build the created grid from a fixed node spacing; otherwise from a fixed node count
    grid_crs : str, default=None
        Inputs grid crs
    output_crs : str, default='EPSG:4269'
        Inputs study area crs
    verbose : bool, default=False
        Whether to print status information
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    modelGrid : xarray.DataArray
        Data array containing model grid

    Raises
    ------
    ValueError
        If neither model_grid_path nor study_area is defined
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_model_grid, locals(), exclude_params='study_area')

    if read_grid and model_grid_path is not None:
        modelGridIN = rxr.open_rasterio(model_grid_path)

        #Establish the grid's CRS, falling back to the packaged ISWS CRS definition
        #NOTE: rio.write_crs returns a new object, so its result must be assigned
        if grid_crs is None:
            try:
                grid_crs = modelGridIN.spatial_ref.crs_wkt
            except Exception:
                iswsCRSPath = w4h.get_resources()['ISWS_CRS']
                grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
                modelGridIN = modelGridIN.rio.write_crs(grid_crs)
        elif isinstance(grid_crs, str) and grid_crs.lower() == 'isws':
            iswsCRSPath = w4h.get_resources()['ISWS_CRS']
            grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
            modelGridIN = modelGridIN.rio.write_crs(grid_crs)
        elif isinstance(grid_crs, pathlib.PurePath) or (isinstance(grid_crs, str) and pathlib.Path(grid_crs).exists()):
            iswsCRSPath = w4h.get_resources()['ISWS_CRS']
            grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
            modelGridIN = modelGridIN.rio.write_crs(grid_crs)
        else:
            warnings.warn(f'CRS Specification for grid is {grid_crs}, but this cannot be written to the grid')

        modelGridIN = modelGridIN.rio.reproject(output_crs)

        if study_area is not None:
            study_area = study_area.to_crs(output_crs)
            modelGrid = grid2study_area(study_area=study_area, grid=modelGridIN, output_crs=output_crs)
        else:
            modelGrid = modelGridIN

        try:
            noDataVal = float(modelGrid.attrs['_FillValue'])  #Extract from dataset itself
        except (KeyError, TypeError, ValueError):
            noDataVal = no_data_val_grid

        modelGrid = modelGrid.where(modelGrid != noDataVal, other=np.nan)  #Replace no data values with NaNs
        #BUGFIX: 'modelGrid == np.nan' is always False, which turned every cell
        #(including the NaNs just created) into 1; use isnan so NaNs stay NaN
        modelGrid = modelGrid.where(np.isnan(modelGrid), other=1)  #Replace all other values with 1
        #NOTE(review): return value discarded as in the original; confirm 'inplace' is honored by rio.reproject
        modelGrid.rio.reproject(grid_crs, inplace=True)
    elif model_grid_path is None and study_area is None:
        #BUGFIX: previously this only printed when verbose and then hit a NameError
        #on the final return; fail explicitly instead
        raise ValueError("Either model_grid_path or study_area must be defined.")
    else:
        #Build a synthetic grid of ones covering the study area bounds
        #BUGFIX: iswsCRSPath was referenced before assignment in this branch
        iswsCRSPath = w4h.get_resources()['ISWS_CRS']
        spatRefDict = w4h.read_dict(iswsCRSPath, keytype=None)

        saExtent = study_area.total_bounds

        startX = saExtent[0]  #Starting X Coordinate
        startY = saExtent[1]  #Starting Y Coordinate

        endX = saExtent[2]
        endY = saExtent[3]

        if node_byspace:
            xSpacing = 625  #X Node spacing
            ySpacing = xSpacing  #Y Node spacing

            x = np.arange(startX, endX, xSpacing)
            y = np.arange(startY, endY, ySpacing)
        else:
            xNodes = 100  #Number of X Nodes
            yNodes = 100  #Number of Y Nodes

            x = np.linspace(startX, endX, num=xNodes)
            y = np.linspace(startY, endY, num=yNodes)

        xx, yy = np.meshgrid(x, y)
        zz = np.ones_like(xx).transpose()

        yIn = np.flipud(y)  #Rasters index y from the top down

        coords = {'x': x, 'y': yIn, 'spatial_ref': 0}
        dims = {'x': x, 'y': yIn}

        modelGrid = xr.DataArray(data=zz, coords=coords, attrs={'_FillValue': no_data_val_grid}, dims=dims)
        modelGrid.spatial_ref.attrs['spatial_ref'] = {}
        if grid_crs is None or (isinstance(grid_crs, str) and grid_crs.lower() == 'isws'):
            for k in spatRefDict:
                modelGrid.spatial_ref.attrs[k] = spatRefDict[k]
    return modelGrid
def read_raw_csv(data_filepath, metadata_filepath, data_cols=None, metadata_cols=None, xcol='LONGITUDE', ycol='LATITUDE', well_key='API_NUMBER', encoding='latin-1', verbose=False, log=False, **read_csv_kwargs)

Easy function to read raw .txt files output from (for example), an Access database

Parameters

data_filepath : str
Filename of the file containing data, including the extension.
metadata_filepath : str
Filename of the file containing metadata, including the extension.
data_cols : list, default = None
List with strings with names of columns from txt file to keep after reading. If None, ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"], by default None.
metadata_cols : list, default = None
List with strings with names of columns from txt file to keep after reading. If None, ['API_NUMBER',"TOTAL_DEPTH","SECTION","TWP","TDIR","RNG","RDIR","MERIDIAN","QUARTERS","ELEVATION","ELEVREF","COUNTY_CODE","LATITUDE","LONGITUDE","ELEVSOURCE"], by default None
xcol : str, default = 'LONGITUDE'
Name of column in metadata file indicating the x-location of the well, by default 'LONGITUDE'
ycol : str, default = 'LATITUDE'
Name of the column in metadata file indicating the y-location of the well, by default 'LATITUDE'
well_key : str, default = 'API_NUMBER'
Name of the column with the key/identifier that will be used to merge data later, by default 'API_NUMBER'
encoding : str, default = 'latin-1'
Encoding of the data in the input files, by default 'latin-1'
verbose : bool, default = False
Whether to print the number of rows in the input columns, by default False
log : bool, default = False
Whether to log inputs and outputs to log file.
**read_csv_kwargs
**kwargs that get passed to pd.read_csv()

Returns

(pandas.DataFrame, pandas.DataFrame/None) Tuple/list with two pandas dataframes: (well_data, metadata) metadata is None if only well_data is used

Expand source code
def read_raw_csv(data_filepath, metadata_filepath, data_cols=None, metadata_cols=None, xcol='LONGITUDE', ycol='LATITUDE', well_key='API_NUMBER', encoding='latin-1', verbose=False, log=False, **read_csv_kwargs):
    """Easy function to read raw .txt files output from (for example), an Access database

    Parameters
    ----------
    data_filepath : str or pandas.DataFrame
        Filename of the file containing data, including the extension.
        A pandas DataFrame/GeoDataFrame may also be passed and is used directly.
    metadata_filepath : str, pandas.DataFrame, or None
        Filename of the file containing metadata, including the extension.
        A pandas DataFrame/GeoDataFrame may also be passed and is used directly.
        If None, no metadata is used and None is returned in its place.
    data_cols : list or 'auto', default = None
        List with strings with names of columns from txt file to keep after reading.
        If None, all columns are kept. If 'auto', uses
        ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"], by default None.
    metadata_cols : list or 'auto', default = None
        List with strings with names of columns from txt file to keep after reading.
        If None, all columns are kept. If 'auto', uses
        ['API_NUMBER',"TOTAL_DEPTH","SECTION","TWP","TDIR","RNG","RDIR","MERIDIAN","QUARTERS","ELEVATION","ELEVREF","COUNTY_CODE","LATITUDE","LONGITUDE","ELEVSOURCE"], by default None
    xcol : str, default = 'LONGITUDE'
        Name of column in metadata file indicating the x-location of the well, by default 'LONGITUDE'
    ycol : str, default = 'LATITUDE'
        Name of the column in metadata file indicating the y-location of the well, by default 'LATITUDE'
    well_key : str, default = 'API_NUMBER'
        Name of the column with the key/identifier that will be used to merge data later, by default 'API_NUMBER'
    encoding : str, default = 'latin-1'
        Encoding of the data in the input files, by default 'latin-1'
    verbose : bool, default = False
        Whether to print the number of rows in the input columns, by default False
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    **read_csv_kwargs
        **kwargs that get passed to pd.read_csv()

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame/None)
        Tuple/list with two pandas dataframes: (well_data, metadata) metadata is None if only well_data is used
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_raw_csv, locals())

    # Use input directly if it is already a (Geo)DataFrame, otherwise read it in as dataframe.
    # BUG FIX: the original condition (`not isinstance(x, pd.DataFrame) or isinstance(x, gpd.GeoDataFrame)`)
    # passed GeoDataFrames to pd.read_csv() and left downholeDataIN undefined for plain DataFrames.
    if isinstance(data_filepath, (pd.DataFrame, gpd.GeoDataFrame)):
        downholeDataIN = data_filepath
    else:
        downholeDataIN = pd.read_csv(data_filepath, sep=',', header='infer', encoding=encoding, **read_csv_kwargs)

    # Select which data columns to keep
    if data_cols is None:
        data_useCols = downholeDataIN.columns
    elif data_cols == 'auto':
        data_useCols = ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"]
    else:
        data_useCols = data_cols
    downholeDataIN = downholeDataIN[data_useCols]

    # Read (or pass through) the metadata table, if any
    if metadata_filepath is None:
        headerDataIN = None
    elif isinstance(metadata_filepath, (pd.DataFrame, gpd.GeoDataFrame)):
        # BUG FIX: a DataFrame passed here was previously discarded (set to None)
        headerDataIN = metadata_filepath
    else:
        headerDataIN = pd.read_csv(metadata_filepath, sep=',', header='infer', encoding=encoding, **read_csv_kwargs)

    # Select which metadata columns to keep
    if metadata_cols is None and headerDataIN is not None:
        metadata_useCols = headerDataIN.columns
    elif metadata_cols == 'auto':
        metadata_useCols = ['API_NUMBER',"TOTAL_DEPTH","SECTION","TWP","TDIR","RNG","RDIR","MERIDIAN","QUARTERS","ELEVATION","ELEVREF","COUNTY_CODE","LATITUDE","LONGITUDE","ELEVSOURCE"]
    else:
        metadata_useCols = metadata_cols

    if headerDataIN is not None:
        headerDataIN = headerDataIN[metadata_useCols]

    #Drop data with no API
    downholeDataIN = downholeDataIN.dropna(subset=[well_key])
    if headerDataIN is not None:
        headerDataIN = headerDataIN.dropna(subset=[well_key]) #Drop metadata with no API

        #Drop data with no or missing location information
        headerDataIN = headerDataIN.dropna(subset=[ycol])
        headerDataIN = headerDataIN.dropna(subset=[xcol])

        #Reset index so index goes from 0 in numerical/integer order
        headerDataIN.reset_index(inplace=True, drop=True)
    else:
        #***UPDATE: Need to make sure these columns exist in this case***
        #Drop data with no or missing location information
        downholeDataIN = downholeDataIN.dropna(subset=[ycol])
        downholeDataIN = downholeDataIN.dropna(subset=[xcol])
    downholeDataIN.reset_index(inplace=True, drop=True)

    #Print outputs to terminal, if designated
    if verbose:
        print('Data file has ' + str(downholeDataIN.shape[0])+' valid well records.')
        if headerDataIN is not None:
            print("Metadata file has "+str(headerDataIN.shape[0])+" unique wells with valid location information.")

    return downholeDataIN, headerDataIN
def read_study_area(study_area=None, output_crs='EPSG:4269', buffer=None, return_original=False, log=False, verbose=False, **read_file_kwargs)

Read study area geospatial file into geopandas

Parameters

study_area : str, pathlib.Path, geopandas.GeoDataFrame, or shapely.Geometry
Filepath to any geospatial file readable by geopandas. Polygon is best, but may work with other types if extent is correct.
output_crs : str, tuple, dict, default='EPSG:4269'
CRS designation readable by geopandas/pyproj; the study area is reprojected to this CRS.
buffer : None or numeric, default=None
If None, no buffer created. If a numeric value is given (float or int, for example), a buffer will be created at that distance in the unit of the output_crs.
return_original : bool, default=False
Whether to return the (reprojected) study area as well as the (reprojected) buffered study area. Study area is only used for clipping data, so usually return_original=False is sufficient.
log : bool, default = False
Whether to log results to log file, by default False
verbose : bool, default=False
Whether to print status and results to terminal

Returns

studyAreaIN : geopandas dataframe
Geopandas dataframe with polygon geometry.
Expand source code
def read_study_area(study_area=None, output_crs='EPSG:4269', buffer=None, return_original=False, log=False, verbose=False, **read_file_kwargs):
    """Read study area geospatial file into geopandas

    Parameters
    ----------
    study_area : str, pathlib.Path, geopandas.GeoDataFrame, or shapely.Geometry
        Filepath to any geospatial file readable by geopandas. 
        Polygon is best, but may work with other types if extent is correct.
        If None, the sample study area from w4h.get_resources() is used.
    output_crs : str, tuple, dict, default='EPSG:4269'
        CRS designation readable by geopandas/pyproj; the study area is reprojected to this CRS.
    buffer : None or numeric, default=None
        If None, no buffer created. If a numeric value is given (float or int, for example), a buffer will be created at that distance in the unit of the output_crs.
    return_original : bool, default=False
        Whether to return the (reprojected) study area as well as the (reprojected) buffered study area. Study area is only used for clipping data, so usually return_original=False is sufficient.
    log : bool, default = False
        Whether to log results to log file, by default False
    verbose : bool, default=False
        Whether to print status and results to terminal
    **read_file_kwargs
        Keyword arguments passed to geopandas.read_file(); 'crs' is also read from here
        when study_area is a bare shapely Geometry.

    Returns
    -------
    studyAreaIN : geopandas dataframe
        Geopandas dataframe with polygon geometry. If return_original=True, a tuple
        (buffered_study_area, unbuffered_study_area) is returned instead.
    """
    # Fall back to the packaged sample study area
    if study_area is None:
        study_area = w4h.get_resources()['study_area']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_study_area, locals())

    if isinstance(study_area, (gpd.GeoDataFrame, gpd.GeoSeries)):
        studyAreaIN = study_area
    elif isinstance(study_area, shapely.Geometry):
        if 'crs' in read_file_kwargs:
            crs = read_file_kwargs['crs']
        else:
            # BUG FIX: this warning was not an f-string, so {output_crs} printed literally
            warnings.warn(f'A shapely Geometry object was read in as the study area, but no crs specified. Using CRS specified in output_crs: {output_crs}.\
                          You can specify the crs as a keyword argument in the read_study_area() function')
            crs = output_crs
        studyAreaIN = gpd.GeoDataFrame(index=[0], crs=crs, geometry=[study_area])
    else:
        studyAreaIN = gpd.read_file(study_area, **read_file_kwargs)
    studyAreaIN.to_crs(output_crs, inplace=True)

    # BUG FIX: keep an unbuffered copy unconditionally so return_original works
    # even when no buffer is applied (previously raised NameError in that case)
    studyAreaNoBuffer = studyAreaIN

    # Create a buffered study area for clipping
    if buffer is not None:
        studyAreaIN = studyAreaIN.buffer(distance=buffer)

        if verbose:
            print('\tBuffer applied.')

    if verbose:
        print("\tStudy area read.")

    if return_original:
        return studyAreaIN, studyAreaNoBuffer
    return studyAreaIN
def read_wcs(study_area, wcs_url='https://data.isgs.illinois.edu/arcgis/services/Elevation/IL_Statewide_Lidar_DEM_WGS/ImageServer/WCSServer?request=GetCapabilities&service=WCS', res_x=30, res_y=30, verbose=False, log=False, **kwargs)

Reads a WebCoverageService from a url and returns a rioxarray dataset containing it.

Parameters

study_area : geopandas.GeoDataFrame
Dataframe containing study area polygon
wcs_url : str, default=lidarURL
 
Represents the url for the WCS
res_x : int, default=30
Sets resolution for x axis
res_y : int, default=30
Sets resolution for y axis
log : bool, default = False
Whether to log results to log file, by default False
**kwargs
 

Returns

wcsData_rxr : xarray.DataArray
A xarray dataarray holding the image from the WebCoverageService
Expand source code
def read_wcs(study_area, wcs_url=lidarURL, res_x=30, res_y=30, verbose=False, log=False, **kwargs):
    """Reads a WebCoverageService from a url and returns a rioxarray dataset containing it.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        Dataframe containing study area polygon
    wcs_url : str, default=lidarURL
        Represents the url for the WCS
    res_x : int, default=30
        Sets resolution for x axis
    res_y : int, default=30
        Sets resolution for y axis
    verbose : bool, default=False
        Whether to print status and results to terminal
    log : bool, default = False
        Whether to log results to log file, by default False
    **kwargs
        Currently unused; retained for backward compatibility.

    Returns
    -------
    wcsData_rxr : xarray.DataArray
        A xarray dataarray holding the image from the WebCoverageService
    """
    #Drawn largely from: https://git.wur.nl/isric/soilgrids/soilgrids.notebooks/-/blob/master/01-WCS-basics.ipynb
    if study_area is None:
        print('ERROR: study_area must be specified to use read_wcs (currently set to {})'.format(study_area))
        return

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(read_wcs, locals(), exclude_params=['study_area'])

    width_in = ''
    height_in = ''

    #Create coverage object
    my_wcs = WebCoverageService(wcs_url, version='1.0.0')
    # NOTE(review): hard-coded to this layer id; other wcs_urls may not expose it — confirm
    dataID = 'IL_Statewide_Lidar_DEM'
    data = my_wcs.contents[dataID]
    dBBox = data.boundingboxes

    # Reproject study area into the native CRS of the coverage before taking bounds
    study_area = study_area.to_crs(data.boundingboxes[0]['nativeSrs'])
    saBBox = study_area.total_bounds

    #In case study area bounding box goes outside data bounding box, use data bounding box values
    # NOTE(review): the comparisons below pick the OUTER of the two boxes for each edge,
    # which appears to contradict this comment — behavior preserved; verify intent
    newBBox = []
    for i, c in enumerate(dBBox[0]['bbox']):
        if i == 0 or i == 2:
            if saBBox[i] < c:
                newBBox.append(saBBox[i])
            else:
                newBBox.append(c)
        else:
            if saBBox[i] > c:
                newBBox.append(saBBox[i])
            else:
                newBBox.append(c)

    #Recalculate resolution if it is too fine to read in
    #Start by getting the area of the study area bounding box
    saWidth = saBBox[2]-saBBox[0]
    saHeight = saBBox[3]-saBBox[1]
    saBBoxAreaM = saWidth*saHeight

    if saBBoxAreaM/(res_x*res_y) > (4100*15000)*0.457194: #What I think might be the max download size?
        print("Resolution inputs overriden, file request too large.")
        res_x = str(round(saWidth/2500, 2))

        width_in  = str(int(saWidth/float(res_x)))
        height_in = str(int(saHeight/float(res_x)))

        # BUG FIX: height_in is a string; cast to float before dividing
        # (previously raised TypeError whenever this branch ran)
        res_y = str(round(saHeight/float(height_in), 2))

        print('New resolution is: '+res_x+'m_x X '+res_y+'m_y' )
        print('Dataset size: '+width_in+' pixels_x X '+height_in+' pixels_y')

    bBox = tuple(newBBox)
    dataCRS = 'EPSG:3857'

    #Format WCS request using owslib
    response = my_wcs.getCoverage(
        identifier=my_wcs[dataID].id,
        crs=dataCRS,#'urn:ogc:def:crs:EPSG::26716',
        bbox=bBox,
        resx=res_x,
        resy=res_y,
        timeout=60,
        #width = width_in, height=height_in,
        format='GeoTIFF')

    #Read the in-memory GeoTIFF response into a rioxarray DataArray
    with MemoryFile(response) as memfile:
        with memfile.open() as dataset:
            wcsData_rxr = rxr.open_rasterio(dataset)

    return wcsData_rxr
def read_wms(study_area, layer_name='IL_Statewide_Lidar_DEM_WGS:None', wms_url='https://data.isgs.illinois.edu/arcgis/services/Elevation/IL_Statewide_Lidar_DEM_WGS/ImageServer/WCSServer?request=GetCapabilities&service=WCS', srs='EPSG:3857', clip_to_studyarea=True, bbox=[-9889002.6155, 5134541.069716, -9737541.607038, 5239029.6274], res_x=30, res_y=30, size_x=512, size_y=512, format='image/tiff', verbose=False, log=False, **kwargs)

Reads a WebMapService from a url and returns a rioxarray dataset containing it.

Parameters

study_area : geopandas.GeoDataFrame
Dataframe containing study area polygon
layer_name : str, default='IL_Statewide_Lidar_DEM_WGS:None'
Represents the layer name in the WMS
wms_url : str, default=lidarURL
Represents the url for the WMS
srs : str, default='EPSG:3857'
Sets the srs
clip_to_studyarea : bool, default=True
Whether to clip to study area or not
res_x : int, default=30
Sets resolution for x axis
res_y : int, default=30
Sets resolution for y axis
size_x : int, default=512
Sets width of result
size_y : int, default=512
Sets height of result
log : bool, default = False
Whether to log results to log file, by default False

Returns

wmsData_rxr : xarray.DataArray
Holds the image from the WebMapService
Expand source code
def read_wms(study_area, layer_name='IL_Statewide_Lidar_DEM_WGS:None', wms_url=lidarURL, srs='EPSG:3857', 
             clip_to_studyarea=True, bbox=[-9889002.615500,5134541.069716,-9737541.607038,5239029.627400],
             res_x=30, res_y=30, size_x=512, size_y=512, 
             format='image/tiff', verbose=False, log=False, **kwargs):
    """
    Reads a WebMapService from a url and returns a rioxarray dataset containing it.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        Dataframe containing study area polygon
    layer_name : str, default='IL_Statewide_Lidar_DEM_WGS:None'
        Represents the layer name in the WMS
    wms_url : str, default=lidarURL
        Represents the url for the WMS
    srs : str, default='EPSG:3857'
        Sets the srs
    clip_to_studyarea : bool, default=True
        Whether to clip to study area or not
    bbox : list, default=[-9889002.615500, 5134541.069716, -9737541.607038, 5239029.627400]
        Bounding box [minx, miny, maxx, maxy] used for the request when clip_to_studyarea=False
    res_x : int, default=30
        Sets resolution for x axis
    res_y : int, default=30
        Sets resolution for y axis
    size_x : int, default=512
        Sets width of result
    size_y : int, default=512
        Sets height of result
    format : str, default='image/tiff'
        Image format requested from the WMS
    verbose : bool, default=False
        Whether to print status and results to terminal
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    wmsData_rxr : xarray.DataArray
        Holds the image from the WebMapService
    """
    # Guard: a study area is required to build the request bounding box
    if study_area is None:
        print('ERROR: study_area must be specified to use read_wms (currently set to {})'.format(study_area))
        return
    
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_wms, locals(), exclude_params=['study_area'])
    from owslib.wms import WebMapService
    # Define WMS endpoint URL
    # NOTE(review): wms_url is a named parameter, so it can never appear in **kwargs;
    # this check (and the similar named-parameter checks below) never fires — confirm intent
    if 'wms_url' in kwargs:
        wms_url = kwargs['wms_url']
    else:
        wms_url = wms_url

    # Create WMS connection object
    wms = WebMapService(wms_url)
    # Print available layers
    #print(wms.contents)

    # Select desired layer
    if 'layer_name' in kwargs:
        layer = kwargs['layer_name']
    else:
        layer = layer_name
    
    data = wms.contents#[layer]
    if 'srs' in kwargs:
        studyArea_proj = study_area.to_crs(kwargs['srs'])
        saBBox = studyArea_proj.total_bounds
    else:
        studyArea_proj = study_area.to_crs(srs)
    
    # Study-area bounds in the requested spatial reference system
    saBBox = studyArea_proj.total_bounds

    if layer == 'IL_Statewide_Lidar_DEM_WGS:None':
        dBBox = data['0'].boundingBox #Is this an error?

        # Build a polygon of the data's bounding box so it can be reprojected
        gpdDict = {'Label': ['Surf Data Box'], 'geometry': [shapely.geometry.Polygon(((dBBox[0], dBBox[1]), (dBBox[0], dBBox[3]), (dBBox[2], dBBox[3]), (dBBox[2], dBBox[1]), (dBBox[0], dBBox[1])))]}
        dBBoxGDF = gpd.GeoDataFrame(gpdDict, crs=dBBox[4])
        # NOTE(review): to_crs() is not in-place; this return value is discarded — confirm intended
        dBBoxGDF.to_crs(srs)

        #In case study area bounding box goes outside data bounding box, use data bounding box values
        # NOTE(review): newBBox is built here but never used below — verify
        newBBox = []
        for i,c in enumerate(dBBox):
            if type(c) is str:
                pass
            elif i == 0 or i==2:
                if saBBox[i] < c:
                    newBBox.append(saBBox[i])
                else:
                    newBBox.append(c)
            else:
                if saBBox[i] > c:
                    newBBox.append(saBBox[i])
                else:
                    newBBox.append(c)

    # NOTE(review): saWidth/saHeight are computed but not used below — verify
    saWidth = saBBox[2]-saBBox[0]
    saHeight = saBBox[3]-saBBox[1]    
    # Check kwargs for rest of parameters
    if 'size_x' in kwargs:
        size_x = kwargs['size_x']
    if 'size_y' in kwargs:
        size_y = kwargs['size_y']
    if 'format' in kwargs:
        format = kwargs['format']
    if 'clip_to_studyarea' in kwargs:
        clip_to_studyarea = kwargs['clip_to_studyarea']
   
    #get the wms
    if clip_to_studyarea:
        img = wms.getmap(layers=[layer], srs=srs, bbox=saBBox, size=(size_x, size_y), format=format, transparent=True, timeout=60)        
    else:
        img = wms.getmap(layers=[layer], srs=srs, bbox=bbox, size=(size_x, size_y), format=format, transparent=True, timeout=60)

    #Save wms in memory to a raster dataset
    with MemoryFile(img) as memfile:
        with memfile.open() as dataset:
            wmsData_rxr = rxr.open_rasterio(dataset)

    #if clip_to_studyarea:
    #    wmsData_rxr = wmsData_rxr.sel(x=slice(saBBox[0], saBBox[2]), y=slice(saBBox[3], saBBox[1]))#.sel(band=1)

    return wmsData_rxr
def read_xyz(xyzpath, datatypes=None, verbose=False, log=False)

Function to read file containing xyz data (elevation/location)

Parameters

xyzpath : str or pathlib.Path
Filepath of the xyz file, including extension
datatypes : dict, default = None
Dictionary containing the datatypes for the columns in the xyz file. If None, {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64}, by default None
verbose : bool, default = False
Whether to print the number of xyz records to the terminal, by default False
log : bool, default = False
Whether to log inputs and outputs to log file.

Returns

pandas.DataFrame
Pandas dataframe containing the elevation and location data
Expand source code
def read_xyz(xyzpath, datatypes=None, verbose=False, log=False):
    """Function to read file containing xyz data (elevation/location)

    Parameters
    ----------
    xyzpath : str or pathlib.Path
        Filepath of the xyz file, including extension
    datatypes : dict, default = None
        Dictionary containing the datatypes for the columns in the xyz file. If None, {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64}, by default None
    verbose : bool, default = False
        Whether to print the number of xyz records to the terminal, by default False
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe containing the elevation and location data
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_xyz, locals())

    # BUG FIX: a user-supplied datatypes dict was previously ignored — the dtype
    # variable was only bound in the None branch, raising NameError otherwise
    if datatypes is None:
        datatypes = {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64}

    xyzDataIN = pd.read_csv(xyzpath, sep=',', header='infer', dtype=datatypes, index_col='ID')

    if verbose:
        print('XYZ file has ' + str(xyzDataIN.shape[0])+' records with elevation and location.')
    return xyzDataIN
def remerge_data(classifieddf, searchdf)

Function to merge newly-classified (or not) and previously classified data

Parameters

classifieddf : pandas.DataFrame
Dataframe that had already been classified previously
searchdf : pandas.DataFrame
Dataframe with new classifications

Returns

remergeDF : pandas.DataFrame
Dataframe containing all the data, merged back together
Expand source code
def remerge_data(classifieddf, searchdf):
    """Recombine previously-classified records with newly-classified (or unclassified) ones.

    Parameters
    ----------
    classifieddf : pandas.DataFrame
        Dataframe that had already been classified previously
    searchdf : pandas.DataFrame
        Dataframe with new classifications

    Returns
    -------
    remergeDF : pandas.DataFrame
        Dataframe containing all the data, merged back together
    """
    # Inner join keeps only the columns shared by both tables; sorting by the
    # original index restores the original record order
    combined = pd.concat([classifieddf, searchdf], join='inner')
    return combined.sort_index()
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False)

Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)

Parameters

df_with_depth : pandas.DataFrame
Pandas dataframe containing the well records and descriptions for each interval
top_col : str, default='TOP'
The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
bottom_col : str, default='BOTTOM'
The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
depth_type : str, {'depth', 'elevation'}
Whether the table is organized by depth or elevation. If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
verbose : bool, default = False
Whether to print results to the terminal, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.Dataframe
Pandas dataframe with the records removed where the top is indicated to be below the bottom.
Expand source code
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Drop well-record intervals whose depth information is obviously bad (the bottom of the interval is nearer to the surface than the top).

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Pandas dataframe containing the well records and descriptions for each interval
    top_col : str, default='TOP'
        The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
    bottom_col : str, default='BOTTOM'
        The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
    depth_type : str, {'depth', 'elevation'}
        Whether the table is organized by depth or elevation. If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
    verbose : bool, default = False
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.Dataframe
        Pandas dataframe with the records removed where the top is indicated to be below the bottom.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    # Thickness sign convention depends on whether values are depths (top < bottom)
    # or elevations (top > bottom)
    if depth_type.lower() == 'depth':
        df_with_depth['THICKNESS'] = df_with_depth[bottom_col] - df_with_depth[top_col]
    elif depth_type.lower() == 'elevation' or depth_type == 'elev':
        df_with_depth['THICKNESS'] = df_with_depth[top_col] - df_with_depth[bottom_col]

    recordsBefore = df_with_depth.shape[0]
    # Keep only intervals whose computed thickness is non-negative
    df_with_depth = df_with_depth[df_with_depth['THICKNESS'] >= 0]
    df_with_depth.reset_index(inplace=True, drop=True)

    if verbose:
        recordsAfter = df_with_depth.shape[0]
        print('Removed well records with obviously bad depth information. ')
        print("\tNumber of records before removing: "+str(recordsBefore))
        print("\tNumber of records after removing: "+str(recordsAfter))
        print("\t\t{} well records removed without depth information".format(recordsBefore-recordsAfter))

    return df_with_depth
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False)

Function to remove well intervals with no depth information

Parameters

df_with_depth : pandas.DataFrame
Dataframe containing well descriptions
top_col : str, optional
Name of column containing information on the top of the well intervals, by default 'TOP'
bottom_col : str, optional
Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
no_data_val_table : any, optional
No data value in the input data, used by this function to indicate that depth data is not there, to be replaced by np.nan, by default ''
verbose : bool, optional
Whether to print results to console, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

df_with_depth : pandas.DataFrame
Dataframe with depths dropped
Expand source code
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Drop well intervals that are missing top or bottom depth information.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions
    top_col : str, optional
        Name of column containing information on the top of the well intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        No data value in the input data, used by this function to indicate that depth data is not there, to be replaced by np.nan, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        Dataframe with depths dropped
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Normalize the table's no-data marker to np.nan so dropna() can detect it
    for depthCol in (top_col, bottom_col):
        df_with_depth[depthCol] = df_with_depth[depthCol].replace(no_data_val_table, np.nan)

    recordsBefore = df_with_depth.shape[0]

    # An interval missing either its top or its bottom is unusable downstream
    df_with_depth = df_with_depth.dropna(subset=[top_col]).dropna(subset=[bottom_col])
    df_with_depth.reset_index(inplace=True, drop=True)

    if verbose:
        recordsAfter = df_with_depth.shape[0]
        print('Removed well records with no depth information. ')
        print("\tNumber of records before removing: "+str(recordsBefore))
        print("\tNumber of records after removing: "+str(recordsAfter))
        print("\t\t{} well records removed without depth information".format(recordsBefore-recordsAfter))

    return df_with_depth
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False)

Function that removes all records in the dataframe containing the well descriptions where no description is given.

Parameters

df_with_descriptions : pandas.DataFrame
Pandas dataframe containing the well records with their individual descriptions
description_col : str, optional
Name of the column containing the geologic description of each interval, by default 'FORMATION'
no_data_val_table : str, optional
The value expected if the column is empty or there is no data. These will be replaced by np.nan before being removed, by default ''
verbose : bool, optional
Whether to print the results of this step to the terminal, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.DataFrame
Pandas dataframe with records with no description removed.
Expand source code
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Drop well records whose geologic-description field is empty.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Pandas dataframe containing the well records with their individual descriptions
    description_col : str, optional
        Name of the column containing the geologic description of each interval, by default 'FORMATION'
    no_data_val_table : str, optional
        The value expected if the column is empty or there is no data. These will be replaced by np.nan before being removed, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe with records with no description removed.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Normalize the table's no-data marker to np.nan so the rows can be dropped
    df_with_descriptions[description_col] = df_with_descriptions[description_col].replace(no_data_val_table, np.nan)

    recordsBefore = df_with_descriptions.shape[0]

    # Records without a description cannot be classified downstream
    df_with_descriptions = df_with_descriptions.dropna(subset=[description_col])
    df_with_descriptions.reset_index(inplace=True, drop=True)

    if verbose:
        recordsAfter = df_with_descriptions.shape[0]
        print('Removed well records without geologic descriptions. ')
        print("\tNumber of records before removing: "+str(recordsBefore))
        print("\tNumber of records after removing: "+str(recordsAfter))
        print("\t\t{} well records removed without geologic descriptions".format(recordsBefore-recordsAfter))

    return df_with_descriptions
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False)

Function to remove wells that do not have topography data (needed for layer selection later).

This function is intended to be run on the metadata table after an attempt has been made to add elevations.

Parameters

df_with_topo : pandas.DataFrame
Pandas dataframe containing elevation information.
zcol : str
Name of elevation column
no_data_val_table : any
Value in dataset that indicates no data is present (replaced with np.nan)
verbose : bool, optional
Whether to print outputs, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

pandas.DataFrame
Pandas dataframe with intervals with no topography removed.
Expand source code
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells that do not have topography data (needed for layer selection later).

    This function is intended to be run on the metadata table after an attempt has been made to add elevations.

    Parameters
    ----------
    df_with_topo : pandas.DataFrame
        Pandas dataframe containing elevation information. Rows without elevation are dropped in place.
    zcol : str, default = 'ELEVATION'
        Name of elevation column
    no_data_val_table : any, default = ''
        Value in dataset that indicates no data is present (replaced with np.nan)
    verbose : bool, optional
        Whether to print outputs, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe with intervals with no topography removed.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo'])

    before = df_with_topo.shape[0]

    #Assign the replaced column back rather than Series.replace(..., inplace=True):
    #chained inplace replacement is deprecated and silently fails under pandas copy-on-write
    df_with_topo[zcol] = df_with_topo[zcol].replace(no_data_val_table, np.nan)
    df_with_topo.dropna(subset=[zcol], inplace=True)

    if verbose:
        after = df_with_topo.shape[0]
        print('Removed well records with no surface elevation information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without surface elevation information".format(before-after))

    return df_with_topo
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False)

Function to remove wells and well intervals where there is no location information

Parameters

df_with_locations : pandas.DataFrame
Pandas dataframe containing well descriptions
xcol : str, default = 'LONGITUDE'
Name of the column containing x-coordinates (e.g., longitude)
ycol : str, default = 'LATITUDE'
Name of the column containing y-coordinates (e.g., latitude)
no_data_val_table : any, default = ''
Value in the dataset indicating no data is present (replaced with np.nan before removal)
log : bool, default = False
Whether to log results to log file, by default False

Returns

df_with_locations : pandas.DataFrame
Pandas dataframe containing only data with location information
Expand source code
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False):
    """Function to remove wells and well intervals where there is no location information

    Parameters
    ----------
    df_with_locations : pandas.DataFrame
        Pandas dataframe containing well descriptions. Rows without coordinates are dropped in place.
    xcol : str, default = 'LONGITUDE'
        Name of the column containing x-coordinates (e.g., longitude)
    ycol : str, default = 'LATITUDE'
        Name of the column containing y-coordinates (e.g., latitude)
    no_data_val_table : any, default = ''
        Value in the dataset indicating no data is present (replaced with np.nan before removal)
    verbose : bool, default = False
        Whether to print the results of this step to the terminal
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_locations : pandas.DataFrame
        Pandas dataframe containing only data with location information
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations'])

    before = df_with_locations.shape[0] #Extract length of data before this process

    #Assign the replaced columns back rather than Series.replace(..., inplace=True):
    #chained inplace replacement is deprecated and silently fails under pandas copy-on-write
    df_with_locations[xcol] = df_with_locations[xcol].replace(no_data_val_table, np.nan)
    df_with_locations[ycol] = df_with_locations[ycol].replace(no_data_val_table, np.nan)

    #One dropna covering both coordinate columns (equivalent to two sequential drops)
    df_with_locations.dropna(subset=[xcol, ycol], inplace=True)

    if verbose:
        after = df_with_locations.shape[0]
        print('Removed well records with no location information. ')
        print("\tNumber of records before removing: "+str(before))
        print("\tNumber of records after removing: "+str(after))
        print("\t\t{} wells records removed without location information".format(before-after))

    return df_with_locations
def run(well_data, surf_elev_grid, bedrock_elev_grid, model_grid=None, metadata=None, layers=9, well_data_cols=None, well_metadata_cols=None, description_col='FORMATION', top_col='TOP', bottom_col='BOTTOM', depth_type='depth', study_area=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEVATION', well_id_col='API_NUMBER', output_crs='EPSG:4269', lith_dict=None, lith_dict_start=None, lith_dict_wildcard=None, target_dict=None, target_name='', export_dir=None, verbose=False, log=False, **kw_params)

w4h.run() is a function that runs the intended workflow of the wells4hydrogeology (w4h) package. This means that it runs several constituent functions. The workflow that this follows is provided in the package wiki. It accepts the parameters of the constituent functions. To see a list of these functions and parameters, use help(run()).

The following functions used in w4h.run() are listed below, along with their parameters and default values for those parameters. 
See the documentation for the each of the individual functions for more information on a specific parameter:

file_setup

    well_data             | default = '<no default>'

    metadata              | default = None

    data_filename         | default = '*ISGS_DOWNHOLE_DATA*.txt'

    metadata_filename     | default = '*ISGS_HEADER*.txt'

    log_dir               | default = None

    verbose               | default = False

    log                   | default = False

read_raw_csv

    data_filepath         | default = '<output of previous function>'

    metadata_filepath     | default = '<output of previous function>'

    data_cols             | default = None

    metadata_cols         | default = None

    xcol                  | default = 'LONGITUDE'

    ycol                  | default = 'LATITUDE'

    well_key              | default = 'API_NUMBER'

    encoding              | default = 'latin-1'

    verbose               | default = False

    log                   | default = False

    read_csv_kwargs       | default = {}

define_dtypes

    undefined_df          | default = '<output of previous function>'

    datatypes             | default = None

    verbose               | default = False

    log                   | default = False

merge_metadata

    data_df               | default = '<output of previous function>'

    header_df             | default = '<output of previous function>'

    data_cols             | default = None

    header_cols           | default = None

    auto_pick_cols        | default = False

    drop_duplicate_cols   | default = True

    log                   | default = False

    verbose               | default = False

    kwargs                | default = {}

coords2geometry

    df_no_geometry        | default = '<output of previous function>'

    xcol                  | default = 'LONGITUDE'

    ycol                  | default = 'LATITUDE'

    zcol                  | default = 'ELEV_FT'

    input_coords_crs      | default = 'EPSG:4269'

    use_z                 | default = False

    wkt_col               | default = 'WKT'

    geometry_source       | default = 'coords'

    verbose               | default = False

    log                   | default = False

read_study_area

    study_area            | default = None

    output_crs            | default = 'EPSG:4269'

    buffer                | default = None

    return_original       | default = False

    log                   | default = False

    verbose               | default = False

    read_file_kwargs      | default = {}

clip_gdf2study_area

    study_area            | default = '<output of previous function>'

    gdf                   | default = '<output of previous function>'

    log                   | default = False

    verbose               | default = False

read_grid

    grid_path             | default = None

    grid_type             | default = 'model'

    no_data_val_grid      | default = 0

    use_service           | default = False

    study_area            | default = None

    grid_crs              | default = None

    output_crs            | default = 'EPSG:4269'

    verbose               | default = False

    log                   | default = False

    kwargs                | default = {}

add_control_points

    df_without_control    | default = '<output of previous function>'

    df_control            | default = None

    xcol                  | default = 'LONGITUDE'

    ycol                  | default = 'LATITUDE'

    zcol                  | default = 'ELEV_FT'

    controlpoints_crs     | default = 'EPSG:4269'

    output_crs            | default = 'EPSG:4269'

    description_col       | default = 'FORMATION'

    interp_col            | default = 'INTERPRETATION'

    target_col            | default = 'TARGET'

    verbose               | default = False

    log                   | default = False

    kwargs                | default = {}

remove_nonlocated

    df_with_locations     | default = '<output of previous function>'

    xcol                  | default = 'LONGITUDE'

    ycol                  | default = 'LATITUDE'

    no_data_val_table     | default = ''

    verbose               | default = False

    log                   | default = False

remove_no_topo

    df_with_topo          | default = '<output of previous function>'

    zcol                  | default = 'ELEVATION'

    no_data_val_table     | default = ''

    verbose               | default = False

    log                   | default = False

remove_no_depth

    df_with_depth         | default = '<output of previous function>'

    top_col               | default = 'TOP'

    bottom_col            | default = 'BOTTOM'

    no_data_val_table     | default = ''

    verbose               | default = False

    log                   | default = False

remove_bad_depth

    df_with_depth         | default = '<output of previous function>'

    top_col               | default = 'TOP'

    bottom_col            | default = 'BOTTOM'

    depth_type            | default = 'depth'

    verbose               | default = False

    log                   | default = False

remove_no_description

    df_with_descriptions  | default = '<output of previous function>'

    description_col       | default = 'FORMATION'

    no_data_val_table     | default = ''

    verbose               | default = False

    log                   | default = False

get_search_terms

    spec_path             | default = 'C:\Users\riley\LocalData\Github\wells4hydrogeology/resources/'

    spec_glob_pattern     | default = '*SearchTerms-Specific*'

    start_path            | default = None

    start_glob_pattern    | default = '*SearchTerms-Start*'

    wildcard_path         | default = None

    wildcard_glob_pattern | default = '*SearchTerms-Wildcard'

    verbose               | default = False

    log                   | default = False

read_dictionary_terms

    dict_file             | default = None

    id_col                | default = 'ID'

    search_col            | default = 'DESCRIPTION'

    definition_col        | default = 'LITHOLOGY'

    class_flag_col        | default = 'CLASS_FLAG'

    dictionary_type       | default = None

    class_flag            | default = 6

    rem_extra_cols        | default = True

    verbose               | default = False

    log                   | default = False

specific_define

    df                    | default = '<output of previous function>'

    terms_df              | default = '<output of previous function>'

    description_col       | default = 'FORMATION'

    terms_col             | default = 'DESCRIPTION'

    verbose               | default = False

    log                   | default = False

split_defined

    df                    | default = '<output of previous function>'

    classification_col    | default = 'CLASS_FLAG'

    verbose               | default = False

    log                   | default = False

start_define

    df                    | default = '<output of previous function>'

    terms_df              | default = '<output of previous function>'

    description_col       | default = 'FORMATION'

    terms_col             | default = 'DESCRIPTION'

    verbose               | default = False

    log                   | default = False

wildcard_define

    df                    | default = '<output of previous function>'

    terms_df              | default = '<output of previous function>'

    description_col       | default = 'FORMATION'

    terms_col             | default = 'DESCRIPTION'

    verbose               | default = False

    log                   | default = False

remerge_data

    classifieddf          | default = '<output of previous function>'

    searchdf              | default = '<output of previous function>'

fill_unclassified

    df                    | default = '<output of previous function>'

    classification_col    | default = 'CLASS_FLAG'

read_lithologies

    lith_file             | default = None

    interp_col            | default = 'LITHOLOGY'

    target_col            | default = 'CODE'

    use_cols              | default = None

    verbose               | default = False

    log                   | default = False

merge_lithologies

    well_data_df          | default = '<output of previous function>'

    targinterps_df        | default = '<output of previous function>'

    interp_col            | default = 'INTERPRETATION'

    target_col            | default = 'TARGET'

    target_class          | default = 'bool'

align_rasters

    grids_unaligned       | default = None

    model_grid            | default = None

    no_data_val_grid      | default = 0

    verbose               | default = False

    log                   | default = False

get_drift_thick

    surface_elev          | default = None

    bedrock_elev          | default = None

    layers                | default = 9

    plot                  | default = False

    verbose               | default = False

    log                   | default = False

sample_raster_points

    raster                | default = None

    points_df             | default = None

    well_id_col           | default = 'API_NUMBER'

    xcol                  | default = 'LONGITUDE'

    ycol                  | default = 'LATITUDE'

    new_col               | default = 'SAMPLED'

    verbose               | default = True

    log                   | default = False

get_layer_depths

    df_with_depths        | default = '<output of previous function>'

    surface_elev_col      | default = 'SURFACE_ELEV'

    layer_thick_col       | default = 'LAYER_THICK'

    layers                | default = 9

    log                   | default = False

layer_target_thick

    df                    | default = '<output of previous function>'

    layers                | default = 9

    return_all            | default = False

    export_dir            | default = None

    outfile_prefix        | default = None

    depth_top_col         | default = 'TOP'

    depth_bot_col         | default = 'BOTTOM'

    log                   | default = False

layer_interp

    points                | default = '<no default>'

    grid                  | default = '<no default>'

    layers                | default = None

    interp_kind           | default = 'nearest'

    return_type           | default = 'dataarray'

    export_dir            | default = None

    target_col            | default = 'TARG_THICK_PER'

    layer_col             | default = 'LAYER'

    xcol                  | default = None

    ycol                  | default = None

    xcoord                | default = 'x'

    ycoord                | default = 'y'

    log                   | default = False

    verbose               | default = False

    kwargs                | default = {}

export_grids

    grid_data             | default = '<no default>'

    out_path              | default = '<no default>'

    file_id               | default = ''

    filetype              | default = 'tif'

    variable_sep          | default = True

    date_stamp            | default = True

    verbose               | default = False

    log                   | default = False
Expand source code
def run(well_data,
        surf_elev_grid,
        bedrock_elev_grid,
        model_grid=None,
        metadata=None,
        layers=9,
        well_data_cols=None, well_metadata_cols=None, description_col='FORMATION', top_col='TOP', bottom_col='BOTTOM', depth_type='depth',
        study_area=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEVATION', well_id_col='API_NUMBER', output_crs='EPSG:4269',
        lith_dict=None, lith_dict_start=None, lith_dict_wildcard=None,
        target_dict=None,
        target_name='',
        export_dir=None,
        verbose=False,
        log=False,
        **kw_params):

    """Function to run entire process with one line of code.

    NOTE: verbose and log are boolean parameters used for most of the functions. verbose=True prints information to terminal, log=True logs to a file in the log_dir, which defaults to the export_dir

    Parameters
    ----------
    well_data : str or pathlib.Path obj, or pandas.DataFrame
        Filepath to file or directory containing well data, or a dataframe already containing it.
    surf_elev_grid : str or pathlib.Path object
        Surface elevation grid (or filepath to it); read via w4h.read_grid(grid_type='surface').
    bedrock_elev_grid : str or pathlib.Path object
        Bedrock elevation grid (or filepath to it); read via w4h.read_grid(grid_type='bedrock').
    model_grid : str or pathlib.Path object, or model grid parameters (see model_grid function)
        Model grid (or filepath to it); read via w4h.read_grid(grid_type='model').
    metadata : str or pathlib.Path object, pandas.DataFrame, or None, default=None
        Filepath to file or directory containing well metadata, such as location and elevation. If None, will check if well_data is a directory, and if so, will use metadata_filename to search in same directory.
    well_data_cols : List or list-like
        Columns of well_data to use. NOTE(review): currently not used directly within run(); confirm intended use.
    well_metadata_cols : List or list-like
        Columns of metadata to use. NOTE(review): currently not used directly within run(); confirm intended use.
    layers : int, default = 9
        The number of layers in the model grid
    description_col : str, default = 'FORMATION'
        Name of column containing geologic descriptions of the well interval. This column should be in well_data.
    top_col : str, default = 'TOP'
        Name of column containing depth/elevation at top of well interval. This column should be in well_data.
    bottom_col : str, default = 'BOTTOM'
        Name of column containing depth/elevation at bottom of well interval. This column should be in well_data.
    depth_type : str, default = 'depth'
        Whether values top_col or bottom_col refer to depth or elevation.
    study_area : str or pathlib.Path object, or geopandas.GeoDataFrame
        Study area (or filepath to it) used to clip the well data and grids.
    xcol : str, default = 'LONGITUDE'
        Name of column containing x coordinates. This column should be in metadata unless metadata is not read, then it should be in well_data.
    ycol : str, default = 'LATITUDE'
        Name of column containing y coordinates. This column should be in metadata unless metadata is not read, then it should be in well_data.
    zcol : str, default = 'ELEVATION'
        Name of column containing z coordinates. This column should be in metadata unless metadata is not read, then it should be in well_data.
    well_id_col : str, default = 'API_NUMBER'
        Name of column that uniquely identifies each well; used to sort the data before layer selection.
    output_crs : crs definition accepted by pyproj, default = 'EPSG:4269'
        CRS to output all of the data into
    lith_dict : str or pathlib.Path object, or pandas.DataFrame
        Exact-match search terms; passed to w4h.get_search_terms as spec_path.
    lith_dict_start : str or pathlib.Path object, or pandas.DataFrame
        Starts-with search terms; passed to w4h.get_search_terms as start_path. If None, the starts-with classification step is skipped.
    lith_dict_wildcard : str or pathlib.Path object, or pandas.DataFrame
        Wildcard/substring search terms; passed to w4h.get_search_terms as wildcard_path. If None, the wildcard classification step is skipped.
    target_dict : str or pathlib.Path object, or pandas.DataFrame
        Lithology-to-target mapping; passed to w4h.read_lithologies as lith_file.
    target_name : str, default = ''
        Name of target of interest, to be used on exported files
    export_dir : str or pathlib.Path object, default = None
        Directory to export output files. If None (and not False), a time-stamped Output_ directory is created.
    verbose : bool, default = False
        Whether to print updates/results
    log : bool, default = False
        Whether to send parameters and outputs to log file, to be saved in export_dir, or the same directory as well_data if export_dir not defined.
    **kw_params
        Keyword parameters used by any of the functions throughout the process. See list of functions above, and the API documentation for their possible parameters

    Returns
    -------
    tuple of (pandas.DataFrame, layer data)
        The per-layer target thickness dataframe from w4h.layer_target_thick and the interpolated layer grids from w4h.layer_interp.
    """

    if verbose:
        verbose_print(run, locals())

    def _filter_kwargs(func):
        #Subset **kw_params to only the keyword arguments func actually accepts
        return {k: v for k, v in kw_params.items() if k in inspect.signature(func).parameters}

    #Get data (files or otherwise)
    file_setup_kwargs = _filter_kwargs(w4h.file_setup)

    #Check how well_data and metadata were defined
    if isinstance(well_data, (pathlib.PurePath, str)):
        #Convert well_data to pathlib.Path if not already
        if isinstance(well_data, str):
            well_data = pathlib.Path(well_data)

        if metadata is None:
            if well_data.is_dir():
                #If the two files are supposed to be in the same directory (or just want well_data found)
                well_dataPath, metadataPath = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
            elif well_data.exists():
                #If well_data is a file, and metadata is not used
                well_dataPath, _ = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
                metadataPath = None
            else:
                #Need for well_data to exist at the very least
                raise IOError('well_data file does not exist:{}'.format(well_data))
        elif isinstance(metadata, (pathlib.PurePath, str)):
            #Metadata has specifically been specified by a filepath
            if isinstance(metadata, str):
                metadata = pathlib.Path(metadata)
            well_dataPath, metadataPath = w4h.file_setup(well_data=well_data, metadata=metadata, verbose=verbose, log=log, **file_setup_kwargs)
        elif isinstance(metadata, pd.DataFrame):
            well_dataPath, _ = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
            metadataPath = metadata
        else:
            #Unsupported metadata type; warn and proceed as if no metadata was given
            #(previously metadataPath was left unbound here, causing a confusing NameError later)
            print('ERROR: metadata must be a string filepath, a pathlib.Path object, or pandas.DataFrame')
            metadataPath = None

    elif isinstance(well_data, pd.DataFrame):
        if isinstance(metadata, pd.DataFrame):
            well_dataPath = well_data
            metadataPath = metadata
        elif isinstance(metadata, (pathlib.PurePath, str)):
            #NOTE(review): metadata is intentionally passed as well_data here so file_setup resolves
            #the metadata filepath; only the second return value is used — confirm against file_setup
            _, metadataPath = w4h.file_setup(well_data=metadata, metadata=metadata, verbose=verbose, log=log, **file_setup_kwargs)
            well_dataPath = well_data
        elif metadata is None:
            #Allow a dataframe input without any metadata (handled downstream where metadata_DF is None)
            well_dataPath = well_data
            metadataPath = None
        else:
            print('ERROR: metadata must be a string filepath, a pathlib.Path object, or pandas.DataFrame')
            well_dataPath = well_data
            metadataPath = None
    else:
        raise TypeError('well_data must be a string filepath, a pathlib.Path object, or pandas.DataFrame')

    #Build a default, time-stamped export directory unless export_dir is defined (or explicitly False)
    if not export_dir:
        if export_dir is False:
            pass
        else:
            nowTime = datetime.datetime.now()
            nowTime = str(nowTime).replace(':', '-').replace(' ','_').split('.')[0]
            nowTimeStr = '_'+str(nowTime)
            outDir = 'Output_'+nowTimeStr
            if isinstance(well_dataPath, pd.DataFrame) or isinstance(well_dataPath, gpd.GeoDataFrame):
                export_dir = pathlib.Path(outDir)
            elif isinstance(well_dataPath, pathlib.PurePath):
                if well_dataPath.is_dir():
                    export_dir = well_dataPath.joinpath(outDir)
                else:
                    export_dir = well_dataPath.parent.joinpath(outDir)
            else:
                raise IOError('export_dir should be explicitly defined if well_data is not a filepath')

            if not export_dir.exists():
                try:
                    export_dir.mkdir()
                except Exception:
                    print('Export Directory not created')

    #Get pandas dataframes from input
    read_raw_txt_kwargs = _filter_kwargs(w4h.read_raw_csv)
    well_data_IN, metadata_IN = w4h.read_raw_csv(data_filepath=well_dataPath, metadata_filepath=metadataPath, verbose=verbose, log=log, **read_raw_txt_kwargs)
    #Functions to read data into dataframes. Also excludes extraneous columns, and drops header data with no location information

    #Define data types (file will need to be updated)
    well_data_DF = w4h.define_dtypes(undefined_df=well_data_IN, datatypes='./resources/downholeDataTypes.txt', verbose=verbose, log=log)
    metadata_DF = w4h.define_dtypes(undefined_df=metadata_IN, datatypes='./resources/headerDataTypes.txt', verbose=verbose, log=log)

    if metadata_DF is None:
        well_data_xyz = well_data_DF
    else:
        #Rely on merge_metadata's own defaults for data_cols/header_cols/auto_pick_cols/drop_duplicate_cols
        #so callers can override them through **kw_params without a duplicate-keyword TypeError.
        #Also forward verbose/log instead of hard-coding log=False.
        merge_metadata_kwargs = _filter_kwargs(w4h.merge_metadata)
        well_data_xyz = w4h.merge_metadata(data_df=well_data_DF, header_df=metadata_DF, verbose=verbose, log=log, **merge_metadata_kwargs)

    #Convert well_data_xyz to have geometry
    coords2geometry_kwargs = _filter_kwargs(w4h.coords2geometry)
    well_data_xyz = w4h.coords2geometry(df_no_geometry=well_data_xyz, xcol=xcol, ycol=ycol, zcol=zcol, verbose=verbose, log=log, **coords2geometry_kwargs)

    #Get Study area
    read_study_area_kwargs = _filter_kwargs(w4h.read_study_area)
    if study_area is None:
        studyAreaIN = None
        use_study_area = False
    else:
        #Keyword is study_area (study_area_path is not a read_study_area parameter)
        studyAreaIN = w4h.read_study_area(study_area=study_area, output_crs=output_crs, verbose=verbose, log=log, **read_study_area_kwargs)
        use_study_area = True

    clip_gdf2study_area_kwargs = _filter_kwargs(w4h.clip_gdf2study_area)
    well_data_xyz = w4h.clip_gdf2study_area(study_area=studyAreaIN, gdf=well_data_xyz, verbose=verbose, log=log, **clip_gdf2study_area_kwargs)

    #Get surfaces and grid(s)
    read_grid_kwargs = _filter_kwargs(w4h.read_grid)

    modelGridPath = model_grid
    surfaceElevPath = surf_elev_grid
    bedrockElevPath = bedrock_elev_grid

    modelGrid = w4h.read_grid(grid_path=modelGridPath, grid_type='model', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)
    surfaceElevGridIN = w4h.read_grid(grid_path=surfaceElevPath, grid_type='surface', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)
    bedrockElevGridIN = w4h.read_grid(grid_path=bedrockElevPath, grid_type='bedrock', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)

    #UPDATE: MAKE SURE CRS's all align ***
    #Add control points
    add_control_points_kwargs = _filter_kwargs(w4h.add_control_points)
    well_data_xyz = w4h.add_control_points(df_without_control=well_data_xyz, xcol=xcol, ycol=ycol, zcol=zcol, top_col=top_col, bottom_col=bottom_col, description_col=description_col, verbose=verbose, log=log, **add_control_points_kwargs)

    #Clean up data
    well_data_xyz = w4h.remove_nonlocated(df_with_locations=well_data_xyz, log=log, verbose=verbose)
    well_data_xyz = w4h.remove_no_topo(df_with_topo=well_data_xyz, zcol=zcol, verbose=verbose, log=log)

    remove_no_depth_kwargs = _filter_kwargs(w4h.remove_no_depth)
    well_data_xyz = w4h.remove_no_depth(well_data_xyz, verbose=verbose, top_col=top_col, bottom_col=bottom_col, log=log, **remove_no_depth_kwargs) #Drop records with no depth information

    remove_bad_depth_kwargs = _filter_kwargs(w4h.remove_bad_depth)
    well_data_xyz = w4h.remove_bad_depth(well_data_xyz, verbose=verbose, top_col=top_col, bottom_col=bottom_col, depth_type=depth_type, log=log, **remove_bad_depth_kwargs)#Drop records with bad depth information (i.e., top depth > bottom depth) (Also calculates thickness of each record)

    remove_no_formation_kwargs = _filter_kwargs(w4h.remove_no_description)
    well_data_xyz = w4h.remove_no_description(well_data_xyz, description_col=description_col, verbose=verbose, log=log, **remove_no_formation_kwargs)

    #CLASSIFICATION
    #Read dictionary definitions and classify
    get_search_terms_kwargs = _filter_kwargs(w4h.get_search_terms)
    specTermsPATH, startTermsPATH, wildcardTermsPATH = w4h.get_search_terms(spec_path=lith_dict, start_path=lith_dict_start, wildcard_path=lith_dict_wildcard, log=log, **get_search_terms_kwargs)
    read_dictionary_terms_kwargs = _filter_kwargs(w4h.read_dictionary_terms)
    if 'class_flag' in read_dictionary_terms_kwargs.keys():
        del read_dictionary_terms_kwargs['class_flag'] #This is specific to an individual dict terms file, so don't want to use for all
    specTerms = w4h.read_dictionary_terms(dict_file=specTermsPATH, log=log, **read_dictionary_terms_kwargs)
    startTerms = w4h.read_dictionary_terms(dict_file=startTermsPATH, log=log, **read_dictionary_terms_kwargs)
    wildcardTerms = w4h.read_dictionary_terms(dict_file=wildcardTermsPATH, log=log, **read_dictionary_terms_kwargs)

    #Clean up dictionary terms
    specTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    specTerms.reset_index(inplace=True, drop=True)

    startTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    startTerms.reset_index(inplace=True, drop=True)

    wildcardTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    wildcardTerms.reset_index(inplace=True, drop=True)

    if verbose:
        #Fill in the term counts (previously the placeholders were printed literally)
        print('Search terms to be used:')
        print('\t {} exact match term/definition pairs'.format(len(specTerms)))
        print('\t {} starting match term/definition pairs'.format(len(startTerms)))
        print('\t {} wildcard match term/definition pairs'.format(len(wildcardTerms)))

    #CLASSIFICATIONS
    #Exact match classifications
    well_data_xyz = w4h.specific_define(well_data_xyz, terms_df=specTerms, description_col=description_col, verbose=verbose, log=log)

    #.startswith classifications
    if lith_dict_start is not None:
        classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
        searchDF = w4h.start_define(df=searchDF, terms_df=startTerms, description_col=description_col, verbose=verbose, log=log)
        well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF) #UPDATE: Needed? ***

    #wildcard/any substring match classifications
    if lith_dict_wildcard is not None:
        classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
        searchDF = w4h.wildcard_define(df=searchDF, terms_df=wildcardTerms, description_col=description_col, verbose=verbose, log=log)
        well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF) #UPDATE: Needed? ***

    #Depth classification
    classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
    searchDF = w4h.depth_define(df=searchDF, thresh=550, verbose=verbose, log=log)
    well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF) #UPDATE: Needed? ***

    #Fill unclassified data
    well_data_xyz = w4h.fill_unclassified(well_data_xyz, classification_col='CLASS_FLAG')

    #Add target interpretations
    read_lithologies_kwargs = _filter_kwargs(w4h.read_lithologies)
    targetInterpDF = w4h.read_lithologies(lith_file=target_dict, log=log, **read_lithologies_kwargs)
    well_data_xyz = w4h.merge_lithologies(well_data_df=well_data_xyz, targinterps_df=targetInterpDF, target_col='TARGET', target_class='bool')

    #Sort dataframe to prepare for next steps
    #well_data_xyz = w4h.sort_dataframe(df=well_data_xyz, sort_cols=['API_NUMBER','TOP'], remove_nans=True)
    well_data_xyz = well_data_xyz.sort_values(by=[well_id_col, top_col])
    well_data_xyz.reset_index(inplace=True, drop=True)
    #UPDATE: Option to remove nans?
    well_data_xyz = well_data_xyz[pd.notna(well_data_xyz["LITHOLOGY"])]

    #Analyze Surface(s) and grid(s)
    bedrockGrid, surfaceGrid = w4h.align_rasters(grids_unaligned=[bedrockElevGridIN, surfaceElevGridIN], model_grid=modelGrid, no_data_val_grid=0, log=log)
    driftThickGrid, layerThickGrid = w4h.get_drift_thick(surface_elev=surfaceGrid, bedrock_elev=bedrockGrid, layers=layers, plot=verbose, log=log)

    well_data_xyz = w4h.sample_raster_points(raster=bedrockGrid, points_df=well_data_xyz, xcol=xcol, ycol=ycol, new_col='BEDROCK_ELEV', verbose=verbose, log=log)
    well_data_xyz = w4h.sample_raster_points(raster=surfaceGrid, points_df=well_data_xyz, xcol=xcol, ycol=ycol, new_col='SURFACE_ELEV', verbose=verbose, log=log)
    well_data_xyz['BEDROCK_DEPTH'] = well_data_xyz['SURFACE_ELEV'] - well_data_xyz['BEDROCK_ELEV']
    well_data_xyz['LAYER_THICK'] = well_data_xyz['BEDROCK_DEPTH'] / layers

    well_data_xyz = w4h.get_layer_depths(df_with_depths=well_data_xyz, layers=layers, log=log)

    layer_target_thick_kwargs = _filter_kwargs(w4h.layer_target_thick)
    if 'return_all' in layer_target_thick_kwargs.keys():
        del layer_target_thick_kwargs['return_all'] #This needs to be set to False, so we don't want it reading in twice

    resdf = w4h.layer_target_thick(df=well_data_xyz, layers=layers, return_all=False, export_dir=export_dir, depth_top_col=top_col, depth_bot_col=bottom_col, log=log, **layer_target_thick_kwargs)

    layer_interp_kwargs = _filter_kwargs(w4h.layer_interp)
    #Use the layers parameter rather than a hard-coded 9
    layers_data = w4h.layer_interp(points=resdf, grid=modelGrid, layers=layers, verbose=verbose, log=log, **layer_interp_kwargs)

    #THIS MAY BE REPEAT OF LAST LINES OF layer_interp()
    w4h.export_grids(grid_data=layers_data, out_path=export_dir, file_id=target_name, filetype='tif', variable_sep=True, date_stamp=True, verbose=verbose, log=log)

    return resdf, layers_data
def sample_raster_points(raster=None, points_df=None, well_id_col='API_NUMBER', xcol='LONGITUDE', ycol='LATITUDE', new_col='SAMPLED', verbose=True, log=False)

Sample raster values to points from geopandas geodataframe.

Parameters

raster : rioxarray data array
Raster containing values to be sampled.
points_df : geopandas.geodataframe
Geopandas dataframe with geometry column containing point values to sample.
well_id_col : str, default="API_NUMBER"
Column that uniquely identifies each well so multiple sampling points are not taken per well
xcol : str, default='LONGITUDE'
Column containing name for x-column, by default 'LONGITUDE.' This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
ycol : str, default='LATITUDE'
Column containing name for y-column, by default 'LATITUDE.' This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
new_col : str, default='SAMPLED'
Name for name of new column containing points sampled from the raster, by default 'SAMPLED'.
verbose : bool, default=True
Whether to send to print() information about progress of function, by default True.
log : bool, default = False
Whether to log results to log file, by default False

Returns

points_df : geopandas.geodataframe
Same as points_df, but with sampled values and potentially with reprojected coordinates.
Expand source code
def sample_raster_points(raster=None, points_df=None, well_id_col='API_NUMBER', xcol='LONGITUDE', ycol='LATITUDE', new_col='SAMPLED', verbose=True, log=False):
    """Sample raster values to points from geopandas geodataframe.

    Parameters
    ----------
    raster : rioxarray data array
        Raster containing values to be sampled.
    points_df : geopandas.geodataframe
        Geopandas dataframe with geometry column containing point values to sample.
    well_id_col : str, default="API_NUMBER"
        Column that uniquely identifies each well so multiple sampling points are not taken per well.
    xcol : str, default='LONGITUDE'
        Column containing name for x-column, by default 'LONGITUDE.'
        This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
    ycol : str, default='LATITUDE'
        Column containing name for y-column, by default 'LATITUDE.'
        This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
    new_col : str, default='SAMPLED'
        Name of new column containing points sampled from the raster, by default 'SAMPLED'.
    verbose : bool, default=True
        Whether to send to print() information about progress of function, by default True.
    log : bool, default=False
        Whether to log results to log file, by default False

    Returns
    -------
    points_df : geopandas.geodataframe
        Same as points_df, but with sampled values and potentially with reprojected coordinates.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Fall back to packaged sample resources when inputs are not supplied
    if raster is None:
        raster = w4h.get_resources()['surf_elev']
    if points_df is None:
        points_df = w4h.get_resources()['well_data']

    if verbose:
        verbose_print(sample_raster_points, locals(), exclude_params=['raster', 'points_df'])
        print(f"\tSampling {new_col} grid at all well locations.")

    # Reproject the raster to the point data's CRS if they do not match
    rastercrsWKT = raster.spatial_ref.crs_wkt
    if rastercrsWKT != points_df.crs:
        if verbose:
            print("\tTemporarily reprojecting raster data to point data's CRS.")
        pointCRS = points_df.crs.to_epsg()
        if pointCRS is None:
            # Not every CRS has an EPSG code; fall back to WKT representation
            pointCRS = points_df.crs.to_wkt()

        raster = raster.rio.reproject(pointCRS)

    # Record (possibly reprojected) coordinates in new columns so originals are not overwritten
    xCOLOUT = xcol + '_PROJ'
    yCOLOUT = ycol + '_PROJ'
    points_df[xCOLOUT] = points_df['geometry'].x
    points_df[yCOLOUT] = points_df['geometry'].y

    # Sample only once per unique well to reduce sampling time
    zData = []
    zInd = []
    uniqueWells = points_df.drop_duplicates(subset=[well_id_col])
    if verbose:
        print(f"\t{uniqueWells.shape[0]} unique wells identified using {well_id_col} column")

    # Nearest-neighbor sample of the raster at each unique well location
    for i, row in uniqueWells.iterrows():
        zInd.append(i)
        zData.append([row[well_id_col], raster.sel(x=row[xCOLOUT], y=row[yCOLOUT], method='nearest').item()])

    inputtype = points_df.dtypes[well_id_col]
    wellZDF = pd.DataFrame(zData, columns=[well_id_col, new_col], index=pd.Index(zInd))

    # BUGFIX: Series.astype returns a new Series and is never in-place; the previous
    # code discarded the result, so the merge key dtype could mismatch points_df's.
    wellZDF[well_id_col] = wellZDF[well_id_col].astype(inputtype)

    # Merge each unique well's sampled value back onto all well intervals
    points_df = points_df.merge(wellZDF, how='left', on=well_id_col)
    return points_df
def sort_dataframe(df, sort_cols=['API_NUMBER', 'TOP'], remove_nans=True)

Function to sort dataframe by one or more columns.

Parameters

df : pandas.DataFrame
Dataframe to be sorted
sort_cols : str or list of str, default = ['API_NUMBER','TOP']
Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP']
remove_nans : bool, default = True
Whether or not to remove nans in the process, by default True

Returns

df_sorted : pandas.DataFrame
Sorted dataframe
Expand source code
def sort_dataframe(df, sort_cols=None, remove_nans=True):
    """Function to sort dataframe by one or more columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be sorted
    sort_cols : str or list of str, default = None
        Name(s) of columns by which to sort dataframe.
        If None (default), sorts by ['API_NUMBER', 'TOP'].
    remove_nans : bool, default = True
        Whether or not to remove rows whose LITHOLOGY is nan in the process, by default True

    Returns
    -------
    df_sorted : pandas.DataFrame
        Sorted dataframe
    """
    # BUGFIX: avoid a mutable default argument (one shared list across all calls);
    # None sentinel preserves the original default behavior.
    if sort_cols is None:
        sort_cols = ['API_NUMBER', 'TOP']

    # Sort columns for better processing later
    df_sorted = df.sort_values(sort_cols)
    df_sorted.reset_index(inplace=True, drop=True)
    if remove_nans:
        # Drop records with no classified lithology
        df_sorted = df_sorted[pd.notna(df_sorted["LITHOLOGY"])]
    return df_sorted
def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify terms that have been specifically defined in the terms_df.

Parameters

df : pandas.DataFrame
Input dataframe with unclassified well descriptions.
terms_df : pandas.DataFrame
Dataframe containing the classifications
description_col : str, default='FORMATION'
Column name in df containing the well descriptions, by default 'FORMATION'.
terms_col : str, default='DESCRIPTION'
Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
verbose : bool, default=False
Whether to print results, by default False.

Returns

df_Interps : pandas.DataFrame
Dataframe containing the well descriptions and their matched classifications.
Expand source code
def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify terms that have been specifically defined in the terms_df.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe with unclassified well descriptions.
    terms_df : pandas.DataFrame
        Dataframe containing the classifications
    description_col : str, default='FORMATION'
        Column name in df containing the well descriptions, by default 'FORMATION'.
    terms_col : str, default='DESCRIPTION'
        Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
    verbose : bool, default=False
        Whether to print results, by default False.
    log : bool, default=False
        Whether to log results to log file, by default False.

    Returns
    -------
    df_Interps : pandas.DataFrame
        Dataframe containing the well descriptions and their matched classifications.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(specific_define, locals(), exclude_params=['df', 'terms_df'])

    # Align the terms column name with the description column so merge keys match
    # NOTE: renames terms_df in place (mutates the caller's dataframe)
    if description_col != terms_col:
        terms_df.rename(columns={terms_col: description_col}, inplace=True)
        terms_col = description_col

    # Normalize both columns to casefolded strings for exact matching
    df[description_col] = df[description_col].astype(str)
    terms_df[terms_col] = terms_df[terms_col].astype(str)

    df[description_col] = df[description_col].str.casefold()
    terms_df[terms_col] = terms_df[terms_col].str.casefold()

    # Keep only the most recent definition for each term
    terms_df.drop_duplicates(subset=terms_col, keep='last', inplace=True)
    terms_df.reset_index(drop=True, inplace=True)

    df_Interps = pd.merge(left=df, right=terms_df.set_index(terms_col), on=description_col, how='left')
    df_Interps.rename(columns={description_col: 'FORMATION'}, inplace=True)
    df_Interps['BEDROCK_FLAG'] = df_Interps['LITHOLOGY'] == 'BEDROCK'

    if verbose:
        print('Classified well records using exact matches')
        numRecsClass = int(df_Interps[df_Interps['CLASS_FLAG'] == 1]['CLASS_FLAG'].sum())
        recsRemainig = int(df_Interps.shape[0] - numRecsClass)
        percRecsClass = round((numRecsClass / df_Interps.shape[0]) * 100, 2)
        print("\t{} records classified using exact matches ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        # BUGFIX: remaining percent was printed as 1-percRecsClass even though
        # percRecsClass is on a 0-100 scale; use 100-percRecsClass.
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemainig, round(100 - percRecsClass, 2)))

    return df_Interps
def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False)

Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified.

Parameters

df : pandas.DataFrame
Dataframe containing all the well descriptions
classification_col : str, default = 'CLASS_FLAG'
Name of column containing the classification flag, by default 'CLASS_FLAG'
verbose : bool, default = False
Whether to print results, by default False
log : bool, default = False
Whether to log results to log file

Returns

Two-item tuple of pandas.Dataframe
tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.
Expand source code
def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False):
    """Split a well-description dataframe into classified and unclassified subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    classification_col : str, default = 'CLASS_FLAG'
        Name of column containing the classification flag, by default 'CLASS_FLAG'
    verbose : bool, default = False
        Whether to print results, by default False
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    Two-item tuple of pandas.Dataframe
        tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Rows with a non-null flag have already been classified; the rest still
    # need to be searched by the downstream *_define functions.
    classified_mask = df[classification_col].notna()
    classified_rows = df[classified_mask]
    unclassified_rows = df[~classified_mask]

    return classified_rows, unclassified_rows
def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify descriptions according to starting substring.

Parameters

df : pandas.DataFrame
Dataframe containing all the well descriptions
terms_df : pandas.DataFrame
Dataframe containing all the startswith substrings to use for searching
description_col : str, default = 'FORMATION'
Name of column in df containing descriptions, by default 'FORMATION'
terms_col : str, default = 'FORMATION'
Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
verbose : bool, default = False
Whether to print out results, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

df : pandas.DataFrame
Dataframe containing the original data and new classifications
Expand source code
def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to starting substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'DESCRIPTION'
        Name of column in terms_df containing startswith substring to match with description_col, by default 'DESCRIPTION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(start_define, locals(), exclude_params=['df', 'terms_df'])

    # For each start term, flag all records whose description starts with it
    # (CLASS_FLAG=4) and assign the term's lithology. Uses .loc assignment
    # instead of chained Series.where(..., inplace=True), which is deprecated.
    for i, s in enumerate(terms_df[terms_col]):
        matches = df[description_col].str.startswith(s, na=False)
        df.loc[matches, 'CLASS_FLAG'] = 4
        df.loc[matches, 'LITHOLOGY'] = terms_df.loc[i, 'LITHOLOGY']
    # BUGFIX: the previous code only selected BEDROCK_FLAG rows without assigning;
    # actually set the flag for bedrock lithologies (as specific_define does).
    df.loc[df['LITHOLOGY'] == 'BEDROCK', 'BEDROCK_FLAG'] = True

    if verbose:
        # BUGFIX: count matching records instead of summing the flag value (4),
        # which overstated the count by a factor of four.
        numRecsClass = int((df['CLASS_FLAG'] == 4).sum())
        percRecsClass = round((numRecsClass / df.shape[0]) * 100, 2)
        recsRemainig = int(df.shape[0] - numRecsClass)

        print('Classified well records using initial substring matches')
        print("\t{} records classified using initial substring matches ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        # BUGFIX: remaining percent was 1-percRecsClass on a 0-100 scale
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, round(100 - percRecsClass, 2)))
    return df
def verbose_print(func, local_variables, exclude_params=[])
Expand source code
def verbose_print(func, local_variables, exclude_params=[]):
    """Print a formatted summary of a function call: name, start time, and parameters.

    Parameters
    ----------
    func : callable
        Function whose signature selects which entries of local_variables to report.
    local_variables : dict
        Mapping of names to values (typically ``locals()``) captured at call time.
    exclude_params : list, default=[]
        Parameter names whose values are shown as '<input object>' instead of printed.

    Returns
    -------
    list of str
        The lines that were printed.
    """
    lines = ['\n']
    start_time = datetime.datetime.now()
    lines.append(f"{func.__name__}")
    lines.append(f"\tStarted at {start_time}.")
    lines.append(f"\tParameters:")
    sig_params = inspect.signature(func).parameters
    for name, value in local_variables.items():
        if name not in sig_params:
            continue  # only report actual parameters of func
        if 'kwargs' in name:
            # Expand keyword-argument dicts one entry per line
            lines.append(f"\t\t{name}")
            for kw_name, kw_value in local_variables[name].items():
                lines.append(f"\t\t\t{kw_name}={kw_value}")
        elif name in exclude_params:
            # Avoid dumping large input objects (e.g., dataframes, rasters)
            lines.append(f"\t\t{name}=<input object>")
        else:
            lines.append(f"\t\t{name}={value}")

    for line in lines:
        print(line)
    return lines
def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)

Function to classify descriptions according to any substring.

Parameters

df : pandas.DataFrame
Dataframe containing all the well descriptions
terms_df : pandas.DataFrame
Dataframe containing all the startswith substrings to use for searching
description_col : str, default = 'FORMATION'
Name of column in df containing descriptions, by default 'FORMATION'
terms_col : str, default = 'FORMATION'
Name of column in terms_df containing startswith substring to match with description_col, by default 'FORMATION'
verbose : bool, default = False
Whether to print out results, by default False
log : bool, default = False
Whether to log results to log file, by default False

Returns

df : pandas.DataFrame
Dataframe containing the original data and new classifications
Expand source code
def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to any substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the substrings to use for searching
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'DESCRIPTION'
        Name of column in terms_df containing substring to match with description_col, by default 'DESCRIPTION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(wildcard_define, locals(), exclude_params=['df', 'terms_df'])

    # For each term, flag all records whose description contains it anywhere
    # (CLASS_FLAG=5) and assign the term's lithology. Uses .loc assignment
    # instead of chained Series.where(..., inplace=True), which is deprecated.
    for i, s in enumerate(terms_df[terms_col]):
        matches = df[description_col].str.contains(s, case=False, regex=False, na=False)
        df.loc[matches, 'CLASS_FLAG'] = 5
        df.loc[matches, 'LITHOLOGY'] = terms_df.loc[i, 'LITHOLOGY']
    # BUGFIX: the previous code only selected BEDROCK_FLAG rows without assigning;
    # actually set the flag for bedrock lithologies (as specific_define does).
    df.loc[df['LITHOLOGY'] == 'BEDROCK', 'BEDROCK_FLAG'] = True

    if verbose:
        # BUGFIX: count matching records instead of summing the flag value (5),
        # which overstated the count by a factor of five.
        numRecsClass = int((df['CLASS_FLAG'] == 5).sum())
        percRecsClass = round((numRecsClass / df.shape[0]) * 100, 2)
        recsRemainig = int(df.shape[0] - numRecsClass)

        print('Classified well records using any substring (wildcard) match')
        print("\t{} records classified using any substring match ({}% of unclassified  data)".format(numRecsClass, percRecsClass))
        # BUGFIX: remaining percent was 1-percRecsClass on a 0-100 scale
        print('\t{} records remain unclassified ({}% of unclassified  data).'.format(recsRemainig, round(100 - percRecsClass, 2)))
    return df
def xyz_metadata_merge(xyz, metadata, verbose=False, log=False)

Add elevation to header data file.

Parameters

xyz : pandas.Dataframe
Contains elevation for the points
metadata : pandas dataframe
Header data file
log : bool, default = False
Whether to log results to log file, by default False

Returns

headerXYZData : pandas.Dataframe
Header dataset merged to get elevation values
Expand source code
def xyz_metadata_merge(xyz, metadata, verbose=False, log=False):
    """Add elevation to header data file.

    Parameters
    ----------
    xyz : pandas.Dataframe
        Contains elevation for the points
    metadata : pandas.Dataframe
        Header data file
    verbose : bool, default = False
        Whether to print information about the function's progress, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    headerXYZData : pandas.Dataframe
        Header dataset merged to get elevation values

    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(xyz_metadata_merge, locals(), exclude_params=['xyz'])
    headerXYZData = metadata.merge(xyz, how='left', on='API_NUMBER')
    # Drop the metadata-side coordinate columns duplicated by the merge, keeping
    # the xyz-side values. ROBUSTNESS: errors='ignore' prevents a KeyError when
    # only one input carries LATITUDE/LONGITUDE (merge then adds no _x/_y suffixes).
    headerXYZData.drop(['LATITUDE_x', 'LONGITUDE_x'], axis=1, errors='ignore', inplace=True)
    # rename() silently skips labels that are absent, so no guard is needed here
    headerXYZData.rename({'LATITUDE_y': 'LATITUDE', 'LONGITUDE_y': 'LONGITUDE'}, axis=1, inplace=True)
    return headerXYZData