Package w4h
This is the wells4hydrogeology package.
It contains the functions needed to convert raw well descriptions into usable (hydro)geologic data.
Expand source code
# __init__.py
"""
This is the wells4hydrogeology package.
It contains the functions needed to convert raw well descriptions into usable (hydro)geologic data.
"""
# NOTE(review): `core` is imported explicitly so that the
# `run.__doc__ = core._run_docstring()` assignment below does not rely on the
# import system's side effect of binding submodules as package attributes.
from w4h import core
from w4h.core import (logger_function,
                      verbose_print,
                      run,
                      get_resources)
from w4h.classify import (specific_define,
                          split_defined,
                          start_define,
                          wildcard_define,
                          remerge_data,
                          depth_define,
                          export_undefined,
                          fill_unclassified,
                          merge_lithologies,
                          get_unique_wells,
                          sort_dataframe)
from w4h.clean import (remove_nonlocated,
                       remove_no_topo,
                       remove_no_depth,
                       remove_bad_depth,
                       remove_no_description)
from w4h.export import (export_dataframe,
                        export_grids)
from w4h.layers import (get_layer_depths,
                        merge_metadata,
                        layer_target_thick,
                        layer_interp,
                        combine_dataset)
from w4h.mapping import (read_study_area,
                         coords2geometry,
                         clip_gdf2study_area,
                         sample_raster_points,
                         xyz_metadata_merge,
                         read_wms,
                         read_wcs,
                         grid2study_area,
                         read_model_grid,
                         read_grid,
                         align_rasters,
                         get_drift_thick)
from w4h.read import (get_current_date,
                      get_most_recent,
                      file_setup,
                      read_raw_csv,
                      read_xyz,
                      read_dict,
                      define_dtypes,
                      get_search_terms,
                      read_dictionary_terms,
                      read_lithologies,
                      add_control_points)

# Explicit public API of the package
__all__ = ('logger_function', 'verbose_print', 'run', 'get_resources',
           'specific_define',
           'split_defined',
           'start_define',
           'wildcard_define',
           'remerge_data',
           'depth_define',
           'export_undefined',
           'fill_unclassified',
           'merge_lithologies',
           'get_unique_wells',
           'sort_dataframe',
           'remove_nonlocated',
           'remove_no_topo',
           'remove_no_depth',
           'remove_bad_depth',
           'remove_no_description',
           'export_dataframe',
           'export_grids',
           'get_layer_depths',
           'merge_metadata',
           'layer_target_thick',
           'layer_interp',
           'combine_dataset',
           'read_study_area',
           'coords2geometry',
           'clip_gdf2study_area',
           'sample_raster_points',
           'xyz_metadata_merge',
           'read_wms',
           'read_wcs',
           'grid2study_area',
           'read_model_grid',
           'read_grid',
           'align_rasters',
           'get_drift_thick',
           'get_current_date',
           'get_most_recent',
           'file_setup',
           'read_raw_csv',
           'read_xyz',
           'read_dict',
           'define_dtypes',
           'get_search_terms',
           'read_dictionary_terms',
           'read_lithologies',
           'add_control_points'
           )

# Update the w4h.run() help() return to actually be helpful
run.__doc__ = core._run_docstring()

__author__ = 'Riley Balikian, Joe Franke, Allan Jones, Mike Krasowski'
Sub-modules
w4h.classify-
The Classify module contains functions for defining geological intervals into a preset subset of geologic interpretations.
w4h.clean-
The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)
w4h.core-
The Core module contains core functions of the package used in other modules or as primary functions in the package. This includes the main run() …
w4h.export-
The Export module contains functions for exporting processed data.
w4h.layers-
The Layers module contains functions for splitting data into a layered model and for interpolating data within the layers
w4h.mapping-
The Mapping module contains the functions used for geospatial analysis throughout the package. This includes some input/output as well as functions to …
w4h.read-
The Read module contains functions primarily for the input of data through the reading of data files, as well as support functions to carry out this task
Functions
def add_control_points(df_without_control, df_control=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', controlpoints_crs='EPSG:4269', output_crs='EPSG:4269', description_col='FORMATION', interp_col='INTERPRETATION', target_col='TARGET', verbose=False, log=False, **kwargs)-
Function to add control points, primarily to aid in interpolation. This may be useful when conditions are known but do not exist in input well database
Parameters
df_without_control:pandas.DataFrame- Dataframe with current working data
df_control:str, pathlib.Purepath,orpandas.DataFrame- Pandas dataframe with control points
well_key:str, optional- The column containing the "key" (unique identifier) for each well, by default 'API_NUMBER'
xcol:str, optional- The column in df_control containing the x coordinates for each control point, by default 'LONGITUDE'
ycol:str, optional- The column in df_control containing the y coordinates for each control point, by default 'LATITUDE'
zcol:str, optional- The column in df_control containing the z coordinates for each control point, by default 'ELEV_FT'
controlpoints_crs:str, optional- The coordinate reference system of the control points in df_control, by default 'EPSG:4269'
output_crs:str, optional- The output coordinate system, by default 'EPSG:4269'
description_col:str, optional- The column in df_control with the description (if this is used), by default 'FORMATION'
interp_col:str, optional- The column in df_control with the interpretation (if this is used), by default 'INTERPRETATION'
target_col:str, optional- The column in df_control with the target code (if this is used), by default 'TARGET'
verbose:bool, optional- Whether to print information to terminal, by default False
log:bool, optional- Whether to log information in log file, by default False
**kwargs- Keyword arguments of pandas.concat() or pandas.read_csv that will be passed to that function, except for objs, which are df and df_control
Returns
pandas.DataFrame- Pandas DataFrame with original data and control points formatted the same way and concatenated together
Expand source code
def add_control_points(df_without_control, df_control=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', controlpoints_crs='EPSG:4269', output_crs='EPSG:4269', description_col='FORMATION', interp_col='INTERPRETATION', target_col='TARGET', verbose=False, log=False, **kwargs):
    """Add control points to the working well data, primarily to aid in interpolation.

    This may be useful when conditions are known but do not exist in the input well database.

    Parameters
    ----------
    df_without_control : pandas.DataFrame
        Dataframe with current working data
    df_control : str, pathlib.PurePath, or pandas.DataFrame
        Control points, either as a dataframe or as a filepath readable by pandas.read_csv().
        If None, df_without_control is returned unchanged.
    xcol : str, optional
        The column in df_control containing the x coordinates for each control point, by default 'LONGITUDE'
    ycol : str, optional
        The column in df_control containing the y coordinates for each control point, by default 'LATITUDE'
    zcol : str, optional
        The column in df_control containing the z coordinates for each control point, by default 'ELEV_FT'
    controlpoints_crs : str, optional
        Coordinate reference system of the control points, by default 'EPSG:4269'
    output_crs : str, optional
        The output coordinate system, by default 'EPSG:4269'
    description_col : str, optional
        The column in df_control with the description (if this is used), by default 'FORMATION'
    interp_col : str, optional
        The column in df_control with the interpretation (if this is used), by default 'INTERPRETATION'
    target_col : str, optional
        The column in df_control with the target code (if this is used), by default 'TARGET'
    verbose : bool, optional
        Whether to print information to terminal, by default False
    log : bool, optional
        Whether to log information in log file, by default False
    **kwargs
        Keyword arguments passed through to pandas.concat() or pandas.read_csv(),
        except for objs, which are df_without_control and df_control

    Returns
    -------
    pandas.DataFrame
        Original data and control points formatted the same way and concatenated together
    """
    if verbose:
        verbose_print(add_control_points, locals(), exclude_params=['df_without_control', 'df_control'])

    # No control points supplied: nothing to do
    if df_control is None:
        return df_without_control

    # Read control points from file if a dataframe was not passed directly
    if not isinstance(df_control, (pd.DataFrame, gpd.GeoDataFrame)):
        read_csv_kwargs = {k: v for k, v in kwargs.items()
                           if k in inspect.signature(pd.read_csv).parameters.keys()}
        df_control = pd.read_csv(df_control, **read_csv_kwargs)

    # Drop columns from the control data that the working data does not carry
    for col in (target_col, interp_col, description_col):
        if col not in df_without_control.columns and col in df_control.columns:
            df_control.drop([col], axis=1, inplace=True)

    # If the main df is already a geodataframe, make df_control one too
    if isinstance(df_without_control, gpd.GeoDataFrame):
        from w4h import coords2geometry
        df_control = coords2geometry(df_no_geometry=df_control, xcol=xcol, ycol=ycol, zcol=zcol,
                                     input_coords_crs=controlpoints_crs, log=log)

    # kwargs for pd.concat, defaulting ignore_index=True and join='outer'
    concat_kwargs = {k: v for k, v in kwargs.items()
                     if k in inspect.signature(pd.concat).parameters.keys()}
    concat_kwargs.setdefault('ignore_index', True)
    concat_kwargs.setdefault('join', 'outer')

    # BUG FIX: previously the concatenated result was only defined when the
    # working data was a GeoDataFrame; now both paths produce a result.
    gdf = pd.concat([df_without_control, df_control], **concat_kwargs)

    # Reproject only when the result actually carries geometry
    if controlpoints_crs != output_crs and isinstance(gdf, gpd.GeoDataFrame):
        gdf = gdf.to_crs(output_crs)
    return gdf
# def align_rasters(grids_unaligned=None, model_grid=None, no_data_val_grid=0, verbose=False, log=False)-
Reprojects two rasters and aligns their pixels
Parameters
grids_unaligned:listorxarray.DataArray- Contains a list of grids or one unaligned grid
model_grid:xarray.DataArray- Contains model grid
no_data_val_grid:int, default=0- Sets value of no data pixels
log:bool, default= False- Whether to log results to log file, by default False
Returns
alignedGrids:listorxarray.DataArray- Contains aligned grids
Expand source code
def align_rasters(grids_unaligned=None, model_grid=None, no_data_val_grid=0, verbose=False, log=False): """Reprojects two rasters and aligns their pixels Parameters ---------- grids_unaligned : list or xarray.DataArray Contains a list of grids or one unaligned grid model_grid : xarray.DataArray Contains model grid no_data_val_grid : int, default=0 Sets value of no data pixels log : bool, default = False Whether to log results to log file, by default False Returns ------- alignedGrids : list or xarray.DataArray Contains aligned grids """ if grids_unaligned is None: grids_unaligned = [w4h.get_resources()['surf_elev'], w4h.get_resources()['bedrock_elev']] if model_grid is None: model_grid = w4h.get_resources()['model_grid'] logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(align_rasters, locals(), exclude_params=['grids_unaligned', 'model_grid']) if isinstance(grids_unaligned, (tuple, list)): alignedGrids = [] for g in grids_unaligned: alignedGrid = g.rio.reproject_match(model_grid) try: no_data_val_grid = alignedGrid.attrs['_FillValue'] #Extract from dataset itself except: pass alignedGrid = alignedGrid.where(alignedGrid != no_data_val_grid) #Replace no data values with NaNs alignedGrids.append(alignedGrid) else: alignedGrid = grids_unaligned.rio.reproject_match(model_grid) try: noDataVal = alignedGrid.attrs['_FillValue'] #Extract from dataset itself except: pass alignedGrids = alignedGrid.where(alignedGrid != noDataVal, other=np.nan) #Replace no data values with NaNs return alignedGrids def clip_gdf2study_area(study_area, gdf, log=False, verbose=False)-
Clips dataframe to only include things within study area.
Parameters
study_area:geopandas.GeoDataFrame- Inputs study area polygon
gdf:geopandas.GeoDataFrame- Inputs point data
log:bool, default= False- Whether to log results to log file, by default False
Returns
gdfClip:geopandas.GeoDataFrame- Contains only points within the study area
Expand source code
def clip_gdf2study_area(study_area, gdf, log=False, verbose=False): """Clips dataframe to only include things within study area. Parameters ---------- study_area : geopandas.GeoDataFrame Inputs study area polygon gdf : geopandas.GeoDataFrame Inputs point data log : bool, default = False Whether to log results to log file, by default False Returns ------- gdfClip : geopandas.GeoDataFrame Contains only points within the study area """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(clip_gdf2study_area, locals(), exclude_params=['study_area', 'gdf']) if study_area is None: return gdf else: studyArea_proj = study_area.to_crs(gdf.crs).copy() gdfClip = gpd.clip(gdf, studyArea_proj) #Easier to project just study area to ensure data fit gdfClip.reset_index(inplace=True, drop=True) #Reset index return gdfClip def combine_dataset(layer_dataset, surface_elev, bedrock_elev, layer_thick, log=False)-
Function to combine xarray datasets or datarrays into a single xr.Dataset. Useful to add surface, bedrock, layer thick, and layer datasets all into one variable, for pickling, for example.
Parameters
layer_dataset:xr.DataArray- DataArray containing all the interpolated layer information.
surface_elev:xr.DataArray- DataArray containing surface elevation data
bedrock_elev:xr.DataArray- DataArray containing bedrock elevation data
layer_thick:xr.DataArray- DataArray containing the layer thickness at each point in the model grid
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
xr.Dataset- Dataset with all input arrays set to different variables within the dataset.
Expand source code
def combine_dataset(layer_dataset, surface_elev, bedrock_elev, layer_thick, log=False): """Function to combine xarray datasets or datarrays into a single xr.Dataset. Useful to add surface, bedrock, layer thick, and layer datasets all into one variable, for pickling, for example. Parameters ---------- layer_dataset : xr.DataArray DataArray contining all the interpolated layer information. surface_elev : xr.DataArray DataArray containing surface elevation data bedrock_elev : xr.DataArray DataArray containing bedrock elevation data layer_thick : xr.DataArray DataArray containing the layer thickness at each point in the model grid log : bool, default = False Whether to log inputs and outputs to log file. Returns ------- xr.Dataset Dataset with all input arrays set to different variables within the dataset. """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) daDict = {} daDict['Layers'] = layer_dataset daDict['Surface_Elev'] = surface_elev daDict['Bedrock_Elev'] = bedrock_elev daDict['Layer_Thickness'] = layer_thick combined_dataset = xr.Dataset(daDict) return combined_dataset def coords2geometry(df_no_geometry, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', input_coords_crs='EPSG:4269', use_z=False, wkt_col='WKT', geometry_source='coords', verbose=False, log=False)-
Adds geometry to points with xy coordinates in the specified coordinate reference system.
Parameters
df_no_geometry:pandas.Dataframe- a Pandas dataframe containing points
xcol:str, default='LONGITUDE'- Name of column holding x coordinate data in df_no_geometry
ycol:str, default='LATITUDE'- Name of column holding y coordinate data in df_no_geometry
zcol:str, default='ELEV_FT'- Name of column holding z coordinate data in df_no_geometry
input_coords_crs:str, default='EPSG:4269'- Name of crs used for geometry
use_z:bool, default=False- Whether to use z column in calculation
geometry_source:str {'coords', 'wkt', 'geometry'}log:bool, default= False- Whether to log results to log file, by default False
Returns
gdf:geopandas.GeoDataFrame- Geopandas dataframe with points and their geometry values
Expand source code
def coords2geometry(df_no_geometry, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEV_FT', input_coords_crs='EPSG:4269', use_z=False, wkt_col='WKT', geometry_source='coords', verbose=False, log=False):
    """Attach point geometry to a dataframe of coordinates in the given CRS.

    Parameters
    ----------
    df_no_geometry : pandas.DataFrame
        A pandas dataframe containing points.
    xcol : str, default='LONGITUDE'
        Name of column holding x coordinate data in df_no_geometry.
    ycol : str, default='LATITUDE'
        Name of column holding y coordinate data in df_no_geometry.
    zcol : str, default='ELEV_FT'
        Name of column holding z coordinate data in df_no_geometry.
    input_coords_crs : str, default='EPSG:4269'
        Name of crs used for geometry.
    use_z : bool, default=False
        Whether to use the z column in the geometry.
    wkt_col : str, default='WKT'
        Column with well-known-text strings, used when geometry_source is 'wkt'.
    geometry_source : str {'coords', 'wkt', 'geometry'}
        Where the geometry information comes from.
    verbose : bool, default=False
        Whether to print parameter information to the terminal.
    log : bool, default=False
        Whether to log results to log file, by default False

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        Geopandas dataframe with points and their geometry values.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(coords2geometry, locals(), exclude_params=['df_no_geometry'])

    # Already a GeoDataFrame: pass through unchanged
    if isinstance(df_no_geometry, gpd.GeoDataFrame):
        return df_no_geometry

    wkt_aliases = ['wkt', 'well known text', 'wellknowntext', 'well_known_text', 'w']
    coord_aliases = ['coords', 'coordinates', 'xycoords', 'xy', 'xyz', 'xyzcoords', 'xycoordinates', 'xyzcoordinates', 'c']
    geom_aliases = ['geometry', 'geom', 'geo', 'g']

    source = geometry_source.lower()
    if source in wkt_aliases:
        from shapely import wkt
        # NOTE(review): this mutates df_no_geometry in place (drops the WKT column)
        df_no_geometry['geometry'] = df_no_geometry[wkt_col].apply(wkt.loads)
        df_no_geometry.drop(wkt_col, axis=1, inplace=True)
    elif source in coord_aliases:
        xvals = df_no_geometry[xcol].to_numpy()
        yvals = df_no_geometry[ycol].to_numpy()
        # points_from_xy treats z=None the same as omitting z
        zvals = df_no_geometry[zcol].to_numpy() if use_z else None
        df_no_geometry["geometry"] = gpd.points_from_xy(xvals, yvals, z=zvals, crs=input_coords_crs)
    elif source in geom_aliases:
        pass
    else:
        warnings.warn(message=f"""The parameter geometry_source={geometry_source} is not recognized. Should be one of 'coords' (if x, y (and/or z) columns with coordintes used), 'wkt' (if column with wkt string used), or 'geometry' (if column with shapely geometry objects used, as with a GeoDataFrame)""")

    gdf = gpd.GeoDataFrame(df_no_geometry, geometry='geometry', crs=input_coords_crs)
    return gdf
# def define_dtypes(undefined_df, datatypes=None, verbose=False, log=False)-
Function to define datatypes of a dataframe, especially with file-indicated dtypes
Parameters
undefined_df:pd.DataFrame- Pandas dataframe with columns whose datatypes need to be (re)defined
datatypes:dict, str, pathlib.PurePath() object,orNone, default= None- Dictionary containing datatypes, to be used in pandas.DataFrame.astype() function. If None, will read from file indicated by dtype_file (which must be defined, along with dtype_dir), by default None
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
dfout:pandas.DataFrame- Pandas dataframe containing redefined columns
Expand source code
def define_dtypes(undefined_df, datatypes=None, verbose=False, log=False): """Function to define datatypes of a dataframe, especially with file-indicated dyptes Parameters ---------- undefined_df : pd.DataFrame Pandas dataframe with columns whose datatypes need to be (re)defined datatypes : dict, str, pathlib.PurePath() object, or None, default = None Dictionary containing datatypes, to be used in pandas.DataFrame.astype() function. If None, will read from file indicated by dtype_file (which must be defined, along with dtype_dir), by default None log : bool, default = False Whether to log inputs and outputs to log file. Returns ------- dfout : pandas.DataFrame Pandas dataframe containing redefined columns """ if verbose: verbose_print(define_dtypes, locals(), exclude_params=['undefined_df']) if undefined_df is None: dfout = None else: logger_function(log, locals(), inspect.currentframe().f_code.co_name) dfout = undefined_df.copy() if isinstance(datatypes, pathlib.PurePath) or isinstance(datatypes, str): datatypes = pathlib.Path(datatypes) if not datatypes.exists(): if verbose: print("ERROR: datatypes file '{}' does not exist, using inferred datatypes.".format(datatypes),) return dfout elif datatypes.is_dir(): if verbose: print('ERROR: datatypes must be either dict or filepath (path to directories not allowed)') return dfout datatypes = read_dict(file=datatypes) dfout = dfout.astype(datatypes) elif isinstance(datatypes, dict): if verbose: print('datatypes is None, not updating datatypes') dfout = dfout.astype(datatypes) else: if verbose: print('ERROR: datatypes must be either dict or a filepath, not {}'.format(type(datatypes))) return dfout #This is likely redundant dfcols = dfout.columns for i in range(0, np.shape(dfout)[1]): dfout.iloc[:,i] = undefined_df.iloc[:,i].astype(datatypes[dfcols[i]]) return dfout def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False)-
Function to define all intervals lower than thresh as bedrock
Parameters
df:pandas.DataFrame- Dataframe to classify
top_col:str, default= 'TOP'- Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP'
thresh:float, default= 550.0- Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0.
verbose:bool, default= False- Whether to print results, by default False
log:bool, default= True- Whether to log results to log file
Returns
df:pandas.DataFrame- Dataframe containing intervals classified as bedrock due to depth
Expand source code
def depth_define(df, top_col='TOP', thresh=550.0, verbose=False, log=False): """Function to define all intervals lower than thresh as bedrock Parameters ---------- df : pandas.DataFrame Dataframe to classify top_col : str, default = 'TOP' Name of column that contains the depth information, likely of the top of the well interval, by default 'TOP' thresh : float, default = 550.0 Depth (in units used in df['top_col']) below which all intervals will be classified as bedrock, by default 550.0. verbose : bool, default = False Whether to print results, by default False log : bool, default = True Whether to log results to log file Returns ------- df : pandas.DataFrame Dataframe containing intervals classified as bedrock due to depth """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(depth_define, locals(), exclude_params=['df']) df = df.copy() df['CLASS_FLAG'].mask(df[top_col]>thresh, 3 ,inplace=True) #Add a Classification Flag of 3 (bedrock b/c it's deepter than 550') to all records where the top of the interval is >550' df['BEDROCK_FLAG'].mask(df[top_col]>thresh, True, inplace=True) if verbose: if df.CLASS_FLAG.notnull().sum() == 0: brDepthClass = 0 else: brDepthClass = df['CLASS_FLAG'].value_counts()[3.0] total = df.shape[0] numRecsClass = int(df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum()) percRecsClass= round((df[df['CLASS_FLAG']==3]['CLASS_FLAG'].sum()/df.shape[0])*100,2) recsRemainig = int(df.shape[0]-numRecsClass) print('Classified bedrock well records using depth threshold at depth of {}'.format(thresh)) print("\t{} records classified using bedrock threshold depth ({}% of unclassified data)".format(numRecsClass, percRecsClass)) print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemainig, 1-percRecsClass)) return df def export_dataframe(df, out_dir, filename, date_stamp=True, log=False)-
Function to export dataframes
Parameters
df:pandas dataframe,orlistofpandas dataframes- Data frame or list of dataframes to be exported
out_dir:stringorpathlib.Path object- Directory to which to export dataframe object(s) as .csv
filename:strorlistofstrings- Filename(s) of output files
date_stamp:bool, default=True- Whether to include a datestamp in the filename. If true, file ends with _yyyy-mm-dd.csv of current date, by default True.
log:bool, default= True- Whether to log inputs and outputs to log file.
Expand source code
def export_dataframe(df, out_dir, filename, date_stamp=True, log=False): """Function to export dataframes Parameters ---------- df : pandas dataframe, or list of pandas dataframes Data frame or list of dataframes to be exported out_dir : string or pathlib.Path object Directory to which to export dataframe object(s) as .csv filename : str or list of strings Filename(s) of output files date_stamp : bool, default=True Whether to include a datestamp in the filename. If true, file ends with _yyyy-mm-dd.csv of current date, by default True. log : bool, default = True Whether to log inputs and outputs to log file. """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if date_stamp: nowTime = datetime.datetime.now() nowTime = str(nowTime).replace(':', '-').replace(' ','_').split('.')[0] nowTimeStr = '_'+str(nowTime) else: nowTimeStr='' if type(out_dir) is str or isinstance(out_dir, pathlib.PurePath): out_dir = str(out_dir) out_dir = out_dir.replace('\\', '/').replace('\\'[-1], '/') if out_dir[-1] != '/': out_dir = out_dir + '/' else: print('Please input string or pathlib object for out_dir parameters') return if type(filename) is str: dfOutFile = out_dir+filename+nowTimeStr+'.csv' df.to_csv(dfOutFile, index_label='ID') print('Exported '+filename+nowTimeStr+'.csv') elif type(filename) is list and type(df) is list and len(df) == len(filename): for i, f in enumerate(df): fname = filename[i] dfOutFile = out_dir+fname+nowTimeStr+'.csv' f.to_csv(dfOutFile, index_label='ID') print('Exported '+fname+nowTimeStr+'.csv') def export_grids(grid_data, out_path, file_id='', filetype='tif', variable_sep=True, date_stamp=True, verbose=False, log=False)-
Function to export grids to files.
Parameters
grid_data:xarray DataArrayorxarray Dataset- Dataset or dataarray to be exported
out_path:strorpathlib.Path object- Output location for data export. If variable_sep=True, this should be a directory. Otherwise, this should also include the filename. The file extension should not be included here.
file_id:str, optional- If specified, will add this after 'LayerXX' or 'AllLayers' in the filename, just before datestamp, if used. Example filename for file_id='Coarse': Layer1_Coarse_2023-04-18.tif.
filetype:str, optional- Output filetype. Can either be pickle or any file extension supported by rioxarray.rio.to_raster(). Can either include period or not., by default 'tif'
variable_sep:bool, optional- If grid_data is an xarray Dataset, this will export each variable in the dataset as a separate file, including the variable name in the filename, by default True
date_stamp:bool, optional- Whether to include a date stamp in the file name., by default True
log:bool, default= True- Whether to log inputs and outputs to log file.
Expand source code
def export_grids(grid_data, out_path, file_id='', filetype='tif', variable_sep=True, date_stamp=True, verbose=False, log=False):
    """Export grids to file(s).

    Parameters
    ----------
    grid_data : xarray DataArray or xarray Dataset
        Dataset or dataarray to be exported.
    out_path : str or pathlib.Path object
        Output location for data export. If a directory, filenames are generated
        per layer/variable; otherwise the path's stem is used as the filename.
        The file extension should not be included here.
    file_id : str, optional
        If specified, added after 'LayerXX' or 'AllLayers' in the filename, just
        before the datestamp, if used.
    filetype : str, optional
        Output filetype: pickle or any extension supported by
        rioxarray.rio.to_raster(). May include a leading period or not, by default 'tif'.
    variable_sep : bool, optional
        Export each layer/variable of grid_data as a separate file, including
        the layer/variable name in the filename, by default True.
    date_stamp : bool, optional
        Whether to include a date stamp in the file name, by default True.
    verbose : bool, default=False
        Whether to print the export filepaths to the terminal.
    log : bool, default=False
        Whether to log inputs and outputs to log file.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(export_grids, locals(), exclude_params=['grid_data'])

    # Alias lists for the export dispatch below
    ncdfList = ['netcdf', 'ncdf', 'n']  # NOTE(review): not referenced by the dispatch below
    tifList = ['tif', 'tiff', 'geotiff', 'geotif', 't']
    pickleList = ['pickle', 'pkl', 'p']

    # Work out the output filename(s)
    filenames = []
    if isinstance(out_path, (pathlib.PurePath, str)):
        if not isinstance(out_path, pathlib.PurePath):
            out_path = pathlib.Path(out_path)
        if out_path.parent.exists() == False:
            print('Directory does not exist. Please enter a different value for the out_path parameter.')
            return
        if out_path.is_dir():
            if isinstance(grid_data, xr.DataArray):
                if variable_sep:
                    filenames = ['Layer' + str(lyr) for lyr in grid_data.coords['Layer'].values]
                else:
                    filenames = ['AllLayers']
            if isinstance(grid_data, xr.Dataset):
                if variable_sep:
                    filenames = [var for var in grid_data]
                else:
                    filenames = ['AllLayers']
        else:
            # out_path names the file itself; split into stem + directory
            filenames = [out_path.stem]
            out_path = out_path.parent
    else:
        print('No output path specified (out_path). Please input string or pathlib object for out_path parameters')
        return

    # Datestamp suffix, if desired in the output filename
    if date_stamp:
        nowTime = datetime.datetime.now()
        nowTime = str(nowTime).replace(':', '-').replace(' ', '_').split('.')[0]
        nowTimeStr = '_' + str(nowTime)
    else:
        nowTimeStr = ''

    # Ensure the file suffix includes '.'
    if filetype[0] != '.':
        filetype = '.' + filetype
    if file_id != '':
        file_id = '_' + file_id

    out_path = out_path.as_posix() + '/'
    if verbose:
        print('Export filepath(s):')
    outPaths = []
    for fname in filenames:
        currOutPath = out_path + fname + file_id + nowTimeStr + filetype
        outPaths.append(currOutPath)
        if verbose:
            print('\t {}'.format(currOutPath))

    # Do export.
    # NOTE(review): filetype already carries a leading '.', so this membership
    # test against bare aliases appears to never match — confirm upstream intent.
    if filetype.lower() in pickleList:
        import pickle
        for op in outPaths:
            try:
                with open(op, 'wb') as fobj:
                    pickle.dump(grid_data, fobj)
            except:
                print('An error occured during export.')
                print(op, 'could not be exported as a pickle object.')
                print('Try again using different parameters.')
    else:
        import rioxarray as rxr
        try:
            if isinstance(grid_data, xr.Dataset):
                if variable_sep:
                    for i, var in enumerate(grid_data.data_vars):
                        grid_data[var].rio.to_raster(outPaths[i])
                else:
                    grid_data.rio.to_raster(outPaths[0])
            elif isinstance(grid_data, xr.DataArray):
                if variable_sep:
                    for i, lyr in enumerate(grid_data.coords['Layer'].values):
                        out_grid = grid_data.sel(Layer=lyr).copy()
                        out_grid.rio.to_raster(outPaths[i])
                else:
                    grid_data.rio.to_raster(outPaths[0])
            else:
                grid_data.rio.to_raster(outPaths[0])
        except:
            print('An error occured during export.')
            print('{} could not be exported as {} file.'.format(outPaths, filetype))
            print('Try again using different parameters.')
    return
# def export_undefined(df, outdir)-
Function to export terms that still need to be defined.
Parameters
df:pandas.DataFrame- Dataframe containing at least some unclassified data
outdir:strorpathlib.Path- Directory to save file. Filename will be generated automatically based on today's date.
Returns
stillNeededDF:pandas.DataFrame- Dataframe containing only unclassified terms, and the number of times they occur
Expand source code
def export_undefined(df, outdir):
    """Function to export terms that still need to be defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing at least some unclassified data
    outdir : str or pathlib.Path
        Directory to save file. Filename will be generated automatically based on today's date.

    Returns
    -------
    stillNeededDF : pandas.DataFrame
        Dataframe containing only unclassified terms, and the number of times they occur
    """
    import pathlib

    if isinstance(outdir, pathlib.PurePath):
        if not outdir.is_dir() or not outdir.exists():
            print('Please specify a valid directory for export. Filename is generated automatically.')
            return
        outdir = outdir.as_posix()
    else:
        # str.replace() returns a new string; the previous code discarded the
        # result (and used the no-op '\\'[-1]), so Windows-style backslashes
        # were never actually normalized. Assign the result back.
        outdir = outdir.replace('\\', '/')

    # Ensure the directory path ends with a separator before appending the filename
    if outdir[-1] != '/':
        outdir = outdir + '/'

    todayDateStr = str(datetime.date.today())

    # Rows with no CLASS_FLAG are the still-undefined terms
    searchDF = df[df['CLASS_FLAG'].isna()]
    stillNeededDF = searchDF['FORMATION'].value_counts()
    stillNeededDF.to_csv(outdir + 'Undefined_' + todayDateStr + '.csv')
    return stillNeededDF
Function to setup files, assuming data, metadata, and elevation/location are in separate files (there should be one "key"/identifying column consistent across all files to join/merge them later)
This function may not be useful if files are organized differently than this structure. If that is the case, it is recommended to use the get_most_recent() function for each individual file if needed. It may also be of use to simply skip this function altogether and directly define each filepath in a manner that can be used by pandas.read_csv()
Parameters
well_data:strorpathlib.Path object- Str or pathlib.Path to directory containing input files, by default str(repoDir)+'/resources'
metadata:strorpathlib.Path object, optional- Str or pathlib.Path to directory containing input metadata files, by default str(repoDir)+'/resources'
data_filename:str, optional- Pattern used by pathlib.glob() to get the most recent data file, by default 'ISGS_DOWNHOLE_DATA.txt'
metadata_filename:str, optional- Pattern used by pathlib.glob() to get the most recent metadata file, by default 'ISGS_HEADER.txt'
log_dir:strorpathlib.PurePath()orNone, default=None- Directory to place log file in. This is not read directly, but is used indirectly by w4h.logger_function()
verbose:bool, default= False- Whether to print name of files to terminal, by default False
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
tuple- Tuple with paths to (well_data, metadata)
Expand source code
def file_setup(well_data, metadata=None, data_filename='*ISGS_DOWNHOLE_DATA*.txt', metadata_filename='*ISGS_HEADER*.txt', log_dir=None, verbose=False, log=False):
    """Function to setup files, assuming data, metadata, and elevation/location are in separate files (there should be one "key"/identifying column consistent across all files to join/merge them later)

    This function may not be useful if files are organized differently than this structure. If that is the case, it is recommended to use the get_most_recent() function for each individual file if needed. It may also be of use to simply skip this function altogether and directly define each filepath in a manner that can be used by pandas.read_csv()

    Parameters
    ----------
    well_data : str or pathlib.Path object
        Str or pathlib.Path to directory containing input files, by default str(repoDir)+'/resources'
    metadata : str or pathlib.Path object, optional
        Str or pathlib.Path to directory containing input metadata files, by default str(repoDir)+'/resources'
    data_filename : str, optional
        Pattern used by pathlib.glob() to get the most recent data file, by default '*ISGS_DOWNHOLE_DATA*.txt'
    metadata_filename : str, optional
        Pattern used by pathlib.glob() to get the most recent metadata file, by default '*ISGS_HEADER*.txt'
    log_dir : str or pathlib.PurePath() or None, default=None
        Directory to place log file in. This is not read directly, but is used indirectly by w4h.logger_function()
    verbose : bool, default = False
        Whether to print name of files to terminal, by default False
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    tuple
        Tuple with paths to (well_data, metadata)
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(file_setup, locals(), exclude_params=['well_data'])

    # Define filepath variables to be used later for reading/writing files
    data_path = pathlib.Path(well_data)
    if metadata is None:
        origMetaPath = None
        metadata = data_path
    else:
        origMetaPath = metadata
        metadata = pathlib.Path(metadata)

    # If input path is a directory, find most recent version of the file.
    # If it is a file, just use the file directly.
    if data_path.is_dir():
        downholeDataFILE = get_most_recent(data_path, data_filename, verbose=verbose)
    else:
        downholeDataFILE = data_path

    if metadata.is_dir():
        headerDataFILE = get_most_recent(metadata, metadata_filename, verbose=verbose)
        # get_most_recent() returns an empty pathlib.Path() when nothing matches
        # the pattern. (The previous check compared against [], which could
        # never be True, so the fallback to the data file never fired.)
        if headerDataFILE == pathlib.Path():
            headerDataFILE = downholeDataFILE
    else:
        if origMetaPath is None:
            # No metadata argument given and well_data is a file: reuse it
            headerDataFILE = downholeDataFILE
        else:
            headerDataFILE = metadata

    # Set all input as pathlib.Path objects (may be redundant, but just in case)
    downholeDataPATH = pathlib.Path(downholeDataFILE)
    headerDataPATH = pathlib.Path(headerDataFILE)

    if verbose:
        print('Using the following file(s):')
        print('\t', downholeDataFILE)
        if headerDataFILE != downholeDataFILE:
            print('\t', headerDataFILE)

    return downholeDataPATH, headerDataPATH
Fills unclassified (NaN) rows in 'CLASS_FLAG' column with 0
Parameters
df:pandas.DataFrame- Dataframe on which to perform operation
Returns
df:pandas.DataFrame- Dataframe on which operation has been performed
Expand source code
def fill_unclassified(df, classification_col='CLASS_FLAG'):
    """Fills unclassified (NaN) rows in the classification column with 0.

    (The docstring previously said NaN was used as the fill value; the code
    fills NaN values *with* 0, and the documentation now matches the code.)

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe on which to perform operation
    classification_col : str, default='CLASS_FLAG'
        Name of the column whose missing values should be filled with 0

    Returns
    -------
    df : pandas.DataFrame
        The same dataframe object, with NaN values in classification_col replaced by 0
    """
    # Assign the result instead of fillna(inplace=True): inplace on a column
    # selection is deprecated in modern pandas and may not modify df at all.
    df[classification_col] = df[classification_col].fillna(0)
    return df
Gets The Current Date To Help With Finding The Most Recent File
Parameters
None
Returns
todayDate- datetime object with today's date
dateSuffix- str to use for naming output files
Expand source code
def get_current_date():
    """Get the current date, to help with finding the most recent file.

    Parameters
    ----------
    None

    Returns
    -------
    todayDate : datetime.date
        datetime object with today's date
    dateSuffix : str
        str to use for naming output files (an underscore followed by the ISO date)
    """
    today = datetime.date.today()
    return today, f"_{today}"
Finds the distance from surface_elev to bedrock_elev and then divides by number of layers to get layer thickness.
Parameters
surface_elev:rioxarray.DataArray- array holding surface elevation
bedrock_elev:rioxarray.DataArray- array holding bedrock elevation
layers:int, default=9- number of layers needed to calculate thickness for
plot:bool, default=False- tells function to either plot the data or not
Returns
driftThick:rioxarray.DataArray- Contains data array containing depth to bedrock at each point
layerThick:rioxarray.DataArray- Contains data array with layer thickness at each point
Expand source code
def get_drift_thick(surface_elev=None, bedrock_elev=None, layers=9, plot=False, verbose=False, log=False):
    """Finds the distance from surface_elev to bedrock_elev and then divides by number of layers to get layer thickness.

    Parameters
    ----------
    surface_elev : rioxarray.DataArray
        array holding surface elevation
    bedrock_elev : rioxarray.DataArray
        array holding bedrock elevation
    layers : int, default=9
        number of layers needed to calculate thickness for
    plot : bool, default=False
        tells function to either plot the data or not
    verbose : bool, default=False
        Whether to print status information to the terminal
    log : bool, default=False
        Whether to log inputs and outputs to log file

    Returns
    -------
    driftThick : rioxarray.DataArray
        Contains data array containing depth to bedrock at each point
    layerThick : rioxarray.DataArray
        Contains data array with layer thickness at each point
    """
    # Fall back to the packaged sample elevation rasters when none are supplied
    if surface_elev is None:
        surface_elev = w4h.get_resources()['surf_elev']
    if bedrock_elev is None:
        bedrock_elev = w4h.get_resources()['bedrock_elev']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_drift_thick, locals(), exclude_params=['surface_elev', 'bedrock_elev'])

    # Keep raster attributes (CRS, nodata, etc.) through the arithmetic below
    xr.set_options(keep_attrs=True)
    driftThick = surface_elev - bedrock_elev
    driftThick = driftThick.clip(0, max=5000, keep_attrs=True)

    if plot:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        maxThick = np.nanmax(driftThick)
        if maxThick > 550:
            maxThick = 550  # Cap the color scale so outliers don't wash out the map
        dtPlot = driftThick.plot(vmin=0, vmax=maxThick, ax=ax)
        ax.set_title("Drift Thickness")

    # Extract the no-data value from the dataset itself, falling back to a sentinel.
    # (Previously a bare `except:` swallowed every error, including KeyboardInterrupt.)
    # NOTE(review): noDataVal is currently unused below; the fixed +/-100000
    # thresholds mask no-data cells instead — confirm whether it should be used.
    noDataVal = driftThick.attrs.get('_FillValue', 100001)

    driftThick = driftThick.where(driftThick < 100000, other=np.nan)  # Replace no data values with NaNs
    driftThick = driftThick.where(driftThick > -100000, other=np.nan)  # Replace no data values with NaNs

    layerThick = driftThick / layers
    xr.set_options(keep_attrs='default')
    return driftThick, layerThick
Function to calculate depths and elevations of each model layer at each well based on surface elevation, bedrock elevation, and number of layers/layer thickness
Parameters
df_with_depths:pandas.DataFrame- Dataframe containing well metadata
layers:int, default=9- Number of layers. This should correlate with get_drift_thick() input parameter, if drift thickness was calculated using that function, by default 9.
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
pandas.Dataframe- Dataframe containing new columns for depth to layers and elevation of layers.
Expand source code
def get_layer_depths(df_with_depths, surface_elev_col='SURFACE_ELEV', layer_thick_col='LAYER_THICK', layers=9, log=False):
    """Function to calculate depths and elevations of each model layer at each well based on surface elevation, bedrock elevation, and number of layers/layer thickness

    Parameters
    ----------
    df_with_depths : pandas.DataFrame
        Dataframe containing well metadata
    surface_elev_col : str, default='SURFACE_ELEV'
        Name of the column holding land-surface elevation at each well
    layer_thick_col : str, default='LAYER_THICK'
        Name of the column holding per-well model-layer thickness
    layers : int, default=9
        Number of layers. This should correlate with get_drift_thick() input parameter, if drift thickness was calculated using that function, by default 9.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.Dataframe
        Dataframe containing new columns for depth to layers and elevation of layers.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Depth to the top of each layer = layer index * per-well layer thickness
    for lyr in range(layers):
        df_with_depths[f'DEPTH_LAYER{lyr + 1}'] = df_with_depths[layer_thick_col] * lyr

    # Elevation of each layer top = land surface minus the depth to that layer
    for lyr in range(layers):
        df_with_depths[f'ELEV_LAYER{lyr + 1}'] = df_with_depths[surface_elev_col] - df_with_depths[layer_thick_col] * lyr

    return df_with_depths
Function to find the most recent file with the indicated pattern, using pathlib.glob function.
Parameters
dir:strorpathlib.Path object, optional- Directory in which to find the most recent file, by default str(repoDir)+'/resources'
glob_pattern:str, optional- String used by the pathlib.glob() function/method for searching, by default '*'
Returns
pathlib.Path object- Pathlib Path object of the most recent file fitting the glob pattern indicated in the glob_pattern parameter.
Expand source code
def get_most_recent(dir=resource_dir, glob_pattern='*', verbose=True):
    """Function to find the most recent file with the indicated pattern, using pathlib.glob function.

    Parameters
    ----------
    dir : str or pathlib.Path object, optional
        Directory in which to find the most recent file, by default str(repoDir)+'/resources'
    glob_pattern : str, optional
        String used by the pathlib.glob() function/method for searching, by default '*'
    verbose : bool, default=True
        Whether to print the matched file (or a no-match notice) to the terminal

    Returns
    -------
    pathlib.Path object
        Pathlib Path object of the most recent file fitting the glob pattern indicated in the glob_pattern parameter.
    """
    if verbose:
        verbose_print(get_most_recent, locals())

    files = list(pathlib.Path(dir).rglob(glob_pattern))

    if not files:
        # If no files found that match pattern, return an empty pathlib.Path()
        if verbose:
            print('No file found in {} matching {} pattern'.format(dir, glob_pattern))
        return pathlib.Path()

    # Most recent = greatest modification time. max() keeps the first of any
    # ties, matching the previous argmin-over-(today - mtime) selection, and
    # avoids re-running the glob a second time just to index into it.
    mostRecentFile = max(files, key=os.path.getmtime)

    if verbose:
        print('Most Recent version of file fitting {} pattern is: {}'.format(glob_pattern, mostRecentFile.name))
    return mostRecentFile
Function to get filepaths for resources included with package
Parameters
resource_type:str, {'filepaths', 'data'}- If filepaths, will return dictionary with filepaths to sample data. If data, returns dictionary with data objects.
scope:str, {'local', 'statewide'}- If 'local', will read in sample data for a local (around county sized) project. If 'state', will read in sample data for a statewide project (Illinois)
verbose:bool, optional- Whether to print results to terminal, by default False
Returns
resources_dict:dict- Dictionary containing key, value pairs with filepaths to resources that may be of interest.
Expand source code
def get_resources(resource_type='filepaths', scope='local', verbose=False):
    """Function to get filepaths for resources included with package

    Parameters
    ----------
    resource_type : str, {'filepaths', 'data'}
        If filepaths, will return dictionary with filepaths to sample data. If data, returns dictionary with data objects.
    scope : str, {'local', 'statewide'}
        If 'local', will read in sample data for a local (around county sized) project. If 'state', will read in sample data for a statewide project (Illinois)
    verbose : bool, optional
        Whether to print results to terminal, by default False

    Returns
    -------
    resources_dict : dict
        Dictionary containing key, value pairs with filepaths to resources that may be of interest.
    """
    resources_dict = {}
    sample_data_dir = resource_dir.joinpath('sample_data')  #Get sample data

    #Get lithology dictionaries' filepaths (exact, startswith, and wildcard search-term files)
    sample_dictionary_dir = sample_data_dir.joinpath('DictionaryTerms')
    resources_dict['LithologyDict_Exact'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*DICTIONARY_SearchTerms*', verbose=verbose)
    resources_dict['LithologyDict_Start'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*SearchTerms-Start*', verbose=verbose)
    resources_dict['LithologyDict_Wildcard'] = w4h.get_most_recent(dir=sample_dictionary_dir, glob_pattern='*SearchTerms-Wildcard*', verbose=verbose)

    #Get Lithology Interpretation filepaths
    lith_interp_dir = sample_data_dir.joinpath('LithologyInterpretations')
    resources_dict['LithInterps_FineCoarse'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*FineCoarse*', verbose=verbose)
    resources_dict['LithInterps_Clay'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Clay*', verbose=verbose)
    resources_dict['LithInterps_Silt'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Silt*', verbose=verbose)
    resources_dict['LithInterps_Sand'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Sand*', verbose=verbose)
    resources_dict['LithInterps_Gravel'] = w4h.get_most_recent(dir=lith_interp_dir, glob_pattern='*Gravel*', verbose=verbose)

    #Get other resource filepaths (dtype definition files, CRS definition, model grid raster)
    resources_dict['well_data_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*downholeDataTypes*', verbose=verbose)
    resources_dict['metadata_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*headerDataTypes*', verbose=verbose)
    resources_dict['ISWS_CRS'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='isws_crs.json', verbose=verbose)
    resources_dict['xyz_dtypes'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='xyzDataTypes.json', verbose=verbose)
    resources_dict['model_grid'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='grid_625_raster.tif', verbose=verbose)

    # Statewide (Illinois) vs. local sample data: pick the file set by scope keyword
    statewideSampleDir = sample_data_dir.joinpath('statewide_sample_data')
    statewideList = ['statewide', 'state', 'regional', 'region', 's', 'r']
    if scope.lower() in statewideList:
        resources_dict['well_data'] = statewideSampleDir.joinpath("IL_Statewide_WellData_XYz_2023-07-20_cleaned.zip")
        resources_dict['surf_elev'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_Surface_Elev_ft_625ft_Lambert_GridAlign*', verbose=verbose)
        resources_dict['bedrock_elev'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_Bedrock_Elev_2023_ft_625ft_Lambert_GridAlign*', verbose=verbose)
        resources_dict['study_area'] = w4h.get_most_recent(dir=statewideSampleDir, glob_pattern='*IL_Statewide_boundary*', verbose=verbose)
    else:
        resources_dict['study_area'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*sample_studyArea*', verbose=verbose)
        resources_dict['surf_elev'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*sample_surface_bedrock_lidarresampled100ft*', verbose=verbose)
        resources_dict['bedrock_elev'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='*LocalSample_Bedrock_elev_EStLGrimleyPhillips*', verbose=verbose)
        resources_dict['well_data'] = w4h.get_most_recent(dir=sample_data_dir, glob_pattern='sample_well_data*', verbose=verbose)

    # Get data objects if specified: replace each filepath entry with the loaded object
    dataObjList = ['data', 'objects', 'do', 'data objects', 'dataobjects']
    if resource_type.lower() in dataObjList:
        resources_dict['LithologyDict_Exact'] = pd.read_csv(resources_dict['LithologyDict_Exact'], dtype={"ID":int, "DESCRIPTION":str, "LITHOLOGY":str, "COLOR":str, "CONSISTENCY":str, "MOD1":str, "MOD2":str, "INTERPRETED":str, "COMPLETED":str, "ORIGIN_INDIANA":str}, index_col='ID')
        resources_dict['LithologyDict_Start'] = pd.read_csv(resources_dict['LithologyDict_Start'])
        resources_dict['LithologyDict_Wildcard'] = pd.read_csv(resources_dict['LithologyDict_Wildcard'])
        resources_dict['LithInterps_FineCoarse'] = pd.read_csv(resources_dict['LithInterps_FineCoarse'])
        resources_dict['LithInterps_Clay'] = pd.read_csv(resources_dict['LithInterps_Clay'])
        resources_dict['LithInterps_Silt'] = pd.read_csv(resources_dict['LithInterps_Silt'])
        resources_dict['LithInterps_Sand'] = pd.read_csv(resources_dict['LithInterps_Sand'])
        resources_dict['LithInterps_Gravel'] = pd.read_csv(resources_dict['LithInterps_Gravel'])

        # dtype/CRS definitions are stored as JSON; load them into dicts
        with open(resources_dict['well_data_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['well_data_dtypes'] = json.load(f)
        with open(resources_dict['metadata_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['metadata_dtypes'] = json.load(f)
        with open(resources_dict['ISWS_CRS'], 'r', encoding='utf-8') as f:
            resources_dict['ISWS_CRS'] = json.load(f)
        with open(resources_dict['xyz_dtypes'], 'r', encoding='utf-8') as f:
            resources_dict['xyz_dtypes'] = json.load(f)

        if scope.lower() in statewideList:
            # Statewide well data ships as a zip of CSVs; read each member
            sacrs = resources_dict['ISWS_CRS']
            with zipfile.ZipFile(resources_dict['well_data'].as_posix(), 'r') as archive:
                for file_name in archive.namelist():
                    with archive.open(file_name) as file:
                        if 'HEADER' in file_name:
                            # NOTE(review): metaDF is read but never used in this
                            # function — confirm whether it should be returned/merged
                            metaDF = pd.read_csv(file)
                        else:
                            resources_dict['well_data'] = pd.read_csv(file)
            # Build point geometries from lon/lat columns of the loaded well data
            geometry = [Point(xy) for xy in zip(resources_dict['well_data']['LONGITUDE'], resources_dict['well_data']['LATITUDE'])]
            resources_dict['well_data'] = gpd.GeoDataFrame(resources_dict['well_data'], geometry=geometry, crs='EPSG:4269')
        else:
            # Local sample well data stores geometry as WKT text in a 'geometry' column
            sacrs = 'EPSG:4269'
            df = pd.read_csv(resources_dict['well_data'])
            df['geometry'] = df['geometry'].apply(wkt.loads)
            resources_dict['well_data'] = gpd.GeoDataFrame(df, geometry='geometry')

        resources_dict['study_area'] = gpd.read_file(resources_dict['study_area'], geometry='geometry', crs=sacrs)
        # Open the raster resources lazily via rioxarray
        resources_dict['model_grid'] = rxr.open_rasterio(resources_dict['model_grid'])
        resources_dict['surf_elev'] = rxr.open_rasterio(resources_dict['surf_elev'])
        #resources_dict['surf_elev'] = resources_dict['surf_elev'].sel(band=1)
        resources_dict['bedrock_elev'] = rxr.open_rasterio(resources_dict['bedrock_elev'])
        #resources_dict['bedrock_elev'] = resources_dict['bedrock_elev'].sel(band=1)

    return resources_dict
Read in dictionary files for downhole data
Parameters
spec_path:strorpathlib.Path, optional- Directory where the file containing the specific search terms is located, by default str(repoDir)+'/resources/'
spec_glob_pattern:str, optional- Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Specific'
start_path:strorNone, optional- Directory where the file containing the start search terms is located, by default None
start_glob_pattern:str, optional- Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Start'
wildcard_path:strorpathlib.Path, default= None- Directory where the file containing the wildcard search terms is located, by default None
wildcard_glob_pattern:str, default= '*SearchTerms-Wildcard'- Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default 'SearchTerms-Wildcard'
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
(specTermsPath, startTermsPath, wilcardTermsPath) : tuple Tuple containing the pandas dataframes with specific search terms, with start search terms, and with wildcard search terms
Expand source code
def get_search_terms(spec_path=str(repoDir)+'/resources/', spec_glob_pattern='*SearchTerms-Specific*', start_path=None, start_glob_pattern = '*SearchTerms-Start*', wildcard_path=None, wildcard_glob_pattern='*SearchTerms-Wildcard', verbose=False, log=False):
    """Read in dictionary files for downhole data

    Parameters
    ----------
    spec_path : str or pathlib.Path, optional
        Directory where the file containing the specific search terms is located, by default str(repoDir)+'/resources/'
    spec_glob_pattern : str, optional
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Specific*'
    start_path : str or None, optional
        Directory where the file containing the start search terms is located, by default None
    start_glob_pattern : str, optional
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Start*'
    wildcard_path : str or pathlib.Path, default = None
        Directory where the file containing the wildcard search terms is located, by default None
    wildcard_glob_pattern : str, default = '*SearchTerms-Wildcard'
        Search string used by pathlib.glob() to find the most recent file of interest, uses get_most_recent() function, by default '*SearchTerms-Wildcard*'
    verbose : bool, default = False
        Whether to print status information to the terminal
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    (specTermsPath, startTermsPath, wildcardTermsPath) : tuple
        Tuple containing the paths to the files with specific search terms, with start search terms, and with wildcard search terms
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_search_terms, locals())

    def _resolve_terms_path(terms_path, glob_pattern):
        # Resolve one search-terms location: None passes through unchanged, a
        # directory is searched for the most recent file matching glob_pattern,
        # and an explicit file path is used as-is.
        # (This logic was previously copy-pasted three times; the internal
        # 'wilcard' typo has also been fixed.)
        if terms_path is None:
            return None
        terms_path = pathlib.Path(terms_path)
        if terms_path.is_dir():
            return get_most_recent(terms_path, glob_pattern)
        return terms_path

    specTermsPath = _resolve_terms_path(spec_path, spec_glob_pattern)
    startTermsPath = _resolve_terms_path(start_path, start_glob_pattern)
    wildcardTermsPath = _resolve_terms_path(wildcard_path, wildcard_glob_pattern)

    return specTermsPath, startTermsPath, wildcardTermsPath
Gets unique wells as a dataframe based on a given column name.
Parameters
df:pandas.DataFrame- Dataframe containing all wells and/or well intervals of interest
wellid_col:str, default="API_NUMBER"- Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
log:bool, default= False- Whether to log results to log file
Returns
wellsDF- DataFrame containing only the unique well IDs
Expand source code
def get_unique_wells(df, wellid_col='API_NUMBER', verbose=False, log=False):
    """Gets unique wells as a dataframe based on a given column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all wells and/or well intervals of interest
    wellid_col : str, default="API_NUMBER"
        Name of column in df containing a unique identifier for each well, by default 'API_NUMBER'. .unique() will be run on this column to get the unique values.
    verbose : bool, default = False
        Whether to print the count of unique wells to the terminal
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    wellsDF
        DataFrame containing only the unique well IDs
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(get_unique_wells, locals(), exclude_params=['df'])

    # Collect the distinct well identifiers and wrap them in a one-column frame
    uniqueIDs = df[wellid_col].unique()
    wellsDF = pd.DataFrame(uniqueIDs)
    if verbose:
        print('Number of unique wells: '+str(wellsDF.shape[0]))
    wellsDF.columns = ['UNIQUE_ID']
    return wellsDF
Clips grid to study area.
Parameters
study_area:geopandas.GeoDataFrame- inputs study area polygon
grid:xarray.DataArray- inputs grid array
output_crs:str, default='EPSG:4269'- inputs the coordinate reference system for the study area
log:bool, default= False- Whether to log results to log file, by default False
Returns
grid:xarray.DataArray- returns xarray containing grid clipped only to area within study area
Expand source code
def grid2study_area(study_area, grid, output_crs='EPSG:4269',verbose=False, log=False):
    """Clips grid to study area.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        inputs study area polygon
    grid : xarray.DataArray
        inputs grid array
    output_crs : str, default='EPSG:4269'
        inputs the coordinate reference system for the study area
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    grid : xarray.DataArray
        returns xarray containing grid clipped only to area within study area
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(grid2study_area, locals(), exclude_params=['study_area', 'grid'])
    # Empty string means "use whatever CRS the study area already has"
    if output_crs=='':
        output_crs=study_area.crs

    #Get input CRS's
    grid_crs = pyproj.CRS.from_wkt(grid.rio.crs.to_wkt())
    study_area_crs = study_area.crs

    # Reproject if needed, so the grid, then the study area, end up in output_crs
    if output_crs != grid_crs:
        grid = grid.rio.reproject(output_crs)
        grid_crs = pyproj.CRS.from_wkt(grid.rio.crs.to_wkt())
    if grid_crs != study_area_crs:
        study_area = study_area.to_crs(grid_crs)

    # We'll just clip to outer bounds of study area
    # total_bounds order is (minx, miny, maxx, maxy)
    saExtent = study_area.total_bounds
    # Slice bounds must follow the coordinate's storage order, which may be
    # ascending or descending along each axis (rasters commonly store y descending)
    if grid['y'][-1].values - grid['y'][0].values > 0:
        miny=saExtent[1]
        maxy=saExtent[3]
    else:
        miny=saExtent[3]
        maxy=saExtent[1]
    if grid['x'][-1].values - grid['x'][0].values > 0:
        minx=saExtent[0]
        maxx=saExtent[2]
    else:
        minx=saExtent[2]
        maxx=saExtent[0]

    # "Clip" it by label-slicing; this selects only cells within the bounding box
    grid = grid.sel(x=slice(minx, maxx), y=slice(miny, maxy))
    # Drop the band dimension for single-band rasters
    if 'band' in grid.dims:
        grid = grid.sel(band=1)
    return grid
Function to interpolate results, going from points to grid data. Uses scipy.interpolate module.
Parameters
points:list- List containing pandas dataframes or geopandas geodataframes containing the point data. Should be resDF_list output from layer_target_thick().
grid:xr.DataArrayorxr.Dataset- Xarray DataArray or DataSet with the coordinates/spatial reference of the output grid to interpolate to
layers:int, default=None- Number of layers for interpolation. If None, uses the length of the points list to determine number of layers. By default None.
interp_kind:str, {'nearest', 'interp2d','linear', 'cloughtocher', 'radial basis function'}- Type of interpolation to use. See scipy.interpolate N-D scattered. Values can be any of the following (also shown in "kind" column of N-D scattered section of table here: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html). By default 'nearest'
return_type:str, {'dataarray', 'dataset'}- Type of xarray object to return, either xr.DataArray or xr.Dataset, by default 'dataarray.'
export_dir:strorpathlib.Path, default=None- Export directory for interpolated grids, using w4h.export_grids(). If None, does not export, by default None.
target_col:str, default= 'TARG_THICK_PER'- Name of column in points containing data to be interpolated, by default 'TARG_THICK_PER'.
layer_col:str, default= 'Layer'- Name of column containing layer number. Not currently used, by default 'LAYER'
xcol:str, default= 'None'- Name of column containing x coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
ycol:str, default= 'None'- Name of column containing y coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
xcoord:str, default='x'- Name of x coordinate in grid, used to extract x values of grid, by default 'x'
ycoord:str, default='y'- Name of y coordinate in grid, used to extract x values of grid, by default 'y'
log:bool, default= False- Whether to log inputs and outputs to log file.
**kwargs- Keyword arguments to be read directly into whichever scipy.interpolate function is designated by the interp_kind parameter.
Returns
interp_data:xr.DataArrayorxr.Dataset, depending on return_type- By default, returns an xr.DataArray object with the layers added as a new dimension called Layer. Can also specify return_type='dataset' to return an xr.Dataset with each layer as a separate variable.
Expand source code
def layer_interp(points, grid, layers=None, interp_kind='nearest', return_type='dataarray', export_dir=None, target_col='TARG_THICK_PER', layer_col='LAYER', xcol=None, ycol=None, xcoord='x', ycoord='y', log=False, verbose=False, **kwargs):
    """Function to interpolate results, going from points to grid data. Uses scipy.interpolate module.

    Parameters
    ----------
    points : list
        List containing pandas dataframes or geopandas geodataframes containing the point data. Should be resDF_list output from layer_target_thick().
    grid : xr.DataArray or xr.Dataset
        Xarray DataArray or Dataset with the coordinates/spatial reference of the output grid to interpolate to.
    layers : int, default=None
        Number of layers for interpolation. If None, uses the length of the points list to determine number of layers. By default None.
    interp_kind : str, {'nearest', 'interp2d', 'linear', 'cloughtocher', 'radial basis function'}
        Type of interpolation to use. See the scipy.interpolate documentation on N-D scattered interpolation. By default 'nearest'.
    return_type : str, {'dataarray', 'dataset'}
        Type of xarray object to return, either xr.DataArray or xr.Dataset, by default 'dataarray'.
    export_dir : str or pathlib.Path, default=None
        Export directory for interpolated grids, using w4h.export_grids(). If None, does not export.
    target_col : str, default='TARG_THICK_PER'
        Name of column in points containing data to be interpolated.
    layer_col : str, default='LAYER'
        Name of column containing layer number. Not currently used.
    xcol : str, default=None
        Name of column containing x coordinates. If None, will look for a 'geometry' column, as in a geopandas.GeoDataFrame.
    ycol : str, default=None
        Name of column containing y coordinates. If None, will look for a 'geometry' column, as in a geopandas.GeoDataFrame.
    xcoord : str, default='x'
        Name of the x coordinate in grid, used to extract x values of grid.
    ycoord : str, default='y'
        Name of the y coordinate in grid, used to extract y values of grid.
    log : bool, default=False
        Whether to log inputs and outputs to log file.
    verbose : bool, default=False
        Whether to print verbose progress information.
    **kwargs
        Keyword arguments to be read directly into whichever scipy.interpolate function is designated by the interp_kind parameter.

    Returns
    -------
    interp_data : xr.DataArray or xr.Dataset, depending on return_type
        By default, returns an xr.DataArray object with the layers added as a new dimension called Layer. Can also specify return_type='dataset' to return an xr.Dataset with each layer as a separate variable.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(layer_interp, locals(), exclude_params=['points', 'grid'])

    # Accepted aliases for each interpolation kind
    nnList = ['nearest', 'nearest neighbor', 'nearestneighbor', 'neighbor', 'nn', 'n']
    splineList = ['interp2d', 'interp2', 'interp', 'spline', 'spl', 'sp', 's']
    linList = ['linear', 'lin', 'l']
    ctList = ['clough tocher', 'clough', 'cloughtocher', 'ct', 'c']
    rbfList = ['rbf', 'radial basis', 'radial basis function', 'r', 'radial']
    # k-nearest neighbors from scikit-learn?
    # kriging? (from pykrige or maybe also from scikit-learn)

    X = np.round(grid[xcoord].values, 5)  # Extract xcoords from grid
    Y = np.round(grid[ycoord].values, 5)  # Extract ycoords from grid

    if layers is None and (type(points) is list or type(points) is dict):
        layers = len(points)
    # NOTE(review): if points is a single (Geo)DataFrame, len(points) is its row
    # count, so this warning can fire spuriously -- confirm intended behavior.
    if len(points) != layers:
        print('You have specified a different number of layers than what is iterable in the points argument. This may not work properly.')

    if verbose:
        print('Interpolating target lithology at each layer:')

    daDict = {}
    for lyr in range(1, layers+1):
        # Select this layer's point data (or reuse the single frame for all layers)
        if type(points) is list or type(points) is dict:
            pts = points[lyr-1]
        else:
            pts = points

        # Resolve x coordinates: explicit column, or GeoDataFrame geometry
        if xcol is None:
            if 'geometry' in pts.columns:
                dataX = pts['geometry'].x
            else:
                print('xcol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)')
                return
        else:
            dataX = pts[xcol]

        # Resolve y coordinates the same way
        if ycol is None:
            if 'geometry' in pts.columns:
                dataY = pts['geometry'].y
            else:
                print('ycol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)')
                return
        else:
            dataY = pts[ycol]

        #layer = pts[layer_col]
        interpVal = pts[target_col]
        #return points

        # Drop NaNs consistently across x, y, and value series so the three stay aligned
        dataX.dropna(inplace=True)
        dataY = dataY.loc[dataX.index]
        dataY.dropna(inplace=True)
        interpVal = interpVal.loc[dataY.index]
        interpVal.dropna(inplace=True)
        dataX = dataX.loc[interpVal.index]
        dataY = dataY.loc[interpVal.index]

        dataX = dataX.reset_index(drop=True)
        dataY = dataY.reset_index(drop=True)
        interpVal = interpVal.reset_index(drop=True)

        # NOTE(review): several branches below reassign X and Y via np.meshgrid
        # inside this per-layer loop, so iterations after the first pass 2-D
        # arrays back into meshgrid -- verify for multi-layer runs.
        if interp_kind.lower() in nnList:
            interpType = 'Nearest Neighbor'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.NearestNDInterpolator(dataPoints, interpVal, **kwargs)
            Z = interp(X, Y)
        elif interp_kind.lower() in linList:
            interpType = 'Linear'
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.LinearNDInterpolator(dataPoints, interpVal, **kwargs)
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            Z = interp(X, Y)
        elif interp_kind.lower() in ctList:
            interpType = 'Clough-Toucher'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            # NOTE(review): tol=1e10 is an extremely loose tolerance; possibly
            # 1e-10 was intended -- confirm against CloughTocher2DInterpolator docs.
            if 'tol' not in kwargs:
                kwargs['tol'] = 1e10
            interp = interpolate.CloughTocher2DInterpolator(list(zip(dataX, dataY)), interpVal, **kwargs)
            Z = interp(X, Y)
        elif interp_kind.lower() in rbfList:
            interpType = 'Radial Basis'
            dataXY = np.column_stack((dataX, dataY))
            interp = interpolate.RBFInterpolator(dataXY, interpVal, **kwargs)
            print("Radial Basis Function does not work well with many well-based datasets. Consider instead specifying 'nearest', 'linear', 'spline', or 'clough tocher' for interpolation interp_kind.")
            Z = interp(np.column_stack((X.ravel(), Y.ravel()))).reshape(X.shape)
        elif interp_kind.lower() in splineList:
            interpType = 'Spline Interpolation'
            # NOTE(review): bisplrep returns a (tck) spline representation, not
            # values evaluated on the grid; Z here is not a gridded array (see
            # the commented-out interp2d evaluation below) -- confirm intent.
            Z = interpolate.bisplrep(dataX, dataY, interpVal, **kwargs)
            #interp = interpolate.interp2d(dataX, dataY, interpVal, kind=lin_kind, **kwargs)
            #Z = interp(X, Y)
        else:
            # Fall back to nearest neighbor for unrecognized interp_kind values
            if verbose:
                print('Specified interpolation interp_kind not recognized, using nearest neighbor.')
            interpType = 'Nearest Neighbor'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            interp = interpolate.NearestNDInterpolator(list(zip(dataX, dataY)), interpVal, **kwargs)
            Z = interp(X, Y)

        #global ZTest
        #ZTest = Z

        interp_grid = xr.DataArray(  # Create new dataarray with new data values, but everything else the same
            data=Z,
            dims=grid.dims,
            coords=grid.coords)

        if 'band' in interp_grid.coords:
            interp_grid = interp_grid.drop_vars('band')

        # Clamp interpolated fractions to [0, 1] and tag with the layer number
        interp_grid = interp_grid.clip(min=0, max=1, keep_attrs=True)
        interp_grid = interp_grid.expand_dims(dim='Layer')
        interp_grid = interp_grid.assign_coords(Layer=[lyr])

        # Free per-layer intermediates before the next iteration
        del Z
        del dataX
        del dataY
        del interpVal
        del interp

        #interp_grid=interp_grid.interpolate_na(dim=x)
        zFillDigs = len(str(layers))
        daDict['Layer'+str(lyr).zfill(zFillDigs)] = interp_grid
        del interp_grid
        if verbose:
            print('\tCompleted {} interpolation for Layer {}'.format(str(interpType).lower(), str(lyr).zfill(zFillDigs)))

    dataAList = ['dataarray', 'da', 'a', 'array']
    dataSList = ['dataset', 'ds', 'set']
    if return_type.lower() in dataAList:
        interp_data = xr.concat(daDict.values(), dim='Layer')
        # NOTE(review): np.arange(1, 10) hard-codes 9 layers; this likely should
        # be np.arange(1, layers + 1) -- confirm before relying on layers != 9.
        interp_data = interp_data.assign_coords(Layer=np.arange(1,10))
    elif return_type.lower() in dataSList:
        interp_data = xr.Dataset(daDict)
        if verbose:
            print('Done with interpolation, getting global attrs')
        # Get common attributes from all layers to use as "global" attributes
        common_attrs = {}
        for i, (var_name, data_array) in enumerate(interp_data.data_vars.items()):
            if i == 0:
                common_attrs = data_array.attrs
            else:
                # Keep only attributes shared (with equal values) by every layer
                common_attrs = {k: v for k, v in common_attrs.items() if k in data_array.attrs and data_array.attrs[k] == v}
        interp_data.attrs.update(common_attrs)
    else:
        print(f"{return_type} is not a valid input for return_type. Please set return_type to either 'dataarray' or 'dataset'")
        return

    if verbose:
        # Quick-look plots of the input points for each layer
        for i, layer_data in enumerate(interp_data):
            pts = points[i]
            pts.plot(c=pts[target_col])

    if export_dir is None:
        pass
    else:
        w4h.export_grids(grid_data=interp_data, out_path=export_dir, file_id='', filetype='tif', variable_sep=True, date_stamp=True)
        print('Exported to {}'.format(export_dir))

    return interp_data
Function to calculate thickness of target material in each layer at each well point
Parameters
df:geopandas.geodataframe- Geodataframe containing classified data, surface elevation, bedrock elevation, layer depths, geometry.
layers:int, default=9- Number of layers in model, by default 9
return_all:bool, default=False- If True, return list of original geodataframes with extra column added for target thick for each layer. If False, return list of geopandas.geodataframes with only essential information for each layer.
export_dir:str or pathlib.Path, default=None- If str or pathlib.Path, should be directory to which to export dataframes built in function.
outfile_prefix:str, default=None- Only used if export_dir is set. Will be used at the start of the exported filenames
depth_top_col:str, default='TOP'- Name of column containing data for depth to top of described well intervals
depth_bot_col:str, default='BOTTOM'- Name of column containing data for depth to bottom of described well intervals
log:bool, default=False- Whether to log inputs and outputs to log file.
Returns
res_df or res : geopandas.geodataframe- Geopandas geodataframe containing only important information needed for next stage of analysis.
Expand source code
def layer_target_thick(df, layers=9, return_all=False, export_dir=None, outfile_prefix=None, depth_top_col='TOP', depth_bot_col='BOTTOM', log=False):
    """Function to calculate thickness of target material in each layer at each well point.

    Parameters
    ----------
    df : geopandas.GeoDataFrame
        Geodataframe containing classified data, surface elevation, bedrock elevation, layer depths, geometry.
    layers : int, default=9
        Number of layers in model, by default 9.
    return_all : bool, default=False
        If True, return list of original geodataframes with extra column added for target thickness for each layer. If False, return list of geopandas.GeoDataFrames with only essential information for each layer.
    export_dir : str or pathlib.Path, default=None
        If str or pathlib.Path, should be directory to which to export dataframes built in function.
    outfile_prefix : str, default=None
        Only used if export_dir is set. Will be used at the start of the exported filenames.
    depth_top_col : str, default='TOP'
        Name of column containing data for depth to top of described well intervals.
    depth_bot_col : str, default='BOTTOM'
        Name of column containing data for depth to bottom of described well intervals.
    log : bool, default=False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    res_df or res : geopandas.GeoDataFrame
        Geopandas geodataframe containing only important information needed for next stage of analysis.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # NOTE(review): these assignments mutate the caller's dataframe in place.
    df['TOP_ELEV'] = df['SURFACE_ELEV'] - df[depth_top_col]  # Elevation of interval top
    df['BOT_ELEV'] = df['SURFACE_ELEV'] - df[depth_bot_col]  # Elevation of interval bottom

    layerList = range(1, layers+1)
    res_list = []
    resdf_list = []

    # Generate Column names based on (looped) integers
    for layer in layerList:
        zStr = 'ELEV'
        zColT = 'TOP_ELEV'
        zColB = 'BOT_ELEV'
        topCol = zStr+'_LAYER'+str(layer)
        # NOTE(review): the bottom layer is hard-coded as 9 here; for layers != 9
        # this likely should compare against `layers` instead -- confirm.
        if layer != 9:  # For all layers except the bottom layer....
            botCol = zStr+'_LAYER'+str(layer+1)  # use the layer below it to
        else:  # Otherwise, ...
            botCol = "BEDROCK_"+zStr  # Use the (corrected) bedrock depth

        # Divide records into 4 categories for ease of calculation, to be joined back together later
        #   Category 1: Well interval starts above layer top, ends within model layer
        #   Category 2: Well interval is entirely contained within model layer
        #   Category 3: Well interval starts within model layer, continues through bottom of model layer
        #   Category 4: Well interval begins and ends on either side of model layer (model layer is contained within well layer)

        # records1 = intervals that go through the top of the layer and bottom is within layer
        records1 = df.loc[(df[zColT] >= df[topCol]) &  # Top of the well interval is above or equal to the top of the layer
                          (df[zColB] <= df[topCol]) &  # Bottom is below the top of the layer
                          (df[zColB] <= df[zColT])].copy()  # Bottom is deeper than top (should already be the case)
        records1['TARG_THICK'] = pd.DataFrame(np.round((records1.loc[:, topCol]-records1.loc[:, zColB]) * records1['TARGET'], 3)).copy()  # Multiply "target" (1 or 0) by length within layer

        # records2 = entire interval is within layer
        records2 = df.loc[(df[zColT] <= df[topCol]) &  # Top of the well is lower than top of the layer
                          (df[zColB] >= df[botCol]) &  # Bottom of the well is above bottom of the layer
                          (df[zColB] <= df[zColT])].copy()  # Bottom of the well is deeper than or equal to top (should already be the case)
        records2['TARG_THICK'] = pd.DataFrame(np.round((records2.loc[:, zColT] - records2.loc[:, zColB]) * records2['TARGET'], 3)).copy()

        # records3 = intervals with top within layer and bottom of interval going through bottom of layer
        records3 = df.loc[(df[zColT] > df[botCol]) &  # Top of the well is above bottom of layer
                          (df[zColB] < df[botCol]) &  # Bottom of the well is below bottom of layer
                          (df[zColT] <= df[topCol]) &  # Top of well is below top of layer
                          (df[zColB] <= df[zColT])].copy()  # Bottom is deeper than top (should already be the case)
        records3['TARG_THICK'] = pd.DataFrame(np.round((records3.loc[:, zColT] - (records3.loc[:, botCol]))*records3['TARGET'], 3)).copy()

        # records4 = interval goes through entire layer
        records4 = df.loc[(df[zColT] >= df[topCol]) &  # Top of well is above top of layer
                          (df[zColB] < df[botCol]) &  # Bottom of well is below bottom of layer
                          (df[zColB] <= df[zColT])].copy()  # Bottom of well is below top of well
        records4['TARG_THICK'] = pd.DataFrame(np.round((records4.loc[:, topCol]-records4.loc[:, botCol]) * records4['TARGET'], 3)).copy()

        # Put the four calculated record categories back together into single dataframe
        res = pd.concat([records1, records2, records3, records4])

        # The sign may be reversed if using depth rather than elevation
        if (res['TARG_THICK'] < 0).all():
            res['TARG_THICK'] = res['TARG_THICK'] * -1

        # Cannot have negative thicknesses
        res['TARG_THICK'].clip(lower=0, inplace=True)
        res['LAYER_THICK'].clip(lower=0, inplace=True)

        # Get geometries for each unique API/well
        res.reset_index(drop=True, inplace=True)
        res_df = res.groupby(by=['API_NUMBER', 'LATITUDE', 'LONGITUDE'], as_index=False).sum(numeric_only=True)  # Calculate thickness for each well interval in the layer indicated (e.g., if there are two well intervals from same well in one model layer)
        uniqInd = pd.DataFrame([v.values[0] for k, v in res.groupby(by=['API_NUMBER', 'LATITUDE', 'LONGITUDE']).groups.items()]).loc[:, 0]
        geomCol = res.loc[uniqInd, 'geometry']
        geomCol = pd.DataFrame(geomCol[~geomCol.index.duplicated(keep='first')]).reset_index()

        res_df['TARG_THICK_PER'] = pd.DataFrame(np.round(res_df['TARG_THICK']/res_df['LAYER_THICK'], 3))  # Calculate thickness as percent of total layer thickness
        # Replace np.inf and np.nans with 0
        res_df['TARG_THICK_PER'] = res_df['TARG_THICK_PER'].where(res_df['TARG_THICK_PER'] != np.inf, other=0)
        # NOTE(review): `x != np.nan` is always True (NaN never compares equal),
        # so this where() is a no-op and NaNs are not actually replaced here.
        res_df['TARG_THICK_PER'] = res_df['TARG_THICK_PER'].where(res_df['TARG_THICK_PER'] != np.nan, other=0)

        res_df["LAYER"] = layer  # Just to have as part of the output file, include the present layer in the file itself as a separate column
        res_df = res_df[['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'LATITUDE_PROJ', 'LONGITUDE_PROJ', 'TOP', 'BOTTOM', 'TOP_ELEV', 'BOT_ELEV', 'SURFACE_ELEV', topCol, botCol, 'LAYER_THICK', 'TARG_THICK', 'TARG_THICK_PER', 'LAYER']].copy()  # Format dataframe for output
        res_df = gpd.GeoDataFrame(res_df, geometry=geomCol.loc[:, 'geometry'])

        resdf_list.append(res_df)
        res_list.append(res)

        # Optional per-layer CSV export
        if isinstance(export_dir, (pathlib.PurePath, str)):
            export_dir = pathlib.Path(export_dir)
            if export_dir.is_dir():
                pass
            else:
                try:
                    os.mkdir(export_dir)
                except:
                    # NOTE(review): bare except silently swallows every error kind here.
                    print('Specified export directory does not exist and cannot be created. Function will continue run, but data will not be exported.')

            # Format and build export filepath
            export_dir = str(export_dir).replace('\\', '/').replace('\\'[0], '/')
            zFillDigs = len(str(len(layerList)))
            if str(outfile_prefix).endswith('_'):
                outfile_prefix = outfile_prefix[:-1]
            elif outfile_prefix is None:
                outfile_prefix = ''
            if export_dir[-1] == '/':
                export_dir = export_dir[:-1]
            nowStr = str(datetime.datetime.today().date())+'_'+str(datetime.datetime.today().hour)+'-'+str(datetime.datetime.today().minute)+'-'+str(datetime.datetime.today().second)
            outPath = export_dir+'/'+outfile_prefix+'_Lyr'+str(layer).zfill(zFillDigs)+'_'+nowStr+'.csv'
            if return_all:
                # NOTE(review): res and res_df are written to the same outPath,
                # so the first file is immediately overwritten -- confirm intent.
                res.to_csv(outPath, sep=',', na_rep=np.nan, index_label='ID')
                res_df.to_csv(outPath, sep=',', na_rep=np.nan, index_label='ID')
            else:
                res_df.to_csv(outPath, sep=',', na_rep=np.nan, index_label='ID')

    if return_all:
        return res_list, resdf_list
    else:
        return resdf_list
Function to log other functions, to be called from within other functions
Parameters
logtocommence:bool- Whether to perform logging steps
parameters:dict- Dictionary containing parameters and their values, from function
func_name:str- Name of function within which this is called
Expand source code
def logger_function(logtocommence, parameters, func_name):
    """Function to log other functions, to be called from within other functions.

    Pops the 'log' entry from `parameters` (if present) to decide whether the
    calling function should be logged, then writes the call and its parameters
    to the module-level log file via the logging module.

    Parameters
    ----------
    logtocommence : bool
        Whether to perform logging steps.
    parameters : dict
        Dictionary containing parameters and their values, from the calling
        function (typically locals()). The 'log', 'log_dir', and 'verbose'
        keys, when present, control the behavior here; 'log' and 'log_dir'
        are removed from the dict as a side effect.
    func_name : str
        Name of function within which this is called.

    Returns
    -------
    None
    """
    if logtocommence:
        global log_filename
        # log parameter should be false by default on all. If true, will show up in kwargs.
        # pop() with a default covers both the present and absent cases.
        log_file = parameters.pop('log', None)

        # Get current time and set up format for log messages
        curr_time = datetime.datetime.now()
        FORMAT = '%(asctime)s %(message)s'

        # Check if we are starting a new logfile (only during run of file_setup()
        # or (currently non-existent) new_logfile() functions)
        if log_file == True and (func_name == 'file_setup' or func_name == 'new_logfile'):
            # Get the log_dir variable set as a file_setup() parameter, or default to None if not specified
            out_dir = parameters.pop('log_dir', None)
            if out_dir is None:
                # If output directory not specified, default to the input directory
                out_dir = parameters['well_data']

            # Timestamp for the filename (won't change; represents the start of logging)
            timestamp = curr_time.strftime('%Y-%m-%d_%H-%M-%S')
            log_filename = pathlib.Path(out_dir).joinpath(f"log_{timestamp}.txt")

            # BUGFIX: check the *value* of verbose; the original printed whenever
            # the 'verbose' key was merely present, even with verbose=False.
            if parameters.get('verbose', False):
                print('Logging data to', log_filename)

            # Set up logging stream using logging module
            logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT, filemode='w')
            logging.info(f"{func_name} CALLED WITH PARAMETERS:\n\t {parameters}")
        elif log_file == True:
            # Run this for functions that aren't setting up the logging file.
            # BUGFIX: guard against NameError -- the original read the global
            # log_filename without checking it had ever been assigned.
            if 'log_filename' in globals() and log_filename:
                # Get the log stream and log this function's call with parameters
                logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT)
                logging.info(f"{func_name} CALLED WITH PARAMETERS: \n\t{parameters}")
            else:
                # If log file has not already been set up, set it up
                timestamp = curr_time.strftime('%Y-%m-%d_%H-%M-%S')
                log_filename = f"log_{timestamp}.txt"
                # Now, get the log stream and log this function's call with parameters
                logging.basicConfig(filename=log_filename, level=logging.INFO, format=FORMAT)
                logging.info(f"{func_name} CALLED WITH PARAMETERS: \n\t{parameters}")
        else:
            # Don't log if log=False
            pass
    return
Function to merge lithologies and target booleans based on classifications
Parameters
well_data_df:pandas.DataFrame- Dataframe containing classified well data
targinterps_df:pandas.DataFrame- Dataframe containing lithologies and their target interpretations, depending on what the target is for this analysis (often, coarse materials=1, fine=0)
target_col:str, default= 'TARGET'- Name of column in targinterps_df containing the target interpretations
target_class, default = 'bool' Whether the input column is using boolean values as its target indicator
Returns
df_targ:pandas.DataFrame- Dataframe containing merged lithologies/targets
Expand source code
def merge_lithologies(well_data_df, targinterps_df, interp_col='INTERPRETATION', target_col='TARGET', target_class='bool'):
    """Function to merge lithologies and target booleans based on classifications.

    Parameters
    ----------
    well_data_df : pandas.DataFrame
        Dataframe containing classified well data (must have a 'LITHOLOGY' column).
    targinterps_df : pandas.DataFrame
        Dataframe containing lithologies and their target interpretations,
        depending on what the target is for this analysis (often, coarse
        materials=1, fine=0).
    interp_col : str, default='INTERPRETATION'
        Name of column in targinterps_df containing the lithology interpretations
        to join against well_data_df['LITHOLOGY'].
    target_col : str, default='TARGET'
        Name of column in targinterps_df containing the target interpretations.
    target_class : str, default='bool'
        Whether the input column is using boolean values as its target indicator.
        If 'bool', values equal to the string '1' become 1 and everything else 0;
        otherwise 'DoNotUse' becomes -1, missing values become -2, and the column
        is cast to int8.

    Returns
    -------
    df_targ : pandas.DataFrame
        Dataframe containing merged lithologies/targets.
    """
    # Work on a copy so the caller's lookup table is not mutated as a side effect
    # (original modified targinterps_df in place).
    targinterps_df = targinterps_df.copy()

    # by default, use the boolean input
    if target_class == 'bool':
        # Anything that is not the string '1' (including NaN) becomes '0' before the int cast
        targinterps_df[target_col] = targinterps_df[target_col].where(targinterps_df[target_col] == '1', other='0').astype(int)
        targinterps_df[target_col] = targinterps_df[target_col].fillna(value=0)
    else:
        targinterps_df[target_col] = targinterps_df[target_col].replace('DoNotUse', value=-1)
        targinterps_df[target_col] = targinterps_df[target_col].fillna(value=-2)
        # BUGFIX: the original called .astype(np.int8) and discarded the result;
        # assign it back so the cast actually takes effect.
        targinterps_df[target_col] = targinterps_df[target_col].astype(np.int8)

    # Left-join target interpretations onto the well descriptions by lithology
    df_targ = pd.merge(well_data_df,
                       targinterps_df.set_index(interp_col),
                       right_on=interp_col,
                       left_on='LITHOLOGY',
                       how='left')

    return df_targ
Function to merge tables, intended for merging metadata table with data table
Parameters
data_df:pandas.DataFrame- "Left" dataframe, intended for this purpose to be dataframe with main data, but can be anything
header_df:pandas.DataFrame- "Right" dataframe, intended for this purpose to be dataframe with metadata, but can be anything
data_cols:list, optional- List of strings of column names, for columns to be included after join from "left" table (data table). If None, all columns are kept, by default None
header_cols:list, optional- List of strings of columns names, for columns to be included in merged table after merge from "right" table (metadata). If None, all columns are kept, by default None
auto_pick_cols:bool, default= False- Whether to autopick the columns from the metadata table. If True, the following column names are kept:['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK'], by default False
drop_duplicate_cols:bool, optional- If True, drops duplicate columns from the tables so that columns do not get renamed upon merge, by default True
log:bool, default= False- Whether to log inputs and outputs to log file.
**kwargs- kwargs that are passed directly to pd.merge(). By default, the 'on' and 'how' parameters are defined as on='API_NUMBER' and how='inner'
Returns
mergedTable:pandas.DataFrame- Merged dataframe
Expand source code
def merge_metadata(data_df, header_df, data_cols=None, header_cols=None, auto_pick_cols=False, drop_duplicate_cols=True, log=False, verbose=False, **kwargs):
    """Function to merge tables, intended for merging metadata table with data table.

    Parameters
    ----------
    data_df : pandas.DataFrame
        "Left" dataframe, intended for this purpose to be dataframe with main data, but can be anything.
    header_df : pandas.DataFrame
        "Right" dataframe, intended for this purpose to be dataframe with metadata, but can be anything. If None, no merge occurs and a (possibly column-filtered) data_df is returned.
    data_cols : list, optional
        List of strings of column names, for columns to be included after join from "left" table (data table). If None, all columns are kept, by default None.
    header_cols : list, optional
        List of strings of column names, for columns to be included in merged table after merge from "right" table (metadata). If None, all columns are kept, by default None.
        NOTE(review): when drop_duplicate_cols=True and a list is passed, that list is modified in place by pop() below.
    auto_pick_cols : bool, default=False
        Whether to autopick the columns from the metadata table. If True, the following column names are kept: ['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK'], plus any column starting with 'ELEV_'/'DEPTH', containing '_PROJ', and 'geometry'. By default False.
    drop_duplicate_cols : bool, optional
        If True, drops duplicate columns from the tables so that columns do not get renamed upon merge, by default True.
    log : bool, default=False
        Whether to log inputs and outputs to log file.
    verbose : bool, default=False
        Whether to print verbose progress information.
    **kwargs
        kwargs that are passed directly to pd.merge(). By default, the 'on' and 'how' parameters are defined as on='API_NUMBER' and how='inner'.

    Returns
    -------
    mergedTable : pandas.DataFrame
        Merged dataframe.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(merge_metadata, locals(), exclude_params=['data_df', 'header_df'])

    if header_df is None:
        # No metadata table: just (optionally) filter data_df's columns
        if data_cols is None:
            # If not specified, get all the cols
            data_cols = data_df.columns
        else:
            # NOTE(review): assumes data_cols and header_cols are both lists
            # (list + list concatenation) -- a pandas Index here would fail.
            if header_cols is not None:
                data_cols = data_cols + header_cols
        data_df = data_df[data_cols]
        mergedTable = data_df
    else:
        if auto_pick_cols:
            # Start from the standard well-metadata columns, then add derived
            # elevation/depth and projected-coordinate columns plus geometry
            header_cols = ['API_NUMBER', 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK']
            for c in header_df.columns:
                if c.startswith('ELEV_') or c.startswith('DEPTH'):
                    header_cols.append(c)
                if '_PROJ' in c:
                    header_cols.append(c)
            header_cols.append('geometry')
        elif header_cols is None:
            header_cols = header_df.columns
        else:
            header_cols = header_cols

        # If not specified, get all the cols
        if data_cols is None:
            data_cols = data_df.columns

        # Defaults for on and how
        if 'on' not in kwargs.keys():
            kwargs['on'] = 'API_NUMBER'
        if 'how' not in kwargs.keys():
            kwargs['how'] = 'inner'

        # Drop duplicate columns (keep the join key) so merge does not rename them
        if drop_duplicate_cols:
            header_colCopy = header_cols.copy()
            remCount = 0  # offsets pop() index as items are removed
            for i, c in enumerate(header_colCopy):
                if c in data_cols and c != kwargs['on']:
                    if verbose:
                        print('Removing {} (duplicate columns) from data.'.format(header_cols[i-remCount]))
                    header_cols.pop(i - remCount)
                    remCount += 1

        leftTable_join = data_df[data_cols]
        rightTable_join = header_df[header_cols]
        mergedTable = pd.merge(left=leftTable_join, right=rightTable_join, **kwargs)
    return mergedTable
Function to read a text file with a dictionary in it into a python dictionary
Parameters
file:str or pathlib.Path object- Filepath to the file of interest containing the dictionary text
keytype:str, optional- String indicating the datatypes used in the text, currently only 'np' is implemented, by default 'np'
Returns
dict- Dictionary translated from text file.
Expand source code
def read_dict(file, keytype='np'):
    """Function to read a text file with a dictionary in it into a python dictionary.

    Parameters
    ----------
    file : str or pathlib.Path object
        Filepath to the file of interest containing the dictionary text.
    keytype : str, optional
        String indicating the datatypes used in the text, currently only 'np'
        is implemented, by default 'np'.

    Returns
    -------
    dict
        Dictionary translated from text file. Files with a '.json' suffix are
        returned as parsed; for other files, when keytype='np' each value is
        resolved to the numpy attribute of that name (e.g. 'float64' ->
        numpy.float64).
    """
    file = pathlib.Path(file)

    # Parse the file once; the original opened and parsed it through two
    # separate code paths (json.load vs. read()+json.loads) doing the same work.
    with open(file, 'r') as f:
        jsDict = json.load(f)

    # .json files are returned as-is, without numpy dtype conversion
    # (preserves original behavior).
    if file.suffix == '.json':
        return jsDict

    if keytype == 'np':
        # Values name numpy attributes (dtypes); resolve each to the object itself
        for k in jsDict.keys():
            jsDict[k] = getattr(np, jsDict[k])
    return jsDict
Function to read dictionary terms from file into pandas dataframe
Parameters
dict_file:str or pathlib.Path object, or list of these- File or list of files to be read
search_col:str, default= 'DESCRIPTION'- Name of column containing search terms (geologic formations)
definition_col:str, default= 'LITHOLOGY'- Name of column containing interpretations of search terms (lithologies)
dictionary_type:strorNone, {None, 'exact', 'start', 'wildcard',}- Indicator of which kind of dictionary terms to be read in: None, 'exact', 'start', or 'wildcard' by default None. - If None, uses name of file to try to determine. If it cannot, it will default to using the classification flag from class_flag - If 'exact', will be used to search for exact matches to geologic descriptions - If 'start', will be used as with the .startswith() string method to find inexact matches to geologic descriptions - If 'wildcard', will be used to find any matching substring for inexact geologic matches
class_flag:int, default= 6- Classification flag to be used if dictionary_type is None and cannot be otherwise determined, by default 6
rem_extra_cols:bool, default= True- Whether to remove the extra columns from the input file after it is read in as a pandas dataframe, by default True
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
dict_terms:pandas.DataFrame- Pandas dataframe with formatting ready to be used in the classification steps of this package
Expand source code
def read_dictionary_terms(dict_file=None, id_col='ID', search_col='DESCRIPTION', definition_col='LITHOLOGY', class_flag_col='CLASS_FLAG', dictionary_type=None, class_flag=6, rem_extra_cols=True, verbose=False, log=False):
    """Function to read dictionary terms from file into pandas dataframe

    Parameters
    ----------
    dict_file : str or pathlib.Path object, or list of these
        File or list of files to be read
    id_col : str, default = 'ID'
        Name of column containing unique IDs; used as the dataframe index if present
    search_col : str, default = 'DESCRIPTION'
        Name of column containing search terms (geologic formations)
    definition_col : str, default = 'LITHOLOGY'
        Name of column containing interpretations of search terms (lithologies)
    class_flag_col : str, default = 'CLASS_FLAG'
        Name of column containing the classification flag
    dictionary_type : str or None, {None, 'exact', 'start', 'wildcard'}
        Indicator of which kind of dictionary terms are being read, by default None.
        - If None, the filename is used to try to determine it; otherwise class_flag is used
        - If 'exact', terms are used for exact matches to geologic descriptions
        - If 'start', terms are used with .startswith() for inexact matches
        - If 'wildcard', terms are used to find any matching substring
    class_flag : int, default = 6
        Classification flag used when dictionary_type is None and cannot be
        otherwise determined, by default 6
    rem_extra_cols : bool, default = True
        Whether to keep only the DESCRIPTION/LITHOLOGY/CLASS_FLAG columns
    verbose : bool, default = False
        Whether to print function call information
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    dict_terms : pandas.DataFrame
        Pandas dataframe (or list of dataframes if multiple files were given)
        formatted for the classification steps of this package
    """
    if dict_file is None:
        dict_file = w4h.get_resources()['LithologyDict_Exact']

    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_dictionary_terms, locals())

    # Read files into pandas dataframes
    dict_terms = []
    if isinstance(dict_file, (tuple, list)):
        for i, f in enumerate(dict_file):
            # FIX: entries may be plain strings; coerce so .exists() works
            f = pathlib.Path(f)
            if not f.exists():
                # Placeholder for missing files so indices still line up
                dict_terms.append(pd.DataFrame(columns=['ID', 'DESCRIPTION', 'LITHOLOGY', 'CLASS_FLAGS']))
            else:
                dict_terms.append(pd.read_csv(f))
            if id_col in dict_terms[i].columns:
                dict_terms[i].set_index(id_col, drop=True, inplace=True)
    else:
        dict_file = pathlib.Path(dict_file)
        if dict_file.exists() and dict_file.is_file():
            dict_terms.append(pd.read_csv(dict_file, low_memory=False))
            if id_col in dict_terms[-1].columns:
                dict_terms[-1].set_index(id_col, drop=True, inplace=True)
            dict_file = [dict_file]  # normalize to a list for the loop below
        else:
            print(f'ERROR: dict_file ({dict_file}) does not exist.')
            # Return an empty dataframe so downstream code still gets a frame
            return pd.DataFrame(columns=['ID', 'DESCRIPTION', 'LITHOLOGY', "CLASS_FLAGS"])

    # Keywords recognized in dictionary_type or in the filename
    searchTermList = ['searchterm', 'search', 'exact']
    startTermList = ['startterm', 'start', 'startswith']
    wildcardTermList = ['wildcard', 'substring', ]

    # Target dtypes for the key columns
    # NOTE(review): keyed by the *input* column names, but the cast below runs
    # after the rename, so it only matches when the default names are used -- verify
    dict_termDtypes = {search_col: str, definition_col: str, class_flag_col: np.uint8}

    if dictionary_type is None:
        dictionary_type = ''  # allow string methods on this variable

    # Iterate to allow reading multiple dict files at once (also works with just one)
    for i, d in enumerate(dict_terms):
        # 1: exact match, 2: (not defined...ML?), 3: bedrock, 4: start term,
        # 5: wildcard/substring, 6: undefined
        if dictionary_type.lower() in searchTermList or (dictionary_type == '' and 'spec' in str(dict_file[i]).lower()):
            d[class_flag_col] = 1
        elif dictionary_type.lower() in startTermList or (dictionary_type == '' and 'start' in str(dict_file[i]).lower()):
            d[class_flag_col] = 4  # Start term classification flag
        elif dictionary_type.lower() in wildcardTermList or (dictionary_type == '' and 'wildcard' in str(dict_file[i]).lower()):
            d[class_flag_col] = 5  # Wildcard term classification flag
        else:
            d[class_flag_col] = class_flag  # Custom classification flag, defined as argument

        # Rename columns so names are consistent through rest of code
        if search_col != 'DESCRIPTION':
            d.rename(columns={search_col: 'DESCRIPTION'}, inplace=True)
        if definition_col != 'LITHOLOGY':
            d.rename(columns={definition_col: 'LITHOLOGY'}, inplace=True)
        if class_flag_col != 'CLASS_FLAG':
            d.rename(columns={class_flag_col: 'CLASS_FLAG'}, inplace=True)

        # Cast key columns to their target dtypes
        # FIX: inner loop no longer shadows the outer index variable i
        for col_num in range(np.shape(d)[1]):
            col = d.iloc[:, col_num]
            if col.name in dict_termDtypes:
                d.iloc[:, col_num] = col.astype(dict_termDtypes[col.name])

        # Delete duplicate definitions before resetting the index
        d.drop_duplicates(subset='DESCRIPTION', inplace=True)
        d.reset_index(inplace=True, drop=True)

        # Whether to remove extra columns that aren't needed from dataframe
        if rem_extra_cols:
            d = d[['DESCRIPTION', 'LITHOLOGY', 'CLASS_FLAG']]
        # FIX: write the (possibly trimmed) frame back; previously the
        # rem_extra_cols subset was lost because d was only rebound locally
        dict_terms[i] = d

    # If only one file designated, return the dataframe rather than a list
    if len(dict_terms) == 1:
        dict_terms = dict_terms[0]
    return dict_terms

# read_grid(grid_path=None, grid_type='model', no_data_val_grid=0, use_service=False, study_area=None, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False, **kwargs)
Reads in grid
Parameters
grid_path:strorpathlib.Path, default=None- Path to a grid file
grid_type:str, default='model'- Sets what type of grid to load in
no_data_val_grid:int, default=0- Sets the no data value of the grid
use_service:str, default=False- Sets which service the function uses
study_area:geopandas.GeoDataFrame, default=None- Dataframe containing study area polygon
grid_crs:str, default=None- Sets crs to use if clipping to study area
log:bool, default= False- Whether to log results to log file, by default False
Returns
gridIN:xarray.DataArray- Returns grid
Expand source code
def read_grid(grid_path=None, grid_type='model', no_data_val_grid=0, use_service=False, study_area=None, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False, **kwargs):
    """Reads in grid

    Parameters
    ----------
    grid_path : str or pathlib.Path, default=None
        Path to a grid file
    grid_type : str, default='model'
        Sets what type of grid to load in ('model' delegates to read_model_grid)
    no_data_val_grid : int, default=0
        Sets the no data value of the grid
    use_service : bool or str, default=False
        Which web service to use when not reading a local file:
        False (local file), 'wcs', 'wms', or any other truthy value (wms)
    study_area : geopandas.GeoDataFrame, default=None
        Dataframe containing study area polygon
    grid_crs : str, default=None
        Sets crs to use if clipping to study area ('isws' uses the packaged CRS)
    output_crs : str, default='EPSG:4269'
        CRS the returned grid is reprojected to
    verbose : bool, default=False
        Whether to print function call information
    log : bool, default = False
        Whether to log results to log file, by default False
    **kwargs
        Passed through to the underlying read functions

    Returns
    -------
    gridIN : xarray.DataArray
        Returns grid
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_grid, locals(), exclude_params=['study_area'])

    if grid_type == 'model':
        rgrid = kwargs['read_grid'] if 'read_grid' in kwargs else True
        # FIX: pass the no_data_val_grid parameter through (was hard-coded to 0)
        gridIN = read_model_grid(model_grid_path=grid_path, study_area=study_area,
                                 no_data_val_grid=no_data_val_grid, read_grid=rgrid,
                                 grid_crs=grid_crs, output_crs=output_crs, verbose=verbose)
    else:
        if use_service == False:
            gridIN = rxr.open_rasterio(grid_path)
        elif use_service.lower() == 'wcs':
            # FIX: rely on read_wcs's default wcs_url (same lidarURL) instead of
            # passing it explicitly, which raised TypeError when kwargs also
            # contained 'wcs_url'
            gridIN = read_wcs(study_area, **kwargs)
        elif use_service.lower() == 'wms':
            # FIX: read_wms takes wms_url, not wcs_url; its default is lidarURL
            gridIN = read_wms(study_area, **kwargs)
        elif use_service:
            # Defaults to wms
            gridIN = read_wms(study_area, **kwargs)

    if study_area is not None:
        if grid_crs is None:
            try:
                grid_crs = pyproj.CRS.from_wkt(gridIN.rio.crs.to_wkt())
            except Exception:
                # Grid carries no usable CRS; fall back to the packaged ISWS CRS
                iswsCRS = pyproj.CRS.from_json_dict(w4h.read_dict(w4h.get_resources()['ISWS_CRS']))
                gridIN.rio.write_crs(iswsCRS)
        elif isinstance(grid_crs, str) and grid_crs.lower() == 'isws':
            # FIX: guard with isinstance so a non-string grid_crs does not crash
            iswsCRS = pyproj.CRS.from_json_dict(w4h.read_dict(w4h.get_resources()['ISWS_CRS']))
            gridIN.rio.write_crs(iswsCRS)
        # Reproject both study area and grid to the output CRS, then clip
        study_area = study_area.to_crs(output_crs)
        gridIN = gridIN.rio.reproject(output_crs)
        gridIN = grid2study_area(study_area=study_area, grid=gridIN, output_crs=output_crs,)
    else:
        gridIN = gridIN.rio.reproject(output_crs)

    if 'band' in gridIN.dims:
        gridIN = gridIN.sel(band=1)
    try:
        no_data_val_grid = gridIN.attrs['_FillValue']  # extract from dataset itself
    except KeyError:
        pass  # keep the value supplied by the caller
    gridIN = gridIN.where(gridIN != no_data_val_grid, other=np.nan)  # replace no data values with NaNs
    return gridIN

# read_lithologies(lith_file=None, interp_col='LITHOLOGY', target_col='CODE', use_cols=None, verbose=False, log=False)
Function to read lithology file into pandas dataframe
Parameters
lith_file:strorpathlib.Path object, default= None- Filename of lithology file. If None, default is contained within repository, by default None
interp_col:str, default= 'LITHOLOGY'- Column to used to match interpretations
target_col:str, default= 'CODE'- Column to be used as target code
use_cols:list, default= None- Which columns to use when reading in dataframe. If None, defaults to ['LITHOLOGY', 'CODE'].
log : bool, default = False - Whether to log inputs and outputs to log file.
Returns
pandas.DataFrame- Pandas dataframe with lithology information
Expand source code
def read_lithologies(lith_file=None, interp_col='LITHOLOGY', target_col='CODE', use_cols=None, verbose=False, log=False):
    """Read a lithology interpretation file into a pandas DataFrame.

    Parameters
    ----------
    lith_file : str or pathlib.Path object, default = None
        Filename of lithology file. If None, the default file shipped with the
        package (LithInterps_FineCoarse) is used.
    interp_col : str, default = 'LITHOLOGY'
        Column used to match interpretations.
    target_col : str, default = 'CODE'
        Column used as the target code.
    use_cols : list, default = None
        Columns to read from the file. If None, defaults to [interp_col, target_col].
    verbose : bool, default = False
        Whether to print function call information.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the columns renamed to 'INTERPRETATION' and 'TARGET'.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_lithologies, locals())

    # Fall back to the packaged lithology-interpretation file
    if lith_file is None:
        import w4h
        lith_file = w4h.get_resources()['LithInterps_FineCoarse']
    if not isinstance(lith_file, pathlib.PurePath):
        lith_file = pathlib.Path(lith_file)

    selected_cols = [interp_col, target_col] if use_cols is None else use_cols
    lith_df = pd.read_csv(lith_file, usecols=selected_cols)
    lith_df = lith_df.rename(columns={interp_col: 'INTERPRETATION', target_col: 'TARGET'})
    return lith_df

# read_model_grid(model_grid_path, study_area=None, no_data_val_grid=0, read_grid=True, node_byspace=True, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False)
Reads in model grid to xarray data array
Parameters
grid_path:str- Path to model grid file
study_area:geopandas.GeoDataFrame, default=None- Dataframe containing study area polygon
no_data_val_grid:int, default=0- value assigned to areas with no data
read_grid : bool, default=True - Whether to read an existing grid file (True) or create a new grid (False)
node_byspace : bool, default=True - Whether to create grid nodes by fixed spacing (True) or by fixed node count (False)
output_crs:str, default='EPSG:4269'- Inputs study area crs
grid_crs:str, default=None- Inputs grid crs
log:bool, default= False- Whether to log results to log file, by default False
Returns
modelGrid:xarray.DataArray- Data array containing model grid
Expand source code
def read_model_grid(model_grid_path, study_area=None, no_data_val_grid=0, read_grid=True, node_byspace=True, grid_crs=None, output_crs='EPSG:4269', verbose=False, log=False):
    """Reads in model grid to xarray data array

    Parameters
    ----------
    model_grid_path : str
        Path to model grid file
    study_area : geopandas.GeoDataFrame, default=None
        Dataframe containing study area polygon
    no_data_val_grid : int, default=0
        Value assigned to areas with no data
    read_grid : bool, default=True
        Whether to read an existing grid file (True) or create a new grid (False)
    node_byspace : bool, default=True
        Whether to create grid nodes by fixed spacing (True) or fixed node count (False)
    grid_crs : str, default=None
        Input grid crs ('isws' or a file path use the packaged ISWS CRS)
    output_crs : str, default='EPSG:4269'
        CRS the grid/study area are reprojected to
    verbose : bool, default=False
        Whether to print function call information
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    modelGrid : xarray.DataArray
        Data array containing model grid (None if inputs are insufficient)
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_model_grid, locals(), exclude_params='study_area')

    if read_grid and model_grid_path is not None:
        modelGridIN = rxr.open_rasterio(model_grid_path)
        if grid_crs is None:
            try:
                grid_crs = modelGridIN.spatial_ref.crs_wkt
            except Exception:
                # No CRS on the grid; fall back to the packaged ISWS CRS
                iswsCRSPath = w4h.get_resources()['ISWS_CRS']
                grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
            modelGridIN.rio.write_crs(grid_crs)
        elif isinstance(grid_crs, str) and grid_crs.lower() == 'isws':
            iswsCRSPath = w4h.get_resources()['ISWS_CRS']
            grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
            modelGridIN.rio.write_crs(grid_crs)
        elif isinstance(grid_crs, pathlib.PurePath) or (isinstance(grid_crs, str) and pathlib.Path(grid_crs).exists()):
            # NOTE(review): a path-like grid_crs is currently ignored in favor of
            # the packaged ISWS CRS -- confirm intent
            iswsCRSPath = w4h.get_resources()['ISWS_CRS']
            grid_crs = w4h.read_dict(iswsCRSPath, keytype=None)
            modelGridIN.rio.write_crs(grid_crs)
        else:
            warnings.warn(f'CRS Specification for grid is {grid_crs}, but this cannot be written to the grid')

        modelGridIN = modelGridIN.rio.reproject(output_crs)
        if study_area is not None:
            study_area = study_area.to_crs(output_crs)
            modelGrid = grid2study_area(study_area=study_area, grid=modelGridIN, output_crs=output_crs)
        else:
            modelGrid = modelGridIN
        try:
            noDataVal = float(modelGrid.attrs['_FillValue'])  # extract from dataset itself
        except (KeyError, TypeError, ValueError):
            noDataVal = no_data_val_grid
        modelGrid = modelGrid.where(modelGrid != noDataVal, other=np.nan)  # replace no data values with NaNs
        # FIX: replace every non-NaN cell with 1; the original compared
        # `modelGrid == np.nan`, which is always False and clobbered the NaNs too
        modelGrid = modelGrid.where(modelGrid.isnull(), other=1)
        modelGrid.rio.reproject(grid_crs, inplace=True)
    elif model_grid_path is None and study_area is None:
        if verbose:
            print("ERROR: Either model_grid_path or study_area must be defined.")
        # FIX: return None explicitly instead of raising NameError below
        modelGrid = None
    else:
        # Create a grid covering the study area from scratch
        # FIX: iswsCRSPath was never assigned in this branch (NameError)
        iswsCRSPath = w4h.get_resources()['ISWS_CRS']
        spatRefDict = w4h.read_dict(iswsCRSPath, keytype=None)
        saExtent = study_area.total_bounds
        startX = saExtent[0]  # starting X coordinate
        startY = saExtent[1]  # starting Y coordinate
        endX = saExtent[2]
        endY = saExtent[3]
        if node_byspace:
            xSpacing = 625  # X node spacing
            ySpacing = xSpacing  # Y node spacing
            x = np.arange(startX, endX, xSpacing)
            y = np.arange(startY, endY, ySpacing)
        else:
            xNodes = 100  # number of X nodes
            yNodes = 100  # number of Y nodes
            x = np.linspace(startX, endX, num=xNodes)
            y = np.linspace(startY, endY, num=yNodes)

        xx, yy = np.meshgrid(x, y)
        zz = np.ones_like(xx).transpose()
        yIn = np.flipud(y)  # y axis runs top-to-bottom in raster convention
        coords = {'x': x, 'y': yIn, 'spatial_ref': 0}
        dims = {'x': x, 'y': yIn}
        modelGrid = xr.DataArray(data=zz, coords=coords, attrs={'_FillValue': no_data_val_grid}, dims=dims)
        modelGrid.spatial_ref.attrs['spatial_ref'] = {}
        if grid_crs is None or grid_crs == 'isws' or grid_crs == 'ISWS':
            for k in spatRefDict:
                modelGrid.spatial_ref.attrs[k] = spatRefDict[k]
    return modelGrid

# read_raw_csv(data_filepath, metadata_filepath, data_cols=None, metadata_cols=None, xcol='LONGITUDE', ycol='LATITUDE', well_key='API_NUMBER', encoding='latin-1', verbose=False, log=False, **read_csv_kwargs)
Easy function to read raw .txt files output from (for example), an Access database
Parameters
data_filepath:str- Filename of the file containing data, including the extension.
metadata_filepath:str- Filename of the file containing metadata, including the extension.
data_cols:list, default= None- List with strings with names of columns from txt file to keep after reading. If None, ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"], by default None.
metadata_cols:list, default= None- List with strings with names of columns from txt file to keep after reading. If None, ['API_NUMBER',"TOTAL_DEPTH","SECTION","TWP","TDIR","RNG","RDIR","MERIDIAN","QUARTERS","ELEVATION","ELEVREF","COUNTY_CODE","LATITUDE","LONGITUDE","ELEVSOURCE"], by default None
xcol : str, default = 'LONGITUDE' - Name of column in metadata file indicating the x-location of the well, by default 'LONGITUDE'
ycol:str, default= 'LATITUDE'- Name of the column in metadata file indicating the y-location of the well, by default 'LATITUDE'
well_key:str, default= 'API_NUMBER'- Name of the column with the key/identifier that will be used to merge data later, by default 'API_NUMBER'
encoding:str, default= 'latin-1'- Encoding of the data in the input files, by default 'latin-1'
verbose:bool, default= False- Whether to print the number of rows in the input columns, by default False
log:bool, default= False- Whether to log inputs and outputs to log file.
**read_csv_kwargs- **kwargs that get passed to pd.read_csv()
Returns
(pandas.DataFrame, pandas.DataFrame/None) Tuple/list with two pandas dataframes: (well_data, metadata) metadata is None if only well_data is used
Expand source code
def read_raw_csv(data_filepath, metadata_filepath, data_cols=None, metadata_cols=None, xcol='LONGITUDE', ycol='LATITUDE', well_key='API_NUMBER', encoding='latin-1', verbose=False, log=False, **read_csv_kwargs):
    """Easy function to read raw .txt files output from (for example), an Access database

    Parameters
    ----------
    data_filepath : str or pandas.DataFrame
        Filename of the file containing data, including the extension
        (an already-loaded DataFrame is also accepted).
    metadata_filepath : str, pandas.DataFrame, or None
        Filename of the file containing metadata, including the extension.
    data_cols : list, default = None
        Columns of the data file to keep. If 'auto',
        ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"] is used;
        if None, all columns are kept.
    metadata_cols : list, default = None
        Columns of the metadata file to keep. If 'auto', a standard well-header
        column list is used; if None, all columns are kept.
    xcol : str, default = 'LONGITUDE'
        Name of column in metadata file indicating the x-location of the well.
    ycol : str, default = 'LATITUDE'
        Name of the column in metadata file indicating the y-location of the well.
    well_key : str, default = 'API_NUMBER'
        Name of the column with the key/identifier used to merge data later.
    encoding : str, default = 'latin-1'
        Encoding of the data in the input files.
    verbose : bool, default = False
        Whether to print the number of rows in the input columns.
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    **read_csv_kwargs
        **kwargs that get passed to pd.read_csv()

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame/None)
        Tuple with two pandas dataframes: (well_data, metadata).
        metadata is None if only well_data is used.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_raw_csv, locals())

    # FIX: the original condition `not isinstance(x, pd.DataFrame) or
    # isinstance(x, gpd.GeoDataFrame)` was mis-grouped: passing a DataFrame left
    # downholeDataIN undefined (NameError). Accept DataFrames directly instead.
    if isinstance(data_filepath, (pd.DataFrame, gpd.GeoDataFrame)):
        downholeDataIN = data_filepath
    else:
        downholeDataIN = pd.read_csv(data_filepath, sep=',', header='infer', encoding=encoding, **read_csv_kwargs)

    if data_cols is None:
        data_useCols = downholeDataIN.columns
    elif data_cols == 'auto':
        data_useCols = ["API_NUMBER","TABLE_NAME","FORMATION","THICKNESS","TOP","BOTTOM"]
    else:
        data_useCols = data_cols
    downholeDataIN = downholeDataIN[data_useCols]

    if metadata_filepath is None:
        headerDataIN = None
    elif isinstance(metadata_filepath, (pd.DataFrame, gpd.GeoDataFrame)):
        # FIX: a metadata DataFrame was previously discarded (set to None)
        headerDataIN = metadata_filepath
    else:
        headerDataIN = pd.read_csv(metadata_filepath, sep=',', header='infer', encoding=encoding, **read_csv_kwargs)

    if metadata_cols is None and headerDataIN is not None:
        metadata_useCols = headerDataIN.columns
    elif metadata_cols == 'auto':
        metadata_useCols = ['API_NUMBER',"TOTAL_DEPTH","SECTION","TWP","TDIR","RNG","RDIR","MERIDIAN","QUARTERS","ELEVATION","ELEVREF","COUNTY_CODE","LATITUDE","LONGITUDE","ELEVSOURCE"]
    else:
        metadata_useCols = metadata_cols
    if headerDataIN is not None:
        headerDataIN = headerDataIN[metadata_useCols]

    # Drop data with no well key (API)
    downholeDataIN = downholeDataIN.dropna(subset=[well_key])
    if headerDataIN is not None:
        headerDataIN = headerDataIN.dropna(subset=[well_key])  # drop metadata with no API
        # Drop metadata with no or missing location information
        headerDataIN = headerDataIN.dropna(subset=[ycol])
        headerDataIN = headerDataIN.dropna(subset=[xcol])
        # Reset index so index goes from 0 in numerical/integer order
        headerDataIN.reset_index(inplace=True, drop=True)
    else:
        # ***UPDATE: Need to make sure these columns exist in this case***
        # Drop data with no or missing location information
        downholeDataIN = downholeDataIN.dropna(subset=[ycol])
        downholeDataIN = downholeDataIN.dropna(subset=[xcol])
        downholeDataIN.reset_index(inplace=True, drop=True)

    # Print outputs to terminal, if designated
    if verbose:
        print('Data file has ' + str(downholeDataIN.shape[0])+' valid well records.')
        if headerDataIN is not None:
            print("Metadata file has "+str(headerDataIN.shape[0])+" unique wells with valid location information.")
    return downholeDataIN, headerDataIN

# read_study_area(study_area=None, output_crs='EPSG:4269', buffer=None, return_original=False, log=False, verbose=False, **read_file_kwargs)
Read study area geospatial file into geopandas
Parameters
study_area:str, pathlib.Path, geopandas.GeoDataFrame,orshapely.Geometry- Filepath to any geospatial file readable by geopandas. Polygon is best, but may work with other types if extent is correct.
output_crs : str, tuple, dict, optional - CRS designation readable by geopandas/pyproj to which the study area is reprojected
buffer:Noneornumeric, default=None- If None, no buffer created. If a numeric value is given (float or int, for example), a buffer will be created at that distance in the unit of the study_area_crs.
return_original:bool, default=False- Whether to return the (reprojected) study area as well as the (reprojected) buffered study area. Study area is only used for clipping data, so usually return_original=False is sufficient.
log:bool, default= False- Whether to log results to log file, by default False
verbose:bool, default=False- Whether to print status and results to terminal
Returns
studyAreaIN:geopandas dataframe- Geopandas dataframe with polygon geometry.
Expand source code
def read_study_area(study_area=None, output_crs='EPSG:4269', buffer=None, return_original=False, log=False, verbose=False, **read_file_kwargs):
    """Read study area geospatial file into geopandas

    Parameters
    ----------
    study_area : str, pathlib.Path, geopandas.GeoDataFrame, or shapely.Geometry
        Filepath to any geospatial file readable by geopandas. Polygon is best,
        but may work with other types if extent is correct.
    output_crs : str, tuple, dict, optional
        CRS designation readable by geopandas/pyproj
    buffer : None or numeric, default=None
        If None, no buffer created. If a numeric value is given, a buffer will be
        created at that distance in the unit of the study area crs.
    return_original : bool, default=False
        Whether to also return the unbuffered study area. The study area is only
        used for clipping data, so usually return_original=False is sufficient.
    log : bool, default = False
        Whether to log results to log file, by default False
    verbose : bool, default=False
        Whether to print status and results to terminal
    **read_file_kwargs
        Passed to gpd.read_file(); 'crs' is also used for shapely Geometry input.

    Returns
    -------
    studyAreaIN : geopandas dataframe
        Geopandas dataframe with polygon geometry
        (tuple of (buffered, unbuffered) if return_original=True).
    """
    if study_area is None:
        study_area = w4h.get_resources()['study_area']
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_study_area, locals())

    if isinstance(study_area, (gpd.GeoDataFrame, gpd.GeoSeries)):
        studyAreaIN = study_area
    elif isinstance(study_area, shapely.Geometry):
        if 'crs' in read_file_kwargs:
            crs = read_file_kwargs['crs']
        else:
            # FIX: message was not an f-string, so {output_crs} printed literally
            warnings.warn(f'A shapely Geometry object was read in as the study area, but no crs specified. Using CRS specified in output_crs: {output_crs}. You can specify the crs as a keyword argument in the read_study_area() function')
            crs = output_crs
        studyAreaIN = gpd.GeoDataFrame(index=[0], crs=crs, geometry=[study_area])
    else:
        studyAreaIN = gpd.read_file(study_area, **read_file_kwargs)

    studyAreaIN.to_crs(output_crs, inplace=True)

    # FIX: always keep the unbuffered copy; previously return_original=True with
    # buffer=None raised NameError on studyAreaNoBuffer
    studyAreaNoBuffer = studyAreaIN
    # Create a buffered study area for clipping
    if buffer is not None:
        studyAreaIN = studyAreaIN.buffer(distance=buffer)
        if verbose:
            print('\tBuffer applied.')
    if verbose:
        print("\tStudy area read.")

    if return_original:
        return studyAreaIN, studyAreaNoBuffer
    return studyAreaIN

# read_wcs(study_area, wcs_url=lidarURL, res_x=30, res_y=30, verbose=False, log=False, **kwargs)
Reads a WebCoverageService from a url and returns a rioxarray dataset containing it.
Parameters
study_area:geopandas.GeoDataFrame- Dataframe containing study area polygon
wcs_url:str, default=lidarURL- Represents the url for the WCS
res_x:int, default=30- Sets resolution for x axis
res_y:int, default=30- Sets resolution for y axis
log:bool, default= False- Whether to log results to log file, by default False
**kwargs
Returns
wcsData_rxr:xarray.DataArray- A xarray dataarray holding the image from the WebCoverageService
Expand source code
def read_wcs(study_area, wcs_url=lidarURL, res_x=30, res_y=30, verbose=False, log=False, **kwargs):
    """Reads a WebCoverageService from a url and returns a rioxarray dataset containing it.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        Dataframe containing study area polygon
    wcs_url : str, default=lidarURL
        Represents the url for the WCS
    res_x : int, default=30
        Sets resolution for x axis
    res_y : int, default=30
        Sets resolution for y axis
    verbose : bool, default=False
        Whether to print function call information
    log : bool, default = False
        Whether to log results to log file, by default False
    **kwargs
        May override wcs_url, res_x, res_y

    Returns
    -------
    wcsData_rxr : xarray.DataArray
        A xarray dataarray holding the image from the WebCoverageService
    """
    # Drawn largely from:
    # https://git.wur.nl/isric/soilgrids/soilgrids.notebooks/-/blob/master/01-WCS-basics.ipynb
    if study_area is None:
        print('ERROR: study_area must be specified to use read_wcs (currently set to {})'.format(study_area))
        return
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_wcs, locals(), exclude_params=['study_area'])

    # kwargs overrides of the named parameters
    if 'wcs_url' in kwargs:
        wcs_url = kwargs['wcs_url']
    if 'res_x' in kwargs:
        res_x = kwargs['res_x']
    if 'res_y' in kwargs:
        res_y = kwargs['res_y']
    width_in = ''
    height_in = ''

    # Create coverage object
    my_wcs = WebCoverageService(wcs_url, version='1.0.0')
    dataID = 'IL_Statewide_Lidar_DEM'
    data = my_wcs.contents[dataID]
    dBBox = data.boundingboxes
    study_area = study_area.to_crs(data.boundingboxes[0]['nativeSrs'])
    saBBox = study_area.total_bounds

    # In case study area bounding box goes outside data bounding box,
    # use data bounding box values
    newBBox = []
    for i, c in enumerate(dBBox[0]['bbox']):
        if i == 0 or i == 2:
            if saBBox[i] < c:
                newBBox.append(saBBox[i])
            else:
                newBBox.append(c)
        else:
            if saBBox[i] > c:
                newBBox.append(saBBox[i])
            else:
                newBBox.append(c)

    # Recalculate resolution if it is too fine to read in
    saWidth = saBBox[2]-saBBox[0]
    saHeight = saBBox[3]-saBBox[1]
    saBBoxAreaM = saWidth*saHeight
    if saBBoxAreaM/(res_x*res_y) > (4100*15000)*0.457194:  # apparent max download size
        print("Resolution inputs overriden, file request too large.")
        res_x = str(round(saWidth/2500, 2))
        width_in = str(int(saWidth/float(res_x)))
        height_in = str(int(saHeight/float(res_x)))
        # FIX: height_in is a string; the original divided a float by a str (TypeError)
        res_y = str(round(saHeight/float(height_in), 2))
        print('New resolution is: '+res_x+'m_x X '+res_y+'m_y')
        print('Dataset size: '+width_in+' pixels_x X '+height_in+' pixels_y')

    bBox = tuple(newBBox)
    dataCRS = 'EPSG:3857'
    # Format WCS request using owslib
    response = my_wcs.getCoverage(
        identifier=my_wcs[dataID].id,
        crs=dataCRS,
        bbox=bBox,
        resx=res_x, resy=res_y,
        timeout=60,
        format='GeoTIFF')

    # Read the returned GeoTIFF bytes into a rioxarray data array
    with MemoryFile(response) as memfile:
        with memfile.open() as dataset:
            wcsData_rxr = rxr.open_rasterio(dataset)
    return wcsData_rxr

# read_wms(study_area, layer_name='IL_Statewide_Lidar_DEM_WGS:None', wms_url=lidarURL, srs='EPSG:3857', clip_to_studyarea=True, bbox=[-9889002.6155, 5134541.069716, -9737541.607038, 5239029.6274], res_x=30, res_y=30, size_x=512, size_y=512, format='image/tiff', verbose=False, log=False, **kwargs)
Reads a WebMapService from a url and returns a rioxarray dataset containing it.
Parameters
study_area:geopandas.GeoDataFrame- Dataframe containg study area polygon
layer_name:str, default='IL_Statewide_Lidar_DEM_WGS:None'- Represents the layer name in the WMS
wms_url:str, default=lidarURL- Represents the url for the WMS
srs:str, default='EPSG:3857'- Sets the srs
clip_to_studyarea:bool, default=True- Whether to clip to study area or not
res_x:int, default=30- Sets resolution for x axis
res_y : int, default=30 - Sets resolution for y axis
size_x:int, default=512- Sets width of result
size_y:int, default=512- Sets height of result
log:bool, default= False- Whether to log results to log file, by default False
Returns
wmsData_rxr:xarray.DataArray- Holds the image from the WebMapService
Expand source code
def read_wms(study_area, layer_name='IL_Statewide_Lidar_DEM_WGS:None', wms_url=lidarURL, srs='EPSG:3857', clip_to_studyarea=True, bbox=[-9889002.615500,5134541.069716,-9737541.607038,5239029.627400], res_x=30, res_y=30, size_x=512, size_y=512, format='image/tiff', verbose=False, log=False, **kwargs):
    """Reads a WebMapService from a url and returns a rioxarray dataset containing it.

    Parameters
    ----------
    study_area : geopandas.GeoDataFrame
        Dataframe containing study area polygon; required (the request bbox is
        derived from its total bounds when clip_to_studyarea=True).
    layer_name : str, default='IL_Statewide_Lidar_DEM_WGS:None'
        Represents the layer name in the WMS
    wms_url : str, default=lidarURL
        Represents the url for the WMS (lidarURL is a module-level constant
        defined elsewhere in this file)
    srs : str, default='EPSG:3857'
        Sets the srs
    clip_to_studyarea : bool, default=True
        Whether to clip to study area or not
    bbox : list, default=[-9889002.6155, 5134541.069716, -9737541.607038, 5239029.6274]
        Bounding box used for the request when clip_to_studyarea=False.
        NOTE(review): mutable default argument — safe only while this function
        never mutates it; consider a tuple.
    res_x : int, default=30
        Sets resolution for x axis. NOTE(review): not passed to the WMS request
        in the current implementation — TODO confirm intent.
    res_y : int, default=30
        Sets resolution for y axis. NOTE(review): not passed to the WMS request
        in the current implementation — TODO confirm intent.
    size_x : int, default=512
        Sets width of result (pixels)
    size_y : int, default=512
        Sets height of result (pixels)
    format : str, default='image/tiff'
        MIME type requested from the service
    verbose : bool, default=False
        Whether to print status information to the terminal
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    wmsData_rxr : xarray.DataArray
        Holds the image from the WebMapService
    """
    # study_area is required; bail out early (returns None) rather than raising
    if study_area is None:
        print('ERROR: study_area must be specified to use read_wms (currently set to {})'.format(study_area))
        return
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(read_wms, locals(), exclude_params=['study_area'])
    # Lazy import: owslib is only needed when this function is actually called
    from owslib.wms import WebMapService
    # Define WMS endpoint URL (an explicit kwargs entry overrides the named parameter)
    if 'wms_url' in kwargs:
        wms_url = kwargs['wms_url']
    else:
        wms_url = wms_url  # NOTE(review): no-op self-assignment, kept for symmetry with the kwargs branch
    # Create WMS connection object (performs a GetCapabilities request over the network)
    wms = WebMapService(wms_url)
    # Print available layers
    #print(wms.contents)
    # Select desired layer (kwargs override, as above)
    if 'layer_name' in kwargs:
        layer = kwargs['layer_name']
    else:
        layer = layer_name
    data = wms.contents#[layer]
    # Project the study area into the requested srs and take its total bounds
    if 'srs' in kwargs:
        studyArea_proj = study_area.to_crs(kwargs['srs'])
        saBBox = studyArea_proj.total_bounds
    else:
        studyArea_proj = study_area.to_crs(srs)
        saBBox = studyArea_proj.total_bounds
    if layer == 'IL_Statewide_Lidar_DEM_WGS:None':
        # Special case for the statewide lidar DEM layer: compare the study-area
        # bounds against the layer's advertised bounding box
        dBBox = data['0'].boundingBox #Is this an error?
        gpdDict = {'Label': ['Surf Data Box'],
                   'geometry': [shapely.geometry.Polygon(((dBBox[0], dBBox[1]),
                                                          (dBBox[0], dBBox[3]),
                                                          (dBBox[2], dBBox[3]),
                                                          (dBBox[2], dBBox[1]),
                                                          (dBBox[0], dBBox[1])))]}
        dBBoxGDF = gpd.GeoDataFrame(gpdDict, crs=dBBox[4])
        dBBoxGDF.to_crs(srs)  # NOTE(review): return value discarded — probably intended dBBoxGDF = dBBoxGDF.to_crs(srs)
        #In case study area bounding box goes outside data bounding box, use data bounding box values
        newBBox = []  # NOTE(review): computed but never used below — the request uses saBBox/bbox directly; TODO confirm
        for i,c in enumerate(dBBox):
            if type(c) is str:
                pass  # skip the trailing CRS string element of boundingBox
            elif i == 0 or i==2:
                # x entries (indices 0 and 2): keep the smaller of study-area vs data value
                if saBBox[i] < c:
                    newBBox.append(saBBox[i])
                else:
                    newBBox.append(c)
            else:
                # y entries (indices 1 and 3): keep the larger of study-area vs data value
                if saBBox[i] > c:
                    newBBox.append(saBBox[i])
                else:
                    newBBox.append(c)
    # Study-area extent in srs units
    # NOTE(review): original formatting was collapsed — these are assumed to sit at
    # function level (not inside the if-block above); both are unused below.
    saWidth = saBBox[2]-saBBox[0]
    saHeight = saBBox[3]-saBBox[1]
    # Check kwargs for rest of parameters
    if 'size_x' in kwargs:
        size_x = kwargs['size_x']
    if 'size_y' in kwargs:
        size_y = kwargs['size_y']
    if 'format' in kwargs:
        format = kwargs['format']
    if 'clip_to_studyarea' in kwargs:
        clip_to_studyarea = kwargs['clip_to_studyarea']
    #get the wms
    if clip_to_studyarea:
        # Request only the study-area bounding box
        img = wms.getmap(layers=[layer], srs=srs, bbox=saBBox, size=(size_x, size_y), format=format, transparent=True, timeout=60)
    else:
        # Request the default (or user-supplied) bounding box
        img = wms.getmap(layers=[layer], srs=srs, bbox=bbox, size=(size_x, size_y), format=format, transparent=True, timeout=60)
    #Save wms in memory to a raster dataset
    with MemoryFile(img) as memfile:
        with memfile.open() as dataset:
            wmsData_rxr = rxr.open_rasterio(dataset)
    #if clip_to_studyarea:
    #    wmsData_rxr = wmsData_rxr.sel(x=slice(saBBox[0], saBBox[2]), y=slice(saBBox[3], saBBox[1]))#.sel(band=1)
    return wmsData_rxr
Function to read file containing xyz data (elevation/location)
Parameters
xyzpath:strorpathlib.Path- Filepath of the xyz file, including extension
datatypes:dict, default= None- Dictionary containing the datatypes for the columns in the xyz file. If None, {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64}, by default None
verbose:bool, default= False- Whether to print the number of xyz records to the terminal, by default False
log:bool, default= False- Whether to log inputs and outputs to log file.
Returns
pandas.DataFrame- Pandas dataframe containing the elevation and location data
Expand source code
def read_xyz(xyzpath, datatypes=None, verbose=False, log=False): """Function to read file containing xyz data (elevation/location) Parameters ---------- xyzpath : str or pathlib.Path Filepath of the xyz file, including extension datatypes : dict, default = None Dictionary containing the datatypes for the columns int he xyz file. If None, {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64}, by default None verbose : bool, default = False Whether to print the number of xyz records to the terminal, by default False log : bool, default = False Whether to log inputs and outputs to log file. Returns ------- pandas.DataFrame Pandas dataframe containing the elevation and location data """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(read_xyz, locals()) if datatypes is None: xyzDTypes = {'ID':np.uint32,'API_NUMBER':np.uint64,'LATITUDE':np.float64,'LONGITUDE':np.float64,'ELEV_FT':np.float64} xyzDataIN = pd.read_csv(xyzpath, sep=',', header='infer', dtype=xyzDTypes, index_col='ID') if verbose: print('XYZ file has ' + str(xyzDataIN.shape[0])+' records with elevation and location.') return xyzDataIN def remerge_data(classifieddf, searchdf)-
Function to merge newly-classified (or not) and previously classified data
Parameters
classifieddf:pandas.DataFrame- Dataframe that had already been classified previously
searchdf:pandas.DataFrame- Dataframe with new classifications
Returns
remergeDF:pandas.DataFrame- Dataframe containing all the data, merged back together
Expand source code
def remerge_data(classifieddf, searchdf): """Function to merge newly-classified (or not) and previously classified data Parameters ---------- classifieddf : pandas.DataFrame Dataframe that had already been classified previously searchdf : pandas.DataFrame Dataframe with new classifications Returns ------- remergeDF : pandas.DataFrame Dataframe containing all the data, merged back together """ remergeDF = pd.concat([classifieddf,searchdf], join='inner').sort_index() return remergeDF def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False)-
Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)
Parameters
df_with_depth:pandas.DataFrame- Pandas dataframe containing the well records and descriptions for each interval
top_col:str, default='TOP'- The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
bottom_col:str, default='BOTTOM'- The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
depth_type:str, {'depth', 'elevation'}- Whether the table is organized by depth or elevation. If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
verbose:bool, default= False- Whether to print results to the terminal, by default False
log:bool, default= False- Whether to log results to log file, by default False
Returns
pandas.Dataframe- Pandas dataframe with the records removed where the top is indicated to be below the bottom.
Expand source code
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Function to remove all records in the dataframe with well interpretations where the depth information is bad (i.e., where the bottom of the record is nearer to the surface than the top)

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Pandas dataframe containing the well records and descriptions for each interval
    top_col : str, default='TOP'
        The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
    bottom_col : str, default='BOTTOM'
        The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
    depth_type : str, {'depth', 'elevation'}
        Whether the table is organized by depth or elevation. If depth, the top column will have smaller values than the bottom column. If elevation, the top column will have higher values than the bottom column, by default 'depth'
    verbose : bool, default = False
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.Dataframe
        Pandas dataframe with the records removed where the top is indicated to be below the bottom.

    Raises
    ------
    ValueError
        If depth_type is not one of 'depth', 'elevation', or 'elev'.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    depth_type = depth_type.lower()
    if depth_type == 'depth':
        # Depth tables: bottom is the larger number
        df_with_depth['THICKNESS'] = df_with_depth[bottom_col] - df_with_depth[top_col]
    elif depth_type in ('elevation', 'elev'):
        # Elevation tables: top is the larger number
        # (BUGFIX: 'elev' is now matched case-insensitively like 'elevation')
        df_with_depth['THICKNESS'] = df_with_depth[top_col] - df_with_depth[bottom_col]
    else:
        # BUGFIX: previously an unrecognized depth_type left 'THICKNESS' undefined,
        # producing an opaque KeyError below; fail fast with a clear message instead.
        raise ValueError("depth_type must be 'depth' or 'elevation', not {!r}".format(depth_type))

    before = df_with_depth.shape[0]  # Number of rows before dropping
    # Keep only rows where interval thickness is non-negative (bottom is deeper than top)
    df_with_depth = df_with_depth[(df_with_depth['THICKNESS'] >= 0)]
    df_with_depth.reset_index(inplace=True, drop=True)  # Reset index

    if verbose:
        after = df_with_depth.shape[0]
        print('Removed well records with obviously bad depth information. ')
        print("\tNumber of records before removing: " + str(before))
        print("\tNumber of records after removing: " + str(after))
        # BUGFIX: summary previously said "without depth information"
        # (copy-pasted from remove_no_depth); these records had *bad* depths.
        print("\t\t{} well records removed with bad depth information".format(before - after))
    return df_with_depth
Function to remove well intervals with no depth information
Parameters
df_with_depth:pandas.DataFrame- Dataframe containing well descriptions
top_col:str, optional- Name of column containing information on the top of the well intervals, by default 'TOP'
bottom_col:str, optional- Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
no_data_val_table:any, optional- No data value in the input data, used by this function to indicate that depth data is not there, to be replaced by np.nan, by default ''
verbose:bool, optional- Whether to print results to console, by default False
log:bool, default= False- Whether to log results to log file, by default False
Returns
df_with_depth:pandas.DataFrame- Dataframe with depths dropped
Expand source code
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Function to remove well intervals with no depth information

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions
    top_col : str, optional
        Name of column containing information on the top of the well intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        No data value in the input data, used by this function to indicate that depth data is not there, to be replaced by np.nan, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        Dataframe with depths dropped
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Replace no-data values in the top and bottom columns with np.nan so dropna catches them
    df_with_depth[top_col] = df_with_depth[top_col].replace(no_data_val_table, np.nan)
    df_with_depth[bottom_col] = df_with_depth[bottom_col].replace(no_data_val_table, np.nan)

    before = df_with_depth.shape[0]  # Number of rows before dropping

    # Drop records missing either depth value
    # (IDIOM: single dropna over both columns, was two sequential calls)
    df_with_depth = df_with_depth.dropna(subset=[top_col, bottom_col])
    df_with_depth.reset_index(inplace=True, drop=True)  # Reset index

    if verbose:
        after = df_with_depth.shape[0]
        # BUGFIX: the summary string previously contained a stray line break
        print('Removed well records with no depth information.')
        print("\tNumber of records before removing: " + str(before))
        print("\tNumber of records after removing: " + str(after))
        print("\t\t{} well records removed without depth information".format(before - after))
    return df_with_depth
Function that removes all records in the dataframe containing the well descriptions where no description is given.
Parameters
df_with_descriptions:pandas.DataFrame- Pandas dataframe containing the well records with their individual descriptions
description_col:str, optional- Name of the column containing the geologic description of each interval, by default 'FORMATION'
no_data_val_table:str, optional- The value expected if the column is empty or there is no data. These will be replaced by np.nan before being removed, by default ''
verbose:bool, optional- Whether to print the results of this step to the terminal, by default False
log:bool, default= False- Whether to log results to log file, by default False
Returns
pandas.DataFrame- Pandas dataframe with records with no description removed.
Expand source code
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Function that removes all records in the dataframe containing the well descriptions where no description is given.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Pandas dataframe containing the well records with their individual descriptions
    description_col : str, optional
        Name of the column containing the geologic description of each interval, by default 'FORMATION'
    no_data_val_table : str, optional
        The value expected if the column is empty or there is no data. These will be replaced by np.nan before being removed, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe with records with no description removed.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Replace no-data values in the description column with np.nan so dropna catches them
    df_with_descriptions[description_col] = df_with_descriptions[description_col].replace(no_data_val_table, np.nan)

    before = df_with_descriptions.shape[0]  # Number of rows before dropping

    # Drop records without description information
    df_with_descriptions = df_with_descriptions.dropna(subset=[description_col])
    df_with_descriptions.reset_index(inplace=True, drop=True)  # Reset index

    if verbose:
        after = df_with_descriptions.shape[0]
        # BUGFIX: the summary string previously contained a stray line break
        print('Removed well records without geologic descriptions.')
        print("\tNumber of records before removing: " + str(before))
        print("\tNumber of records after removing: " + str(after))
        print("\t\t{} well records removed without geologic descriptions".format(before - after))
    return df_with_descriptions
Function to remove wells that do not have topography data (needed for layer selection later).
This function is intended to be run on the metadata table after an attempt has been made to add elevations.
Parameters
df_with_topo:pandas.DataFrame- Pandas dataframe containing elevation information.
zcol:str- Name of elevation column
no_data_val_table:any- Value in dataset that indicates no data is present (replaced with np.nan)
verbose:bool, optional- Whether to print outputs, by default True
log:bool, default= False- Whether to log results to log file, by default False
Returns
pandas.DataFrame- Pandas dataframe with intervals with no topography removed.
Expand source code
def remove_no_topo(df_with_topo, zcol='ELEVATION', no_data_val_table='', verbose=False, log=False): """Function to remove wells that do not have topography data (needed for layer selection later). This function is intended to be run on the metadata table after elevations have attempted to been added. Parameters ---------- df_with_topo : pandas.DataFrame Pandas dataframe containing elevation information. zcol : str Name of elevation column no_data_val_table : any Value in dataset that indicates no data is present (replaced with np.nan) verbose : bool, optional Whether to print outputs, by default True log : bool, default = False Whether to log results to log file, by default False Returns ------- pandas.DataFrame Pandas dataframe with intervals with no topography removed. """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo']) before = df_with_topo.shape[0] df_with_topo[zcol].replace(no_data_val_table, np.nan, inplace=True) df_with_topo.dropna(subset=[zcol], inplace=True) if verbose: after = df_with_topo.shape[0] print('Removed well records with no surface elevation information. ') print("\tNumber of records before removing: "+str(before)) print("\tNumber of records after removing: "+str(after)) print("\t\t{} wells records removed without surface elevation information".format(before-after)) return df_with_topo def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False)-
Function to remove wells and well intervals where there is no location information
Parameters
df_with_locations:pandas.DataFrame- Pandas dataframe containing well descriptions
xcol:str, default='LONGITUDE'- Name of the column containing the x coordinates; ycol:str, default='LATITUDE'- Name of the column containing the y coordinates; no_data_val_table:any, default=''- Value indicating missing data, replaced with np.nan before records are dropped
log:bool, default= False- Whether to log results to log file, by default False
Returns
df_with_locations:pandas.DataFrame- Pandas dataframe containing only data with location information
Expand source code
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False): """Function to remove wells and well intervals where there is no location information Parameters ---------- df_with_locations : pandas.DataFrame Pandas dataframe containing well descriptions metadata_DF : pandas.DataFrame Pandas dataframe containing metadata, including well locations (e.g., Latitude/Longitude) log : bool, default = False Whether to log results to log file, by default False Returns ------- df_with_locations : pandas.DataFrame Pandas dataframe containing only data with location information """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) if verbose: verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations']) before = df_with_locations.shape[0] #Extract length of data before this process df_with_locations[xcol].replace(no_data_val_table, np.nan, inplace=True) df_with_locations[ycol].replace(no_data_val_table, np.nan, inplace=True) df_with_locations.dropna(subset=xcol, inplace=True) df_with_locations.dropna(subset=ycol, inplace=True) if verbose: after = df_with_locations.shape[0] print('Removed well records with no location information. ') print("\tNumber of records before removing: "+str(before)) print("\tNumber of records after removing: "+str(after)) print("\t\t{} wells records removed without location information".format(before-after)) return df_with_locations def run(well_data, surf_elev_grid, bedrock_elev_grid, model_grid=None, metadata=None, layers=9, well_data_cols=None, well_metadata_cols=None, description_col='FORMATION', top_col='TOP', bottom_col='BOTTOM', depth_type='depth', study_area=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEVATION', well_id_col='API_NUMBER', output_crs='EPSG:4269', lith_dict=None, lith_dict_start=None, lith_dict_wildcard=None, target_dict=None, target_name='', export_dir=None, verbose=False, log=False, **kw_params)-
w4h.run() is a function that runs the intended workflow of the wells4hydrogeology (w4h) package. This means that it runs several constituent functions. The workflow that this follows is provided in the package wiki. It accepts the parameters of the constituent functions. To see a list of these functions and parameters, use
help(w4h.run). The following functions used in w4h.run() are listed below, along with their parameters and default values for those parameters. See the documentation for each of the individual functions for more information on a specific parameter:file_setup
well_data | default = '<no default>' metadata | default = None data_filename | default = '*ISGS_DOWNHOLE_DATA*.txt' metadata_filename | default = '*ISGS_HEADER*.txt' log_dir | default = None verbose | default = False log | default = Falseread_raw_csv
data_filepath | default = '<output of previous function>' metadata_filepath | default = '<output of previous function>' data_cols | default = None metadata_cols | default = None xcol | default = 'LONGITUDE' ycol | default = 'LATITUDE' well_key | default = 'API_NUMBER' encoding | default = 'latin-1' verbose | default = False log | default = False read_csv_kwargs | default = {}define_dtypes
undefined_df | default = '<output of previous function>' datatypes | default = None verbose | default = False log | default = Falsemerge_metadata
data_df | default = '<output of previous function>' header_df | default = '<output of previous function>' data_cols | default = None header_cols | default = None auto_pick_cols | default = False drop_duplicate_cols | default = True log | default = False verbose | default = False kwargs | default = {}coords2geometry
df_no_geometry | default = '<output of previous function>' xcol | default = 'LONGITUDE' ycol | default = 'LATITUDE' zcol | default = 'ELEV_FT' input_coords_crs | default = 'EPSG:4269' use_z | default = False wkt_col | default = 'WKT' geometry_source | default = 'coords' verbose | default = False log | default = Falseread_study_area
study_area | default = None output_crs | default = 'EPSG:4269' buffer | default = None return_original | default = False log | default = False verbose | default = False read_file_kwargs | default = {}clip_gdf2study_area
study_area | default = '<output of previous function>' gdf | default = '<output of previous function>' log | default = False verbose | default = Falseread_grid
grid_path | default = None grid_type | default = 'model' no_data_val_grid | default = 0 use_service | default = False study_area | default = None grid_crs | default = None output_crs | default = 'EPSG:4269' verbose | default = False log | default = False kwargs | default = {}add_control_points
df_without_control | default = '<output of previous function>' df_control | default = None xcol | default = 'LONGITUDE' ycol | default = 'LATITUDE' zcol | default = 'ELEV_FT' controlpoints_crs | default = 'EPSG:4269' output_crs | default = 'EPSG:4269' description_col | default = 'FORMATION' interp_col | default = 'INTERPRETATION' target_col | default = 'TARGET' verbose | default = False log | default = False kwargs | default = {}remove_nonlocated
df_with_locations | default = '<output of previous function>' xcol | default = 'LONGITUDE' ycol | default = 'LATITUDE' no_data_val_table | default = '' verbose | default = False log | default = Falseremove_no_topo
df_with_topo | default = '<output of previous function>' zcol | default = 'ELEVATION' no_data_val_table | default = '' verbose | default = False log | default = Falseremove_no_depth
df_with_depth | default = '<output of previous function>' top_col | default = 'TOP' bottom_col | default = 'BOTTOM' no_data_val_table | default = '' verbose | default = False log | default = Falseremove_bad_depth
df_with_depth | default = '<output of previous function>' top_col | default = 'TOP' bottom_col | default = 'BOTTOM' depth_type | default = 'depth' verbose | default = False log | default = Falseremove_no_description
df_with_descriptions | default = '<output of previous function>' description_col | default = 'FORMATION' no_data_val_table | default = '' verbose | default = False log | default = Falseget_search_terms
spec_path | default = 'C:\Users\riley\LocalData\Github\wells4hydrogeology/resources/' spec_glob_pattern | default = '*SearchTerms-Specific*' start_path | default = None start_glob_pattern | default = '*SearchTerms-Start*' wildcard_path | default = None wildcard_glob_pattern | default = '*SearchTerms-Wildcard' verbose | default = False log | default = Falseread_dictionary_terms
dict_file | default = None id_col | default = 'ID' search_col | default = 'DESCRIPTION' definition_col | default = 'LITHOLOGY' class_flag_col | default = 'CLASS_FLAG' dictionary_type | default = None class_flag | default = 6 rem_extra_cols | default = True verbose | default = False log | default = Falsespecific_define
df | default = '<output of previous function>' terms_df | default = '<output of previous function>' description_col | default = 'FORMATION' terms_col | default = 'DESCRIPTION' verbose | default = False log | default = Falsesplit_defined
df | default = '<output of previous function>' classification_col | default = 'CLASS_FLAG' verbose | default = False log | default = Falsestart_define
df | default = '<output of previous function>' terms_df | default = '<output of previous function>' description_col | default = 'FORMATION' terms_col | default = 'DESCRIPTION' verbose | default = False log | default = Falsewildcard_define
df | default = '<output of previous function>' terms_df | default = '<output of previous function>' description_col | default = 'FORMATION' terms_col | default = 'DESCRIPTION' verbose | default = False log | default = Falseremerge_data
classifieddf | default = '<output of previous function>' searchdf | default = '<output of previous function>'fill_unclassified
df | default = '<output of previous function>' classification_col | default = 'CLASS_FLAG'read_lithologies
lith_file | default = None interp_col | default = 'LITHOLOGY' target_col | default = 'CODE' use_cols | default = None verbose | default = False log | default = Falsemerge_lithologies
well_data_df | default = '<output of previous function>' targinterps_df | default = '<output of previous function>' interp_col | default = 'INTERPRETATION' target_col | default = 'TARGET' target_class | default = 'bool'align_rasters
grids_unaligned | default = None model_grid | default = None no_data_val_grid | default = 0 verbose | default = False log | default = Falseget_drift_thick
surface_elev | default = None bedrock_elev | default = None layers | default = 9 plot | default = False verbose | default = False log | default = Falsesample_raster_points
raster | default = None points_df | default = None well_id_col | default = 'API_NUMBER' xcol | default = 'LONGITUDE' ycol | default = 'LATITUDE' new_col | default = 'SAMPLED' verbose | default = True log | default = Falseget_layer_depths
df_with_depths | default = '<output of previous function>' surface_elev_col | default = 'SURFACE_ELEV' layer_thick_col | default = 'LAYER_THICK' layers | default = 9 log | default = Falselayer_target_thick
df | default = '<output of previous function>' layers | default = 9 return_all | default = False export_dir | default = None outfile_prefix | default = None depth_top_col | default = 'TOP' depth_bot_col | default = 'BOTTOM' log | default = Falselayer_interp
points | default = '<no default>' grid | default = '<no default>' layers | default = None interp_kind | default = 'nearest' return_type | default = 'dataarray' export_dir | default = None target_col | default = 'TARG_THICK_PER' layer_col | default = 'LAYER' xcol | default = None ycol | default = None xcoord | default = 'x' ycoord | default = 'y' log | default = False verbose | default = False kwargs | default = {}export_grids
grid_data | default = '<no default>' out_path | default = '<no default>' file_id | default = '' filetype | default = 'tif' variable_sep | default = True date_stamp | default = True verbose | default = False log | default = False"Expand source code
def run(well_data, surf_elev_grid, bedrock_elev_grid, model_grid=None, metadata=None, layers=9, well_data_cols=None, well_metadata_cols=None, description_col='FORMATION', top_col='TOP', bottom_col='BOTTOM', depth_type='depth', study_area=None, xcol='LONGITUDE', ycol='LATITUDE', zcol='ELEVATION', well_id_col='API_NUMBER', output_crs='EPSG:4269', lith_dict=None, lith_dict_start=None, lith_dict_wildcard=None, target_dict=None, target_name='', export_dir=None, verbose=False, log=False, **kw_params):
    """Function to run entire process with one line of code.

    NOTE: verbose and log are boolean parameters used for most of the functions.
    verbose=True prints information to terminal, log=True logs to a file in the
    log_dir, which defaults to the export_dir

    Parameters
    ----------
    well_data : str or pathlib.Path obj
        Filepath to file or directory containing well data.
    surf_elev_grid : str or pathlib.Path object
        _description_
    bedrock_elev_grid : str or pathlib.Path object
        _description_
    model_grid : str or pathlib.Path object, or model grid parameters (see model_grid function)
        _description_
    metadata : str or pathlib.Path object, or None, default=None
        Filepath to file or directory containing well metadata, such as location
        and elevation. If None, will check if well_data is a directory, and if
        so, will use metadata_filename to search in same directory.
    well_data_cols : List or list-like
        Columns to
    well_metadata_cols : List or list-like
        _description_
    layers : int, default = 9
        The number of layers in the model grid
    description_col : str, default = 'FORMATION'
        Name of column containing geologic descriptions of the well interval.
        This column should be in well_data.
    top_col : str, default = 'TOP'
        Name of column containing depth/elevation at top of well interval.
        This column should be in well_data.
    bottom_col : str, default = 'BOTTOM'
        Name of column containing depth/elevation at bottom of well interval.
        This column should be in well_data.
    depth_type : str, default = 'depth'
        Whether values top_col or bottom_col refer to depth or elevation.
    study_area : str or pathlib.Path object, or geopandas.GeoDataFrame
        _description_
    xcol : str, default = 'LONGITUDE'
        Name of column containing x coordinates. This column should be in
        metadata unless metadata is not read, then it should be in well_data.
    ycol : str, default = 'LATITUDE'
        Name of column containing y coordinates. This column should be in
        metadata unless metadata is not read, then it should be in well_data.
    zcol : str, default = 'ELEVATION'
        Name of column containing z coordinates. This column should be in
        metadata unless metadata is not read, then it should be in well_data.
    well_id_col : str, default = 'API_NUMBER'
        Name of column uniquely identifying each well; used (with top_col)
        to sort the combined dataframe before layer calculations.
    output_crs : crs definition accepted by pyproj, default = 'EPSG:4269'
        CRS to output all of the data into
    lith_dict : str or pathlib.Path object, or pandas.DataFrame
        _description_
    lith_dict_start : str or pathlib.Path object, or pandas.DataFrame
        _description_
    lith_dict_wildcard : str or pathlib.Path object, or pandas.DataFrame
        _description_
    target_dict : str or pathlib.Path object, or pandas.DataFrame
        _description_
    target_name : str, default = ''
        Name of target of interest, to be used on exported files
    export_dir : str or pathlib.Path object, default = None
        Directory to export output files
    verbose : bool, default = False
        Whether to print updates/results
    log : bool, default = False
        Whether to send parameters and outputs to log file, to be saved in
        export_dir, or the same directory as well_data if export_dir not defined.
    **kw_params
        Keyword parameters used by any of the functions throughout the process.
        See list of functions above, and the API documentation for their
        possible parameters

    Returns
    -------
    (resdf, layers_data) : tuple
        resdf is the output of w4h.layer_target_thick; layers_data is the
        interpolated grid output of w4h.layer_interp.
    """
    if verbose:
        verbose_print(run, locals())

    #Get data (files or otherwise)
    # Forward only the kw_params that w4h.file_setup actually accepts
    # (NOTE(review): locals()['kw_params'] is just kw_params)
    file_setup_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.file_setup).parameters.keys()}

    #Check how well_data and metadata were defined
    if isinstance(well_data, pathlib.PurePath) or isinstance(well_data, str):
        #Convert well_data to pathlib.Path if not already
        if isinstance(well_data, str):
            well_data = pathlib.Path(well_data)
        if metadata is None:
            if well_data.is_dir():
                #If the two files are supposed to be in the same directory (or just want well_data found)
                well_dataPath, metadataPath = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
            elif well_data.exists():
                #If well_data is a file, and metadata is not used
                well_dataPath, _ = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
                metadataPath = None
            else:
                #Need for well_data to exist at the very least
                raise IOError('well_data file does not exist:{}'.format(well_data))
        elif isinstance(metadata, pathlib.PurePath) or isinstance(metadata, str):
            #Metadata has specifically been specified by a filepath
            # NOTE(review): unlike the other branches, verbose/log are not
            # forwarded to file_setup here — confirm whether intentional.
            if isinstance(metadata, str):
                metadata = pathlib.Path(metadata)
            well_dataPath, metadataPath = w4h.file_setup(well_data=well_data, metadata=metadata, **file_setup_kwargs)
        else:
            if isinstance(metadata, pd.DataFrame):
                well_dataPath, _ = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
                metadataPath = metadata
            elif metadata is None:
                # NOTE(review): unreachable — the outer `if metadata is None`
                # branch above already handles this case; also metadataPath is
                # left unbound here, which would raise NameError downstream.
                well_dataPath, _ = w4h.file_setup(well_data=well_data, verbose=verbose, log=log, **file_setup_kwargs)
    elif isinstance(well_data, pd.DataFrame):
        if isinstance(metadata, pd.DataFrame):
            well_dataPath = well_data
            metadataPath = metadata
        elif isinstance(metadata, pathlib.PurePath) or isinstance(metadata, str):
            _, metadataPath = w4h.file_setup(well_data=metadata, metadata=metadata, verbose=verbose, log=log,
                                             **file_setup_kwargs)
            well_dataPath = well_data
        else:
            # NOTE(review): prints instead of raising, and leaves
            # well_dataPath/metadataPath unbound — later code will fail.
            print('ERROR: metadata must be a string filepath, a pathlib.Path object, or pandas.DataFrame')
    else:
        print('ERROR: well_data must be a string filepath, a pathlib.Path object, or pandas.DataFrame')

    # Build a timestamped default export directory unless the caller passed one
    # (export_dir=False explicitly disables export-directory creation)
    if not export_dir:
        if export_dir is False:
            pass
        else:
            nowTime = datetime.datetime.now()
            nowTime = str(nowTime).replace(':', '-').replace(' ','_').split('.')[0]
            nowTimeStr = '_'+str(nowTime)
            outDir = 'Output_'+nowTimeStr
            if isinstance(well_dataPath, pd.DataFrame) or isinstance(well_dataPath, gpd.GeoDataFrame):
                export_dir = pathlib.Path(outDir)
            elif isinstance(well_dataPath, pathlib.PurePath):
                if well_dataPath.is_dir():
                    export_dir = well_dataPath.joinpath(outDir)
                else:
                    export_dir = well_dataPath.parent.joinpath(outDir)
            else:
                raise IOError('export_dir should be explicitly defined if well_data is not a filepath')
            if not export_dir.exists():
                try:
                    export_dir.mkdir()
                except Exception:
                    # Best-effort: downstream export steps may still fail if
                    # the directory could not be created
                    print('Export Directory not created')

    #Get pandas dataframes from input
    read_raw_txt_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.read_raw_csv).parameters.keys()}
    well_data_IN, metadata_IN = w4h.read_raw_csv(data_filepath=well_dataPath, metadata_filepath=metadataPath, verbose=verbose, log=log, **read_raw_txt_kwargs)
    #Functions to read data into dataframes. Also excludes extraneous columns, and drops header data with no location information

    #Define data types (file will need to be updated)
    # NOTE(review): relative './resources/...' paths depend on the current
    # working directory — confirm these resolve when run as an installed package.
    well_data_DF = w4h.define_dtypes(undefined_df=well_data_IN, datatypes='./resources/downholeDataTypes.txt', verbose=verbose, log=log)
    metadata_DF = w4h.define_dtypes(undefined_df=metadata_IN, datatypes='./resources/headerDataTypes.txt', verbose=verbose, log=log)

    if metadata_DF is None:
        well_data_xyz = well_data_DF
    else:
        merge_metadata_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.merge_metadata).parameters.keys()}
        well_data_xyz = w4h.merge_metadata(data_df=well_data_DF, header_df=metadata_DF, data_cols=None, header_cols=None, auto_pick_cols=False, drop_duplicate_cols=True, log=False, **merge_metadata_kwargs)

    #Convert well_data_xyz to have geometry
    coords2geometry_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.coords2geometry).parameters.keys()}
    well_data_xyz = w4h.coords2geometry(df_no_geometry=well_data_xyz, xcol=xcol, ycol=ycol, zcol=zcol, verbose=verbose, log=log, **coords2geometry_kwargs)

    #Get Study area
    read_study_area_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.read_study_area).parameters.keys()}
    if study_area is None:
        studyAreaIN = None
        use_study_area = False
    else:
        studyAreaIN = w4h.read_study_area(study_area_path=study_area, log=log, output_crs=output_crs, **read_study_area_kwargs)
        use_study_area = True
        clip_gdf2study_area_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.clip_gdf2study_area).parameters.keys()}
        well_data_xyz = w4h.clip_gdf2study_area(study_area=studyAreaIN, gdf=well_data_xyz, verbose=verbose, log=log,**clip_gdf2study_area_kwargs)

    #Get surfaces and grid(s)
    read_grid_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.read_grid).parameters.keys()}
    modelGridPath = model_grid
    surfaceElevPath = surf_elev_grid
    bedrockElevPath = bedrock_elev_grid
    modelGrid = w4h.read_grid(grid_path=modelGridPath, grid_type='model', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)
    surfaceElevGridIN = w4h.read_grid(grid_path=surfaceElevPath, grid_type='surface', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)
    bedrockElevGridIN = w4h.read_grid(grid_path=bedrockElevPath, grid_type='bedrock', study_area=studyAreaIN, verbose=verbose, log=log, **read_grid_kwargs)
    #UPDATE: MAKE SURE CRS's all align ***

    #Add control points
    add_control_points_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.add_control_points).parameters.keys()}
    well_data_xyz = w4h.add_control_points(df_without_control=well_data_xyz, xcol=xcol, ycol=ycol, zcol=zcol, top_col=top_col, bottom_col=bottom_col, description_col=description_col, verbose=verbose, log=log, **add_control_points_kwargs)

    #Clean up data
    well_data_xyz = w4h.remove_nonlocated(df_with_locations=well_data_xyz, log=log, verbose=verbose)
    well_data_xyz = w4h.remove_no_topo(df_with_topo=well_data_xyz, zcol=zcol, verbose=verbose, log=log)
    remove_no_depth_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.remove_no_depth).parameters.keys()}
    well_data_xyz = w4h.remove_no_depth(well_data_xyz, verbose=verbose, top_col=top_col, bottom_col=bottom_col, log=log, **remove_no_depth_kwargs)  #Drop records with no depth information
    remove_bad_depth_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.remove_bad_depth).parameters.keys()}
    well_data_xyz = w4h.remove_bad_depth(well_data_xyz, verbose=verbose, top_col=top_col, bottom_col=bottom_col, depth_type=depth_type, log=log, **remove_bad_depth_kwargs)  #Drop records with bad depth information (i.e., top depth > bottom depth) (Also calculates thickness of each record)
    remove_no_formation_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.remove_no_description).parameters.keys()}
    well_data_xyz = w4h.remove_no_description(well_data_xyz, description_col=description_col, verbose=verbose, log=log, **remove_no_formation_kwargs)

    #CLASSIFICATION
    #Read dictionary definitions and classify
    get_search_terms_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.get_search_terms).parameters.keys()}
    specTermsPATH, startTermsPATH, wildcardTermsPATH, = w4h.get_search_terms(spec_path=lith_dict, start_path=lith_dict_start, wildcard_path=lith_dict_wildcard, log=log, **get_search_terms_kwargs)
    read_dictionary_terms_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.read_dictionary_terms).parameters.keys()}
    if 'class_flag' in read_dictionary_terms_kwargs.keys():
        del read_dictionary_terms_kwargs['class_flag']  #This is specific to an individual dict terms file, so don't want to use for all
    specTerms = w4h.read_dictionary_terms(dict_file=specTermsPATH, log=log, **read_dictionary_terms_kwargs)
    startTerms = w4h.read_dictionary_terms(dict_file=startTermsPATH, log=log, **read_dictionary_terms_kwargs)
    wildcardTerms = w4h.read_dictionary_terms(dict_file=wildcardTermsPATH, log=log, **read_dictionary_terms_kwargs)

    #Clean up dictionary terms
    specTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    specTerms.reset_index(inplace=True, drop=True)
    startTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    startTerms.reset_index(inplace=True, drop=True)
    wildcardTerms.drop_duplicates(subset='DESCRIPTION', inplace=True)
    wildcardTerms.reset_index(inplace=True, drop=True)

    if verbose:
        # NOTE(review): the three '{}' placeholders below are never filled in
        # (.format() is missing), so the literal braces are printed.
        print('Search terms to be used:')
        print('\t {} exact match term/definition pairs')
        print('\t {} starting match term/definition pairs')
        print('\t {} wildcard match term/definition pairs')

    #CLASSIFICATIONS
    #Exact match classifications
    well_data_xyz = w4h.specific_define(well_data_xyz, terms_df=specTerms, description_col=description_col, verbose=verbose, log=log)

    #.startswith classifications
    if lith_dict_start is not None:
        classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
        searchDF = w4h.start_define(df=searchDF, terms_df=startTerms, description_col=description_col, verbose=verbose, log=log)
        well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF)  #UPDATE: Needed? ***

    #wildcard/any substring match classifications
    if lith_dict_wildcard is not None:
        classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
        searchDF = w4h.wildcard_define(df=searchDF, terms_df=wildcardTerms, description_col=description_col, verbose=verbose, log=log)
        well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF)  #UPDATE: Needed? ***

    #Depth classification
    # NOTE(review): the 550 threshold is hard-coded here — confirm whether it
    # should be exposed as a parameter of run().
    classifedDF, searchDF = w4h.split_defined(well_data_xyz, verbose=verbose, log=log)
    searchDF = w4h.depth_define(df=searchDF, thresh=550, verbose=verbose, log=log)
    well_data_xyz = w4h.remerge_data(classifieddf=classifedDF, searchdf=searchDF)  #UPDATE: Needed? ***

    #Fill unclassified data
    well_data_xyz = w4h.fill_unclassified(well_data_xyz, classification_col='CLASS_FLAG')

    #Add target interpretations
    read_lithologies_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.read_lithologies).parameters.keys()}
    targetInterpDF = w4h.read_lithologies(lith_file=target_dict, log=log, **read_lithologies_kwargs)
    well_data_xyz = w4h.merge_lithologies(well_data_df=well_data_xyz, targinterps_df=targetInterpDF, target_col='TARGET', target_class='bool')

    #Sort dataframe to prepare for next steps
    #well_data_xyz = w4h.sort_dataframe(df=well_data_xyz, sort_cols=['API_NUMBER','TOP'], remove_nans=True)
    well_data_xyz = well_data_xyz.sort_values(by=[well_id_col, top_col])
    well_data_xyz.reset_index(inplace=True, drop=True)
    #UPDATE: Option to remove nans?
    well_data_xyz = well_data_xyz[pd.notna(well_data_xyz["LITHOLOGY"])]

    #Analyze Surface(s) and grid(s)
    bedrockGrid, surfaceGrid = w4h.align_rasters(grids_unaligned=[bedrockElevGridIN, surfaceElevGridIN], model_grid=modelGrid, no_data_val_grid=0, log=log)
    driftThickGrid, layerThickGrid = w4h.get_drift_thick(surface_elev=surfaceGrid, bedrock_elev=bedrockGrid, layers=layers, plot=verbose, log=log)
    well_data_xyz = w4h.sample_raster_points(raster=bedrockGrid, points_df=well_data_xyz, xcol=xcol, ycol=ycol, new_col='BEDROCK_ELEV', verbose=verbose, log=log)
    well_data_xyz = w4h.sample_raster_points(raster=surfaceGrid, points_df=well_data_xyz, xcol=xcol, ycol=ycol, new_col='SURFACE_ELEV', verbose=verbose, log=log)
    # Per-well bedrock depth and uniform layer thickness derived from the
    # sampled surface and bedrock elevations
    well_data_xyz['BEDROCK_DEPTH'] = well_data_xyz['SURFACE_ELEV'] - well_data_xyz['BEDROCK_ELEV']
    well_data_xyz['LAYER_THICK'] = well_data_xyz['BEDROCK_DEPTH'] / layers
    well_data_xyz = w4h.get_layer_depths(df_with_depths=well_data_xyz, layers=layers, log=log)

    layer_target_thick_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.layer_target_thick).parameters.keys()}
    if 'return_all' in layer_target_thick_kwargs.keys():
        del layer_target_thick_kwargs['return_all']  #This needs to be set to False, so we don't want it reading in twice
    resdf = w4h.layer_target_thick(df=well_data_xyz, layers=layers, return_all=False, export_dir=export_dir, depth_top_col=top_col, depth_bot_col=bottom_col, log=log, **layer_target_thick_kwargs)

    layer_interp_kwargs = {k: v for k, v in locals()['kw_params'].items() if k in inspect.signature(w4h.layer_interp).parameters.keys()}
    # NOTE(review): layers is hard-coded to 9 here instead of using the
    # `layers` parameter — likely a bug when layers != 9; confirm and fix.
    layers_data = w4h.layer_interp(points=resdf, grid=modelGrid, layers=9, verbose=verbose, log=log, **layer_interp_kwargs)

    nowTime = datetime.datetime.now()
    nowTime = str(nowTime).replace(':', '-').replace(' ','_').split('.')[0]
    nowTimeStr = '_'+str(nowTime)
    #THIS MAY BE REPEAT OF LAST LINES OF layer_interp()
    w4h.export_grids(grid_data=layers_data, out_path=export_dir, file_id=target_name,filetype='tif', variable_sep=True, date_stamp=True, verbose=verbose, log=log)

    return resdf, layers_data
Sample raster values to points from geopandas geodataframe.
Parameters
raster:rioxarray data array- Raster containing values to be sampled.
points_df:geopandas.geodataframe- Geopandas dataframe with geometry column containing point values to sample.
well_id_col:str, default="API_NUMBER"- Column that uniquely identifies each well so multiple sampling points are not taken per well
xcol:str, default='LONGITUDE'- Column containing name for x-column, by default 'LONGITUDE.' This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
ycol:str, default='LATITUDE'- Column containing name for y-column, by default 'LATITUDE.' This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
new_col:str, default='SAMPLED'- Name for name of new column containing points sampled from the raster, by default 'SAMPLED'.
verbose:bool, default=True- Whether to send to print() information about progress of function, by default True.
log:bool, default= False- Whether to log results to log file, by default False
Returns
points_df:geopandas.geodataframe- Same as points_df, but with sampled values and potentially with reprojected coordinates.
Expand source code
def sample_raster_points(raster=None, points_df=None, well_id_col='API_NUMBER', xcol='LONGITUDE', ycol='LATITUDE', new_col='SAMPLED', verbose=True, log=False):
    """Sample raster values to points from geopandas geodataframe.

    Parameters
    ----------
    raster : rioxarray data array
        Raster containing values to be sampled.
    points_df : geopandas.geodataframe
        Geopandas dataframe with geometry column containing point values to sample.
    well_id_col : str, default="API_NUMBER"
        Column that uniquely identifies each well so multiple sampling points are not taken per well
    xcol : str, default='LONGITUDE'
        Column containing name for x-column, by default 'LONGITUDE.'
        This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
    ycol : str, default='LATITUDE'
        Column containing name for y-column, by default 'LATITUDE.'
        This is used to output (potentially) reprojected point coordinates so as not to overwrite the original.
    new_col : str, default='SAMPLED'
        Name for name of new column containing points sampled from the raster, by default 'SAMPLED'.
    verbose : bool, default=True
        Whether to send to print() information about progress of function, by default True.
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    points_df : geopandas.geodataframe
        Same as points_df, but with sampled values and potentially with reprojected coordinates.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    # Fall back to packaged sample resources when inputs are not provided
    if raster is None:
        raster = w4h.get_resources()['surf_elev']
    if points_df is None:
        points_df = w4h.get_resources()['well_data']

    if verbose:
        verbose_print(sample_raster_points, locals(), exclude_params=['raster', 'points_df'])
        # Fixed: message grammar ("grid for at all" -> "grid at all")
        print(f"\tSampling {new_col} grid at all well locations.")

    #Project raster to the points' CRS if they differ, so .sel() coordinates match
    rastercrsWKT = raster.spatial_ref.crs_wkt
    if rastercrsWKT != points_df.crs:
        if verbose:
            print("\tTemporarily reprojecting raster data to point data's CRS.")
        pointCRS = points_df.crs.to_epsg()
        if pointCRS is None:
            # No EPSG code available; fall back to WKT representation
            pointCRS = points_df.crs.to_wkt()
        raster = raster.rio.reproject(pointCRS)

    # Record the (possibly reprojected) coordinates in new columns so the
    # original xcol/ycol values are not overwritten
    xCOLOUT = xcol+'_PROJ'
    yCOLOUT = ycol+'_PROJ'
    points_df[xCOLOUT] = points_df['geometry'].x
    points_df[yCOLOUT] = points_df['geometry'].y

    zData = []
    zInd = []

    # Get unique well values to reduce sampling time (one sample per well)
    uniqueWells = points_df.drop_duplicates(subset=[well_id_col])
    if verbose:
        # Fixed: typo "idenfied" -> "identified"
        print(f"\t{uniqueWells.shape[0]} unique wells identified using {well_id_col} column")

    # Sample the raster at each unique well location (nearest grid cell)
    for i, row in uniqueWells.iterrows():
        zInd.append(i)
        zData.append([row[well_id_col], raster.sel(x=row[xCOLOUT], y=row[yCOLOUT], method='nearest').item()])

    inputtype = points_df.dtypes[well_id_col]
    wellZDF = pd.DataFrame(zData, columns=[well_id_col, new_col], index=pd.Index(zInd))

    # Fixed: the original called .astype(...) without assigning the result,
    # so the dtype conversion was silently discarded and the merge below could
    # mismatch on key dtype. Assign the converted column back.
    wellZDF[well_id_col] = wellZDF[well_id_col].astype(inputtype)

    # Merge each unique well's sampled value onto all of that well's intervals
    points_df = points_df.merge(wellZDF, how='left', on=well_id_col)
    return points_df
Function to sort dataframe by one or more columns.
Parameters
df:pandas.DataFrame- Dataframe to be sorted
sort_cols:strorlistofstr, default= ['API_NUMBER','TOP']- Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP']
remove_nans:bool, default= True- Whether or not to remove nans in the process, by default True
Returns
df_sorted:pandas.DataFrame- Sorted dataframe
Expand source code
def sort_dataframe(df, sort_cols=['API_NUMBER','TOP'], remove_nans=True): """Function to sort dataframe by one or more columns. Parameters ---------- df : pandas.DataFrame Dataframe to be sorted sort_cols : str or list of str, default = ['API_NUMBER','TOP'] Name(s) of columns by which to sort dataframe, by default ['API_NUMBER','TOP'] remove_nans : bool, default = True Whether or not to remove nans in the process, by default True Returns ------- df_sorted : pandas.DataFrame Sorted dataframe """ #Sort columns for better processing later df_sorted = df.sort_values(sort_cols) df_sorted.reset_index(inplace=True, drop=True) if remove_nans: df_sorted = df_sorted[pd.notna(df_sorted["LITHOLOGY"])] return df_sorted def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)-
Function to classify terms that have been specifically defined in the terms_df.
Parameters
df:pandas.DataFrame- Input dataframe with unclassified well descriptions.
terms_df:pandas.DataFrame- Dataframe containing the classifications
description_col:str, default='FORMATION'- Column name in df containing the well descriptions, by default 'FORMATION'.
terms_col:str, default='DESCRIPTION'- Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
verbose:bool, default=False- Whether to print out results, by default False.
Returns
df_Interps:pandas.DataFrame- Dataframe containing the well descriptions and their matched classifications.
Expand source code
def specific_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify terms that have been specifically defined in the terms_df.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe with unclassified well descriptions. Note: the
        description column is normalized in place (cast to str and casefolded).
    terms_df : pandas.DataFrame
        Dataframe containing the classifications. A copy is taken internally,
        so the caller's dataframe is no longer modified.
    description_col : str, default='FORMATION'
        Column name in df containing the well descriptions, by default 'FORMATION'.
    terms_col : str, default='DESCRIPTION'
        Column name in terms_df containing the classified descriptions, by default 'DESCRIPTION'.
    verbose : bool, default=False
        Whether to print out results, by default False.
    log : bool, default=False
        Whether to log results to log file.

    Returns
    -------
    df_Interps : pandas.DataFrame
        Dataframe containing the well descriptions and their matched classifications.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(specific_define, locals(), exclude_params=['df', 'terms_df'])

    # Fixed: work on a copy so the in-place rename/drop_duplicates below no
    # longer mutate the caller's terms dataframe.
    terms_df = terms_df.copy()

    if description_col != terms_col:
        terms_df.rename(columns={terms_col:description_col}, inplace=True)
        terms_col = description_col

    # Normalize both sides for a case-insensitive exact match
    df[description_col] = df[description_col].astype(str)
    terms_df[terms_col] = terms_df[terms_col].astype(str)
    df[description_col] = df[description_col].str.casefold()
    terms_df[terms_col] = terms_df[terms_col].str.casefold()

    # Keep only the most recent definition for any duplicated description
    terms_df.drop_duplicates(subset=terms_col, keep='last', inplace=True)
    terms_df.reset_index(drop=True, inplace=True)

    df_Interps = pd.merge(left=df, right=terms_df.set_index(terms_col), on=description_col, how='left')
    df_Interps.rename(columns={description_col:'FORMATION'}, inplace=True)
    df_Interps['BEDROCK_FLAG'] = df_Interps['LITHOLOGY'] == 'BEDROCK'

    if verbose:
        print('Classified well records using exact matches')
        # CLASS_FLAG is 1 for exact matches, so summing it equals the count
        numRecsClass = int(df_Interps[df_Interps['CLASS_FLAG']==1]['CLASS_FLAG'].sum())
        recsRemaining = int(df_Interps.shape[0]-numRecsClass)
        percRecsClass = round((numRecsClass/df_Interps.shape[0])*100,2)
        print("\t{} records classified using exact matches ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        # Fixed: remaining percentage is 100-percRecsClass (the original
        # printed 1-percRecsClass, which is wrong for a 0-100 percentage).
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemaining, round(100-percRecsClass, 2)))
    return df_Interps
Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified.
Parameters
df:pandas.DataFrame- Dataframe containing all the well descriptions
classification_col:str, default= 'CLASS_FLAG'- Name of column containing the classification flag, by default 'CLASS_FLAG'
verbose:bool, default= False- Whether to print results, by default False
log:bool, default= False- Whether to log results to log file
Returns
Two-item tupleofpandas.Dataframe- tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data.
Expand source code
def split_defined(df, classification_col='CLASS_FLAG', verbose=False, log=False): """Function to split dataframe with well descriptions into two dataframes based on whether a row has been classified. Parameters ---------- df : pandas.DataFrame Dataframe containing all the well descriptions classification_col : str, default = 'CLASS_FLAG' Name of column containing the classification flag, by default 'CLASS_FLAG' verbose : bool, default = False Whether to print results, by default False log : bool, default = False Whether to log results to log file Returns ------- Two-item tuple of pandas.Dataframe tuple[0] is dataframe containing classified data, tuple[1] is dataframe containing unclassified data. """ logger_function(log, locals(), inspect.currentframe().f_code.co_name) classifedDF= df[df[classification_col].notna()] #Already-classifed data searchDF = df[df[classification_col].isna()] #Unclassified data return classifedDF, searchDF def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)-
Function to classify descriptions according to starting substring.
Parameters
df:pandas.DataFrame- Dataframe containing all the well descriptions
terms_df:pandas.DataFrame- Dataframe containing all the startswith substrings to use for searching
description_col:str, default= 'FORMATION'- Name of column in df containing descriptions, by default 'FORMATION'
terms_col:str, default= 'DESCRIPTION'- Name of column in terms_df containing startswith substring to match with description_col, by default 'DESCRIPTION'
verbose:bool, default= False- Whether to print out results, by default False
log:bool, default= False- Whether to log results to log file
Returns
df:pandas.DataFrame- Dataframe containing the original data and new classifications
Expand source code
def start_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to starting substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions. Modified in place and
        also returned.
    terms_df : pandas.DataFrame
        Dataframe containing all the startswith substrings to use for searching.
        Assumed to have a default RangeIndex (rows are addressed with .loc[i]
        using the enumerate counter).
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'DESCRIPTION'
        Name of column in terms_df containing startswith substring to match
        with description_col, by default 'DESCRIPTION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = False
        Whether to log results to log file

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
        (CLASS_FLAG == 4 marks startswith matches)
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(start_define, locals(), exclude_params=['df', 'terms_df'])

    #For each start term, find all descriptions in df that start with it,
    #set the classification flag (4), and copy over the interpretation.
    for i, s in enumerate(terms_df[terms_col]):
        # Compute the match mask once per term instead of twice
        startswith_mask = df[description_col].str.startswith(s, na=False)
        df['CLASS_FLAG'].where(~startswith_mask, 4, inplace=True)
        df['LITHOLOGY'].where(~startswith_mask, terms_df.loc[i, 'LITHOLOGY'], inplace=True)

    # Fixed: the original `df['BEDROCK_FLAG'].loc[df["LITHOLOGY"]=='BEDROCK']`
    # only selected rows and never assigned, so the flag was never set.
    df.loc[df["LITHOLOGY"] == 'BEDROCK', 'BEDROCK_FLAG'] = True

    if verbose:
        # Fixed: count matching records instead of summing the flag value —
        # summing CLASS_FLAG==4 entries overstated the count by a factor of 4.
        numRecsClass = int((df['CLASS_FLAG'] == 4).sum())
        percRecsClass = round((numRecsClass/df.shape[0])*100, 2)
        recsRemaining = int(df.shape[0]-numRecsClass)
        print('Classified well records using initial substring matches')
        print("\t{} records classified using initial substring matches ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        # Fixed: remaining percentage is 100-percRecsClass, not 1-percRecsClass
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemaining, round(100-percRecsClass, 2)))
    return df
Expand source code
def verbose_print(func, local_variables, exclude_params=[]): print_list = ['\n'] sTime = datetime.datetime.now() print_list.append(f"{func.__name__}") print_list.append(f"\tStarted at {sTime}.") print_list.append(f"\tParameters:") for k, v in local_variables.items(): if k in inspect.signature(func).parameters: if 'kwargs' in k: print_list.append(f"\t\t{k}") for kk, vv in local_variables[k].items(): print_list.append(f"\t\t\t{kk}={vv}") elif k in exclude_params: print_list.append(f"\t\t{k}=<input object>") else: print_list.append(f"\t\t{k}={v}") for line in print_list: print(line) return print_list def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False)-
Function to classify descriptions according to any substring.
Parameters
df:pandas.DataFrame- Dataframe containing all the well descriptions
terms_df:pandas.DataFrame- Dataframe containing all the startswith substrings to use for searching
description_col:str, default= 'FORMATION'- Name of column in df containing descriptions, by default 'FORMATION'
terms_col:str, default= 'DESCRIPTION'- Name of column in terms_df containing substring to match with description_col, by default 'DESCRIPTION'
verbose:bool, default= False- Whether to print out results, by default False
log:bool, default= False- Whether to log results to log file
Returns
df:pandas.DataFrame- Dataframe containing the original data and new classifications
Expand source code
def wildcard_define(df, terms_df, description_col='FORMATION', terms_col='DESCRIPTION', verbose=False, log=False):
    """Function to classify descriptions according to any substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing all the well descriptions
    terms_df : pandas.DataFrame
        Dataframe containing all the substrings to search for anywhere within the descriptions
    description_col : str, default = 'FORMATION'
        Name of column in df containing descriptions, by default 'FORMATION'
    terms_col : str, default = 'DESCRIPTION'
        Name of column in terms_df containing the substring to match anywhere within description_col, by default 'DESCRIPTION'
    verbose : bool, default = False
        Whether to print out results, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df : pandas.DataFrame
        Dataframe containing the original data and new classifications
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(wildcard_define, locals(), exclude_params=['df', 'terms_df'])

    # For each search term, flag every record whose description contains the
    # term anywhere (case-insensitive, literal match) and record the
    # interpreted lithology. Assumes terms_df has a default RangeIndex so the
    # enumerate index lines up with .loc -- TODO confirm against callers.
    for i, s in enumerate(terms_df[terms_col]):
        # Compute the match mask once per term instead of twice
        matches = df[description_col].str.contains(s, case=False, regex=False, na=False)
        # CLASS_FLAG 5 marks records classified by a wildcard (any-substring) match.
        # .loc assignment replaces the deprecated chained Series.where(..., inplace=True).
        df.loc[matches, 'CLASS_FLAG'] = 5
        df.loc[matches, 'LITHOLOGY'] = terms_df.loc[i, 'LITHOLOGY']

    # NOTE(review): the original statement df['BEDROCK_FLAG'].loc[df["LITHOLOGY"] == 'BEDROCK']
    # had no effect (values selected but never assigned). Presumably BEDROCK_FLAG
    # should be set where LITHOLOGY == 'BEDROCK' -- confirm intent before enabling.

    if verbose:
        # Count matching records; the original summed the flag values (5 each),
        # inflating the reported count and percentage five-fold.
        numRecsClass = int((df['CLASS_FLAG'] == 5).sum())
        percRecsClass = round((numRecsClass / df.shape[0]) * 100, 2)
        recsRemaining = int(df.shape[0] - numRecsClass)
        print('Classified well records using any substring (wildcard) match')
        print("\t{} records classified using any substring match ({}% of unclassified data)".format(numRecsClass, percRecsClass))
        # Remaining percentage is 100 - perc (perc is already expressed in percent)
        print('\t{} records remain unclassified ({}% of unclassified data).'.format(recsRemaining, 100 - percRecsClass))
    return df
Add elevation to header data file.
Parameters
xyz:pandas.Dataframe- Contains elevation for the points
metadata:pandas dataframe- Header data file
verbose:bool, default= False- Whether to print status information, by default False
log:bool, default= False- Whether to log results to log file, by default False
Returns
headerXYZData:pandas.Dataframe- Header dataset merged to get elevation values
Expand source code
def xyz_metadata_merge(xyz, metadata, verbose=False, log=False):
    """Add elevation to header data file.

    Parameters
    ----------
    xyz : pandas.DataFrame
        Contains elevation for the points, keyed by 'API_NUMBER'
    metadata : pandas.DataFrame
        Header data file
    verbose : bool, default = False
        Whether to print status information, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    headerXYZData : pandas.DataFrame
        Header dataset merged to get elevation values
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(xyz_metadata_merge, locals(), exclude_params=['xyz'])

    # Left merge keeps every header record, even those without elevation data
    headerXYZData = metadata.merge(xyz, how='left', on='API_NUMBER')
    # Both inputs carry LATITUDE/LONGITUDE, so merge suffixes them _x/_y;
    # keep the coordinates from xyz ('_y') and drop the metadata copies ('_x')
    headerXYZData.drop(['LATITUDE_x', 'LONGITUDE_x'], axis=1, inplace=True)
    headerXYZData.rename({'LATITUDE_y': 'LATITUDE', 'LONGITUDE_y': 'LONGITUDE'}, axis=1, inplace=True)
    return headerXYZData