Declustering#

import pandas as pd
import geolime as geo
from pyproj import CRS
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

geo.Project().set_crs(CRS("EPSG:20350"))

Data loading#

dh = geo.read_file("../data/domained_drillholes.geo")

Declustering#

To avoid the biased difference that comes from preferential sampling concentrated in area of interest we need to decluster the data set. Samples should be located at regular gridded intervals or at random. The first thing to consider in the declustering step is the cell size. To choose the best celle size, it is great to assess the declustered mean for various celle size and choose a value that minimize or maximize the declustered mean. The algorithm used to decluster the data set will asses weight to every sample in order to have the minimun declustered mean.

Naive/declustered data#

Naive data refers to the raw data set, without any declustering while the declustered data refers to the one which have incorporated the declustering weights for each sample.

Moving window#

This method consists in counting the number of points (also called neighbors) ni in the volume Vi centered on the point (xi,yi,zi).

fig  = geo.plot(
    dh, 
    property="Fe_pct", 
    agg_method="mean",
    xaxis=dict(    
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"
    ),
    yaxis=dict(   
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"
    ) 
)
fig.update_layout(plot_bgcolor="rgba(0,0,0,0)")
fig
var = "Fe_pct"
rad_x = 1.
rad_y = 1.
rad_z = 1.

cell_size = 500
diam_x = rad_x * cell_size
diam_y = rad_y * cell_size
diam_z = rad_z * cell_size
declus_weight_mw = geo.moving_window_declus(
    obj=dh, 
    obj_region=None, 
    obj_attribute=var, 
    diam_x=diam_x, 
    diam_y=diam_y, 
    diam_z=diam_z, 
    geometry="BALL"
)

print("Weight mean: ", declus_weight_mw.mean())
Weight mean:  0.9999999999999998
counts_declus, bin_edges_decluse = np.histogram(dh.data(var, f"{var} == {var}"), bins=10, weights=declus_weight_mw)
counts_naive, bin_edges_naive = np.histogram(dh.data(var, f"{var} == {var}"), bins=10)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_edges_decluse,
        y=counts_declus,
        name="Declustered Values", 
        marker_color="#6495ED"
    )
)
fig.add_trace(
    go.Bar(
        x=bin_edges_naive,
        y=counts_naive,
        name="Naive Values",
        marker_color="#ff5733"
    )
)

fig.update_layout(
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1, # gap between bars of the same location coordinates
    barmode="group",        
    height=600,
    width=750.,
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        tickmode = "linear",
        tick0 = 0.,
        dtick = 5,
        title_text="Iron Grade"
    ),
    yaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        rangemode="nonnegative",
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey",
        title_text="Count"        
    )    
)

fig.show()

Cell declustering#

This method (the most used) adopts a predefined grid covering all data points to assign weights. The weights are calculated based on the number of points in each cell of the grid. The method consists in considering several sizes of cells H of the grid, and several origins O. For each grid, we count the number of points present in each cell

Declustered weights computation#

cell_size = 500
size_x = cell_size
size_y = cell_size
size_z = cell_size

nb_off = 50

declus_weight_cd = geo.cell_declustering(
    obj=dh, 
    obj_region=None, 
    obj_attribute=var, 
    size_x=size_x, 
    size_y=size_y, 
    size_z=size_z, 
    nb_off=nb_off
)

print("Weights mean: ", declus_weight_cd.mean())
Weights mean:  1.0

Visualization of declustered variable#

counts_declus, bin_edges_decluse = np.histogram(dh.data(var, f"{var} == {var}"), bins=10, weights=declus_weight_cd)
counts_naive, bin_edges_naive = np.histogram(dh.data(var, f"{var} == {var}"), bins=10)

fig = go.Figure()
fig.add_trace(
        go.Bar(
        x=bin_edges_decluse,
        y=counts_declus,
        name="Declustered Values",
        marker_color="#6495ED"
    )
)
fig.add_trace(
    go.Bar(
        x=bin_edges_naive,
        y=counts_naive,
        name="Naive Values",
        marker_color="#ff5733"
    )
)

fig.update_layout(
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1, # gap between bars of the same location coordinates
    barmode="group",        
    height=600,
    width=750,
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        tickmode = "linear",
        tick0 = 0.,
        dtick = 5,
        title_text="Iron concentration"    
    ),
    yaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey",
        title_text="Count"    
    )
)

fig.show()

Declustered mean for optimal cell size#

cmin = 0
cmax = 1000
inc = 4
nb_off = 25

x_cs_cd = np.arange(cmin, cmax, inc)
naive_mean = dh.data(var, f"{var} == {var}").mean()
var_mean_cd = [None] * len(x_cs_cd)

i = 0
for s in x_cs_cd[1:]:
    declus_weight_cd = geo.cell_declustering(
        obj=dh, 
        obj_region=None, 
        obj_attribute=var, 
        size_x=s, 
        size_y=s, 
        size_z=100, 
        nb_off=nb_off
    )
    var_mean_cd[i] = np.dot(declus_weight_cd, dh.data(var, f"{var} == {var}")) / np.sum(declus_weight_cd)
    i += 1
var_mean_cd[0] = naive_mean
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=x_cs_cd, 
        y=var_mean_cd,
        mode="lines",
        name="Declustered Mean"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_cs_cd, 
        y=np.full((len(x_cs_cd)),naive_mean),
        mode="lines",
        name="Naive Mean"
    )
)

fig.update_layout(    
    height=600,
    width=750,
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"    
    ),
    yaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        rangemode="tozero",
        title="Iron Mean Grade",
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"    
    )
)

fig.show()

Methods comparison#

Even though we have focused on two here, there are dozens of declustering methods, and the choice of which one to use is based on the appearance of the output histograms and the extent to which it changes the distribution

Declustered variable#

counts_declus_cd, bin_edges_declus_cd = np.histogram(dh.data(var, f"{var} == {var}"), bins=30, weights=declus_weight_cd)
counts_declus_mw, bin_edges_declus_mw = np.histogram(dh.data(var, f"{var} == {var}"), bins=30, weights=declus_weight_mw)
counts_naive, bin_edges_naive = np.histogram(dh.data(var, f"{var} == {var}"), bins=30)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_edges_declus_mw,
        y=counts_declus_mw,
        name="Declustered Values - Moving Window",
        marker_color="#bd2d17",
        legendgroup="group",
        legendgrouptitle_text="Fe Distribution"
    )
)
fig.add_trace(
    go.Bar(
        x=bin_edges_declus_cd,
        y=counts_declus_cd,
        name="Declustered Values - Cell Declustering",
        marker_color="#17bd2d",
        legendgroup="group",
        legendgrouptitle_text="Histograms"
    )
)
fig.add_trace(
    go.Bar(
        x=bin_edges_naive,
        y=counts_naive,
        name="Naive Values",
        marker_color="#2d17bd",
        legendgroup="group"
    )
)
fig.add_trace(
    go.Scatter(
        x=[0], 
        y=[50], 
        mode="lines", 
        line=dict(color="#bd2d17", width=2, dash="dash"),
        name="Moving Window Declustered Fe Mean", 
        showlegend=True,
        legendgroup="group2",
        legendgrouptitle_text="Mean Values"             
    )
)
fig.add_trace(
    go.Scatter(
        x=[0], 
        y=[50], 
        mode="lines", 
        line=dict(color="#17bd2d", width=2, dash="dash"),
        name="Cell Declustered Fe Mean", 
        showlegend=True,
        legendgroup="group2"    
    )
)
fig.add_trace(
    go.Scatter(
        x=[0], 
        y=[50], 
        mode="lines", 
        line=dict(color="#2d17bd", width=2, dash="dash"),
        name="Naive Fe Mean", 
        showlegend=True,
        legendgroup="group2"
      
    )
)

fig.add_vline(
    x=np.average(dh.data(var, f"{var} == {var}"), weights=declus_weight_mw), 
    line_width=3, 
    line_dash="dash", 
    line_color="#bd2d17"
)

fig.add_vline(
    x=np.average(dh.data(var, f"{var} == {var}"), weights=declus_weight_cd), 
    line_width=3, 
    line_dash="dash", 
    line_color="#17bd2d"
    
)
fig.add_vline(
    x=np.average(dh.data(var, f"{var} == {var}")), 
    line_width=3, 
    line_dash="dash", 
    line_color="#2d17bd"
)

fig.update_layout(
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1, # gap between bars of the same location coordinates
    barmode="group",        
    height=600,
    width=750,
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        title_text="Iron concentration"
    ),
    yaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey",
        title_text="Count"
    )
)

fig.show()
np.average(dh.data(var, f"{var} == {var}"))
31.597339782345827
np.average(dh.data(var, f"{var} == {var}"), weights=declus_weight_cd)
24.49593882391066

Declustered means#

x_cs = np.arange(10,1000,20)
var_mean_cd = [None]*len(x_cs)
naive_mean = dh.data(var, f"{var} == {var}").mean()
declus_weight_mw_ellipse = [None]*len(x_cs)
var_mean_ellipse = [None]*len(x_cs)
declus_weight_mw_parallelepiped = [None]*len(x_cs)
var_mean_parallelepiped = [None]*len(x_cs)

i = 0
for s in x_cs[1:]:
    declus_weight_cd = geo.cell_declustering(
        obj=dh, 
        obj_region=None, 
        obj_attribute=var, 
        size_x=s, 
        size_y=s, 
        size_z=100, 
        nb_off=nb_off
    ) 
    var_mean_cd[i] = np.dot(declus_weight_cd, dh.data(var, f"{var} == {var}")) / np.sum(declus_weight_cd)
    i += 1
var_mean_cd[0] = naive_mean

i = 0
for s in x_cs:
    declus_weight_mw_ellipse[i] = geo.moving_window_declus(
        obj=dh, 
        obj_region=None, 
        obj_attribute=var, 
        diam_x=s, 
        diam_y=s, 
        diam_z=100,
        geometry="BALL"
    )
    var_mean_ellipse[i] = np.dot(declus_weight_mw_ellipse[i], dh.data(var, f"{var} == {var}"))/np.sum(declus_weight_mw_ellipse[i])
    declus_weight_mw_parallelepiped[i] = geo.moving_window_declus(
        obj=dh, 
        obj_region=None, 
        obj_attribute=var, 
        diam_x=s, 
        diam_y=s, 
        diam_z=100, 
        geometry="PARALLELEPIPED"
    )
    var_mean_parallelepiped[i] = np.dot(declus_weight_mw_parallelepiped[i], dh.data(var, f"{var} == {var}"))/np.sum(declus_weight_mw_parallelepiped[i])
    i += 1

var_mean_cd[0] = naive_mean
var_mean_ellipse[0] = naive_mean
var_mean_parallelepiped[0] = naive_mean
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=x_cs,
        y=var_mean_ellipse,
        mode="lines",
        name="Declustered Mean (Ellipsoid)"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_cs, 
        y=var_mean_parallelepiped,
        mode="lines",
        name="Declustered Mean (Cuboid)"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_cs, 
        y=var_mean_cd,
        mode="lines",
        name="Declustered Mean (Cell declustering)"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_cs, 
        y=np.full((len(x_cs)),naive_mean),
        mode="lines",
        name="Naive Mean"
    )
)

fig.update_layout(    
    height=600,
    width=750,
    plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"
    ),
    yaxis=dict(
        showline=True,
        linewidth=2,
        linecolor="black",
        mirror=True,
        rangemode="tozero",
        title="Iron Mean Grade",
        showgrid=True, 
        gridwidth=.2, 
        gridcolor="lightgrey"    
    )
)

fig.show()

Declustered data points#

dh.set_property_expr(name="declus_weight_cd", expr="0")
dh.set_property_expr(name="declus_weight_mw", expr="0")

dh.set_property_value(name="declus_weight_cd", data=declus_weight_cd, region="Fe_pct == Fe_pct")
dh.set_property_value(name="declus_weight_mw", data=declus_weight_mw, region="Fe_pct == Fe_pct")
fig = make_subplots(
    rows=1, 
    cols=2, 
    shared_yaxes=True,
    subplot_titles=("Moving Window Declustering","Cell Declustering")
)

fig.add_trace(
    geo.plot(
        dh, 
        "declus_weight_mw", 
        "max"
    ).update_traces(
        marker_showscale=False,
        marker_coloraxis="coloraxis",
        marker_size=dh.aggregate(["declus_weight_mw"], ["mean"])["declus_weight_mw_mean"] * 10

    )["data"][0],
    row=1, 
    col=1
)

fig.add_trace(
    geo.plot(
        dh, 
        "declus_weight_cd", "max"
    ).update_traces(
        marker_showscale=False, 
        marker_coloraxis="coloraxis", 
        marker_size=dh.aggregate(["declus_weight_cd"], ["mean"])["declus_weight_cd_mean"] * 10
    )["data"][0],
    row=1, 
    col=2
)

fig.update_layout(
    height=900, 
    width=750, 
    plot_bgcolor="rgba(0,0,0,0)",
    coloraxis=dict(colorscale="viridis"), 
    showlegend=False,
    coloraxis_colorbar=dict(
        title="Weight Value",
    )
)
fig.update_xaxes(
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=True, 
    gridwidth=.2, 
    gridcolor="lightgrey"
)
fig.update_yaxes(
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=True, 
    gridwidth=.2, 
    gridcolor="lightgrey"
)

fig.show()

As we can see on the previous figure, the declustering method is the one that gives us the most clearly bimodal gives us the most clearly bimodal distribution, which is preferable for the domaining.