Region based classification¶
- Memorizing all regions is cumbersome
- $R_1 = \{(a_1, b_1, c_1, d_1), y_1\}$
- $R_2 = \{(a_2, b_2, c_2, d_2), y_2\}$
- ...
- Classification requires checking all regions
for region Ri in all regions:
    if x ∈ Ri:
        return yi
Tree based equivalent¶
Tree representation¶
Decision Tree¶
A decision tree is a hierarchical classifier with a tree structure where each node partitions the feature space along a specified component (Breiman et al, 1984).
Leo Breiman (1928 - 2005)
Growing the tree¶
- Tree($\mathcal{S} = \{\mathbf{x}_i, y_i\}$):
- $\quad$if $\vert\mathcal{S}\vert < T$:
- $\quad\quad$return $\text{Leaf}(\text{argmax}_c\sum_i 1_{y_i=c})$
- $\quad d^\star, \theta^\star = \text{argmax}_{d,\theta} \text{Gain}(\mathcal{S}, d, \theta)$
- $\quad T_1 = \text{Tree}(\{(\mathbf{x}_i, y_i)\in\mathcal{S} \,\vert\, \mathbf{x}_i[d^\star] < \theta^\star\})$
- $\quad T_2 = \text{Tree}(\{(\mathbf{x}_i, y_i)\in\mathcal{S} \,\vert\, \mathbf{x}_i[d^\star] \geq \theta^\star\})$
- $\quad$return $\text{Node}(d^\star, \theta^\star, T_1, T_2)$
Gain measure¶
Proportion of class $k$ in $\mathcal{S}$
$$ p_k(\mathcal{S}) = \frac{1}{\vert\mathcal{S}\vert} \sum_{y_i = k} 1 $$
Prediction for $\mathcal{S}$
$$f(\mathcal{S}) = \text{argmax}_k p_k(\mathcal{S}) $$
0-1 loss
$$C(\mathcal{S}) = \frac{1}{\vert\mathcal{S}\vert}\sum_i \left(1 - \delta(y_i, f(\mathcal{S}))\right) = 1 - p_{f(\mathcal{S})}(\mathcal{S})$$
How much does the error decrease with the split on component $d$ at threshold $\theta$ that produces subsets $\mathcal{S}_1$ and $\mathcal{S}_2$ (with $N = \vert\mathcal{S}\vert$ and $N_i = \vert\mathcal{S}_i\vert$):
$$ \text{Gain}(\mathcal{S}, d, \theta) = C(\mathcal{S}) - \left[ \frac{N_1}{N}C(\mathcal{S}_1) + \frac{N_2}{N}C(\mathcal{S}_2) \right] $$
Choose $d, \theta$ with maximal gain
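As an illustration (numbers chosen arbitrarily), take $\vert\mathcal{S}\vert = 10$ with 6 samples of class A and 4 of class B, split into $\mathcal{S}_1$ (5 A, 1 B) and $\mathcal{S}_2$ (1 A, 3 B). With the 0-1 loss:
$$ C(\mathcal{S}) = 1 - \tfrac{6}{10} = 0.4,\quad C(\mathcal{S}_1) = 1 - \tfrac{5}{6} = \tfrac{1}{6},\quad C(\mathcal{S}_2) = 1 - \tfrac{3}{4} = \tfrac{1}{4} $$
$$ \text{Gain}(\mathcal{S}, d, \theta) = 0.4 - \left(\tfrac{6}{10}\cdot\tfrac{1}{6} + \tfrac{4}{10}\cdot\tfrac{1}{4}\right) = 0.4 - 0.2 = 0.2 $$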
Information Gain¶
Other popular gain measures:
- Entropy
$$C(\mathcal{S}) = -\sum_k p_k(\mathcal{S})\log p_k(\mathcal{S}) $$
- Gini index
$$C(\mathcal{S}) = \sum_k p_k(\mathcal{S})(1- p_k(\mathcal{S}))$$
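The measures above differ only in the impurity $C$. A minimal sketch (function names are my own) comparing the three impurities on a vector of class proportions, reusing jax.scipy.special.entr as the notebook's entropyGain below does:
import jax.numpy as jnp
from jax.scipy.special import entr         # entr(x) = -x log x, with entr(0) = 0
def zero_one(p):
    # misclassification impurity: 1 - max_k p_k
    return 1. - p.max()
def entropy(p):
    # Shannon entropy of the class proportions
    return entr(p).sum()
def gini(p):
    # Gini index: sum_k p_k (1 - p_k)
    return (p * (1. - p)).sum()
p = jnp.array([0.6, 0.4])                  # class proportions in a node
print(zero_one(p), entropy(p), gini(p))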
Small example¶
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
X = np.random.rand(75, 2)                  # 75 random points in the unit square
y = 1.*(X[:,1] > X[:,0])                   # label: which side of the diagonal
plt.scatter(X[:,0], X[:,1], c=y)
import jax
import jax.numpy as jnp
import jax.scipy.special
def entropyGain(X, y, d, theta):
    # Information gain of splitting dimension d at threshold theta (binary labels in {0, 1})
    if len(y) <= 1:
        return 0.
    p = y.mean()
    e = jax.scipy.special.entr(p) + jax.scipy.special.entr(1 - p)    # binary entropy of the parent
    l = 1.*(X[:,d] < theta)                                          # left-child indicator
    p1 = (y * l).sum()/(l.sum() + 1e-12)
    e1 = jax.scipy.special.entr(p1) + jax.scipy.special.entr(1 - p1)
    r = 1 - l                                                        # right-child indicator
    p2 = (y * r).sum()/(r.sum() + 1e-12)
    e2 = jax.scipy.special.entr(p2) + jax.scipy.special.entr(1 - p2)
    return e - (l.sum()*e1 + r.sum()*e2)/len(y)
def findBestTheta(X, y, d, gain=entropyGain):
    # Exhaustive search of the best threshold on dimension d
    best_g = -1.
    theta = None
    xx = jnp.sort(X[:,d]) - 1e-7           # candidate thresholds just below each sample
    for t in xx:
        g = gain(X, y, d, t)
        if g > best_g:
            best_g = g
            theta = t
    if theta is None:
        print('theta failure!!')
    return theta, best_g
def findBestDTheta(X, y, gain=entropyGain):
    # Best (dimension, threshold) pair over all dimensions
    best_d = None
    theta = None
    best_g = -1.
    for d in range(X.shape[1]):
        t, g = findBestTheta(X, y, d, gain)
        if g > best_g:
            best_d = d
            theta = t
            best_g = g
    if best_d is None:
        print('d failure!!')
    return best_d, theta
class BinaryClassificationTree():
    def __init__(self, X, y, gain=entropyGain, min_size=1):
        p = y.mean()
        # Stop when the node is too small or pure (entr(p) == 0 iff p is 0 or 1)
        if len(y) <= min_size or jax.scipy.special.entr(p) == 0.:
            self.label = 1.*(p >= 0.5)
        else:
            self.label = None
            self.d, self.theta = findBestDTheta(X, y, gain)
            ind = 1.*(X[:,self.d] < self.theta)
            if ind.sum() == 0 or ind.sum() == len(y):
                print('single split !!! {} {} {}'.format(ind, y, X))
            ind1 = ind.nonzero()
            X1, y1 = X[ind1], y[ind1]
            ind2 = (1 - ind).nonzero()
            X2, y2 = X[ind2], y[ind2]
            self.T1 = BinaryClassificationTree(X1, y1, gain=gain, min_size=min_size)
            self.T2 = BinaryClassificationTree(X2, y2, gain=gain, min_size=min_size)
    def __call__(self, X):
        if self.label is not None:
            return self.label * jnp.ones(len(X))
        # Route each sample to the left or right subtree
        return jnp.concatenate([self.T1([x]) if x[self.d] < self.theta else self.T2([x]) for x in X])
T = BinaryClassificationTree(X, y)
t = 50; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
Decision Trees¶
- Interpretable
- Fast
- Handle categorical data
But
- Poor accuracy
- Unstable
- Need a lot of examples
- Finding the optimal tree is hard, growing is greedy
Unstable¶
y[6] = 1 - y[6]
T = BinaryClassificationTree(X, y)
t = 50; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
Generalization¶
Theorem: For a tree of $n$ nodes in dimension $d$ and for $m$ samples, with probability at least $1-\delta$,
$$ R \leq R_e + \sqrt{\frac{(n+1)\log_2(d+3) + \log_2(2/\delta)}{2m}}$$
Exercise: What is the VC dimension of decision tree over $\{0, 1\}^d$ ?
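As a purely illustrative numerical check of the bound (values chosen arbitrarily), plugging $n = 15$ nodes, $d = 2$, $m = 75$ and $\delta = 0.05$ into the confidence term:
import numpy as np
def tree_bound_term(n, d, m, delta):
    # confidence term of the generalization bound above
    return np.sqrt(((n + 1) * np.log2(d + 3) + np.log2(2 / delta)) / (2 * m))
print(tree_bound_term(n=15, d=2, m=75, delta=0.05))   # about 0.53: the bound is loose for small m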
Random Forest¶
Overcome DT instabilities by averaging $B$ randomized trees (Breiman, 2001)
- Randomized training set $\mathcal{A}_b \subset \mathcal{A}$
- Randomized components $\mathbf{x} \in \mathcal{X}_b \subset \mathcal{X}$
Final decision by majority vote (see the sketch below): $f(\mathbf{x}) = \text{argmax}_c \left[\sum_b f_b(\mathbf{x})\right]_c$
- Average value for regression
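A minimal sketch of the majority vote (function name is mine; the RandomForest class implemented below uses a soft vote, averaging the tree outputs, instead):
import jax.numpy as jnp
def majority_vote(votes, n_classes):
    # votes[b, i] = class predicted by tree b for sample i
    counts = jnp.stack([(votes == c).sum(axis=0) for c in range(n_classes)])
    return counts.argmax(axis=0)           # argmax_c of the per-class vote counts
votes = jnp.array([[0, 1, 1], [1, 1, 0], [1, 1, 0]])   # 3 trees, 3 samples
print(majority_vote(votes, 2))                         # [1 1 0]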
Limiting overfitting¶
Ensemble of classifiers $h_1, \dots, h_K$; define the margin function
$$ mg(\mathbf{x}, y) = \text{avg}_k \mathbb{1}[h_k(\mathbf{x}) = y] - \max_{j\neq y}\text{avg}_k \mathbb{1}[h_k(\mathbf{x}) = j] $$ (difference between true class vote and max false class vote)
Generalization error
$$ R = \mathbb{P}[ mg(\mathbf{x}, y) < 0 ] $$
Random forest: classifiers drawn i.i.d. from a distribution of parameters $\Theta$
Theorem (Breiman, 2001): As the number of trees increases, almost surely over the sequences $\Theta_1, \Theta_2, \dots$, the generalization error $R$ converges to
$$ \mathbb{P} \left[ \mathbb{P}_\Theta[ h_\theta(\mathbf{x}) = y ] - \max_{j\neq y}\mathbb{P}_\Theta[h_\theta(\mathbf{x}) = j] < 0\right] $$
R does not increase as the number of trees grows, limiting overfitting
class RandomForest():
    def __init__(self, X, y, nb_tree=25, p=0.5):
        # Train nb_tree trees, each on a random subset containing a fraction p of the samples
        self.trees = []
        n = len(y)
        k = int(p*n)
        for b in range(nb_tree):
            i = np.random.permutation(n)
            Xb = X[i[0:k], ...]
            yb = y[i[0:k]]
            self.trees.append(BinaryClassificationTree(Xb, yb))
    def __call__(self, X):
        # Soft vote: average of the individual tree predictions
        y = []
        for DT in self.trees:
            y.append(DT(X))
        return 1.*(jnp.array(y).mean(axis=0))
T = RandomForest(X, y)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -1.*(y_pred>0.5), shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
T = RandomForest(X, y, nb_tree=100, p=0.2)
t = 50; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
t = 50; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -1.*(y_pred>0.5), shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
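To connect with the margin analysis of the Limiting overfitting section above, here is a minimal sketch (the helper is my own, not part of the original notebook) estimating the empirical margin of the forest T fitted above on the training data; for two classes in {0, 1} the margin is twice the fraction of trees voting for the true class, minus one:
def empirical_margin(forest, X, y):
    # votes[k, i] = prediction of tree k on sample i (labels in {0, 1})
    votes = jnp.array([tree(X) for tree in forest.trees])
    correct = (votes == y).mean(axis=0)    # avg_k 1[h_k(x_i) = y_i]
    return 2. * correct - 1.               # two-class margin mg(x_i, y_i)
mg = empirical_margin(T, X, y)
print((mg < 0).mean())                     # empirical estimate of P[mg < 0]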
Reducing the variance¶
What is the variance of the average of $B$ random variables, each of variance $\sigma^2$, that are correlated by $\rho$?
Correlation
$$\frac{1}{\sigma^2}\mathbb{E}\left[ (x_i - m)(x_j - m) \right] = \rho \geq 0, \quad m = \mathbb{E}[x_i] $$
$$ \begin{aligned} \mathbb{E}[x_i^2] &= \sigma^2 + m^2\\ \mathbb{E}[x_ix_j] &= \rho \sigma^2 + m^2 \end{aligned} $$
$$ Var\left(\frac{\sum_i x_i}{B}\right) = \frac{1}{B^2}Var\left(\sum_i x_i \right)$$
$$ = \frac{1}{B^2} \left( \mathbb{E}\left[ \left(\sum_ix_i \right)^2 \right] - \mathbb{E}\left[\sum_ix_i \right]^2\right)$$
$$=\frac{1}{B^2} \left( \sum_{i,j} \mathbb{E}\left[ x_i x_j \right] - \left( \sum_i \mathbb{E}\left[ x_i \right] \right)^2 \right)$$
$$=\frac{1}{B^2} \left(B(\sigma^2+m^2) + (B^2-B)(\rho\sigma^2 + m^2) - B^2m^2 \right) $$
$$=\frac{\sigma^2 + m^2}{B} + \rho\sigma^2 + m^2 - \frac{\rho\sigma^2 + m^2}{B} - m^2$$
$$=\frac{\sigma^2}{B} + \rho\sigma^2 - \frac{\rho\sigma^2}{B}$$
$$= \rho\sigma^2 + \frac{1-\rho}{B}\sigma^2$$
$$< \sigma^2 \text{ iff } \rho < 1 \text{ and } B > 1$$
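A quick Monte Carlo sanity check of this formula (a sketch with arbitrary values of $\rho$, $\sigma$ and $B$); correlated variables are built by mixing a shared and an independent Gaussian component so that the pairwise correlation is exactly $\rho$:
import numpy as np
rho, sigma, B, n_trials = 0.3, 2.0, 10, 200000
rng = np.random.default_rng(0)
shared = rng.standard_normal((n_trials, 1))          # common component
noise = rng.standard_normal((n_trials, B))           # independent components
x = sigma * (np.sqrt(rho) * shared + np.sqrt(1 - rho) * noise)
print(x.mean(axis=1).var())                          # empirical variance of the average
print(rho * sigma**2 + (1 - rho) / B * sigma**2)     # formula derived above (1.48 here)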
Ensemble learning¶
ERM principle subject to bias-variance trade-off
- Simple model: low estimation error, large approximation error
- Complex model: high estimation error, low approximation error
Ensemble idea: aggregate many simple models
- Each model has low estimation error
- Aggregation has low approximation error
Bagging¶
Assume $M$ independent predictors $h_m(\mathbf{x})$
- Trained on different features (e.g., colors and textures)
- Trained on different samples (e.g., different images)
Bagging aggregates them as $$ h(\mathbf{x}) = \sum_m h_m(\mathbf{x}) $$ which corresponds to a voting strategy
Example¶
Pick a random axis, then select the optimal threshold on it
class RandomAxisClassifier():
    def __init__(self, X, y, d):
        # Decision stump on a fixed dimension d with the best threshold
        self.d = d
        self.theta, _ = findBestTheta(X, y, d)
        i = jnp.where(X[:,d] < self.theta)
        self.y = (y[i].mean() > 0.5)       # polarity: majority label on the left side
    def __call__(self, X):
        return 1.*(X[:,self.d] < self.theta) if self.y else 1. - (X[:,self.d] < self.theta)
class BaggingClassifier():
    def __init__(self, X, y, nb_cls, p=0.5):
        # nb_cls random-axis classifiers, each trained on a fraction p of the samples
        self.cls = []
        n = len(y)
        k = int(p*n)
        for b in range(nb_cls):
            i = np.random.permutation(n)
            Xb = X[i[0:k], ...]
            yb = y[i[0:k]]
            self.cls.append(RandomAxisClassifier(Xb, yb, np.random.randint(2)))
    def __call__(self, X):
        # Average vote of the individual classifiers
        y = []
        for c in self.cls:
            y.append(c(X))
        return jnp.array(y).mean(axis=0)
T = BaggingClassifier(X, y, 1)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
T = BaggingClassifier(X, y, 10)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
T = BaggingClassifier(X, y, 100, p=0.2)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
Boosting¶
In bagging, each predictor has the same weight
Some decisions are redundant
Some decisions are bad and we know they are bad
Can we define weights that better reflect each classifier strong points?
Adaboost (Freund et al, 1996)¶
Key ideas:
- Weight each sample $\mathbf{x}_i$ with $w_i$
- Train a classifier $g$ with weighted error $w_i I(y_i \neq g(\mathbf{x}_i))$
- Update weights such that samples with high error have higher weights
- Train new classifier $f_m$ with updated weighted error
- Combine both classifiers: $g \leftarrow g + \beta f_m$
- Iterate until combined classifier is good enough
Exponential loss function¶
$$ L(y, f(\mathbf{x})) = e^{-yf(\mathbf{x})} $$ Given a classifier $f_{m-1}$, we want to add a new classifier that reduces the error
We have to solve $$ \beta_m, G_m = \arg\min_{\beta, G} \sum_i\exp[-y_i (f_{m-1}(\mathbf{x}_i) + \beta G(\mathbf{x}_i))] $$
Independent updates¶
$$ \beta_m, G_m = \arg\min_{\beta, G} \sum_i\exp[-y_i (f_{m-1}(\mathbf{x}_i) + \beta G(\mathbf{x}_i))] $$ Can be rewritten as $$ \beta_m, G_m = \arg\min_{\beta, G} \sum_i w_i\exp[-y_i \beta G(\mathbf{x}_i)] $$ with $$ w_i = \exp[-y_i f_{m-1}(\mathbf{x}_i)] $$
Solving for $G$¶
Remark that given $\beta > 0$ $$ \arg\min_G\sum_i w_i\exp[-y_i \beta G(\mathbf{x}_i)] $$ is obtained by $$ \arg\min_G \sum_i w_i I(y_i \neq G(\mathbf{x}_i)) $$ because $$ \sum_i w_i\exp[-y_i \beta G(\mathbf{x}_i)] = e^{-\beta} \sum_{y_i = G(\mathbf{x}_i)} w_i + e^{\beta} \sum_{y_i \neq G(\mathbf{x}_i)} w_i $$
Solving for $\beta$¶
Given $G$, we have to solve $$ \arg\min_\beta\sum_i w_i\exp[-y_i \beta G(\mathbf{x}_i)] $$ Remark that $$ \begin{aligned} \sum_i w_i\exp[-y_i \beta G(\mathbf{x}_i)] &= \left(e^\beta - e^{-\beta} \right)\sum_iw_i I(y_i \neq G(\mathbf{x}_i)) + e^{-\beta}\sum_i w_i \end{aligned} $$ Thus $$ \beta = \frac{1}{2} \log \frac{1 - err_m}{err_m},\quad err_m = \frac{\sum_i w_i I(y_i \neq G(\mathbf{x}_i))}{\sum_i w_i} $$
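For completeness, $\beta$ is obtained by setting the derivative of the last expression to zero (an added intermediate step):
$$ \left(e^\beta + e^{-\beta}\right)\sum_i w_i I(y_i \neq G(\mathbf{x}_i)) - e^{-\beta}\sum_i w_i = 0 $$
Dividing by $e^{-\beta}\sum_i w_i$ gives $(e^{2\beta} + 1)\,err_m = 1$, hence $e^{2\beta} = \frac{1 - err_m}{err_m}$.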
Adaboost¶
- Initialize $\forall i, w_i = 1/N$
- For $m = 1 \ldots M$
- $\quad$Fit classifier $G_m(\mathbf{x})$ to training sample using $w_i$
- $\quad$ Compute \begin{align*} err_m = \frac{\sum_i w_i I(y_i \neq G_m(\mathbf{x}_i))}{\sum_i w_i} \end{align*}
- $\quad$ Compute $\beta_m = \log ((1-err_m)/err_m)$
- $\quad$ Set $\forall i, w_i \leftarrow w_i e^{\beta_m I(y_i \neq G_m(\mathbf{x}_i))}$
- Output $G(\mathbf{x}) = \sum_m \beta_m G_m(\mathbf{x})$
def weightedError(w, y_pred, y_true):
    # Weighted 0-1 error
    return (w * (y_pred != y_true)).sum()/w.sum()

def weightedFindBestTheta(w, X, y, d):
    # Best threshold and polarity on dimension d for the weighted error
    err = w.sum() + 1
    theta = None
    p = None
    xx = jnp.sort(X[:,d]) - 1e-7
    for t in xx:
        e = weightedError(w, 1.*(X[:,d] < t), y)
        if e < err:
            err = e
            theta = t
            p = True
        e = weightedError(w, 1. - (X[:,d] < t), y)
        if e < err:
            err = e
            theta = t
            p = False
    if theta is None:
        print('theta failure!!')
    return theta, p, err
class WeightedRandomAxisClassifier():
    def __init__(self, w, X, y, d):
        # Decision stump on dimension d minimizing the weighted error
        self.d = d
        self.theta, self.y, self.err = weightedFindBestTheta(w, X, y, d)
    def __call__(self, X):
        return 1.*(X[:,self.d] < self.theta) if self.y else 1. - (X[:,self.d] < self.theta)
class AdaBoost():
    def __init__(self, X, y, nb_cls=5):
        n = len(y)
        w = jnp.ones(n)/n                        # uniform initial sample weights
        self.beta = []
        self.cls = []
        for m in range(nb_cls):
            # Fit a stump per dimension, keep the one with the lowest weighted error
            err_m = []
            cls_m = []
            for d in range(X.shape[1]):
                c = WeightedRandomAxisClassifier(w, X, y, d)
                cls_m.append(c)
                err_m.append(c.err)
            a = int(jnp.argmin(jnp.array(err_m)))
            c = cls_m[a]
            e = c.err
            beta = jnp.log((1-e)/e)              # classifier weight
            w = w * jnp.exp(beta * (c(X) != y))  # up-weight misclassified samples
            self.beta.append(beta)
            self.cls.append(c)
    def __call__(self, X):
        y = []
        for i, c in enumerate(self.cls):
            y.append(self.beta[i] * c(X))
        return jnp.array(y).mean(axis=0)
T = AdaBoost(X, y, 2)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=(y>0.5))
T = AdaBoost(X, y, 10)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=y)
T = AdaBoost(X, y, 50)
t = 20; tx = jnp.linspace(0, 1, t); ty = jnp.linspace(0, 1, t)
xv, yv = jnp.meshgrid(tx, ty, sparse=True); xv = xv.squeeze(); yv = yv.squeeze()
xx = jnp.array([[xx, yy] for yy in yv for xx in xv])
y_pred = jnp.array(T(xx)).reshape(t, t)
cmap = plt.get_cmap('PiYG')
levels=jnp.linspace(-1.5, .5, 10)
norm = matplotlib.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=True)
plt.pcolormesh(xv, yv, -y_pred, shading='nearest', norm=norm);
plt.scatter(X[:,0], X[:,1], c=(y>0.5))
Gradient Tree Boosting¶
Given a classifier $\hat{y}_i = \sum_k f_k(\mathbf{x}_i)$ built as a sum of trees, consider the objective
$$\mathcal{L} = \sum_i l(y_i, \hat{y}_i) + \sum_k\Omega(f_k), \quad \Omega(f) = \gamma T + \frac{1}{2}\lambda\|w\|^2 $$
where $T$ is the number of leaves of $f$ and $w$ its vector of leaf weights
Second order approximation when adding a new tree $f_t$:
$$\mathcal{L}^t \approx \sum_i\left[l(y_i, \hat{y}_i) + g_i f_t(\mathbf{x}_i) + \frac{1}{2}h_i f_t^2(\mathbf{x}_i)\right] +\Omega(f_t)$$
$g_i = \partial_{\hat{y}_i}l(y_i, \hat{y}_i)$, $h_i = \partial_{\hat{y}_i}^2l(y_i, \hat{y}_i)$
Remove constant term and define $I_j = \{i|q(x_i) = j\}$ the set of samples in leaf $j$
$$\tilde{\mathcal{L}} = \sum_j^T\left[w_j\sum_{i\in I_j}g_i +\frac{w_j^2}{2}(\lambda + \sum_{i\in I_j}h_i)\right] + \gamma T$$
Optimal weight $$w_j^\star = -\frac{\sum_{i\in I_j}g_i}{\sum_{i\in I_j} h_i + \lambda}$$
Optimal objective value $$\mathcal{L}^\star = -\frac{1}{2}\sum_j^T\frac{(\sum_{i\in I_j}g_i)^2}{\sum_{i\in I_j}h_i + \lambda} +\gamma T $$
Provides a criterion for splitting nodes:
- Letting $I = I_L \cup I_R$
$$ \mathcal{L}_\text{split} = \frac{1}{2}\left[\frac{(\sum_{i\in I_L}g_i)^2}{\sum_{i\in I_L} h_i + \lambda} + \frac{(\sum_{i\in I_R}g_i)^2}{\sum_{i\in I_R} h_i + \lambda} - \frac{(\sum_{i\in I}g_i)^2}{\sum_{i\in I} h_i + \lambda} \right] - \gamma$$
- if $<0$, do not split
- can be $<0$ because of $\gamma$ (regularisation)
- In practice, randomizing features and samples allows very large trees
- taking a shrunk weight $\eta w_j^\star$ (learning rate $\eta$) leaves more room for subsequent splits.
Very efficient implementations (e.g., XGBoost, (Chen et al, 2016))
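As a small sketch of the formulas above (not XGBoost's actual implementation; names and the toy example are mine), the optimal leaf weight and the split criterion can be computed directly from the per-sample gradients and Hessians:
import jax.numpy as jnp
def leaf_weight(g, h, lam):
    # w* = -sum(g) / (sum(h) + lambda)
    return -g.sum() / (h.sum() + lam)
def leaf_score(g, h, lam):
    # (sum g)^2 / (sum h + lambda), the per-leaf term of the objective
    return g.sum() ** 2 / (h.sum() + lam)
def split_gain(g, h, left_mask, lam=1.0, gamma=0.0):
    # L_split above: a positive value means the split is worth taking
    gl, hl = g[left_mask], h[left_mask]
    gr, hr = g[~left_mask], h[~left_mask]
    return 0.5 * (leaf_score(gl, hl, lam) + leaf_score(gr, hr, lam) - leaf_score(g, h, lam)) - gamma
# toy node with squared loss: g_i = y_hat_i - y_i, h_i = 1, predictions initialized at 0
y_node = jnp.array([0., 0., 1., 1.])
g, h = jnp.zeros(4) - y_node, jnp.ones(4)
mask = jnp.array([True, True, False, False])
print(split_gain(g, h, mask), leaf_weight(g[~mask], h[~mask], lam=1.0))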
Remarks¶
Classifiers have to be weak
A perfect classifier yields $\beta_m = \infty$ breaking subsequent classifiers
Example: Kernel SVM with sufficiently tight Gaussian kernel
Classifiers have to be slightly better than random
Worse than random gets a negative weight (opposite classifier)
Example: single feature classifier
Boosted Decision Trees are an excellent first try in most cases
Exercise¶
Show that $$ f^\star(\mathbf{x}) = \arg\min_f \mathbb{E}_{\sim(\mathbf{x},y)}\left[ e^{-yf(\mathbf{x})} \right]= \frac{1}{2}\log\frac{P[y = 1 | \mathbf{x}]}{P[y=-1|\mathbf{x}]} $$
$$ \begin{aligned} \frac{\partial \mathbb{E}_{\sim(\mathbf{x},y)}\left[ e^{-yf(\mathbf{x})} \right]}{\partial f} &= \mathbb{E}_{\sim(\mathbf{x},y)}\left[ -y e^{-yf(\mathbf{x})} \right] = 0\\ \mathbb{E}\left[ -ye^{-yf(\mathbf{x})} \right] &= -e^{-f(\mathbf{x})}P[y=1|\mathbf{x}] + e^{f(\mathbf{x})}P[y=-1|\mathbf{x}]\\ e^{2f(\mathbf{x})} &= \frac{P[y=1|\mathbf{x}]}{P[y=-1|\mathbf{x}]} \end{aligned} $$
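As a quick numerical check of this result (with an arbitrary conditional probability $p = P[y=1|\mathbf{x}]$), the expected loss $p\,e^{-f} + (1-p)\,e^{f}$ is minimized on a grid and compared with $\frac{1}{2}\log\frac{p}{1-p}$:
import jax.numpy as jnp
p = 0.7                                    # assumed P[y = 1 | x]
f = jnp.linspace(-3, 3, 10001)             # candidate values of f(x)
risk = p * jnp.exp(-f) + (1 - p) * jnp.exp(f)
print(f[risk.argmin()])                    # numerical minimizer
print(0.5 * jnp.log(p / (1 - p)))          # closed form, about 0.4236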
Decision Trees and Ensemble Learning, take home¶
Decision Trees
- Simple
- Fast
- Explainable (domain experts understand the decision process)
- Handle categorical data (or even mixed)
But
- Overfit, unstable
- Require massive amounts of data
Random forest
- Simple, fast, handles categorical data
- Stable
- No longer explainable
Ensemble
- Bagging: simple solution, good idea to reduce variance
- Boosting: optimized combination
- Large literature and many libraries on boosting
- Boosted trees: very good default classifier in many cases (see (Grinsztajn et al, 2022))