Decision Tree
hard · Machine Learning · runs: 0. Implement a CART-style decision tree classifier from scratch. At each node, pick the feature and threshold that maximize the reduction in Gini impurity. Recurse until a depth limit, a minimum sample threshold, or a pure node is reached. Predict by walking the tree.
sign in to paste and practice your own solution
wpm 0 · acc 100% · time 0:00 · 0 / 2619
import numpy as np
class Node:
    """A single node of the decision tree.

    Internal nodes carry a ``(feature, threshold)`` split plus left/right
    children; leaf nodes carry only ``label``. An internal node's ``label``
    stays ``None`` — ``_predict_one`` relies on that to detect leaves.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, label=None):
        # Generalization (backward compatible): all fields may now be
        # supplied at construction time; ``Node()`` behaves exactly as before.
        self.feature = feature      # column index used for the split (internal nodes)
        self.threshold = threshold  # split value: go left when x[feature] <= threshold
        self.left = left            # child for samples with feature value <= threshold
        self.right = right          # child for samples with feature value > threshold
        self.label = label          # predicted class (leaf nodes only)
class DecisionTree:
    """CART-style decision tree classifier using Gini impurity.

    At each node the (feature, threshold) pair that maximizes the reduction
    in Gini impurity is chosen by exhaustive search over every unique value
    of every feature. Recursion stops at ``max_depth``, when a node holds
    fewer than ``min_samples_split`` samples, when the node is pure, or when
    no split yields a strictly positive gain.
    """

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth                  # maximum tree depth (root = 0)
        self.min_samples_split = min_samples_split  # minimum samples to attempt a split
        self.root = None                            # set by fit()

    def _gini(self, y):
        """Gini impurity of label vector ``y``: 1 - sum_k p_k**2."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return 1 - np.sum(p ** 2)

    def _majority_label(self, y):
        """Most frequent label in ``y`` (first one on ties, via np.argmax)."""
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def _best_split(self, X, y):
        """Return the (feature, threshold) with the largest positive Gini gain.

        Returns ``None`` when no candidate split strictly reduces impurity.
        """
        best = None
        best_gain = 0
        parent_gini = self._gini(y)
        n_samples, n_features = X.shape
        for feature in range(n_features):
            for t in np.unique(X[:, feature]):
                left = y[X[:, feature] <= t]
                right = y[X[:, feature] > t]
                # A split that sends every sample one way is useless
                # (and would make _gini divide by zero).
                if len(left) == 0 or len(right) == 0:
                    continue
                weighted = (
                    len(left) * self._gini(left)
                    + len(right) * self._gini(right)
                ) / n_samples
                gain = parent_gini - weighted
                if gain > best_gain:
                    best_gain = gain
                    best = (feature, t)
        return best

    def _build(self, X, y, depth: int):
        """Recursively grow the tree; return the subtree root for (X, y)."""
        node = Node()
        # Stopping criteria: depth limit, too few samples, or a pure node.
        if (
            depth >= self.max_depth
            or len(y) < self.min_samples_split
            or len(np.unique(y)) == 1
        ):
            node.label = self._majority_label(y)
            return node
        split = self._best_split(X, y)
        if split is None:
            # No split improves impurity -> make this a leaf.
            node.label = self._majority_label(y)
            return node
        node.feature, node.threshold = split
        left_mask = X[:, node.feature] <= node.threshold
        node.left = self._build(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build(X[~left_mask], y[~left_mask], depth + 1)
        return node

    def fit(self, X, y):
        """Fit the tree on training data and return ``self``.

        Robustness fix: inputs are coerced with ``np.asarray`` so plain
        Python lists work too (the original crashed on ``X.shape`` and
        boolean-mask indexing when given non-ndarray input).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.root = self._build(X, y, 0)
        return self

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its label."""
        if node.label is not None:
            return node.label
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        return self._predict_one(x, node.right)

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict_one(x, self.root) for x in X])
click the box to focus · tab inserts 4 spaces · backspace to correct · esc to pause
desktop only
codedrill is a typing game and needs a real keyboard. open this on a laptop or desktop to practice.
you can still browse problems and sections from your phone.