Skip to content Skip to sidebar Skip to footer

Fastest General Purpose Levenshtein Javascript Implementation

I am looking for a good general purpose Levenshtein implementation in Javascript. It must be fast and be useful for short and long strings. It should also be used many times (hence

Solution 1:

We had a competition for fun at work about making the fastest levenshtein implementation and I came up with a faster one. First of all, I must say that it was not easy to beat your solution which was the fastest to find "out there". :)

This is tested with node.js and it my benchmarks results indicates that this implementation is ~15% faster on small texts (random words size 2-10 characters) and over twice as fast on longer texts (with lengths 30+ containing random characters)

Note: I removed array caching of all implementations

functionlevenshtein(s, t) {
    if (s === t) {
        return0;
    }
    var n = s.length, m = t.length;
    if (n === 0 || m === 0) {
        return n + m;
    }
    var x = 0, y, a, b, c, d, g, h, k;
    var p = newArray(n);
    for (y = 0; y < n;) {
        p[y] = ++y;
    }

    for (; (x + 3) < m; x += 4) {
        var e1 = t.charCodeAt(x);
        var e2 = t.charCodeAt(x + 1);
        var e3 = t.charCodeAt(x + 2);
        var e4 = t.charCodeAt(x + 3);
        c = x;
        b = x + 1;
        d = x + 2;
        g = x + 3;
        h = x + 4;
        for (y = 0; y < n; y++) {
            k = s.charCodeAt(y);
            a = p[y];
            if (a < c || b < c) {
                c = (a > b ? b + 1 : a + 1);
            }
            else {
                if (e1 !== k) {
                    c++;
                }
            }

            if (c < b || d < b) {
                b = (c > d ? d + 1 : c + 1);
            }
            else {
                if (e2 !== k) {
                    b++;
                }
            }

            if (b < d || g < d) {
                d = (b > g ? g + 1 : b + 1);
            }
            else {
                if (e3 !== k) {
                    d++;
                }
            }

            if (d < g || h < g) {
                g = (d > h ? h + 1 : d + 1);
            }
            else {
                if (e4 !== k) {
                    g++;
                }
            }
            p[y] = h = g;
            g = d;
            d = b;
            b = c;
            c = a;
        }
    }

    for (; x < m;) {
        var e = t.charCodeAt(x);
        c = x;
        d = ++x;
        for (y = 0; y < n; y++) {
            a = p[y];
            if (a < c || d < c) {
                d = (a > d ? d + 1 : a + 1);
            }
            else {
                if (e !== s.charCodeAt(y)) {
                    d = c + 1;
                }
                else {
                    d = c;
                }
            }
            p[y] = d;
            c = a;
        }
        h = d;
    }

    return h;
}

On longer texts it will get almost up to 3 times the speed of your implementation if it initially cache the inner loop's s.charCodeAt(y) in an Uint32Array. Longer texts also seemed to benefit from using a Uint16Array as a the distance cost array. Here is the code for that solution

functionlevenshtein(s, t) {
    if (s === t) {
        return0;
    }
    var n = s.length, m = t.length;
    if (n === 0 || m === 0) {
        return n + m;
    }
    var x = 0, y, a, b, c, d, g, h;
    var p = newUint16Array(n);
    var u = newUint32Array(n);
    for (y = 0; y < n;) {
        u[y] = s.charCodeAt(y);
        p[y] = ++y;
    }

    for (; (x + 3) < m; x += 4) {
        var e1 = t.charCodeAt(x);
        var e2 = t.charCodeAt(x + 1);
        var e3 = t.charCodeAt(x + 2);
        var e4 = t.charCodeAt(x + 3);
        c = x;
        b = x + 1;
        d = x + 2;
        g = x + 3;
        h = x + 4;
        for (y = 0; y < n; y++) {
            a = p[y];
            if (a < c || b < c) {
                c = (a > b ? b + 1 : a + 1);
            }
            else {
                if (e1 !== u[y]) {
                    c++;
                }
            }

            if (c < b || d < b) {
                b = (c > d ? d + 1 : c + 1);
            }
            else {
                if (e2 !== u[y]) {
                    b++;
                }
            }

            if (b < d || g < d) {
                d = (b > g ? g + 1 : b + 1);
            }
            else {
                if (e3 !== u[y]) {
                    d++;
                }
            }

            if (d < g || h < g) {
                g = (d > h ? h + 1 : d + 1);
            }
            else {
                if (e4 !== u[y]) {
                    g++;
                }
            }
            p[y] = h = g;
            g = d;
            d = b;
            b = c;
            c = a;
        }
    }

    for (; x < m;) {
        var e = t.charCodeAt(x);
        c = x;
        d = ++x;
        for (y = 0; y < n; y++) {
            a = p[y];
            if (a < c || d < c) {
                d = (a > d ? d + 1 : a + 1);
            }
            else {
                if (e !== u[y]) {
                    d = c + 1;
                }
                else {
                    d = c;
                }
            }
            p[y] = d;
            c = a;
        }
        h = d;
    }

    return h;
}

All benchmark results is from my tests and test-data might be different with your test-data.

The main 2 difference in this solution than to yours (and some other fast ones) I think is

  • Not always do compare of the characters in the inner loop if not necessary.
  • Sort of "Loop unrolling" in the outer loop doing 4 rows at a time in the levenshtein "matrix". This was a major performance win.

http://jsperf.com/levenshtein-distance/24

I will put this solution on github when I find the time :)

Update: Finally, I put the solution on github https://github.com/gustf/js-levenshtein. Its a bit modified/optimized but it is the same base algorithm.

Post a Comment for "Fastest General Purpose Levenshtein Javascript Implementation"