From 86470abc77890d212bdda193344f3f1b70b1f18d Mon Sep 17 00:00:00 2001
From: kbkpbot
Date: Thu, 31 Oct 2024 19:34:37 +0800
Subject: [PATCH] strings: add hamming_distance/jaro_similarity/jaro_winkler_similarity functions (#22701)

---
Review notes: jaro_similarity() now clamps its match window at 0, so that
equal 1 byte strings score 1.0 instead of 0, and jaro_winkler_similarity()
now counts only the contiguous common prefix, instead of every equal position
in the first 4 bytes. The hunk sizes and the diffstat were updated for this;
the `index` blob ids below are still the ones of the original commit.

 vlib/strings/similarity.v      | 146 +++++++++++++++++++++++++++++++++
 vlib/strings/similarity_test.v |  52 ++++++++++++
 2 files changed, 198 insertions(+)

diff --git a/vlib/strings/similarity.v b/vlib/strings/similarity.v
index a57acfda3a..cb5a015720 100644
--- a/vlib/strings/similarity.v
+++ b/vlib/strings/similarity.v
@@ -12,6 +12,30 @@ fn min(a u16, b u16, c u16) u16 {
 	return m
 }
 
+@[inline]
+fn max2(a int, b int) int {
+	if a < b {
+		return b
+	}
+	return a
+}
+
+@[inline]
+fn min2(a int, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+@[inline]
+fn abs2(a int, b int) int {
+	if a < b {
+		return b - a
+	}
+	return a - b
+}
+
 // levenshtein_distance uses the Levenshtein Distance algorithm to calculate
 // the distance between between two strings `a` and `b` (lower is closer).
 @[direct_array_access]
@@ -85,3 +109,125 @@ pub fn dice_coefficient(s1 string, s2 string) f32 {
 	}
 	return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
 }
+
+// hamming_distance uses the Hamming Distance algorithm to calculate
+// the distance between two strings `a` and `b` (lower is closer).
+// The strings are compared byte by byte. When their lengths differ,
+// each extra byte of the longer string counts as one difference.
+@[direct_array_access]
+pub fn hamming_distance(a string, b string) int {
+	if a.len == 0 && b.len == 0 {
+		return 0
+	}
+	match_len := min2(a.len, b.len)
+	mut diff_count := abs2(a.len, b.len)
+	for i in 0 .. match_len {
+		if a[i] != b[i] {
+			diff_count++
+		}
+	}
+	return diff_count
+}
+
+// hamming_similarity uses the Hamming Distance algorithm to calculate
+// the similarity between two strings `a` and `b`.
+// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
+pub fn hamming_similarity(a string, b string) f32 {
+	l := max2(a.len, b.len)
+	if l == 0 {
+		// Both are empty strings, should return 1.0
+		return 1.0
+	}
+	d := hamming_distance(a, b)
+	return 1.00 - f32(d) / f32(l)
+}
+
+// jaro_similarity uses the Jaro Distance algorithm to calculate
+// the similarity between two strings `a` and `b`.
+// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
+@[direct_array_access]
+pub fn jaro_similarity(a string, b string) f64 {
+	a_len := a.len
+	b_len := b.len
+	if a_len == 0 && b_len == 0 {
+		// Both are empty strings, should return 1.0
+		return 1.0
+	}
+	if a_len == 0 || b_len == 0 {
+		return 0
+	}
+
+	// Maximum distance up to which matching is allowed. It is clamped at 0,
+	// because for 1 byte inputs `max / 2 - 1` is -1, which would make the
+	// search window empty, so that equal 1 byte strings would score 0.
+	match_distance := max2(0, max2(a_len, b_len) / 2 - 1)
+
+	mut a_matches := []bool{len: a_len}
+	mut b_matches := []bool{len: b_len}
+	mut matches := 0
+	mut transpositions := 0.0
+
+	// Traverse through the first string
+	for i in 0 .. a_len {
+		start := max2(0, i - match_distance)
+		end := min2(b_len, i + match_distance + 1)
+		for k in start .. end {
+			// Skip the bytes of `b`, that were already matched
+			if b_matches[k] {
+				continue
+			}
+			if a[i] != b[k] {
+				continue
+			}
+			a_matches[i] = true
+			b_matches[k] = true
+			matches++
+			break
+		}
+	}
+	// If there is no match
+	if matches == 0 {
+		return 0
+	}
+	mut k := 0
+	// Count number of occurrences where two characters match but
+	// there is a third matched character in between the indices
+	for i in 0 .. a_len {
+		if !a_matches[i] {
+			continue
+		}
+		// Find the next matched character in second string
+		for !b_matches[k] {
+			k++
+		}
+		if a[i] != b[k] {
+			transpositions++
+		}
+		k++
+	}
+	transpositions /= 2
+	return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
+}
+
+// jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
+// the similarity between two strings `a` and `b`.
+// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
+// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
+// similarities, making it especially effective for cases where slight misspellings
+// or prefixes are common.
+@[direct_array_access]
+pub fn jaro_winkler_similarity(a string, b string) f64 {
+	// Maximum of 4 characters are allowed in prefix
+	lmax := min2(4, min2(a.len, b.len))
+	// l is the length of the common prefix, so the counting must stop at the
+	// first mismatch, instead of counting every equal position below lmax
+	mut l := 0
+	for l < lmax && a[l] == b[l] {
+		l++
+	}
+	js := jaro_similarity(a, b)
+	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
+	p := 0.1
+	ws := js + f64(l) * p * (1 - js)
+	return ws
+}
diff --git a/vlib/strings/similarity_test.v b/vlib/strings/similarity_test.v
index a3153b93e3..8d053dba31 100644
--- a/vlib/strings/similarity_test.v
+++ b/vlib/strings/similarity_test.v
@@ -12,3 +12,55 @@ fn test_levenshtein_distance() {
 	assert strings.levenshtein_distance('flomax', 'volmax') == 3
 	assert strings.levenshtein_distance('ab', 'cd') == 2
 }
+
+fn test_hamming_distance() {
+	assert strings.hamming_distance('', '') == 0
+	assert strings.hamming_distance('one', 'one') == 0
+	assert strings.hamming_distance('', 'two') == 3
+	assert strings.hamming_distance('three', '') == 5
+	assert strings.hamming_distance('bananna', '') == 7
+	assert strings.hamming_distance('cats', 'hats') == 1
+	assert strings.hamming_distance('hugs', 'shrugs') == 6
+	assert strings.hamming_distance('broom', 'shroom') == 5
+	assert strings.hamming_distance('flomax', 'volmax') == 3
+	assert strings.hamming_distance('ab', 'cd') == 2
+}
+
+fn test_hamming_similarity() {
+	assert strings.hamming_similarity('', '') == 1.0
+	assert strings.hamming_similarity('one', 'one') == 1.0
+	assert strings.hamming_similarity('', 'two') == 0
+	assert strings.hamming_similarity('three', '') == 0
+	assert strings.hamming_similarity('bananna', '') == 0
+	assert strings.hamming_similarity('cats', 'hats') == 0.75
+	assert strings.hamming_similarity('hugs', 'shrugs') == 0
+	assert strings.hamming_similarity('broom', 'shroom') == 0.1666666865348816
+	assert strings.hamming_similarity('flomax', 'volmax') == 0.5
+	assert strings.hamming_similarity('ab', 'cd') == 0
+}
+
+fn test_jaro_similarity() {
+	assert strings.jaro_similarity('', '') == 1
+	assert strings.jaro_similarity('one', 'one') == 1
+	assert strings.jaro_similarity('', 'two') == 0
+	assert strings.jaro_similarity('three', '') == 0
+	assert strings.jaro_similarity('bananna', '') == 0
+	assert strings.jaro_similarity('a', 'a') == 1
+	assert strings.jaro_similarity('MARTHA', 'MARHTA') == 0.9444444444444445
+	assert strings.jaro_similarity('DIXON', 'DICKSONX') == 0.7666666666666666
+	assert strings.jaro_similarity('JELLYFISH', 'SMELLYFISH') == 0.8962962962962964
+}
+
+fn test_jaro_winkler_similarity() {
+	assert strings.jaro_winkler_similarity('', '') == 1
+	assert strings.jaro_winkler_similarity('one', 'one') == 1
+	assert strings.jaro_winkler_similarity('', 'two') == 0
+	assert strings.jaro_winkler_similarity('three', '') == 0
+	assert strings.jaro_winkler_similarity('bananna', '') == 0
+	assert strings.jaro_winkler_similarity('a', 'a') == 1
+	assert strings.jaro_winkler_similarity('xbcd', 'abcd') == strings.jaro_similarity('xbcd', 'abcd')
+	assert strings.jaro_winkler_similarity('accomodate', 'accommodate') == 0.9818181818181818
+	assert strings.jaro_winkler_similarity('accomodate', 'accompanist') == 0.8672727272727273
+	assert strings.jaro_winkler_similarity('untill', 'huntsville') == 0.8666666666666667
+	assert strings.jaro_winkler_similarity('wich', 'wichita') == 0.9142857142857143
+}