mirror of
https://github.com/vlang/v.git
synced 2025-09-13 14:32:26 +03:00
strings: add hamming_distance/jaro_similarity/jaro_winkler_similarity functions (#22701)
This commit is contained in:
parent
c32c2d732a
commit
86470abc77
2 changed files with 191 additions and 0 deletions
|
@ -12,6 +12,30 @@ fn min(a u16, b u16, c u16) u16 {
|
||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// max2 returns the larger of the two integers `a` and `b`.
@[inline]
fn max2(a int, b int) int {
	if a >= b {
		return a
	}
	return b
}
|
||||||
|
|
||||||
|
// min2 returns the smaller of the two integers `a` and `b`.
@[inline]
fn min2(a int, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// abs2 returns the absolute difference between the integers `a` and `b`.
@[inline]
fn abs2(a int, b int) int {
	if b > a {
		return b - a
	}
	return a - b
}
|
||||||
|
|
||||||
// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
|
// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
|
||||||
// the distance between two strings `a` and `b` (lower is closer).
|
// the distance between two strings `a` and `b` (lower is closer).
|
||||||
@[direct_array_access]
|
@[direct_array_access]
|
||||||
|
@ -85,3 +109,121 @@ pub fn dice_coefficient(s1 string, s2 string) f32 {
|
||||||
}
|
}
|
||||||
return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
|
return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hamming_distance uses the Hamming Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
// Strings of unequal length are supported: every position past the end
// of the shorter string counts as one difference.
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
	if a.len == 0 && b.len == 0 {
		return 0
	}
	// Only the overlapping prefix can be compared position by position.
	// (No `mut` needed — this is never reassigned; the original `mut`
	// triggered an unused-mutability warning.)
	match_len := min2(a.len, b.len)
	// The length difference already contributes that many mismatches.
	mut diff_count := abs2(a.len, b.len)
	for i in 0 .. match_len {
		if a[i] != b[i] {
			diff_count++
		}
	}
	return diff_count
}
|
||||||
|
|
||||||
|
// hamming_similarity uses the Hamming Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
pub fn hamming_similarity(a string, b string) f32 {
	longest := max2(a.len, b.len)
	if longest == 0 {
		// Two empty strings are identical, so the similarity is 1.0.
		return 1.0
	}
	distance := hamming_distance(a, b)
	// Normalise the distance by the longer string's length and invert it.
	return 1.00 - f32(distance) / f32(longest)
}
|
||||||
|
|
||||||
|
// jaro_similarity uses the Jaro Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
	a_len := a.len
	b_len := b.len
	if a_len == 0 && b_len == 0 {
		// Both are empty strings, should return 1.0
		return 1.0
	}
	if a_len == 0 || b_len == 0 {
		return 0
	}

	// Maximum distance up to which matching is allowed.
	// Clamped to 0: for max2(a_len, b_len) <= 2 the raw value is negative,
	// which made the matching window empty, so e.g. jaro_similarity('a', 'a')
	// wrongly returned 0 instead of 1.
	match_distance := max2(max2(a_len, b_len) / 2 - 1, 0)

	mut a_matches := []bool{len: a_len}
	mut b_matches := []bool{len: b_len}
	mut matches := 0
	mut transpositions := 0.0

	// Traverse through the first string, matching each character against
	// the window [i - match_distance, i + match_distance] in `b`.
	for i in 0 .. a_len {
		start := max2(0, i - match_distance)
		end := min2(b_len, i + match_distance + 1)
		for k in start .. end {
			// Each character in `b` may be matched at most once.
			if b_matches[k] {
				continue
			}
			if a[i] != b[k] {
				continue
			}
			a_matches[i] = true
			b_matches[k] = true
			matches++
			break
		}
	}
	// If there is no match
	if matches == 0 {
		return 0
	}
	mut k := 0
	// Count number of occurrences where two characters match but
	// there is a third matched character in between the indices
	for i in 0 .. a_len {
		if !a_matches[i] {
			continue
		}
		// Find the next matched character in second string
		for !b_matches[k] {
			k++
		}
		if a[i] != b[k] {
			transpositions++
		}
		k++
	}
	// Each transposition was counted twice (once per member of the pair).
	transpositions /= 2
	return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
}
|
||||||
|
|
||||||
|
// jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
	// Maximum of 4 characters are allowed in prefix.
	// (No `mut` needed — lmax is never reassigned.)
	lmax := min2(4, min2(a.len, b.len))
	mut l := 0
	for i in 0 .. lmax {
		if a[i] != b[i] {
			// Jaro-Winkler defines `l` as the length of the contiguous
			// common prefix, so stop at the first mismatch instead of
			// counting later coincidental position matches
			// (e.g. 'ab' vs 'cb' must give l == 0, not l == 1).
			break
		}
		l++
	}
	js := jaro_similarity(a, b)
	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
	p := 0.1
	ws := js + f64(l) * p * (1 - js)
	return ws
}
|
||||||
|
|
|
@ -12,3 +12,52 @@ fn test_levenshtein_distance() {
|
||||||
assert strings.levenshtein_distance('flomax', 'volmax') == 3
|
assert strings.levenshtein_distance('flomax', 'volmax') == 3
|
||||||
assert strings.levenshtein_distance('ab', 'cd') == 2
|
assert strings.levenshtein_distance('ab', 'cd') == 2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn test_hamming_distance() {
	// Parallel tables: left input, right input, expected distance.
	// Covers empty/empty, identical, one-sided empty, single substitution,
	// unequal lengths and fully different strings.
	left := ['', 'one', '', 'three', 'bananna', 'cats', 'hugs', 'broom', 'flomax', 'ab']
	right := ['', 'one', 'two', '', '', 'hats', 'shrugs', 'shroom', 'volmax', 'cd']
	expected := [0, 0, 3, 5, 7, 1, 6, 5, 3, 2]
	for i, a in left {
		assert strings.hamming_distance(a, right[i]) == expected[i]
	}
}
|
||||||
|
|
||||||
|
// Verifies hamming_similarity across empty, identical, disjoint and
// partially matching inputs. Expected values are 1 - distance/max_len,
// computed in f32.
fn test_hamming_similarity() {
	assert strings.hamming_similarity('', '') == 1.0
	assert strings.hamming_similarity('one', 'one') == 1.0
	assert strings.hamming_similarity('', 'two') == 0
	assert strings.hamming_similarity('three', '') == 0
	assert strings.hamming_similarity('bananna', '') == 0
	// distance 1 over max length 4
	assert strings.hamming_similarity('cats', 'hats') == 0.75
	assert strings.hamming_similarity('hugs', 'shrugs') == 0
	// 1 - 5/6 in f32 precision, hence the truncated literal
	assert strings.hamming_similarity('broom', 'shroom') == 0.1666666865348816
	assert strings.hamming_similarity('flomax', 'volmax') == 0.5
	assert strings.hamming_similarity('ab', 'cd') == 0
}
|
||||||
|
|
||||||
|
fn test_jaro_similarity() {
	// Parallel tables: left input, right input, expected f64 coefficient.
	// Includes the classic MARTHA/MARHTA and DIXON/DICKSONX reference pairs.
	left := ['', 'one', '', 'three', 'bananna', 'MARTHA', 'DIXON', 'JELLYFISH']
	right := ['', 'one', 'two', '', '', 'MARHTA', 'DICKSONX', 'SMELLYFISH']
	expected := [1.0, 1.0, 0.0, 0.0, 0.0, 0.9444444444444445, 0.7666666666666666,
		0.8962962962962964]
	for i, a in left {
		assert strings.jaro_similarity(a, right[i]) == expected[i]
	}
}
|
||||||
|
|
||||||
|
fn test_jaro_winkler_similarity() {
	// Parallel tables: left input, right input, expected f64 coefficient.
	// Misspelling pairs exercise the prefix-weighted bonus over plain Jaro.
	left := ['', 'one', '', 'three', 'bananna', 'accomodate', 'accomodate', 'untill', 'wich']
	right := ['', 'one', 'two', '', '', 'accommodate', 'accompanist', 'huntsville', 'wichita']
	expected := [1.0, 1.0, 0.0, 0.0, 0.0, 0.9818181818181818, 0.8672727272727273,
		0.8666666666666667, 0.9142857142857143]
	for i, a in left {
		assert strings.jaro_winkler_similarity(a, right[i]) == expected[i]
	}
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue