【算法】Levenshtein Distance算法 求字符串相似度
Levenshtein Distance算法 也就是 “编辑距离算法”,是俄国科学家Levenshtein提出的

具体就是,一个字符串,通过加一个字符,删一个字符,替换一个字符得到另一个字串。我们把从字符串1转换成字符串2,前面3种操作用的最少次数称为12相似度

具体步骤如图所示

ld.png


vb.net实现方法

调用Levenshtein_Distance返回相似度百分比,浮点数

Levenshtein_Distance(ByVal str1 As String, ByVal str2 As String) As Double

    Public Function Levenshtein_Distance(ByVal str1 As String, ByVal str2 As String) As Double
        Dim d As Long = 0
        Dim result As Double
        If str1.Length > str2.Length Then
            d = str1.Length
        Else
            d = str2.Length
        End If

        result = 1 - (LD(str1, str2) / d)

        Return result

    End Function

    Private Function Minimum(ByVal a As Integer, ByVal b As Integer, ByVal c As Integer) As Integer
        Dim min As Integer

        min = a
        If b < min Then
            min = b
        End If
        If c < min Then
            min = c
        End If

        Minimum = min

    End Function

    Public Function LD(ByVal s As String, ByVal t As String) As Integer
        Dim dis(,) As Integer
        Dim m As Integer
        Dim n As Integer
        Dim i As Integer
        Dim j As Integer
        Dim s_i As String
        Dim t_j As String
        Dim cost As Integer

        n = Len(s)
        m = Len(t)
        If n = 0 Then
            LD = m
            Exit Function
        End If
        If m = 0 Then
            LD = n
            Exit Function
        End If

        ReDim dis(0 To n, 0 To m)

        For i = 0 To n
            dis(i, 0) = i
        Next i

        For j = 0 To m
            dis(0, j) = j
        Next j

        For i = 1 To n

            s_i = Mid$(s, i, 1)

            For j = 1 To m

                t_j = Mid$(t, j, 1)

                If s_i = t_j Then
                    cost = 0
                Else
                    cost = 1
                End If

                dis(i, j) = Minimum(dis(i - 1, j) + 1, dis(i, j - 1) + 1, dis(i - 1, j - 1) + cost)

            Next j

        Next i

        LD = dis(n, m)
        Erase dis

    End Function
+1000  科创币    novakon   2009-08-15    multilang
来自 软件综合
 
2009-8-15 22:32:54
93°(作者)
1楼
C#的实现方法,同样是调用第一个函数

public double Levenshtein_Distance(string str1, string str2)
{
    long d = 0L;
    if (str1.Length > str2.Length)
    {
        d = str1.Length;
    }
    else
    {
        d = str2.Length;
    }
    return (1.0 - (((double) this.LD(str1, str2)) / ((double) d)));
}


public int LD(string s, string t)
{
    int i;
    int n = Strings.Len(s);
    int m = Strings.Len(t);
    if (n == 0)
    {
        return m;
    }
    if (m == 0)
    {
        return n;
    }
    int[,] dis = new int[n + 1, m + 1];
    int VB$t_i4$L0 = n;
    for (i = 0; i <= VB$t_i4$L0; i++)
    {
        dis[i, 0] = i;
    }
    int VB$t_i4$L1 = m;
    int j = 0;
    while (j <= VB$t_i4$L1)
    {
        dis[0, j] = j;
        j++;
    }
    int VB$t_i4$L2 = n;
    for (i = 1; i <= VB$t_i4$L2; i++)
    {
        string s_i = Strings.Mid(s, i, 1);
        int VB$t_i4$L3 = m;
        for (j = 1; j <= VB$t_i4$L3; j++)
        {
            int cost;
            string t_j = Strings.Mid(t, j, 1);
            if (s_i == t_j)
            {
                cost = 0;
            }
            else
            {
                cost = 1;
            }
            dis[i, j] = this.Minimum(dis[i - 1, j] + 1, dis[i, j - 1] + 1, dis[i - 1, j - 1] + cost);
        }
    }
    int LD = dis[n, m];
    dis = null;
    return LD;
}


private int Minimum(int a, int b, int c)
{
    int min = a;
    if (b < min)
    {
        min = b;
    }
    if (c < min)
    {
        min = c;
    }
    return min;
}

折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
2楼
Java的实现方法,那个,返回百分比的函数没写 。。。自己根据上面写就可以了

public class Distance {

  private int Minimum (int a, int b, int c) {
  int mi;

    mi = a;
    if (b < mi) {
      mi = b;
    }
    if (c < mi) {
      mi = c;
    }
    return mi;

  }

  public int LD (String s, String t) {
  int d[][];
  int n;
  int m;
  int i;
  int j;
  char s_i;
  char t_j;
  int cost;


    n = s.length ();
    m = t.length ();
    if (n == 0) {
      return m;
    }
    if (m == 0) {
      return n;
    }
    d = new int[n+1][m+1];



    for (i = 0; i <= n; i++) {
      d[i][0] = i;
    }

    for (j = 0; j <= m; j++) {
      d[0][j] = j;
    }


    for (i = 1; i <= n; i++) {

      s_i = s.charAt (i - 1);


      for (j = 1; j <= m; j++) {

        t_j = t.charAt (j - 1);


        if (s_i == t_j) {
          cost = 0;
        }
        else {
          cost = 1;
        }


        d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);

      }

    }

    return d[n][m];

  }

}

折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
3楼
Delphi的实现方法,写了百分比的函数 。。。【注意form1,直接复制要改代码】


function Form1.Levenshtein_Distance(str1: string; str2: string): Double;
begin
    d := 0;
    if (str1.Length > str2.Length) then
        d := str1.Length
    else
        d := str2.Length;
    begin
        Result := (1 - ((self.LD(str1, str2) as Double) div (d as Double)));
        exit
    end
end;


function Form1.LD(s: string; t: string): Integer;
var
    i: Integer;
    cost: Integer;
begin
    n := Strings.Len(s);
    m := Strings.Len(t);
    if (n = 0) then
        begin
            Result := m;
            exit
        end;
    if (m = 0) then
        begin
            Result := n;
            exit
        end;
    dis := New(array[(n + 1), (m + 1)] of Integer);
    VB$t_i4$L0 := n;
    i := 0;
    while ((i <= VB$t_i4$L0)) do
    begin
        dis[i, 0] := i;
        inc(i)
    end;
    VB$t_i4$L1 := m;
    j := 0;
    while ((j <= VB$t_i4$L1)) do
    begin
        dis[0, j] := j;
        inc(j)
    end;
    VB$t_i4$L2 := n;
    i := 1;
    while ((i <= VB$t_i4$L2)) do
    begin
        s_i := Strings.Mid(s, i, 1);
        VB$t_i4$L3 := m;
        j := 1;
        while ((j <= VB$t_i4$L3)) do
        begin
            t_j := Strings.Mid(t, j, 1);
            if (s_i = t_j) then
                cost := 0
            else
                cost := 1;
            dis[i, j] := self.Minimum((dis[(i - 1), j] + 1), (dis[i, (j - 1)] + 1), (dis[(i - 1), (j - 1)] + cost));
            inc(j)
        end;
        inc(i)
    end;
    LD := dis[n, m];
    dis := nil;
    begin
        Result := LD;
        exit
    end
end;

function Form1.Minimum(a: Integer; b: Integer; c: Integer): Integer;
begin
    min := a;
    if (b < min) then
        min := b;
    if (c < min) then
        min := c;
    begin
        Result := min;
        exit
    end
end;
折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
4楼
cpp的实现方法,cpp有点那啥,算了。 = =

记得#include别丢了。。。

public: Double __gc* Levenshtein_Distance(String __gc* str1, String __gc* str2)
{
    Int64 __gc* d = 0;
    if (str1->Length > str2->Length)
    {
        d = str1->Length;
    }
    else
    {
        d = str2->Length;
    }
    return (1 - (*static_cast<__box Double*>(this->LD(str1, str2)) / *static_cast<__box Double*>(d)));
}

public: Int32 __gc* LD(String __gc* s, String __gc* t)
{
    Int32 __gc* i;
    Int32 __gc* n = Strings:[s:10]en(s);
    Int32 __gc* m = Strings:[s:10]en(t);
    if (n == 0)
    {
        return m;
    }
    if (m == 0)
    {
        return n;
    }
    Int32 __gc* dis __gc [,] = __gc new Int32 __gc*[(n + 1), (m + 1)];
    Int32 __gc* R$t_i4$L0 = n;
    for (i = 0; (i <= R$t_i4$L0); i++)
    {
        dis[i, 0] = i;
    }
    Int32 __gc* R$t_i4$L1 = m;
    Int32 __gc* j = 0;
    while ((j <= R$t_i4$L1))
    {
        dis[0, j] = j;
        j++;
    }
    Int32 __gc* R$t_i4$L2 = n;
    for (i = 1; (i <= R$t_i4$L2); i++)
    {
        String __gc* s_i = Strings::Mid(s, i, 1);
        Int32 __gc* R$t_i4$L3 = m;
        for (j = 1; (j <= R$t_i4$L3); j++)
        {
            Int32 __gc* cost;
            String __gc* t_j = Strings::Mid(t, j, 1);
            if (s_i == t_j)
            {
                cost = 0;
            }
            else
            {
                cost = 1;
            }
            dis[i, j] = this->Minimum((dis[(i - 1), j] + 1), (dis[i, (j - 1)] + 1), (dis[(i - 1), (j - 1)] + cost));
        }
    }
    Int32 __gc* LD = dis[n, m];
    dis = 0;
    return LD;
}

private: Int32 __gc* Minimum(Int32 __gc* a, Int32 __gc* b, Int32 __gc* c)
{
    Int32 __gc* min = a;
    if (b < min)
    {
        min = b;
    }
    if (c < min)
    {
        min = c;
    }
    return min;
}
折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
5楼
为了让代码更好看,很黄很暴力!

LD1.png
折叠评论
加载评论中,请稍候...
折叠评论
6楼
Delphi 的很囧...好多exit....
折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
7楼
引用第6楼我说要有光于2009-08-15 23:44发表的  :
Delphi 的很囧...好多exit....

等了半天 。。。好不容易等到一个回帖,居然是这破帖。。 = = 不活了 555555555 pia
折叠评论
加载评论中,请稍候...
折叠评论
2009-08-23 08:14:54
2009-8-23 08:14:54
8楼
93水平不错,会这么多语言,我对叫java,delphi,c#一窍不通
不过C++的效率应该是最高
折叠评论
加载评论中,请稍候...
折叠评论
9楼
嗯..新闻说最近出了一个什么查论文相似度的系统..
折叠评论
加载评论中,请稍候...
折叠评论
93°(作者)
10楼
…………………………………………………………………………………………………………………………………………
折叠评论
加载评论中,请稍候...
折叠评论
2015-03-03 17:04:46
2015-3-3 17:04:46
11楼
这个算法蛮有用的,我以前曾经做过简单的验证码识别就用的是这个原理
折叠评论
加载评论中,请稍候...
折叠评论

想参与大家的讨论?现在就 登录 或者 注册

插入资源
全部
图片
视频
音频
附件
全部
未使用
已使用
正在上传
空空如也~
上传中..{{f.progress}}%
处理中..
上传失败,点击重试
{{f.name}}
空空如也~
(视频){{r.oname}}
{{selectedResourcesId.indexOf(r.rid) + 1}}
ID:{{user.uid}}
{{user.username}}
{{user.info.certsName}}
{{user.description}}
{{format("YYYY/MM/DD", user.toc)}}注册,{{fromNow(user.tlv)}}活动
{{submitted?"":"投诉"}}
请选择违规类型:
{{reason.description}}
支持的图片格式:jpg, jpeg, png