C++의 BK 트리 소개

<시간/>

BK 트리 또는 Burkhard 트리는 일반적으로 Levenshtein 거리를 기반으로 맞춤법 검사를 수행하는 데 사용되는 데이터 구조의 한 형태입니다. 문자열 일치에도 사용됩니다. 자동 고침 기능은 이 데이터 구조를 만드는 데 사용할 수 있습니다. 사전에 단어가 있고 철자 오류가 있는지 다른 단어를 확인해야 한다고 가정해 보겠습니다. 철자를 검사할 주어진 단어에 가까운 단어 모음이 필요합니다. 예를 들어 "uck"이라는 단어가 있는 경우 올바른 단어는 (truck, duck, duck, suck)입니다. 따라서 철자 오류는 단어를 삭제하거나 문자를 적절한 문자로 대체하는 새 단어를 추가하여 수정할 수 있습니다. 편집 거리를 매개 변수로 사용하고 사전으로 철자를 확인합니다.

다른 모든 트리와 마찬가지로 BK 트리도 노드와 에지로 구성됩니다. 노드는 사전의 단어를 나타냅니다. 에지는 한 노드에서 다른 노드까지의 편집 거리에 대한 정보를 제공하는 정수 가중치를 포함합니다.

{ book, books, boo, cake, cape} −

단어가 있는 사전을 생각해 보세요.

C++의 BK 트리 소개

BK 나무

BK Tree의 모든 메모에는 편집 거리가 동일한 정확히 하나의 자식 노드가 있습니다. 노드를 삽입하는 동안 편집 거리에서 충돌이 발생하면 올바른 자식을 얻을 때까지 삽입 프로세스를 전파합니다. 모든 삽입은 루트 노드로 시작하며 루트 노드는 모든 단어가 될 수 있습니다. 지금까지 우리는 B Tree가 무엇인지 배웠습니다. 이제 가장 가까운 단어를 찾는 방법을 알아보겠습니다. 우선, 허용 오차 값을 설정해야 합니다. 이 허용 오차 값은 철자가 틀린 단어와 올바른 단어 사이의 최대 편집 거리에 불과합니다.

허용 범위 내에서 적합한 올바른 단어를 찾기 위해 반복 프로세스를 사용합니다. 그러나 이것은 더 높은 복잡성을 가지므로 이진 트리의 각 노드가 부모로부터의 편집 거리를 기반으로 구성된다는 것을 알고 있으므로 이제 BK 트리가 작동합니다. 따라서 루트 노드에서 허용 한계 내에 있는 특정 노드로 직접 이동할 수 있습니다. TOL이 허용 한계이고 철자가 틀린 노드에서 현재 노드의 편집 거리는 dist입니다. 이제 편집 거리가 범위에 있는 자식만 반복합니다.

[dist - TOL, dist+TOL], 이것은 복잡성을 크게 줄입니다.

예

작업을 설명하는 프로그램 -

#include "bits/stdc++.h"
using namespace std;
#define MAXN 100
#define TOL 2
#define LEN 10
struct Node {
   string word;
   int next[2*LEN];
   Node(string x):word(x){
      for(int i=0; i<2*LEN; i++)
      next[i] = 0;
   }
   Node() {}
};
Node RT;
Node tree[MAXN];
int ptr;
int min(int a, int b, int c) {
   return min(a, min(b, c));
}
int editDistance(string& a,string& b) {
   int m = a.length(), n = b.length();
   int dp[m+1][n+1];
   for (int i=0; i<=m; i++)
      dp[i][0] = i;
   for (int j=0; j<=n; j++)
      dp[0][j] = j;
   for (int i=1; i<=m; i++) {
      for (int j=1; j<=n; j++) {
         if (a[i-1] != b[j-1])
            dp[i][j] = min( 1 + dp[i-1][j], 1 + dp[i][j-1], 1 + dp[i-1][j-1] );
         else
            dp[i][j] = dp[i-1][j-1];
      }
   }
   return dp[m][n];
}
void insertValue(Node& root,Node& curr) {
   if (root.word == "" ){
      root = curr;
      return;
   }
   int dist = editDistance(curr.word,root.word);
   if (tree[root.next[dist]].word == ""){
      ptr++;
      tree[ptr] = curr;
      root.next[dist] = ptr;
   }
   else{
      insertValue(tree[root.next[dist]],curr);
   }
}
vector <string> findCorrectSuggestions(Node& root,string& s){
   vector <string> corrections;
   if (root.word == "")
      return corrections;
   int dist = editDistance(root.word,s);
   if (dist <= TOL) corrections.push_back(root.word);
      int start = dist - TOL;
   if (start < 0)
      start = 1;
   while (start < dist + TOL){
      vector <string> temp = findCorrectSuggestions(tree[root.next[start]],s);
      for (auto i : temp)
      corrections.push_back(i);
      start++;
   }
   return corrections;
}
int main(){
   string dictionary[] = {"book","cake","cart","books", "boo" };
   ptr = 0;
   int size = sizeof(dictionary)/sizeof(string);
   for(int i=0; i<size; i++){
      Node tmp = Node(dictionary[i]);
      insertValue(RT,tmp);
   }
   string word1 = "ok";
   string word2 = "ke";
   vector <string> match = findCorrectSuggestions(RT,word1);
   cout<<"Correct words suggestions from dictionary for : "<<word1<<endl;
   for (auto correctWords : match)
   cout<<correctWords<<endl;
   match = findCorrectSuggestions(RT,word2);
   cout<<"Correct words suggestions from dictionary for : "<<word2<<endl;
   for (auto correctWords : match)
   cout<<correctWords<<endl;
   return 0;
}

출력

Correct words suggestions from dictionary for : ok
book
boo
Correct words suggestions from dictionary for : ke
cake