首页 > > 程序设计 > C/C++ >

C++里创建 Trie字典树（中文词典）（三）（联想…

2018-06-17 23:31:54来源：未知阅读 ()

　　萌新做词典第三篇，做得不好，还请指正，谢谢大佬！

　　今天把词典的联想做好了，也是比较low的，还改了之前的查询、遍历等代码。 Orz

　　一样地先放上运行结果：

 1 test1
 2 ID : 2    char : 件    word : 编程软件
 3 ID : 3    char : 习    word : 编程学习
 4 ID : 4    char : 站    word : 编程学习网站
 5 ID : 1    char : 门    word : 编程入门
 6 
 7 test2
 8 ID : 5    char : 练    word : 编程训练
 9 ID : 1    char : 门    word : 编程入门
10 ID : 3    char : 习    word : 编程学习
11 ID : 4    char : 站    word : 编程学习网站
12 ID : 2    char : 件    word : 编程软件
13 find ID : 3    word : 编程学习
14 
15 associate "编程" : 
16 find!
17 训练
18 入门
19 学习
20 学习网站
21 软件

　　测试用的test.cc

 1 #include "Dictionary.h"
 2 #include <iostream>
 3 #include <string>
 4 #include <vector>
 5 using std::cout;
 6 using std::endl;
 7 using std::string;
 8 using std::vector;
 9 
10 int test1()
11 {
12     ccx::Dictionary words;
13     string word1 = "编程入门";    
14     string word2 = "编程软件";    
15     string word3 = "编程学习";    
16     string word4 = "编程学习网站";    
17     
18     words.push(word1);    
19     words.push(word2);    
20     words.push(word3);    
21     words.push(word4);    
22 
23     words.resetIt();
24     
25     while(!words.isEnd())
26     {
27         cout << "ID : " << words.getCurWordId() 
28             << "\tchar : " << words.getCurChar() 
29             << "\tword : " << words.getCurWord() << endl;
30         words.next();
31     }
32     
33     words.leading_out();
34     return 0;
35 }
36 
37 
38 int test2()
39 {
40     ccx::Dictionary words;
41     words.leading_in();
42 
43 
44     string word("编程训练");
45     words.push(word);
46     words.resetIt();
47 
48     while(!words.isEnd())
49     {
50         cout << "ID : " << words.getCurWordId() 
51             << "\tchar : " << words.getCurChar() 
52             << "\tword : " << words.getCurWord() << endl;
53         words.next();
54     }
55     string tmp = "编程学习";    
56     int id = words.search(tmp);
57     if(-1 == id)
58     {
59         cout << "no such word like \"" << tmp << "\"" << endl;
60     }else{
61         cout << "find ID : " << id
62             << "\tword : " << tmp << endl;
63     }
64 
65     cout << endl;
66     cout << "associate \"编程\" : " << endl;
67 
68     vector<string> data;    
69     string temp = "编程";
70     
71     if(words.associate(temp, data))
72     {
73         cout << "find!" << endl;
74         for(auto & elem : data)
75         {
76             cout << elem << endl;
77         }
78     }else{
79         cout << "can't find" << endl;
80     }
81 
82 
83     return 0;
84 }
85 
86 int main()
87 {
88     cout << "test1" << endl;
89     test1();
90     cout << endl;
91     cout << "test2" << endl;
92     test2();
93     cout << endl;
94 }

View Code

　　test1不变，test2 在导入后再插入一个词“编程训练”，发现ID是正常的。

　　然后在test2最后调用联想函数，传入“编程”，能够正常传出所有的字符串。

　　在做这个的时候，一开始想的很简单，就是拿传入的词去树中查找，找到最后一个字对应的节点，然后以那个节点为根进行遍历。然后就开开心心地去写了，结果写一部分就要对之前的代码进行更改，于是，这个接口越来越“肥”了：

Dictionary.h

 1 #ifndef __DICTIONARY_H__
 2 #define __DICTIONARY_H__
 3 
 4 #include "DictionaryData.h"
 5 #include "DictionaryConf.h"
 6 
 7 #include <memory>
 8 #include <vector>
 9 #include <list>
10 
11 namespace ccx{
12 
13 using std::shared_ptr;
14 using std::vector;
15 using std::list;
16 
17 class Dictionary
18 {
19     typedef unordered_map<string, pDictElem>::iterator WordIt;
20     public:
21         Dictionary();
22         void push(const string & word);
23         void push(vector<string> & words);
24         int search(const string & word);
25         bool associate(const string & word, vector<string> & data);
26     private:
27         void AddWord(const string & word, int wordId);
28         void splitWord(const string & word, vector<string> & characters);//把词拆成字
29         int search(vector<string> & data, pDictElem & pcur);
30         pDictElem _dictionary;
31         DictionaryConf _conf;    
32 
33 //遍历
34     public:
35         string getCurChar();
36         string getCurWord();
37         int getCurWordId();
38         bool isEnd();
39         void resetIt();
40         void next();
41     private:
42         void resetPoint(pDictElem pcur);
43         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
44         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
45         string getCurWord(list<WordIt> & stackWord);
46         
47         pDictElem _pcur;
48         WordIt _itcur;
49         
50 //用list实现栈，遍历时方便
51         list<WordIt> _stackWord;
52         list<pDictElem> _stackDict;
53 
54 //导入导出
55     public:
56         void leading_in();
57         void leading_out();
58 };
59 
60 }
61 
62 #endif

　　对几个原有的函数进行了重载，主要是为了能够复用一些代码，但是又想不到合适的新的函数名（英语不太好Orz）。

　　首先，是要能够查找并返回新的根结点，于是对search进行修改：

 1 int Dictionary::search(vector<string> & characters, pDictElem & root)
 2 {
 3     vector<string>::iterator it_char;
 4     it_char = characters.begin();    
 5     root = _dictionary;
 6     int i = 0;
 7     for(; it_char != characters.end(); ++it_char, ++i)
 8     {
 9         WordIt it_word;
10         it_word = root->_words.find(*it_char);
11         
12         if(it_word == root->_words.end())
13         {
14             break;
15         }else{
16             root = it_word->second;
17         }
18     }
19     return i;
20 }

　　形参第一项是分解后的字集，第二项是一个智能指针，指向某个节点。这里返回值改为了字集的第几项，有两个目的：

　　1、插入函数中可以方便地知道下一个要插入的是哪个字符

　　2、联想函数中可以判断字集中的字是否都存在于词典中

　　3、好吧，我没想到其它好办法，而且当时是想到上面两点就这么做了，后来发现，插入部分的代码根本就不用改

　　然后是重载search：

 1 int Dictionary::search(const string & word)
 2 {
 3     pDictElem root = _dictionary;
 4     vector<string> temp;
 5     splitWord(word, temp);
 6     
 7     int ret = search(temp, root);
 8     int size = temp.size();
 9     if(ret != size)
10     {
11         return -1;
12     }
13     return root->_wordId;
14 }

　　在这里对字进行分解，并定义一个临时的根结点，这样做的目的是为了保护private中的根结点，并且可以在多线程环境中互不干扰。

　　能够找到“新的根”后，就要对它进行遍历了。如果只有单一线程或进程来使用它，这里可以直接把resetPoint（原来的）修改一下，设置指定结点就可以了：

 1 void Dictionary::resetPoint(pDictElem pcur)
 2 {
 3     _pcur = pcur;
 4     if(_stackDict.size())
 5     {
 6         _stackDict.clear();
 7     }
 8     if(_stackWord.size())
 9     {
10         _stackWord.clear();
11     }
12     next();
13 }

　　如果是这样，那前面也完全不用修改。由于这个词典最后是要应用到 miniSearchEngine 中，于是我对遍历部分的函数进行了修改：

 1 void Dictionary::next()
 2 {
 3     next(_pcur, _stackWord, _stackDict);
 4 }
 5 
 6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 7 {
 8     while(pcur)
 9     {
10         nextWord(pcur, stackWord, stackDict);
11         if(!pcur || pcur->_wordId)
12         {
13             break;
14         }
15     }
16 }
17 
18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
19 {
20     if(pcur)
21     {
22         if(pcur->_words.size())
23         {
24             stackDict.push_back(pcur);
25             stackWord.push_back(pcur->_words.begin());
26             pcur = stackWord.back()->second;
27         }else{
28             ++(stackWord.back());
29         }
30         while(stackWord.back() == stackDict.back()->_words.end())
31         {
32             stackDict.pop_back();
33             stackWord.pop_back();
34             if(!stackDict.size())
35             {
36                 pcur = NULL;
37             }
38             ++(stackWord.back());
39         }
40         if(pcur)
41         {
42             pcur = stackWord.back()->second;
43         }    
44     }
45 }

　　next部分，改为传入参数，这样就可以在associate里定义临时的栈和智能指针等，遍历的时候与其它工作并没有任何关系。

　　同样地，getWord也要做相同的更改：

 1 string Dictionary::getCurWord()
 2 {
 3     return getCurWord(_stackWord);
 4 }
 5 
 6 string Dictionary::getCurWord(list<WordIt> & stackWord)
 7 {
 8     string temp;
 9     list<WordIt>::iterator it_word;    
10     it_word = stackWord.begin();    
11 
12     for(; it_word != stackWord.end(); ++it_word)
13     {
14         temp += (*it_word)->first;
15     }
16     return temp;
17 }

　　当然了，对外提供的接口都是不要传参的，其它的只能在内部使用，于是放入了private区。

　　终于可以开始写联想了0.0

 1 bool Dictionary::associate(const string & word, vector<string> & data)
 2 {
 3     pDictElem root = _dictionary;
 4     vector<string> temp;
 5     splitWord(word, temp);
 6     
 7     int ret = search(temp, root);
 8     int size = temp.size();
 9     if(ret != size)
10     {
11         return false;
12     }
13     
14     list<WordIt> stackWord;
15     list<pDictElem> stackDict;
16     next(root, stackWord, stackDict);
17     while(root)
18     {
19         string temp = getCurWord(stackWord);
20         data.push_back(temp);    
21         next(root, stackWord, stackDict);
22     }
23     
24     if(!data.size())
25     {
26         return false;
27     }
28     return true;
29 }

　　返回bool类型，可以方便地判断是否联想成功，即以传入的词做为前缀，能否找到剩余部分（词典里有存）。于是乎，一个渣渣型号的词典就做好啦~~~

Dictionary.cc

  1 #include "Dictionary.h"
  2 #include <iostream>
  3 #include <fstream>
  4 #include <string>
  5 #include <json/json.h>
  6 
  7 namespace ccx{
  8 
  9 using std::endl;
 10 using std::cout;
 11 using std::pair;
 12 using std::ofstream;
 13 using std::ifstream;
 14 
 15 Dictionary::Dictionary()
 16 : _dictionary(new DictElem)
 17 , _conf()
 18 {
 19     _dictionary->_wordId = 0;
 20     _pcur = _dictionary;
 21 }
 22 
 23 void Dictionary::splitWord(const string & word, vector<string> & characters)
 24 {
 25     int num = word.size();
 26     int i = 0;
 27     while(i < num)
 28     {
 29         int size = 1;
 30         if(word[i] & 0x80)
 31         {
 32             char temp = word[i];
 33             temp <<= 1;
 34             do{
 35                 temp <<= 1;
 36                 ++size;
 37             }while(temp & 0x80);
 38         }
 39         string subWord;
 40         subWord = word.substr(i, size);
 41         characters.push_back(subWord);
 42         i += size;
 43     }
 44 }
 45 
 46 void Dictionary::AddWord(const string & word, int wordId)
 47 {
 48     vector<string> characters;
 49     splitWord(word, characters);
 50     
 51     vector<string>::iterator it_char;
 52     it_char = characters.begin();    
 53     pDictElem root;
 54     root = _dictionary;
 55     for(; it_char != characters.end(); ++it_char)
 56     {
 57         WordIt it_word;
 58         it_word = root->_words.find(*it_char);
 59         
 60         if(it_word == root->_words.end())
 61         {
 62             pair<string, pDictElem> temp;
 63             temp.first = *it_char;
 64             pDictElem dictemp(new DictElem);
 65             dictemp->_word = *it_char;
 66             dictemp->_wordId = 0;
 67             temp.second = dictemp;
 68             root->_words.insert(temp);
 69             root = dictemp;
 70         }else{
 71             root = it_word->second;
 72         }
 73     }
 74     if(!root->_wordId)
 75     {
 76         root->_wordId = wordId;
 77     }
 78 }
 79 
 80 void Dictionary::push(const string & word)
 81 {
 82     ++(_dictionary->_wordId);
 83     AddWord(word, _dictionary->_wordId);
 84 }
 85 
 86 void Dictionary::push(vector<string> & words)
 87 {
 88     int size = words.size();
 89     for(int i = 0; i < size; ++i)
 90     {
 91         push(words[i]);
 92     }
 93 }
 94 
 95 int Dictionary::search(const string & word)
 96 {
 97     pDictElem root = _dictionary;
 98     vector<string> temp;
 99     splitWord(word, temp);
100     
101     int ret = search(temp, root);
102     int size = temp.size();
103     if(ret != size)
104     {
105         return -1;
106     }
107     return root->_wordId;
108 }
109 
110 int Dictionary::search(vector<string> & characters, pDictElem & root)
111 {
112     vector<string>::iterator it_char;
113     it_char = characters.begin();    
114     root = _dictionary;
115     int i = 0;
116     for(; it_char != characters.end(); ++it_char, ++i)
117     {
118         WordIt it_word;
119         it_word = root->_words.find(*it_char);
120         
121         if(it_word == root->_words.end())
122         {
123             break;
124         }else{
125             root = it_word->second;
126         }
127     }
128     return i;
129 }
130 
131 bool Dictionary::associate(const string & word, vector<string> & data)
132 {
133     pDictElem root = _dictionary;
134     vector<string> temp;
135     splitWord(word, temp);
136     
137     int ret = search(temp, root);
138     int size = temp.size();
139     if(ret != size)
140     {
141         return false;
142     }
143     
144     list<WordIt> stackWord;
145     list<pDictElem> stackDict;
146     next(root, stackWord, stackDict);
147     while(root)
148     {
149         string temp = getCurWord(stackWord);
150         data.push_back(temp);    
151         next(root, stackWord, stackDict);
152     }
153     
154     if(!data.size())
155     {
156         return false;
157     }
158     return true;
159 }
160 
161 //遍历用
162 
163 void Dictionary::resetPoint(pDictElem pcur)
164 {
165     _pcur = pcur;
166     if(_stackDict.size())
167     {
168         _stackDict.clear();
169     }
170     if(_stackWord.size())
171     {
172         _stackWord.clear();
173     }
174     next();
175 }
176 
177 void Dictionary::resetIt()
178 {
179     resetPoint(_dictionary);
180 }
181 
182 void Dictionary::next()
183 {
184     next(_pcur, _stackWord, _stackDict);
185 }
186 
187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
188 {
189     while(pcur)
190     {
191         nextWord(pcur, stackWord, stackDict);
192         if(!pcur || pcur->_wordId)
193         {
194             break;
195         }
196     }
197 }
198 
199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
200 {
201     if(pcur)
202     {
203         if(pcur->_words.size())
204         {
205             stackDict.push_back(pcur);
206             stackWord.push_back(pcur->_words.begin());
207             pcur = stackWord.back()->second;
208         }else{
209             ++(stackWord.back());
210         }
211         while(stackWord.back() == stackDict.back()->_words.end())
212         {
213             stackDict.pop_back();
214             stackWord.pop_back();
215             if(!stackDict.size())
216             {
217                 pcur = NULL;
218             }
219             ++(stackWord.back());
220         }
221         if(pcur)
222         {
223             pcur = stackWord.back()->second;
224         }    
225     }
226 }
227 
228 string Dictionary::getCurChar()
229 {
230     return _pcur->_word;
231 }
232 
233 int Dictionary::getCurWordId()
234 {
235     return _pcur->_wordId;
236 }
237 
238 string Dictionary::getCurWord()
239 {
240     return getCurWord(_stackWord);
241 }
242 
243 string Dictionary::getCurWord(list<WordIt> & stackWord)
244 {
245     string temp;
246     list<WordIt>::iterator it_word;    
247     it_word = stackWord.begin();    
248 
249     for(; it_word != stackWord.end(); ++it_word)
250     {
251         temp += (*it_word)->first;
252     }
253     return temp;
254 }
255 
256 bool Dictionary::isEnd()
257 {
258     return _pcur == NULL;
259 }
260 
261 void Dictionary::leading_in()//导入，失败没必要退出程序
262 {
263     ifstream ifs;
264     const char * path = _conf.getDictionaryPath().c_str();
265     ifs.open(path);
266     if(!ifs.good())
267     {
268         cout << "open Dictionary.json error(leading_in)" << endl;
269     }else{
270         Json::Value root;
271         Json::Reader reader;
272         
273         if(!reader.parse(ifs, root, false))
274         {
275             cout << "json read Dictionary.json error" << endl;
276         }else{
277             int size = root.size();
278             for(int i = 0; i < size; ++i)
279             {
280                 string word = root[i]["Word"].asString();
281                 int wordId = root[i]["WordId"].asInt();
282                 AddWord(word, wordId);
283                 ++(_dictionary->_wordId);
284             }
285         }
286     }
287 }
288 
289 void Dictionary::leading_out()
290 {
291     Json::Value root;
292     Json::FastWriter writer;
293 
294     resetIt();
295     
296     while(!isEnd())
297     {
298         Json::Value elem;
299         elem["Word"] = getCurWord();
300         elem["WordId"] = getCurWordId();
301         root.append(elem);
302         next();
303     }
304 
305     string words;
306     words = writer.write(root);
307     
308     ofstream ofs;
309     const char * path = _conf.getDictionaryPath().c_str();
310     ofs.open(path);
311     if(!ofs.good())
312     {
313         cout << "open Dictionary.json error(leading_out)" << endl;
314         ofs.open("Dictionary.tmp");
315         if(!ofs.good())
316         {
317             exit(EXIT_FAILURE);
318         }
319     }
320     
321     ofs << words;
322     ofs.close();
323 }
324 
325 }