@pastqing 2014-08-16T02:26:30.000000Z 字数 3132 阅读 1633

0809

c++

文本纠错项目

编辑距离算法：
L(i, j) = distance(x[1..i], y[1..j])
min = { L(i-1, j-1) x[i] = y[j]
L(i, j-1) +1 //increa
L(i-1, j) +1 //del
L(i-1, j-1) +1 //update

size_t editDistance(const char *str1, const char *str2)                 
{
    int memo[100][100];
    int len1 = strlen(str1);
    int len2 = strlen(str2);
    for( int i = 0; i < len1 +1 ; ++i)//str1为空时
        memo[i][0] = i ; 
    for(int i = 0; i< len2 +1 ; ++i)
        memo[0][i] = i;
    for(int i = 1; i <= len1; ++i )
    {   
        for( int j = 1; j <= len2; ++j )
        {   
            if( str1[i-1] == str2[j-1] )
            {   
                memo[i][j] = memo[i-1][j-1];
            }   
            else
            {   
                memo[i][j] = myMin(memo[i-1][j], memo[i][j-1], memo[i-1][j-1])+1;
            }   
        }   
    }   
    return memo[len1][len2];
}

使用索引，找出查询词每个单词取并集,剔除高频出现的字母

std::map<std::string, int > index_[26]; //建立26个首字母索引,放入单词和词频对
void TextQuery::make_index()
{ 
    //Fre_line 是一个词频和 set行号结构体
    //遍历将每个单词和词频做成pair放入索引
    for(map<string, Fre_line>::iterator it = word_.begin(); it != word_.end(); ++it)
    {
        pair<string, int> temp = make_pair(it->first, (it->second).cnt); //单词,词频
        insert_index(temp);
    }
}
void TextQuery::insert_index(std::pair<string, int> word)
{
    for(string::iterator it = word.first.begin(); it != word.first.end(); ++it)
    {
        index_[*it - 97].insert(word);//小写字母-97就是map对象字母表的下表
    }
}
//输入单词后: 1.排除高频字母
//2.对所在字母的索引进行查找,符合要求放入临时set中
//3.若set不空,将set中的结构体放入优先队列
 for(string::iterator ix = word.begin(); ix != word.end(); ++ix)
    {
        if(*ix == 'i' || *ix == 'e' || *ix == 'a')
            continue;
        for(map<string, int>::iterator iter = index_[*ix - 97].begin(); iter != index_[*ix - 97].end(); ++iter )
        {
            temp = edit_distance(word, iter->first);
            //cout << temp << endl;
            if(temp >= 3)
                continue;
            if(temp < 3)
            {
                candidate cand;
// bug     memset(&cand, 0, sizeof(cand));
                cand.word = iter->first;
                cand.distance = temp;
                cand.frequence = iter->second;
                //cout << cand.word << " " << cand.frequence << " " << cand.distance << endl;
                //
                //首先放入set集合,防止重复,再写入cand_
                set_words.insert(cand);
             }
        }
    }

使用cache提高程序效率

LRU的典型实现是hash map + doubly linked list，
双向链表用于存储数据结点，并且它是按照结点最近被使用的时间来存储的。如果一个结点被访问了，我们有理由相信它在接下来的一段时间被访问的概率要大于其它结点。于是，我们把它放到双向链表的头部。当我们往双向链表里插入一个结点，我们也有可能很快就会使用到它，同样把它插入到头部。我们使用这种方式不断地调整着双向链表，链表尾部的结点自然也就是最近一段时间，最久没有使用到的结点。那么，当我们的Cache满了，需要替换掉的就是双向链表中最后的那个结点(不是尾结点，头尾结点不存储实际内容)。

//cache类的简单实现
template<typename K, typename T>
class cacheThread
{
    public:
        explicit cacheThread(size_t size);
        ~cacheThread();
        T    getValue(K key);
        void putValue(K key, T data);
        Node<K, T>* getHead();
        Node<K, T>* getTail();
    private:
        size_t cacheSize_;
        std::unordered_map<K, Node<K, T>* >cacheMap_;//map中存的是key与>对应value的结点地址
        std::vector<Node<K, T>* >saveEntries_;
        Node<K, T> *head_;
        Node<K, T> *tail_;
        Node<K, T> *entries_;//链表结点类型
        void detach(Node<K, T>*);//分离结点
        void attach(Node<K, T>*);//头插法
};
/*cache中，我们就可以通过Put接口将数据插入双向链表中。 如果此时的Cache还
没满，那么我们将新结点插入到链表头部， 同时用哈希表保存结点的键值及结点>地址对。如果Cache已经满了， 我们就将链表中的最后一个结点(注意不是尾结点)的内容替换为新内容， 然后移动到头部，更新哈希表。*/
template<typename K, typename T>                                        
inline void cacheThread<K, T>::putValue(K key, T data)
{
    Node<K, T> *node = cacheMap_[key];
    if(node)//如果这个结点已经存在
    {
        detach(node);
        node->data_ = data;
        attach(node);
    }else
    {
        if(saveEntries_.empty())//所有结点都已经弹出，即cache满了
        {
            node = tail_->pre_;
            detach(node);
            cacheMap_.erase(node->key_);
        }else
        {
            node = saveEntries_.back();
            saveEntries_.pop_back();
        }
        node->key_  = key;
        node->data_ = data;
        cacheMap_[key] = node;
        attach(node);
    }
}
/*当我们通过键值来访问类型为T的数据时， 调用getValue函数，如果键值为key>的数据已经存在cache中，则返回该数据，并且将该数据的结点移动到头部，如不>在则调用putValue插入链表 */
template<typename K, typename T>
inline T cacheThread<K, T>::getValue(K key)
{
    Node<K, T>* node = cacheMap_[key];
    if( node )
    {
        detach(node);
        attach(node);
        return node->data_;
    }else
        return T();
}

0809

文本纠错项目

使用索引，找出查询词每个单词取并集,剔除高频出现的字母

使用cache提高程序效率

内容目录