[关闭]
@spiritnotes 2016-03-01T13:48:32.000000Z 字数 3650 阅读 2175

FP-Growth算法

机器学习


FP-growth算法

FP-growth算法是用于发现频繁项集的,其基本过程如下:

  1. 构建FP树
  2. 从FP树中挖掘频繁项集

FP树特点

一个元素可以在FP树中出现多次。FP树会存储项集的出现频率,而每个项集以路径的方式存储在树中。存在相似元素的集合会共享树的一部分。只有当集合完全不同时才会分叉。树节点上给出集合中单个元素以及其在序列中出现的次数,路径给出该序列的出现次数。

构建FP树

创建一个树node类
在构建前,需要按照绝对出现频率排序。从空集开始,向其不断添加频繁项集。过滤、排序后的事务依次添加到树中,如果树中已存在现有元素,则增加现有元素的值,如果现有元素不存在,则向树中添加一个分支。

挖掘频繁项集

基本步骤
从FP树中获得条件模式基
利用条件模式基,构建一个条件FP树
迭代执行上述步骤,直到树包含一个元素项为止

Python实现

github地址: https://github.com/spiritwiki/codes/tree/master/FP-Tree

  1. import logging
  2. import itertools
  3. import collections
  4. #logging.basicConfig(level=logging.DEBUG)
  5. class FrequentPatternTree():
  6. '''FP树'''
  7. class Node():
  8. '''节点定义,头表和树中都使用该节点'''
  9. def __init__(self, name=None, parent=None, value=0):
  10. self.name = name
  11. self.parent = parent
  12. self.value = value
  13. self.child = {}
  14. self.next = None
  15. def link(self, new_node):
  16. node = self
  17. while node.next:
  18. node = node.next
  19. node.next = new_node
  20. def increase(self, value=1):
  21. self.value += value
  22. def __eq__(self, other):
  23. return (self.name == other.name and self.value == other.value)
  24. def __repr__(self):
  25. return str(self.value)+(str(self.child) if self.child else '')
  26. def __eq__(self, other):
  27. '''定义FP树的相等,test中使用'''
  28. return self.root_node == other.root_node and self.header_table == other.header_table
  29. def __repr__(self):
  30. '''字符串表示'''
  31. return '{table:' + str(self.header_table) + ' ; tree:' + str(self.root_node) + '}'
  32. def _is_empty(self):
  33. return len(self.header_table) == 0
  34. @staticmethod
  35. def mine_trans(trans, support_ratio=0.5, support=None):
  36. '''针对事务集获取频繁集'''
  37. support = support or len(trans)*support_ratio
  38. tree = FrequentPatternTree().create_tree(trans, support=support)
  39. return tree.get_freq_sets(support)
  40. def create_tree(self, trans, support_ratio=0.5, support=None):
  41. '''创建FP树'''
  42. support = support or len(trans)*support_ratio
  43. header_table = self._create_header_table(trans, support)
  44. root_node = FrequentPatternTree.Node(None, None)
  45. for tran in trans:
  46. logging.debug('before transform tran: {}'.format(tran))
  47. tran = [i for i in tran if i in header_table]
  48. if not tran:
  49. continue
  50. tran.sort(key=lambda i:(header_table[i].value,i), reverse=True)
  51. logging.debug('after transform tran: {}'.format(tran))
  52. self._add_item_fptree(tran, root_node, header_table)
  53. logging.debug('after add item tree: {}'.format(root_node))
  54. self.root_node = root_node
  55. self.header_table = header_table
  56. return self
  57. def _add_item_fptree(self, tran, node, header_table):
  58. '''添加一个事务到FP树中'''
  59. if not tran: return
  60. item = tran[0]
  61. if item not in node.child:
  62. new_node = FrequentPatternTree.Node(item, node)
  63. node.child[item] = new_node
  64. header_table[item].link(new_node)
  65. node.child[item].increase()
  66. self._add_item_fptree(tran[1:], node.child[item], header_table)
  67. def _create_header_table(self, trans, support):
  68. '''创建头表'''
  69. ret = collections.defaultdict(int)
  70. for tran in trans:
  71. for i in tran:
  72. ret[i] += 1
  73. filtered = {key:FrequentPatternTree.Node(key,value=value) for key,value in ret.items() if value >= support}
  74. return filtered
  75. def get_freq_sets(self, support):
  76. '''在创建的fp树上获取频繁集'''
  77. freq_sets = []
  78. self._find_freq_sets_byprefix(support, [], freq_sets)
  79. return freq_sets
  80. def _find_freq_sets_byprefix(self, support, pre_fix, freq_sets):
  81. '''通过构建条件FP树获取频繁集'''
  82. items = [i[0] for i in sorted(self.header_table.items(), key=lambda i:(i[1].value,i)) if i[1].value >= support]
  83. for item in items:
  84. new_freq_set = pre_fix.copy()
  85. new_freq_set.append(item)
  86. freq_sets.append(new_freq_set)
  87. condition_bases = self._find_prefix_path(item)
  88. new_tree = FrequentPatternTree().create_tree(condition_bases, support=support)
  89. if not new_tree._is_empty():
  90. new_tree._find_freq_sets_byprefix(support, new_freq_set, freq_sets)
  91. def _find_prefix_path(self, item):
  92. '''获取条件路径'''
  93. condition_path = []
  94. node = self.header_table[item].next
  95. while node:
  96. path, curr_node = [], node
  97. while True:
  98. curr_node = curr_node.parent
  99. if curr_node.name is None:
  100. break
  101. path.append(curr_node.name)
  102. if path:
  103. condition_path += [path]*node.value
  104. node = node.next
  105. return condition_path
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注