[关闭]
@diyer22 2016-11-28T13:53:24.000000Z 字数 2768 阅读 2296

FP_tree 算法

未分类


  1. from __future__ import unicode_literals
  2. def log(d):
  3. show_len = 40
  4. if isinstance(d,list):
  5. for i in d:
  6. strr = str(i)
  7. if len(strr) > show_len:
  8. print strr[:show_len] + '...'
  9. else:
  10. print strr
  11. return
  12. if isinstance(d,dict):
  13. for i in d:
  14. strr = str(i)+' = '+str(d[i])
  15. if len(strr) > show_len:
  16. print strr[:show_len] + '...'
  17. else:
  18. print strr
  19. return
  20. print d
  21. MAX_L = 22
  22. def p_tree(t,level = 0):
  23. if level > MAX_L:
  24. return
  25. for i in t:
  26. if not isinstance(i,int):
  27. continue
  28. print level*'| '+'%d :%d'%(i, t[i]['c'])
  29. p_tree(t[i],level+1)
  30. min_sup = 0.01
  31. min_conf = 0.5
  32. path = './retail2.dat'
  33. f = open(path, 'r')
  34. db = f.readlines()
  35. f.close()
  36. db = map(lambda strr:map(lambda x:int(x), strr[:-1].split(' ')[:-1]), db)
  37. min_sup *= len(db)
  38. dic = {} # 记录it 频率
  39. dicc = {} # 记录所有频繁项集
  40. min_f = [] # 记录最小边缘频繁项集
  41. a_r = [] # 记录边缘强关系
  42. for items in db:
  43. for item in items:
  44. dic[item] = 1 if item not in dic else dic[item] + 1
  45. # 删除小于 min_sup 的item
  46. [dic.pop(k) if not dic[k]>= min_sup else None for k in dic.keys()]
  47. root = {}
  48. record = {}
  49. def add_tree(its, tree, count, record, dic):
  50. '''
  51. 将 count个 items 加入 tree
  52. 如果record_flag为True 则记录
  53. tree 的结构
  54. s self
  55. f father
  56. c count
  57. [\d] child
  58. '''
  59. its = filter(lambda x: x in dic, its)
  60. its.sort(lambda x, y: 1 if dic[x] < dic[y] else -1 )
  61. for it in its:
  62. if it not in tree:
  63. tree[it] = {}
  64. tree[it]['f'] = tree
  65. tree[it]['s'] = it
  66. record[it]=[tree[it]] if it not in record else record[it]+[tree[it]]
  67. tree = tree[it]
  68. tree['c'] = count if 'c' not in tree else tree['c'] + count
  69. for items in db:
  70. add_tree(items, root, 1, record, dic)
  71. #print 'dic',
  72. #log(dic)
  73. print '\nrecord',log('')
  74. #p_tree(root)
  75. fp_l = dic.items()
  76. fp_l.sort(lambda x, y: 1 if x[1]<y[1] else -1)
  77. def creat_its(t):
  78. l=[]
  79. while 1:
  80. t = t['f']
  81. if 'f' not in t:
  82. break
  83. l += [t['s']]
  84. return l[::-1]
  85. dicc ={}
  86. def is_singl_tree(t):
  87. def c_n(n, listt, l, b, r): # 递归生成全排列
  88. ''' n 剩余选择数量
  89. listt 全排列目标
  90. b 起始值
  91. l 临时记录
  92. '''
  93. if n == 0: # 生成关系并计算支持度
  94. r.append(tuple(l))
  95. return
  96. for i in range(b, len(listt)-n+1):
  97. l += [listt[i]]
  98. c_n(n-1, listt, l, i+1,r)
  99. l.pop()
  100. return
  101. def combination(its): # 对边缘频繁项集 递归生成排列
  102. n = len(its)
  103. l = []
  104. if n == 1:
  105. return [its]
  106. for lenth in range(1, n+1):
  107. c_n(lenth, its, [],0,l)
  108. print 'its',its,l
  109. return l
  110. l = []
  111. root = t
  112. while 1:
  113. tmp = filter(lambda x:isinstance(x, int), t.keys())
  114. n = len(tmp)
  115. if n != 1 and n != 0:
  116. # print l,p_tree(root)
  117. return None
  118. if n == 0 :
  119. if 'f' in t:
  120. # print 'tmp',l
  121. # print tmp,root.keys(),l,combination(l)
  122. return [list(i) for i in combination(l)]
  123. return None
  124. l += tmp
  125. t=t[tmp[0]]
  126. def header_to_db(fp_list, record):
  127. l = []
  128. # print 'fp',fp_list
  129. for it in fp_list[::-1]:
  130. it = it[0]
  131. db = []
  132. for t in record[it]:
  133. db += [(creat_its(t), t['c'])]
  134. # print 'db',db
  135. dic = {}
  136. for items in db:
  137. for item in items[0]:
  138. dic[item] = items[1] if item not in dic else dic[item] + items[1]
  139. # 删除小于 min_sup 的item
  140. [dic.pop(k) if not dic[k]>= min_sup else None for k in dic.keys()]
  141. t = {'s':'root'}
  142. header = {}
  143. for its in db:
  144. add_tree(its[0], t, its[1], header, dic)
  145. r = is_singl_tree(t)
  146. # print 'it',it,r,dic
  147. if r != None: # 若已经是 single 了 则不必继续递归
  148. # print 'r',r
  149. l += [i+[it] for i in r]+[[it]]
  150. # print 'add',[i+[it] for i in r]
  151. continue
  152. fp_l = dic.items()
  153. fp_l.sort(lambda x, y: 1 if x[1]<y[1] else -1)
  154. r = header_to_db(fp_l, header)
  155. l += [i+[it] for i in r]+[[it]]
  156. return l
  157. #for it in fp_l[::-1]:
  158. # f(it[0])
  159. r = header_to_db(fp_l, record)
  160. #print 'resoult',r
  161. print 'frequent itemset', len(r)
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注