@hainingwyx 2016-11-16T06:49:02Z

A Programmer's Guide to Data Mining -- Reading Notes

Data Mining, Python



Preface

The source code and data used in the book can be found at: http://guidetodatamining.com/

The theory in this book is fairly simple, there are few errors, and there is plenty of hands-on practice; if you write out every piece of code yourself you will gain a lot. In short: a good book for getting started.

Reposting is welcome; please credit the source. If you spot problems, corrections are welcome.

Collaborative Filtering

Criteria for judging user similarity: Manhattan distance, Euclidean distance, Minkowski distance.

  1. # Manhattan.py
  2. users = {
  3. "Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
  4. "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,
  5. "Vampire Weekend": 2.0},
  6. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
  7. "Phoenix": 2.0,"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
  8. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
  9. "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
  10. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
  11. "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
  12. "Vampire Weekend": 2.0},
  13. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
  14. "The Strokes": 4.0, "Vampire Weekend": 1.0},
  15. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
  16. "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
  17. "Vampire Weekend": 4.0},
  18. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
  19. "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
  20. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
  21. "Slightly Stoopid": 2.5, "The Strokes": 3.0}
  22.         }
  23. def manhattan(rating1, rating2):
  24.     """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries
  25.        of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
  26.     distance = 0
  27.     commonRatings = False
  28.     for key in rating1:
  29.         if key in rating2:
  30.             distance += abs(rating1[key] - rating2[key])
  31.             commonRatings = True
  32.     if commonRatings:
  33.         return distance
  34.     else:
  35.         return -1 #Indicates no ratings in common
  36. def computeNearestNeighbor(username, users):
  37.     """creates a sorted list of users based on their distance to username"""
  38.     distances = []
  39.     for user in users:
  40.         if user != username:
  41.             distance = manhattan(users[user], users[username])
  42.             distances.append((distance, user))
  43.     # sort based on distance -- closest first
  44.     distances.sort()
  45.     return distances
  46. def recommend(username, users):
  47.     """Give list of recommendations"""
  48.     # first find nearest neighbor
  49.     nearest = computeNearestNeighbor(username, users)[0][1]
  50.     print(nearest)
  51.     recommendations = []
  52.     # now find bands neighbor rated that user didn't
  53.     neighborRatings = users[nearest]
  54.     userRatings = users[username]
  55.     for artist in neighborRatings:
  56.         if not artist in userRatings:
  57.             recommendations.append((artist, neighborRatings[artist]))
  58.     # using the fn sorted for variety - sort is more efficient
  59.     return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)
  1. # -*- coding: utf-8 -*-
  2. from math import sqrt
  3. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
  4. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
  5. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
  6. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
  7. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
  8. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
  9. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
  10. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
  11. }
  12. # Minkowski distance
  13. def minkowski(rating1,rating2,r):
  14. distance=0
  15. commonRatings=False
  16. for key in rating1:
  17. if key in rating2:
  18. distance += pow(abs(rating1[key]-rating2[key]),r)
  19. commonRatings=True
  20. distance = pow(distance,1.0/r)
  21. if commonRatings:
  22. return distance
  23. else:
  24. return -1 #Indicates no ratings in common
  25. def computeNearestNeighbor(username, users):
  26. """creates a sorted list of users based on their distance to username"""
  27. distances = []
  28. for user in users:
  29. if user != username:
  30. distance = minkowski(users[user], users[username],3)
  31. distances.append((distance, user))
  32. # sort based on distance -- closest first
  33. distances.sort()
  34. return distances
  35. def recommend(username, users):
  36. """Give list of recommendations"""
  37. # first find nearest neighbor
  38. nearest = computeNearestNeighbor(username, users)[0][1]
  39. print(nearest)
  40. recommendations = []
  41. # now find bands neighbor rated that user didn't
  42. neighborRatings = users[nearest]
  43. userRatings = users[username]
  44. for artist in neighborRatings:
  45. if not artist in userRatings:
  46. recommendations.append((artist, neighborRatings[artist]))
  47. # using the fn sorted for variety - sort is more efficient
  48. return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)

However, two users may have exactly the same tastes while their ratings differ by a constant offset (grade inflation), which distance measures do not handle well.

Pearson correlation coefficient:

  1. # Pearson.py
  2. from math import sqrt
  3. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
  4. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
  5. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
  6. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
  7. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
  8. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
  9. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
  10. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
  11. }
  12. # for simplicity, the approximate formula is used here
  13. def pearson(rating1,rating2):
  14. sum_xy=0
  15. sum_x=0
  16. sum_y=0
  17. sum_x2=0
  18. sum_y2=0
  19. n=0
  20. for key in rating1:
  21. if key in rating2:
  22. n += 1
  23. x = rating1[key]
  24. y = rating2[key]
  25. sum_xy += x*y
  26. sum_x += x
  27. sum_y += y
  28. sum_x2 += x**2
  29. sum_y2 += y**2
  30. denominator = sqrt(sum_x2-(sum_x**2)/n)*sqrt(sum_y2-(sum_y**2)/n)
  31. if denominator == 0:
  32. return 0
  33. else:
  34. return (sum_xy-(sum_x*sum_y)/n)/denominator
  35. def cos_like(rating1,rating2):
  36. innerProd=0
  37. vector_x=0
  38. vector_y=0
  39. for key in rating1:
  40. if key in rating2:
  41. x=rating1[key]
  42. y=rating2[key]
  43. innerProd += x*y
  44. vector_x += x**2
  45. vector_y += y**2
  46. denominator = sqrt(vector_x) * sqrt(vector_y)
  47. if denominator == 0:
  48. return 0
  49. else:
  50. return innerProd / denominator

Cosine similarity:


Summary: if the data is dense, use Euclidean distance; if the data is sparse, use cosine similarity; if users rate on different scales, use the Pearson correlation coefficient. But if recommendations are based on a single user, that user's quirks get recommended along with everything else.

k-nearest neighbors: use the k most similar users to determine the recommendation; the right k depends on the application. The Pearson coefficient can be used as each neighbor's influence weight.

  1. # A dictionary of movie critics and their ratings of a small
  2. # set of movies
  3. critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
  4. 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
  5. 'The Night Listener': 3.0},
  6. 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
  7. 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
  8. 'You, Me and Dupree': 3.5},
  9. 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
  10. 'Superman Returns': 3.5, 'The Night Listener': 4.0},
  11. 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
  12. 'The Night Listener': 4.5, 'Superman Returns': 4.0,
  13. 'You, Me and Dupree': 2.5},
  14. 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
  15. 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
  16. 'You, Me and Dupree': 2.0},
  17. 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
  18. 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
  19. 'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
  20. from math import sqrt
  21. # Returns a distance-based similarity score for person1 and person2
  22. def sim_distance(prefs,person1,person2):
  23. # Get the list of shared_items
  24. si={}
  25. for item in prefs[person1]:
  26. if item in prefs[person2]: si[item]=1
  27. # if they have no ratings in common, return 0
  28. if len(si)==0: return 0
  29. # Add up the squares of all the differences
  30. sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)
  31. for item in prefs[person1] if item in prefs[person2]])
  32. return 1/(1+sum_of_squares)
  33. # Returns the Pearson correlation coefficient for p1 and p2
  34. def sim_pearson(prefs,p1,p2):
  35. # Get the list of mutually rated items
  36. si={}
  37. for item in prefs[p1]:
  38. if item in prefs[p2]: si[item]=1
  39. # if they are no ratings in common, return 0
  40. if len(si)==0: return 0
  41. # Sum calculations
  42. n=len(si)
  43. # Sums of all the preferences
  44. sum1=sum([prefs[p1][it] for it in si])
  45. sum2=sum([prefs[p2][it] for it in si])
  46. # Sums of the squares
  47. sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
  48. sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
  49. # Sum of the products
  50. pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
  51. # Calculate r (Pearson score)
  52. num=pSum-(sum1*sum2/n)
  53. den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  54. if den==0: return 0
  55. r=num/den
  56. return r
  57. # Returns the best matches for person from the prefs dictionary.
  58. # Number of results and similarity function are optional params.
  59. def topMatches(prefs,person,n=5,similarity=sim_pearson):
  60. scores=[(similarity(prefs,person,other),other)
  61. for other in prefs if other!=person]
  62. scores.sort()
  63. scores.reverse()
  64. return scores[0:n]
  65. # Gets recommendations for a person by using a weighted average
  66. # of every other user's rankings
  67. def getRecommendations(prefs,person,similarity=sim_pearson):
  68. totals={}
  69. simSums={}
  70. for other in prefs:
  71. # don't compare me to myself
  72. if other==person: continue
  73. sim=similarity(prefs,person,other)
  74. # ignore scores of zero or lower
  75. if sim<=0: continue
  76. for item in prefs[other]:
  77. # only score movies I haven't seen yet
  78. if item not in prefs[person] or prefs[person][item]==0:
  79. # Similarity * Score
  80. totals.setdefault(item,0)
  81. totals[item]+=prefs[other][item]*sim
  82. # Sum of similarities
  83. simSums.setdefault(item,0)
  84. simSums[item]+=sim
  85. # Create the normalized list
  86. rankings=[(total/simSums[item],item) for item,total in totals.items()]
  87. # Return the sorted list
  88. rankings.sort()
  89. rankings.reverse()
  90. return rankings
  91. def transformPrefs(prefs):
  92. result={}
  93. for person in prefs:
  94. for item in prefs[person]:
  95. result.setdefault(item,{})
  96. # Flip item and person
  97. result[item][person]=prefs[person][item]
  98. return result
  99. def calculateSimilarItems(prefs,n=10):
  100. # Create a dictionary of items showing which other items they
  101. # are most similar to.
  102. result={}
  103. # Invert the preference matrix to be item-centric
  104. itemPrefs=transformPrefs(prefs)
  105. c=0
  106. for item in itemPrefs:
  107. # Status updates for large datasets
  108. c+=1
  109. if c%100==0: print("%d / %d" % (c,len(itemPrefs)))
  110. # Find the most similar items to this one
  111. scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
  112. result[item]=scores
  113. return result
  114. def getRecommendedItems(prefs,itemMatch,user):
  115. userRatings=prefs[user]
  116. scores={}
  117. totalSim={}
  118. # Loop over items rated by this user
  119. for (item,rating) in userRatings.items( ):
  120. # Loop over items similar to this one
  121. for (similarity,item2) in itemMatch[item]:
  122. # Ignore if this user has already rated this item
  123. if item2 in userRatings: continue
  124. # Weighted sum of rating times similarity
  125. scores.setdefault(item2,0)
  126. scores[item2]+=similarity*rating
  127. # Sum of all the similarities
  128. totalSim.setdefault(item2,0)
  129. totalSim[item2]+=similarity
  130. # Divide each total score by total weighting to get an average
  131. rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
  132. # Return the rankings from highest to lowest
  133. rankings.sort( )
  134. rankings.reverse( )
  135. return rankings
  136. def loadMovieLens(path=r'C:\Users\WangYixin\Desktop\PCI_Code\PCI_Code Folder\chapter2\data'):
  137. # Get movie titles
  138. movies={}
  139. for line in open(path+'/u.item'):
  140. (id,title)=line.split('|')[0:2]
  141. movies[id]=title
  142. # Load data
  143. prefs={}
  144. for line in open(path+'/u.data'):
  145. (user,movieid,rating,ts)=line.split('\t')
  146. prefs.setdefault(user,{})
  147. prefs[user][movies[movieid]]=float(rating)
  148. return prefs
  1. # -*- coding: utf-8 -*-
  2. # recommender class
  3. import codecs
  4. from math import sqrt
  5. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
  6. "Norah Jones": 4.5, "Phoenix": 5.0,
  7. "Slightly Stoopid": 1.5,
  8. "The Strokes": 2.5, "Vampire Weekend": 2.0},
  9. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
  10. "Deadmau5": 4.0, "Phoenix": 2.0,
  11. "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
  12. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
  13. "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
  14. "Slightly Stoopid": 1.0},
  15. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
  16. "Deadmau5": 4.5, "Phoenix": 3.0,
  17. "Slightly Stoopid": 4.5, "The Strokes": 4.0,
  18. "Vampire Weekend": 2.0},
  19. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
  20. "Norah Jones": 4.0, "The Strokes": 4.0,
  21. "Vampire Weekend": 1.0},
  22. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
  23. "Norah Jones": 5.0, "Phoenix": 5.0,
  24. "Slightly Stoopid": 4.5, "The Strokes": 4.0,
  25. "Vampire Weekend": 4.0},
  26. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
  27. "Norah Jones": 3.0, "Phoenix": 5.0,
  28. "Slightly Stoopid": 4.0, "The Strokes": 5.0},
  29. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
  30. "Phoenix": 4.0, "Slightly Stoopid": 2.5,
  31. "The Strokes": 3.0}
  32. }
  33. class recommender:
  34. def __init__(self, data, k=1, metric='pearson', n=5):
  35. """ initialize recommender
  36. currently, if data is dictionary the recommender is initialized
  37. to it.
  38. For all other data types of data, no initialization occurs
  39. k is the k value for k nearest neighbor
  40. metric is which distance formula to use
  41. n is the maximum number of recommendations to make"""
  42. self.k = k
  43. self.n = n
  44. self.username2id = {}
  45. self.userid2name = {}
  46. self.productid2name = {}
  47. # for some reason I want to save the name of the metric
  48. self.metric = metric
  49. if self.metric == 'pearson':
  50. self.fn = self.pearson
  51. #
  52. # if data is dictionary set recommender data to it
  53. #
  54. if type(data).__name__ == 'dict':
  55. self.data = data
  56. def convertProductID2name(self, id):
  57. """Given product id number return product name"""
  58. if id in self.productid2name:
  59. return self.productid2name[id]
  60. else:
  61. return id
  62. def userRatings(self, id, n):
  63. """Return n top ratings for user with id"""
  64. print ("Ratings for " + self.userid2name[id])
  65. ratings = self.data[id]
  66. print(len(ratings))
  67. ratings = list(ratings.items())
  68. ratings = [(self.convertProductID2name(k), v)
  69. for (k, v) in ratings]
  70. # finally sort and return
  71. ratings.sort(key=lambda artistTuple: artistTuple[1],
  72. reverse = True)
  73. ratings = ratings[:n]
  74. for rating in ratings:
  75. print("%s\t%i" % (rating[0], rating[1]))
  76. def loadBookDB(self, path=''):
  77. """loads the BX book dataset. Path is where the BX files are
  78. located"""
  79. self.data = {}
  80. i = 0
  81. #
  82. # First load book ratings into self.data
  83. #
  84. f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')
  85. f.readline() #read the title
  86. for line in f:
  87. i += 1
  88. #separate line into fields
  89. fields = line.split(';') # still with ""
  90. user = fields[0].strip('"') #delete “ in the fields
  91. book = fields[1].strip('"')
  92. rating = int(fields[2].strip().strip('"'))
  93. if user in self.data:
  94. currentRatings = self.data[user]
  95. else:
  96. currentRatings = {}
  97. currentRatings[book] = rating
  98. self.data[user] = currentRatings
  99. #line = f.readline()
  100. f.close()
  101. #
  102. # Now load books into self.productid2name
  103. # Books contains isbn, title, and author among other fields
  104. #
  105. f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')
  106. for line in f:
  107. i += 1
  108. #separate line into fields
  109. fields = line.split(';')
  110. isbn = fields[0].strip('"')
  111. title = fields[1].strip('"')
  112. author = fields[2].strip().strip('"')
  113. title = title + ' by ' + author
  114. self.productid2name[isbn] = title
  115. f.close()
  116. #
  117. # Now load user info into both self.userid2name and
  118. # self.username2id
  119. #
  120. f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')
  121. for line in f:
  122. i += 1
  123. #print(line)
  124. #separate line into fields
  125. fields = line.split(';')
  126. userid = fields[0].strip('"')
  127. location = fields[1].strip('"')
  128. if len(fields) > 3:
  129. age = fields[2].strip().strip('"')
  130. else:
  131. age = 'NULL'
  132. if age != 'NULL':
  133. value = location + ' (age: ' + age + ')'
  134. else:
  135. value = location
  136. self.userid2name[userid] = value
  137. self.username2id[location] = userid
  138. f.close()
  139. print(i)
  140. def pearson(self, rating1, rating2):
  141. sum_xy = 0
  142. sum_x = 0
  143. sum_y = 0
  144. sum_x2 = 0
  145. sum_y2 = 0
  146. n = 0
  147. for key in rating1:
  148. if key in rating2:
  149. n += 1
  150. x = rating1[key]
  151. y = rating2[key]
  152. sum_xy += x * y
  153. sum_x += x
  154. sum_y += y
  155. sum_x2 += pow(x, 2)
  156. sum_y2 += pow(y, 2)
  157. if n == 0:
  158. return 0
  159. # now compute denominator
  160. denominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)
  161. * sqrt(sum_y2 - pow(sum_y, 2) / n))
  162. if denominator == 0:
  163. return 0
  164. else:
  165. return (sum_xy - (sum_x * sum_y) / n) / denominator
  166. def computeNearestNeighbor(self, username):
  167. """creates a sorted list of users based on their distance to
  168. username"""
  169. distances = []
  170. for instance in self.data:
  171. if instance != username:
  172. distance = self.fn(self.data[username],
  173. self.data[instance])
  174. distances.append((instance, distance))
  175. # sort based on distance -- closest first
  176. distances.sort(key=lambda artistTuple: artistTuple[1],
  177. reverse=True)
  178. return distances
  179. def recommend(self, user):
  180. """Give list of recommendations"""
  181. recommendations = {}
  182. # first get list of users ordered by nearness
  183. nearest = self.computeNearestNeighbor(user)
  184. # now get the ratings for the user
  185. userRatings = self.data[user]
  186. # determine the total distance
  187. totalDistance = 0.0
  188. for i in range(self.k):
  189. totalDistance += nearest[i][1]
  190. # now iterate through the k nearest neighbors
  191. # accumulating their ratings
  192. for i in range(self.k):
  193. # compute slice of pie
  194. weight = nearest[i][1] / totalDistance
  195. # get the name of the person
  196. name = nearest[i][0]
  197. # get the ratings for this person
  198. neighborRatings = self.data[name]
  199. # get the name of the person
  200. # now find bands neighbor rated that user didn't
  201. for artist in neighborRatings:
  202. if not artist in userRatings:
  203. if artist not in recommendations:
  204. recommendations[artist] = (neighborRatings[artist]
  205. * weight)
  206. else:
  207. recommendations[artist] = (recommendations[artist]
  208. + neighborRatings[artist]
  209. * weight)
  210. # now make list from dictionary
  211. recommendations = list(recommendations.items())
  212. recommendations = [(self.convertProductID2name(k), v)
  213. for (k, v) in recommendations]
  214. # finally sort and return
  215. recommendations.sort(key=lambda artistTuple: artistTuple[1],
  216. reverse = True)
  217. # Return the first n items
  218. return recommendations[:self.n]
  219. ############test code############
  220. #r = recommender(users)
  221. #r.recommend('Jordyn')
  222. #r.recommend('Hailey')
  223. #r.loadBookDB('BX-CSV-Dump/')
  224. #r.recommend('171118')
  225. #r.userRatings('171118', 5)

Collaborative Filtering -- Item-Based Filtering

Explicit ratings: the user explicitly provides a rating, e.g. YouTube's thumbs-up / thumbs-down buttons.

Implicit ratings: inferred from behavior, such as click trails on a website.

Neighbor-based (user-based) recommenders require an enormous amount of computation, so they suffer from latency, as well as from data sparsity. They are also called memory-based collaborative filtering, because all the ratings must be kept around in order to make recommendations.

Item-based filtering: find the most similar items ahead of time, then combine those similarities with the user's ratings to generate recommendations. It is also called model-based collaborative filtering, because instead of storing every rating it builds a model that represents the similarity between items.

To offset grade inflation, use the adjusted cosine similarity:

$$s(i,j) = \frac{\sum_{u \in U}(R_{u,i} - \bar{R}_u)(R_{u,j} - \bar{R}_u)}{\sqrt{\sum_{u \in U}(R_{u,i} - \bar{R}_u)^2}\,\sqrt{\sum_{u \in U}(R_{u,j} - \bar{R}_u)^2}}$$

where $U$ is the set of users who have rated both $i$ and $j$, $R_{u,i}$ is user $u$'s rating of item $i$, and $\bar{R}_u$ is the average of user $u$'s ratings over all items. Computing this for every pair of items yields the item-item similarity matrix.

  1. users3 = {"David": {"Imagine Dragons": 3, "Daft Punk": 5,
  2. "Lorde": 4, "Fall Out Boy": 1},
  3. "Matt": {"Imagine Dragons": 3, "Daft Punk": 4,
  4. "Lorde": 4, "Fall Out Boy": 1},
  5. "Ben": {"Kacey Musgraves": 4, "Imagine Dragons": 3,
  6. "Lorde": 3, "Fall Out Boy": 1},
  7. "Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4,
  8. "Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1},
  9. "Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4,
  10. "Daft Punk": 5, "Fall Out Boy": 3}}
  11. def computeSimilarity(band1, band2, userRatings):
  12. averages = {}
  13. for (key, ratings) in userRatings.items():
  14. averages[key] = (float(sum(ratings.values()))
  15. / len(ratings.values()))
  16. num = 0 # numerator
  17. dem1 = 0 # first half of denominator
  18. dem2 = 0
  19. for (user, ratings) in userRatings.items():
  20. if band1 in ratings and band2 in ratings:
  21. avg = averages[user]
  22. num += (ratings[band1] - avg) * (ratings[band2] - avg)
  23. dem1 += (ratings[band1] - avg)**2
  24. dem2 += (ratings[band2] - avg)**2
  25. return num / (sqrt(dem1) * sqrt(dem2))
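A possible usage sketch: assuming the `users3` dictionary and the `computeSimilarity` function above (with its indentation restored and `from math import sqrt` in scope), the item-to-item similarity matrix can be built by looping over every pair of bands:

```python
from math import sqrt  # computeSimilarity above uses sqrt

# every band that appears anywhere in users3
bands = sorted({band for ratings in users3.values() for band in ratings})

# item-to-item similarity matrix as a nested dictionary
simMatrix = {}
for band1 in bands:
    simMatrix[band1] = {}
    for band2 in bands:
        if band1 != band2:
            simMatrix[band1][band2] = computeSimilarity(band1, band2, users3)

print(simMatrix["Kacey Musgraves"]["Daft Punk"])
```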

Prediction from the similarity matrix:

$$p(u,i) = \frac{\sum_{j \in N} S_{i,j} \times NR_{u,j}}{\sum_{j \in N} |S_{i,j}|}$$

where $p(u,i)$ is the predicted rating of user $u$ for item $i$,

$N$ is the set of items rated by $u$ that are similar to $i$,

$S_{i,j}$ is the similarity between $i$ and $j$,

and $NR_{u,j}$ is $u$'s rating of $j$, which should lie in $[-1, 1]$; ratings on another scale need a linear transformation first:

$$NR_{u,j} = \frac{2(R_{u,j} - Min_R) - (Max_R - Min_R)}{Max_R - Min_R}$$

The predicted value is then mapped back to the original rating scale:

$$R_{u,j} = \frac{(NR_{u,j} + 1)(Max_R - Min_R)}{2} + Min_R$$
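A minimal sketch of that prediction, assuming a 1-5 star scale and an item-item similarity matrix such as the `simMatrix` built above; the helper names here are illustrative, not from the book's code:

```python
def normalize(rating, min_r=1, max_r=5):
    # map a rating on [min_r, max_r] onto [-1, 1]
    return (2.0 * (rating - min_r) - (max_r - min_r)) / (max_r - min_r)

def denormalize(nr, min_r=1, max_r=5):
    # map a normalized rating on [-1, 1] back onto [min_r, max_r]
    return ((nr + 1) * (max_r - min_r)) / 2.0 + min_r

def predictRating(userRatings, simMatrix, item, min_r=1, max_r=5):
    """p(u,i): similarity-weighted average of the user's normalized ratings
    of the items similar to `item` that the user has already rated."""
    numerator = 0.0
    denominator = 0.0
    for other, rating in userRatings.items():
        if other != item and other in simMatrix.get(item, {}):
            sim = simMatrix[item][other]
            numerator += sim * normalize(rating, min_r, max_r)
            denominator += abs(sim)
    if denominator == 0:
        return None  # no usable similar items
    return denormalize(numerator / denominator, min_r, max_r)

# e.g. predictRating(users3["David"], simMatrix, "Kacey Musgraves")
```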

Slope One Algorithm

  1. Compute the deviations

    The average deviation of item $i$ with respect to item $j$ is

    $$dev_{i,j} = \sum_{u \in S_{i,j}(X)} \frac{u_i - u_j}{card(S_{i,j}(X))}$$

    where $card(S_{i,j}(X))$ is the number of elements in $S_{i,j}(X)$, $X$ is the set of all ratings, and $S_{i,j}(X)$ is the set of users who have rated both $i$ and $j$.

    1. def computeDeviations(self):
    2. # for each person in the data:
    3. # get their ratings
    4. for ratings in self.data.values(): # data:users2, ratings:{song:value, , }
    5. # for each item & rating in that set of ratings:
    6. for (item, rating) in ratings.items():
    7. self.frequencies.setdefault(item, {}) #key is song
    8. self.deviations.setdefault(item, {})
    9. # for each item2 & rating2 in that set of ratings:
    10. for (item2, rating2) in ratings.items():
    11. if item != item2:
    12. # add the difference between the ratings to our
    13. # computation
    14. self.frequencies[item].setdefault(item2, 0)
    15. self.deviations[item].setdefault(item2, 0.0)
    16. # frequemcies is card
    17. self.frequencies[item][item2] += 1
    18. # diviations is the sum of dev of diff users
    19. #value of complex dic is dev
    20. self.deviations[item][item2] += rating - rating2
    21. for (item, ratings) in self.deviations.items():
    22. for item2 in ratings:
    23. ratings[item2] /= self.frequencies[item][item2]
    24. # test code for ComputeDeviations(self)
    25. #r = recommender(users2)
    26. #r.computeDeviations()
    27. #r.deviations

  2. Weighted Slope One prediction

    $$p^{wS1}(u)_j = \frac{\sum_{i \in S(u)-\{j\}} (dev_{j,i} + u_i)\, c_{j,i}}{\sum_{i \in S(u)-\{j\}} c_{j,i}}$$

    where $p^{wS1}(u)_j$ is the prediction the weighted Slope One algorithm gives for user $u$'s rating of item $j$, and $c_{j,i} = card(S_{j,i}(X))$.

    1. def slopeOneRecommendations(self, userRatings):
    2. recommendations = {}
    3. frequencies = {}
    4. # for every item and rating in the user's recommendations
    5. for (userItem, userRating) in userRatings.items(): # userItem :i
    6. # for every item in our dataset that the user didn't rate
    7. for (diffItem, diffRatings) in self.deviations.items(): #diffItem : j
    8. if diffItem not in userRatings and \
    9. userItem in self.deviations[diffItem]:
    10. freq = self.frequencies[diffItem][userItem] #freq:c_ji
    11. # 如果键不存在于字典中,将会添加键并将值设为默认值。
    12. recommendations.setdefault(diffItem, 0.0)
    13. frequencies.setdefault(diffItem, 0)
    14. # add to the running sum representing the numerator
    15. # of the formula
    16. recommendations[diffItem] += (diffRatings[userItem] +
    17. userRating) * freq
    18. # keep a running sum of the frequency of diffitem
    19. frequencies[diffItem] += freq
    20. #p(u)j list
    21. recommendations = [(self.convertProductID2name(k),
    22. v / frequencies[k])
    23. for (k, v) in recommendations.items()]
    24. # finally sort and return
    25. recommendations.sort(key=lambda artistTuple: artistTuple[1],
    26. reverse = True)
    27. # I am only going to return the first 50 recommendations
    28. return recommendations[:50]
    29. # test code for SlopeOneRecommendations
    30. #r = recommender(users2)
    31. #r.computeDeviations()
    32. #g = users2['Ben']
    33. #r.slopeOneRecommendations(g)
    1. def loadMovieLens(self, path=''):
    2. self.data = {}
    3. #
    4. # first load movie ratings
    5. #
    6. i = 0
    7. #
    8. # First load book ratings into self.data
    9. #
    10. #f = codecs.open(path + "u.data", 'r', 'utf8')
    11. f = codecs.open(path + "u.data", 'r', 'ascii')
    12. # f = open(path + "u.data")
    13. for line in f:
    14. i += 1
    15. #separate line into fields
    16. fields = line.split('\t')
    17. user = fields[0]
    18. movie = fields[1]
    19. rating = int(fields[2].strip().strip('"'))
    20. if user in self.data:
    21. currentRatings = self.data[user]
    22. else:
    23. currentRatings = {}
    24. currentRatings[movie] = rating
    25. self.data[user] = currentRatings
    26. f.close()
    27. #
    28. # Now load movie into self.productid2name
    29. # the file u.item contains movie id, title, release date among
    30. # other fields
    31. #
    32. #f = codecs.open(path + "u.item", 'r', 'utf8')
    33. f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore')
    34. #f = open(path + "u.item")
    35. for line in f:
    36. i += 1
    37. #separate line into fields
    38. fields = line.split('|')
    39. mid = fields[0].strip()
    40. title = fields[1].strip()
    41. self.productid2name[mid] = title
    42. f.close()
    43. #
    44. # Now load user info into both self.userid2name
    45. # and self.username2id
    46. #
    47. #f = codecs.open(path + "u.user", 'r', 'utf8')
    48. f = open(path + "u.user")
    49. for line in f:
    50. i += 1
    51. fields = line.split('|')
    52. userid = fields[0].strip('"')
    53. self.userid2name[userid] = line
    54. self.username2id[line] = userid
    55. f.close()
    56. print(i)
    57. # test code
    58. #r = recommender(0)
    59. #r.loadMovieLens('ml-100k/')
    60. #r.computeDeviations()
    61. #r.slopeOneRecommendations(r.data['1'])
    62. #r.slopeOneRecommendations(r.data['25'])

Content-Based Filtering and Classification -- Filtering on Item Attributes

A newly added item has never been rated, so it will never be recommended. Pandora is built on a project called the Music Genome Project.

When a data mining method computes the distance between two objects from feature values and the features are on different scales, normalization is required. Normalization usually uses the mean and standard deviation, but that approach can be distorted by outliers, so a modified version is introduced: replace the mean with the median $\tilde{x}$ and the standard deviation with the absolute standard deviation, $asd = \frac{1}{n}\sum_i |x_i - \tilde{x}|$.

  1. # 计算中位数和绝对标准差
  2. def getMedian(self, alist):
  3. """return median of alist"""
  4. if alist == []:
  5. return []
  6. blist = sorted(alist)
  7. length = len(alist)
  8. if length % 2 == 1:
  9. # length of list is odd so return middle element
  10. return blist[int(((length + 1) / 2) - 1)]
  11. else:
  12. # length of list is even so compute midpoint
  13. v1 = blist[int(length / 2)]
  14. v2 =blist[(int(length / 2) - 1)]
  15. return (v1 + v2) / 2.0
  16. def getAbsoluteStandardDeviation(self, alist, median):
  17. """given alist and median return absolute standard deviation"""
  18. sum = 0
  19. for item in alist:
  20. sum += abs(item - median)
  21. return sum / len(alist)
  22. def unitTest():
  23. list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54]
  24. list2 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 68]
  25. list3 = [69]
  26. list4 = [69, 72]
  27. classifier = Classifier('data/athletesTrainingSet.txt')
  28. m1 = classifier.getMedian(list1)
  29. m2 = classifier.getMedian(list2)
  30. m3 = classifier.getMedian(list3)
  31. m4 = classifier.getMedian(list4)
  32. asd1 = classifier.getAbsoluteStandardDeviation(list1, m1)
  33. asd2 = classifier.getAbsoluteStandardDeviation(list2, m2)
  34. asd3 = classifier.getAbsoluteStandardDeviation(list3, m3)
  35. asd4 = classifier.getAbsoluteStandardDeviation(list4, m4)
  36. assert(round(m1, 3) == 65)
  37. assert(round(m2, 3) == 66)
  38. assert(round(m3, 3) == 69)
  39. assert(round(m4, 3) == 70.5)
  40. assert(round(asd1, 3) == 8)
  41. assert(round(asd2, 3) == 7.5)
  42. assert(round(asd3, 3) == 0)
  43. assert(round(asd4, 3) == 1.5)
  44. print("getMedian and getAbsoluteStandardDeviation work correctly")

Using assert statements to test software components is a common technique. Splitting every part of a product into a piece of implementation code plus test code for that implementation is important.

  1. # 归一化
  2. def normalizeColumn(self, columnNumber):
  3. """given a column number, normalize that column in self.data"""
  4. # first extract values to list, v is vector, clounm is 0/1,col is a list
  5. col = [v[1][columnNumber] for v in self.data]
  6. median = self.getMedian(col)
  7. asd = self.getAbsoluteStandardDeviation(col, median)
  8. #print("Median: %f ASD = %f" % (median, asd))
  9. self.medianAndDeviation.append((median, asd))
  10. for v in self.data:
  11. v[1][columnNumber] = (v[1][columnNumber] - median) / asd

Algorithm Evaluation and kNN

10-fold cross validation: split the dataset into 10 buckets, train on 9 of them and test on the remaining one, repeating the process 10 times.

Leave-one-out: n-fold cross validation with n equal to the number of samples. The result of 10-fold validation is somewhat random rather than a fixed value, because it depends on how the data happens to be split; leave-one-out avoids that, but its drawback is the heavy computational cost. When splitting, stratified sampling matters: each bucket should contain a representative mix of the classes.

Confusion matrix: each row is the true class of the test samples and each column is the class predicted by the classifier. It reveals where the classifier performs well and where it goes wrong.
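A small sketch of building such a confusion matrix from (actual, predicted) pairs; the labels and results below are invented for illustration:

```python
def confusionMatrix(pairs, labels):
    """rows = actual class, columns = predicted class"""
    matrix = {actual: {predicted: 0 for predicted in labels} for actual in labels}
    for actual, predicted in pairs:
        matrix[actual][predicted] += 1
    return matrix

# hypothetical results from one cross-validation run
pairs = [("gymnast", "gymnast"), ("gymnast", "marathoner"),
         ("basketball", "basketball"), ("marathoner", "marathoner"),
         ("marathoner", "gymnast"), ("basketball", "basketball")]
matrix = confusionMatrix(pairs, ["gymnast", "basketball", "marathoner"])
for actual in matrix:
    print(actual, matrix[actual])
```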

  1. # divide data into 10 buckets
  2. import random
  3. def buckets(filename, bucketName, separator, classColumn):
  4. """the original data is in the file named filename
  5. bucketName is the prefix for all the bucket names
  6. separator is the character that divides the columns
  7. (for ex., a tab or comma and classColumn is the column
  8. that indicates the class"""
  9. # put the data in 10 buckets
  10. numberOfBuckets = 10
  11. data = {}
  12. # first read in the data and divide by category
  13. with open(filename) as f:
  14. lines = f.readlines()
  15. for line in lines:
  16. if separator != '\t':
  17. line = line.replace(separator, '\t')
  18. # first get the category
  19. category = line.split()[classColumn]
  20. data.setdefault(category, []) #set the value for dic data
  21. data[category].append(line) #all the information
  22. # initialize the buckets [[], [], ...]
  23. buckets = []
  24. for i in range(numberOfBuckets):
  25. buckets.append([])
  26. # now for each category put the data into the buckets
  27. for k in data.keys():
  28. #randomize order of instances for each class
  29. #data[k] is a list of line
  30. random.shuffle(data[k])
  31. bNum = 0
  32. # divide into buckets
  33. for item in data[k]:
  34. buckets[bNum].append(item)
  35. bNum = (bNum + 1) % numberOfBuckets
  36. # write to file
  37. for bNum in range(numberOfBuckets):
  38. f = open("%s-%02i" % ('tmp/'+bucketName, bNum + 1), 'w')
  39. for item in buckets[bNum]:
  40. f.write(item)
  41. f.close()
  42. # example of how to use this code
  43. buckets("data/mpgData.txt", 'mpgData',',',0)

Classifier evaluation: the Kappa statistic compares a classifier's performance with that of a random classifier:

$$\kappa = \frac{P(c) - P(r)}{1 - P(r)}$$

where $P(c)$ is the accuracy of the actual classifier and $P(r)$ is the accuracy of a random classifier.

| Kappa range | Performance |
| --- | --- |
| < 0 | worse than random |
| 0.01-0.2 | slight agreement |
| 0.21-0.4 | fair agreement |
| 0.41-0.6 | moderate agreement |
| 0.61-0.8 | substantial agreement |
| 0.81-1 | near-perfect agreement |
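A short sketch of computing Kappa from a confusion matrix; the counts below are invented:

```python
def kappa(matrix):
    """matrix[actual][predicted] holds counts; returns the Kappa statistic"""
    labels = list(matrix.keys())
    total = sum(matrix[a][p] for a in labels for p in labels)
    # P(c): accuracy of the real classifier (diagonal counts / total)
    p_c = sum(matrix[a][a] for a in labels) / float(total)
    # P(r): expected accuracy of a random classifier with the same marginals
    p_r = 0.0
    for label in labels:
        rowTotal = sum(matrix[label][p] for p in labels)
        colTotal = sum(matrix[a][label] for a in labels)
        p_r += (rowTotal / float(total)) * (colTotal / float(total))
    return (p_c - p_r) / (1 - p_r)

# invented counts: rows = actual, columns = predicted
matrix = {"pos": {"pos": 35, "neg": 15},
          "neg": {"pos": 10, "neg": 40}}
print(kappa(matrix))   # 0.5 -> moderate agreement
```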

kNN: with a single nearest neighbor, one idiosyncratic sample can cause misclassification. The remedy is to consult k neighbors; the closer a neighbor is, the larger its influence, and the influence factor can be taken as the inverse of the distance.

  1. def knn(self, itemVector):
  2. """returns the predicted class of itemVector using k
  3. Nearest Neighbors"""
  4. # changed from min to heapq.nsmallest to get the
  5. # k closest neighbors
  6. neighbors = heapq.nsmallest(self.k,
  7. [(self.manhattan(itemVector, item[1]), item)
  8. for item in self.data])
  9. # each neighbor gets a vote
  10. results = {}
  11. for neighbor in neighbors:
  12. theClass = neighbor[1][0]
  13. results.setdefault(theClass, 0)
  14. results[theClass] += 1
  15. resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)
  16. #get all the classes that have the maximum votes
  17. maxVotes = resultList[0][0]
  18. possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]
  19. # randomly select one of the classes that received the max votes
  20. answer = random.choice(possibleAnswers)
  21. return( answer)
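The `knn` method above uses plain majority voting. A sketch of the inverse-distance weighting mentioned here, intended as a method on the same classifier (assuming the same `self.k`, `self.manhattan` and `self.data` attributes), might look like this:

```python
import heapq

def weightedKnn(self, itemVector):
    """predict a class by letting each of the k nearest neighbors vote
    with weight 1 / distance, so closer neighbors count for more"""
    neighbors = heapq.nsmallest(self.k,
                                [(self.manhattan(itemVector, item[1]), item)
                                 for item in self.data])
    votes = {}
    for distance, item in neighbors:
        theClass = item[0]
        # guard against a zero distance (identical vectors)
        weight = 1.0 / distance if distance > 0 else 1.0
        votes[theClass] = votes.get(theClass, 0) + weight
    # return the class with the largest total weight
    return max(votes.items(), key=lambda pair: pair[1])[0]
```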

For engineering work, the more data you have, the better the algorithm performs; for a research paper you still need to come up with an algorithm that delivers some small performance improvement.

Probability and Naive Bayes

Characteristic: it classifies and also reports a probability.

Prior probability: P(h)

Posterior probability / conditional probability: P(h|d)
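A tiny worked example of Bayes' rule, $P(h|d) = P(d|h)P(h)/P(d)$, with made-up numbers:

```python
# hypothetical priors and likelihoods for two hypotheses
prior = {"buys_i500": 0.4, "buys_i100": 0.6}
likelihood = {"buys_i500": 0.3,   # P(d | buys_i500), d = observed evidence
              "buys_i100": 0.8}   # P(d | buys_i100)

# P(d) by the law of total probability
p_d = sum(prior[h] * likelihood[h] for h in prior)

# posterior P(h | d) for each hypothesis
posterior = {h: prior[h] * likelihood[h] / p_d for h in prior}
print(posterior)   # the hypothesis with the larger posterior wins
```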

  1. # training
  2. class Classifier:
  3. def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
  4. """ a classifier will be built from files with the bucketPrefix
  5. excluding the file with textBucketNumber. dataFormat is a string that
  6. describes how to interpret each line of the data files. For example,
  7. for the iHealth data the format is:
  8. "attr attr attr attr class"
  9. """
  10. total = 0
  11. classes = {}
  12. counts = {}
  13. # reading the data in from the file
  14. self.format = dataFormat.strip().split('\t')
  15. self.prior = {}
  16. self.conditional = {}
  17. # for each of the buckets numbered 1 through 10:
  18. for i in range(1, 11):
  19. # if it is not the bucket we should ignore, read in the data
  20. if i != testBucketNumber:
  21. filename = "%s-%02i" % (bucketPrefix, i)
  22. f = open(filename)
  23. lines = f.readlines()
  24. f.close()
  25. for line in lines:
  26. fields = line.strip().split('\t')
  27. ignore = []
  28. vector = []
  29. for i in range(len(fields)):
  30. if self.format[i] == 'num':
  31. vector.append(float(fields[i])) #vector!!
  32. elif self.format[i] == 'attr':
  33. vector.append(fields[i])
  34. elif self.format[i] == 'comment':
  35. ignore.append(fields[i])
  36. elif self.format[i] == 'class':
  37. category = fields[i]
  38. # now process this instance
  39. total += 1
  40. classes.setdefault(category, 0) #字典:分类类别计数
  41. counts.setdefault(category, {}) #复合字典:每类的每列的具体计数
  42. classes[category] += 1
  43. # now process each attribute of the instance
  44. col = 0
  45. for columnValue in vector:
  46. col += 1
  47. counts[category].setdefault(col, {})
  48. counts[category][col].setdefault(columnValue, 0)
  49. counts[category][col][columnValue] += 1
  50. # ok done counting. now compute probabilities
  51. # first prior probabilities p(h)
  52. for (category, count) in classes.items():
  53. self.prior[category] = count / total#字典:先验概率
  54. # now compute conditional probabilities p(D|h)
  55. for (category, columns) in counts.items():
  56. self.conditional.setdefault(category, {})
  57. for (col, valueCounts) in columns.items():
  58. self.conditional[category].setdefault(col, {})
  59. for (attrValue, count) in valueCounts.items():
  60. self.conditional[category][col][attrValue] = (
  61. count / classes[category]) #复合字典:每类的每个属性的条件概率
  62. self.tmp = counts #应该暂时没有用
  1. # classification
  2. def classify(self, itemVector):
  3. """Return class we think item Vector is in"""
  4. results = []
  5. for (category, prior) in self.prior.items():
  6. prob = prior
  7. col = 1
  8. for attrValue in itemVector:
  9. if not attrValue in self.conditional[category][col]:
  10. # we did not find any instances of this attribute value
  11. # occurring with this category so prob = 0
  12. prob = 0
  13. else:
  14. prob = prob * self.conditional[category][col][attrValue]
  15. col += 1
  16. results.append((prob, category))
  17. # return the category with the highest probability
  18. return(max(results)[1])
  1. # test code
  2. c = Classifier("iHealth/i", 10,"attr\tattr\tattr\tattr\tclass")
  3. print(c.classify(['health', 'moderate', 'moderate', 'yes']))

Problem: if any single probability is 0 it dominates the whole Bayesian computation, even when the other independent conditional probabilities are close to 1. In addition, probabilities estimated from a sample tend to underestimate the true probabilities.

Fix: replace

$$P(x|y) = \frac{n_c}{n}$$

with

$$P(x|y) = \frac{n_c + mp}{n + m}$$

where $n$ is the total number of instances of class $y$, $n_c$ is the number of instances of class $y$ that have value $x$, $m$ is the equivalent sample size (commonly chosen as the number of values the attribute can take), and $p$ is the prior estimate of the probability, usually assumed uniform: $p = 1/(\text{number of attribute values})$.
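A small sketch of this m-estimate; the counts in the example call are invented:

```python
def mEstimate(n_c, n, numValues, m=None):
    """smoothed conditional probability (n_c + m*p) / (n + m)
    n_c       -- count of class-y instances with attribute value x
    n         -- total count of class-y instances
    numValues -- number of possible values the attribute can take"""
    if m is None:
        m = numValues            # a common choice for the equivalent sample size
    p = 1.0 / numValues          # uniform prior estimate
    return (n_c + m * p) / float(n + m)

# an attribute value never seen with this class no longer zeroes everything out
print(mEstimate(n_c=0, n=10, numValues=3))   # ~0.077 instead of 0
```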

When the data is continuous there are two options: discretize it into categories, or assume each attribute follows a Gaussian distribution and compute the probability density.

Sample standard deviation:

$$s = \sqrt{\frac{\sum_i (x_i - \bar{x})^2}{n - 1}}$$

For a sample, this formula is a better estimate of the population standard deviation than the population formula (which divides by $n$).
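A quick illustration of the difference between the two formulas (numbers invented):

```python
import math

data = [54, 72, 78, 49, 65]
mean = sum(data) / float(len(data))
pop_sd  = math.sqrt(sum((x - mean) ** 2 for x in data) / len(data))        # divide by n
samp_sd = math.sqrt(sum((x - mean) ** 2 for x in data) / (len(data) - 1))  # divide by n - 1
print(pop_sd, samp_sd)   # the sample standard deviation is slightly larger
```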

  1. # pdf计算实现
  2. def pdf(mean, ssd, x):
  3. """Probability Density Function computing P(x|y)
  4. input is the mean, sample standard deviation for all the items in y,
  5. and x."""
  6. ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))
  7. print (ePart)
  8. return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart
  12. # training with continuous data
  13. class Classifier:
  14. def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
  15. """ a classifier will be built from files with the bucketPrefix
  16. excluding the file with textBucketNumber. dataFormat is a string that
  17. describes how to interpret each line of the data files. For example,
  18. for the iHealth data the format is:
  19. "attr attr attr attr class"
  20. """
  21. total = 0
  22. classes = {}
  23. # counts used for attributes that are not numeric
  24. counts = {}
  25. # totals used for attributes that are numereric
  26. # we will use these to compute the mean and sample standard deviation for
  27. # each attribute - class pair.
  28. totals = {}
  29. numericValues = {}
  30. # reading the data in from the file
  31. self.format = dataFormat.strip().split('\t')
  32. #
  33. self.prior = {}
  34. self.conditional = {}
  35. # for each of the buckets numbered 1 through 10:
  36. for i in range(1, 11):
  37. # if it is not the bucket we should ignore, read in the data
  38. if i != testBucketNumber:
  39. filename = "%s-%02i" % (bucketPrefix, i)
  40. f = open(filename)
  41. lines = f.readlines()
  42. f.close()
  43. for line in lines:
  44. fields = line.strip().split('\t')
  45. ignore = []
  46. vector = []
  47. nums = []
  48. for i in range(len(fields)):
  49. if self.format[i] == 'num':
  50. nums.append(float(fields[i]))
  51. elif self.format[i] == 'attr':
  52. vector.append(fields[i])
  53. elif self.format[i] == 'comment':
  54. ignore.append(fields[i])
  55. elif self.format[i] == 'class':
  56. category = fields[i]
  57. # now process this instance
  58. total += 1
  59. classes.setdefault(category, 0)
  60. counts.setdefault(category, {})
  61. totals.setdefault(category, {})
  62. numericValues.setdefault(category, {})
  63. classes[category] += 1
  64. # now process each non-numeric attribute of the instance
  65. col = 0
  66. for columnValue in vector:
  67. col += 1
  68. counts[category].setdefault(col, {})
  69. counts[category][col].setdefault(columnValue, 0)
  70. counts[category][col][columnValue] += 1
  71. # process numeric attributes
  72. col = 0
  73. for columnValue in nums:
  74. col += 1
  75. totals[category].setdefault(col, 0)
  76. #totals[category][col].setdefault(columnValue, 0)
  77. totals[category][col] += columnValue
  78. numericValues[category].setdefault(col, [])
  79. numericValues[category][col].append(columnValue)
  80. #
  81. # ok done counting. now compute probabilities
  82. #
  83. # first prior probabilities p(h)
  84. #
  85. for (category, count) in classes.items():
  86. self.prior[category] = count / total
  87. #
  88. # now compute conditional probabilities p(h|D)
  89. #
  90. for (category, columns) in counts.items():
  91. self.conditional.setdefault(category, {})
  92. for (col, valueCounts) in columns.items():
  93. self.conditional[category].setdefault(col, {})
  94. for (attrValue, count) in valueCounts.items():
  95. self.conditional[category][col][attrValue] = (
  96. count / classes[category])
  97. self.tmp = counts
  98. #
  99. # now compute mean and sample standard deviation
  100. #
  101. self.means = {}
  102. self.totals = totals
  103. for (category, columns) in totals.items():
  104. self.means.setdefault(category, {})
  105. for (col, cTotal) in columns.items():
  106. self.means[category][col] = cTotal / classes[category]
  107. # standard deviation
  108. self.ssd = {}
  109. for (category, columns) in numericValues.items():
  110. self.ssd.setdefault(category, {})
  111. for (col, values) in columns.items():
  112. SumOfSquareDifferences = 0
  113. theMean = self.means[category][col]
  114. for value in values:
  115. SumOfSquareDifferences += (value - theMean)**2
  116. columns[col] = 0
  117. self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1))
  1. # classification with continuous data
  2. def classify(self, itemVector, numVector):
  3. """Return class we think item Vector is in"""
  4. results = []
  5. sqrt2pi = math.sqrt(2 * math.pi)
  6. for (category, prior) in self.prior.items():
  7. prob = prior
  8. col = 1
  9. for attrValue in itemVector:
  10. if not attrValue in self.conditional[category][col]:
  11. # we did not find any instances of this attribute value
  12. # occurring with this category so prob = 0
  13. prob = 0
  14. else:
  15. prob = prob * self.conditional[category][col][attrValue]
  16. col += 1
  17. col = 1
  18. for x in numVector:
  19. mean = self.means[category][col]
  20. ssd = self.ssd[category][col]
  21. ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
  22. prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)
  23. col += 1
  24. results.append((prob, category))
  25. # return the category with the highest probability
  26. #print(results)
  27. return(max(results)[1])

Comparing Bayes and kNN

In many real data mining problems the attributes are not independent, although sometimes independence can be assumed. Naive Bayes is called naive because it assumes the attributes are independent even when we know that assumption does not hold.

Naive Bayes and Text

Training phase

  1. Merge all documents labelled with the same hypothesis (class) into one text file.

  2. Count the total number of words $n$ in that file and build a vocabulary.

  3. For each word $w_k$ in the vocabulary, count how many times it occurs in the text, denoted $n_k$.

  4. For each word in the vocabulary (stop words removed), compute

    $$P(w_k|h) = \frac{n_k + 1}{n + |Vocabulary|}$$

  1. class BayesText:
  2. def __init__(self, trainingdir, stopwordlist):
  3. """This class implements a naive Bayes approach to text
  4. classification
  5. trainingdir is the training data. Each subdirectory of
  6. trainingdir is titled with the name of the classification
  7. category -- those subdirectories in turn contain the text
  8. files for that category.
  9. The stopwordlist is a list of words (one per line) will be
  10. removed before any counting takes place.
  11. """
  12. self.vocabulary = {}
  13. self.prob = {}
  14. self.totals = {}
  15. self.stopwords = {} #停用词字典
  16. f = open(stopwordlist)
  17. for line in f:
  18. self.stopwords[line.strip()] = 1
  19. f.close()
  20. categories = os.listdir(trainingdir)
  21. #filter out files that are not directories
  22. self.categories = [filename for filename in categories
  23. if os.path.isdir(trainingdir + filename)]
  24. print("Counting ...")
  25. for category in self.categories:
  26. print(' ' + category)
  27. # 计算当前类别的单词和单词数量,单词的总量
  28. (self.prob[category],
  29. self.totals[category]) = self.train(trainingdir, category)
  30. # I am going to eliminate any word in the 所有种类的单词库vocabulary
  31. # that doesn't occur at least 3 times
  32. toDelete = []
  33. for word in self.vocabulary:
  34. if self.vocabulary[word] < 3:
  35. # mark word for deletion
  36. # can't delete now because you can't delete
  37. # from a list you are currently iterating over
  38. toDelete.append(word)
  39. # now delete
  40. for word in toDelete:
  41. del self.vocabulary[word]
  42. # now compute probabilities
  43. vocabLength = len(self.vocabulary)
  44. print("Computing probabilities:")
  45. for category in self.categories:
  46. print(' ' + category)
  47. denominator = self.totals[category] + vocabLength
  48. for word in self.vocabulary:
  49. if word in self.prob[category]:
  50. count = self.prob[category][word]
  51. else:
  52. count = 1
  53. # 条件概率计算
  54. self.prob[category][word] = (float(count + 1)
  55. / denominator)
  56. print ("DONE TRAINING\n\n")
  57. # input:trainingdir训练文件的目录, category训练文件的种类
  58. # return: (counts, total) (当前文件的单词和单词数量,所有单词的数量)
  59. def train(self, trainingdir, category):
  60. """counts word occurrences for a particular category"""
  61. currentdir = trainingdir + category
  62. files = os.listdir(currentdir)
  63. counts = {}
  64. total = 0
  65. for file in files:
  66. #print(currentdir + '/' + file)
  67. f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
  68. for line in f:
  69. tokens = line.split()
  70. for token in tokens:
  71. # get rid of punctuation and lowercase token
  72. token = token.strip('\'".,?:-')
  73. token = token.lower()
  74. if token != '' and not token in self.stopwords:
  75. self.vocabulary.setdefault(token, 0)
  76. self.vocabulary[token] += 1#所有文档的单词和单词数量
  77. counts.setdefault(token, 0)
  78. counts[token] += 1#当前文件的单词和单词数量
  79. total += 1#所有单词的数量
  80. f.close()
  81. return(counts, total)
  82. # test code
  83. bT = BayesText(trainingDir, stoplistfile)
  84. bT.prob['rec.motorcycles']["god"]

Classification phase

Pick the class with the highest posterior:

$$h_{MAP} = \arg\max_{h} P(h) \prod_{i} P(w_i|h)$$

If the probabilities are extremely small, Python cannot compute the product; use logarithms instead and maximize $\log P(h) + \sum_i \log P(w_i|h)$.

Stop words: when stop words are just noise, removing them reduces the amount of processing and improves performance. In some situations the stop word list should be reconsidered; for example, sex offenders use words such as "me" and "you" much more often than the general population.
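A minimal sketch of that classification step, assuming a trained `BayesText`-style object exposing `self.categories`, `self.vocabulary` and `self.prob[category][word]` as above (it mirrors the structure of the classify method that appears in the 10-fold code further below):

```python
import codecs, math

def classify(self, filename):
    """score a document against every category using log probabilities
    (summing logs avoids the underflow of multiplying many tiny numbers)"""
    results = {category: 0.0 for category in self.categories}
    f = codecs.open(filename, 'r', 'iso8859-1')
    for line in f:
        for token in line.split():
            token = token.strip('\'".,?:-').lower()
            if token in self.vocabulary:
                for category in self.categories:
                    results[category] += math.log(self.prob[category][token])
    f.close()
    # return the category with the highest (least negative) log score
    return max(results.items(), key=lambda pair: pair[1])[0]
```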

  1. # test code
  2. bT.classify(testDir + 'rec.motorcycles/104673')

10-fold cross validation

  1. from __future__ import print_function
  2. import os, codecs, math
  3. class BayesText:
  4. # input:训练文件目录,停用词,忽略的文件子集
  5. def __init__(self, trainingdir, stopwordlist, ignoreBucket):
  6. """This class implements a naive Bayes approach to text
  7. classification
  8. trainingdir is the training data. Each subdirectory of
  9. trainingdir is titled with the name of the classification
  10. category -- those subdirectories in turn contain the text
  11. files for that category.
  12. The stopwordlist is a list of words (one per line) will be
  13. removed before any counting takes place.
  14. """
  15. self.vocabulary = {}
  16. self.prob = {}
  17. self.totals = {}
  18. self.stopwords = {}
  19. f = open(stopwordlist)
  20. for line in f:
  21. self.stopwords[line.strip()] = 1
  22. f.close()
  23. categories = os.listdir(trainingdir)
  24. #filter out files that are not directories,in this program, neg and pos
  25. self.categories = [filename for filename in categories
  26. if os.path.isdir(trainingdir + filename)]
  27. print("Counting ...")
  28. for category in self.categories:
  29. #print(' ' + category)
  30. (self.prob[category],
  31. self.totals[category]) = self.train(trainingdir, category,
  32. ignoreBucket)
  33. # I am going to eliminate any word in the vocabulary
  34. # that doesn't occur at least 3 times
  35. toDelete = []
  36. for word in self.vocabulary:
  37. if self.vocabulary[word] < 3:
  38. # mark word for deletion
  39. # can't delete now because you can't delete
  40. # from a list you are currently iterating over
  41. toDelete.append(word)
  42. # now delete
  43. for word in toDelete:
  44. del self.vocabulary[word]
  45. # now compute probabilities
  46. vocabLength = len(self.vocabulary)
  47. #print("Computing probabilities:")
  48. for category in self.categories:
  49. #print(' ' + category)
  50. denominator = self.totals[category] + vocabLength
  51. for word in self.vocabulary:
  52. if word in self.prob[category]:
  53. count = self.prob[category][word]
  54. else:
  55. count = 1
  56. self.prob[category][word] = (float(count + 1)
  57. / denominator)
  58. #print ("DONE TRAINING\n\n")
  59. def train(self, trainingdir, category, bucketNumberToIgnore):
  60. """counts word occurrences for a particular category"""
  61. ignore = "%i" % bucketNumberToIgnore
  62. currentdir = trainingdir + category
  63. directories = os.listdir(currentdir)
  64. counts = {}
  65. total = 0
  66. for directory in directories:
  67. if directory != ignore:
  68. currentBucket = trainingdir + category + "/" + directory
  69. files = os.listdir(currentBucket)
  70. #print(" " + currentBucket)
  71. for file in files:
  72. f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')
  73. for line in f:
  74. tokens = line.split()
  75. for token in tokens:
  76. # get rid of punctuation and lowercase token
  77. token = token.strip('\'".,?:-')
  78. token = token.lower()
  79. if token != '' and not token in self.stopwords:
  80. self.vocabulary.setdefault(token, 0)
  81. self.vocabulary[token] += 1
  82. counts.setdefault(token, 0)
  83. counts[token] += 1
  84. total += 1
  85. f.close()
  86. return(counts, total)
  87. def classify(self, filename):
  88. results = {}
  89. for category in self.categories:
  90. results[category] = 0
  91. f = codecs.open(filename, 'r', 'iso8859-1')
  92. for line in f:
  93. tokens = line.split()
  94. for token in tokens:
  95. #print(token)
  96. token = token.strip('\'".,?:-').lower()
  97. if token in self.vocabulary:
  98. for category in self.categories:
  99. if self.prob[category][token] == 0:
  100. print("%s %s" % (category, token))
  101. results[category] += math.log(
  102. self.prob[category][token])
  103. f.close()
  104. results = list(results.items())
  105. results.sort(key=lambda tuple: tuple[1], reverse = True)
  106. # for debugging I can change this to give me the entire list
  107. return results[0][0]
  108. # input: 测试文件的分类目录,当前类别, 忽略子集
  109. # return: 当前类别下的分类结果{0:12,1:23}
  110. def testCategory(self, direc, category, bucketNumber):
  111. results = {}
  112. directory = direc + ("%i/" % bucketNumber)
  113. #print("Testing " + directory)
  114. files = os.listdir(directory)
  115. total = 0
  116. #correct = 0
  117. for file in files:
  118. total += 1
  119. result = self.classify(directory + file)
  120. results.setdefault(result, 0)
  121. results[result] += 1
  122. #if result == category:
  123. # correct += 1
  124. return results
  125. # input: 测试文件目录, 忽略的子集文件
  126. # return: 所有类别的分类结果{1:{0:12,1:23},}
  127. def test(self, testdir, bucketNumber):
  128. """Test all files in the test directory--that directory is
  129. organized into subdirectories--each subdir is a classification
  130. category"""
  131. results = {}
  132. categories = os.listdir(testdir)
  133. #filter out files that are not directories
  134. categories = [filename for filename in categories if
  135. os.path.isdir(testdir + filename)]
  136. for category in categories:
  137. #print(".", end="")
  138. results[category] = self.testCategory(
  139. testdir + category + '/', category, bucketNumber)
  140. return results
  141. def tenfold(dataPrefix, stoplist):
  142. results = {}
  143. for i in range(0,10):
  144. bT = BayesText(dataPrefix, stoplist, i)
  145. r = bT.test(theDir, i)
  146. for (key, value) in r.items():
  147. results.setdefault(key, {})
  148. for (ckey, cvalue) in value.items():
  149. results[key].setdefault(ckey, 0)
  150. results[key][ckey] += cvalue
  151. categories = list(results.keys())
  152. categories.sort()
  153. print( "\n Classified as: ")
  154. header = " "
  155. subheader = " +"
  156. for category in categories:
  157. header += "% 2s " % category
  158. subheader += "-----+"
  159. print (header)
  160. print (subheader)
  161. total = 0.0
  162. correct = 0.0
  163. for category in categories:
  164. row = " %s |" % category
  165. for c2 in categories:
  166. if c2 in results[category]:
  167. count = results[category][c2]
  168. else:
  169. count = 0
  170. row += " %3i |" % count
  171. total += count
  172. if c2 == category:
  173. correct += count
  174. print(row)
  175. print(subheader)
  176. print("\n%5.3f percent correct" %((correct * 100) / total))
  177. print("total of %i instances" % total)
  178. # change these to match your directory structure
  179. prefixPath = "data/review_polarity/"
  180. theDir = prefixPath + "/txt_sentoken/"
  181. stoplistfile = prefixPath + "stopwords25.txt"
  182. tenfold(theDir, stoplistfile)

Clustering -- Discovering Groups

Hierarchical clustering: at each iteration merge the two most similar clusters, and repeat until only a single cluster remains. Different ways of measuring the distance between two clusters give different clustering methods and different merge sequences.

Regular queue: first in, first out.

Priority queue: elements are removed in order of an associated weight; the smallest value is removed first.

  1. # get items back from the queue, smallest value first
  2. from Queue import PriorityQueue
  3. singersQueue = PriorityQueue()
  4. singersQueue.put((16, 'Suzaka'))
  5. singersQueue.put((14, 'Yui'))
  6. singersQueue.put((15, 'Moa'))
  7. print singersQueue.get()
  8. print singersQueue.get()
  9. print singersQueue.get()
import math
from Queue import PriorityQueue

# relies on getMedian() and normalizeColumn(), defined in the K-means listing below

class hClusterer:
    """ this clusterer assumes that the first column of the data is a label
        not used in the clustering. The other columns contain numeric data"""

    def __init__(self, filename):
        file = open(filename)
        self.data = {}
        self.counter = 0
        self.queue = PriorityQueue()
        lines = file.readlines()
        file.close()
        header = lines[0].split(',')
        self.cols = len(header)
        self.data = [[] for i in range(len(header))]
        for line in lines[1:]:
            cells = line.split(',')
            toggle = 0
            # self.data looks like [['Border Collie',...], [20.0,...], [45.0,...]]
            for cell in range(self.cols):
                if toggle == 0:
                    self.data[cell].append(cells[cell])
                    toggle = 1
                else:
                    self.data[cell].append(float(cells[cell]))
        # now normalize number columns (that is, skip the first column)
        for i in range(1, self.cols):
            self.data[i] = normalizeColumn(self.data[i])

        ###
        ### I have read in the data and normalized the
        ### columns. Now for each element i in the data, I am going to
        ###   1. compute the Euclidean Distance from element i to all the
        ###      other elements. This data will be placed in neighbors,
        ###      which is a Python dictionary. Let's say i = 1, and I am
        ###      computing the distance to the neighbor j and let's say j
        ###      is 2. The neighbors dictionary for i will look like
        ###      {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
        ###   2. find the closest neighbor
        ###   3. place the element on a priority queue, called simply queue,
        ###      based on the distance to the nearest neighbor (and a counter
        ###      used to break ties).

        # now push distances on queue
        rows = len(self.data[0])
        for i in range(rows):
            minDistance = 99999
            nearestNeighbor = 0
            neighbors = {}  # stores, for every neighbour j, its (pair, distance)
            for j in range(rows):
                if i != j:
                    dist = self.distance(i, j)
                    if i < j:
                        pair = (i, j)
                    else:
                        pair = (j, i)
                    neighbors[j] = (pair, dist)
                    if dist < minDistance:
                        minDistance = dist
                        nearestNeighbor = j
            # create nearest Pair
            if i < nearestNeighbor:
                nearestPair = (i, nearestNeighbor)
            else:
                nearestPair = (nearestNeighbor, i)
            # put instance on priority queue
            self.queue.put((minDistance, self.counter,
                            [[self.data[0][i]], nearestPair, neighbors]))
            self.counter += 1  # counter records insertion order and breaks ties in the queue

    def distance(self, i, j):
        sumSquares = 0
        for k in range(1, self.cols):
            sumSquares += (self.data[k][i] - self.data[k][j])**2
        return math.sqrt(sumSquares)

    def cluster(self):
        done = False
        while not done:
            # pop the entry with the current smallest nearest-neighbour distance
            topOne = self.queue.get()
            nearestPair = topOne[2][1]
            if not self.queue.empty():
                nextOne = self.queue.get()
                nearPair = nextOne[2][1]  # with tied distances this may not be topOne's partner
                tmp = []
                ##
                ## I have just popped two elements off the queue,
                ## topOne and nextOne. I need to check whether nextOne
                ## is topOne's nearest neighbor and vice versa.
                ## If not, I will pop another element off the queue
                ## until I find topOne's nearest neighbor. That is what
                ## this while loop does.
                ##
                while nearPair != nearestPair:
                    tmp.append((nextOne[0], self.counter, nextOne[2]))
                    self.counter += 1
                    nextOne = self.queue.get()
                    nearPair = nextOne[2][1]
                ## this for loop pushes back the elements popped off
                ## while resolving the tie above.
                for item in tmp:
                    self.queue.put(item)

                if len(topOne[2][0]) == 1:
                    item1 = topOne[2][0][0]
                else:
                    item1 = topOne[2][0]
                if len(nextOne[2][0]) == 1:
                    item2 = nextOne[2][0][0]
                else:
                    item2 = nextOne[2][0]
                ## curCluster is, perhaps obviously, the new cluster
                ## which combines cluster item1 with cluster item2.
                curCluster = (item1, item2)

                ## Now I am doing two things. First, finding the nearest
                ## neighbor to this new cluster. Second, building a new
                ## neighbors list by merging the neighbors lists of item1
                ## and item2. If the distance between item1 and element 23
                ## is 2 and the distance between item2 and element 23 is 4,
                ## the distance between element 23 and the new cluster will
                ## be 2 (i.e., the shortest distance).
                minDistance = 99999
                nearestPair = ()
                merged = {}
                nNeighbors = nextOne[2][2]  # nextOne's full neighbour dictionary
                for (key, value) in topOne[2][2].items():
                    if key in nNeighbors:  # fails for every key only on the final merge
                        if nNeighbors[key][1] < value[1]:
                            dist = nNeighbors[key]
                        else:
                            dist = value
                        if dist[1] < minDistance:
                            minDistance = dist[1]
                            nearestPair = dist[0]  # the (i, j) index pair of that shortest link
                        merged[key] = dist
                if merged == {}:
                    return curCluster  # nested tuple of labels -- this is the dendrogram
                else:
                    self.queue.put((minDistance, self.counter,
                                    [curCluster, nearestPair, merged]))
                    self.counter += 1
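A minimal usage sketch (my own driver, not copied from the book). It assumes a data/dogs.csv file whose first column is a label and whose remaining columns are numeric, the same format the K-means example below uses; the complete file on the book's site also contains a helper for pretty-printing the returned tuple tree.

hg = hClusterer('data/dogs.csv')
tree = hg.cluster()
print tree   # nested tuples of breed labels, e.g. (('label1', 'label2'), ...)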

KMeans

Like hill climbing, this algorithm is not guaranteed to find the best possible partition into clusters. Because it starts from a randomly chosen set of centroids, it is only guaranteed to converge to a locally optimal clustering.

Sum of squared error (SSE): a measure of the quality of a set of clusters — the sum, over all points, of the squared distance from each point to the centroid of its cluster; smaller is better.
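In symbols, with $c_i$ the centroid of cluster $C_i$ and $\operatorname{dist}$ the distance function:

$$SSE = \sum_{i=1}^{k}\ \sum_{x \in C_i} \operatorname{dist}(c_i, x)^2$$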

import math
import random

"""
Implementation of the K-means algorithm
for the book "A Programmer's Guide to Data Mining"
http://www.guidetodatamining.com
"""


def getMedian(alist):
    """get median of list"""
    tmp = list(alist)
    tmp.sort()
    alen = len(tmp)
    if (alen % 2) == 1:
        return tmp[alen // 2]
    else:
        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2


def normalizeColumn(column):
    """normalize the values of a column using Modified Standard Score
    that is (each value - median) / (absolute standard deviation)"""
    median = getMedian(column)
    asd = sum([abs(x - median) for x in column]) / len(column)
    result = [(x - median) / asd for x in column]
    return result


class kClusterer:
    """ Implementation of kMeans Clustering
    This clusterer assumes that the first column of the data is a label
    not used in the clustering. The other columns contain numeric data
    """

    def __init__(self, filename, k):
        """ k is the number of clusters to make
        This init method:
           1. reads the data from the file named filename
           2. stores that data by column in self.data
           3. normalizes the data using Modified Standard Score
           4. randomly selects the initial centroids
           5. assigns points to clusters associated with those centroids
        """
        file = open(filename)
        self.data = {}
        self.k = k
        self.counter = 0
        self.iterationNumber = 0
        # used to keep track of % of points that change cluster membership
        # in an iteration
        self.pointsChanged = 0
        # Sum of Squared Error
        self.sse = 0
        #
        # read data from file
        #
        lines = file.readlines()
        file.close()
        header = lines[0].split(',')
        self.cols = len(header)
        self.data = [[] for i in range(len(header))]
        # we are storing the data by column.
        # For example, self.data[0] is the data from column 0.
        # self.data[0][10] is the column 0 value of item 10.
        for line in lines[1:]:
            cells = line.split(',')
            toggle = 0
            for cell in range(self.cols):
                if toggle == 0:
                    self.data[cell].append(cells[cell])
                    toggle = 1
                else:
                    self.data[cell].append(float(cells[cell]))
        self.datasize = len(self.data[1])
        self.memberOf = [-1 for x in range(len(self.data[1]))]
        #
        # now normalize number columns
        #
        for i in range(1, self.cols):
            self.data[i] = normalizeColumn(self.data[i])

        # select random centroids from existing points
        random.seed()
        # random.sample(seq, n) picks n distinct elements from seq at random
        self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
                          for r in random.sample(range(len(self.data[0])),
                                                 self.k)]
        self.assignPointsToCluster()

    def updateCentroids(self):
        """Using the points in the clusters, determine the centroid
        (mean point) of each cluster"""
        # number of members in each cluster (assumes no cluster ends up empty)
        members = [self.memberOf.count(i) for i in range(len(self.centroids))]
        # for each centroid, average every numeric column over its member points
        self.centroids = [[sum([self.data[k][i]
                                for i in range(len(self.data[0]))
                                if self.memberOf[i] == centroid]) / members[centroid]
                           for k in range(1, len(self.data))]
                          for centroid in range(len(self.centroids))]

    def assignPointToCluster(self, i):
        """ assign point to cluster based on distance from centroids"""
        minDistance = 999999
        clusterNum = -1
        for centroid in range(self.k):
            dist = self.euclideanDistance(i, centroid)
            if dist < minDistance:
                minDistance = dist
                clusterNum = centroid
        # here is where I will keep track of changing points
        # (on the first pass every point counts as changed)
        if clusterNum != self.memberOf[i]:
            self.pointsChanged += 1
        # add square of distance to running sum of squared error
        self.sse += minDistance**2
        return clusterNum

    def assignPointsToCluster(self):
        """ assign each data point to a cluster"""
        self.pointsChanged = 0
        self.sse = 0
        self.memberOf = [self.assignPointToCluster(i)
                         for i in range(len(self.data[1]))]

    def euclideanDistance(self, i, j):
        """ compute distance of point i from centroid j"""
        sumSquares = 0
        for k in range(1, self.cols):
            sumSquares += (self.data[k][i] - self.centroids[j][k-1])**2
        return math.sqrt(sumSquares)

    def kCluster(self):
        """the method that actually performs the clustering
        As you can see this method repeatedly
            updates the centroids by computing the mean point of each cluster
            re-assigns the points to clusters based on these new centroids
        until the number of points that change cluster membership is less than 1%.
        """
        done = False
        while not done:
            self.iterationNumber += 1
            self.updateCentroids()
            self.assignPointsToCluster()
            #
            # we are done if fewer than 1% of the points change clusters
            #
            if float(self.pointsChanged) / len(self.memberOf) < 0.01:
                done = True
        print("Final SSE: %f" % self.sse)

    def showMembers(self):
        """Display the results"""
        for centroid in range(len(self.centroids)):
            print ("\n\nClass %i\n========" % centroid)
            for name in [self.data[0][i] for i in range(len(self.data[0]))
                         if self.memberOf[i] == centroid]:
                print (name)

##
## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3
##
# change the path in the following to match where dogs.csv is on your machine
km = kClusterer('data/dogs.csv', 3)
km.kCluster()
km.showMembers()
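Because each run starts from random centroids and may settle into a different local optimum, a common work-around (my own sketch, not part of the book's listing) is to run the clusterer several times and keep the run with the smallest final SSE:

# run k-means several times and keep the partition with the lowest SSE
best = None
for run in range(10):
    candidate = kClusterer('data/dogs.csv', 3)
    candidate.kCluster()
    if best is None or candidate.sse < best.sse:
        best = candidate
best.showMembers()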

K-means++ addresses the weakness that a purely random choice of initial centroids can lead to a poor final clustering, by changing how those initial centroids are selected. The first centroid is still chosen at random, but each subsequent centroid is chosen with probability proportional to its distance from the centroids already picked, so the initial centroids tend to be spread far apart.

Steps for selecting the initial centroids (the note after the listing sketches how this replaces the random selection in kClusterer.__init__):

def selectInitialCentroids(self):
    """implement the k-means++ method of selecting
    the set of initial centroids"""
    centroids = []
    total = 0
    # first step is to select a random first centroid
    current = random.choice(range(len(self.data[0])))
    centroids.append(current)
    # loop to select the rest of the centroids, one at a time
    for i in range(0, self.k - 1):
        # for every point in the data find its distance to
        # the closest centroid
        weights = [self.distanceToClosestCentroid(x, centroids)
                   for x in range(len(self.data[0]))]
        total = sum(weights)
        # instead of raw distances, convert so the weights sum to 1
        weights = [x / total for x in weights]
        #
        # now roll virtual die
        num = random.random()
        total = 0
        x = -1
        # the roulette wheel simulation
        while total < num:
            x += 1
            total += weights[x]
        centroids.append(x)
    self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
                      for r in centroids]

def distanceToClosestCentroid(self, point, centroidList):
    result = self.eDistance(point, centroidList[0])
    for centroid in centroidList[1:]:
        distance = self.eDistance(point, centroid)
        if distance < result:
            result = distance
    return result

def eDistance(self, i, j):
    """ compute distance of point i from centroid j"""
    sumSquares = 0
    for k in range(1, self.cols):
        sumSquares += (self.data[k][i] - self.data[k][j])**2
    return math.sqrt(sumSquares)
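The snippet above does not show how it plugs into the clusterer, so here is a hypothetical wiring (my own, not from the book's listing): assuming the three methods are added to the kClusterer class, its __init__ simply calls selectInitialCentroids() instead of the random.sample() selection, and everything else stays as in the K-means listing.

# hypothetical wiring sketch: inside kClusterer.__init__, replace the
# random.sample() centroid selection with
#     self.selectInitialCentroids()
#     self.assignPointsToCluster()
# after which the clusterer is driven exactly as before:
kmpp = kClusterer('data/dogs.csv', 3)   # now initialised via k-means++
kmpp.kCluster()
kmpp.showMembers()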