@hainingwyx
2016-11-16T06:49:02.000000Z
Data Mining, Python
The source code and data used in the book can be found at http://guidetodatamining.com/.
The theory in this book is fairly simple, it contains few errors, and it offers plenty of hands-on practice; if you write out every piece of code yourself, you will get a lot out of it. In short: a good book for getting started.
Feel free to repost this article, but please credit the source; corrections are welcome.
Criteria for judging user similarity: Manhattan distance, Euclidean distance, and Minkowski distance.
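For two users x and y and the items k they have both rated, these distances can be written as follows (r = 1 gives the Manhattan distance, r = 2 the Euclidean distance):

$$d_{Manhattan}(x,y)=\sum_k |x_k-y_k|,\qquad d_{Euclid}(x,y)=\sqrt{\sum_k (x_k-y_k)^2},\qquad d_{Minkowski}(x,y)=\Big(\sum_k |x_k-y_k|^r\Big)^{1/r}$$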
```python
# Manhattan.py
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
                      "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,
                      "Vampire Weekend": 2.0},
         "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
                  "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
                  "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
                 "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
                    "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
                    "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                    "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
                 "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
                      "Slightly Stoopid": 2.5, "The Strokes": 3.0}}


def manhattan(rating1, rating2):
    """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries
       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
    distance = 0
    commonRatings = False
    for key in rating1:
        if key in rating2:
            distance += abs(rating1[key] - rating2[key])
            commonRatings = True
    if commonRatings:
        return distance
    else:
        return -1  # Indicates no ratings in common


def computeNearestNeighbor(username, users):
    """creates a sorted list of users based on their distance to username"""
    distances = []
    for user in users:
        if user != username:
            distance = manhattan(users[user], users[username])
            distances.append((distance, user))
    # sort based on distance -- closest first
    distances.sort()
    return distances


def recommend(username, users):
    """Give list of recommendations"""
    # first find nearest neighbor
    nearest = computeNearestNeighbor(username, users)[0][1]
    print(nearest)
    recommendations = []
    # now find bands neighbor rated that user didn't
    neighborRatings = users[nearest]
    userRatings = users[username]
    for artist in neighborRatings:
        if not artist in userRatings:
            recommendations.append((artist, neighborRatings[artist]))
    # using the fn sorted for variety - sort is more efficient
    return sorted(recommendations,
                  key=lambda artistTuple: artistTuple[1],
                  reverse=True)
```
# -*- coding: utf-8 -*-from math import sqrtusers = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},"Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}}#明氏距离def minkowski(rating1,rating2,r):distance=0commonRatings=Falsefor key in rating1:if key in rating2:distance += pow(abs(rating1[key]-rating2[key]),r)commonRatings=Truedistance = pow(distance,1.0/r)if commonRatings:return distanceelse:return -1 #Indicates no ratings in commondef computeNearestNeighbor(username, users):"""creates a sorted list of users based on their distance to username"""distances = []for user in users:if user != username:distance = minkowski(users[user], users[username],3)distances.append((distance, user))# sort based on distance -- closest firstdistances.sort()return distancesdef recommend(username, users):"""Give list of recommendations"""# first find nearest neighbornearest = computeNearestNeighbor(username, users)[0][1]print nearestrecommendations = []# now find bands neighbor rated that user didn'tneighborRatings = users[nearest]userRatings = users[username]for artist in neighborRatings:if not artist in userRatings:recommendations.append((artist, neighborRatings[artist]))# using the fn sorted for variety - sort is more efficientreturn sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)
However, plain distance measures run into a problem: two users may have identical tastes while one consistently rates everything a constant amount higher than the other (grade inflation).
Pearson correlation coefficient:
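The code below uses the single-pass approximation of the Pearson coefficient:

$$r=\frac{\sum_i x_i y_i-\frac{\sum_i x_i\sum_i y_i}{n}}{\sqrt{\sum_i x_i^2-\frac{(\sum_i x_i)^2}{n}}\ \sqrt{\sum_i y_i^2-\frac{(\sum_i y_i)^2}{n}}}$$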
# Pearson.pyfrom math import sqrtusers = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},"Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}}# 这里为了简单使用近似代替def pearson(rating1,rating2):sum_xy=0sum_x=0sum_y=0sum_x2=0sum_y2=0n=0for key in rating1:if key in rating2:n += 1x = rating1[key]y = rating2[key]sum_xy += x*ysum_x += xsum_y += ysum_x2 += x**2sum_y2 += y**2denominnator = sqrt(sum_x2-(sum_x**2)/n)*sqrt(sum_y2-(sum_y**2)/n)if denominnator == 0:return 0else:return (sum_xy-(sum_x*sum_y)/n)/denominnatordef cos_like(rating1,rating2):innerProd=0vector_x=0vectoy_y=0for key in rating1:if key in rating2:x=rating1[key]y=rating2[key]innerProd += x*yvector_x += x**2vectoy_y += y**2denominnator = sqrt(vector_x) * sqrt(vectoy_y)if denominnator == 0:return 0else:return innerProd / denominnator
Cosine similarity:
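Cosine similarity ignores the items two sparse rating vectors x and y have not both rated, and is computed as:

$$\cos(x,y)=\frac{x\cdot y}{\|x\|\times\|y\|}=\frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2}\ \sqrt{\sum_i y_i^2}}$$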
k-nearest neighbors: use the k most similar users to determine the recommendation; the best value of k depends on the application. The Pearson coefficient is used to determine each neighbor's influence factor.
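Assuming all k Pearson correlations are positive, the projected rating for user u on item i is the correlation-weighted average of the k neighbors' ratings, where $r(u,u_j)$ is the Pearson correlation between u and neighbor $u_j$ and $R_{u_j,i}$ is that neighbor's rating of i:

$$p(u,i)=\sum_{j=1}^{k}\frac{r(u,u_j)}{\sum_{l=1}^{k} r(u,u_l)}\;R_{u_j,i}$$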
# A dictionary of movie critics and their ratings of a small# set of moviescritics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,'The Night Listener': 3.0},'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,'You, Me and Dupree': 3.5},'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,'Superman Returns': 3.5, 'The Night Listener': 4.0},'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,'The Night Listener': 4.5, 'Superman Returns': 4.0,'You, Me and Dupree': 2.5},'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,'You, Me and Dupree': 2.0},'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}from math import sqrt# Returns a distance-based similarity score for person1 and person2def sim_distance(prefs,person1,person2):# Get the list of shared_itemssi={}for item in prefs[person1]:if item in prefs[person2]: si[item]=1# if they have no ratings in common, return 0if len(si)==0: return 0# Add up the squares of all the differencessum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)for item in prefs[person1] if item in prefs[person2]])return 1/(1+sum_of_squares)# Returns the Pearson correlation coefficient for p1 and p2def sim_pearson(prefs,p1,p2):# Get the list of mutually rated itemssi={}for item in prefs[p1]:if item in prefs[p2]: si[item]=1# if they are no ratings in common, return 0if len(si)==0: return 0# Sum calculationsn=len(si)# Sums of all the preferencessum1=sum([prefs[p1][it] for it in si])sum2=sum([prefs[p2][it] for it in si])# Sums of the squaressum1Sq=sum([pow(prefs[p1][it],2) for it in si])sum2Sq=sum([pow(prefs[p2][it],2) for it in si])# Sum of the productspSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])# Calculate r (Pearson score)num=pSum-(sum1*sum2/n)den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))if den==0: return 0r=num/denreturn r# Returns the best matches for person from the prefs dictionary.# Number of results and similarity function are optional params.def topMatches(prefs,person,n=5,similarity=sim_pearson):scores=[(similarity(prefs,person,other),other)for other in prefs if other!=person]scores.sort()scores.reverse()return scores[0:n]# Gets recommendations for a person by using a weighted average# of every other user's rankingsdef getRecommendations(prefs,person,similarity=sim_pearson):totals={}simSums={}for other in prefs:# don't compare me to myselfif other==person: continuesim=similarity(prefs,person,other)# ignore scores of zero or lowerif sim<=0: continuefor item in prefs[other]:# only score movies I haven't seen yetif item not in prefs[person] or prefs[person][item]==0:# Similarity * Scoretotals.setdefault(item,0)totals[item]+=prefs[other][item]*sim# Sum of similaritiessimSums.setdefault(item,0)simSums[item]+=sim# Create the normalized listrankings=[(total/simSums[item],item) for item,total in totals.items()]# Return the sorted listrankings.sort()rankings.reverse()return rankingsdef transformPrefs(prefs):result={}for person in prefs:for item in prefs[person]:result.setdefault(item,{})# Flip item and personresult[item][person]=prefs[person][item]return resultdef 
calculateSimilarItems(prefs,n=10):# Create a dictionary of items showing which other items they# are most similar to.result={}# Invert the preference matrix to be item-centricitemPrefs=transformPrefs(prefs)c=0for item in itemPrefs:# Status updates for large datasetsc+=1if c%100==0: print "%d / %d" % (c,len(itemPrefs))# Find the most similar items to this onescores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)result[item]=scoresreturn resultdef getRecommendedItems(prefs,itemMatch,user):userRatings=prefs[user]scores={}totalSim={}# Loop over items rated by this userfor (item,rating) in userRatings.items( ):# Loop over items similar to this onefor (similarity,item2) in itemMatch[item]:# Ignore if this user has already rated this itemif item2 in userRatings: continue# Weighted sum of rating times similarityscores.setdefault(item2,0)scores[item2]+=similarity*rating# Sum of all the similaritiestotalSim.setdefault(item2,0)totalSim[item2]+=similarity# Divide each total score by total weighting to get an averagerankings=[(score/totalSim[item],item) for item,score in scores.items( )]# Return the rankings from highest to lowestrankings.sort( )rankings.reverse( )return rankingsdef loadMovieLens(path='C:\Users\WangYixin\Desktop\PCI_Code\PCI_Code Folder\chapter2\data'):# Get movie titlesmovies={}for line in open(path+'/u.item'):(id,title)=line.split('|')[0:2]movies[id]=title# Load dataprefs={}for line in open(path+'/u.data'):(user,movieid,rating,ts)=line.split('\t')prefs.setdefault(user,{})prefs[user][movies[movieid]]=float(rating)return prefs
# -*- coding: utf-8 -*-# 推荐类import codecsfrom math import sqrtusers = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,"Norah Jones": 4.5, "Phoenix": 5.0,"Slightly Stoopid": 1.5,"The Strokes": 2.5, "Vampire Weekend": 2.0},"Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,"Deadmau5": 4.0, "Phoenix": 2.0,"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},"Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,"Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,"Slightly Stoopid": 1.0},"Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,"Deadmau5": 4.5, "Phoenix": 3.0,"Slightly Stoopid": 4.5, "The Strokes": 4.0,"Vampire Weekend": 2.0},"Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,"Norah Jones": 4.0, "The Strokes": 4.0,"Vampire Weekend": 1.0},"Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,"Norah Jones": 5.0, "Phoenix": 5.0,"Slightly Stoopid": 4.5, "The Strokes": 4.0,"Vampire Weekend": 4.0},"Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,"Norah Jones": 3.0, "Phoenix": 5.0,"Slightly Stoopid": 4.0, "The Strokes": 5.0},"Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,"Phoenix": 4.0, "Slightly Stoopid": 2.5,"The Strokes": 3.0}}class recommender:def __init__(self, data, k=1, metric='pearson', n=5):""" initialize recommendercurrently, if data is dictionary the recommender is initializedto it.For all other data types of data, no initialization occursk is the k value for k nearest neighbormetric is which distance formula to usen is the maximum number of recommendations to make"""self.k = kself.n = nself.username2id = {}self.userid2name = {}self.productid2name = {}# for some reason I want to save the name of the metricself.metric = metricif self.metric == 'pearson':self.fn = self.pearson## if data is dictionary set recommender data to it#if type(data).__name__ == 'dict':self.data = datadef convertProductID2name(self, id):"""Given product id number return product name"""if id in self.productid2name:return self.productid2name[id]else:return iddef userRatings(self, id, n):"""Return n top ratings for user with id"""print ("Ratings for " + self.userid2name[id])ratings = self.data[id]print(len(ratings))ratings = list(ratings.items())ratings = [(self.convertProductID2name(k), v)for (k, v) in ratings]# finally sort and returnratings.sort(key=lambda artistTuple: artistTuple[1],reverse = True)ratings = ratings[:n]for rating in ratings:print("%s\t%i" % (rating[0], rating[1]))def loadBookDB(self, path=''):"""loads the BX book dataset. 
Path is where the BX files arelocated"""self.data = {}i = 0## First load book ratings into self.data#f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')f.readline() #read the titlefor line in f:i += 1#separate line into fieldsfields = line.split(';') # still with ""user = fields[0].strip('"') #delete “ in the fieldsbook = fields[1].strip('"')rating = int(fields[2].strip().strip('"'))if user in self.data:currentRatings = self.data[user]else:currentRatings = {}currentRatings[book] = ratingself.data[user] = currentRatings#line = f.readline()f.close()## Now load books into self.productid2name# Books contains isbn, title, and author among other fields#f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')for line in f:i += 1#separate line into fieldsfields = line.split(';')isbn = fields[0].strip('"')title = fields[1].strip('"')author = fields[2].strip().strip('"')title = title + ' by ' + authorself.productid2name[isbn] = titlef.close()## Now load user info into both self.userid2name and# self.username2id#f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')for line in f:i += 1#print(line)#separate line into fieldsfields = line.split(';')userid = fields[0].strip('"')location = fields[1].strip('"')if len(fields) > 3:age = fields[2].strip().strip('"')else:age = 'NULL'if age != 'NULL':value = location + ' (age: ' + age + ')'else:value = locationself.userid2name[userid] = valueself.username2id[location] = useridf.close()print(i)def pearson(self, rating1, rating2):sum_xy = 0sum_x = 0sum_y = 0sum_x2 = 0sum_y2 = 0n = 0for key in rating1:if key in rating2:n += 1x = rating1[key]y = rating2[key]sum_xy += x * ysum_x += xsum_y += ysum_x2 += pow(x, 2)sum_y2 += pow(y, 2)if n == 0:return 0# now compute denominatordenominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)* sqrt(sum_y2 - pow(sum_y, 2) / n))if denominator == 0:return 0else:return (sum_xy - (sum_x * sum_y) / n) / denominatordef computeNearestNeighbor(self, username):"""creates a sorted list of users based on their distance tousername"""distances = []for instance in self.data:if instance != username:distance = self.fn(self.data[username],self.data[instance])distances.append((instance, distance))# sort based on distance -- closest firstdistances.sort(key=lambda artistTuple: artistTuple[1],reverse=True)return distancesdef recommend(self, user):"""Give list of recommendations"""recommendations = {}# first get list of users ordered by nearnessnearest = self.computeNearestNeighbor(user)# now get the ratings for the useruserRatings = self.data[user]# determine the total distancetotalDistance = 0.0for i in range(self.k):totalDistance += nearest[i][1]# now iterate through the k nearest neighbors# accumulating their ratingsfor i in range(self.k):# compute slice of pieweight = nearest[i][1] / totalDistance# get the name of the personname = nearest[i][0]# get the ratings for this personneighborRatings = self.data[name]# get the name of the person# now find bands neighbor rated that user didn'tfor artist in neighborRatings:if not artist in userRatings:if artist not in recommendations:recommendations[artist] = (neighborRatings[artist]* weight)else:recommendations[artist] = (recommendations[artist]+ neighborRatings[artist]* weight)# now make list from dictionaryrecommendations = list(recommendations.items())recommendations = [(self.convertProductID2name(k), v)for (k, v) in recommendations]# finally sort and returnrecommendations.sort(key=lambda artistTuple: artistTuple[1],reverse = True)# Return the first n itemsreturn recommendations[:self.n]############test 
code#############r = recommender(users)#r.recommend('Jordyn')#r.recommend('Hailey')#r.loadBookDB('BX-CSV-Dump/')#r.recommend('171118')#r.userRatings('171118', 5)
Explicit ratings: the user explicitly rates the item, e.g. YouTube's thumbs-up / thumbs-down buttons.
Implicit ratings: inferred from behavior, e.g. a site's click trail.
User-based (neighbor-based) recommenders require a huge number of computations, so they suffer from latency, and they also suffer from sparsity. They are also called memory-based collaborative filtering, because all the ratings must be kept around to produce a recommendation.
Item-based filtering: find the most similar items ahead of time, then combine those similarities with the user's ratings to generate recommendations. It is also called model-based collaborative filtering, because instead of keeping every rating around, a model representing item-to-item similarity is built beforehand.
To counteract grade inflation, the adjusted cosine similarity subtracts each user's average rating:
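Here U is the set of users who have rated both items i and j, and $\bar{R}_u$ is user u's average rating; this is what computeSimilarity below computes:

$$s(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_u)(R_{u,j}-\bar{R}_u)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_u)^2}\ \sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_u)^2}}$$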
users3 = {"David": {"Imagine Dragons": 3, "Daft Punk": 5,"Lorde": 4, "Fall Out Boy": 1},"Matt": {"Imagine Dragons": 3, "Daft Punk": 4,"Lorde": 4, "Fall Out Boy": 1},"Ben": {"Kacey Musgraves": 4, "Imagine Dragons": 3,"Lorde": 3, "Fall Out Boy": 1},"Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4,"Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1},"Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4,"Daft Punk": 5, "Fall Out Boy": 3}}def computeSimilarity(band1, band2, userRatings):averages = {}for (key, ratings) in userRatings.items():averages[key] = (float(sum(ratings.values()))/ len(ratings.values()))num = 0 # numeratordem1 = 0 # first half of denominatordem2 = 0for (user, ratings) in userRatings.items():if band1 in ratings and band2 in ratings:avg = averages[user]num += (ratings[band1] - avg) * (ratings[band2] - avg)dem1 += (ratings[band1] - avg)**2dem2 += (ratings[band2] - avg)**2return num / (sqrt(dem1) * sqrt(dem2))
Prediction from the similarity matrix:

$$p(u,i)=\frac{\sum_{N}s_{i,N}\,R_{u,N}}{\sum_{N}|s_{i,N}|}$$

Here N ranges over the items user u has rated that have a similarity score with item i, $s_{i,N}$ is the similarity between i and N, and $R_{u,N}$ is u's rating of N. $R_{u,N}$ should take values in [-1, 1], so ratings on another scale may first need a linear transformation such as

$$NR_{u,N}=\frac{2(R_{u,N}-Min_R)-(Max_R-Min_R)}{Max_R-Min_R}$$
The Slope One algorithm
Step 1: compute the deviations. The average deviation of item i with respect to item j is

$$dev_{i,j}=\sum_{u\in S_{i,j}(X)}\frac{u_i-u_j}{\mathrm{card}(S_{i,j}(X))}$$

where $S_{i,j}(X)$ is the set of users who have rated both i and j, and $u_i$ is user u's rating of item i.
```python
def computeDeviations(self):
    # for each person in the data, get their ratings
    for ratings in self.data.values():   # data: users2, ratings: {song: value, ...}
        # for each item & rating in that set of ratings:
        for (item, rating) in ratings.items():
            self.frequencies.setdefault(item, {})   # key is song
            self.deviations.setdefault(item, {})
            # for each item2 & rating2 in that set of ratings:
            for (item2, rating2) in ratings.items():
                if item != item2:
                    # add the difference between the ratings to our computation
                    self.frequencies[item].setdefault(item2, 0)
                    self.deviations[item].setdefault(item2, 0.0)
                    # frequencies holds card(S_ij)
                    self.frequencies[item][item2] += 1
                    # deviations accumulates the rating differences across users
                    self.deviations[item][item2] += rating - rating2
    for (item, ratings) in self.deviations.items():
        for item2 in ratings:
            ratings[item2] /= self.frequencies[item][item2]

# test code for computeDeviations(self)
# r = recommender(users2)
# r.computeDeviations()
# r.deviations
```
Step 2: weighted Slope One prediction. The predicted rating of item j for user u is

$$p^{wS1}(u)_j=\frac{\sum_{i\in S(u)-\{j\}}(dev_{j,i}+u_i)\,c_{j,i}}{\sum_{i\in S(u)-\{j\}}c_{j,i}}$$

where $c_{j,i}=\mathrm{card}(S_{j,i}(X))$ and S(u) is the set of items u has rated.
```python
def slopeOneRecommendations(self, userRatings):
    recommendations = {}
    frequencies = {}
    # for every item and rating in the user's ratings
    for (userItem, userRating) in userRatings.items():           # userItem: i
        # for every item in our dataset that the user didn't rate
        for (diffItem, diffRatings) in self.deviations.items():  # diffItem: j
            if diffItem not in userRatings and \
               userItem in self.deviations[diffItem]:
                freq = self.frequencies[diffItem][userItem]      # freq: c_ji
                # setdefault adds the key with the default value if it is missing
                recommendations.setdefault(diffItem, 0.0)
                frequencies.setdefault(diffItem, 0)
                # add to the running sum representing the numerator of the formula
                recommendations[diffItem] += (diffRatings[userItem] +
                                              userRating) * freq
                # keep a running sum of the frequency of diffItem
                frequencies[diffItem] += freq
    # build the list of p(u)_j values
    recommendations = [(self.convertProductID2name(k),
                        v / frequencies[k])
                       for (k, v) in recommendations.items()]
    # finally sort and return
    recommendations.sort(key=lambda artistTuple: artistTuple[1],
                         reverse=True)
    # I am only going to return the first 50 recommendations
    return recommendations[:50]

# test code for slopeOneRecommendations
# r = recommender(users2)
# r.computeDeviations()
# g = users2['Ben']
# r.slopeOneRecommendations(g)
```
def loadMovieLens(self, path=''):self.data = {}## first load movie ratings#i = 0## First load book ratings into self.data##f = codecs.open(path + "u.data", 'r', 'utf8')f = codecs.open(path + "u.data", 'r', 'ascii')# f = open(path + "u.data")for line in f:i += 1#separate line into fieldsfields = line.split('\t')user = fields[0]movie = fields[1]rating = int(fields[2].strip().strip('"'))if user in self.data:currentRatings = self.data[user]else:currentRatings = {}currentRatings[movie] = ratingself.data[user] = currentRatingsf.close()## Now load movie into self.productid2name# the file u.item contains movie id, title, release date among# other fields##f = codecs.open(path + "u.item", 'r', 'utf8')f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore')#f = open(path + "u.item")for line in f:i += 1#separate line into fieldsfields = line.split('|')mid = fields[0].strip()title = fields[1].strip()self.productid2name[mid] = titlef.close()## Now load user info into both self.userid2name# and self.username2id##f = codecs.open(path + "u.user", 'r', 'utf8')f = open(path + "u.user")for line in f:i += 1fields = line.split('|')userid = fields[0].strip('"')self.userid2name[userid] = lineself.username2id[line] = useridf.close()print(i)# test code#r = recommender(0)#r.loadMovieLens('ml-100k/')#r.computeDeviations()#r.slopeOneRecommendations(r.data['1'])#r.slopeOneRecommendations(r.data['25'])
When a new item is added, it has no ratings yet and so will never be recommended. Pandora instead builds on a project called the Music Genome, which describes items by their attributes.
When the data mining method computes the distance between two objects from their feature values and the features are on different scales, the features must be normalized. Normalization usually uses the mean and standard deviation, but those are sensitive to outliers, so an improved normalization (the Modified Standard Score) is used instead: the mean is replaced by the median $\tilde{x}$ and the standard deviation by the absolute standard deviation (asd):

$$asd=\frac{1}{\mathrm{card}(x)}\sum_i |x_i-\tilde{x}|,\qquad mss_i=\frac{x_i-\tilde{x}}{asd}$$
```python
# compute the median and the absolute standard deviation
def getMedian(self, alist):
    """return median of alist"""
    if alist == []:
        return []
    blist = sorted(alist)
    length = len(alist)
    if length % 2 == 1:
        # length of list is odd so return middle element
        return blist[int(((length + 1) / 2) - 1)]
    else:
        # length of list is even so compute midpoint
        v1 = blist[int(length / 2)]
        v2 = blist[(int(length / 2) - 1)]
        return (v1 + v2) / 2.0

def getAbsoluteStandardDeviation(self, alist, median):
    """given alist and median return absolute standard deviation"""
    sum = 0
    for item in alist:
        sum += abs(item - median)
    return sum / len(alist)

def unitTest():
    list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54]
    list2 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 68]
    list3 = [69]
    list4 = [69, 72]
    classifier = Classifier('data/athletesTrainingSet.txt')
    m1 = classifier.getMedian(list1)
    m2 = classifier.getMedian(list2)
    m3 = classifier.getMedian(list3)
    m4 = classifier.getMedian(list4)
    asd1 = classifier.getAbsoluteStandardDeviation(list1, m1)
    asd2 = classifier.getAbsoluteStandardDeviation(list2, m2)
    asd3 = classifier.getAbsoluteStandardDeviation(list3, m3)
    asd4 = classifier.getAbsoluteStandardDeviation(list4, m4)
    assert(round(m1, 3) == 65)
    assert(round(m2, 3) == 66)
    assert(round(m3, 3) == 69)
    assert(round(m4, 3) == 70.5)
    assert(round(asd1, 3) == 8)
    assert(round(asd2, 3) == 7.5)
    assert(round(asd3, 3) == 0)
    assert(round(asd4, 3) == 1.5)
    print("getMedian and getAbsoluteStandardDeviation work correctly")
```
Using assert statements to test software components is a common technique. It is important that every part of a product be split into a piece of implementation code plus tests for that implementation.
```python
# normalization
def normalizeColumn(self, columnNumber):
    """given a column number, normalize that column in self.data"""
    # first extract the values into a list; each v is a (class, vector) pair
    col = [v[1][columnNumber] for v in self.data]
    median = self.getMedian(col)
    asd = self.getAbsoluteStandardDeviation(col, median)
    # print("Median: %f ASD = %f" % (median, asd))
    self.medianAndDeviation.append((median, asd))
    for v in self.data:
        v[1][columnNumber] = (v[1][columnNumber] - median) / asd
```
10-fold cross validation: split the dataset into 10 buckets, train on 9 of them and test on the remaining one, then repeat the process 10 times.
Leave-one-out (n-fold cross validation): its result is deterministic rather than dependent on a random partition of the data, but its drawback is the large computational cost. When dividing the data into buckets, stratified sampling that keeps each class evenly represented is important.
Confusion matrix: each row is the true class of the test samples and each column is the class predicted by the classifier; it reveals how the classifier performs.
```python
# divide data into 10 buckets
import random

def buckets(filename, bucketName, separator, classColumn):
    """the original data is in the file named filename
    bucketName is the prefix for all the bucket names
    separator is the character that divides the columns
    (for ex., a tab or comma) and classColumn is the column
    that indicates the class"""
    # put the data in 10 buckets
    numberOfBuckets = 10
    data = {}
    # first read in the data and divide by category
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        if separator != '\t':
            line = line.replace(separator, '\t')
        # first get the category
        category = line.split()[classColumn]
        data.setdefault(category, [])   # set the value for dict data
        data[category].append(line)     # all the information
    # initialize the buckets  [[], [], ...]
    buckets = []
    for i in range(numberOfBuckets):
        buckets.append([])
    # now for each category put the data into the buckets
    for k in data.keys():
        # randomize order of instances for each class
        # data[k] is a list of lines
        random.shuffle(data[k])
        bNum = 0
        # divide into buckets
        for item in data[k]:
            buckets[bNum].append(item)
            bNum = (bNum + 1) % numberOfBuckets
    # write to file
    for bNum in range(numberOfBuckets):
        f = open("%s-%02i" % ('tmp/' + bucketName, bNum + 1), 'w')
        for item in buckets[bNum]:
            f.write(item)
        f.close()

# example of how to use this code
buckets("data/mpgData.txt", 'mpgData', ',', 0)
```
Classifier evaluation: the Kappa statistic measures how well a classifier performs relative to a random classifier.
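Kappa compares the classifier's accuracy P(c) with the accuracy P(r) a random classifier would achieve on the same class distribution:

$$\kappa=\frac{P(c)-P(r)}{1-P(r)}$$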
| Kappa range | Performance |
|---|---|
| <0 | worse than random |
| 0.01-0.2 | slight agreement |
| 0.21-0.4 | fair agreement |
| 0.41-0.6 | moderate agreement |
| 0.61-0.8 | substantial agreement |
| 0.81-1 | near-perfect |
kNN: when a sample is an outlier, relying on the single nearest neighbor can cause misclassification. The fix is to examine k neighbors; the closer a neighbor is, the larger its influence should be, and the influence factor can be expressed as the inverse of the distance.
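One common way to turn distances into influence factors is to take the inverse of each distance and normalize the weights so they sum to 1 (note that the knn code below uses plain majority voting among the k neighbors rather than these weights):

$$w_i=\frac{1/d_i}{\sum_{j=1}^{k} 1/d_j}$$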
```python
def knn(self, itemVector):
    """returns the predicted class of itemVector using k Nearest Neighbors"""
    # changed from min to heapq.nsmallest to get the k closest neighbors
    neighbors = heapq.nsmallest(self.k,
                                [(self.manhattan(itemVector, item[1]), item)
                                 for item in self.data])
    # each neighbor gets a vote
    results = {}
    for neighbor in neighbors:
        theClass = neighbor[1][0]
        results.setdefault(theClass, 0)
        results[theClass] += 1
    resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)
    # get all the classes that have the maximum votes
    maxVotes = resultList[0][0]
    possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]
    # randomly select one of the classes that received the max votes
    answer = random.choice(possibleAnswers)
    return(answer)
```
In engineering practice, the more data you have, the better the algorithms perform, so adding data tends to matter more than tweaking the algorithm. For a research paper, though, you still need to devise an algorithm that yields some performance improvement.
Characteristic of Bayesian methods: they classify and also give the probability of the classification.
Prior probability: P(h)
Posterior (conditional) probability: P(h|d)
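Bayes' theorem relates the two; because P(d) is the same for every hypothesis, the maximum a posteriori (MAP) hypothesis only needs the numerator:

$$P(h\mid d)=\frac{P(d\mid h)\,P(h)}{P(d)},\qquad h_{MAP}=\arg\max_{h\in H} P(d\mid h)\,P(h)$$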
# 训练class Classifier:def __init__(self, bucketPrefix, testBucketNumber, dataFormat):""" a classifier will be built from files with the bucketPrefixexcluding the file with textBucketNumber. dataFormat is a string thatdescribes how to interpret each line of the data files. For example,for the iHealth data the format is:"attr attr attr attr class""""total = 0classes = {}counts = {}# reading the data in from the fileself.format = dataFormat.strip().split('\t')self.prior = {}self.conditional = {}# for each of the buckets numbered 1 through 10:for i in range(1, 11):# if it is not the bucket we should ignore, read in the dataif i != testBucketNumber:filename = "%s-%02i" % (bucketPrefix, i)f = open(filename)lines = f.readlines()f.close()for line in lines:fields = line.strip().split('\t')ignore = []vector = []for i in range(len(fields)):if self.format[i] == 'num':vector.append(float(fields[i])) #vector!!elif self.format[i] == 'attr':vector.append(fields[i])elif self.format[i] == 'comment':ignore.append(fields[i])elif self.format[i] == 'class':category = fields[i]# now process this instancetotal += 1classes.setdefault(category, 0) #字典:分类类别计数counts.setdefault(category, {}) #复合字典:每类的每列的具体计数classes[category] += 1# now process each attribute of the instancecol = 0for columnValue in vector:col += 1counts[category].setdefault(col, {})counts[category][col].setdefault(columnValue, 0)counts[category][col][columnValue] += 1# ok done counting. now compute probabilities# first prior probabilities p(h)for (category, count) in classes.items():self.prior[category] = count / total#字典:先验概率# now compute conditional probabilities p(D|h)for (category, columns) in counts.items():self.conditional.setdefault(category, {})for (col, valueCounts) in columns.items():self.conditional[category].setdefault(col, {})for (attrValue, count) in valueCounts.items():self.conditional[category][col][attrValue] = (count / classes[category]) #复合字典:每类的每个属性的条件概率self.tmp = counts #应该暂时没有用
```python
# classification
def classify(self, itemVector):
    """Return class we think item Vector is in"""
    results = []
    for (category, prior) in self.prior.items():
        prob = prior
        col = 1
        for attrValue in itemVector:
            if not attrValue in self.conditional[category][col]:
                # we did not find any instances of this attribute value
                # occurring with this category so prob = 0
                prob = 0
            else:
                prob = prob * self.conditional[category][col][attrValue]
            col += 1
        results.append((prob, category))
    # return the category with the highest probability
    return(max(results)[1])
```
```python
# test code
c = Classifier("iHealth/i", 10,
               "attr\tattr\tattr\tattr\tclass")
print(c.classify(['health', 'moderate', 'moderate', 'yes']))
```
Problem: when any single probability is 0 it dominates the entire Bayesian computation, even if the conditional probabilities of all the other independent events are close to 1. Moreover, probabilities estimated from a small sample tend to be poor, biased-low estimates of the true probabilities.
Improvement: replace the raw estimate $P(x\mid y)=n_c/n$ with the m-estimate

$$P(x\mid y)=\frac{n_c+mp}{n+m}$$

where n is the number of training instances of class y, $n_c$ the number of those having value x, m the equivalent sample size, and p the prior estimate of the probability (typically 1 divided by the number of possible values of the attribute).
When the data is continuous there are two options: discretize the values into categories, or assume the values follow a Gaussian distribution and compute the probability from its density.
Sample standard deviation:

$$sd=\sqrt{\frac{\sum_i(x_i-\bar{x})^2}{n-1}}$$
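The pdf function below evaluates the Gaussian density using the mean and sample standard deviation of attribute i within class $y_j$:

$$P(x_i\mid y_j)=\frac{1}{\sqrt{2\pi}\,\sigma_{ij}}\,e^{-\frac{(x_i-\mu_{ij})^2}{2\sigma_{ij}^2}}$$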
# pdf计算实现def pdf(mean, ssd, x):"""Probability Density Function computing P(x|y)input is the mean, sample standard deviation for all the items in y,and x."""ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))print (ePart)return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart````<div class="md-section-divider"></div>```python# 连续数据的训练class Classifier:def __init__(self, bucketPrefix, testBucketNumber, dataFormat):""" a classifier will be built from files with the bucketPrefixexcluding the file with textBucketNumber. dataFormat is a string thatdescribes how to interpret each line of the data files. For example,for the iHealth data the format is:"attr attr attr attr class""""total = 0classes = {}# counts used for attributes that are not numericcounts = {}# totals used for attributes that are numereric# we will use these to compute the mean and sample standard deviation for# each attribute - class pair.totals = {}numericValues = {}# reading the data in from the fileself.format = dataFormat.strip().split('\t')#self.prior = {}self.conditional = {}# for each of the buckets numbered 1 through 10:for i in range(1, 11):# if it is not the bucket we should ignore, read in the dataif i != testBucketNumber:filename = "%s-%02i" % (bucketPrefix, i)f = open(filename)lines = f.readlines()f.close()for line in lines:fields = line.strip().split('\t')ignore = []vector = []nums = []for i in range(len(fields)):if self.format[i] == 'num':nums.append(float(fields[i]))elif self.format[i] == 'attr':vector.append(fields[i])elif self.format[i] == 'comment':ignore.append(fields[i])elif self.format[i] == 'class':category = fields[i]# now process this instancetotal += 1classes.setdefault(category, 0)counts.setdefault(category, {})totals.setdefault(category, {})numericValues.setdefault(category, {})classes[category] += 1# now process each non-numeric attribute of the instancecol = 0for columnValue in vector:col += 1counts[category].setdefault(col, {})counts[category][col].setdefault(columnValue, 0)counts[category][col][columnValue] += 1# process numeric attributescol = 0for columnValue in nums:col += 1totals[category].setdefault(col, 0)#totals[category][col].setdefault(columnValue, 0)totals[category][col] += columnValuenumericValues[category].setdefault(col, [])numericValues[category][col].append(columnValue)## ok done counting. now compute probabilities## first prior probabilities p(h)#for (category, count) in classes.items():self.prior[category] = count / total## now compute conditional probabilities p(h|D)#for (category, columns) in counts.items():self.conditional.setdefault(category, {})for (col, valueCounts) in columns.items():self.conditional[category].setdefault(col, {})for (attrValue, count) in valueCounts.items():self.conditional[category][col][attrValue] = (count / classes[category])self.tmp = counts## now compute mean and sample standard deviation#self.means = {}self.totals = totalsfor (category, columns) in totals.items():self.means.setdefault(category, {})for (col, cTotal) in columns.items():self.means[category][col] = cTotal / classes[category]# standard deviationself.ssd = {}for (category, columns) in numericValues.items():self.ssd.setdefault(category, {})for (col, values) in columns.items():SumOfSquareDifferences = 0theMean = self.means[category][col]for value in values:SumOfSquareDifferences += (value - theMean)**2columns[col] = 0self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1))
# 连续数据的分类def classify(self, itemVector, numVector):"""Return class we think item Vector is in"""results = []sqrt2pi = math.sqrt(2 * math.pi)for (category, prior) in self.prior.items():prob = priorcol = 1for attrValue in itemVector:if not attrValue in self.conditional[category][col]:# we did not find any instances of this attribute value# occurring with this category so prob = 0prob = 0else:prob = prob * self.conditional[category][col][attrValue]col += 1col = 1for x in numVector:mean = self.means[category][col]ssd = self.ssd[category][col]ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)col += 1results.append((prob, category))# return the category with the highest probability#print(results)return(max(results)[1])
Comparison of Bayes and kNN
In many real data mining problems the attributes are not independent, although sometimes independence can be assumed. The method is called naive Bayes precisely because it assumes the attributes are independent even though we know that assumption does not hold.
Training phase:
1. Merge the documents labeled with the same hypothesis (category) into a single text file.
2. Count the number of word occurrences n in that file, building a vocabulary.
3. For each word $w_k$ in the vocabulary, count its occurrences in the text, denoted $n_k$.
4. For each word $w_k$ in the vocabulary (with stop words removed), compute
$$P(w_k\mid h)=\frac{n_k+1}{n+|Vocabulary|}$$
class BayesText:def __init__(self, trainingdir, stopwordlist):"""This class implements a naive Bayes approach to textclassificationtrainingdir is the training data. Each subdirectory oftrainingdir is titled with the name of the classificationcategory -- those subdirectories in turn contain the textfiles for that category.The stopwordlist is a list of words (one per line) will beremoved before any counting takes place."""self.vocabulary = {}self.prob = {}self.totals = {}self.stopwords = {} #停用词字典f = open(stopwordlist)for line in f:self.stopwords[line.strip()] = 1f.close()categories = os.listdir(trainingdir)#filter out files that are not directoriesself.categories = [filename for filename in categoriesif os.path.isdir(trainingdir + filename)]print("Counting ...")for category in self.categories:print(' ' + category)# 计算当前类别的单词和单词数量,单词的总量(self.prob[category],self.totals[category]) = self.train(trainingdir, category)# I am going to eliminate any word in the 所有种类的单词库vocabulary# that doesn't occur at least 3 timestoDelete = []for word in self.vocabulary:if self.vocabulary[word] < 3:# mark word for deletion# can't delete now because you can't delete# from a list you are currently iterating overtoDelete.append(word)# now deletefor word in toDelete:del self.vocabulary[word]# now compute probabilitiesvocabLength = len(self.vocabulary)print("Computing probabilities:")for category in self.categories:print(' ' + category)denominator = self.totals[category] + vocabLengthfor word in self.vocabulary:if word in self.prob[category]:count = self.prob[category][word]else:count = 1# 条件概率计算self.prob[category][word] = (float(count + 1)/ denominator)print ("DONE TRAINING\n\n")# input:trainingdir训练文件的目录, category训练文件的种类# return: (counts, total) (当前文件的单词和单词数量,所有单词的数量)def train(self, trainingdir, category):"""counts word occurrences for a particular category"""currentdir = trainingdir + categoryfiles = os.listdir(currentdir)counts = {}total = 0for file in files:#print(currentdir + '/' + file)f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')for line in f:tokens = line.split()for token in tokens:# get rid of punctuation and lowercase tokentoken = token.strip('\'".,?:-')token = token.lower()if token != '' and not token in self.stopwords:self.vocabulary.setdefault(token, 0)self.vocabulary[token] += 1#所有文档的单词和单词数量counts.setdefault(token, 0)counts[token] += 1#当前文件的单词和单词数量total += 1#所有单词的数量f.close()return(counts, total)# test codebT = BayesText(trainingDir, stoplistfile)bT.prob['rec.motorcycles']["god"]
Classification phase: choose the hypothesis that maximizes the product of the word probabilities (with equal-sized categories the prior P(h) can be dropped, and the code sums logarithms to avoid underflow):

$$h_{MAP}=\arg\max_{h\in H}P(h)\prod_{w_i\in doc}P(w_i\mid h)$$

Stop words: when the stop words are mere noise, removing them reduces the amount of processing and improves performance. In particular cases the stop-word list should be reconsidered; for example, sex offenders use words like "me" and "you" far more often than the general population.
def classify(self, itemVector, numVector):"""Return class we think item Vector is in"""results = []sqrt2pi = math.sqrt(2 * math.pi)for (category, prior) in self.prior.items():prob = priorcol = 1for attrValue in itemVector:if not attrValue in self.conditional[category][col]:# we did not find any instances of this attribute value# occurring with this category so prob = 0prob = 0else:prob = prob * self.conditional[category][col][attrValue]col += 1col = 1for x in numVector:mean = self.means[category][col]ssd = self.ssd[category][col]ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)col += 1results.append((prob, category))# return the category with the highest probability#print(results)return(max(results)[1])# test codebT.classify(testDir+ 'rec.motorcycles/104673')
10-fold cross validation of the text classifier:
from __future__ import print_functionimport os, codecs, mathclass BayesText:# input:训练文件目录,停用词,忽略的文件子集def __init__(self, trainingdir, stopwordlist, ignoreBucket):"""This class implements a naive Bayes approach to textclassificationtrainingdir is the training data. Each subdirectory oftrainingdir is titled with the name of the classificationcategory -- those subdirectories in turn contain the textfiles for that category.The stopwordlist is a list of words (one per line) will beremoved before any counting takes place."""self.vocabulary = {}self.prob = {}self.totals = {}self.stopwords = {}f = open(stopwordlist)for line in f:self.stopwords[line.strip()] = 1f.close()categories = os.listdir(trainingdir)#filter out files that are not directories,in this program, neg and posself.categories = [filename for filename in categoriesif os.path.isdir(trainingdir + filename)]print("Counting ...")for category in self.categories:#print(' ' + category)(self.prob[category],self.totals[category]) = self.train(trainingdir, category,ignoreBucket)# I am going to eliminate any word in the vocabulary# that doesn't occur at least 3 timestoDelete = []for word in self.vocabulary:if self.vocabulary[word] < 3:# mark word for deletion# can't delete now because you can't delete# from a list you are currently iterating overtoDelete.append(word)# now deletefor word in toDelete:del self.vocabulary[word]# now compute probabilitiesvocabLength = len(self.vocabulary)#print("Computing probabilities:")for category in self.categories:#print(' ' + category)denominator = self.totals[category] + vocabLengthfor word in self.vocabulary:if word in self.prob[category]:count = self.prob[category][word]else:count = 1self.prob[category][word] = (float(count + 1)/ denominator)#print ("DONE TRAINING\n\n")def train(self, trainingdir, category, bucketNumberToIgnore):"""counts word occurrences for a particular category"""ignore = "%i" % bucketNumberToIgnorecurrentdir = trainingdir + categorydirectories = os.listdir(currentdir)counts = {}total = 0for directory in directories:if directory != ignore:currentBucket = trainingdir + category + "/" + directoryfiles = os.listdir(currentBucket)#print(" " + currentBucket)for file in files:f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')for line in f:tokens = line.split()for token in tokens:# get rid of punctuation and lowercase tokentoken = token.strip('\'".,?:-')token = token.lower()if token != '' and not token in self.stopwords:self.vocabulary.setdefault(token, 0)self.vocabulary[token] += 1counts.setdefault(token, 0)counts[token] += 1total += 1f.close()return(counts, total)def classify(self, filename):results = {}for category in self.categories:results[category] = 0f = codecs.open(filename, 'r', 'iso8859-1')for line in f:tokens = line.split()for token in tokens:#print(token)token = token.strip('\'".,?:-').lower()if token in self.vocabulary:for category in self.categories:if self.prob[category][token] == 0:print("%s %s" % (category, token))results[category] += math.log(self.prob[category][token])f.close()results = list(results.items())results.sort(key=lambda tuple: tuple[1], reverse = True)# for debugging I can change this to give me the entire listreturn results[0][0]# input: 测试文件的分类目录,当前类别, 忽略子集# return: 当前类别下的分类结果{0:12,1:23}def testCategory(self, direc, category, bucketNumber):results = {}directory = direc + ("%i/" % bucketNumber)#print("Testing " + directory)files = os.listdir(directory)total = 0#correct = 0for file in files:total += 1result = self.classify(directory + 
file)results.setdefault(result, 0)results[result] += 1#if result == category:# correct += 1return results# input: 测试文件目录, 忽略的子集文件# return: 所有类别的分类结果{1:{0:12,1:23},}def test(self, testdir, bucketNumber):"""Test all files in the test directory--that directory isorganized into subdirectories--each subdir is a classificationcategory"""results = {}categories = os.listdir(testdir)#filter out files that are not directoriescategories = [filename for filename in categories ifos.path.isdir(testdir + filename)]for category in categories:#print(".", end="")results[category] = self.testCategory(testdir + category + '/', category, bucketNumber)return resultsdef tenfold(dataPrefix, stoplist):results = {}for i in range(0,10):bT = BayesText(dataPrefix, stoplist, i)r = bT.test(theDir, i)for (key, value) in r.items():results.setdefault(key, {})for (ckey, cvalue) in value.items():results[key].setdefault(ckey, 0)results[key][ckey] += cvaluecategories = list(results.keys())categories.sort()print( "\n Classified as: ")header = " "subheader = " +"for category in categories:header += "% 2s " % categorysubheader += "-----+"print (header)print (subheader)total = 0.0correct = 0.0for category in categories:row = " %s |" % categoryfor c2 in categories:if c2 in results[category]:count = results[category][c2]else:count = 0row += " %3i |" % counttotal += countif c2 == category:correct += countprint(row)print(subheader)print("\n%5.3f percent correct" %((correct * 100) / total))print("total of %i instances" % total)# change these to match your directory structureprefixPath = "data/review_polarity/"theDir = prefixPath + "/txt_sentoken/"stoplistfile = prefixPath + "stopwords25.txt"tenfold(theDir, stoplistfile)
Hierarchical clustering: at each iteration merge the two most similar clusters, repeating until only one cluster remains. Different ways of measuring the distance between two clusters (e.g. single-linkage vs. complete-linkage) give different clustering methods and different merge sequences.
Regular queue: first in, first out.
Priority queue: elements are removed in an order based on their associated weight; the smaller the value, the earlier it is removed.
```python
# retrieve the smallest items from a priority queue
from queue import PriorityQueue   # the module is named "Queue" in Python 2

singersQueue = PriorityQueue()
singersQueue.put((16, 'Suzaka'))
singersQueue.put((14, 'Yui'))
singersQueue.put((15, 'Moa'))
print(singersQueue.get())
print(singersQueue.get())
print(singersQueue.get())
```
class hClusterer:""" this clusterer assumes that the first column of the data is a labelnot used in the clustering. The other columns contain numeric data"""def __init__(self, filename):file = open(filename)self.data = {}self.counter = 0self.queue = PriorityQueue()lines = file.readlines()file.close()header = lines[0].split(',')self.cols = len(header)self.data = [[] for i in range(len(header))]for line in lines[1:]:cells = line.split(',')toggle = 0# self.data [['Border Collie',...], [20.0,...], [45.0,...]]for cell in range(self.cols):if toggle == 0:self.data[cell].append(cells[cell])toggle = 1else:self.data[cell].append(float(cells[cell]))# now normalize number columns (that is, skip the first column)for i in range(1, self.cols):self.data[i] = normalizeColumn(self.data[i])###### I have read in the data and normalized the### columns. Now for each element i in the data, I am going to### 1. compute the Euclidean Distance from element i to all the### other elements. This data will be placed in neighbors,### which is a Python dictionary. Let's say i = 1, and I am### computing the distance to the neighbor j and let's say j### is 2. The neighbors dictionary for i will look like### {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }###### 2. find the closest neighbor###### 3. place the element on a priority queue, called simply queue,### based on the distance to the nearest neighbor (and a counter### used to break ties.# now push distances on queuerows = len(self.data[0])for i in range(rows):minDistance = 99999nearestNeighbor = 0neighbors = {} #存储所有邻居的对和距离for j in range(rows):if i != j:dist = self.distance(i, j)if i < j:pair = (i,j)else:pair = (j,i)neighbors[j] = (pair, dist)if dist < minDistance:minDistance = distnearestNeighbor = j# nearestNum = j# create nearest Pairif i < nearestNeighbor:nearestPair = (i, nearestNeighbor)else:nearestPair = (nearestNeighbor, i)# put instance on priority queueself.queue.put((minDistance, self.counter,[[self.data[0][i]], nearestPair, neighbors]))self.counter += 1 #存储对象的数量,先后顺序def distance(self, i, j):sumSquares = 0for k in range(1, self.cols):sumSquares += (self.data[k][i] - self.data[k][j])**2return math.sqrt(sumSquares)def cluster(self):done = Falsewhile not done:# 依据 minDistance 取出当前最小距离的对象,应该是两对距离。topOne = self.queue.get()nearestPair = topOne[2][1]if not self.queue.empty():nextOne = self.queue.get()nearPair = nextOne[2][1] #当等距离时,不一定一样的tmp = []#### I have just popped two elements off the queue,## topOne and nextOne. I need to check whether nextOne## is topOne's nearest neighbor and vice versa.## If not, I will pop another element off the queue## until I find topOne's nearest neighbor. That is what## this while loop does.### 处理等距离的情况while nearPair != nearestPair:tmp.append((nextOne[0], self.counter, nextOne[2]))self.counter += 1nextOne = self.queue.get()nearPair = nextOne[2][1]#### this for loop pushes the elements I popped off in the## above while loop.## 放回等距离不配对的对象for item in tmp:self.queue.put(item)if len(topOne[2][0]) == 1:item1 = topOne[2][0][0]else:item1 = topOne[2][0]if len(nextOne[2][0]) == 1:item2 = nextOne[2][0][0]else:item2 = nextOne[2][0]## curCluster is, perhaps obviously, the new cluster## which combines cluster item1 with cluster item2.curCluster = (item1, item2)## Now I am doing two things. First, finding the nearest## neighbor to this new cluster. Second, building a new## neighbors list by merging the neighbors lists of item1## and item2. 
If the distance between item1 and element 23## is 2 and the distance betweeen item2 and element 23 is 4## the distance between element 23 and the new cluster will## be 2 (i.e., the shortest distance).##minDistance = 99999nearestPair = ()#nearestNeighbor = ''merged = {}nNeighbors = nextOne[2][2] #所有的邻居对和距离for (key, value) in topOne[2][2].items():if key in nNeighbors: #只剩下最后一个的时候,不成立if nNeighbors[key][1] < value[1]:dist = nNeighbors[key]else:dist = valueif dist[1] < minDistance:minDistance = dist[1]nearestPair = dist[0] #这里的对比较特别,只去了其中一个点#nearestNeighbor = keymerged[key] = distif merged == {}:return curCluster#返回元祖对,用来建立树else:self.queue.put( (minDistance, self.counter,[curCluster, nearestPair, merged]))self.counter += 1
KMeans
Like hill climbing, the algorithm is not guaranteed to find the optimal partition into clusters. Because it starts from a randomly chosen set of centroids, it is only guaranteed to find a locally optimal partition.
Sum of squared error (SSE): a measure of the quality of a set of clusters.
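For k clusters with centroids $c_i$, SSE sums each point's squared distance to its own centroid; smaller is better:

$$SSE=\sum_{i=1}^{k}\sum_{x\in C_i}\mathrm{dist}(c_i,x)^2$$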
import mathimport random"""Implementation of the K-means algorithmfor the book A Programmer's Guide to Data Mining"http://www.guidetodatamining.com"""def getMedian(alist):"""get median of list"""tmp = list(alist)tmp.sort()alen = len(tmp)if (alen % 2) == 1:return tmp[alen // 2]else:return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2def normalizeColumn(column):"""normalize the values of a column using Modified Standard Scorethat is (each value - median) / (absolute standard deviation)"""median = getMedian(column)asd = sum([abs(x - median) for x in column]) / len(column)result = [(x - median) / asd for x in column]return resultclass kClusterer:""" Implementation of kMeans ClusteringThis clusterer assumes that the first column of the data is a labelnot used in the clustering. The other columns contain numeric data"""def __init__(self, filename, k):""" k is the number of clusters to makeThis init method:1. reads the data from the file named filename2. stores that data by column in self.data3. normalizes the data using Modified Standard Score4. randomly selects the initial centroids5. assigns points to clusters associated with those centroids"""file = open(filename)self.data = {}self.k = kself.counter = 0self.iterationNumber = 0# used to keep track of % of points that change cluster membership# in an iterationself.pointsChanged = 0# Sum of Squared Errorself.sse = 0## read data from file#lines = file.readlines()file.close()header = lines[0].split(',')self.cols = len(header)self.data = [[] for i in range(len(header))]# we are storing the data by column.# For example, self.data[0] is the data from column 0.# self.data[0][10] is the column 0 value of item 10.for line in lines[1:]:cells = line.split(',')toggle = 0for cell in range(self.cols):if toggle == 0:self.data[cell].append(cells[cell])toggle = 1else:self.data[cell].append(float(cells[cell]))self.datasize = len(self.data[1])self.memberOf = [-1 for x in range(len(self.data[1]))]## now normalize number columns#for i in range(1, self.cols):self.data[i] = normalizeColumn(self.data[i])# select random centroids from existing pointsrandom.seed()#sample(seq, n) 从序列seq中选择n个随机且独立的元素self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]for r in random.sample(range(len(self.data[0])),self.k)]self.assignPointsToCluster()def updateCentroids(self):"""Using the points in the clusters, determine the centroid(mean point) of each cluster"""members = [self.memberOf.count(i) for i in range(len(self.centroids))]#计数列表#对centroid类的数据的第k列数据 第i个中心点求和,计算平均self.centroids = [[sum([self.data[k][i]for i in range(len(self.data[0]))if self.memberOf[i] == centroid])/members[centroid]for k in range(1, len(self.data))]for centroid in range(len(self.centroids))]def assignPointToCluster(self, i):""" assign point to cluster based on distance from centroids"""min = 999999clusterNum = -1for centroid in range(self.k):dist = self.euclideanDistance(i, centroid)if dist < min:min = distclusterNum = centroid# here is where I will keep track of changing pointsif clusterNum != self.memberOf[i]: #第一次认为全部变动self.pointsChanged += 1# add square of distance to running sum of squared errorself.sse += min**2return clusterNumdef assignPointsToCluster(self):""" assign each data point to a cluster"""self.pointsChanged = 0self.sse = 0self.memberOf = [self.assignPointToCluster(i)for i in range(len(self.data[1]))]def euclideanDistance(self, i, j):""" compute distance of point i from centroid j"""sumSquares = 0for k in range(1, self.cols):sumSquares += (self.data[k][i] - 
self.centroids[j][k-1])**2return math.sqrt(sumSquares)def kCluster(self):"""the method that actually performs the clusteringAs you can see this method repeatedlyupdates the centroids by computing the mean point of each clusterre-assign the points to clusters based on these new centroidsuntil the number of points that change cluster membership is less than 1%."""done = Falsewhile not done:self.iterationNumber += 1self.updateCentroids()self.assignPointsToCluster()## we are done if fewer than 1% of the points change clusters#if float(self.pointsChanged) / len(self.memberOf) < 0.01:done = Trueprint("Final SSE: %f" % self.sse)def showMembers(self):"""Display the results"""for centroid in range(len(self.centroids)):print ("\n\nClass %i\n========" % centroid)for name in [self.data[0][i] for i in range(len(self.data[0]))if self.memberOf[i] == centroid]:print (name)#### RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3#### change the path in the following to match where dogs.csv is on your machinekm = kClusterer('data/dogs.csv', 3)km.kCluster()km.showMembers()
k-means++ fixes the weakness that random initialization can lead to non-optimal results by changing how the initial centroids are chosen. The first centroid is still picked at random, but the remaining ones preferentially go to points that are far away from the centroids already chosen.
Steps for selecting the initial centroids (as implemented in selectInitialCentroids below):
1. Pick the first centroid uniformly at random from the data points.
2. For every point, compute the distance to its closest already-chosen centroid.
3. Convert those distances into weights that sum to 1 and pick the next centroid with a roulette-wheel draw, so distant points are more likely to be chosen.
4. Repeat steps 2-3 until k centroids have been selected.
```python
def selectInitialCentroids(self):
    """implement the k-means++ method of selecting the set of initial centroids"""
    centroids = []
    total = 0
    # first step is to select a random first centroid
    current = random.choice(range(len(self.data[0])))
    centroids.append(current)
    # loop to select the rest of the centroids, one at a time
    for i in range(0, self.k - 1):
        # for every point in the data find its distance to the closest centroid
        weights = [self.distanceToClosestCentroid(x, centroids)
                   for x in range(len(self.data[0]))]
        total = sum(weights)
        # instead of raw distances, convert so the weights sum to 1
        weights = [x / total for x in weights]
        # now roll virtual die
        num = random.random()
        total = 0
        x = -1
        # the roulette wheel simulation
        while total < num:
            x += 1
            total += weights[x]
        centroids.append(x)
    self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
                      for r in centroids]

def distanceToClosestCentroid(self, point, centroidList):
    result = self.eDistance(point, centroidList[0])
    for centroid in centroidList[1:]:
        distance = self.eDistance(point, centroid)
        if distance < result:
            result = distance
    return result

def eDistance(self, i, j):
    """compute distance of point i from centroid j"""
    sumSquares = 0
    for k in range(1, self.cols):
        sumSquares += (self.data[k][i] - self.data[k][j])**2
    return math.sqrt(sumSquares)
```