KNN算法的Python实现

发布时间:2019-09-20 07:35:27编辑:auto阅读(1507)

    # KNN算法思路:

    #-----------------------------------------------------#

    #step1:读入数据,存储为链表

    #step2:数据预处理,包括缺失值处理、归一化等

    #step3:设置K值

    #step4:计算待测样本与所有样本的距离(二值、序数、连续)

    #step5:投票决定待测样本的类别

    #step6:利用测试集测试正确率

    #-----------------------------------------------------#

    注:因为是python的初学者,可能很多高级的用法还不会,所以把python代码写的像C还请大家不要吐槽。同时希望大家指出其中的错误和有待提高的地方,大家一起进步才是最棒的。

    说明:数据集采自著名UCI数据集库 http://archive.ics.uci.edu/ml/datasets/Adult

    # Author :CWX
    # Date :2015/9/1
    # Function: A classifier which using KNN algorithm 
    
    import math
    
    attributes = {"age":0,"workclass":1,"fnlwg":2,"education":3,"education-num":4,
    			 "marital-status":5,"occupation":6,"relationship":7,"race":8,
    			 "sex":9,"capital-gain":10,"capital-loss":11,"hours-per-week":12,
    			 "native-country":13,"salary":14
    			}
    
    def read_txt(filename):
    #read data and convert it into list 
    	items = []
    	fp = open(filename,'r')
    	lines = fp.readlines()
    	for line in lines:
    		line = line.strip('\n')
    		items.append(line)
    	fp.close()
    
    	i = 0
    	b = []
    	for i in range(len(items)):
    		b.append(items[i].split(','))
    	return b
    
    def computeNa(items):
    # detect missing value in list and handle it
    # items - an whole list 
    	for item in items[:]:
    		if item.count(' ?') > 0:
    			items.remove(item)
    		# if item.count(' ?') >= -1:
    		# 	items.remove(item)
    	return items
    
    def disCal(lst1,lst2,type):
    # calculting distance between lst1 and lst2
    	distance = 0;
    	if type == "Manhattan" or type == "manhattan":
    		for i in range(len(lst2) - 1):
    			distance += abs(lst1[i] - lst2[i])
    	elif type == "Elucildean" or type == "elucildean":
    		for i in range(len(lst2) - 1):
    			distance += math.sqrt((lst1[i] - lst2[i])**2)
    	else:
    		print "Error in type name"
    		distance = -1
    	return distance
    
    def computeContinous(datalist,attribute):
    # compute continous attributes in list
    	min_val = int(datalist[0][attribute])
    	max_val = int(datalist[0][attribute])
    	for items in datalist:
    		if int(items[attribute]) < min_val:
    			min_val = int(items[attribute])
    		elif int(items[attribute]) > max_val:
    			max_val = int(items[attribute])
    	for items in datalist[:]:
    		items[attribute] = (int(items[attribute]) - min_val) / float(max_val - min_val)
    	return datalist
    
    def computeOrdinal(datalist,attribute,level):
    # compute ordinal attribute in datalist
    	level_dict = {}
    	for i in range(len(level)):
    		level_dict[level[i]] = float(i) / (len(level) - 1)
    #		level_dict[level[i]] = i
    	for items in datalist[:]:
    		items[attribute] = level_dict[items[attribute]]
    	return datalist
    
    def KnnAlgorithm(dataTrain,sample,attribute,k):
    	mergeData = dataTrain
    	mergeData.append(sample)
    	data = preProcessing(mergeData)
    	distance = []
    	for i in range(len(data)-2):
    		distance.append(disCal(data[i],data[len(data)-1],"Elucildean"))
    	copy_dis = distance[:] # notice : not copy_dis = distance ,if it will be wrong
    	distance.sort()
    
    	class_dict = {"Yes":0,"No":0}
    	for i in range(k):
    		index = copy_dis.index(distance[i])
    		if data[index][attribute] == " >50K":
    			class_dict["Yes"] += 1
    		else:
    			class_dict["No"] += 1
    	if  class_dict["Yes"] > class_dict["No"]:
    		print "sample's salary >50K"
    	else:
    		print "sample's salary <=50K"
    
    def preProcessing(datalist):
    	b = computeNa(datalist)
    
    	b = computeContinous(b,attributes["age"])
    
    	workclass_level = [" Private"," Self-emp-not-inc"," Self-emp-inc"," Federal-gov"," Local-gov"," State-gov"," Without-pay"," Never-worked"]
    	b = computeOrdinal(b,attributes["workclass"],workclass_level)
    
    	b = computeContinous(b,attributes["fnlwg"])
    
    	education_level =[" Bachelors"," Some-college"," 11th"," HS-grad"," Prof-school",
    				  " Assoc-acdm"," Assoc-voc"," 9th"," 7th-8th"," 12th"," Masters"," 1st-4th"," 10th"," Doctorate"," 5th-6th"," Preschool"] 
    	b = computeOrdinal(b,attributes["education"],education_level)
    
    	b = computeContinous(b,attributes["education-num"])
    
    	marital_status_level = [" Married-civ-spouse"," Divorced"," Never-married"," Separated"," Widowed"," Married-spouse-absent"," Married-AF-spouse"] 
    	b = computeOrdinal(b,attributes["marital-status"],marital_status_level) 
    
    	occupation_level  = [" Tech-support"," Craft-repair"," Other-service"," Sales"," Exec-managerial"," Prof-specialty"," Handlers-cleaners",
    					 " Machine-op-inspct"," Adm-clerical"," Farming-fishing"," Transport-moving"," Priv-house-serv"," Protective-serv"," Armed-Forces"]
    	b = computeOrdinal(b,attributes["occupation"],occupation_level)
    
    	relationship_level = [" Wife"," Own-child"," Husband"," Not-in-family"," Other-relative"," Unmarried"]
    	b = computeOrdinal(b,attributes["relationship"],relationship_level)
    
    	race_level = [" White"," Asian-Pac-Islander"," Amer-Indian-Eskimo"," Other"," Black"]
    	b = computeOrdinal(b,attributes["race"],race_level)
    
    	sex_level = [" Female", " Male"]
    	b = computeOrdinal(b,attributes["sex"],sex_level)
    
    	b = computeContinous(b,attributes["capital-gain"])
    
    	b = computeContinous(b,attributes["capital-loss"])
    
    	b = computeContinous(b,attributes["hours-per-week"])
    
    	native_country_level = [" United-States"," Cambodia"," England"," Puerto-Rico"," Canada"," Germany"," Outlying-US(Guam-USVI-etc)"," India",
    						" Japan"," Greece"," South"," China"," Cuba"," Iran"," Honduras"," Philippines"," Italy"," Poland"," Jamaica"," Vietnam"," Mexico"," Portugal",
    						" Ireland"," France"," Dominican-Republic"," Laos"," Ecuador"," Taiwan"," Haiti"," Columbia"," Hungary"," Guatemala"," Nicaragua"," Scotland",
    						" Thailand"," Yugoslavia"," El-Salvador"," Trinadad&Tobago"," Peru"," Hong"," Holand-Netherlands"]
    	b = computeOrdinal(b,attributes["native-country"],native_country_level)
    	return b
    
    def assessment(dataTrain,dataTest,atrribute,k):
    	mergeData = computeNa(dataTrain)
    	len_train = len(mergeData)
    	mergeData.extend(computeNa(dataTest))
    	data = preProcessing(mergeData)	
    	len_test = len(data) - len_train
    	res_dict = {"correct":0,"wrong":0}
    	for i in range(len_test):
    		distance = []
    		class_dict = {"Yes":0,"No":0}
    		for j in range(len_train):
    			distance.append(disCal(data[j],data[i+len_train],"Elucildean"))
    		copy_dis = distance[:]
    		distance.sort()	
    		for m in range(k):
    			index = copy_dis.index(distance[m])
    			if data[index][atrribute] == " >50K":
    				class_dict["Yes"] += 1
    			else:
    				class_dict["No"] += 1	  
    		if class_dict["Yes"] > class_dict["No"] and mergeData[i+len_train][atrribute] == " >50K.": #Attention : in train data in the end of lines there is a "."
    			res_dict["correct"]  += 1
    		elif mergeData[i+len_train][atrribute] == " <=50K." and class_dict["Yes"] < class_dict["No"]:
    			res_dict["correct"]  += 1
    		else:
    			res_dict["wrong"] += 1
    	correct_ratio = float(res_dict["correct"]) / (res_dict["correct"] + res_dict["wrong"])
    	print "correct_ratio = ",correct_ratio 
    	
    filename = "H:\BaiduYunDownload\AdultDatasets\Adult_data.txt"
    #sample = [" 80"," Private"," 226802"," 11th"," 7"," Never-married"," Machine-op-inspct"," Own-child"," Black"," Male"," 0"," 0"," 40"," United-States"," <=50K"]
    sample = [" 65"," Private"," 184454"," HS-grad"," 9"," Married-civ-spouse"," Machine-op-inspct"," Husband"," White"," Male"," 6418"," 0"," 40"," United-States"," >50K"]
    # this samples salary <=50K#
    # filename = "D:\MyDesktop-HnH\data.txt"
    a = read_txt(filename)
    print len(a)
    
    k = 3
    #KnnAlgorithm(a,sample,attributes["salary"],k)
    
    trainName = "H:\BaiduYunDownload\AdultDatasets\Adult_test.txt" 
    trainData = read_txt(trainName)
    #preProcessing(trainData)
    assessment(a,trainData,attributes["salary"],k)

    结果:正确率 0.812416998672

        运行时间 1小时20分钟

关键字