# 朴素贝叶斯–Python实现西瓜数据判别

8,610次阅读

Contents

## 朴素贝叶斯概念

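1、设 $x=\{a_1,a_2,...,a_m\}$ 为一个待分类项，而每个 $a_i$ 为 $x$ 的一个特征属性。

2、有类别集合 $C=\{y_1,y_2,...,y_n\}$。

3、计算 $P(y_1|x),P(y_2|x),...,P(y_n|x)$。

4、如果 $P(y_k|x)=\max\{P(y_1|x),P(y_2|x),...,P(y_n|x)\}$，则 $x\in y_k$。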

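上面第3步中的各个条件概率可以按如下步骤计算：

1、找到一个已知分类的待分类项集合，这个集合叫做训练样本集。

2、统计得到在各类别下各个特征属性的条件概率估计。即 $P(a_1|y_1),P(a_2|y_1),...,P(a_m|y_1);\;P(a_1|y_2),P(a_2|y_2),...,P(a_m|y_2);\;...;\;P(a_1|y_n),P(a_2|y_n),...,P(a_m|y_n)$。

3、如果各个特征属性是条件独立的，则根据贝叶斯定理有如下推导：

$$P(y_i|x)=\frac{P(x|y_i)P(y_i)}{P(x)}$$

因为分母 $P(x)$ 对于所有类别都是常数，所以只需将分子最大化即可。又因为各特征属性是条件独立的，所以有：

$$P(x|y_i)P(y_i)=P(a_1|y_i)P(a_2|y_i)\cdots P(a_m|y_i)P(y_i)=P(y_i)\prod_{j=1}^{m}P(a_j|y_i)$$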

## 拉普拉斯校准

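这一节讨论 $P(a|y)$ 的估计。当某个特征取值在某一类别的训练样本中从未出现时，直接用频率估计会得到 $P(a|y)=0$，从而使整个连乘结果为0。拉普拉斯校准通过给每个计数加1来避免这一问题，即 $P(a|y)=\frac{N_{a,y}+1}{N_y+N_a}$，其中 $N_{a,y}$ 为类别 $y$ 中该特征取值为 $a$ 的样本数，$N_y$ 为类别 $y$ 的样本数，$N_a$ 为该特征可能的取值个数。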

## 代码实践

# -*- coding: utf-8 -*-
import csv
import numpy as np
from math import sqrt

# Smoothing constants for the first six feature columns: entry i is added to
# the denominator of the Laplace-smoothed conditional probability computed for
# column i in cond_attr_problity. Presumably this is the number of distinct
# values each discrete attribute can take -- confirm against the dataset.
attr_num=[3,3,3,3,3,2]
def loadCsv(filename):
    """Load a numeric CSV file into a 2-D float array.

    NOTE(review): the ``def`` line of this function is missing from the
    source as published (the body began with a bare ``csv.reader`` call and
    ended in a top-level ``return``). The name ``loadCsv`` and the single
    ``filename`` parameter are a reconstruction -- confirm against the callers.

    The first row is skipped (treated as a header), every remaining cell is
    converted to ``float``, and the first column is dropped from the result.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``numpy.ndarray`` with one row per data row of the file and one
        column per CSV column except the first.

    Raises:
        ValueError: If the file has no data rows, or a cell is not numeric.
    """
    # On Python 3 the csv module needs a text-mode handle opened with
    # newline=""; the original "rb" mode only worked on Python 2 and the
    # handle was never closed.
    with open(filename, newline="") as handle:
        # Blank lines come back from csv.reader as empty lists; skip them so
        # they cannot produce a ragged array.
        rows = [row for row in csv.reader(handle) if row]

    records = [[float(cell) for cell in row] for row in rows[1:]]
    if not records:
        raise ValueError("no data rows found in %r" % (filename,))

    result = np.array(records)
    return result[:, 1:]

def pre_problity(datasets):
    """Estimate the Laplace-smoothed prior probability of each class.

    The last column of ``datasets`` is read as the class label, with ``1.0``
    marking the positive class and ``0.0`` the negative class. Each class
    count is incremented by 1 and the sample total by 2 (one per class), so
    neither prior can be zero.

    Args:
        datasets: 2-D numpy array whose last column holds the labels.

    Returns:
        A two-element list ``[pos_prob, neg_prob]``.
    """
    labels = datasets[:, -1]
    # Float denominator keeps the division exact under Python 2 as well.
    smoothed_total = labels.shape[0] + 2.0
    pos_prob = (np.sum(labels == 1.0) + 1) / smoothed_total
    neg_prob = (np.sum(labels == 0.0) + 1) / smoothed_total
    return [pos_prob, neg_prob]
def cond_attr_problity(datasets, testdata, attr_values=None):
    """Compute per-feature class-conditional likelihoods for one test sample.

    The last column of ``datasets`` is the class label (``1.0`` positive,
    ``0.0`` negative). The first ``len(attr_values)`` feature columns are
    treated as discrete and scored with a Laplace-smoothed frequency; every
    remaining feature column is treated as continuous and scored with a
    normal density whose mean and standard deviation are taken from the
    training rows of that class.

    Args:
        datasets: 2-D numpy array of training rows, label in the last column.
        testdata: 2-D numpy array; only row 0 is scored.
        attr_values: Smoothing constant for each discrete column (added to
            the class count in the denominator). Defaults to the module-level
            ``attr_num``, which reproduces the original hard-coded behaviour
            of six discrete columns followed by continuous columns 6 and 7.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_features, 2)``; column 0 holds the
        values for the positive class and column 1 for the negative class.
    """
    if attr_values is None:
        attr_values = attr_num

    n_features = np.shape(datasets)[1] - 1
    n_discrete = len(attr_values)
    labels = datasets[:, -1]
    # Index 0 -> positive-class rows, index 1 -> negative-class rows, matching
    # the column order of the result.
    class_rows = (datasets[labels == 1.0, :], datasets[labels == 0.0, :])

    cond_result = np.zeros([n_features, 2])

    # Discrete attributes: Laplace-smoothed relative frequency of the test
    # sample's value within each class.
    for col in range(n_discrete):
        for cls, rows in enumerate(class_rows):
            matches = np.sum(rows[:, col] == testdata[0, col])
            cond_result[col, cls] = 1.0 * (matches + 1) / (rows.shape[0] + attr_values[col])

    # Continuous attributes: normal density at the test value. np.std is the
    # population standard deviation (ddof=0), as in the original code.
    for col in range(n_discrete, n_features):
        for cls, rows in enumerate(class_rows):
            mean = np.mean(rows[:, col])
            std = np.std(rows[:, col])
            cond_result[col, cls] = (
                1.0 / (sqrt(2 * np.pi) * std)
                * np.exp(-1 * (testdata[0, col] - mean) ** 2 / (2 * std ** 2))
            )
    return cond_result

def classify_data(cond_result,pre_result):
    """Pick the class with the larger naive Bayes score and print it.

    Each class score is its prior multiplied by every per-feature likelihood
    for that class. The winning label (``'pos'`` or ``'neg'``) and its score
    are printed on separate lines; a tie is reported as ``'neg'``.

    Args:
        cond_result: 2-D array with one row per feature; column 0 holds the
            positive-class likelihoods and column 1 the negative-class ones.
        pre_result: Sequence ``[pos_prior, neg_prior]``.

    Returns:
        The winning label as a string. The original returned nothing, so
        callers that ignore the return value are unaffected.
    """
    pos_result = pre_result[0]
    neg_result = pre_result[1]
    # Multiply sequentially, in row order, so the printed score matches the
    # original computation.
    for pos_cond, neg_cond in cond_result:
        pos_result *= pos_cond
        neg_result *= neg_cond

    if pos_result > neg_result:
        label, score = 'pos', pos_result
    else:
        label, score = 'neg', neg_result

    # Function-call form of print: valid on Python 3 and, with a single
    # argument, prints the same text on Python 2.
    print(label)
    print(score)
    return label

def main():
    """Script entry point.

    NOTE(review): only the two path assignments below are present in the
    source as published; the calls that load these files and run the
    classifier appear to have been lost. Restore them from the original
    script rather than guessing.
    """
    filename = 'watermelon3_0_En.csv'
    testname = 'test.csv'


# Run only when executed as a script, so importing the module has no side
# effects.
if __name__ == '__main__':
    main()