# Random forest in Python

# -*- coding: utf-8 -*-
#-------------------
#@Author: cuijian
#-------------------
import pandas as pd
import numpy as np
from random import randrange
from math import floor,sqrt
from sklearn.preprocessing import LabelEncoder

from sklearn.cross_validation import train_test_split
class randomforest(object):
'''
Random forest model.
'''
'''
Load the data set used for the experiment (the open-source sonar data).
filename: path plus file name of the data file
'''
# NOTE(review): the method header that owns the docstring above, and the
# statement that defines `data`, are not present in this copy of the file.
# Presumably `data` is a pandas DataFrame read from `filename` -- confirm and
# restore those lines before running.
# The last column holds the class label.
target=data.iloc[:,-1]
# Encode the labels as integers.
self.target=LabelEncoder().fit_transform(list(target))
# NOTE(review): the slice stops at shape[1]-2, so it drops the label column
# AND the last feature column; `shape[1]-1` would keep every feature.
# Confirm which one is intended.
self.data=data.iloc[:,0:data.shape[1]-2]
def subsample(self, ratio):
    '''
    Split self.data / self.target into a training part and a test part.

    ratio is forwarded to train_test_split as ``test_size``, i.e. it is the
    fraction of samples that is held out for testing (0.3 -> 30% test,
    70% train).  The split uses a fixed random_state of 0, so every call
    returns the same partition.

    Returns (test data, test target, train data, train target) -- note that
    the test part comes first.
    '''
    parts = train_test_split(self.data, self.target, test_size=ratio, random_state=0)
    fit_data, held_data, fit_target, held_target = parts
    return held_data, held_target, fit_data, fit_target
def buildtree(self, traindata, traintarget, max_depth, min_size, n_features):
    '''
    Build one decision tree and return its root node.

    traindata    training samples
    traintarget  labels of the training samples
    max_depth    maximum depth of the tree
    min_size     group size at or below which a branch becomes a leaf
    n_features   number of randomly sampled features tried per split
    '''
    first_level = 1  # the root split counts as depth 1
    tree = self.get_root(traindata, traintarget, n_features)
    self.construct_tree(tree, traindata, traintarget, max_depth, min_size,
                        n_features, first_level)
    return tree
def node_end(self, group, target):
    '''
    Turn a group of samples into a leaf: return the most frequent label.

    group   positional indices of the samples in the group
    target  label array; it is indexed with the whole index list at once
    '''
    labels = list(target[group])
    return max(set(labels), key=labels.count)

def construct_tree(self, node, data, target, max_depth, min_size, n_features, depth):
    '''
    Recursively grow the subtree below ``node``, modifying it in place.

    node        split node as produced by get_root; its 'groups' entry is
                consumed and replaced by 'left' / 'right' children
    data        samples that reached this node
    target      labels of those samples (indexed with lists of positions)
    max_depth   depth at which both children are forced to be leaves
    min_size    a side with this many samples or fewer becomes a leaf
    n_features  number of random features tried per split
    depth       depth of ``node`` (the root is 1)

    A child is either a label (leaf) or another split dict.
    '''
    leftgroup, rightgroup = node.pop('groups')

    # The split separated nothing: both children share one majority label.
    # (The original tested the undefined name `right` here, which raised
    # NameError whenever the left group was non-empty.)
    if not leftgroup or not rightgroup:
        node['left'] = node['right'] = self.node_end(leftgroup + rightgroup, target)
        return

    # Depth limit reached: close both sides with leaves.
    if depth >= max_depth:
        node['left'] = self.node_end(leftgroup, target)
        node['right'] = self.node_end(rightgroup, target)
        return

    # Left is handled completely (including its recursion) before right.
    for side, group in (('left', leftgroup), ('right', rightgroup)):
        if len(group) <= min_size:
            node[side] = self.node_end(group, target)
            continue
        # Group indices are positions within `data`/`target`, so the subset
        # passed down is re-indexed from zero by the next split.
        subdata = data.iloc[group, :]
        subtarget = target[group]
        node[side] = self.get_root(subdata, subtarget, n_features)
        self.construct_tree(node[side], subdata, subtarget, max_depth,
                            min_size, n_features, depth + 1)

def get_root(self, traindata, traintarget, n_features):
    '''
    Find the best split for one tree node.

    traindata    training samples reaching this node
    traintarget  labels of those samples
    n_features   number of distinct columns to sample at random; values
                 larger than the number of columns are capped, and a float
                 is truncated to an int

    Every value of every sampled column is tried as a threshold and the
    split with the lowest giniscore wins (the first one found on ties).

    Returns {'index': column position, 'value': threshold,
             'groups': (left indices, right indices)}.
    '''
    n_columns = traindata.shape[1]
    # Never ask for more distinct columns than exist, otherwise the sampling
    # loop below could not terminate.
    wanted = min(int(n_features), n_columns)
    features = list()
    while len(features) < wanted:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)

    b_index, b_value, b_score, b_groups = 999, 999, float('inf'), None
    for index in features:
        for rows in range(traindata.shape[0]):
            candidate = traindata.iloc[rows, index]
            groups = self.groups_split(index, candidate, traindata)
            gini = self.giniscore(groups, traintarget)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, candidate, gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
def groups_split(self, index, nodeValue, data):
    '''
    Partition the samples into two groups by comparing one column with a
    threshold.

    index      position of the feature column
    nodeValue  threshold taken from one sample's value in that column
    data       samples to partition

    Returns (leftgroup, rightgroup): row positions whose value is strictly
    below nodeValue go left, all others go right.
    '''
    below = (data.iloc[:, index] < nodeValue).tolist()
    leftgroup = [row for row, is_below in enumerate(below) if is_below]
    rightgroup = [row for row, is_below in enumerate(below) if not is_below]
    return leftgroup, rightgroup

def train(self, n_trees, max_depth, min_size, n_features, ratio):
    '''
    Train the forest and evaluate it.

    n_trees     number of trees in the forest
    max_depth   maximum depth of each tree
    min_size    group size at or below which a branch becomes a leaf
    n_features  number of random features tried per split
    ratio       passed to subsample for every split

    Each tree is built on the training part of a fresh subsample call; a
    final subsample call supplies the test part that is handed to
    bagging_predict.  Returns whatever bagging_predict returns.
    '''
    forest = []
    for _ in range(n_trees):
        _, _, fit_data, fit_target = self.subsample(ratio)
        forest.append(
            self.buildtree(fit_data, fit_target, max_depth, min_size, n_features)
        )
    held_data, held_target, _, _ = self.subsample(ratio)
    return self.bagging_predict(held_data, held_target, forest)
def bagging_predict(self, data, target, trees):
    '''
    Majority vote over all trees, scored against the true labels.

    data    test samples
    target  labels of the test samples
    trees   the trained trees

    Returns the fraction of samples whose voted label equals the label in
    target (a float).
    '''
    n_rows = data.shape[0]
    result = []
    for position in range(n_rows):
        sample = data.iloc[position, :]
        votes = [self.predict(sample, target[position], tree) for tree in trees]
        result.append(max(set(votes), key=votes.count))

    hits = sum(1 for position in range(n_rows) if target[position] == result[position])
    return hits * 1.0 / len(result)
def predict(self, data, target, tree):
    '''
    Classify one sample with a single tree.

    data    the sample; it is indexed with each node's 'index'
    target  label of the sample (not used for the decision)
    tree    root node of the tree

    Walks down from the root: go left when the sample's value is strictly
    below the node's 'value', right otherwise, until a child that is not a
    dict (a leaf label) is reached; that label is returned.
    '''
    node = tree
    while True:
        side = 'left' if data[node['index']] < node['value'] else 'right'
        child = node[side]
        if not isinstance(child, dict):
            return child
        node = child

def giniscore(self, groups, traintarget):
    '''
    Gini score of a candidate split; lower means purer groups, i.e. a
    better split.

    groups       the groups of sample positions produced by the split
    traintarget  label array, indexed with a whole group at once

    For every non-empty group the impurity sum(p * (1 - p)) over all labels
    is computed, and the groups are combined weighted by their share of the
    samples.  (The original divided each group's impurity by the group size,
    which made a large mixed group look better than a small one.)

    Returns 0.0 when all groups are empty.
    '''
    # float() keeps the divisions below true divisions on Python 2 as well.
    total = float(sum(len(subgroup) for subgroup in groups))
    if not total:
        return 0.0

    classes = set(traintarget)
    gini = 0.0
    for subgroup in groups:
        if not len(subgroup):
            continue
        size = float(len(subgroup))
        subdata = traintarget[subgroup]
        groupgini = 0.0
        for value in classes:
            prob = sum(subdata == value) / size
            groupgini += prob * (1 - prob)
        gini += groupgini * (size / total)
    return gini

def main():
    '''
    Train a small forest with fixed settings and print the value returned
    by randomforest.train.
    '''
    Test = randomforest()
    ntrees = 8
    max_depth = 10
    min_size = 1
    ratio = 0.8
    # floor() returns a float on Python 2; the feature count must be an int.
    n_features = int(floor(sqrt(Test.data.shape[1] - 1)))
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the print statement used before.
    print(Test.train(ntrees, max_depth, min_size, n_features, ratio))


if __name__ == '__main__':
    main()


