注册 登录
    • 为了保证你在浏览本网站时有着更好的体验,建议使用类似Chrome、Firefox之类的浏览器~~
    • 如果你喜欢本站的内容何不Ctrl+D收藏一下呢,与大家一起分享各种编程知识~
    • 本网站研究机器学习、计算机视觉、模式识别~当然不局限于此,生命在于折腾,何不年轻时多折腾一下

随机森林python

Python admin 77次浏览 0个评论 扫描二维码

博主尝试对一个数据集构建一个二分类随机森林,所有特征都是连续属性,也尝试画一下随机森林的流程图
随机森林python
代码

# -*- coding: utf-8 -*-
#-------------------
#@Author: cuijian 
#-------------------
from math import floor, sqrt
from random import randrange

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

try:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn installs only
    from sklearn.cross_validation import train_test_split
class randomforest(object):
    '''
    A minimal random-forest classifier for binary problems with
    continuous features: CART trees grown on resampled data, combined
    by majority vote.
    '''

    def load_data(self, filename):
        '''
        Load a CSV dataset whose LAST column is the class label
        (e.g. the open sonar data).

        filename: path + name of the CSV file (no header row).
        Sets self.data (feature columns) and self.target (0/1 labels).
        '''
        data = pd.read_csv(filename, header=None)
        target = data.iloc[:, -1]
        # Encode the string class labels as integers 0..k-1.
        self.target = LabelEncoder().fit_transform(list(target))
        # BUGFIX: keep every feature column. The original sliced to
        # shape[1]-2, silently dropping the last feature as well as
        # the target column.
        self.data = data.iloc[:, 0:data.shape[1] - 1]

    def subsample(self, ratio):
        '''
        Split the loaded data into train/test parts.

        ratio: fraction of samples held out as the TEST set (passed
               straight through as train_test_split's test_size).
        Returns (Testdata, Testtarget, Traindata, Traintarget).

        NOTE(review): random_state=0 makes every call return the same
        split, so each bagged tree sees identical rows; tree diversity
        then comes only from the random feature sampling in get_root.
        '''
        Traindata, Testdata, Traintarget, Testtarget = train_test_split(
            self.data, self.target, test_size=ratio, random_state=0)
        return Testdata, Testtarget, Traindata, Traintarget

    def buildtree(self, traindata, traintarget, max_depth, min_size, n_features):
        '''
        Grow one CART decision tree.

        max_depth   maximum depth of the tree
        min_size    minimum group size required to keep splitting
        n_features  number of features randomly sampled at each split
        Returns the root node dict.
        '''
        root = self.get_root(traindata, traintarget, n_features)
        self.construct_tree(root, traindata, traintarget,
                            max_depth, min_size, n_features, 1)
        return root

    def node_end(self, group, target):
        '''
        Turn a leaf group into a prediction: the majority class among
        the samples whose positional indices are listed in `group`.
        '''
        labels = list(target[group])
        return max(set(labels), key=labels.count)

    def construct_tree(self, node, data, target, max_depth, min_size, n_features, depth):
        '''
        Recursively split `node` until a stop condition is met.

        `data`/`target` are the samples that reached this node; the row
        indices in node['groups'] are positional within them, so the
        recursion re-slices with .iloc before descending.
        '''
        leftgroup, rightgroup = node['groups']
        del(node['groups'])
        # BUGFIX: the original tested the undefined name `right` here,
        # raising NameError whenever leftgroup was non-empty.
        if not leftgroup or not rightgroup:
            node['left'] = node['right'] = self.node_end(leftgroup + rightgroup, target)
            return
        if depth >= max_depth:
            node['left'] = self.node_end(leftgroup, target)
            node['right'] = self.node_end(rightgroup, target)
            return
        if len(leftgroup) <= min_size:
            node['left'] = self.node_end(leftgroup, target)
        else:
            node['left'] = self.get_root(data.iloc[leftgroup, :],
                                         target[leftgroup], n_features)
            self.construct_tree(node['left'], data.iloc[leftgroup, :],
                                target[leftgroup], max_depth, min_size,
                                n_features, depth + 1)
        if len(rightgroup) <= min_size:
            node['right'] = self.node_end(rightgroup, target)
        else:
            node['right'] = self.get_root(data.iloc[rightgroup, :],
                                          target[rightgroup], n_features)
            self.construct_tree(node['right'], data.iloc[rightgroup, :],
                                target[rightgroup], max_depth, min_size,
                                n_features, depth + 1)

    def get_root(self, traindata, traintarget, n_features):
        '''
        Find the best split for the samples in `traindata`: try every
        row value of `n_features` randomly chosen feature columns and
        keep the split with the lowest Gini impurity.

        Returns {'index': column, 'value': threshold,
                 'groups': (left_rows, right_rows)}.
        '''
        b_index, b_value, b_score, b_groups = 999, 999, 999, None
        # Sample n_features distinct columns at random.
        features = list()
        while len(features) < n_features:
            index = randrange(traindata.shape[1])
            if index not in features:
                features.append(index)
        for index in features:
            for rows in range(traindata.shape[0]):
                groups = self.groups_split(index, traindata.iloc[rows, index], traindata)
                gini = self.giniscore(groups, traintarget)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = \
                        index, traindata.iloc[rows, index], gini, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def groups_split(self, index, nodeValue, data):
        '''
        Partition the rows of `data` on one feature threshold.

        index      feature column index
        nodeValue  threshold value taken from a candidate sample
        data       training data reaching this node
        Returns (leftgroup, rightgroup): positional row indices with
        feature < threshold on the left, the rest on the right.
        '''
        leftgroup, rightgroup = list(), list()
        for row in range(data.shape[0]):
            if data.iloc[row, index] < nodeValue:
                leftgroup.append(row)
            else:
                rightgroup.append(row)
        return leftgroup, rightgroup

    def train(self, n_trees, max_depth, min_size, n_features, ratio):
        '''
        Train the forest and evaluate it on a held-out split.

        n_trees: number of trees in the forest.
        Returns the forest's classification ACCURACY on the test split
        (a float in [0, 1]), not the raw predictions.
        '''
        trees = list()
        for i in range(n_trees):
            Testdata, Testtarget, Traindata, Traintarget = self.subsample(ratio)
            trees.append(self.buildtree(Traindata, Traintarget,
                                        max_depth, min_size, n_features))
        Testdata, Testtarget, Traindata, Traintarget = self.subsample(ratio)
        return self.bagging_predict(Testdata, Testtarget, trees)

    def bagging_predict(self, data, target, trees):
        '''
        Majority vote over all trees.

        data    test data
        target  test labels
        trees   the trained trees
        Returns the fraction of test rows whose voted class matches
        the true label (accuracy).
        '''
        result = []
        for x in range(data.shape[0]):
            votes = [self.predict(data.iloc[x, :], target[x], tree) for tree in trees]
            result.append(max(set(votes), key=votes.count))
        correct = sum(1 for x in range(len(result)) if target[x] == result[x])
        return correct * 1.0 / len(result)

    def predict(self, data, target, tree):
        '''
        Route one sample down a single tree.

        data    one test row (a Series)
        target  its true label (unused in the decision; kept for the
                original call signature)
        tree    a node dict or, at the leaves, a bare class label
        '''
        if data[tree['index']] < tree['value']:
            branch = tree['left']
        else:
            branch = tree['right']
        if isinstance(branch, dict):
            return self.predict(data, target, branch)
        return branch

    def giniscore(self, groups, traintarget):
        '''
        Gini impurity of a candidate split: the size-weighted average
        of each group's impurity sum(p * (1 - p)). Lower = purer.

        BUGFIX: the original divided each group's impurity by the group
        size, over-weighting small groups; the CART criterion weights
        each group by its share of the samples instead.
        '''
        total = float(sum(len(g) for g in groups))
        gini = 0.0
        for subgroup in groups:
            if not len(subgroup):
                continue
            subdata = traintarget[subgroup]
            groupgini = 0.0
            for value in set(traintarget):
                prob = sum(subdata == value) * 1.0 / len(subdata)
                groupgini += prob * (1 - prob)
            gini += groupgini * len(subgroup) / total
        return gini

def main():
    '''
    Train a small random forest on the sonar dataset and print its
    held-out accuracy.
    '''
    Test = randomforest()
    Test.load_data('sonar.csv')
    ntrees = 8
    max_depth = 10
    min_size = 1
    ratio = 0.8  # fraction of samples held out as the test split
    # Sample sqrt(#features) columns per split. int() keeps this an
    # integer under Python 2, where math.floor returns a float.
    n_features = int(floor(sqrt(Test.data.shape[1] - 1)))
    # BUGFIX: `print expr` is Python-2-only syntax; print(expr) is
    # valid on both Python 2 and 3.
    print(Test.train(ntrees, max_depth, min_size, n_features, ratio))

if __name__ == '__main__':
    main()

Deeplearn, 版权所有丨如未注明 , 均为原创丨本网站采用BY-NC-SA协议进行授权 , 转载请注明随机森林python
喜欢 (0)
[xiaocui]
分享 (0)

您必须 登录 才能发表评论!