
File Info

Rev. 37139e651931737bdbc0cd56fdd463686b017ce7
Size 1,946 bytes
Date 2014-02-11 05:34:40
Author Lorenzo Isella
Log Message

I modified the code to run the random forest for the Kaggle loan competition.

Content

#!/usr/bin/env python
import scipy as s
import numpy as n

import pandas as pd

import pickle


from sklearn.externals import joblib

from sklearn.ensemble import RandomForestRegressor

def reader_fun(filename, datatype):
    """Read a whitespace-delimited text file, casting every field with
    datatype, and return the contents as an array."""
    f = open(filename)
    # raw_data = [map(int, line.split()) for line in f.readlines()]
    raw_data = [map(datatype, line.split()) for line in f.readlines()]
    f.close()
    # the dtype follows from datatype (e.g. float -> float64)
    raw_data = s.array(raw_data)
    return raw_data
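
# Usage sketch, kept commented out: with datatype=float this should give the
# same 2D float array as the loadtxt call below.
# train = reader_fun('train_data_fixed_no_header.dat', float)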



# I use pandas to read the data simply because it immediately understands
# which columns are integers and which are floats (see the sketch below)

# train = n.genfromtxt('train_data_fixed.dat',dtype=float)

# loss = n.genfromtxt('loss_data.dat',dtype=float)


# train = pd.read_csv('train_data_fixed.csv')
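
# A sketch of that pandas route, kept commented out (it assumes
# 'train_data_fixed.csv' still carries its header row):
# train_df = pd.read_csv('train_data_fixed.csv')
# print train_df.dtypes  # shows which columns pandas inferred as int64 vs float64
# train = train_df.values.astype("float64")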


train = n.loadtxt('train_data_fixed_no_header.dat',dtype="float")

# but I still use numpy for this one, as loadtxt returns the 1D array I need

loss = n.loadtxt('loss_data.dat', dtype="float")
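
# Quick sanity check: train should be a 2D matrix, while loadtxt returns the
# 1D loss vector that fit() expects as y.
print "s.shape(train) is", s.shape(train)
print "s.shape(loss) is", s.shape(loss)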





# I now clean and scale the data in an R script, so I do not need
# to redo it here

# train_mean = s.mean(train, axis=0)

# train_std= s.std(train, axis=0)

# train=train-train_mean
# train=train/train_std

# n.savetxt("train_mean.dat", train_mean)
# n.savetxt("train_std.dat", train_std)
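
# If this scaling were re-enabled, the saved statistics would also have to be
# applied to the test data (sketch; 'test' is a hypothetical array read like train):
# test = (test - n.loadtxt("train_mean.dat")) / n.loadtxt("train_std.dat")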


# print "s.shape(train_balance) is, "

# print s.shape(train_balance)

# train = train/train_balance

# Without balancing, some numbers are so large that I get an
# error.


print "End of data reading"


clf = RandomForestRegressor(n_estimators=300,
                            # compute_importances=True,
                            n_jobs=10, verbose=3)

clf.fit(train, loss)
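
# After fitting, per-feature importances are available on demand as an
# attribute (the replacement for the deprecated compute_importances flag above):
# print clf.feature_importances_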


# s = pickle.dumps(clf)

#save the random forest model

f = open('rf_model.txt', 'wb')
pickle.dump(clf, f)
f.close()
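
# To read the model back in a later session:
# f = open('rf_model.txt', 'rb')
# clf = pickle.load(f)
# f.close()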


# joblib.dump(clf.fit, 'random-forest.pkl')

# predictions = clf.predict(test)
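
# A sketch of the prediction step; the test filename and output format below
# are assumptions, not part of this script:
# test = n.loadtxt('test_data_fixed_no_header.dat', dtype="float")
# predictions = clf.predict(test)
# n.savetxt("predictions.dat", predictions)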

                            

print "So far so good"