# Rev:         37139e651931737bdbc0cd56fdd463686b017ce7
# Size:        1,946 bytes
# Date:        2014-02-11 05:34:40
# Author:      Lorenzo Isella
# Log message: I modified the code to run the random forest for the Kaggle loan competition.
#!/usr/bin/env python
import scipy as s
import numpy as n
import string
import pandas as pd
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
def reader_fun(filename, datatype):
    """Read a whitespace-delimited text file into a 2D numeric array.

    Parameters
    ----------
    filename : str
        Path to a text file where each line holds whitespace-separated values.
    datatype : callable
        Converter applied to every token (e.g. ``int`` or ``float``).

    Returns
    -------
    numpy.ndarray
        Array with one row per line of the file.
    """
    # "with" guarantees the handle is closed even if a token fails to
    # convert (the original open()/close() pair leaked on error).
    with open(filename) as f:
        # line.split() replaces the removed string.split(line) helper;
        # the list comprehension works identically on Python 2 and 3.
        raw_data = [[datatype(tok) for tok in line.split()] for line in f]
    # n.array (numpy) replaces s.array: scipy's top-level numpy aliases
    # are deprecated and removed from modern SciPy releases.
    raw_data = n.array(raw_data)
    return raw_data
#I use pandas to read the data simply because it understands immediately which columns are integers
#and which are float
# train = n.genfromtxt('train_data_fixed.dat',dtype=float)
# loss = n.genfromtxt('loss_data.dat',dtype=float)
# train = pd.read_csv('train_data_fixed.csv')
train = n.loadtxt('train_data_fixed_no_header.dat',dtype="float")
#but I still use numpy for this one as it will return the right 1D array
loss = n.loadtxt('loss_data.dat', dtype="float")
# I am now already cleaning and scaling the data in an R code, so I do not need
# to redo this now
# train_mean = s.mean(train, axis=0)
# train_std= s.std(train, axis=0)
# train=train-train_mean
# train=train/train_std
# n.savetxt("train_mean.dat", train_mean)
# n.savetxt("train_std.dat", train_std)
# print "s.shape(train_balance) is, "
# print s.shape(train_balance)
# train = train/train_balance
# without balancing, some numbers are sooo large that I get an
# error.
print "End of data reading"
clf = RandomForestRegressor(n_estimators=300,\
# compute_importances = True, \
n_jobs=10, verbose=3)
clf.fit(train, loss)
# s = pickle.dumps(clf)
#save the random forest model
f = open('rf_model.txt','wb')
pickle.dump(clf,f)
# joblib.dump(clf.fit, 'random-forest.pkl')
# predictions = clf.predict(test)
print "So far so good"