Machine Learning with R in Action

Classification (I): Trees, Lazy Learning, and Probabilistic Methods

Using recursive partitioning trees

library(C50)
data(churn)
str(churnTrain)
# Remove columns that are not needed
churnTrain=churnTrain[,!names(churnTrain)%in%c("state","area_code","account_length")]
# Put 70% of the data into the training set and 30% into the test set
set.seed(2)
ind=sample(2,nrow(churnTrain),replace=TRUE,prob=c(0.7,0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
# Check the dimensions of the training and test sets
dim(trainset)
dim(testset)
# Build a classification model with a recursive partitioning tree
library(rpart)
## Build the classification tree model
churn.rp=rpart(churn~.,data=trainset)
churn.rp
## Examine the complexity parameter
printcp(churn.rp)
plotcp(churn.rp)
summary(churn.rp)
# Visualize the recursive partitioning tree
plot(churn.rp,margin = 0.1)
text(churn.rp,all=TRUE,use.n = TRUE)
# Evaluate the predictive power of the recursive partitioning tree
predictions=predict(churn.rp,testset,type="class")
table(testset$churn,predictions)
library(caret)
confusionMatrix(table(predictions,testset$churn))

Pruning the recursive partitioning tree

## Find the minimum cross-validation error of the classification tree model
min(churn.rp$cptable[,"xerror"])
which.min(churn.rp$cptable[,"xerror"])
churn.cp=churn.rp$cptable[which.min(churn.rp$cptable[,"xerror"]),"CP"]
churn.cp
## Prune the tree by setting cp to the CP value of the record with the minimum cross-validation error
prune.tree=prune(churn.rp,cp=churn.cp)
plot(prune.tree,margin=0.1)
text(prune.tree,all=TRUE,use.n = TRUE)
# Generate the classification table
predictions=predict(prune.tree,testset,type="class")
table(testset$churn,predictions)
confusionMatrix(table(predictions,testset$churn))

Using conditional inference trees

library(party)
ctree.model=ctree(churn~.,data=trainset)
ctree.model
plot(ctree.model)
# Rebuild the tree with fewer input features and plot it again
daycharge.model=ctree(churn~total_day_charge,data=trainset)
plot(daycharge.model)
# Evaluate predictive power
ctree.predict=predict(ctree.model,testset)
table(ctree.predict,testset$churn)
confusionMatrix(table(ctree.predict,testset$churn))
# Output the predicted class probabilities for the first few test samples
tr=treeresponse(ctree.model,newdata=testset[1:5,])
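# treeresponse returns one class-probability vector per test sample; a small
# sketch (prob.mat is an illustrative name) binding them into a matrix whose
# columns are assumed to follow levels(testset$churn):
prob.mat=do.call(rbind,tr)
colnames(prob.mat)=levels(testset$churn)
prob.mat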

The k-nearest neighbor classifier

library(class)
levels(trainset$international_plan)=list("0"="no","1"="yes")
levels(trainset$voice_mail_plan)=list("0"="no","1"="yes")
levels(testset$international_plan)=list("0"="no","1"="yes")
levels(testset$voice_mail_plan)=list("0"="no","1"="yes")
# Train the kNN classifier on the training set and classify the test set
churn.knn=knn(trainset[,!names(trainset)%in%c("churn")],testset[,!names(testset)%in%c("churn")],trainset$churn,k=3)
summary(churn.knn)
table(testset$churn,churn.knn)
library(caret)
confusionMatrix(table(testset$churn,churn.knn))

Logistic regression classification

fit=glm(churn~.,data=trainset,family=binomial)
summary(fit)
### Refit the regression after removing the insignificant variables
fit=glm(churn~international_plan+voice_mail_plan+total_intl_calls+number_customer_service_calls,data=trainset,family=binomial)
summary(fit)
pred=predict(fit,testset,type="response")
Class=pred>0.5
summary(Class)
tb=table(testset$churn,Class)
# Generate the confusion matrix
churn.mod=ifelse(testset$churn=="yes",1,0)
pred_class=churn.mod
pred_class[pred<=0.5]=1-pred_class[pred<=0.5]
ctb=table(churn.mod,pred_class)
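# Overall accuracy can be read off the diagonal of this table; a minimal
# sketch (not part of the original recipe):
accuracy=sum(diag(ctb))/sum(ctb)
accuracy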

Support vector machines

library(e1071)
model=svm(churn~.,data=trainset,kernel="radial",cost=1,gamma=1/ncol(trainset))
summary(model)

Choosing the penalty factor (cost) of the support vector machine

## Prepare the data
iris.subset=subset(iris,select=c("Sepal.Length","Sepal.Width","Species"),Species%in%c("setosa","virginica"))
plot(x=iris.subset$Sepal.Length,y=iris.subset$Sepal.Width,col=iris.subset$Species,pch=19)
# Set the penalty factor to 1
svm.model=svm(Species~.,data=iris.subset,kernel='linear',cost=1,scale=FALSE)
points(iris.subset[svm.model$index,c(1,2)],col="blue",cex=2)
w=t(svm.model$coefs)%*%svm.model$SV
b=-svm.model$rho
abline(a=-b/w[1,2],b=-w[1,1]/w[1,2],col="red",lty=5)
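# To see the effect of a heavier penalty, the same fit can be repeated with a
# much larger cost (a sketch; cost=10000 is illustrative): the margin narrows
# and fewer points remain support vectors
plot(x=iris.subset$Sepal.Length,y=iris.subset$Sepal.Width,col=iris.subset$Species,pch=19)
svm.model=svm(Species~.,data=iris.subset,kernel='linear',cost=10000,scale=FALSE)
points(iris.subset[svm.model$index,c(1,2)],col="blue",cex=2)
w=t(svm.model$coefs)%*%svm.model$SV
b=-svm.model$rho
abline(a=-b/w[1,2],b=-w[1,1]/w[1,2],col="red",lty=5)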

Visualizing the SVM model

data(iris)
model.iris=svm(Species~.,iris)
plot(model.iris,iris,Petal.Width~Petal.Length,slice=list(Sepal.Width=3,Sepal.Length=4))
plot(model,trainset,total_day_minutes~total_intl_charge)

Class prediction with the trained SVM model

svm.pred=predict(model,testset[,!names(testset)%in%c("churn")])
svm.table=table(svm.pred,testset$churn)
svm.table
# Compute the classification agreement coefficients
classAgreement(svm.table)
# Evaluate predictive performance
library(caret)
confusionMatrix(svm.table)

Predicting continuous values

library(car)
data(Quartet)
model.reg=svm(Quartet$y1~Quartet$x,type="eps-regression")
predict.y=predict(model.reg,Quartet$x)
predict.y
plot(Quartet$x,Quartet$y1,pch=19)
points(Quartet$x,predict.y,pch=15,col="red")
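# A quick fit-quality check (not in the original): root mean squared error
# between the observed y1 and the SVM regression predictions
sqrt(mean((Quartet$y1-predict.y)^2))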

Tuning the support vector machine

tuned=tune.svm(churn~.,data=trainset,gamma=10^(-6:-1),cost=10^(1:2))
summary(tuned)
# Retrain the SVM with the best parameters found
model.tuned=svm(churn~.,data=trainset,gamma=tuned$best.parameters$gamma,cost=tuned$best.parameters$cost)
summary(model.tuned)
# Predict
svm.tuned.pred=predict(model.tuned,testset[,!names(testset)%in%c("churn")])
svm.tuned.table=table(svm.tuned.pred,testset$churn)
svm.tuned.table
# Evaluate the tuned model
classAgreement(svm.tuned.table)
confusionMatrix(svm.tuned.table)

Analyzing data with neuralnet (binary response variable)

library(neuralnet)
library(datasets)
data(infert)
nn=neuralnet(case~age+parity+induced+spontaneous,data=infert,hidden = 2,err.fct = "ce",linear.output = FALSE)
nn
nn$result.matrix
plot(nn)
par(mfrow=c(2,2))
gwplot(nn,selected.covariate = "age")
gwplot(nn,selected.covariate = "parity")
gwplot(nn,selected.covariate = "induced")
gwplot(nn,selected.covariate = "spontaneous")
new.output=compute(nn,covariate = matrix(c(22,1,0,0,
                                           22,1,1,0,
                                           22,1,0,1,
                                           22,1,1,1),
                                         byrow=TRUE,ncol=4))
new.output$net.result
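# The outputs estimate P(case=1); a small sketch thresholding them at 0.5 to
# obtain 0/1 class predictions:
ifelse(new.output$net.result>0.5,1,0)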

A neural network model for the iris data

data(iris)
ind=sample(2,nrow(iris),replace=TRUE,prob=c(0.7,0.3))
trainset=iris[ind==1,]
testset=iris[ind==2,]
library(neuralnet)
# Add indicator columns versicolor, setosa, and virginica to the training set based on Species
trainset$setosa=trainset$Species=="setosa"
trainset$virginica=trainset$Species=="virginica"
trainset$versicolor=trainset$Species=="versicolor"
network=neuralnet(versicolor+virginica+setosa~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,trainset,hidden=3)
network$result.matrix
# Return the first entries of the model's generalized weights
head(network$generalized.weights[[1]])
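# A sketch of predicting the hold-out set: feed the four predictors through
# compute() and take the column with the largest output; the column order
# follows the formula (versicolor, virginica, setosa)
net.predict=compute(network,testset[,1:4])$net.result
net.prediction=c("versicolor","virginica","setosa")[apply(net.predict,1,which.max)]
table(testset$Species,net.prediction)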

Perceptron training code

a<-0.2 # learning rate
w<-rep(0,3) # weight vector
iris1<-t(as.matrix(iris[,3:4]))
d<-c(rep(0,50),rep(1,100)) # desired output
e<-rep(0,150)
p<-rbind(rep(1,150),iris1) # input matrix with a bias row
max<-100000
eps<-rep(0,100000)
i<-0
repeat{
  v<-w%*%p;
  y<-ifelse(sign(v)>=0,1,0);
  e<-d-y;
  eps[i+1]<-sum(abs(e))/length(e)
  if(eps[i+1]<0.01){
    print("finish:");
    print(w);
    break;
  }
  w<-w+a*(d-y)%*%t(p);
  i<-i+1;
  if(i>max){
    print("max time loop");
    print(eps[i])
    print(y);
    break;
  }
}

# Plot the data and the learned decision boundary
plot(Petal.Length~Petal.Width,xlim=c(0,3),ylim=c(0,8),
     data=iris[iris$Species=="virginica",])
data1<-iris[iris$Species=="versicolor",]
points(data1$Petal.Width,data1$Petal.Length,col=2)
data2<-iris[iris$Species=="setosa",]
points(data2$Petal.Width,data2$Petal.Length,col=3)
x<-seq(0,3,0.01)
y<-x*(-w[3]/w[2])-w[1]/w[2] # boundary w1+w2*Petal.Length+w3*Petal.Width=0, solved for Petal.Length
lines(x,y,col=4)
# Plot the mean absolute error for each iteration
plot(1:i,eps[1:i],type="o")

# Linear neural network
p<-rbind(rep(1,150),iris1)
d<-c(rep(0,50),rep(1,100))
w<-rep(0,3)
a<-1/max(eigen(t(p)%*%p)$values) # learning rate bounded by the largest eigenvalue
max<-1000
e<-rep(0,150)
eps<-rep(0,1000)
i<-0
for(i in 1:max){
  v<-w%*%p;
  y<-v;
  e<-d-y;
  eps[i+1]<-sum(e^2)/length(e)
  w<-w+a*(d-y)%*%t(p);
  if(i==max)
    print(w)
}
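# A sketch (not in the original) of drawing the fitted line on the earlier
# scatter plot: with 0/1 targets, the class boundary is where the linear
# output equals 0.5, i.e. w1 + w2*Petal.Length + w3*Petal.Width = 0.5
x<-seq(0,3,0.01)
y<-(0.5-w[1]-w[3]*x)/w[2]
lines(x,y,col=5)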

The nnet package

library(nnet) # load the nnet package
library(mlbench) # load the mlbench package
data(Vehicle) # load the data
n=length(Vehicle[,1]) # sample size
set.seed(1) # set the random seed
samp=sample(1:n,n/2) # randomly pick half of the observations as the training set
b=class.ind(Vehicle$Class) # generate class indicator columns
test.cl=function(true,pred){true<-max.col(true);cres=max.col(pred);table(true,cres)}
# Inputs are the first 18 variables; the hidden layer has 3 nodes, initial
# random weights lie in [-0.1,0.1], and the weights decay during training
a=nnet(Vehicle[samp,-19],b[samp,],size=3,rang=0.1,decay=5e-4,maxit=200)
test.cl(b[samp,],predict(a,Vehicle[samp,-19])) # classification results on the training set
test.cl(b[-samp,],predict(a,Vehicle[-samp,-19])) # classification results on the test set
# Build a network whose hidden layer has 15 nodes, continuing from the code above:
a=nnet(Vehicle[samp,-19],b[samp,],size=15,rang=0.1,decay=5e-4,maxit=10000)
test.cl(b[samp,],predict(a,Vehicle[samp,-19]))
test.cl(b[-samp,],predict(a,Vehicle[-samp,-19]))
summary(a)
# Predict class labels with the model obtained from the nnet package
v.predict=predict(a,Vehicle[samp,-19])

Model evaluation

library(C50)
library(e1071)
data(churn)
# Use cut to split the row indices into 10 folds
ind=cut(1:nrow(churnTrain),breaks=10,labels=F)
# Run 10-fold cross-validation with a for loop
accuracies=c()
for(i in 1:10){
  fit=svm(churn~.,churnTrain[ind!=i,])
  predictions=predict(fit,churnTrain[ind == i, !names(churnTrain) %in% c("churn")])
  correct_count=sum(predictions==churnTrain[ind==i,c("churn")])
  accuracies=append(correct_count/nrow(churnTrain[ind==i,]),accuracies)
}
accuracies
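# The cross-validated accuracy estimate is the average over the 10 folds
mean(accuracies)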

Cross-validation with the e1071 package

library(C50)
library(e1071)
data(churn)
set.seed(2)
ind=sample(2,nrow(churnTrain),replace=TRUE,prob=c(0.7,0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
tuned=tune.svm(churn~.,data=trainset,gamma = 10^-2,cost=10^2,tunecontrol=tune.control(cross=10))
summary(tuned)
tuned$performances
# Generate a classification table with the tuned model
svmfit=tuned$best.model
table(trainset[,c("churn")],predict(svmfit))

Cross-validation with the caret package

library(caret)
# Set the training control: 10-fold cross-validation repeated 3 times
control=trainControl(method="repeatedcv",number=10,repeats=3)
# Build a classification model on the telecom churn data with rpart
library(rpart)
model=train(churn~.,data=trainset,method="rpart",preProcess="scale",trControl=control)
model
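# A sketch (assuming the testset from the earlier churn split) of classifying
# the hold-out set with the caret model and evaluating it:
rpart.pred=predict(model,newdata=testset)
confusionMatrix(table(rpart.pred,testset$churn))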

Ranking variable importance with the caret package (continuing from above)

importance=varImp(model,scale=FALSE)
importance
plot(importance)
# The rpart package also provides its own variable importance measure
library(rpart)
model.rp=rpart(churn~.,data=trainset)
model.rp$variable.importance

Ranking variable importance with the rminer package

library(rminer)
model=fit(churn~.,trainset,model="svm")
VariableImportance=Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph = "IMP",leg=names(trainset),col="gray",Grid=10)

Finding highly correlated features with the caret package

# Drop the non-numeric attributes
new_train=trainset[,!names(trainset) %in% c("churn","international_plan","voice_mail_plan","state","area_code")]
# Compute the correlation between every pair of attributes
cor_mat=cor(new_train)
# Find the attributes whose correlation exceeds 0.75
highlyCorrelated=findCorrelation(cor_mat,cutoff=0.75)
# Print the names of these highly correlated attributes
names(new_train)[highlyCorrelated]
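# These columns can then be dropped before modeling (a small sketch;
# new_train_filtered is an illustrative name):
new_train_filtered=new_train[,-highlyCorrelated]
dim(new_train_filtered)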

A complete caret analysis workflow

## Prepare the data (a medicinal dataset)
library(caret)
data(mdrr)
## Too many variables: reduce dimensionality by removing predictors with near-zero variance
zerovar=nearZeroVar(mdrrDescr)
newdata1=mdrrDescr[,-zerovar]
## Then remove variables that are strongly correlated with other variables
descrCorr=cor(newdata1)
highCorr=findCorrelation(descrCorr,0.9)
## Drop the multicollinear variables
newdata2 = newdata1[, -highCorr]
comboInfo = findLinearCombos(newdata2)
newdata2=newdata2[, -comboInfo$remove]
## Standardize the data (preProcess defaults to centering and scaling; an imputation method would be needed to fill missing values)
Process=preProcess(newdata2)
newdata3=predict(Process,newdata2)
# Split the data with createDataPartition into 75% training and 25% test samples
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE)
trainx = newdata3[inTrain,]
testx = newdata3[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
### Other utilities: partition the data several times
createDataPartition(mdrrClass,times=4,p=0.5)
### Generate folds for cross-validation
createFolds(mdrrClass,10)
# Inspect the data graphically
featurePlot(trainx[,1:2],trainy,plot='box')
# Feature selection; cv stands for cross-validation
subsets=c(20,30,40,50,60,70,80)
ctrl=rfeControl(functions=rfFuncs,method="cv",verbose=FALSE,returnResamp = "final")
Profile=rfe(newdata3,mdrrClass,sizes=subsets,rfeControl = ctrl)
print(Profile)
plot(Profile)
# Return the predictors that are finally retained
Profile$optVariables

Generating dummy variables with the caret package

library(caret)
customers<-data.frame(id=c(10,20,30,40,50),gender=c("male","female","female","male","female"),
                      mood=c("happy","sad","happy","sad","happy"),outcome=c(1,1,0,0,0))
customers
dmy<-dummyVars(~.,data=customers)
trsf<-data.frame(predict(dmy,newdata=customers))
trsf
# Check the data types in customers
str(customers)
# As shown, outcome defaults to numeric, which is not what we want here.
# Next, convert the outcome variable to the factor type.
customers$outcome<-as.factor(customers$outcome)
str(customers)
# Re-create dmy so that the factor outcome is dummy-coded as well
dmy<-dummyVars(~.,data=customers)
trsf<-data.frame(predict(dmy,newdata=customers))
trsf
# Dummy-code a single variable in the data
dmy<-dummyVars(~gender,data=customers)
trfs<-data.frame(predict(dmy,newdata=customers))
trfs
# For a two-level factor, dummy coding can produce two columns that carry the same information (e.g., gender.female and gender.male); fullRank=T drops the redundant one
dmy<-dummyVars(~.,data=customers,fullRank=T)
trfs<-data.frame(predict(dmy,newdata=customers))
trfs
