SlideShare a Scribd company logo
R:


     sesejun@is.ocha.ac.jp
     2009/10/29(        )
•                                                contacts_train.csv


     •
     •                               (setwd
             >       >                        )
"Pred","Young","Myope","Astimatic","Tear"
"P","Y","Y","Y","N"
"P","Y","Y","N","N"
"P","N","Y","Y","N"
"P","N","Y","Y","N"
"N","Y","Y","Y","Y"
"N","Y","Y","N","Y"
"N","N","N","N","Y"
"N","N","N","N","N"
"N","N","N","N","Y"
"N","N","N","N","N"
                                                    contacts.csv
> contacts.train<-read.table("contacts_train.csv", header=T,
sep=",")
> contacts.train
   Pred Young Myope Astimatic Tear
1     P     Y     Y         Y    N
2     P     Y     Y         N    N
3     P     N     Y         Y    N
4     P     N     Y         Y    N
5     N     Y     Y         Y    Y
6     N     Y     Y         N    Y
7     N     N     N         N    Y
8     N     N     N         N    N
9     N     N     N         N    Y
10    N     N     N         N    N
> contacts.train[1,]
    Pred Young Myope Astimatic Tear
 1     P     Y     Y         Y    N
 > contacts.train[,2]
   [1] Y Y N N Y Y N N N N
 Levels: N Y
 > contacts.train[,"Pred"]
   [1] P P P P N N N N N N
 Levels: N P
 > contacts.train$Pred
   [1] P P P P N N N N N N
 Levels: N P



> contacts.train[c(-1,-3,-5,-7,-9),]
   Pred Young Myope Astimatic Tear
2     P     Y     Y         N    N
4     P     N     Y         Y    N
6     N     Y     Y         N    Y
8     N     N     N         N    N
10    N     N     N         N    N
> class(contacts.train)
[1] "data.frame"




> forecast <- data.frame(date=c("10/1","10/2","10/3"),
weather=c("sunny","sunny","rain"))
> forecast
  date weather
1 10/1   sunny
2 10/2   sunny
3 10/3     rain
> forecast$weather
[1] sunny sunny rain
Levels: rain sunny
> forecast$date
[1] 10/1 10/2 10/3
> nrow(contacts.train)
[1] 10
> ncol(contacts.train)
[1] 5
> rownames(contacts.train)
 [1] "1" "2" "3" "4" "5" "6" "7"         "8"   "9"   "10"
> colnames(contacts.train)
[1] "Pred"      "Young"    "Myope"       "Astimatic" "Tear"

> colnames(contacts.train)[2]
[1] "Young"

> colnames(contacts.train)[2] <- "Old"
> colnames(contacts.train)
[1] "Pred"      "Old"       "Myope"      "Astimatic" "Tear"

> colnames(contacts.train)[2] <- "Young"
> contacts.train$Young
  [1] Y Y N N Y Y N N N N
Levels: N Y
> order(contacts.train$Young)
  [1] 3 4 7 8 9 10 1 2 5 6
> contacts.train[order(contacts.train$Young),]
    Pred Young Myope Astimatic Tear
3      P     N     Y         Y    N
4      P     N     Y         Y    N
7      N     N     N         N    Y
8      N     N     N         N    N
9      N     N     N         N    Y
10     N     N     N         N    N
1      P     Y     Y         Y    N
2      P     Y     Y         N    N
5      N     Y     Y         Y    Y
6      N     Y     Y         N    Y
> library("mvpart")
> rpart(Young~., data=contacts.train, method="class")
n= 10

node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 10 4 N (0.6000000 0.4000000)
  2) Myope=N 4 0 N (1.0000000 0.0000000) *
  3) Myope=Y 6 2 Y (0.3333333 0.6666667) *


> rpart(Young~., data=contacts.train, method="class",
control=rpart.control(cp=-1))
n= 10

node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 10 4 N (0.6000000 0.4000000)
  2) Myope=N 4 0 N (1.0000000 0.0000000) *
  3) Myope=Y 6 2 Y (0.3333333 0.6666667)
    6) Pred=P 4 2 N (0.5000000 0.5000000) *
    7) Pred=N 2 0 Y (0.0000000 1.0000000) *
IRIS
 •   http://archive.ics.uci.edu/ml/machine-learning-databases/iris/     iris.data


     •               iris.names
     •                                                                (setosa, versicolor, virginica)


 •                          http://togodb.sel.is.ocha.ac.jp/


> iris.train <- read.table("iris_train.csv", sep=",", header=T)
> length(rownames(iris.train))
[1] 120
> length(colnames(iris.train))
[1] 5




> hist(iris.train$Sepal.length)
> hist(iris.train$Petal.length)
> library("mvpart")
> rpart(Class~., data=iris.train, method="class",
control=rpart.control(cp=.1))
n= 120

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 120 77 Iris-setosa (0.35833333 0.34166667 0.30000000)
  2) Petal.length< 2.45 43 0 Iris-setosa (1.00000000 0.00000000
0.00000000) *
  3) Petal.length>=2.45 77 36 Iris-versicolor (0.00000000 0.53246753
0.46753247)
    6) Petal.length< 4.75 37 1 Iris-versicolor (0.00000000 0.97297297
0.02702703) *
    7) Petal.length>=4.75 40 5 Iris-virginica (0.00000000 0.12500000
0.87500000) *
> iris.dtree<-rpart(Class~., data=iris.train, method="class",
control=rpart.control(cp=.1))
> plot.new()
> plot(iris.dtree,uniform=T,margin=0.5)
> text(iris.dtree,use.n=T,all.leaves=F)
> plot(iris.train$Petal.length, iris.train$Petal.width, pch =
c(1,2,3)[unclass(iris.train$Class)])
> iris.test <- read.table("iris_test.csv", sep=",", header=T)


> iris.predict <- predict(iris.dtree, iris.test[1:4], type="class")
> iris.predict
               2              4              18              34
    Iris-setosa     Iris-setosa     Iris-setosa     Iris-setosa
...

> iris.predict ==   iris.test$Class
 [1] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE FALSE   TRUE
[11] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE TRUE    TRUE
[21] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE TRUE    TRUE

> sum(iris.predict == iris.test$Class) / length(iris.test$Class)
[1] 0.9666667
> sum(iris.predict != iris.test$Class) / length(iris.test$Class)
[1] 0.03333333
•
    •
        •
        •
        •   rpart       control=rpart.control(cp=.1)   .1


    •                                                       10


    •               3                2                      3

More Related Content

PDF
Datamining r 2nd
sesejun
 
PDF
Calculo2lista2
Cleide Soares
 
PDF
El mundo de la incertidumbre 472 - diciembre de 2015
Camilo Herrera
 
PPTX
Rss y lectores
74244691
 
PDF
Trois conferences Dalcroze
david bonnin
 
PPTX
What are scal­able best prac­tices to spread smart health?
SharpBrains
 
PDF
Vera peralta
Tere Vera
 
PPTX
Lector rss
camiloj67
 
Datamining r 2nd
sesejun
 
Calculo2lista2
Cleide Soares
 
El mundo de la incertidumbre 472 - diciembre de 2015
Camilo Herrera
 
Rss y lectores
74244691
 
Trois conferences Dalcroze
david bonnin
 
What are scal­able best prac­tices to spread smart health?
SharpBrains
 
Vera peralta
Tere Vera
 
Lector rss
camiloj67
 

Similar to Datamining R 2nd (20)

PDF
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ssusere0a682
 
PDF
Oceans 2019 tutorial-geophysical-nav_7-updated
Francisco Curado-Teixeira
 
PDF
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
Ken'ichi Matsui
 
PDF
機械学習モデルの判断根拠の説明
Satoshi Hara
 
PPTX
Data Science for Folks Without (or With!) a Ph.D.
Douglas Starnes
 
DOCX
Simpatía
eliacosta2010
 
PPT
การสุ่มตัวอย่างในงานวิจัยสาธารณสุข
Ultraman Taro
 
KEY
Introduction to Perl Best Practices
José Castro
 
PDF
IIT-JEE Mains 2016 Online Previous Question Paper Day 1
Eneutron
 
PDF
Data Manipulation Using R (& dplyr)
Ram Narasimhan
 
PDF
A note on estimation of population mean in sample survey using auxiliary info...
Alexander Decker
 
PDF
Datamining R 4th
sesejun
 
PDF
Chap02-Solutions-Ex-2-2-Calculus (1).pdf
qasimrazam89
 
PDF
Chap02-Solutions-Ex-2-2-Calculus.pdfjhsdaoihsdaiousadjh
qasimrazam89
 
PDF
Intoduction to numpy
Faraz Ahmed
 
PDF
Jamieson_Jain2018
Masa Kato
 
PDF
Regression and Classification with R
Yanchang Zhao
 
PDF
Slides ensae-2016-11
Arthur Charpentier
 
PDF
Datamining r 1st
sesejun
 
PDF
Ai2418281871
IJMER
 
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ssusere0a682
 
Oceans 2019 tutorial-geophysical-nav_7-updated
Francisco Curado-Teixeira
 
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
Ken'ichi Matsui
 
機械学習モデルの判断根拠の説明
Satoshi Hara
 
Data Science for Folks Without (or With!) a Ph.D.
Douglas Starnes
 
Simpatía
eliacosta2010
 
การสุ่มตัวอย่างในงานวิจัยสาธารณสุข
Ultraman Taro
 
Introduction to Perl Best Practices
José Castro
 
IIT-JEE Mains 2016 Online Previous Question Paper Day 1
Eneutron
 
Data Manipulation Using R (& dplyr)
Ram Narasimhan
 
A note on estimation of population mean in sample survey using auxiliary info...
Alexander Decker
 
Datamining R 4th
sesejun
 
Chap02-Solutions-Ex-2-2-Calculus (1).pdf
qasimrazam89
 
Chap02-Solutions-Ex-2-2-Calculus.pdfjhsdaoihsdaiousadjh
qasimrazam89
 
Intoduction to numpy
Faraz Ahmed
 
Jamieson_Jain2018
Masa Kato
 
Regression and Classification with R
Yanchang Zhao
 
Slides ensae-2016-11
Arthur Charpentier
 
Datamining r 1st
sesejun
 
Ai2418281871
IJMER
 
Ad

More from sesejun (20)

PDF
RNAseqによる変動遺伝子抽出の統計: A Review
sesejun
 
PDF
バイオインフォマティクスによる遺伝子発現解析
sesejun
 
PDF
次世代シーケンサが求める機械学習
sesejun
 
PDF
20110602labseminar pub
sesejun
 
PDF
20110524zurichngs 2nd pub
sesejun
 
PDF
20110524zurichngs 1st pub
sesejun
 
PDF
20110214nips2010 read
sesejun
 
PDF
Datamining 9th association_rule.key
sesejun
 
PDF
Datamining 8th hclustering
sesejun
 
PDF
Datamining r 4th
sesejun
 
PDF
Datamining r 3rd
sesejun
 
PDF
Datamining 6th svm
sesejun
 
PDF
Datamining 5th knn
sesejun
 
PDF
Datamining 4th adaboost
sesejun
 
PDF
Datamining 3rd naivebayes
sesejun
 
PDF
Datamining 2nd decisiontree
sesejun
 
PDF
Datamining 7th kmeans
sesejun
 
PDF
100401 Bioinfoinfra
sesejun
 
PDF
Datamining 8th Hclustering
sesejun
 
PDF
Datamining 9th Association Rule
sesejun
 
RNAseqによる変動遺伝子抽出の統計: A Review
sesejun
 
バイオインフォマティクスによる遺伝子発現解析
sesejun
 
次世代シーケンサが求める機械学習
sesejun
 
20110602labseminar pub
sesejun
 
20110524zurichngs 2nd pub
sesejun
 
20110524zurichngs 1st pub
sesejun
 
20110214nips2010 read
sesejun
 
Datamining 9th association_rule.key
sesejun
 
Datamining 8th hclustering
sesejun
 
Datamining r 4th
sesejun
 
Datamining r 3rd
sesejun
 
Datamining 6th svm
sesejun
 
Datamining 5th knn
sesejun
 
Datamining 4th adaboost
sesejun
 
Datamining 3rd naivebayes
sesejun
 
Datamining 2nd decisiontree
sesejun
 
Datamining 7th kmeans
sesejun
 
100401 Bioinfoinfra
sesejun
 
Datamining 8th Hclustering
sesejun
 
Datamining 9th Association Rule
sesejun
 
Ad

Datamining R 2nd

  • 1. R: [email protected] 2009/10/29( )
  • 2. contacts_train.csv • • (setwd > > ) "Pred","Young","Myope","Astimatic","Tear" "P","Y","Y","Y","N" "P","Y","Y","N","N" "P","N","Y","Y","N" "P","N","Y","Y","N" "N","Y","Y","Y","Y" "N","Y","Y","N","Y" "N","N","N","N","Y" "N","N","N","N","N" "N","N","N","N","Y" "N","N","N","N","N" contacts.csv
  • 3. > contacts.train<-read.table("contacts_train.csv", header=T, sep=",") > contacts.train Pred Young Myope Astimatic Tear 1 P Y Y Y N 2 P Y Y N N 3 P N Y Y N 4 P N Y Y N 5 N Y Y Y Y 6 N Y Y N Y 7 N N N N Y 8 N N N N N 9 N N N N Y 10 N N N N N
  • 4. > contacts.train[1,] Pred Young Myope Astimatic Tear 1 P Y Y Y N > contacts.train[,2] [1] Y Y N N Y Y N N N N Levels: N Y > contacts.train[,"Pred"] [1] P P P P N N N N N N Levels: N P > contacts.train$Pred [1] P P P P N N N N N N Levels: N P > contacts.train[c(-1,-3,-5,-7,-9),] Pred Young Myope Astimatic Tear 2 P Y Y N N 4 P N Y Y N 6 N Y Y N Y 8 N N N N N 10 N N N N N
  • 5. > class(contacts.train) [1] "data.frame" > forecast <- data.frame(date=c("10/1","10/2","10/3"), weather=c("sunny","sunny","rain")) > forecast date weather 1 10/1 sunny 2 10/2 sunny 3 10/3 rain > forecast$weather [1] sunny sunny rain Levels: rain sunny > forecast$date [1] 10/1 10/2 10/3
  • 6. > nrow(contacts.train) [1] 10 > ncol(contacts.train) [1] 5 > rownames(contacts.train) [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" > colnames(contacts.train) [1] "Pred" "Young" "Myope" "Astimatic" "Tear" > colnames(contacts.train)[2] [1] "Young" > colnames(contacts.train)[2] <- "Old" > colnames(contacts.train) [1] "Pred" "Old" "Myope" "Astimatic" "Tear" > colnames(contacts.train)[2] <- "Young"
  • 7. > contacts.train$Young [1] Y Y N N Y Y N N N N Levels: N Y > order(contacts.train$Young) [1] 3 4 7 8 9 10 1 2 5 6 > contacts.train[order(contacts.train$Young),] Pred Young Myope Astimatic Tear 3 P N Y Y N 4 P N Y Y N 7 N N N N Y 8 N N N N N 9 N N N N Y 10 N N N N N 1 P Y Y Y N 2 P Y Y N N 5 N Y Y Y Y 6 N Y Y N Y
  • 8. > library("mvpart") > rpart(Young~., data=contacts.train, method="class") n= 10 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 10 4 N (0.6000000 0.4000000) 2) Myope=N 4 0 N (1.0000000 0.0000000) * 3) Myope=Y 6 2 Y (0.3333333 0.6666667) * > rpart(Young~., data=contacts.train, method="class", control=rpart.control(cp=-1)) n= 10 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 10 4 N (0.6000000 0.4000000) 2) Myope=N 4 0 N (1.0000000 0.0000000) * 3) Myope=Y 6 2 Y (0.3333333 0.6666667) 6) Pred=P 4 2 N (0.5000000 0.5000000) * 7) Pred=N 2 0 Y (0.0000000 1.0000000) *
  • 9. IRIS • http://archive.ics.uci.edu/ml/machine-learning-databases/iris/ iris.data • iris.names • (setosa, versicolor, virginica) • http://togodb.sel.is.ocha.ac.jp/ > iris.train <- read.table("iris_train.csv", sep=",", header=T) > length(rownames(iris.train)) [1] 120 > length(colnames(iris.train)) [1] 5 > hist(iris.train$Sepal.length) > hist(iris.train$Petal.length)
  • 10. > library("mvpart") > rpart(Class~., data=iris.train, method="class", control=rpart.control(cp=.1)) n= 120 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 120 77 Iris-setosa (0.35833333 0.34166667 0.30000000) 2) Petal.length< 2.45 43 0 Iris-setosa (1.00000000 0.00000000 0.00000000) * 3) Petal.length>=2.45 77 36 Iris-versicolor (0.00000000 0.53246753 0.46753247) 6) Petal.length< 4.75 37 1 Iris-versicolor (0.00000000 0.97297297 0.02702703) * 7) Petal.length>=4.75 40 5 Iris-virginica (0.00000000 0.12500000 0.87500000) *
  • 11. > iris.dtree<-rpart(Class~., data=iris.train, method="class", control=rpart.control(cp=.1)) > plot.new() > plot(iris.dtree,uniform=T,margin=0.5) > text(iris.dtree,use.n=T,all.leaves=F)
  • 12. > plot(iris.train$Petal.length, iris.train$Petal.width, pch = c(1,2,3)[unclass(iris.train$Class)])
  • 13. > iris.test <- read.table("iris_test.csv", sep=",", header=T) > iris.predict <- predict(iris.dtree, iris.test[1:4], type="class") > iris.predict 2 4 18 34 Iris-setosa Iris-setosa Iris-setosa Iris-setosa ... > iris.predict == iris.test$Class [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE [11] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [21] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE > sum(iris.predict == iris.test$Class) / length(iris.test$Class) [1] 0.9666667 > sum(iris.predict != iris.test$Class) / length(iris.test$Class) [1] 0.03333333
  • 14. • • • • rpart control=rpart.control(cp=.1) .1 • 10 • 3 2 3