Data Mining with R (code + notes) Chapter 1 --- R中关于DM的数据结构，以及一些简单的命令...-CSDN博客

Data Mining with R (code + notes) Chapter 1 --- R中关于DM的数据结构，以及一些简单的命令

http://www.liaad.up.pt/~ltorgo/DataMiningWithR/

###################################################
### How to Read this Book?
###################################################

# Show the details of the running R build (platform, version number, ...).
R.version

###################################################
### Starting with R
###################################################
# Download and install the companion package of the book.
# NOTE(review): DMwR may no longer be hosted on CRAN -- confirm before running.
install.packages('DMwR')

# Matrix describing every package installed on this machine.
installed.packages()

# With no arguments, library() lists the packages available for loading.
library()

# Installed packages for which the repositories hold a newer version.
old.packages()

# Bring the outdated packages up to date.
update.packages()

# Search the R site (help pages, vignettes, ...) for a phrase.
RSiteSearch('neural networks')

###################################################
### R Objects
### Assignment operator: <-
### List the existing objects: ls() ; remove objects: rm()
###################################################
x <- 945

# Typing an object's name prints its current value; a second assignment
# simply overwrites the first.
y <- 39
y
y <- 43
y

# The right-hand side of an assignment may be any expression.
z <- 5
w <- z^2
w
i <- (z*2 + 45)/2
i

# An expression that is not assigned is evaluated and printed.
(34 + 90)/12.5

# Show the objects in the workspace, then delete the ones created above
# (x is left in place).
ls()
rm(y)
rm(z,w,i)

###################################################
### Vectors
### Built from individual values with c(...)
### Inspect the type with mode() and the length with length()
### A vector may contain NA (missing value)
### All elements of a vector share one type, but that type can change
### Vectors are very flexible: both length and type may change, and you can
### start from an empty vector() and add elements to it later
###################################################
v <- c(4,7,23.5,76.2,80)
v
length(v)
mode(v)

# Mixing in a string coerces every element to character.
v <- c(4,7,23.5,76.2,80,"rrt")
v

u <- c(4,6,NA,2)
u
# T and F are shorthand variables for TRUE and FALSE; they can be reassigned,
# so prefer the full names outside of quick interactive use.
k <- c(T,F,NA,TRUE)
k

# Single element by position (indices start at 1).
v[2]

v[1] <- 'hello'
v

# Start from an empty vector and grow it by assigning to positions.
x <- vector()

# Assigning past the end extends the vector; skipped positions become NA.
x[3] <- 45
x

length(x)
# Reading past the end does not fail, it returns NA.
x[10]
x[5] <- 4
x

# A vector can be rebuilt from a selection of its own elements.
v <- c(45,243,78,343,445,44,56,77)
v
v <- c(v[5],v[7])
v

###################################################
### Vectorization
### Vector operations: functions and operators work element by element
###################################################
# sqrt() is applied to every element of v.
v <- c(4,7,23.5,76.2,80)
x <- sqrt(v)
x

# Element-wise addition of two vectors of equal length.
v1 <- c(4,6,87)
v2 <- c(34,32.4,12)
v1+v2

# Recycling: the shorter vector (length 2) is repeated to match the longer
# one (length 4), so this adds c(10,2,10,2).
v1 <- c(4,6,8,24)
v2 <- c(10,2)
v1+v2

# A single number is recycled the same way: every element is doubled.
v1 <- c(4,6,8,24)
2*v1

###################################################
### Factors
### A vector can be turned into a factor, i.e. restricted to a fixed list of
### valid values (levels); assigning a value outside that list to an element
### yields <NA>
### table(): counts of each value taken by the vector
###
###################################################
g <- c('f','m','m','m','f','m','f','m','f','f')
g

# Levels are inferred from the distinct values present ('f' and 'm').
g <- factor(g)
g

# Levels can be declared explicitly, including ones that do not occur in the
# data ('f' has a count of zero below).
other.g <- factor(c('m','m','m','m','m'),levels=c('f','m'))
other.g

table(g)
table(other.g)

a <- factor(c('adult','adult','juvenile','juvenile','adult','adult',
'adult','juvenile','adult','juvenile'))

# Cross-tabulation of the two factors (rows: a, columns: g).
# NOTE(review): the name `t` shadows base::t() (matrix transpose) as a
# variable; a more descriptive name would be safer in real code.
t <- table(a,g)
# Marginal totals over dimension 1 (levels of a) and dimension 2 (levels of g).
margin.table(t,1)
margin.table(t,2)

# Proportions within each row, within each column, and of the grand total.
prop.table(t,1)
prop.table(t,2)
prop.table(t)

###################################################
### Generating sequences
### Often the elements of a vector are not arbitrary but follow a regular
### pattern (a sequence).
### Tools: from:to, seq(from, to, by) or seq(from, to, length.out = n),
###        rep(value, times), gl(n, k) for factor levels
### Random vectors following a distribution: rnorm(), rt(), ...
###################################################
x <- 1:1000

# ":" binds tighter than arithmetic: the first line is (10:15) - 1,
# the second is 10:14.
10:15-1
10:(15-1)

# A decreasing sequence is produced when from > to.
5:0

# Real-valued steps need seq().
seq(-4, 1, 0.5)

# The argument is spelled out as length.out rather than relying on partial
# matching of `length`.
seq(from = 1, to = 5, length.out = 4)
seq(from = 1, to = 5, length.out = 2)
seq(length.out = 10, from = -2, by = 0.2)

# rep() repeats a value or a whole vector; `each` repeats element by element.
rep(5, 10)
rep("hi", 3)
rep(1:2, 3)
rep(1:2, each = 3)

# gl(n, k) builds a factor with n levels, each repeated k times.
gl(3, 5)
gl(2, 5, labels = c("female", "male"))

# Random draws: the output differs on every run unless set.seed() is used.
rnorm(10)

rnorm(4, mean = 10, sd = 3)

rt(5, df = 10)

###################################################
### Indexing
### Vector elements are accessed through an index, which may be any sequence
### of positions or a logical condition
### Elements can also be given names and then be selected by name
###################################################
x <- c(0,-3,4,-1,45,90,-5)
# A comparison returns a logical vector of the same length as x ...
x > 0

# ... which can be used as an index to keep only the TRUE positions.
x[x>0]

# Conditions combine element-wise with | (or) and & (and).
x[x <= -2 | x > 5]
x[x > 40 & x < 100]

# Integer indices select by position; the index may itself be a variable.
x[c(4,6)]
x[1:3]
y <- c(1,4)
x[y]

# Negative indices exclude the given positions.
x[-1]
x[-c(4,6)]
x[-(1:3)]

# Names can be attached after creation ...
pH <- c(4.5,7,7.3,8.2,6.3)
names(pH) <- c('area1','area2','mud','dam','middle')
pH

# ... or directly when the vector is built.
pH <- c(area1=4.5,area2=7,mud=7.3,dam=8.2,middle=6.3)

# Select elements by name.
pH['mud']
pH[c('area1','dam')]

###################################################
### Matrices and Arrays
### Building a matrix:
###   give the number of rows and columns; values can be supplied up front
###   or assigned later (e.g. in a loop)
### Accessing elements: M[row, col]; a whole column: M[, col];
###   a whole row: M[row, ]
### Joining matrices: rbind() stacks by rows, cbind() joins by columns
###################################################
# Giving a plain vector a dim attribute turns it into a 2 x 5 matrix,
# filled column by column.
m <- c(45, 23, 66, 77, 33, 44, 56, 12, 78, 23)
m
dim(m) <- c(2, 5)
m

# Same result in a single call.
m <- matrix(c(45, 23, 66, 77, 33, 44, 56, 12, 78, 23), 2, 5)

# byrow = TRUE fills row by row instead.
m <- matrix(c(45, 23, 66, 77, 33, 44, 56, 12, 78, 23), 2, 5, byrow = TRUE)
m

m[2, 3]

# Negative indices exclude rows/columns, as with vectors.
m[-2, 1]
m[1, -c(3, 5)]

# An empty index selects the whole row or column; the result is a vector.
m[1, ]
m[, 4]

# drop = FALSE keeps the result as a (1-row or 1-column) matrix.
m[1, , drop = FALSE]
m[, 4, drop = FALSE]

m1 <- matrix(c(45, 23, 66, 77, 33, 44, 56, 12, 78, 23), 2, 5)
m1
# cbind() joins its arguments as columns, rbind() as rows.
cbind(c(4, 76), m1[, 4])
m2 <- matrix(rep(10, 20), 4, 5)
m2
m3 <- rbind(m1[1, ], m2[3, ])
m3

# Rows and columns can be named and then indexed by name.
results <- matrix(c(10, 30, 40, 50, 43, 56, 21, 30), 2, 4, byrow = TRUE)
colnames(results) <- c("1qrt", "2qrt", "3qrt", "4qrt")
rownames(results) <- c("store1", "store2")
results
results["store1", ]
results["store2", c("1qrt", "4qrt")]

# Arrays generalise matrices to more than two dimensions (here 4 x 3 x 2).
a <- array(1:24, dim = c(4, 3, 2))
a

a[1, 3, 2]
a[1, , 2]
a[4, 3, ]
a[c(2, 3), , -2]

# Arithmetic on matrices is element-wise, like on vectors.
m <- matrix(c(45, 23, 66, 77, 33, 44, 56, 12, 78, 23), 2, 5)
m
m * 3
m1 <- matrix(c(45, 23, 66, 77, 33, 44), 2, 3)
m1
m2 <- matrix(c(12, 65, 32, 7, 4, 78), 2, 3)
m2
m1 + m2

###################################################
### Lists
### A list is a data structure whose elements may be objects of different types
### my.lst[index] returns a sub-list
### To extract a single object from the list use my.lst[[index]], with a
### single index
### The most common form is $name, e.g. my.lst$stud.id
###################################################
my.lst <- list(stud.id=34453,
stud.name="John",
stud.marks=c(14.3,12,15,19))

my.lst

# [[ ]] extracts the component itself.
my.lst[[1]]
my.lst[[3]]

# [ ] returns a list containing the selected component(s).
my.lst[1]

# The difference shows in the mode: "list" versus "numeric".
mode(my.lst[1])
mode(my.lst[[1]])

my.lst$stud.id

# Component names can be read and replaced as a whole.
names(my.lst)
names(my.lst) <- c('id','name','marks')
my.lst

# Assigning to a new name appends a component (the list now has 4).
my.lst$parents.names <- c("Ana","Mike")
my.lst

length(my.lst)

# NOTE(review): the list has only 4 components here, so excluding position 5
# removes nothing; use -4 if the intent is to drop parents.names -- confirm.
my.lst <- my.lst[-5]

# c() concatenates lists.
other <- list(age=19,sex='male')
lst <- c(my.lst,other)
lst

# Flatten to a vector; because one component is a string, every value is
# coerced to character.
unlist(my.lst)

###################################################
### Data Frames
### What it is: similar to a matrix, but the columns are named (and may hold
### different types).
###################################################
my.dataset <- data.frame(
  site = c("A", "B", "A", "A", "B"),
  season = c("Winter", "Summer", "Summer", "Spring", "Fall"),
  pH = c(7.4, 6.3, 8.6, 7.2, 8.9)
)

# Matrix-style indexing by row and column.
my.dataset[3, 2]

# A column extracted as a vector.
my.dataset$pH

# Rows chosen with a logical condition, optionally restricted to some columns.
my.dataset[my.dataset$pH > 7, ]
my.dataset[my.dataset$site == "A", "pH"]
my.dataset[my.dataset$season == "Summer", c("site", "pH")]

# with() evaluates an expression using the data frame's own columns. It is
# used here instead of attach(): attach() stays on the search path until
# detach() is called, and a same-named object in the workspace (such as the
# pH vector created in the Indexing section) would mask the attached column
# and silently select the wrong rows.
with(my.dataset, my.dataset[pH > 8, ])
with(my.dataset, season)

# subset() also looks column names up inside the data frame; the third
# argument selects columns (here the range from season to pH).
subset(my.dataset, pH > 8)
subset(my.dataset, season == "Summer", season:pH)

# Add 1 to the pH of every Summer row.
is.summer <- my.dataset$season == "Summer"
my.dataset[is.summer, "pH"] <- my.dataset[is.summer, "pH"] + 1
rm(is.summer)

# Assigning to a new column name appends a column.
my.dataset$NO3 <- c(234.5, 256.6, 654.1, 356.7, 776.4)
my.dataset

nrow(my.dataset)
ncol(my.dataset)

# Column names can be replaced all at once ...
names(my.dataset)
names(my.dataset) <- c("area", "season", "pH", "NO3")
my.dataset

# ... or one at a time.
names(my.dataset)[4] <- "PO4"
my.dataset

###################################################
### Creating New Functions
### function(<list of parameters>) { <list of R instructions> }
###################################################

# Standard error of the mean of x: the square root of the sample variance
# divided by the number of observations.
se <- function(x) {
  sqrt(var(x) / length(x))
}

se(c(45, 2, 3, 5, 76, 2, 4))

# Summary statistics of a numeric vector, computed on its non-missing values.
#
# Arguments:
#   x     numeric vector, possibly containing NAs.
#   more  if TRUE, skewness and excess kurtosis are reported as well.
#
# Returns a named numeric vector with
#   n     length of x (NAs included),
#   nNAs  number of NAs in x,
#   mean, std, med   mean, standard deviation and median of the non-NA values,
#   skew, kurt       (only when more is TRUE) mean of the cubed / fourth-power
#                    standardised values; 3 is subtracted from the latter.
basic.stats <- function(x, more = FALSE) {
  clean.x <- x[!is.na(x)]
  n.clean <- length(clean.x)

  stats <- list(
    n = length(x),
    nNAs = length(x) - n.clean,
    mean = mean(clean.x),
    std = sd(clean.x),
    med = median(clean.x)
  )

  if (more) {
    # Standardise once and reuse for both moments.
    z <- (clean.x - stats$mean) / stats$std
    stats$skew <- sum(z^3) / n.clean
    stats$kurt <- sum(z^4) / n.clean - 3
  }

  # Flatten the list into a named numeric vector.
  unlist(stats)
}

basic.stats(c(45, 2, 4, 46, 43, 65, NA, 6, -213, -3, -45))
basic.stats(c(45, 2, 4, 46, 43, 65, NA, 6, -213, -3, -45), more = TRUE)

# Print the multiplication table of x for the multipliers 1 to 10,
# one "x * i = result" line per multiplier.
f <- function(x) {
  for (multiplier in seq_len(10)) {
    product <- x * multiplier
    cat(x, "*", multiplier, "=", product, "\n")
  }
}

附上 Some useful functions

Reading and writing data：

read.table(file): Reads a table from a file and creates a data
frame from the contents of this file, where each
row corresponds to a line of the file and each
column corresponds to a field in the file.

write.table(obj,file)： Converts obj into a data frame, and writes the
result to file.

Some basic statistics:

sum(x) Sum of the elements of x.
max(x) Largest value of the elements in x.
min(x) Smallest value of the elements in x.
which.max(x) The index of the largest value in x.
which.min(x) The index of the smallest value in x.

range(x) The range of values in x (has the same result as c(min(x),max(x))).
length(x) The number of elements of x.

mean(x) The mean value of the elements of x.
median(x) The median value of the elements of x.

sd(x) The standard deviation of the elements of x.
var(x) The variance of the elements of x.

quantile(x) The quantiles of x.
scale(x) Standardizes the elements of x, i.e. subtracts the mean and divides by the standard deviation. Results in a vector with zero mean and
unit standard deviation. Also works with dataframes (column-wise and only with numeric data!).

Some vector and mathematical functions
sort(x) Sort the elements of x.
rev(x) Reverse the order of the elements of x.
rank(x) Ranks of the elements of x.
log(x,base) The logarithms of all elements of x in base `base` (natural logarithm if base is omitted).
exp(x) The exponentials of the elements of x.
sqrt(x) The square roots of the elements of x.
abs(x) The absolute value of the elements of x.
round(x,n) Rounds all elements of x to n decimal places.
cumsum(x) Returns a vector where the ith element is the sum from x[1] to x[i].
cumprod(x) The same for the product.
match(x,s) Returns a vector with the same length as x, giving for each element of x the position of its first match in s. The elements that do not belong to s have the value NA.
union(x,y) Returns a vector with the union of vectors x and y.
intersect(x,y) Returns a vector with the intersection of vectors x and y.
setdiff(x,y) Returns a vector resulting from removing the elements of y from x.
is.element(x,y) Return TRUE if x is contained in vector y.
choose(n,k) Calculates the number of combinations of n elements taken k at a time.

Matrix algebra
diag(x,nrow,ncol) Builds a diagonal matrix with nrow rows and ncol columns, with the number x. Can also
be used to extract or replace the diagonal elements of a matrix (see Help).
t(x) The transpose of x.
nrow(x) Number of rows of x.
ncol(x) The number of columns of x.
A %*% B Matrix multiplication of A by B.
solve(A,b) Solve the system of linear equations Ax = b. With a single matrix argument (e.g.
solve(A)) it calculates the inverse of matrix A.
svd(x) Singular value decomposition of matrix x.
qr(x) QR decomposition of matrix x.
eigen(x) Eigen values and vectors of the square matrix x.
det(x) The determinant of matrix x.

Meta-functions
apply(x,margin,fun) Applies the function fun to all rows or columns of matrix x. If the parameter margin is 1 then the function is applied to each row, if it is 2 it is applied to each column.
sapply(x,fun) Applies the function fun to all elements of vector x.
lapply(x,fun) The same as previous but the function is applied to all elements of a list x.
aggregate(x,by,fun) Applies a summarization function fun to all subsets of rows of the data frame x. The subsets are formed by using all combinations of the factors in the list by.
by(x,by,fun) Applies a function fun to all subsets of rows of the data frame x. The subsets are formed
by using all combinations of the factors in the list by.

最后：

执行写好的code文件：先存储code到 filename.R，然后source('filename.R')

保存数据：save(objectname1, objectname2, ..., file='filename.RData')

导入数据：load('filename.RData')