【R】【课程笔记】01 R软件基础知识_r分位点是上分位点-CSDN博客

本文链接：https://blog.csdn.net/weixin_46285914/article/details/104473852

本文是课程《数据科学与金融计算》第1章的学习笔记，主要介绍R语言的数据类型、数据结构、运算、绘图等，用于知识点总结和代码练习，Q&A为问题及解决方案，参考书籍为《R软件及其在金融定量分析中的应用》。

往期回顾：

博文	内容
【R】【课程笔记】01 R软件基础知识	数据类型、数据结构、运算、绘图等
【R】【课程笔记】02+03 基于R软件的计算	聚类分析、因子分析、神经网络、支持向量机等
【R】【课程笔记】04+05 数据预处理+收益率计算	金融数据处理、收益率、R与C++等
【R】【课程笔记】06 金融波动模型	GARCH、SV、高频波动模型等
【R】【课程笔记】07 分位数回归与VaR（ES）计算	VaR、ES、极值模型等
【R】【课程笔记】08 金融投资组合决策分析	均值-方差模型、均值-VaR模型、均值-CVaR模型等

一、框架

图：R软件基础知识思维导图

二、代码

0. Initializing 初始化

初始化包括查看/变更工作环境，显示/清除对象，安装/导入包，存储文件/保存图片，检查数据，加载数据集等。
Q：无法使用save()命令存储文件，错误类型为：‘Permission denied’
A：问题在于存储位置没有写入权限（常见于C盘）。步骤一：在“属性”→“安全”→“高级”中修改权限；若发现无法修改权限，则进行步骤二：在“属性”→“共享”中设置。

（1）解线性方程组
函数solve(a,b)
eg：a=matrix(rnorm(16),4,4); b=c(1:4); solve(a,b)
（2）矩阵的特征值与特征向量
函数eigen( )
eg：a=diag(4)+1; a.e=eigen(a,symmetric=TRUE) #求出特征值（values）和特征向量（vectors）

# (1) Working directory and workspace objects
getwd()                          # show the current working directory
# setwd("")                      # change the working directory (fill in a path)

ls()                             # list the objects in the workspace
x <- 8                           # create an object by assignment
rm(x)                            # remove a single object
rm(list = ls())                  # remove every object in the workspace

# (2) Packages, saving and loading
# Note: this reinstalls the package on every run. The CRAN mirror can be
# changed in RStudio via Tools - Global Options - Packages.
install.packages("neuralnet")
library(neuralnet)               # attach the package
search()                         # show the current search path
detach("package:neuralnet")      # detach the package again

save.image()                     # save the whole workspace

# The workspace was emptied above, so the objects handed to save() have to
# be created first; otherwise save() stops with "object 'x' not found".
x <- 8
y <- 1:10
# In a Windows path copied from Explorer, write each separator as "\\"
# and append the file name.
save(x, y, file = "C:\\文件夹地址\\S.RData")
load(file = "C:\\文件夹地址\\S.RData")       # restore the saved objects

# (3) Saving plots to files
jpeg(file = "Saveplot1.jpeg")    # written to the working directory
plot(iris)
dev.off()                        # close the graphics device

postscript(file = "Saveplot2.eps")   # same plot in eps format
plot(iris)
dev.off()

# (4) Inspecting data, e.g. to check that an import went well
data(iris)
head(iris)                       # first rows
tail(iris)                       # last rows

E1 <- 3.1415926535
E2 <- 3.1415926535
(E1.r <- round(E1, digits = 4))  # outer parentheses print the assigned value
# options(digits = ) is a global setting and counts significant digits,
# not decimal places.
options(digits = 4); E2

# (5) Attaching a data set
df <- data.frame(name = c("ZhangSan", "XiaoHong", "LiSi", "XiaoLan"),
                 sex = c("M", "F", "M", "F"),
                 age = c(20, 21, 19, 20),
                 weight = c(110, 90, 128, 102))
attach(df)                       # columns become visible by their bare names
mean(age)
detach(df)                       # detach as soon as it is no longer needed

1. Basic types 数据类型

数据类型主要分为数值型（numeric）、逻辑型（logical）、字符型（character）和复数型（complex）。操作命令：查看数据类型（class）、长度（length）、打印字符串（cat）。

# Character strings
# Inside a double-quoted literal an embedded double quote is escaped as \".
z <- "Display \"a\" string "
# Inside a single-quoted literal double quotes can be written as they are.
w <- 'Display "b"  string '
cat(z)                          # write the string to the console
cat(w)
class(z)
class(w)

2. Basic structures 数据结构

数据结构主要有：向量（vector），矩阵（matrix），数组（array），列表（list），数据框（data.frame），因子（factor），表达式（expression）等。

# (1) Vector (one-dimensional)
x <- c(1, 3, 5, 7)
y <- 1:10
seq(from = 10, to = 1, by = -2)          # 10 8 6 4 2

rep(1, times = 3)                        # 1 1 1
rep(c(2, 4), times = 3)                  # 2 4 2 4 2 4
rep(c(2, 4), each = 3)                   # 2 2 2 4 4 4
rep(2:4, times = rep(2, times = 3))      # 2 2 3 3 4 4

# (2) Matrix (two-dimensional)
matrix(1:20, nrow = 5, ncol = 4)         # 5x4, filled column by column (default)
# Fill row by row instead and attach row and column names.
M <- matrix(1:20, nrow = 5, ncol = 4, byrow = TRUE,
            dimnames = list(paste0("r", 1:5), paste0("c", 1:4)))
M[2, 3]                                  # element in row 2, column 3
M[1:3, 3]                                # a single column comes back as a vector
M[1:3, 3, drop = FALSE]                  # drop = FALSE keeps the result a matrix
M[-1, ]                                  # everything except row 1
M[1:3, -c(1, 3)]                         # rows 1 to 3 without columns 1 and 3

diag(M)                                  # diagonal elements of M
diag(1:3)                                # diagonal matrix with 1, 2, 3 on the diagonal
diag(3)                                  # diagonal matrix with 1, 1, 1 on the diagonal

Tri <- matrix(1:9, nrow = 3, ncol = 3)   # 3x3 matrix
Tri[upper.tri(Tri)] <- 0                 # set the upper triangle to 0

# (3) Array
# A 4 x 4 x 2 array: two 4x4 slices along the third dimension, each slice
# filled column by column (slice 1 first).
Z <- array(1:32, dim = c(4, 4, 2))

Z[3, 2, 1]               # slice 1, row 3, column 2
Z[1:3, 2, 1]             # slice 1, rows 1 to 3 of column 2
Z[, , 2]                 # the whole second slice
dim(Z)                   # the dimensions
dim(Z) <- c(4, 2, 4)     # rearrange the same values as 4 x 2 x 4

# (4) List
# Create a list whose elements have different types and lengths.
Stu.Lt <- list(
  name   = "ZhangSan",
  stu.no = "20140224",
  age    = 21,
  grade  = c(90, 85, 96)
)

# Access: [[ ]] returns the element itself, [ ] returns a sub-list.
Stu.Lt[[2]]
class(Stu.Lt[[2]])               # "character": stu.no is stored as a string
Stu.Lt[2]
class(Stu.Lt[2])                 # "list"

Stu.Lt[[4]][1:2]                 # items 1 to 2 of the 4th element

Stu.Lt["name"]                   # elements can also be selected by name

# Refer to an element as list$name.
Stu.Lt$stu.no
names(Stu.Lt)                    # names of the list elements

# Modify the list.
# Add an element "Project" holding "programming" and "sport".
Stu.Lt$Project <- c("programming", "sport")
Stu.Lt$stu.no <- NULL            # delete an element, e.g. to drop sensitive data
Stu.Lt$name <- "LiSi"            # overwrite an element

unlist(Stu.Lt)                   # flatten the list

# (5) Data frame
# Create a data frame, one argument per column.
df <- data.frame(
  name   = c("ZhangSan", "XiaoHong", "LiSi", "XiaoLan"),
  sex    = c("M", "F", "M", "F"),
  age    = c(20, 21, 19, 20),
  weight = c(110, 90, 128, 102)
)

# Replace the row names.
rownames(df) <- c("one", "two", "three", "four")

# Extract elements.
df[["age"]]                      # the column itself (numeric), same as df$age
df[c("age", "weight")]           # selected columns, still a data frame
df[1:2, 3:4]                     # rows 1 to 2 of columns 3 to 4, a data frame

# (6) Factor
factor(1:4)
# levels lists the allowed values; values not listed become NA.
factor(1:4, levels = 1:2)
# labels assigns a label to each level.
factor(c(1:4, 4:1), labels = c("A", "B", "C", "D"))
# exclude turns the given value into NA.
factor(1:4, exclude = 2)

sex <- c("M", "F", "M", "F")
sexf <- factor(sex)              # convert the character vector to a factor

levels(sexf)                     # the factor levels: "F" "M"
table(sexf)                      # frequency of each level

# (7) Expression
e1 <- 12
e2 <- 3.5
f <- expression(sin(e1) + e2^2)      # stored unevaluated
eval(f)                              # evaluate it; the result is a number
D(f, "e1")                           # derivative of f with respect to e1

set.seed(12345)                      # fix the random seed
p1 <- rnorm(n = 100)                 # 100 draws, standard normal by default
p2 <- runif(n = 100)                 # 100 draws, uniform on [0, 1] by default
model.lm <- lm(p2 ~ p1 + I(p1^2))    # least-squares regression
summary(model.lm)                    # regression results

3. Object operation 对象运算

对象操作包括类型转换（Type conversion）、运算符（The operator）、矩阵运算（Operation function）、函数运算（Family of apply functions）。

# (1) Type conversion
M <- matrix(1:12, nrow = 3, ncol = 4)
is.matrix(M)                         # is M a matrix?
M.vec <- as.vector(M)                # matrix to vector
M.frame <- as.data.frame(M)          # matrix to data frame

ff1 <- factor(c(9, 5, 6))            # levels: 5 6 9
# Converting a factor directly gives the level codes: 3 1 2.
as.numeric(ff1)
# Going through character first recovers the values: 9 5 6.
as.numeric(as.character(ff1))

ff2 <- factor(c("M", "F"))           # levels: F M (alphabetical)
as.numeric(ff2)                      # level codes: 2 1

# (2) Operators
# Arithmetic
x <- c(13, 14) %% 9                  # %% is the remainder (modulo) operator

y <- matrix(1:9, ncol = 3)
z <- matrix(2:10, ncol = 3)
y * z                                # element-wise product
y %*% z                              # matrix product

# Comparison
M <- c("A", "B")
m <- c("a", "b")
M == m                               # element-wise test for equality
identical(M, m)                      # whole-object test: TRUE only if all agree
all.equal(M, m)                      # describes the differences: "2 string mismatches"
all.equal(y, z)                      # for matrices: "Mean relative difference: 0.2"

# (3) Operation functions
# Functions on vectors
z <- c(1, 5, 4, 7, 9)
sum(z)
mean(z)
sd(z)
median(z)
sort(z)                              # sort in increasing order
append(z, values = 2:3, after = 2)   # insert 2 and 3 after the second element

# Functions on matrices
A <- matrix(1:9, ncol = 3)
t(A)                                 # transpose; aperm(A) is the generalised form
dim(A)
nrow(A)                              # number of rows
ncol(A)                              # number of columns

(B <- matrix(1:8, ncol = 4))         # 2x4 matrix, printed
(C <- matrix(8:1, ncol = 4))         # 2x4 matrix, printed
rbind(B, C)                          # stack by rows: 4x4 matrix
cbind(B, C)                          # bind by columns: 2x8 matrix

# (4) Family of apply functions
# apply: apply(X, MARGIN, FUN, ...)
x <- runif(10, -1, 1)            # 10 uniform draws on [-1, 1]
y <- rnorm(10, 0.5, 1)           # 10 normal draws, mean 0.5, sd 1
xy <- cbind(x, y)

# MARGIN = 1 calls FUN on each row, MARGIN = 2 on each column.
apply(xy, 1, sum)                # row sums
apply(xy, 2, mean)               # column means

# tapply: statistics by group
# The factor is A B C D A B C D A B C D A B with levels A B C D E.
t1 <- factor(rep(1:4, length = 14), levels = 1:5,
             labels = c("A", "B", "C", "D", "E"))
t2 <- c(1:14)
tapply(t2, t1, sum)                     # sum (not count) of t2 within each level
tapply(t2, t1, sum, simplify = FALSE)   # TRUE gives an array, FALSE a list

# lapply: lapply(list, function) calls the function on every list element
L1 <- list(a = 1:20, b = runif(30, -2, 5), d = matrix(c(1:10)))
lapply(L1, quantile)             # quantiles of each element

# sapply: a variant of lapply that can simplify the result.
# The argument is spelled USE.NAMES; R argument names are case-sensitive, so
# the lower-case `use.names` was not matched by sapply() and was instead
# forwarded to quantile() through `...`.
sapply(L1, quantile, simplify = FALSE, USE.NAMES = FALSE)   # returns a list
sapply(L1, quantile, simplify = TRUE, USE.NAMES = FALSE)    # returns a matrix

4. Plotting 绘图

plot（）函数可以根据数据绘制出散点图、折线图、柱状图、箱线图等。
pairs（）函数只能表示两者关系，coplot（）能够说明三或四变量关系。

# (1) plot() demo: the kind of plot depends on what is passed in
set.seed(12345)
x <- sample(1:100, size = 100)       # 1 to 100 in random order
y <- sample(1:100, size = 100)
xt <- ts(x)                          # time series: start = 1, end = 100, frequency = 1
xy <- cbind(x, y)
f <- as.factor(rep(c("A", "B", "C"), times = c(20, 30, 50)))

# 2x3 grid of panels (mfrow); margins are bottom/left/top/right (mar).
par(mfrow = c(2, 3), mar = c(5, 4, 3, 2))
plot(x)                              # scatter plot of x against its index
plot(xt)                             # time series plot
plot(xy)                             # x-y scatter plot
plot(x, y)                           # x-y scatter plot
plot(f)                              # bar chart of A, B, C
plot(f, y)                           # box plots by A, B, C (A: 20, B: 30, C: 50)

图4.1 plot（）函数绘图

# (2) pairs() and coplot() demo
# "morley" is the speed-of-light data set (Expt, Run, Speed).
data(morley)
pairs(morley)                        # the three variables pairwise: 6 panels

# "CO2" data set (Plant, Type, Treatment: chilled or not, conc: concentration,
# uptake: CO2 uptake).
# For each of the 12 plants, show how uptake changes with concentration.
# This gives 12 panels; type = "b" draws both points and lines.
coplot(
  uptake ~ conc | Plant,
  data = CO2,
  show.given = FALSE,
  type = "b"
)

图4.2.1 pairs（）函数绘图
图4.2.2 coplot（）函数绘图

5. Function 自定义函数

自定义绘图函数，进行描述性统计和绘图。

# (1) define a function
# stat(): descriptive statistics and a Jarque-Bera normality test for a
# numeric vector, optionally with a histogram and a normal Q-Q plot.
#
# Arguments:
#   x        a numeric vector; anything else stops with an error.
#   plot.it  if TRUE, draw hist(x) and qqnorm(x)/qqline(x) side by side.
# Value:
#   a list with two numeric vectors, both rounded to 3 digits:
#     stats  mean, median, maximum, minimum, skew, kurt
#     test   the Jarque-Bera statistic and its p-value
# Needs the 'moments' package for skewness(), kurtosis() and jarque.test().
stat <- function(x, plot.it = TRUE) {
  if (!is.vector(x)) stop('argument x is not a vector, please check it.')
  if (!is.numeric(x)) stop('argument x is not a numeric, please check it.')

  # Results go straight into the vector, so no local variable shadows
  # base::mean, stats::median or stats::sd.
  # NOTE(review): the original also computed sd(x) but never reported it;
  # add `sd = sd(x)` here if the standard deviation is wanted in the output.
  stats <- c(mean = mean(x), median = median(x),
             maximum = max(x), minimum = min(x),
             skew = moments::skewness(x),
             kurt = moments::kurtosis(x))

  jbtst <- moments::jarque.test(x)                       # J-B normality test
  test <- c(JB = jbtst$statistic, p.value = jbtst$p.value)

  if (plot.it) {
    # Put the caller's layout back on exit so that the 1x2 grid does not
    # leak into plots drawn after stat() returns.
    old.par <- par(mfrow = c(1, 2))
    on.exit(par(old.par), add = TRUE)
    hist(x)
    qqnorm(x)
    qqline(x)
  }

  list(stats = round(stats, digits = 3), test = round(test, digits = 3))
}

# (2) read data
library(moments)
library(quantmod)
# NOTE(review): RODBC is attached here and detached below, but no RODBC
# function is called in between.
library(RODBC)

getSymbols("^SSEC")                  # download the SSEC index
head(SSEC, 3)
tail(SSEC, 3)

# Register each ticker under a local alias, then download it from 2019 on.
# The downloaded series are referred to below as SSEC_SHJC, SSEC_SCGF and
# SSEC_ZGGM.
setSymbolLookup(
  SSEC_shjc = list(name = "600009.SS", src = "yahoo")
)
getSymbols("SSEC_shjc", from = "2019-01-01")
tail(SSEC_SHJC, 3)

setSymbolLookup(
  SSEC_scgf = list(name = "600008.SS", src = "yahoo")
)
getSymbols("SSEC_scgf", from = "2019-01-01")

setSymbolLookup(
  SSEC_zggm = list(name = "600007.SS", src = "yahoo")
)
getSymbols("SSEC_zggm", from = "2019-01-01")
detach(package:RODBC)

# (3) compute descriptive statistics and check the normality of the data
# Each call passes the second column of the downloaded series as a plain vector.
SSEC_shjc_stat <- stat(x = as.vector(SSEC_SHJC[, 2]), plot.it = TRUE)
SSEC_scgf_stat <- stat(x = as.vector(SSEC_SCGF[, 2]), plot.it = TRUE)
SSEC_zggm_stat <- stat(x = as.vector(SSEC_ZGGM[, 2]), plot.it = TRUE)

# One row per stock; the row names are given directly in rbind().
stats <- rbind(
  shjc = SSEC_shjc_stat$stats,
  scgf = SSEC_scgf_stat$stats,
  zggm = SSEC_zggm_stat$stats
)
print(stats)
tests <- rbind(
  shjc = SSEC_shjc_stat$test,
  scgf = SSEC_scgf_stat$test,
  zggm = SSEC_zggm_stat$test
)
print(tests)

6. 案例金融数据的绘图

# (1) reading SSEC index
library(quantmod)
# NOTE(review): RODBC is attached here and detached below, but no RODBC
# function is called in between.
library(RODBC)

getSymbols("^SSEC")                  # downloaded from Yahoo Finance by default
head(SSEC, 3)
tail(SSEC, 3)

# Register each ticker under a local alias, then download it from 2019 on.
# The downloaded series are referred to below as SSEC_SHJC, SSEC_SCGF and
# SSEC_ZGGM.
setSymbolLookup(
  SSEC_shjc = list(name = "600009.SS", src = "yahoo")
)
getSymbols("SSEC_shjc", from = "2019-01-01")
tail(SSEC_SHJC, 3)

setSymbolLookup(
  SSEC_scgf = list(name = "600008.SS", src = "yahoo")
)
getSymbols("SSEC_scgf", from = "2019-01-01")

setSymbolLookup(
  SSEC_zggm = list(name = "600007.SS", src = "yahoo")
)
getSymbols("SSEC_zggm", from = "2019-01-01")
detach(package:RODBC)

# (2) compute return series
# Drop missing closes, then take log returns in percent.
Close.ptd.SSEC <- na.omit(SSEC$SSEC.Close)
Close.rtd.SSEC <- diff(log(Close.ptd.SSEC)) * 100

# (3) Close price of Sample stocks in SSEC
# The column names start with digits, hence the backticks.
Close.ptd.shjc <- SSEC_SHJC$`600009.SS.Close`
Close.ptd.scgf <- SSEC_SCGF$`600008.SS.Close`
Close.ptd.zggm <- SSEC_ZGGM$`600007.SS.Close`

# (4) Drawing SSEC Close Price Series, Scatter, ACF and PACF Figure
par(mfrow = c(2, 2))

# Convert the closes to a ts object with 241 observations per year.
# The start year is read from the first observation instead of being
# hard-coded as 2006, so the time axis matches whatever window was downloaded.
# NOTE(review): getSymbols() presumably starts at 2007-01-01 by default, in
# which case the hard-coded 2006 shifted the axis by one year — confirm.
Close.ptd.SSEC.ts <- ts(
  Close.ptd.SSEC,
  start = as.numeric(format(index(Close.ptd.SSEC)[1], "%Y")),
  frequency = 241
)
plot(Close.ptd.SSEC.ts, type = "l", main = "(a) SSEC Close Price Series",
     xlab = "Date", ylab = "Price", cex.main = 0.95, las = 1)

# Scatter plot of the first 20 trading days of the three sample stocks.
plot(Close.ptd.shjc[1:20], type = "p", pch = 17,
     main = "(b) Close Price of Sample stocks",
     xlab = "Time", ylab = "Price", cex.main = 0.95, ylim = c(0, 60))
points(Close.ptd.scgf[1:20], pch = 15)
points(Close.ptd.zggm[1:20], pch = 14)
legend("center", legend = c("SHJC_600009", "SCGF_600008", "ZGGM_600007"),
       pch = c(17, 15, 14), cex = 0.7)

# The ACF and PACF are drawn one per figure from here on.
par(mfrow = c(1, 1))
Close.rtd.SSEC <- na.omit(Close.rtd.SSEC)                    # drop the leading NA
acf(Close.rtd.SSEC, main = '', xlab = 'Lag', ylab = 'ACF', las = 1)    # autocorrelation, for order selection
title(main = '(c) SSEC ACF Test', cex.main = 0.95)
pacf(Close.rtd.SSEC, main = '', xlab = 'Lag', ylab = 'PACF', las = 1)  # partial autocorrelation
title(main = '(d) SSEC PACF Test', cex.main = 0.95)

图5.4.1 ACF test
图5.4.2 PACF Test

# (5) Drawing  Q-Q, ecdf, density and hist figure
# Four panels that compare the SSEC returns with a normal distribution.

par(mfrow = c(2, 2), mar = c(5, 4, 3, 2))

# (a) normal Q-Q plot of all returns
qqnorm(Close.rtd.SSEC, main = "(a) normal QQ of SSEC ",
       cex.main = 0.95, xlab = 'Theoretical quantile', ylab = 'Sample quantile')
qqline(Close.rtd.SSEC)

# (b) empirical distribution function (a step function) of the first 10 returns
ECD.SSEC <- ecdf(Close.rtd.SSEC[1:10])
plot(ECD.SSEC, lwd = 2, main = "(b) ecdf of SSEC[10]", cex.main = 0.95, las = 1)
# length.out is spelled out instead of relying on partial matching of `length`.
xx <- unique(sort(c(seq(-3, 2, length.out = 24), knots(ECD.SSEC))))
lines(xx, ECD.SSEC(xx))
abline(v = knots(ECD.SSEC), lty = 2, col = 'gray70')         # dashed lines at the jumps
x1 <- c((-4):3)                                              # normal CDF with the sample mean and sd
lines(x1, pnorm(x1, mean(Close.rtd.SSEC[1:10]), sd(Close.rtd.SSEC[1:10])))

# (c) kernel density of all returns
D <- density(Close.rtd.SSEC)
plot(D, main = "(c) Distribution of SSEC ", xlab = "earning", ylab = 'density',
     xlim = c(-7, 7), ylim = c(0, 0.5), cex.main = 0.95)
polygon(D, col = "gray", border = "black")
# The reference curve uses the sample mean and sd, as panels (b) and (d) do;
# a bare dnorm would draw N(0, 1) regardless of the scale of the returns.
curve(dnorm(x, mean = mean(Close.rtd.SSEC), sd = sd(Close.rtd.SSEC)),
      lty = 2, add = TRUE)
abline(v = 0, lty = 3)                                       # dotted line at zero
# One line type per legend label; the original passed a third lty value
# that had no label to go with.
legend("topright", legend = c("kernel density", "normal density"),
       lty = c(1, 2), cex = 0.7)

# (d) histogram of the first 100 returns with a normal density curve
hist(Close.rtd.SSEC[1:100], xaxt = 'n', main = '(d) Histogram of SSEC[100]',
     xlab = 'earning/100', ylab = 'density', freq = FALSE, cex.main = 0.95, las = 1)
x2 <- c(-6:4)
lines(x2, dnorm(x2, mean(Close.rtd.SSEC[1:100]), sd(Close.rtd.SSEC[1:100])))
# The x axis was suppressed above; draw it with the tick values divided by 100.
axis(1, at = axTicks(1), labels = as.integer(axTicks(1)) / 100)