# R language notes
# Set the working directory; the relative file names used in these notes are
# resolved against it.
setwd("E:/R work")
# Show the current working directory
getwd()
# Packages commonly used for data preprocessing (plyr, reshape2, lubridate,
# stringr, foreign) plus ggplot2, which is attached below with library().
# Only packages that are not installed yet are downloaded, so re-running the
# notes does not reinstall everything.
pkgs_needed <- c(
  "plyr", "reshape2", "lubridate", "stringr", "foreign", "ggplot2"
)
pkgs_absent <- setdiff(pkgs_needed, rownames(installed.packages()))
if (length(pkgs_absent) > 0) {
  install.packages(pkgs_absent)
}
library(MASS)
library(foreign)
library(stringr)
library(plyr)
library(reshape2)
library(ggplot2)
##### 1. Reading data in R #####
# Data set shipped with an attached package
data("diamonds")
diamonds
# First six rows
head(diamonds, n = 6L)
# Last six rows
tail(diamonds, n = 6L)
# Add-on packages can download financial data directly from open sources
# such as Yahoo Finance and Google Finance.
# Install quantmod only when it is missing.
if (!("quantmod" %in% rownames(installed.packages()))) {
  install.packages("quantmod")
}
library(quantmod) # attach the package
# Download Apple (AAPL) trading data from Yahoo Finance, from 2015-01-01
# until today. The result is used below under the name AAPL.
getSymbols("AAPL", from = "2015-01-01")
# Dimensions (rows, columns) of the downloaded series
dim(AAPL)
head(AAPL)
tail(AAPL)
# Candlestick (K-line) chart
chartSeries(AAPL, theme = chartTheme("black"))
# Fetch foreign-exchange data from Oanda
if (!("jsonlite" %in% rownames(installed.packages()))) {
  install.packages("jsonlite")
}
library(jsonlite)
# Oanda only serves about the last 180 days of history, so a fixed start
# date such as "2017-05-01" can no longer be honoured; request the most
# recent window instead. The result is used below under the name USDCNY.
getFX("USD/CNY", from = Sys.Date() - 179)
head(USDCNY)
tail(USDCNY)
chartSeries(USDCNY, theme = chartTheme("black"))
# read.table() reads local or remote tabular data; the related readers are
# read.csv, read.csv2, read.delim, read.delim2 and read.fwf.
help(topic = "read.table")
## read.table()
getwd()
# The file has column names and a leading record-number column: the header
# argument may be omitted (it should then resolve to TRUE).
rt <- read.table(file = "houses.data")
rt
rt1 <- read.table(file = "houses.data", header = TRUE)
rt1
# The file has column names but no record-number column: header must be given.
rt2 <- read.table(file = "houses2.data", header = TRUE)
rt2
# With header omitted (FALSE here) the names row is treated as a data row.
rt2 <- read.table(file = "houses2.data")
rt2
# The file has neither column names nor record numbers: header may be omitted
# (FALSE here).
rt3 <- read.table(file = "houses3.data")
rt3
rt3 <- read.table(
  file = "houses3.data",
  col.names = c("Price", "Floor", "Area", "Rooms", "Age", "Cent.heat")
)
rt3
# read.csv()
# Wrong encoding: the text comes in garbled and the row count can be off too.
dat <- read.csv("PM.csv")
# Correct encoding specified
dat1 <- read.csv("PM.csv", fileEncoding = "utf-8")
# Do not use the header line for names; assign new column names instead.
# NOTE(review): with header = FALSE an existing header line in PM.csv would be
# read as the first data row - confirm against the file.
colname <- c("id", "city", "index", "y", "x")
dat2 <- read.csv(
  "PM.csv",
  header = FALSE,
  col.names = colname, # spelled out; the original relied on partial matching
  fileEncoding = "utf-8"
)
# Reading a large file completely is slow; nrows limits how much is read.
dat3 <- read.csv("PM.csv", fileEncoding = "utf-8", nrows = -1) # -1 is default
dat4 <- read.csv("PM.csv", fileEncoding = "utf-8", nrows = 5) # first 5 rows
# Factor conversion
# Text columns kept as character strings
dat5 <- read.csv("PM.csv", stringsAsFactors = FALSE, fileEncoding = "utf-8")
str(dat5)
# Text columns converted to factors. Since R 4.0.0 the default is
# stringsAsFactors = FALSE, so it has to be requested explicitly.
dat6 <- read.csv("PM.csv", stringsAsFactors = TRUE, fileEncoding = "utf-8")
str(dat6)
# File encoding
# The session default is not UTF-8 (per the original note), so it must be set.
dat7 <- read.csv("PM.csv", fileEncoding = "utf-8")
# Per the original note the default encoding here is GBK, so nothing is set.
dat8 <- read.csv("PM-gbk.csv")
# A file whose last line has no trailing newline triggers the warning
# "incomplete final line".
x <- read.table("data1.txt", sep = ",")
x
person <- read.csv("data1.txt", header = FALSE, col.names = c("age", "height"))
person
## Reading structured data with scan()
# Body weights of 15 students
w <- scan("weight.data") # read as a numeric vector by default
w
w <- scan("weight.data", what = 0)
w
w <- scan("weight.data", what = "") # read as a character vector
w
w <- scan("weight.data", what = list("")) # read as a list
w
# Example: heights and weights of 100 students are stored in h_w.data, where
# columns 1, 3, 5, 7, 9 hold heights and columns 2, 4, 6, 8, 10 hold weights.
# Read them with scan() and convert the result to a data frame.
dat <- scan("h_w.data", what = list(height = 0, weight = 0))
df <- as.data.frame(dat)
# scan() can also read values typed at the console. A raw data line inside a
# script is not valid R (and, when pasted, scan() would keep consuming the
# following script lines as input), so the same values are supplied through
# the text argument instead.
names <- scan(text = "zhangsan lisi wangwu maliu", what = "")
names
## Reading other data formats
# Install foreign only when it is missing: reinstalling a package that is
# already attached in the session fails or demands a restart.
if (!("foreign" %in% rownames(installed.packages()))) {
  install.packages("foreign")
}
library(foreign)
# Read an SPSS file; without to.data.frame = TRUE a list is returned.
educ <- read.spss("educ_scores.sav", to.data.frame = TRUE)
educ <- read.xport("educ_scores.xpt") # SAS transport file
educ <- read.S("educ_scores") # S-PLUS file
educ <- read.dta("educ_scores.dta") # Stata file
# Excel data: export the sheet first, then read the exported file
educ <- read.delim("EDUC_SCORES.txt") # exported as a txt file
educ <- read.csv("educ_scores.csv") # exported as a csv file
# Read Excel files directly with the xlsx package
for (pkg in c("rJava", "xlsx")) {
  if (!(pkg %in% rownames(installed.packages()))) {
    install.packages(pkg)
  }
}
# Workaround for "unable to load rJava": point JAVA_HOME at your own 64-bit
# Java installation BEFORE attaching rJava/xlsx. The path below is
# machine-specific and must be adapted.
Sys.setenv(JAVA_HOME = "C:/Program Files/Java/jre1.8.0_77")
library(rJava)
library(xlsx)
# header defaults to TRUE; sheetIndex = 1 reads the first worksheet,
# alternatively select the worksheet by name with sheetName.
educ <- read.xlsx("educ_scores.xls", sheetIndex = 1)
educ <- read.xlsx("educ_scores.xls", sheetName = "educ_scores")
## Reading text data
news <- readLines(con = "news.txt", encoding = "UTF-8")
news <- readLines(con = "news.txt", n = 2, encoding = "UTF-8")
news
# scan() returning a list
line <- scan(file = "news.txt", what = list(""), encoding = "UTF-8")
line <- scan(file = "news.txt", what = list(""), n = 1, encoding = "UTF-8")
line
# scan() returning a vector
line <- scan(file = "news.txt", what = "", encoding = "UTF-8")
line <- scan(file = "news.txt", what = "", n = 1, encoding = "UTF-8")
line
## Writing structured data
write.table(x = educ, file = "educ_w.txt", append = TRUE)
write.csv(x = educ, file = "educ_w.csv")
## Writing text data
writeLines(text = line, con = "news_w.txt")
# Redirect console output to a file, print, then restore the console.
sink(file = "news_w1.txt")
cat(line)
sink()
# Read a comma-separated file straight from a URL
y <- read.table(
  file = "http://www.jaredlander.com/data/Tomato%20First.csv",
  header = TRUE,
  sep = ","
)
# Inspect the data set with head(), str() and summary()
head(y)
str(y)
summary(y)
getwd()
# Look at the data
data <- read.table(file = "salary.txt", header = TRUE)
data
mode(data)
class(data)
names(data)
colnames(data)
dim(data)
##### 2. Data management and transformation ######
## Combining data
# New record to append. It is built as a list rather than with c(): c() would
# coerce every value to character, and rbind() would then silently turn the
# numeric columns of the data frame into character columns as well.
a <- list("Hongkong", 1910, 75.0, 41.8)
data <- read.table("salary.txt", header = TRUE, stringsAsFactors = FALSE)
data1 <- rbind(data, a)
# Show rows 14 to 16 of the combined data
data1[14:16, ]
weight=c(150,135,210,140) #数值型向量
height=c(65,61,70,65)
gender=c("F","F",