MEANBOOT
n <- 10^4  # number of bootstrap replicates
y <- c(4.313, 4.513, 5.489, 4.265,
       3.641, 5.106, 8.006, 5.087)  # original sample
mvec <- rep(0, n)  # preallocate a vector to hold the bootstrap means
for(i in 1:n)      # resample and average, n times
{
ystar <- sample(y,replace = TRUE)
mvec[i] <- mean(ystar)
}
mean(mvec)  # overall bootstrap mean
quantile(mvec,
         probs = c(0.025, 0.975), names = TRUE)  # 95% bootstrap percentile interval
hist(mvec, breaks = 40, col = "grey",
     freq = FALSE, main = "Bootstrap distribution of the mean")  # histogram
REGPRIN
# load the data
df <- read.csv('data.csv')
mt <- as.matrix(df)
# prepare the response and the design matrix
y <- mt[,1]
X <- mt[,2:3]
X <- cbind(1, X)  # prepend the intercept column
# compute the OLS estimate by hand: beta = (X'X)^(-1) X'y
XTX <- t(X) %*% X
#XTX <- crossprod(X)  # equivalent and slightly faster
i_XTX <- solve(XTX)
beta <- i_XTX %*% t(X) %*% y
#beta <- i_XTX %*% crossprod(X, y)  # equivalent
# In practice, regression in R is a single call to lm()
fit <- lm(y ~ x1 + x2, data=df)
summary(fit)
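Explicitly inverting X'X is numerically fragile when the design is ill-conditioned; a minimal sketch of the usual alternatives, assuming X and y as built above:
beta_qr <- qr.solve(X, y)  # QR-based least squares, no explicit inverse
beta_ne <- solve(crossprod(X), crossprod(X, y))  # solve the normal equations directly
beta_lm <- lm.fit(X, y)$coefficients  # the workhorse underneath lm()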
COR
x <- -1000:1000  # a grid of x values, symmetric about zero
y <- 2*x^2       # y is an exact (nonlinear) function of x
mean(x)          # mean of x: zero by symmetry
var(x)           # variance of x
plot(x, y)       # scatter plot: a perfect parabola
cor(x, y)        # Pearson correlation: exactly 0, despite perfect dependence
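Pearson correlation only measures linear association; restricting to the half of the grid where the relation is monotone makes the dependence visible again. A small sketch, assuming x as above:
xp <- x[x >= 0]
yp <- 2*xp^2
cor(xp, yp)  # strongly positive now
cor(xp, yp, method = "spearman")  # exactly 1: the relation is monotone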
GET_HTML
# fetch the page
url <- 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/qgqp/index.phtml?qq-pf-to=pcqq.c2c&p=1'
#url <- './web.html'  # local copy, handy for offline testing
web <- readLines(url)  # for CJK pages an explicit encoding= may be needed
# patterns marking the start and end of the table
pt1 <- '<table.*>'
pt2 <- '</table>'
# keep only the lines between the first <table> and the first </table>
web <- web[grep(pt1, web)[1]:grep(pt2, web)[1]]
# pattern matching any HTML tag
pt <- '<[^>]*?>'
# strip the tags, then all whitespace, then empty strings
cont <- gsub(pt, '', web)
cont <- gsub('\\s+', '', cont)
cont <- cont[cont != '']
# reshape into a table (the page's table has 12 columns)
mtr <- matrix(cont, ncol = 12, byrow = TRUE)
# write the result to file
write.csv(mtr, './res.csv')
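Regex scraping breaks as soon as the page layout changes; the rvest package parses the HTML properly instead. A minimal sketch, assuming rvest is installed and the page still serves a plain HTML table:
library(rvest)
page <- read_html(url)
tables <- html_table(page)  # every <table> element as a data frame
mtr2 <- tables[[1]]  # assumption: the target is the first table on the page
write.csv(mtr2, './res.csv')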
LOGIT
df <- read.csv('./data/data.csv')
summary(df)
set.seed(1)
train <- sample(1:nrow(df), 1e5)  # random training set; the rest is held out
# columns: 违约 = default flag, 标的总额 = loan amount, 年利率 = annual interest rate, 还款期限 = repayment term
fit <- glm(违约 ~ 标的总额 + 年利率 + 还款期限, data=df[train,], family=binomial())
summary(fit)
# in-sample (training) accuracy, for reference:
# probs <- predict(fit, type='response')
# out <- ifelse(probs>=0.5, 1, 0)
# a <- table(out, df$违约[train])
# sum(diag(a))/sum(a) *100
# out-of-sample accuracy on the held-out rows
probs <- predict(fit, newdata = df[-train,], type='response')
out <- ifelse(probs>=0.5, 1, 0)
a <- table(out, df$违约[-train])
sum(diag(a))/sum(a) *100  # classification accuracy in percent
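Accuracy at a single 0.5 cutoff hides the threshold trade-off; the ROC curve summarizes all cutoffs at once. A minimal sketch using the pROC package (an assumption; any ROC package would do), with probs, df, and train as above:
library(pROC)
r <- roc(df$违约[-train], probs)  # actual responses vs predicted probabilities
auc(r)   # area under the ROC curve
plot(r)  # the curve itself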
SONGCI
splitwords <- function(x)
{
  # all overlapping character bigrams of x; assumes nchar(x) >= 2
  substring(x, 1:(nchar(x)-1), 2:nchar(x))
  # e.g. substring("abcdef", 1:5, 2:6) returns "ab" "bc" "cd" "de" "ef"
}
# read the corpus
txt <- read.csv("SongPoem.csv", colClasses="character")
# split each verse on (full-width) punctuation
sentences <- strsplit(txt$Sentence, ",|。|!|?|、")
sentences <- unlist(sentences)
sentences <- sentences[sentences!=""]
s.len <- nchar(sentences)
# overly long "sentences" usually indicate malformed text; drop them, along
# with single characters, which splitwords() cannot handle
sentences <- sentences[s.len >= 2 & s.len <= 15]
s.len <- nchar(sentences)
# bigram frequency counts
words <- mapply(splitwords, sentences, SIMPLIFY=TRUE, USE.NAMES=FALSE)
words <- unlist(words)
words.freq <- table(words)
words.freq <- sort(words.freq,decreasing=TRUE)
words.freq[1:100]
# draw a word cloud
require(wordcloud2)
wf <- words.freq[2:500]  # bigrams ranked 2 through 500
d <- data.frame(word = names(wf), freq = as.numeric(wf))
wordcloud2(d, size = 0.5)
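If wordcloud2 is not available, the top frequencies can also be inspected with a plain base-R barplot; a minimal sketch, assuming words.freq as computed above:
top <- head(words.freq, 20)
op <- par(mar = c(4, 5, 2, 1))  # widen the left margin for the CJK labels
barplot(rev(top), horiz = TRUE, las = 1, col = "grey",
        main = "Top 20 bigrams")
par(op)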
ANSCOMBER