RStudio

最新推荐文章于 2024-12-30 04:50:38 发布

LIQIANX

最新推荐文章于 2024-12-30 04:50:38 发布

阅读量586

点赞数

文章标签： java 服务器数据库

本文链接：https://blog.csdn.net/L_kuaing/article/details/125446330

版权

本文通过RStudio展示了各种数据分析操作，包括Bootstrapping统计、回归分析、线性模型、数据可视化和异常检测。同时，还涉及到文本处理、数据库交互、网页抓取、词云生成以及统计图表的绘制。通过对不同数据集的应用，探讨了从数据预处理到模型建立再到结果解释的全过程。

摘要由CSDN通过智能技术生成

MEANBOOT

# Bootstrap estimate of the sample mean: resample `y` with replacement `n`
# times, store the mean of every resample, then summarise the distribution.
n <- 10^4  # number of bootstrap replicates
y <- c(4.313, 4.513, 5.489, 4.265,
       3.641, 5.106, 8.006, 5.087)  # original sample

# Preallocate the vector that receives one bootstrap mean per replicate.
mvec <- numeric(n)
for (i in seq_len(n)) {
  # sample() with replace = TRUE draws length(y) values from y.
  ystar <- sample(y, replace = TRUE)
  mvec[i] <- mean(ystar)
}

mean(mvec)  # overall mean of the bootstrap means

# 95% percentile interval of the bootstrap means. The argument names are
# spelled out in full (`probs`, `names`) instead of relying on partial matching.
quantile(mvec, probs = c(0.025, 0.975), names = TRUE)

# Density-scaled histogram of the bootstrap means (about 40 cells requested).
hist(mvec, breaks = 40, col = "grey",
     freq = FALSE, main = "Bootstrap均值分布")

REGPRIN

# Linear regression "by hand" through the normal equations, followed by the
# same fit using lm().

# Load the data
df <- read.csv("data.csv")
mt <- as.matrix(df)

# Prepare the data: the first column is used as the response, columns 2-3 as
# regressors, and a column of ones is prepended for the intercept.
y <- mt[, 1]
X <- mt[, 2:3]
X <- cbind(1, X)

# Normal equations: (X'X) beta = X'y
XTX <- t(X) %*% X
# XTX <- crossprod(X)   # equivalent formulation
i_XTX <- solve(XTX)     # explicit inverse, kept so existing uses still work

# Solve the linear system directly rather than multiplying by the explicit
# inverse; this is the numerically preferable way to obtain the coefficients.
XTy <- crossprod(X, y)
beta <- solve(XTX, XTy)

# In practice regression in R is a single call to lm().
# NOTE(review): the formula assumes df has columns named x1 and x2; `y` is
# taken from df if such a column exists, otherwise from the vector defined
# above - confirm against data.csv.
fit <- lm(y ~ x1 + x2, data = df)
summary(fit)
COR

# y is an exact (quadratic) function of x on a range symmetric around zero;
# the script prints the mean and variance of x, plots y against x, and prints
# their Pearson correlation.
x <- seq.int(-1000L, 1000L)  # integers from -1000 to 1000
y <- 2 * x^2                 # y computed from x by the formula

mean(x)  # mean of x
var(x)   # variance of x

plot(x, y)  # scatter plot of y against x
cor(x, y)   # correlation coefficient between x and y
GET_HTML

# Download a web page, cut out its first <table> ... </table> block, strip
# the HTML tags and write the remaining cell texts to a CSV file.

# Read the page, one element per line.
# NOTE(review): no encoding is passed to readLines(); if the text comes out
# garbled, the page encoding may need to be specified - confirm.
url <- 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/qgqp/index.phtml?qq-pf-to=pcqq.c2c&p=1'
# url <- './web.html'
web <- readLines(url)

# Patterns marking the opening and closing lines of the table
pt1 <- '<table.*>'
pt2 <- '</table>'

# Locate the table. grep() may return zero or several line numbers, so the
# boundaries are chosen explicitly: the first opening line and the first
# closing line at or after it.
table_start <- grep(pt1, web)
table_end <- grep(pt2, web)
if (length(table_start) == 0L || length(table_end) == 0L) {
  stop("no <table> ... </table> block found in ", url, call. = FALSE)
}
first_line <- table_start[1]
last_line <- table_end[table_end >= first_line][1]
if (is.na(last_line)) {
  stop("no closing </table> found after the opening <table> in ", url,
       call. = FALSE)
}
web <- web[first_line:last_line]

# Pattern matching any HTML tag
pt <- '<[^>]*?>'

# Clean-up: remove tags, remove all whitespace, drop lines that end up empty.
cont <- gsub(pt, '', web)
cont <- gsub('\\s+', '', cont)
cont <- cont[cont != '']

# Arrange the remaining texts row by row into 12 columns.
# matrix() warns if the number of texts is not a multiple of 12.
mtr <- matrix(cont, ncol = 12, byrow = TRUE)

# Write the result to a file
write.csv(mtr, './res.csv')

LOGIT

# Logistic regression of the default flag on loan amount, annual interest
# rate and repayment term; fitted on a random training subset and scored on
# the remaining rows.
df <- read.csv("./data/data.csv")
summary(df)

# Reproducible draw of 1e5 training row indices.
# sample() stops with an error if df has fewer than 1e5 rows.
set.seed(1)
train <- sample(seq_len(nrow(df)), 1e5)

fit <- glm(违约 ~ 标的总额 + 年利率 + 还款期限, data = df[train, ],
           family = binomial())
summary(fit)

# In-sample accuracy (disabled):
# probs <- predict(fit, type='response')
# out <- ifelse(probs>=0.5, 1, 0)
# a <- table(out, df$违约[train])
# sum(diag(a))/sum(a) *100

# Out-of-sample accuracy in percent, using a 0.5 probability cut-off.
probs <- predict(fit, newdata = df[-train, ], type = "response")
out <- ifelse(probs >= 0.5, 1, 0)

# Fix the predicted levels to 0 and 1 so the confusion table keeps both rows
# even when every prediction lands in the same class; otherwise diag() would
# pick the wrong cell and the accuracy would be miscomputed.
# NOTE(review): assumes the observed column is coded 0/1 in that order - confirm.
a <- table(out = factor(out, levels = c(0, 1)), df$违约[-train])
sum(diag(a)) / sum(a) * 100

宋词

# Split a single string into its overlapping two-character substrings
# (character bigrams), e.g. "abcdef" -> "ab" "bc" "cd" "de" "ef".
#
# x: a single character string.
# Returns a character vector with nchar(x) - 1 elements. Strings shorter than
# two characters (and NA) contain no bigram and yield character(0); the
# previous `1:(nchar(x) - 1)` counted downwards for such inputs and returned
# spurious one-character pieces.
splitwords <- function(x) {
  len <- nchar(x)
  if (is.na(x) || is.na(len) || len < 2L) {
    return(character(0))
  }
  starts <- seq_len(len - 1L)
  substring(x, starts, starts + 1L)
}

# Character-bigram frequencies of the poem text, plus a word cloud.

# Read the poems; every column is read as character.
txt <- read.csv("SongPoem.csv", colClasses = "character")

# Split the text into sentences at punctuation marks.
sentences <- strsplit(txt$Sentence, "，|。|！|？|、")
sentences <- unlist(sentences)
sentences <- sentences[sentences != ""]
s.len <- nchar(sentences)

# A very long "sentence" probably contains erroneous characters; drop
# anything longer than 15 characters.
sentences <- sentences[s.len <= 15]
s.len <- nchar(sentences)

# Word frequencies: split every sentence into bigrams and count them.
# lapply() always returns a list, so the result is a plain character vector
# whatever the sentence lengths are.
words <- unlist(lapply(sentences, splitwords), use.names = FALSE)
words.freq <- table(words)
words.freq <- sort(words.freq, decreasing = TRUE)
head(words.freq, 100)  # top 100; no NA padding when there are fewer entries

# Draw the word cloud.
# library() fails immediately if the package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(wordcloud2)
# Entries ranked 2 to 500, as before, without NA rows on short tables.
# NOTE(review): the most frequent entry is skipped; the reason is not
# documented here - confirm it is intentional.
wf <- head(words.freq[-1], 499)
d <- data.frame(word = names(wf), freq = as.numeric(wf))
wordcloud2(d, size = 0.5)

ANSCOMBER