一.词云图
数据主要是使用几篇公众号文章形成的txt
# ---- Word cloud, step 1: read the text, segment it, build a frequency table ----

# Set the working directory to the folder that holds the data files.
setwd("D:\\0A_Gotoyourdream\\00_2024Paper\\2024data")

# Install only the packages that are really missing, and cover every package
# loaded below (jiebaR / jiebaRD / readr were previously loaded but never
# installed, while the others were re-installed on every run).
required_pkgs <- c("tm", "wordcloud2", "RColorBrewer", "jiebaRD", "jiebaR", "readr")
pkg_available <- vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(pkg_available)) {
  install.packages(required_pkgs[!pkg_available])
}

# Load packages.
library(tm)
library(wordcloud2)
library(jiebaRD)
library(jiebaR)
library(readr)

# Input text file; fail early with a clear message if it is not there.
file_path <- "D:\\0A_STUDYING\\2024_ProfessorYH\\DATA\\黄金ETF_词云图.txt"
if (!file.exists(file_path)) {
  stop("Input file not found: ", file_path, call. = FALSE)
}

# Diagnostic only: print the encodings readr considers likely for this file.
guess_enc <- guess_encoding(file_path, n_max = 1000)
print(guess_enc)

# Read every line once, declaring the content as UTF-8.
text <- readLines(file_path, encoding = "UTF-8", n = -1)

# Lower-case the text so that Latin tokens such as "ETF" / "etf" are merged.
text_t <- tolower(text)

# Collapse the character vector into one string separated by spaces.
text_t <- paste(text_t, collapse = " ")

# Create the jiebaR segmentation engine used for Chinese word splitting.
engine <- worker()

# Split the text into individual words.
seg <- segment(text_t, engine)

# Drop every token that contains a digit.
seg <- seg[!grepl("\\d", seg)]

# Count how often each remaining token occurs.
seg_table <- table(seg)

# Stop words / domain words that should not appear in the cloud.
keywords_to_remove <- c("一下","更好","不能","的", "是", "在", "和", "了", "也", "对", "与", "中", "年",
                        "月", "日", "黄金", "etf", "基金","股票", "价格", "市场", "公司", "涨", "跌",
                        "交易", "股", "市场", "投资","可能","交易","为","ETF","将","其","而","从",
                        "但","截至","就","这","会","前","至","后","于","都","或","较","很","该",
                        "不","克", "类","一个", "约","自", "最","还", "均","时", "我","下", "相对","据",
                        "仍","说", "再","只", "吨","尽管", "近期","具体", "看","来", "T","D")
# The text was lower-cased above, so upper-case entries ("ETF", "T", "D")
# could never match; normalise the list the same way and drop duplicates.
keywords_to_remove <- unique(tolower(keywords_to_remove))
seg_table <- seg_table[!(names(seg_table) %in% keywords_to_remove)]

# Print the filtered frequency table.
print(seg_table)

# Keep the 300 most frequent words.
top_words <- head(sort(seg_table, decreasing = TRUE), 300)

# Two-column data frame expected by wordcloud2: `word` and `freq`.
word_freq_df <- data.frame(word = names(top_words), freq = as.numeric(top_words))
print(word_freq_df)
再生成词云图
# Draw the interactive word cloud from the word/frequency data frame.
# minRotation and maxRotation are both -pi/6 and rotateRatio is 1, so every
# word is drawn at the same tilt.
wordcloud2(
  data = word_freq_df,
  size = 2,
  shape = "circle",
  rotateRatio = 1,
  minRotation = -pi / 6,
  maxRotation = -pi / 6
)
二.日历热力图
具体参考的是这篇公众号推文:R金融数据可视化(3)收益率的日历图
# ---- Calendar heatmap of daily returns, figure 1 ----

# Set the working directory to the folder that holds the data files.
setwd("D:\\0A_Gotoyourdream\\00_2024Paper\\2024data")

# Moutai (600519)
library(tidyverse)
library(tidyquant)
library(pedquant)

# Download the daily quotes for the requested date range.
mt <- md_stock("600519.sH", from = '2021-09-04', to = '2024-09-04', source = "163")

# Keep only the relevant columns (symbol, date, open, high, low, close,
# volume) and convert the first returned table to a tibble.
mt_tidy <- mt[[1]] %>%
  as_tibble() %>%
  select(symbol, date, open, high, low, close, volume)

# Daily return computed from the closing price via periodReturn().
mt_return <- mt_tidy %>%
  tq_transmute(select = close,
               mutate_fun = periodReturn, period = 'daily',
               col_rename = 'daily.return')

# Calendar helper columns used to place each day on the heatmap grid.
mt_return <- mt_return %>%
  mutate(date = ymd(date),
         weekday = lubridate::wday(date, label = TRUE),  # day of week as a labelled factor
         monthday = lubridate::mday(date),               # day of the month
         month = lubridate::month(date, label = TRUE),   # month as a labelled factor
         week_y1 = lubridate::week(date),                # lubridate week of year (1-53)
         # "%W" is the week of the year (00-53) with Monday as the first day of
         # the week; it is NOT the ISO 8601 week number (that would be "%V").
         week_y2 = as.integer(strftime(date, "%W")),
         year = lubridate::year(date))                   # calendar year
head(mt_return)

# x-axis labels: one break per month, placed at the mean week number of that
# month. All available rows are used: the download ends on 2024-09-04, so
# restricting this to 2024 left October-December without any axis label.
monthlabel <- mt_return %>%
  group_by(month) %>%
  summarize(mean = mean(week_y2))

# Figure 1: one tile per trading day (x = week of year, y = weekday),
# one facet per year, fill = daily return.
# dplyr::filter is used directly; plotly is never loaded by this script.
mt_return %>%
  dplyr::filter(date > "2022-12-31") %>%
  ggplot(aes(x = week_y2, y = weekday)) +
  geom_tile(aes(fill = daily.return), color = "black") +
  labs(fill = 'daily return') +
  geom_text(aes(label = monthday), size = 2) +
  scale_fill_gradient2(low = "green",
                       mid = "white",
                       high = "darkred",
                       midpoint = 0) +
  scale_x_continuous(breaks = monthlabel$mean, labels = monthlabel$month) +
  facet_wrap(~year, nrow = 2, strip.position = 'left') +
  ylab("weekday") +
  xlab("") +
  theme_bw()
但是呢,总觉得这个图哪里怪怪的......
# ---- Calendar heatmap of daily returns, figure 2 (ggTimeSeries) ----
library(ggTimeSeries)

# Second way of numbering the weeks inside a month.
# The weekday is taken numerically with an explicit week start instead of
# comparing the label with "Sun": the label text depends on the session
# locale, so under a non-English locale that comparison was never TRUE and
# Sundays were mapped to 0 instead of 7.
# No grouping is applied: every new column is computed row by row, and the
# former group_by(year) %>% group_by(month) only left the result grouped by
# month (the second call replaces the first).
mt_return2 <- mt_return %>%
  mutate(weekday1 = lubridate::wday(date, week_start = 7),  # Sunday = 1 ... Saturday = 7
         weekday2 = lubridate::wday(date, week_start = 1),  # Monday = 1 ... Sunday = 7
         a = monthday %/% 7,   # number of complete 7-day spans before this day
         b = monthday %% 7) %>%  # remainder of the day of month modulo 7
  mutate(mweek1 = if_else(b == 0 & weekday2 == 7, a,
                          if_else(b <= weekday2, a + 1, a + 2)))

# Figure 2: calendar heatmap with the day of the month printed in each cell.
# y is flipped (8 - weekday2) so that Monday is drawn at the top.
mt_return2 %>%
  dplyr::filter(date > "2021-12-31") %>%
  ggplot(aes(date = date, fill = daily.return)) +
  stat_calendar_heatmap(color = "black") +
  geom_text(aes(y = 8 - weekday2, x = 1 + week_y2, label = monthday), size = 2) +
  labs(fill = 'daily return') +
  scale_fill_gradient2(low = "green",
                       mid = "white",
                       high = "darkred",
                       midpoint = 0) +
  scale_x_continuous(breaks = monthlabel$mean, labels = monthlabel$month) +
  scale_y_continuous(breaks = seq(7, 1, -1),
                     labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")) +
  facet_wrap(~year, nrow = 3, strip.position = 'right') +
  ylab("weekday") +
  xlab("") +
  theme_bw()
week_y1与week_y2的计数方法不同。
比如都是2021-09-10,week_y1=37,week_y2=36
该图展示了2022年、2023年和2024年的日历热力图,显示的是每日的股票收益率(daily return),通过颜色的渐变来体现每天的收益情况。颜色从绿色(负收益)到红色(正收益),白色代表接近于0的收益。
2022年可以看到绿色区域主要集中在年初,尤其是在1月和2月,这表明该阶段内的负收益较多。之后的几个月波动性较大,6月、7月、8月收益情况逐渐好转,显示出较为一致的正收益,表明这段时间市场可能处于上升趋势。
相比2022年,2023年上半年的负收益天数明显增多,尤其是在4月、5月期间,红色区域较少,表明该阶段市场表现欠佳。6月和7月情况有所好转,市场似乎逐渐回暖。
从每周的角度来看,某些天(例如周二、周四)出现负收益的天数较多,这可能反映了特定周期内市场的一些波动规律。例如,某些特定的市场事件或财务公告日可能集中在这些时间点。
相对而言,在每个季度结束时,如3月末、6月末、9月末、12月末,股票收益似乎呈现出更加明显的波动,可能与季度财报发布、经济数据公布或市场宏观因素变化有关。
三.地区热力图
假设已经获取好了关于上市股票信息的数据filtered_stock_info.csv
# ---- Per-province counts of listed companies for three sectors ----
# NOTE(review): `filtered_stock_info` must already exist in the session with
# at least the columns `industry` and `area`; it is not loaded here.

# Keep the rows of `stock_info` whose `industry` is one of `categories`.
# dplyr::filter is called directly; plotly is never loaded by this script.
filter_by_industry <- function(stock_info, categories) {
  dplyr::filter(stock_info, industry %in% categories)
}

# Count rows per `area` (province) and sort by descending count.
count_by_area <- function(stocks) {
  stocks %>%
    group_by(area) %>%
    summarize(count = n()) %>%
    arrange(desc(count))
}

# Biomedical sector: industry categories, matching stocks, per-province counts.
biomedical_categories <- c("化学制药", "生物制药", "医疗保健", "医药商业", "中成药")
biomedical_stocks <- filter_by_industry(filtered_stock_info, biomedical_categories)
# Inspect the filtered data.
head(biomedical_stocks)
province_count_bio <- count_by_area(biomedical_stocks)
print(province_count_bio)

# Finance sector: industry categories, matching stocks, per-province counts.
financial_categories <- c("保险", "多元金融", "商贸代理", "银行", "证券")
financial_stocks <- filter_by_industry(filtered_stock_info, financial_categories)
# Inspect the filtered data.
head(financial_stocks)
province_count_f <- count_by_area(financial_stocks)
print(province_count_f)

# Liquor sector: industry categories, matching stocks, per-province counts.
liquor_categories <- c("白酒", "啤酒", "红黄酒")
liquor_stocks <- filter_by_industry(filtered_stock_info, liquor_categories)
# Inspect the filtered data.
head(liquor_stocks)
province_count_liquor <- count_by_area(liquor_stocks)
print(province_count_liquor)
画不同板块的热力交互图,
# ---- Interactive province heatmaps ----

# Load packages.
library(hchinamap)
library(IRdisplay)
library(htmlwidgets)

# Build one China province heatmap widget.
#   name      province names
#   value     one number per province (here: count of listed companies)
#   title     chart title
#   max_color colour used for the largest value
# All other settings are shared by the three maps. `itermName` is spelled
# the way the hchinamap() argument is spelled.
plot_province_heatmap <- function(name, value, title, max_color) {
  hchinamap(name = name,
            value = value,
            width = "100%",
            height = "650px",
            title = title,
            region = "China",
            minColor = "#F8F8F8",
            maxColor = max_color,
            itermName = "指标",
            hoverColor = "#f6acf5")
}

# 1. Biomedical sector
name <- province_count_bio$area
value <- province_count_bio$count
map_widget_bio <- plot_province_heatmap(
  name, value,
  title = "生物医药类股票上市公司所在地分布热力图",
  max_color = "#02858C"
)
map_widget_bio

# 2. Finance sector (the title previously contained a doubled character)
name_f <- province_count_f$area
count_f <- province_count_f$count
map_widget_f <- plot_province_heatmap(
  name_f, count_f,
  title = "金融类股票上市公司所在地分布热力图",
  max_color = "#EEC900"
)
map_widget_f

# 3. Liquor sector
name_liquor <- province_count_liquor$area
count_liquor <- province_count_liquor$count
map_widget_liquor <- plot_province_heatmap(
  name_liquor, count_liquor,
  title = "酒类股票上市公司所在地分布热力图",
  max_color = "#B22222"
)
map_widget_liquor