Rmarkdown使用rvest包实现对静态网页数据抓取

一、获取节点信息的两种方法

1. XPath方法:节点路径
(1)绝对路径(复制完整的XPath)
(2)相对路径(复制XPath)
2.CSS方法:节点选择器(Selector)

二、爬取二十大网页报告(简单节点)

因为爬虫代码无法发出,所以这里只对二十大报告的文稿内容进行词云图分析

# Segment the report text with jiebaR and draw a basic word cloud.
# `content3` (the report text) is produced by scraping code that is not
# included in this document.
library(jiebaR)
library(wordcloud2)

# Segmentation engine with a user dictionary and a stop-word list
words <- worker(user = "add_dict.txt", stop_word = "stopwords.txt")
seg <- segment(content3, words)  # split the text into words
freq <- freq(seg)  # word-frequency table

# Order the table by its `freq` column, highest count first
freq1 <- freq[order(-freq$freq), ]
# First 120 rows of the sorted table; the plotting calls shown here use freq1
freq2 <- freq1[1:120, ]

# Simple circular word cloud
wordcloud2(data = freq1, size = 1, shape = "circle")

在这里插入图片描述

# Custom mask image: the cloud is shaped by map.png (a map of China)
wordcloud2(data = freq1, size = 1, figPath = "map.png")

在这里插入图片描述

# Word cloud shaped by the mask image 词1.jpg
wordcloud2(data = freq1, size = 1, figPath = "词1.jpg")

在这里插入图片描述

# Word cloud shaped by the mask image 词2.jpg
wordcloud2(data = freq1, size = 1, figPath = "词2.jpg")

在这里插入图片描述

# Word cloud shaped by the mask image 词3.png
wordcloud2(data = freq1, size = 1, figPath = "词3.png")

在这里插入图片描述

# Custom text mask: the cloud is drawn in the shape of the text "20大"
letterCloud(data = freq1, word = "20大", size = 1)

在这里插入图片描述

三、爬取“我爱我家”新房全部楼盘(一页)(XPath方法提取节点)

1.提取新房名称、居室、面积、地址、价格、是否在售、类型

# Clear every object from the workspace before the housing example
rm(list = ls())

library(rvest)

# Listing page of all new-build developments on 5i5j ("我爱我家")
url <- "https://fang.5i5j.com/bj/loupan/n1/"

# read_html(): download the page and parse it into an HTML document
html <- read_html(x = url, encoding = "utf-8")

在这里插入图片描述
在这里插入图片描述

**注释:**我们按照上面第一张图片,依次进行步骤1、2后,可以定位到第一个新房名称的源代码位置;在该位置右击鼠标,可以得到第二张图片中的菜单,选择“复制XPath”或“复制完整的XPath”,就可以得到第一个新房名称的节点信息了。

(1)相对路径 第一个新房名称的XPath为`//*[@id="…"]/div[2]/div[1]/a/span[2]`(其中id的取值以浏览器中实际复制到的结果为准)。

但是,如果我们想要一次性提取整个页面中十个新房的名称、价格等数据,我们需要找到新房名称的所有节点,即//div[2]/div[1]/a/span[2]。

页面中关于新房的其他信息都可以通过寻找节点信息得到。

(2)绝对路径 第一个新房名称完整的XPath为/html/body/div[4]/div[1]/ul[1]/li[1]/div[2]/div[1]/a/span[2]。 但是,如果我们想要一次性提取整个页面中十个新房的名称、价格等数据,我们需要找到新房名称的所有节点,即去掉li后面的序号:/html/body/div[4]/div[1]/ul[1]/li/div[2]/div[1]/a/span[2](也可以直接使用上面的相对路径//div[2]/div[1]/a/span[2])。

页面中关于新房的其他信息都可以通过寻找节点信息得到。

# %>% is the pipe operator: it passes the value on its left as the first
# argument of the function on its right.
# html_nodes() selects the nodes of the page that match an XPath/CSS expression.
# html_text() returns the text inside the selected nodes.


## Relative paths

# Development names
house_name <- html %>%
  html_nodes(xpath = "//div[2]/div[1]/a/span[2]") %>%
  html_text()
head(house_name)
## [1] "和悦璞云"         "中骏云景台"       "和锦华宸"         "金茂北京国际社区"
## [5] "浪潮一览云山"     "万科七橡墅"
#### Absolute path (alternative to the relative path above; derived from the
#### full XPath of the first name with the li index removed - not run here)
# house_name <- html %>% html_nodes(xpath = "/html/body/div[4]/div[1]/ul[1]/li/div[2]/div[1]/a/span[2]") %>% html_text()

# Room layouts (trim = TRUE strips leading/trailing whitespace)
house_room <- html %>%
  html_nodes(xpath = "//div[2]/div[2]/a/span[2]") %>%
  html_text(trim = TRUE)
head(house_room)
## [1] "1居/2居/3居" "2居/3居"     "3居/4居"     "1居/2居/3居" "1居"        
## [6] "4居"
# Floor areas
house_area <- html %>%
  html_nodes(xpath = "//div[2]/div[2]/a/span[4]") %>%
  html_text(trim = TRUE)
head(house_area)
## [1] "47.26-100.15" "74.19-184.68" "81.51-129.31" "49.79-119.11" "18.77-142.92"
## [6] "575.37"
# Addresses
house_address <- html %>%
  html_nodes(xpath = "//div[2]/div[3]/a/span[5]") %>%
  html_text(trim = TRUE)
head(house_address)
## [1] "北京市大兴区榆南路"                                          
## [2] "良乡镇官道板块 南六环与良常路交汇处南行2公里(良乡六中对面)"
## [3] "北京市顺义区"                                                
## [4] "北京市顺义区水色时光西路西侧"                                
## [5] "河北省保定市涿州市"                                          
## [6] "燕房线周口店镇站北约800米"
# Prices
house_price <- html %>%
  html_nodes(xpath = "//div[3]/p") %>%
  html_text()
# Keep matches 3 to 12 (ten entries). Presumably the first two matches of
# this XPath are not listing prices - verify against the page.
house_price1 <- house_price[3:12]
head(house_price1)
## [1] "29000元/㎡" "28000元/㎡" "38000元/㎡" "32000元/㎡" "4500元/㎡" 
## [6] "21000元/㎡"
# Sale status
house_online <- html %>%
  html_nodes(xpath = "//div[2]/div[1]/a/p/span[1]") %>%
  html_text()
# Keep only the first ten matches
house_online1 <- house_online[1:10]
head(house_online1)
## [1] "在售" "在售" "在售" "在售" "在售" "在售"
# Property types
house_type <- html %>%
  html_nodes(xpath = "//div[2]/div[1]/a/p/span[2]") %>%
  html_text()
head(house_type)
## [1] "普通住宅" "普通住宅" "普通住宅" "普通住宅" "办公"     "普通住宅"

2.将爬取的数据整合成数据框并存储

# Combine the scraped vectors into one data frame, one column per field
house <- data.frame(
  "名称" = house_name,        # development name
  "居室" = house_room,        # room layouts
  "面积" = house_area,        # floor area
  "地址" = house_address,     # address
  "价格" = house_price1,      # price
  "是否在售" = house_online1, # sale status
  "类型" = house_type         # property type
)
head(house)
##               名称        居室         面积
## 1         和悦璞云 1居/2居/3居 47.26-100.15
## 2       中骏云景台     2居/3居 74.19-184.68
## 3         和锦华宸     3居/4居 81.51-129.31
## 4 金茂北京国际社区 1居/2居/3居 49.79-119.11
## 5     浪潮一览云山         1居 18.77-142.92
## 6       万科七橡墅         4居       575.37
##                                                           地址       价格
## 1                                           北京市大兴区榆南路 29000元/㎡
## 2 良乡镇官道板块 南六环与良常路交汇处南行2公里(良乡六中对面) 28000元/㎡
## 3                                                 北京市顺义区 38000元/㎡
## 4                                 北京市顺义区水色时光西路西侧 32000元/㎡
## 5                                           河北省保定市涿州市  4500元/㎡
## 6                                    燕房线周口店镇站北约800米 21000元/㎡
##   是否在售     类型
## 1     在售 普通住宅
## 2     在售 普通住宅
## 3     在售 普通住宅
## 4     在售 普通住宅
## 5     在售     办公
## 6     在售 普通住宅
# Save the housing table as a CSV file in the working directory
write.csv(x = house, file = "我爱我家新房(一页).csv")

四、爬取豆瓣图书TOP250(十页)(CSS方法提取节点-Selector)

1.观察爬取的网页特点

#当爬取豆瓣读书TOP250网页时,每个页面有25条数据,我们一共需要爬取10页。10页的网址如下所示:
#https://book.douban.com/top250?start=0
#https://book.douban.com/top250?start=25
#https://book.douban.com/top250?start=50
#https://book.douban.com/top250?start=75
#https://book.douban.com/top250?start=100
#https://book.douban.com/top250?start=125
#https://book.douban.com/top250?start=150
#https://book.douban.com/top250?start=175
#https://book.douban.com/top250?start=200
#https://book.douban.com/top250?start=225

解读:观察到网址的前半部分都是“https://book.douban.com/top250?start=”,只有最后的数字不一样,所以我们利用paste函数批量生成网址。

2.使用paste函数批量处理网址

# Clear every object from the workspace before the Douban example
rm(list = ls())

# Page indices 0..9; each page lists 25 books, so the offsets are 0, 25, ..., 225
a <- seq(0, 9, 1)
url_1 <- "https://book.douban.com/top250?start="

# paste0() is vectorised, so all ten page URLs are built in a single call
url_2 <- paste0(url_1, a * 25)

3.创建变量并赋值25行10列的矩阵

# Pre-allocate one 25 x 10 character matrix per field (25 books per page,
# 10 pages). The cells start as NA_character_ so that anything not filled by
# the scraping loop is visibly missing instead of holding a placeholder number.
book_name <- matrix(NA_character_, nrow = 25, ncol = 10)     # titles
book_basic <- matrix(NA_character_, nrow = 25, ncol = 10)    # basic info line
book_count <- matrix(NA_character_, nrow = 25, ncol = 10)    # number of ratings
book_star <- matrix(NA_character_, nrow = 25, ncol = 10)     # rating score
book_comment <- matrix(NA_character_, nrow = 25, ncol = 10)  # quotes

4.从对应网页中批量爬取数据

在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

# Scrape every page listed in url_2 and store each field in column i of the
# matching matrix.
for (i in seq_along(url_2)) {
  # Download and parse the page once and reuse it for all four fields, so each
  # page costs one request instead of four. "UTF-8" replaces the original
  # "gdk", which is not a recognised encoding name.
  page <- read_html(url_2[i], encoding = "UTF-8")

  # Book titles
  book_name[, i] <- page %>%
    html_nodes("td:nth-child(2) > div.pl2 > a") %>%
    html_text(trim = TRUE)

  ## Full selector copied from the browser:
  # content > div > div.article > div > table:nth-child(2) > tbody > tr > td:nth-child(2) > div.pl2 > a

  # Basic information line
  book_basic[, i] <- page %>%
    html_nodes("td:nth-child(2) > p.pl") %>%
    html_text()

  # Douban rating score
  book_star[, i] <- page %>%
    html_nodes("div.star.clearfix > span.rating_nums") %>%
    html_text()

  # Number of ratings
  book_count[, i] <- page %>%
    html_nodes("div.star.clearfix > span.pl") %>%
    html_text(trim = TRUE)
}

5.将数据转换成一列数据

## Reshape each matrix into a single column (column-major order, so page 1
## comes first) to make it easy to combine them into a data frame later
dim(book_name) <- c(length(book_name), 1L)
dim(book_basic) <- c(length(book_basic), 1L)
dim(book_count) <- c(length(book_count), 1L)
dim(book_star) <- c(length(book_star), 1L)

6.对图书的基本信息进行分割

library("stringr")

# Convert the one-column matrix to a plain character vector
book_basic <- as.character(book_basic)

# Split every basic-info string on the literal separator " / "
list0 <- strsplit(book_basic, " / ", fixed = TRUE)

# Take the last three fields of every record (publisher, publication date,
# price). vapply() always returns a 3-row character matrix, whereas sapply()
# would return a list if any record had fewer than three fields.
# Short records are padded with NA on the left so the price stays in row 3.
last_three_fields <- function(fields) {
  kept <- tail(fields, 3L)
  c(rep(NA_character_, 3L - length(kept)), kept)
}
(list2 <- vapply(list0, last_three_fields, character(3L)))
##      [,1]             [,2]         [,3]                 [,4]          
## [1,] "人民文学出版社" "作家出版社" "北京十月文艺出版社" "南海出版公司"
## [2,] "1996-12"        "2012-8-1"   "2010-4-1"           "2011-6"      
## [3,] "59.70元"        "20.00元"    "28.00"              "39.50元"     
##      [,5]         [,6]             [,7]         [,8]            
## [1,] "重庆出版社" "人民文学出版社" "译林出版社" "人民文学出版社"
## [2,] "2012-1-1"   "2008-12-1"      "2000-9"     "1998-05"       
## [3,] "168.00元"   "498.00元"       "40.00元"    "39.50元"       
##      [,9]               [,10]            [,11]             [,12]         
## [1,] "北京联合出版公司" "上海译文出版社" "群众出版社"      "南海出版公司"
## [2,] "2018-2"           "2007-3"         "1981-8"          "2013-1-1"    
## [3,] "45.00元"          "10.00元"        "53.00元/68.00元" "39.50元"     
##      [,13]            [,14]                    [,15]            [,16]         
## [1,] "人民文学出版社" "生活·读书·新知三联书店" "人民文学出版社" "哈尔滨出版社"
## [2,] "2003-8"         "1994-5"                 "1997-08"        "2003-8"      
## [3,] "22.00元"        "96.00元"                "25.00元"        "15.80元"     
##      [,17]        [,18]            [,19]            [,20]           
## [1,] "译林出版社" "人民文学出版社" "中国海关出版社" "中国青年出版社"
## [2,] "2012-9"     "1973-3"         "2009-4"         "1997-10"       
## [3,] "32.00元"    "0.36元"         "358.20元"       "27.00元"       
##      [,21]            [,22]            [,23]           
## [1,] "人民文学出版社" "人民文学出版社" "上海译文出版社"
## [2,] "2017-4"         "2018-7"         "2010-8"        
## [3,] "59.00元"        "62.00元"        "22.00元"       
##      [,24]                    [,25]        [,26]            [,27]           
## [1,] "生活·读书·新知三联书店" "中信出版社" "人民文学出版社" "人民文学出版社"
## [2,] "2001"                   "2014-11"    "2005-1"         "1991-2"        
## [3,] "12.00元"                "68.00元"    "64.00元"        "19.00"         
##      [,28]            [,29]                    [,30]           
## [1,] "人民文学出版社" "生活·读书·新知三联书店" "上海人民出版社"
## [2,] "1973-3"         "1994-5"                 "2006-04-01"    
## [3,] "0.20元"         "76.80元"                "38.00"         
##      [,31]                    [,32]            [,33]            [,34]         
## [1,] "生活·读书·新知三联书店" "天津人民出版社" "人民文学出版社" "南海出版公司"
## [2,] "2013-1"                 "2017-1"         "2012-9"         "2012-9-1"    
## [3,] "88.00元"                "32.00元"        "39.00元"        "39.50元"     
##      [,35]            [,36]            [,37]            [,38]           
## [1,] "人民文学出版社" "人民文学出版社" "上海译文出版社" "上海译文出版社"
## [2,] "2006-7"         "2006-5"         "1991-12-1"      "2012-1"        
## [3,] "29.90元"        "18.00元"        "43.90元"        "36.00元"       
##      [,39]            [,40]            [,41]           
## [1,] "春风文艺出版社" "上海译文出版社" "人民文学出版社"
## [2,] "2002-5"         "2006-8"         "1973-3"        
## [3,] "25.00元"        "15.00元"        "0.37元"        
##      [,42]                    [,43]            [,44]            [,45]       
## [1,] "生活·读书·新知三联书店" "中国青年出版社" "广西美术出版社" "新星出版社"
## [2,] "1997-5"                 "2003-7"         "2008-04"        "2013-3"    
## [3,] "18.00元"                "16.00元"        "280.00"         "28.00元"   
##      [,46]                [,47]            [,48]           
## [1,] "广西师范大学出版社" "上海人民出版社" "上海译文出版社"
## [2,] "2015-4"             "2006-5"         "2006-8"        
## [3,] "36.00元"            "29.00元"        "25.00元"       
##      [,49]                    [,50]        [,51]            [,52]           
## [1,] "生活·读书·新知三联书店" "作家出版社" "上海译文出版社" "上海译文出版社"
## [2,] "1999-04"                "2012-9"     "2011-1"         "2007-3"        
## [3,] "47.00元"                "24.00元"    "33.00元"        "18.00元"       
##      [,53]            [,54]            [,55]        [,56]           
## [1,] "人民文学出版社" "人民文学出版社" "译林出版社" "人民文学出版社"
## [2,] "1973-12-01"     "1973-4"         "2012-4-1"   "1994-11"       
## [3,] "0.31 元"        "0.25元"         "30.00元"    "21.45元"       
##      [,57]          [,58]            [,59]        [,60]            [,61]     
## [1,] "南海出版公司" "人民文学出版社" "花城出版社" "浙江教育出版社" "中华书局"
## [2,] "2008-9"       "2008-3"         "1999-3"     "1991-4"         "1982-11" 
## [3,] "28.00"        "19.00"          "19.00元"    "168.00元"       "125.00"  
##      [,62]            [,63]            [,64]            [,65]         
## [1,] "人民文学出版社" "人民文学出版社" "上海译文出版社" "上海三联书店"
## [2,] "1993-7"         "1992-6"         "2013-8"         "2015-1"      
## [3,] "13.00元"        "66.00元"        "34.00元"        "28.00元"     
##      [,66]                    [,67]            [,68]                   
## [1,] "生活·读书·新知三联书店" "人民文学出版社" "生活·读书·新知三联书店"
## [2,] "1997-5"                 "2004-8"         "1994-5"                
## [3,] "19.00元"                "47.20元"        "76.80元"               
##      [,69]        [,70]                [,71]            [,72]               
## [1,] "北京出版社" "广西师范大学出版社" "辽宁教育出版社" "江苏凤凰文艺出版社"
## [2,] "2006-7"     "2010-10"            "2006-1"         "2014-9"            
## [3,] "19.90元"    "38.00元"            "32.00元"        "32.00元"           
##      [,73]                [,74]          [,75]              [,76]           
## [1,] "广西师范大学出版社" "哈尔滨出版社" "北京联合出版公司" "上海人民出版社"
## [2,] "2013-1-10"          "2003-6"       "2020-6"           "2007-9"        
## [3,] "98.00元"            "15.80元"      "38.00元"          "28.00元"       
##      [,77]            [,78]            [,79]            [,80]       
## [1,] "四川文艺出版社" "上海人民出版社" "上海古籍出版社" "中华书局"  
## [2,] "2015-12"        "2021-8"         "1998-12"        "1984-07-01"
## [3,] "35.00元"        "65.00元"        "9.80元"         "0.65"      
##      [,81]            [,82]                [,83]         
## [1,] "浙江人民出版社" "广西师范大学出版社" "南海出版公司"
## [2,] "2015-7-31"      "2010.10"            "2003-1"      
## [3,] "54.90元"        "46.00元"            "20.00元"     
##      [,84]                    [,85]            [,86]               
## [1,] "生活·读书·新知三联书店" "电子工业出版社" "广西师范大学出版社"
## [2,] "2009-7"                 "2011-8"         "2013-1-1"          
## [3,] "43.00元"                "55.00元"        "39.80元"           
##      [,87]            [,88]            [,89]                [,90]           
## [1,] "百花文艺出版社" "陕西人民出版社" "上海锦绣文章出版社" "上海译文出版社"
## [2,] "2005-01"        "1996-10"        "2008-5"             "2017-6"        
## [3,] "19.00"          "4.50元"         "18.00元"            "45.00元"       
##      [,91]                [,92]        [,93]                   
## [1,] "北京十月文艺出版社" "新星出版社" "生活·读书·新知三联书店"
## [2,] "2008-07"            "2012-6"     "2003-7"                
## [3,] "36.00元"            "29.80元"    "18.80元"               
##      [,94]            [,95]        [,96]            [,97]           
## [1,] "人民文学出版社" "译林出版社" "人民文学出版社" "上海译文出版社"
## [2,] "2017-10"        "1997-8"     "1999-04"        "2006-8"        
## [3,] "62.00元"        "23.30元"    "15.40"          "23.00元"       
##      [,98]                [,99]        [,100]       [,101]          
## [1,] "江苏凤凰文艺出版社" "新星出版社" "译林出版社" "上海译文出版社"
## [2,] "2016-5-1"           "2021-1"     "2012-4"     "2007-3"        
## [3,] "129.80元"           "79.00元"    "25.00元"    "32.00元"       
##      [,102]                                [,103]           [,104]        
## [1,] "生活·读书·新知三联书店 上海三联书店" "上海人民出版社" "哈尔滨出版社"
## [2,] "2012-5"                              "2016-1"         "2004-6"      
## [3,] "38.00元"                             "49.00元"        "16.80元"     
##      [,105]           [,106]                  [,107]              
## [1,] "上海人民出版社" "广州出版社 花城出版社" "北京十月文艺出版社"
## [2,] "2008-10-01"     "2008-3"                "2007-6"            
## [3,] "28.00元"        "108.00元"              "28.00元"           
##      [,108]               [,109]           [,110]       [,111]      
## [1,] "江苏凤凰文艺出版社" "北岳文艺出版社" "作家出版社" "译林出版社"
## [2,] "2016-3-1"           "2002-4"         "2011-10"    "2011-9"    
## [3,] "58.00"              "12.00元"        "35.00元"    "28.00元"   
##      [,112]             [,113]               [,114]        
## [1,] "北京联合出版公司" "湖南科学技术出版社" "南海出版公司"
## [2,] "2020-7"           "2010-4"             "2014-2"      
## [3,] "58.00元"          "45.00元"            "29.50"       
##      [,115]                    [,116]       [,117]       [,118]      
## [1,] "果麦文化/上海文化出版社" "译林出版社" "译林出版社" "漓江出版社"
## [2,] "2021-7-28"               "2010-9"     "2003-6"     "1987-10"   
## [3,] "68.00元"                 "35.00元"    "6.00元"     "3.95元"    
##      [,119]       [,120]         [,121]           [,122]          
## [1,] "人民出版社" "南海出版公司" "少年儿童出版社" "上海译文出版社"
## [2,] "1991-5"     "2019-10"      "1962"           "2015-7"        
## [3,] "40.00元"    "59.00元"      "30.00元"        "23.00"         
##      [,123]               [,124]           [,125]           [,126]          
## [1,] "广西师范大学出版社" "化学工业出版社" "四川人民出版社" "天津人民出版社"
## [2,] "2004-5-1"           "2019-9"         "2021-2"         "2014-9"        
## [3,] "32.00元"            "48.00元"        "45.00元"        "68.00元"       
##      [,127]           [,128]           [,129]           [,130]          
## [1,] "时代文艺出版社" "人民文学出版社" "上海译文出版社" "江苏文艺出版社"
## [2,] "2009-1"         "2002-1"         "2007-7"         "2005-01"       
## [3,] "30.00元"        "9.80元"         "20.00元"        "20.00元"       
##      [,131]         [,132]           [,133]       [,134]          
## [1,] "万卷出版公司" "机械工业出版社" "商务印书馆" "上海译文出版社"
## [2,] "2010-6"       "2003-8"         "2013-8"     "1978年6月"     
## [3,] "25.00元"      "88.00元"        "76.00元"    "0.43"          
##      [,135]           [,136]           [,137]         [,138]          
## [1,] "江西人民出版社" "北京大学出版社" "哈尔滨出版社" "上海译文出版社"
## [2,] "2016-4-1"       "2006-1-10"      "2003-08"      "2018-5"        
## [3,] "39.8元"         "42.00"          "13.80"        "48"            
##      [,139]           [,140]       [,141]       [,142]          
## [1,] "人民文学出版社" "中信出版社" "作家出版社" "上海译文出版社"
## [2,] "2017-1"         "2011-10-24" "2007-10"    "2008-4"        
## [3,] "42.00元"        "68.00元"    "26.00元"    "15.00元"       
##      [,143]                 [,144]           [,145]              
## [1,] "皇冠文化出版有限公司" "上海人民出版社" "北京十月文艺出版社"
## [2,] "2014-6-9"             "2009-3"         "2007-5"            
## [3,] "NT$350"               "25.00元"        "28.00元"           
##      [,146]           [,147]           [,148]           [,149]     [,150]      
## [1,] "上海译文出版社" "人民文学出版社" "人民文学出版社" "中华书局" "花城出版社"
## [2,] "2010-8"         "1999-1"         "2001-6"         "1987-1-1" "2017-11"   
## [3,] "29.00元"        "10.20元"        "10.00元"        "21.00元"  "38.00元"   
##      [,151]               [,152]       [,153]           [,154]          
## [1,] "四川科学技术出版社" "花城出版社" "长江文艺出版社" "云南人民出版社"
## [2,] "2005-6"             "2014-6-15"  "2009-3"         "2010-7"        
## [3,] "22.00元"            "34.80元"    "29.80"          "22.00元"       
##      [,155]             [,156]                   [,157]              
## [1,] "解放军文艺出版社" "生活·读书·新知三联书店" "北京十月文艺出版社"
## [2,] "2005-3"           "2004-08"                "2006-12"           
## [3,] "28.00元"          "25.00元"                "29.80元"           
##      [,158]               [,159]           [,160]          
## [1,] "江苏少年儿童出版社" "上海文艺出版社" "人民邮电出版社"
## [2,] "2009-6"             "2015-6-1"       "2006-1"        
## [3,] "18.00元"            "CNY 20.00"      "68.00元"       
##      [,161]                      [,162]               [,163]          
## [1,] "中信出版社 浙江人民出版社" "广西师范大学出版社" "武汉大学出版社"
## [2,] "2007-1"                    "2015-1-1"           "2021-8"        
## [3,] "35.00"                     "39.00元"            "58.00元"       
##      [,164]                 [,165]                   [,166]          
## [1,] "上海社会科学院出版社" "生活·读书·新知三联书店" "上海文艺出版社"
## [2,] "2003-7"               "2005-7"                 "2007-8"        
## [3,] "25.00元"              "32.00元"                "37.00元"       
##      [,167]                   [,168]           [,169]         [,170]          
## [1,] "生活·读书·新知三联书店" "人民文学出版社" "上海三联书店" "中国城市出版社"
## [2,] "2009-12-1"              "1999-1"         "2010-1"       "2010-1"        
## [3,] "28.00元"                "38"             "25.00元"      "28.80"         
##      [,171]         [,172]           [,173]         [,174]          
## [1,] "新世界出版社" "天津人民出版社" "南海出版公司" "上海文艺出版社"
## [2,] "2005-6"       "2004-7"         "2016-11"      "2019-5-10"     
## [3,] "20.00元"      "18.00元"        "39.50元"      "62.00元"       
##      [,175]         [,176]           [,177]               [,178]          
## [1,] "南海出版公司" "人民文学出版社" "北京十月文艺出版社" "上海译文出版社"
## [2,] "2013-6"       "2006-5"         "2005-12"            "2008-7"        
## [3,] "25.00元"      "22.00元"        "29.00元"            "12.00元"       
##      [,179]               [,180]       [,181]               [,182]      
## [1,] "北京十月文艺出版社" "文汇出版社" "北京十月文艺出版社" "商务印书馆"
## [2,] "2008-4"             "2005-8"     "2006-12"            "1986-8"    
## [3,] "29.80元"            "25.00元"    "28.00元"            "28.00元"   
##      [,183]               [,184]               [,185]                  
## [1,] "北京十月文艺出版社" "四川科学技术出版社" "生活·读书·新知三联书店"
## [2,] "2007-7"             "2005-6"             "1999-04"               
## [3,] "28.00元"            "16.00元"            "47.00"                 
##      [,186]       [,187]           [,188]           [,189]          
## [1,] "重庆出版社" "人民文学出版社" "浙江文艺出版社" "上海译文出版社"
## [2,] "2013-10-1"  "1999-05"        "2016-12"        "2008-7"        
## [3,] "330.00元"   "9.20"           "48.00元"        "13.00"         
##      [,190]           [,191]           [,192]     [,193]          
## [1,] "上海古籍出版社" "上海译文出版社" "中华书局" "人民文学出版社"
## [2,] "2006-7"         "2006-4-1"       "2006-12"  "2003-01"       
## [3,] "20.00元"        "45.00元"        "9.80元"   "22.00"         
##      [,194]             [,195]           [,196]          
## [1,] "北京联合出版公司" "人民文学出版社" "湖南文艺出版社"
## [2,] "2018-2-1"         "1997-1"         "2013-8-5"      
## [3,] "58.00"            "50.60元"        "55"            
##      [,197]                   [,198]         [,199]        
## [1,] "生活·读书·新知三联书店" "南海出版公司" "新世界出版社"
## [2,] "2009-10"                "2014-5"       "2009-12"     
## [3,] "39.00元"                "39.50元"      "39.80元"     
##      [,200]               [,201]       [,202]           [,203]          
## [1,] "江苏凤凰文艺出版社" "译林出版社" "人民文学出版社" "人民文学出版社"
## [2,] "2015-10-1"          "2015-5"     "2008-12"        "1998-12"       
## [3,] "328.00元"           "36.00元"    "26.00"          "20.00"         
##      [,204]         [,205]           [,206]                  
## [1,] "南海出版公司" "上海译文出版社" "生活·读书·新知三联书店"
## [2,] "2016-5"       "2007-7"         "2004-08"               
## [3,] "45.00元"      "23.00元"        "21.80"                 
##      [,207]                   [,208]               [,209]            
## [1,] "生活·读书·新知三联书店" "广西师范大学出版社" "世界图书出版公司"
## [2,] "2006-8"                 "2017-3-1"           "2007-11"         
## [3,] "19.80元"                "65.00元"            "68.00元"         
##      [,210]       [,211]     [,212]       [,213]       [,214]      
## [1,] "中信出版社" "中华书局" "译林出版社" "花城出版社" "科学出版社"
## [2,] "2013-2"     "2007-04"  "2015-10"    "1997-5"     "2002-11"   
## [3,] "42.00元"    "20.00元"  "38.00"      "29.00元"    "29.00元"   
##      [,215]             [,216]       [,217]       [,218]          
## [1,] "北京联合出版公司" "译林出版社" "中信出版社" "人民文学出版社"
## [2,] "2021-9"           "1998-7"     "2016-1"     "2000-3-1"      
## [3,] "52.00元"          "28.00元"    "55.00元"    "12.00"         
##      [,219]         [,220]             [,221]           [,222]              
## [1,] "新世界出版社" "中国友谊出版公司" "上海译文出版社" "湖南科学技术出版社"
## [2,] "2004-1"       "2007-1"           "2009-6"         "2011-5"            
## [3,] "38.00元"      "25.00元"          "25.00元"        "26.00元"           
##      [,223]       [,224]         [,225]                   [,226]        
## [1,] "译林出版社" "哈尔滨出版社" "生活·读书·新知三联书店" "南海出版公司"
## [2,] "2012-11"    "2005-6"       "2003-1"                 "2012-5"      
## [3,] "36.00元"    "13.5元"       "21.80元"                "25.00元"     
##      [,227]           [,228]           [,229]       [,230]                 
## [1,] "上海文艺出版社" "上海译文出版社" "译林出版社" "雅众文化/中信出版集团"
## [2,] "2013-3"         "2004-5"         "2012-4"     "2019-4"               
## [3,] "48.00元"        "17.00元"        "22.00元"    "48.00元"              
##      [,231]           [,232]           [,233]         [,234]          
## [1,] "人民文学出版社" "湖南文艺出版社" "上海三联书店" "安徽文艺出版社"
## [2,] "2001-1"         "2018-4"         "2009-6"       "1992年"        
## [3,] "7.00元"         "49.80"          "25.00"        "45元"          
##      [,235]                   [,236]               [,237]          
## [1,] "生活·读书·新知三联书店" "湖南科学技术出版社" "北京大学出版社"
## [2,] "2006-3"                 "2019-6"             "2012-5"        
## [3,] "21.00元"                "56.00元"            "49.00元"       
##      [,238]             [,239]               [,240]       [,241]          
## [1,] "北京联合出版公司" "北京十月文艺出版社" "商务印书馆" "机械工业出版社"
## [2,] "2017-5"           "2009-3"             "2013-1"     "2015-3-1"      
## [3,] "36.80元"          "18.00元"            "12.00元"    "39.8"          
##      [,242]           [,243]           [,244]       [,245]          
## [1,] "天津人民出版社" "湖南文艺出版社" "译林出版社" "人民文学出版社"
## [2,] "2019-10"        "2019-1"         "2012-4-1"   "1963-4-1"      
## [3,] "45.00元"        "52.00元"        "20.00元"    "14.50元"       
##      [,246]           [,247]                    [,248]       [,249]          
## [1,] "人民邮电出版社" "雅众文化/云南人民出版社" "商务印书馆" "上海文化出版社"
## [2,] "2011-4"         "2015-12"                 "2003-2-1"   "2020-8"        
## [3,] "49.00元"        "29.00元"                 "18.00元"    "39.80元"       
##      [,250]          
## [1,] "浙江文艺出版社"
## [2,] "2020-5"        
## [3,] "39.8"
book_edit <- list2[1, ]   # publisher
book_time <- list2[2, ]   # publication date
book_price <- list2[3, ]  # price

# The authors can occupy the first and the second field of a record, so only
# the first field (the main author) is kept. vapply() guarantees a character
# vector and yields NA for an empty record, where sapply(..., head, 1) would
# return a list.
book_mainAuthor <- vapply(list0, function(fields) fields[1L], character(1L))

7.对图书名称、评价数进行清洗

# Remove every whitespace character (including line breaks) from the titles
book_name <- gsub(pattern = "\\s", replacement = "", x = book_name)
head(book_name)
##      [,1]                     
## [1,] "红楼梦"                 
## [2,] "活着"                   
## [3,] "1984"                   
## [4,] "百年孤独"               
## [5,] "三体全集:地球往事三部曲"
## [6,] "哈利·波特"
# Remove every whitespace character (including line breaks) from the counts
book_count <- gsub("\\s", "", book_count)

# Remove the surrounding parentheses. The class is limited to "(" and ")";
# the original class "[(.*)]" would also delete literal dots and asterisks.
book_count <- gsub("[()]", "", book_count)
head(book_count)
##      [,1]          
## [1,] "382899人评价"
## [2,] "722990人评价"
## [3,] "249610人评价"
## [4,] "385367人评价"
## [5,] "145821人评价"
## [6,] "78921人评价"

8.将所有数据合并成数据框

# Combine all cleaned fields into one data frame, one row per book
book2 <- data.frame(
  "书名" = book_name,            # title
  "出版社" = book_edit,          # publisher
  "出版时间" = book_time,        # publication date
  "价格" = book_price,           # price
  "第一作者" = book_mainAuthor,  # main author
  "豆瓣评分" = book_star,        # rating score
  "评价数" = book_count          # number of ratings
)

head(book2)
##                      书名             出版社  出版时间     价格
## 1                  红楼梦     人民文学出版社   1996-12  59.70元
## 2                    活着         作家出版社  2012-8-1  20.00元
## 3                    1984 北京十月文艺出版社  2010-4-1    28.00
## 4                百年孤独       南海出版公司    2011-6  39.50元
## 5 三体全集:地球往事三部曲         重庆出版社  2012-1-1 168.00元
## 6               哈利·波特     人民文学出版社 2008-12-1 498.00元
##                     第一作者 豆瓣评分       评价数
## 1             [清] 曹雪芹 著      9.6 382899人评价
## 2                       余华      9.4 722990人评价
## 3           [英] 乔治·奥威尔      9.4 249610人评价
## 4 [哥伦比亚] 加西亚·马尔克斯      9.3 385367人评价
## 5                     刘慈欣      9.5 145821人评价
## 6     J.K.罗琳 (J.K.Rowling)      9.7  78921人评价
# Save the combined table. The data frame built above is named book2; the
# original call referenced book3, which is never defined.
# NOTE(review): this writes into the "作业" sub-directory, while the later
# read.csv() call reads "book(TOP250).csv" from the working directory -
# confirm which location is intended.
write.csv(book2, file = "作业//book(TOP250).csv")

9.图书出版年份箱型图

# Load the saved Douban TOP250 table
BOOK <- read.csv(file = "book(TOP250).csv")

# The publication year is the first four characters of the date string
(BOOK$Year <- substring(BOOK$出版时间, first = 1, last = 4))
##   [1] "1996" "2012" "2010" "2011" "2012" "2008" "2000" "1998" "2018" "2007"
##  [11] "1981" "2013" "2003" "1994" "1997" "2003" "2012" "1973" "2009" "1997"
##  [21] "2017" "2018" "2010" "2001" "2014" "2005" "1991" "1973" "1994" "2006"
##  [31] "2013" "2017" "2012" "2012" "2006" "2006" "1991" "2012" "2002" "2006"
##  [41] "1973" "1997" "2003" "2008" "2013" "2015" "2006" "2006" "1999" "2012"
##  [51] "2011" "2007" "1973" "1973" "2012" "1994" "2008" "2008" "1999" "1991"
##  [61] "1982" "1993" "1992" "2013" "2015" "1997" "2004" "1994" "2006" "2010"
##  [71] "2006" "2014" "2013" "2003" "2020" "2007" "2015" "2021" "1998" "1984"
##  [81] "2015" "2010" "2003" "2009" "2011" "2013" "2005" "1996" "2008" "2017"
##  [91] "2008" "2012" "2003" "2017" "1997" "1999" "2006" "2016" "2021" "2012"
## [101] "2007" "2012" "2016" "2004" "2008" "2008" "2007" "2016" "2002" "2011"
## [111] "2011" "2020" "2010" "2014" "2021" "2010" "2003" "1987" "1991" "2019"
## [121] "1962" "2015" "2004" "2019" "2021" "2014" "2009" "2002" "2007" "2005"
## [131] "2010" "2003" "2013" "1978" "2016" "2006" "2003" "2018" "2017" "2011"
## [141] "2007" "2008" "2014" "2009" "2007" "2010" "1999" "2001" "1987" "2017"
## [151] "2005" "2014" "2009" "2010" "2005" "2004" "2006" "2009" "2015" "2006"
## [161] "2007" "2015" "2021" "2003" "2005" "2007" "2009" "1999" "2010" "2010"
## [171] "2005" "2004" "2016" "2019" "2013" "2006" "2005" "2008" "2008" "2005"
## [181] "2006" "1986" "2007" "2005" "1999" "2013" "1999" "2016" "2008" "2006"
## [191] "2006" "2006" "2003" "2018" "1997" "2013" "2009" "2014" "2009" "2015"
## [201] "2015" "2008" "1998" "2016" "2007" "2004" "2006" "2017" "2007" "2013"
## [211] "2007" "2015" "1997" "2002" "2021" "1998" "2016" "2000" "2004" "2007"
## [221] "2009" "2011" "2012" "2005" "2003" "2012" "2013" "2004" "2012" "2019"
## [231] "2001" "2018" "2009" "1992" "2006" "2019" "2012" "2017" "2009" "2013"
## [241] "2015" "2019" "2019" "2012" "1963" "2011" "2015" "2003" "2020" "2020"
library(ggplot2)

# Numeric copy of the year so it can be plotted and summarised
BOOK$Year1 <- as.numeric(BOOK$Year)

# Box plot of the publication years
ggplot(data = BOOK, mapping = aes(y = Year1)) +
  geom_boxplot(
    fill = "orange",
    alpha = 0.8,
    # size, fill colour and shape of the outlier markers
    outlier.size = 3,
    outlier.fill = "red",
    outlier.shape = 21
  ) +
  labs(title = "豆瓣图书TOP250出版年份分布箱型图", y = "图书出版年份") +
  # centre the plot title
  theme(plot.title = element_text(hjust = 0.5))

在这里插入图片描述

summary(BOOK$Year1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1962    2003    2008    2007    2013    2021
# na.rm = TRUE keeps max()/min() from returning NA when a year could not be
# parsed, and which() drops NA comparisons so no all-NA rows are selected.
(New <- BOOK[which(BOOK$Year1 == max(BOOK$Year1, na.rm = TRUE)), ])  # most recently published books
(Old <- BOOK[which(BOOK$Year1 == min(BOOK$Year1, na.rm = TRUE)), ])  # earliest published books
##       X                        书名                  出版社  出版时间    价格
## 78   78 置身事内:中国政府与经济发展          上海人民出版社    2021-8 65.00元
## 99   99        刘擎西方现代思想讲义              新星出版社    2021-1 79.00元
## 115 115          也许你该找个人聊聊 果麦文化/上海文化出版社 2021-7-28 68.00元
## 125 125          桶川跟踪狂杀人事件          四川人民出版社    2021-2 45.00元
## 163 163                鹿川有许多粪          武汉大学出版社    2021-8 58.00元
## 215 215          从零开始的女性主义        北京联合出版公司    2021-9 52.00元
##              第一作者 豆瓣评分      评价数 Year Year1
## 78             兰小欢      9.1 42413人评价 2021  2021
## 99               刘擎      9.2 23830人评价 2021  2021
## 115 [美]洛莉·戈特利布      9.0 31801人评价 2021  2021
## 125       [日] 清水洁      9.0 31120人评价 2021  2021
## 163            李沧东      9.0 19698人评价 2021  2021
## 215 [日本] 上野千鹤子      8.7 41568人评价 2021  2021
##       X         书名         出版社 出版时间    价格       第一作者 豆瓣评分
## 121 121 十万个为什么 少年儿童出版社     1962 30.00元 少年儿童出版社      9.1
##          评价数 Year Year1
## 121 16883人评价 1962  1962

解读:从箱型图我们可以看出,豆瓣图书TOP250的图书中,出版时间大多集中在2003年-2013年。其中,最近的图书出版于2021年,共有6本,分别是《置身事内:中国政府与经济发展》、《刘擎西方现代思想讲义》、《也许你该找个人聊聊》、《桶川跟踪狂杀人事件》、《鹿川有许多粪》、《从零开始的女性主义》;最早的图书出版于1962年,仅有一本,是《十万个为什么》。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值