# Scrape brand product-listing data from Taobao/Tmall search results
library(xml2)
library(rvest)
library(stringr)
# Starting page: Tmall search results for "nike" (first page of listings)
url <- "https://list.tmall.com/search_product.htm?q=nike&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton"
link <- url
# Collect paginated search-result URLs by following the "next page" anchor.
#
# Starting from `url`, each iteration reads the current page, extracts the
# relative href of the next-page link (a.ui-page-next) and joins it to the
# site base URL. Stops early if no next-page anchor is found.
#
# @param url     First search-result page to start from.
# @param n_pages Maximum number of additional pages to follow (default 20,
#                matching the original hard-coded loop).
# @return Character vector of page URLs, starting URL first; callers can
#         index it with link[i] just like the previous rbind() result.
download_link <- function(url, n_pages = 20) {
  base_url <- "https://list.tmall.com/search_product.htm"
  # Preallocate instead of growing the result with rbind() in the loop.
  links <- character(n_pages + 1)
  links[1] <- url
  for (i in seq_len(n_pages)) {
    web <- read_html(links[i])
    # Relative href of the "next page" button.
    # Alternative selector: "b.ui-page-num a:nth-of-type(1)"
    next_href <- web %>% html_nodes("a.ui-page-next") %>% html_attr("href")
    # Stop when there is no next page (last page reached) rather than
    # silently pasting an empty href onto the base URL.
    if (length(next_href) == 0 || is.na(next_href[1])) {
      return(links[seq_len(i)])
    }
    links[i + 1] <- paste0(base_url, next_href[1])
    # Progress indicator: number of the page just queued.
    print(i + 1)
    Sys.sleep(2)  # be polite to the server between requests
  }
  links
}
# Scrape one Tmall search-result page into a data frame of products.
#
# @param url A single search-result page URL.
# @return data.frame with columns title, price, shop, monsale, comments
#         (one row per product on the page).
download_product <- function(url) {
  web <- read_html(url)
  # Product title
  title <- web %>% html_nodes("p.productTitle") %>% html_nodes("a") %>%
    html_text(trim = TRUE)
  # Product price
  price <- web %>% html_nodes("p.productPrice") %>% html_text(trim = TRUE)
  # Shop name
  shop <- web %>% html_nodes("div.productShop a") %>% html_text(trim = TRUE)
  # Monthly sales / review count, newline-separated within one node.
  productStatus <- web %>% html_nodes("p.productStatus") %>%
    html_text(trim = TRUE)
  # Split once (not on every loop iteration) and size the output by the
  # actual number of products instead of a hard-coded 60; this also avoids
  # indexing into undefined `monsale`/`comments` vectors.
  status_parts <- str_split(productStatus, "\n")
  monsale <- vapply(status_parts, function(p) p[1], character(1))
  comments <- vapply(status_parts, function(p) p[2], character(1))
  data.frame(title, price, shop, monsale, comments,
             stringsAsFactors = FALSE)
}
# Collect all result-page URLs (start page plus followed "next" links).
# NOTE: the previous `LINK <- data.frame(web = NULL)` was dead code,
# immediately overwritten by this assignment.
LINK <- download_link(url)
length(LINK)
# Download every page and stack the per-page data frames. Building a list
# and binding once avoids the O(n^2) rbind-in-a-loop pattern and the
# special-cased first iteration; seq_along() is safe even if LINK is empty.
pages <- lapply(seq_along(LINK), function(i) download_product(LINK[i]))
data1 <- do.call(rbind, pages)
# Preview the first rows of the scraped data
head(data1)
# Write the result out as CSV (path kept as in the original script)
write.csv(data1, "d://asics.csv")
library(xml2)
library(rvest)
library(stringr)
# Starting page: Tmall search results for "nike" (first page of listings)
url <- "https://list.tmall.com/search_product.htm?q=nike&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton"
link <- url
# Collect paginated search-result URLs by following the "next page" anchor.
# (This definition duplicates the one earlier in the file; the last
# definition evaluated wins.)
#
# @param url     First search-result page to start from.
# @param n_pages Maximum number of additional pages to follow (default 20,
#                matching the original hard-coded loop).
# @return Character vector of page URLs, starting URL first; callers can
#         index it with link[i] just like the previous rbind() result.
download_link <- function(url, n_pages = 20) {
  base_url <- "https://list.tmall.com/search_product.htm"
  # Preallocate instead of growing the result with rbind() in the loop.
  links <- character(n_pages + 1)
  links[1] <- url
  for (i in seq_len(n_pages)) {
    web <- read_html(links[i])
    # Relative href of the "next page" button.
    # Alternative selector: "b.ui-page-num a:nth-of-type(1)"
    next_href <- web %>% html_nodes("a.ui-page-next") %>% html_attr("href")
    # Stop when there is no next page (last page reached) rather than
    # silently pasting an empty href onto the base URL.
    if (length(next_href) == 0 || is.na(next_href[1])) {
      return(links[seq_len(i)])
    }
    links[i + 1] <- paste0(base_url, next_href[1])
    # Progress indicator: number of the page just queued.
    print(i + 1)
    Sys.sleep(2)  # be polite to the server between requests
  }
  links
}
# Scrape one Tmall search-result page into a data frame of products.
# (This definition duplicates the one earlier in the file; the last
# definition evaluated wins.)
#
# @param url A single search-result page URL.
# @return data.frame with columns title, price, shop, monsale, comments
#         (one row per product on the page).
download_product <- function(url) {
  web <- read_html(url)
  # Product title
  title <- web %>% html_nodes("p.productTitle") %>% html_nodes("a") %>%
    html_text(trim = TRUE)
  # Product price
  price <- web %>% html_nodes("p.productPrice") %>% html_text(trim = TRUE)
  # Shop name
  shop <- web %>% html_nodes("div.productShop a") %>% html_text(trim = TRUE)
  # Monthly sales / review count, newline-separated within one node.
  productStatus <- web %>% html_nodes("p.productStatus") %>%
    html_text(trim = TRUE)
  # Split once (not on every loop iteration) and size the output by the
  # actual number of products instead of a hard-coded 60; this also avoids
  # indexing into undefined `monsale`/`comments` vectors.
  status_parts <- str_split(productStatus, "\n")
  monsale <- vapply(status_parts, function(p) p[1], character(1))
  comments <- vapply(status_parts, function(p) p[2], character(1))
  data.frame(title, price, shop, monsale, comments,
             stringsAsFactors = FALSE)
}
# Collect all result-page URLs (start page plus followed "next" links).
# NOTE: the previous `LINK <- data.frame(web = NULL)` was dead code,
# immediately overwritten by this assignment.
LINK <- download_link(url)
length(LINK)
# Download every page and stack the per-page data frames. Building a list
# and binding once avoids the O(n^2) rbind-in-a-loop pattern and the
# special-cased first iteration; seq_along() is safe even if LINK is empty.
pages <- lapply(seq_along(LINK), function(i) download_product(LINK[i]))
data1 <- do.call(rbind, pages)
# Preview the first rows of the scraped data
head(data1)
# Write the result out as CSV (path kept as in the original script)
write.csv(data1, "d://asics.csv")