数据简单爬取:
包
library(RCurl)
# Minimal scraping walkthrough: download one page and pull out the text held
# in its `title=` attributes.
#
# `url` must be assigned to the address of the page before this snippet runs.
# The check below fails early with a clear message when it has not been set
# (an unassigned `url` would otherwise resolve to the base function `url`).
stopifnot(is.character(url), length(url) == 1L)

# Browser-like request headers so the request looks like an ordinary visit.
myHttpheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Collector for the debug output that getURL() reports through
# `debugfunction`.
d <- debugGatherer()

# Fetch the page source.
tmp <- getURL(
  url = url,
  httpheader = myHttpheader,
  debugfunction = d$update,
  verbose = TRUE
)

# Cut the source at a delimiter. An empty `split` would break the text into
# single characters, so the delimiter used by the full script in these notes
# is used here; adapt it to the page being scraped.
pol <- strsplit(tmp, split = " coords=")

# Flatten the list returned by strsplit() into one character vector.
pol <- unlist(pol)

# Drop the first piece (the text in front of the first delimiter).
pol1 <- pol[-1]

# Locate the first match of the pattern inside every piece, then extract the
# matched text itself.
where <- regexpr("title=(.*?)\n<", pol1)
pol3 <- regmatches(pol1, where)

# Unlike strsplit(), gsub() deletes the matched text instead of cutting at
# it. Escaping reminder for patterns: a newline is written "\\n", and a
# literal backslash needs extra backslashes as well.
pol4 <- gsub("title=", "", pol3)

# First element of every entry, returned as a list.
lapply(X = pol4, function(file) file[1])
进度条:
library(tcltk)
# Progress-bar walkthrough using tcltk: create the bar, advance it once per
# loop iteration, and close it when the work is done.

# NOTE(review): kept from the original notes; it opens an empty plot and the
# progress bar below does not appear to depend on it -- confirm before
# removing.
plot.new()

pb <- tkProgressBar("进度", "已完成 %", 0, 100)

# One constant drives both the loop bound and the percentage so the bar ends
# at exactly 100.
n_steps <- 10
# Text shown in front of the percentage (the original passed the base
# function `c` here, which sprintf() cannot format).
task_label <- "demo"

for (i in seq_len(n_steps)) {
  # ... the real work for step i goes here ...
  info <- sprintf("%s已完成 %d%%", task_label, round(i * 100 / n_steps))
  setTkProgressBar(pb, i * 100 / n_steps, sprintf("进度 (%s)", info), info)
}

# Close the progress-bar window.
close(pb)
全代码:空气质量爬取
library(tcltk)
library(stringr)
library(RCurl)
library(ggplot2)
# Air-quality scraper: for every city in `cityweath`, download `n_pages`
# result pages, extract the records embedded in the pages' `title=`
# attributes, and write all cities side by side to "all.csv".

all <- data.frame()

# NOTE(review): kept from the original script; it opens an empty plot and the
# progress bar does not appear to depend on it -- confirm before removing.
plot.new()
pb <- tkProgressBar("进度", "已完成 %", 0, 100)

# Browser-like request headers so the request looks like an ordinary visit.
myHttpheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

cityweath <- list(
  "广州市", "深圳市", "珠海市", "汕头市", "佛山市",
  "韶关市", "中山市", "江门市", "湛江市", "肇庆市"
)

# Collector for the debug output that getURL() reports through
# `debugfunction`.
d <- debugGatherer()

# Number of result pages requested per city.
n_pages <- 12L
base_url <- "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city="

# Split one downloaded page into its records.
#
# html: page source as a single string.
# Returns a list with one character vector of space-separated fields per
# record; an empty list when the page contains no record.
parse_air_page <- function(html) {
  pieces <- unlist(c(
    strsplit(html, split = " coords="),
    strsplit(html, split = " </map></td>")
  ))
  pieces <- pieces[-1]

  # Keep the first `title=...` match of every piece, then drop the first hit.
  hits <- regmatches(pieces, regexpr("title=(.*?)\n<", pieces))
  hits <- hits[-1]

  # Strip the attribute name, the quotes, the newlines and the tag
  # delimiters. (The original also split on the misspelt "tittle=", which
  # never matched and had no effect, so that step is gone.)
  hits <- gsub("title=", "", hits)
  hits <- gsub("\\\"", "", hits)
  hits <- gsub("\\n", "", hits)
  hits <- gsub("><", "", hits)

  records <- strsplit(hits, split = " ")
  n_rec <- length(records)
  if (n_rec == 0L) {
    return(list())
  }

  # Move the last record to the front. The original index `2:n-1` parses as
  # `(2:n) - 1`; it raised a subscript error on an empty page and duplicated
  # the record on a one-record page.
  c(records[n_rec], records[seq_len(n_rec - 1L)])
}

city_frames <- vector("list", length(cityweath))

for (j in seq_along(cityweath)) {
  # `[[` yields the city name itself rather than a one-element list.
  city <- cityweath[[j]]

  # One slot per page; pages without records stay NULL and vanish in
  # unlist().
  page_time <- vector("list", n_pages)
  page_quanty <- vector("list", n_pages)

  for (i in seq_len(n_pages)) {
    url <- str_c(
      base_url, city,
      "&startdate=2016-01-01&enddate=2016-11-30&page=", i
    )
    tmp <- getURL(
      url = url,
      httpheader = myHttpheader,
      debugfunction = d$update,
      verbose = TRUE
    )

    records <- parse_air_page(tmp)
    if (length(records) > 0L) {
      # Fields 1 and 3 of every record -- presumably the date and the
      # air-quality reading; verify against the page layout.
      page_time[[i]] <- vapply(records, function(rec) rec[1], character(1))
      page_quanty[[i]] <- vapply(records, function(rec) rec[3], character(1))
    }

    info <- sprintf("%s已完成 %d%%", city, round(i * 100 / n_pages))
    setTkProgressBar(pb, i * 100 / n_pages, sprintf("进度 (%s)", info), info)
  }

  thetime <- unlist(page_time)
  quanty <- unlist(page_quanty)
  if (length(thetime) == 0L) {
    close(pb)
    stop("No records found for city: ", city, call. = FALSE)
  }

  # NOTE(review): `col.names` is not a formal argument of data.frame(); it is
  # absorbed by `...` and becomes a third column carrying the city name. It
  # is kept unchanged so the column layout of all.csv stays the same.
  city_frames[[j]] <- data.frame(
    thetime, quanty,
    stringsAsFactors = FALSE,
    col.names = cityweath[j]
  )
}

# Cities are placed side by side, so every city should contribute the same
# number of rows.
all <- do.call(cbind, city_frames)

# Close the progress-bar window.
close(pb)
write.csv(all, "all.csv")