1. Fetching a CAPTCHA image with RCurl
The RCurl package provides an R interface to the libcurl library, exposing a range of HTTP functionality: downloading files from a server, keeping connections alive, uploading files, reading binary content, following redirects, password authentication, and more.
What are curl & libcurl?
– curl: an open-source file-transfer tool that works from the command line using URL syntax
– libcurl: the library behind curl
Typical capabilities:
– fetching pages
– authentication
– uploading and downloading
– information retrieval
– …
First, load the package and fetch a CAPTCHA image using custom request headers:
library(RCurl)
## Loading required package: bitops
# Custom request headers for the CAPTCHA request
myHttpheader <- c(
  "Accept"="image/webp,*/*;q=0.8",
  "Accept-Encoding"="gzip,deflate,sdch",
  "Accept-Language"="zh-CN,zh;q=0.8",
  "Cache-Control"="no-cache",
  "Connection"="keep-alive",
  "Pragma"="no-cache",
  "Referer"="http://bbs.shangdu.com/s/regform.htm",
  "User-Agent"="Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36"
)
# Fetch the CAPTCHA image as raw binary data
png <- getBinaryURL("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ", httpheader=myHttpheader)
# Write the raw bytes to disk as a PNG file
writeBin(png, con = "E:\\新技术\\爬虫\\验证码/123.png")
# Optionally open the CAPTCHA URL in the default browser:
#shell.exec("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ")
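Among the features listed above, password authentication takes only one extra option. A minimal sketch, assuming a placeholder URL and placeholder credentials rather than a real service (both userpwd and httpauth appear in the option list printed later in this section):
# Hypothetical example: the URL and "user:password" below are placeholders
txt <- getURL(url = "http://example.com/protected/",
              userpwd = "user:password",   # libcurl's userpwd option, "user:pass" form
              httpauth = 1L)               # 1L selects HTTP basic authentication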
2. The three main RCurl functions
1) The getURL() function
#1) Check whether a URL exists
url.exists(url = "www.baidu.com")
## [1] TRUE
# Collect debugging information
d <- debugGatherer()
# With verbose = TRUE, the values in d$value() accumulate across calls
tmp <- getURL(url="www.baidu.com", debugfunction=d$update, verbose=T)
names(d$value())
## [1] "text" "headerIn" "headerOut" "dataIn" "dataOut"
## [6] "sslDataIn" "sslDataOut"
# Connection messages: server address and port
cat(d$value()[1])
## Rebuilt URL to: www.baidu.com/
## Trying 180.149.132.151...
## Connected to www.baidu.com (180.149.132.151) port 80 (#0)
## Connection #0 to host www.baidu.com left intact
#print(d$value()[1])
# Header information returned by the server
cat(d$value()[2])
## HTTP/1.1 200 OK
## Date: Wed, 20 Apr 2016 08:04:09 GMT
## Content-Type: text/html
## Content-Length: 14613
## Last-Modified: Wed, 03 Sep 2014 02:48:32 GMT
## Connection: Keep-Alive
## Vary: Accept-Encoding
## Set-Cookie: BAIDUID=905A08993D710BA9389B6B3048107981:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: BIDUPSID=905A08993D710BA9389B6B3048107981; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: PSTM=1461139449; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## P3P: CP=" OTI DSP COR IVA OUR IND COM "
## Server: BWS/1.1
## X-UA-Compatible: IE=Edge,chrome=1
## Pragma: no-cache
## Cache-control: no-cache
## Accept-Ranges: bytes
##
# Header information sent to the server
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## Accept: */*
##
# Clear d$value(); after the reset every slot is empty
d$reset()
d$value()
## text headerIn headerOut dataIn dataOut sslDataIn
## "" "" "" "" "" ""
## sslDataOut
## ""
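The update/value/reset triple makes debugGatherer() easy to wrap. As a sketch, request_headers() below is a hypothetical helper (not part of RCurl) that returns only the headers a request sends out:
# Hypothetical helper: collect just the outgoing request headers for a URL
request_headers <- function(u) {
  d <- debugGatherer()                 # fresh gatherer for each call
  invisible(getURL(url = u, debugfunction = d$update, verbose = TRUE))
  d$value()["headerOut"]               # the headers R sent to the server
}
request_headers("www.baidu.com")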
#2) Inspect the header information returned by the server, as a named vector
h <- basicHeaderGatherer()
txt <- getURL(url="http://www.baidu.com", headerfunction=h$update)
names(h$value())
## [1] "Date" "Content-Type" "Content-Length"
## [4] "Last-Modified" "Connection" "Vary"
## [7] "Set-Cookie" "Set-Cookie" "Set-Cookie"
## [10] "P3P" "Server" "X-UA-Compatible"
## [13] "Pragma" "Cache-control" "Accept-Ranges"
## [16] "status" "statusMessage"
h$value() # every element is just a character string
## Date
## "Wed, 20 Apr 2016 08:04:10 GMT"
## Content-Type
## "text/html"
## Content-Length
## "14613"
## Last-Modified
## "Wed, 03 Sep 2014 02:48:32 GMT"
## Connection
## "Keep-Alive"
## Vary
## "Accept-Encoding"
## Set-Cookie
## "BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## P3P
## "CP=\" OTI DSP COR IVA OUR IND COM \""
## Server
## "BWS/1.1"
## X-UA-Compatible
## "IE=Edge,chrome=1"
## Pragma
## "no-cache"
## Cache-control
## "no-cache"
## Accept-Ranges
## "bytes"
## status
## "200"
## statusMessage
## "OK"
cat(h$value()) # cat() gives a more readable display
## Wed, 20 Apr 2016 08:04:10 GMT text/html 14613 Wed, 03 Sep 2014 02:48:32 GMT Keep-Alive Accept-Encoding BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com CP=" OTI DSP COR IVA OUR IND COM " BWS/1.1 IE=Edge,chrome=1 no-cache no-cache bytes 200 OK
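Because h$value() exposes the HTTP status code as an ordinary string, it can gate further processing. A minimal sketch, reusing h and txt from the calls above:
# Only work with the page body when the request actually succeeded
if (h$value()[["status"]] == "200") {
  nchar(txt)   # e.g. measure the body; real code would parse it here
}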
# Inspect connection details for a URL request
curl <- getCurlHandle()
txt <- getURL(url="http://www.baidu.com", curl=curl)
names(getCurlInfo(curl))
## [1] "effective.url" "response.code"
## [3] "total.time" "namelookup.time"
## [5] "connect.time" "pretransfer.time"
## [7] "size.upload" "size.download"
## [9] "speed.download" "speed.upload"
## [11] "header.size" "request.size"
## [13] "ssl.verifyresult" "filetime"
## [15] "content.length.download" "content.length.upload"
## [17] "starttransfer.time" "content.type"
## [19] "redirect.time" "redirect.count"
## [21] "private" "http.connectcode"
## [23] "httpauth.avail" "proxyauth.avail"
## [25] "os.errno" "num.connects"
## [27] "ssl.engines" "cookielist"
## [29] "lastsocket" "ftp.entry.path"
## [31] "redirect.url" "primary.ip"
## [33] "appconnect.time" "certinfo"
## [35] "condition.unmet"
getCurlInfo(curl)$response.code
## [1] 200
getCurlInfo(curl=curl)
## $effective.url
## [1] "http://www.baidu.com/"
##
## $response.code
## [1] 200
##
## $total.time
## [1] 0.063
##
## $namelookup.time
## [1] 0
##
## $connect.time
## [1] 0.016
##
## $pretransfer.time
## [1] 0.016
##
## $size.upload
## [1] 0
##
## $size.download
## [1] 14613
##
## $speed.download
## [1] 231952
##
## $speed.upload
## [1] 0
##
## $header.size
## [1] 750
##
## $request.size
## [1] 52
##
## $ssl.verifyresult
## [1] 0
##
## $filetime
## [1] -1
##
## $content.length.download
## [1] 14613
##
## $content.length.upload
## [1] -1
##
## $starttransfer.time
## [1] 0.032
##
## $content.type
## [1] "text/html"
##
## $redirect.time
## [1] 0
##
## $redirect.count
## [1] 0
##
## $private
## NULL
##
## $http.connectcode
## [1] 0
##
## $httpauth.avail
## [1] 0
##
## $proxyauth.avail
## [1] 0
##
## $os.errno
## [1] 0
##
## $num.connects
## [1] 1
##
## $ssl.engines
## [1] "dynamic"
##
## $cookielist
## character(0)
##
## $lastsocket
## [1] 652
##
## $ftp.entry.path
## NULL
##
## $redirect.url
## NULL
##
## $primary.ip
## [1] "180.149.132.151"
##
## $appconnect.time
## [1] 0
##
## $certinfo
## list()
##
## $condition.unmet
## [1] 0
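A handle is also the natural place to share options and reuse the keep-alive connection across several requests. A minimal sketch (the user-agent string and the second URL are only illustrative examples):
curl <- getCurlHandle(useragent = "Mozilla/5.0")   # options set once apply to every request
page1 <- getURL("http://www.baidu.com", curl = curl)
page2 <- getURL("http://www.baidu.com/duty/", curl = curl)   # illustrative second page
getCurlInfo(curl)$num.connects   # new connections the last request opened; 0 when reused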
#3) Set your own headers: masquerade as an iPhone running iOS (Mac OS X user agent)
myheader <- c(
"User-Agent"="Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="en-us",
"Connection"="keep-alive",
"Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
d <- debugGatherer()
tmp <- getURL(url = "http://www.baidu.com", httpheader=myheader, debugfunction=d$update, verbose=T)
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## User-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7
## Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
## Accept-Language: en-us
## Connection: keep-alive
## Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
##
# Other settable options: 172 in total
listCurlOptions()
## [1] "address.scope" "append"
## [3] "autoreferer" "buffersize"
## [5] "cainfo" "capath"
## [7] "certinfo" "closepolicy"
## [9] "connect.only" "connecttimeout"
## [11] "connecttimeout.ms" "conv.from.network.function"
## [13] "conv.from.utf8.function" "conv.to.network.function"
## [15] "cookie" "cookiefile"
## [17] "cookiejar" "cookielist"
## [19] "cookiesession" "copypostfields"
## [21] "crlf" "crlfile"
## [23] "customrequest" "debugdata"
## [25] "debugfunction" "dirlistonly"
## [27] "dns.cache.timeout" "dns.use.global.cache"
## [29] "egdsocket" "encoding"
## [31] "errorbuffer" "failonerror"
## [33] "file" "filetime"
## [35] "followlocation" "forbid.reuse"
## [37] "fresh.connect" "ftp.account"
## [39] "ftp.alternative.to.user" "ftp.create.missing.dirs"
## [41] "ftp.filemethod" "ftp.response.timeout"
## [43] "ftp.skip.pasv.ip" "ftp.ssl"
## [45] "ftp.ssl.ccc" "ftp.use.eprt"
## [47] "ftp.use.epsv" "ftpappend"
## [49] "ftplistonly" "ftpport"
## [51] "ftpsslauth" "header"
## [53] "headerfunction" "http.content.decoding"
## [55] "http.transfer.decoding" "http.version"
## [57] "http200aliases" "httpauth"
## [59] "httpget" "httpheader"
## [61] "httppost" "httpproxytunnel"
## [63] "ignore.content.length" "infile"
## [65] "infilesize" "infilesize.large"
## [67] "interface" "ioctldata"
## [69] "ioctlfunction" "ipresolve"
## [71] "issuercert" "keypasswd"
## [73] "krb4level" "krblevel"
## [75] "localport" "localportrange"
## [77] "low.speed.limit" "low.speed.time"
## [79] "max.recv.speed.large" "max.send.speed.large"
## [81] "maxconnects" "maxfilesize"
## [83] "maxfilesize.large" "maxredirs"
## [85] "netrc" "netrc.file"
## [87] "new.directory.perms" "new.file.perms"
## [89] "nobody" "noprogress"
## [91] "noproxy" "nosignal"
## [93] "opensocketdata" "opensocketfunction"
## [95] "password" "port"
## [97] "post" "post301"
## [99] "postfields" "postfieldsize"
## [101] "postfieldsize.large" "postquote"
## [103] "postredir" "prequote"
## [105] "private" "progressdata"
## [107] "progressfunction" "protocols"
## [109] "proxy" "proxy.transfer.mode"
## [111] "proxyauth" "proxypassword"
## [113] "proxyport" "proxytype"
## [115] "proxyusername" "proxyuserpwd"
## [117] "put" "quote"
## [119] "random.file" "range"
## [121] "readfunction" "redir.protocols"
## [123] "referer" "resume.from"
## [125] "resume.from.large" "seekdata"
## [127] "seekfunction" "share"
## [129] "sockoptdata" "sockoptfunction"
## [131] "socks5.gssapi.nec" "socks5.gssapi.service"
## [133] "ssh.auth.types" "ssh.host.public.key.md5"
## [135] "ssh.private.keyfile" "ssh.public.keyfile"
## [137] "ssl.cipher.list" "ssl.ctx.data"
## [139] "ssl.ctx.function" "ssl.sessionid.cache"
## [141] "ssl.verifyhost" "ssl.verifypeer"
## [143] "sslcert" "sslcertpasswd"
## [145] "sslcerttype" "sslengine"
## [147] "sslengine.default" "sslkey"
## [149] "sslkeypasswd" "sslkeytype"
## [151] "sslversion" "stderr"
## [153] "tcp.nodelay" "telnetoptions"
## [155] "tftp.blksize" "timecondition"
## [157] "timeout" "timeout.ms"
## [159] "timevalue" "transfertext"
## [161] "unrestricted.auth" "upload"
## [163] "url" "use.ssl"
## [165] "useragent" "username"
## [167] "userpwd" "verbose"
## [169] "writedata" "writefunction"
## [171] "writeheader" "writeinfo"
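A sketch combining a few of the options from the list above: follow redirects with a cap, time out slow requests, and turn on in-memory cookie handling:
txt <- getURL(url = "http://www.baidu.com",
              followlocation = TRUE,   # follow 3xx redirects
              maxredirs      = 5L,     # but give up after 5 hops
              timeout        = 10L,    # abort requests slower than 10 seconds
              cookiefile     = "")     # "" enables the cookie engine without a file on disk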
2) The getForm() function
# wd=rcurl means the search keyword is rcurl
url <- "http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&ch=&tn=SE_hldp02870_0v135xhf&bar=&wd=rcurl&rsv_spt=1&rsv_pq=a3ed162a0088df8f&rsv_t=43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q&rsv_enter=1&inputT=2119"
# Inspect the structure and values of the URL's query string
getFormParams(query=url)
## ie
## "utf-8"
## f
## "8"
## rsv_bp
## "1"
## rsv_idx
## "2"
## ch
## ""
## tn
## "SE_hldp02870_0v135xhf"
## bar
## ""
## wd
## "rcurl"
## rsv_spt
## "1"
## rsv_pq
## "a3ed162a0088df8f"
## rsv_t
## "43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q"
## rsv_enter
## "1"
## inputT
## "2119"
names(getFormParams(query=url))
## [1] "ie" "f" "rsv_bp" "rsv_idx" "ch"
## [6] "tn" "bar" "wd" "rsv_spt" "rsv_pq"
## [11] "rsv_t" "rsv_enter" "inputT"
tmp <- getForm(uri="http://www.baidu.com/s",
               ie="utf-8", f="8", rsv_bp="1", rsv_idx="2", ch="",
               tn="SE_hldp02870_0v135xhf", bar="", wd="rcurl",
               rsv_spt="1", rsv_pq="a3ed162a0088df8f",
               rsv_t="43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q",
               rsv_enter="1", inputT="2119")
# Note: getForm() can be flaky (reason unknown); sometimes it takes 2 or 3 runs to actually retrieve the page
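Given that flakiness, one workaround is to wrap getForm() in a retry loop. get_form_retry() below is a hypothetical helper, not part of RCurl:
# Hypothetical helper: retry getForm() until a non-empty page comes back
get_form_retry <- function(..., tries = 3) {
  for (i in seq_len(tries)) {
    res <- tryCatch(getForm(...), error = function(e) NULL)
    if (!is.null(res) && nchar(res) > 0) return(res)   # got a non-empty page
    Sys.sleep(1)   # brief pause before retrying
  }
  res   # result of the last attempt (possibly NULL)
}
tmp <- get_form_retry(uri = "http://www.baidu.com/s", ie = "utf-8", wd = "rcurl")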
3) getBinaryURL(): downloading a single file and batch-downloading files
#1) Download a single file
url <- "http://rfunction.com/code/1201/120103.R"
tmp <- getBinaryURL(url)
note <- file("E:\\新技术\\爬虫\\验证码/120103.R", open = "wb") # open the target file in binary write mode
writeBin(tmp, note)
close(note)
#2) Batch-download files
url <- "http://rfunction.com/code/1202/"
tmp <- RCurl::getURL(url, httpheader = myheader) # fetch the directory-listing page
tmp_files <- strsplit(x=tmp, split="<li><a href=\"")[[1]]      # split the page at each link entry
tmp_files1 <- strsplit(tmp_files, split="\"")                  # each file name ends at the next quote
tmp_files2 <- lapply(X=tmp_files1, function(file) {file[1]})   # keep only the file-name part
files <- unlist(tmp_files2)
files <- files[c(-1, -2)]   # drop the page preamble and the parent-directory link
baseURL <- "http://rfunction.com/code/1202/"
for(i in seq_along(files)){
  fullURL <- paste(baseURL, files[i], sep = "")
  tmp <- getBinaryURL(fullURL)
  note <- file(paste("E:\\新技术\\爬虫\\验证码/1202-", files[i], sep = ""), open = "wb")
  writeBin(tmp, note)
  close(note)
  Sys.sleep(0.002) # pause 0.002 seconds between downloads
}
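For bigger batches, a tryCatch() wrapper keeps a single failed download from aborting the whole loop; a sketch of the same loop, hardened:
# Same loop as above, but one failure no longer stops the batch
for(i in seq_along(files)){
  fullURL <- paste(baseURL, files[i], sep = "")
  tryCatch({
    tmp <- getBinaryURL(fullURL)
    note <- file(paste("E:\\新技术\\爬虫\\验证码/1202-", files[i], sep = ""), open = "wb")
    writeBin(tmp, note)
    close(note)
  }, error = function(e) message("failed: ", fullURL))
  Sys.sleep(0.002)
}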