1. Fetching a CAPTCHA image with RCurl
The RCurl package provides an R interface to the libcurl library, exposing a range of HTTP functionality: downloading files from a server, keeping connections alive, uploading files, reading binary content, following redirects, password authentication, and more.
What are curl & libcurl?
– curl: an open-source file-transfer tool that works from the command line using URL syntax
– libcurl: the library behind curl
Typical capabilities:
– fetching pages
– authentication
– uploading and downloading
– information retrieval
– …
First, load the package and fetch a CAPTCHA image using custom request headers:
library(RCurl)
## Loading required package: bitops
# Custom request headers for the CAPTCHA request
myHttpheader <- c(
  "Accept"="image/webp,*/*;q=0.8",
  "Accept-Encoding"="gzip,deflate,sdch",
  "Accept-Language"="zh-CN,zh;q=0.8",
  "Cache-Control"="no-cache",
  "Connection"="keep-alive",
  "Pragma"="no-cache",
  "Referer"="http://bbs.shangdu.com/s/regform.htm",
  "User-Agent"="Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36"
)
# Fetch the CAPTCHA image as raw binary data
png <- getBinaryURL("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ", httpheader=myHttpheader)
# Write the raw bytes to disk as a PNG file
writeBin(png, con = "E:\\新技术\\爬虫\\验证码/123.png")
# Optionally open the CAPTCHA URL in the default browser:
#shell.exec("http://push.shangdu.com/regcode.php?token=prrX73LtmN_63uEcKh0a2Z1opo1ZgdcBJ")
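Among the features listed above, password authentication takes only one extra option. A minimal sketch, assuming a placeholder URL and placeholder credentials rather than a real service (both userpwd and httpauth appear in the option list printed later in this section):
# Hypothetical example: the URL and "user:password" below are placeholders
txt <- getURL(url = "http://example.com/protected/",
              userpwd = "user:password",   # libcurl's userpwd option, "user:pass" form
              httpauth = 1L)               # 1L selects HTTP basic authentication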
2. The three main RCurl functions
1) The getURL() function
#1) Check whether a URL exists
url.exists(url = "www.baidu.com")
## [1] TRUE
# Collect debugging information
d <- debugGatherer()
# With verbose = TRUE, the values in d$value() accumulate across calls
tmp <- getURL(url="www.baidu.com", debugfunction=d$update, verbose=T)
names(d$value())
## [1] "text" "headerIn" "headerOut" "dataIn" "dataOut"
## [6] "sslDataIn" "sslDataOut"
# Connection messages: server address and port
cat(d$value()[1])
## Rebuilt URL to: www.baidu.com/
## Trying 180.149.132.151...
## Connected to www.baidu.com (180.149.132.151) port 80 (#0)
## Connection #0 to host www.baidu.com left intact
#print(d$value()[1])
# Header information returned by the server
cat(d$value()[2])
## HTTP/1.1 200 OK
## Date: Wed, 20 Apr 2016 08:04:09 GMT
## Content-Type: text/html
## Content-Length: 14613
## Last-Modified: Wed, 03 Sep 2014 02:48:32 GMT
## Connection: Keep-Alive
## Vary: Accept-Encoding
## Set-Cookie: BAIDUID=905A08993D710BA9389B6B3048107981:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: BIDUPSID=905A08993D710BA9389B6B3048107981; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## Set-Cookie: PSTM=1461139449; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
## P3P: CP=" OTI DSP COR IVA OUR IND COM "
## Server: BWS/1.1
## X-UA-Compatible: IE=Edge,chrome=1
## Pragma: no-cache
## Cache-control: no-cache
## Accept-Ranges: bytes
##
# Header information sent to the server
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## Accept: */*
##
# Clear d$value(); after the reset every slot is empty
d$reset()
d$value()
## text headerIn headerOut dataIn dataOut sslDataIn
## "" "" "" "" "" ""
## sslDataOut
## ""
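The update/value/reset triple makes debugGatherer() easy to wrap. As a sketch, request_headers() below is a hypothetical helper (not part of RCurl) that returns only the headers a request sends out:
# Hypothetical helper: collect just the outgoing request headers for a URL
request_headers <- function(u) {
  d <- debugGatherer()                 # fresh gatherer for each call
  invisible(getURL(url = u, debugfunction = d$update, verbose = TRUE))
  d$value()["headerOut"]               # the headers R sent to the server
}
request_headers("www.baidu.com")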
#2) Inspect the header information returned by the server, as a named vector
h <- basicHeaderGatherer()
txt <- getURL(url="http://www.baidu.com", headerfunction=h$update)
names(h$value())
## [1] "Date" "Content-Type" "Content-Length"
## [4] "Last-Modified" "Connection" "Vary"
## [7] "Set-Cookie" "Set-Cookie" "Set-Cookie"
## [10] "P3P" "Server" "X-UA-Compatible"
## [13] "Pragma" "Cache-control" "Accept-Ranges"
## [16] "status" "statusMessage"
h$value() # every element is just a character string
## Date
## "Wed, 20 Apr 2016 08:04:10 GMT"
## Content-Type
## "text/html"
## Content-Length
## "14613"
## Last-Modified
## "Wed, 03 Sep 2014 02:48:32 GMT"
## Connection
## "Keep-Alive"
## Vary
## "Accept-Encoding"
## Set-Cookie
## "BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## Set-Cookie
## "PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com"
## P3P
## "CP=\" OTI DSP COR IVA OUR IND COM \""
## Server
## "BWS/1.1"
## X-UA-Compatible
## "IE=Edge,chrome=1"
## Pragma
## "no-cache"
## Cache-control
## "no-cache"
## Accept-Ranges
## "bytes"
## status
## "200"
## statusMessage
## "OK"
cat(h$value()) # cat() gives a more readable display
## Wed, 20 Apr 2016 08:04:10 GMT text/html 14613 Wed, 03 Sep 2014 02:48:32 GMT Keep-Alive Accept-Encoding BAIDUID=48D8EBCEAFED4F0D6F78FAE967562852:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com BIDUPSID=48D8EBCEAFED4F0D6F78FAE967562852; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com PSTM=1461139450; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com CP=" OTI DSP COR IVA OUR IND COM " BWS/1.1 IE=Edge,chrome=1 no-cache no-cache bytes 200 OK
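Because h$value() exposes the HTTP status code as an ordinary string, it can gate further processing. A minimal sketch, reusing h and txt from the calls above:
# Only work with the page body when the request actually succeeded
if (h$value()[["status"]] == "200") {
  nchar(txt)   # e.g. measure the body; real code would parse it here
}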
# Inspect connection details for a URL request
curl <- getCurlHandle()
txt <- getURL(url="http://www.baidu.com", curl=curl)
names(getCurlInfo(curl))
## [1] "effective.url" "response.code"
## [3] "total.time" "namelookup.time"
## [5] "connect.time" "pretransfer.time"
## [7] "size.upload" "size.download"
## [9] "speed.download" "speed.upload"
## [11] "header.size" "request.size"
## [13] "ssl.verifyresult" "filetime"
## [15] "content.length.download" "content.length.upload"
## [17] "starttransfer.time" "content.type"
## [19] "redirect.time" "redirect.count"
## [21] "private" "http.connectcode"
## [23] "httpauth.avail" "proxyauth.avail"
## [25] "os.errno" "num.connects"
## [27] "ssl.engines" "cookielist"
## [29] "lastsocket" "ftp.entry.path"
## [31] "redirect.url" "primary.ip"
## [33] "appconnect.time" "certinfo"
## [35] "condition.unmet"
getCurlInfo(curl)$response.code
## [1] 200
getCurlInfo(curl=curl)
## $effective.url
## [1] "http://www.baidu.com/"
##
## $response.code
## [1] 200
##
## $total.time
## [1] 0.063
##
## $namelookup.time
## [1] 0
##
## $connect.time
## [1] 0.016
##
## $pretransfer.time
## [1] 0.016
##
## $size.upload
## [1] 0
##
## $size.download
## [1] 14613
##
## $speed.download
## [1] 231952
##
## $speed.upload
## [1] 0
##
## $header.size
## [1] 750
##
## $request.size
## [1] 52
##
## $ssl.verifyresult
## [1] 0
##
## $filetime
## [1] -1
##
## $content.length.download
## [1] 14613
##
## $content.length.upload
## [1] -1
##
## $starttransfer.time
## [1] 0.032
##
## $content.type
## [1] "text/html"
##
## $redirect.time
## [1] 0
##
## $redirect.count
## [1] 0
##
## $private
## NULL
##
## $http.connectcode
## [1] 0
##
## $httpauth.avail
## [1] 0
##
## $proxyauth.avail
## [1] 0
##
## $os.errno
## [1] 0
##
## $num.connects
## [1] 1
##
## $ssl.engines
## [1] "dynamic"
##
## $cookielist
## character(0)
##
## $lastsocket
## [1] 652
##
## $ftp.entry.path
## NULL
##
## $redirect.url
## NULL
##
## $primary.ip
## [1] "180.149.132.151"
##
## $appconnect.time
## [1] 0
##
## $certinfo
## list()
##
## $condition.unmet
## [1] 0
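A handle is also the natural place to share options and reuse the keep-alive connection across several requests. A minimal sketch (the user-agent string and the second URL are only illustrative examples):
curl <- getCurlHandle(useragent = "Mozilla/5.0")   # options set once apply to every request
page1 <- getURL("http://www.baidu.com", curl = curl)
page2 <- getURL("http://www.baidu.com/duty/", curl = curl)   # illustrative second page
getCurlInfo(curl)$num.connects   # new connections the last request opened; 0 when reused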
#3) Set your own headers: masquerade as an iPhone running iOS (Mac OS X user agent)
myheader <- c(
"User-Agent"="Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="en-us",
"Connection"="keep-alive",
"Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
d <- debugGatherer()
tmp <- getURL(url = "http://www.baidu.com", httpheader=myheader, debugfunction=d$update, verbose=T)
cat(d$value()[3])
## GET / HTTP/1.1
## Host: www.baidu.com
## User-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_0_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A306 Safari/6531.22.7
## Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
## Accept-Language: en-us
## Connection: keep-alive
## Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7
##
# Other settable options: 172 in total
listCurlOptions()
## [1] "address.scope" "append"
## [3] "autoreferer" "buffersize"
## [5] "cainfo" "capath"
## [7] "certinfo" "closepolicy"
## [9] "connect.only" "connecttimeout"
## [11] "connecttimeout.ms" "conv.from.network.function"
## [13] "conv.from.utf8.function" "conv.to.network.function"
## [15] "cookie" "cookiefile"
## [17] "cookiejar" "cookielist"
## [19] "cookiesession" "copypostfields"
## [21] "crlf" "crlfile"
## [23] "customrequest" "debugdata"
## [25] "debugfunction" "dirlistonly"
## [27] "dns.cache.timeout" "dns.use.global.cache"
## [29] "egdsocket" "encoding"
## [31] "errorbuffer" "failonerror"
## [33] "file" "filetime"
## [35] "followlocation" "forbid.reuse"
## [37] "fresh.connect" "ftp.account"
## [39] "ftp.alternative.to.user" "ftp.create.missing.dirs"
## [41] "ftp.filemethod" "ftp.response.timeout"
## [43] "ftp.skip.pasv.ip" "ftp.ssl"
## [45] "ftp.ssl.ccc" "ftp.use.eprt"
## [47] "ftp.use.epsv" "ftpappend"
## [49] "ftplistonly" "ftpport"
## [51] "ftpsslauth" "header"
## [53] "headerfunction" "http.content.decoding"
## [55] "http.transfer.decoding" "http.version"
## [57] "http200aliases" "httpauth"
## [59] "httpget" "httpheader"
## [61] "httppost" "httpproxytunnel"
## [63] "ignore.content.length" "infile"
## [65] "infilesize" "infilesize.large"
## [67] "interface" "ioctldata"
## [69] "ioctlfunction" "ipresolve"
## [71] "issuercert" "keypasswd"
## [73] "krb4level" "krblevel"
## [75] "localport" "localportrange"
## [77] "low.speed.limit" "low.speed.time"
## [79] "max.recv.speed.large" "max.send.speed.large"
## [81] "maxconnects" "maxfilesize"
## [83] "maxfilesize.large" "maxredirs"
## [85] "netrc" "netrc.file"
## [87] "new.directory.perms" "new.file.perms"
## [89] "nobody" "noprogress"
## [91] "noproxy" "nosignal"
## [93] "opensocketdata" "opensocketfunction"
## [95] "password" "port"
## [97] "post" "post301"
## [99] "postfields" "postfieldsize"
## [101] "postfieldsize.large" "postquote"
## [103] "postredir" "prequote"
## [105] "private" "progressdata"
## [107] "progressfunction" "protocols"
## [109] "proxy" "proxy.transfer.mode"
## [111] "proxyauth" "proxypassword"
## [113] "proxyport" "proxytype"
## [115] "proxyusername" "proxyuserpwd"
## [117] "put" "quote"
## [119] "random.file" "range"
## [121] "readfunction" "redir.protocols"
## [123] "referer" "resume.from"
## [125] "resume.from.large" "seekdata"
## [127] "seekfunction" "share"
## [129] "sockoptdata" "sockoptfunction"
## [131] "socks5.gssapi.nec" "socks5.gssapi.service"
## [133] "ssh.auth.types" "ssh.host.public.key.md5"
## [135] "ssh.private.keyfile" "ssh.public.keyfile"
## [137] "ssl.cipher.list" "ssl.ctx.data"
## [139] "ssl.ctx.function" "ssl.sessionid.cache"
## [141] "ssl.verifyhost" "ssl.verifypeer"
## [143] "sslcert" "sslcertpasswd"
## [145] "sslcerttype" "sslengine"
## [147] "sslengine.default" "sslkey"
## [149] "sslkeypasswd" "sslkeytype"
## [151] "sslversion" "stderr"
## [153] "tcp.nodelay" "telnetoptions"
## [155] "tftp.blksize" "timecondition"
## [157] "timeout" "timeout.ms"
## [159] "timevalue" "transfertext"
## [161] "unrestricted.auth" "upload"
## [163] "url" "use.ssl"
## [165] "useragent" "username"
## [167] "userpwd" "verbose"
## [169] "writedata" "writefunction"
## [171] "writeheader" "writeinfo"
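A sketch combining a few of the options from the list above: follow redirects with a cap, time out slow requests, and turn on in-memory cookie handling:
txt <- getURL(url = "http://www.baidu.com",
              followlocation = TRUE,   # follow 3xx redirects
              maxredirs      = 5L,     # but give up after 5 hops
              timeout        = 10L,    # abort requests slower than 10 seconds
              cookiefile     = "")     # "" enables the cookie engine without a file on disk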
2) The getForm() function
# wd=rcurl means the search keyword is rcurl
url <- "http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&ch=&tn=SE_hldp02870_0v135xhf&bar=&wd=rcurl&rsv_spt=1&rsv_pq=a3ed162a0088df8f&rsv_t=43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q&rsv_enter=1&inputT=2119"
# Inspect the structure and values of the URL's query string
getFormParams(query=url)
## ie
## "utf-8"
## f
## "8"
## rsv_bp
## "1"
## rsv_idx
## "2"
## ch
## ""
## tn
## "SE_hldp02870_0v135xhf"
## bar
## ""
## wd
## "rcurl"
## rsv_spt
## "1"
## rsv_pq
## "a3ed162a0088df8f"
## rsv_t
## "43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q"
## rsv_enter
## "1"
## inputT
## "2119"
names(getFormParams(query=url))
## [1] "ie" "f" "rsv_bp" "rsv_idx" "ch"
## [6] "tn" "bar" "wd" "rsv_spt" "rsv_pq"
## [11] "rsv_t" "rsv_enter" "inputT"
tmp <- getForm(uri="http://www.baidu.com/s",
               ie="utf-8", f="8", rsv_bp="1", rsv_idx="2", ch="",
               tn="SE_hldp02870_0v135xhf", bar="", wd="rcurl",
               rsv_spt="1", rsv_pq="a3ed162a0088df8f",
               rsv_t="43d18gWNyd6HWpqDiKov7Dm548s4HY4cgcJlXc8ujpzRW9Okec2aOb5screzftZo5DJ60Cp7aILvRK2Q",
               rsv_enter="1", inputT="2119")
# Note: getForm() can be flaky (reason unknown); sometimes it takes 2 or 3 runs to actually retrieve the page
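Given that flakiness, one workaround is to wrap getForm() in a retry loop. get_form_retry() below is a hypothetical helper, not part of RCurl:
# Hypothetical helper: retry getForm() until a non-empty page comes back
get_form_retry <- function(..., tries = 3) {
  for (i in seq_len(tries)) {
    res <- tryCatch(getForm(...), error = function(e) NULL)
    if (!is.null(res) && nchar(res) > 0) return(res)   # got a non-empty page
    Sys.sleep(1)   # brief pause before retrying
  }
  res   # result of the last attempt (possibly NULL)
}
tmp <- get_form_retry(uri = "http://www.baidu.com/s", ie = "utf-8", wd = "rcurl")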
3) getBinaryURL(): downloading a single file and batch-downloading files
#1) Download a single file
url <- "http://rfunction.com/code/1201/120103.R"
tmp <- getBinaryURL(url)
note <- file("E:\\新技术\\爬虫\\验证码/120103.R", open = "wb") # open the target file in binary write mode
writeBin(tmp, note)
close(note)
#2) Batch-download files
url <- "http://rfunction.com/code/1202/"
tmp <- RCurl::getURL(url, httpheader = myheader) # fetch the directory-listing page
tmp_files <- strsplit(x=tmp, split="<li><a href=\"")[[1]]      # split the page at each link entry
tmp_files1 <- strsplit(tmp_files, split="\"")                  # each file name ends at the next quote
tmp_files2 <- lapply(X=tmp_files1, function(file) {file[1]})   # keep only the file-name part
files <- unlist(tmp_files2)
files <- files[c(-1, -2)]   # drop the page preamble and the parent-directory link
baseURL <- "http://rfunction.com/code/1202/"
for(i in seq_along(files)){
  fullURL <- paste(baseURL, files[i], sep = "")
  tmp <- getBinaryURL(fullURL)
  note <- file(paste("E:\\新技术\\爬虫\\验证码/1202-", files[i], sep = ""), open = "wb")
  writeBin(tmp, note)
  close(note)
  Sys.sleep(0.002) # pause 0.002 seconds between downloads
}
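For bigger batches, a tryCatch() wrapper keeps a single failed download from aborting the whole loop; a sketch of the same loop, hardened:
# Same loop as above, but one failure no longer stops the batch
for(i in seq_along(files)){
  fullURL <- paste(baseURL, files[i], sep = "")
  tryCatch({
    tmp <- getBinaryURL(fullURL)
    note <- file(paste("E:\\新技术\\爬虫\\验证码/1202-", files[i], sep = ""), open = "wb")
    writeBin(tmp, note)
    close(note)
  }, error = function(e) message("failed: ", fullURL))
  Sys.sleep(0.002)
}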