普通网页或图片等Web资源文件,通过Source.fromURL即可下载;个别网站对HTTP请求有特定要求(例如需要设置User-Agent等请求头),此时需要用到java.net网络库。
获取到网页文档(如HTML)后,可通过正则表达式提取需要的数据。
示例代码,网络文档下载:
package service.spider
/**
*/
import java.net.URL
import java.net.URLEncoder
import java.net.URLConnection
import java.net.HttpURLConnection
import java.io.InputStreamReader
import java.io.StringReader
import scala.io.Source
/*
scala web html loader
*/
object HtmlDocLoader {

  /** Browser-like User-Agent sent by [[getPageByURLConnetion]]. */
  private val UserAgent =
    "Mozilla/5.0 (Windows NT 5.2; rv:40.0) Gecko/20100101 Firefox/40.0"

  /**
    * Downloads the resource at `url` through `Source.fromURL` and returns its
    * whole content as a single string.
    *
    * The downloaded text is also echoed to stdout, prefixed with `code:`.
    *
    * @param url    address of the document, passed to `Source.fromURL` unchanged
    * @param encode charset name used to decode the response, e.g. "UTF-8"
    * @return the decoded document text
    */
  def getPage(url: String, encode: String): String = {
    val source = Source.fromURL(url)(encode)
    // Always release the underlying stream, even when decoding fails.
    val htmldoc = try source.mkString finally source.close()
    println("code:" + htmldoc)
    htmldoc
  }

  /**
    * Downloads the document at `url` through an `HttpURLConnection`, sending a
    * browser-like User-Agent header, and decodes the body with `encode`
    * (e.g. "UTF-8" or "GBK").
    *
    * Progress and errors are reported on stdout.
    *
    * @param url    HTTP or HTTPS address of the document
    * @param encode charset name used to decode the response body
    * @return the decoded body when the server answers 200, otherwise an empty string
    * @throws IllegalArgumentException if `url` does not open an HTTP(S) connection
    */
  def getPageByURLConnetion(url: String, encode: String): String = {
    val http = new URL(url).openConnection() match {
      case c: HttpURLConnection => c
      case _ =>
        throw new IllegalArgumentException("not an HTTP(S) URL: " + url)
    }
    try {
      http.setRequestProperty("Content-Type", "text/html; charset=GBK")
      http.setRequestProperty("User-Agent", UserAgent)
      http.connect()
      val status = http.getResponseCode
      if (status == 200) {
        val html = readAll(http.getInputStream, encode)
        println()
        println("get html doc ,OK")
        html
      } else {
        println("ERROR:" + status + http.getResponseMessage)
        ""
      }
    } finally {
      // Release the connection on success, on a non-200 answer and on failure.
      http.disconnect()
    }
  }

  /** Reads `in` to the end, decoding it with charset `encode`, then closes it. */
  private def readAll(in: java.io.InputStream, encode: String): String = {
    val reader = new InputStreamReader(in, encode)
    try {
      // Read in chunks rather than one char per call.
      val chunk = new Array[Char](4096)
      val sb = new StringBuilder
      var n = reader.read(chunk)
      while (n != -1) {
        sb.appendAll(chunk, 0, n)
        n = reader.read(chunk)
      }
      sb.toString
    } finally {
      reader.close()
    }
  }

  /** Demo entry point: fetches a sample GBK-encoded page. */
  def main(args: Array[String]): Unit = {
    getPageByURLConnetion("http://s.XXXX.com/company/company_search.htm", "GBK")
  }
}
HTML文档数据提取:
package service.spider
/**
* 通过 scala 正则表达式分析Html文本
* 分析结果通过RESTful API传递给客户端APP
*
* Created by Administrator on 15-11-18.
*/
import scala.List
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
object AliAdSpiderService {
  val urlpath: String = ""

  /**
    * Matches the text between `<title>` and `</title>`.
    * `(?i)` accepts upper-case tags and `(?s)` lets the title span several lines.
    */
  private val TitleRegex: Regex = "(?is)(?<=<title>)(.*?)(?=</title>)".r

  /** Returns the text of the first `<title>` element in `htmlContent`, if any. */
  private def extractTitle(htmlContent: String): Option[String] =
    TitleRegex.findFirstIn(htmlContent)

  /**
    * Downloads the page at `url` (decoded as GBK) and extracts data from it
    * with regular expressions. Currently only the page title is extracted.
    *
    * The raw document and the title lookup result are echoed to stdout.
    *
    * @param url address of the page to download
    * @return a buffer holding the page title, or an empty buffer when the
    *         document has no title
    */
  def docLisParse(url: String): ListBuffer[String] = {
    // Fetch the HTML document.
    val htmlContent = HtmlDocLoader.getPageByURLConnetion(url, "GBK")
    println(htmlContent)

    // Look up the title once.
    val html_title = extractTitle(htmlContent)
    println(html_title)

    // Collect the extracted values. This must be a `val`: a `def` would build
    // a fresh buffer on every reference and lose the appended title.
    val list = ListBuffer.empty[String]
    html_title.foreach(title => list += title)
    list
  }

  /**
    * Placeholder for advertisement-link extraction.
    *
    * @param htmlContent HTML text to analyse (currently unused)
    * @return always an empty buffer
    */
  def adLinkParse(htmlContent: String): ListBuffer[String] =
    ListBuffer.empty[String]

  /** Demo entry point: parses a sample page. */
  def main(args: Array[String]): Unit = {
    docLisParse("http://xxx.com/company/company_search.htm")
  }
}
更进一步,如需要采集更精细的信息(如邮箱、电话、时间等),建议独立开发一个HTML文档解析器,或使用其它第三方工具库。