Implementing a Simple Crawler in Scala

package som.fun.crawl
 
import scala.concurrent._
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL
import java.net.HttpURLConnection
import scala.collection.JavaConversions._
import java.io.ByteArrayOutputStream
import java.util.concurrent.CountDownLatch
import java.util.HashSet
import java.io.PrintStream
import java.io.FileOutputStream
import java.util.regex.Pattern
import scala.collection.mutable.ArrayBuffer
 
object CrawlerTest extends App {
 
  new Crawler("http://www.qq.com", filter = (url: String) => url.contains(".qq.com")).crawl()
 
}
 
/**
 * @param startPage the page the crawler starts crawling from
 * @param filter only URLs that satisfy this predicate are crawled
 * @param onDataLoaded callback invoked after each download completes, with the
 *        URL, HTTP status, response body and response headers
 */
class Crawler(startPage: String,
    filter: (String => Boolean) = (url: String) => true,
    onDataLoaded: (String, Int, Array[Byte], Map[String, String]) => Any =
      (url: String, status: Int, data: Array[Byte], headers: Map[String, String]) => println(s"download $url done")) {

  private val latch = new CountDownLatch(1)
  // matches src="..." / href="..." attributes, quoted with either " or '
  private val linkRegex = """(src|href)="([^"]+)"|(src|href)='([^']+)'""".trim.r
  // triple quotes keep \b as a regex word boundary instead of a backspace escape
  private val htmlTypeRegex = """\btext/html\b"""
  private val crawledPool = new HashSet[String]
 
  def crawl(): Unit = {
    crawlPageLinks(startPage, new String(get(startPage)._2))
    // the latch is never counted down, so the crawler runs until the process is killed
    latch.await()
  }
 
  private def crawlPageLinks(pageUrl: String, pageContent: String): Unit = {
    val links = parseCrawlLinks(pageUrl, pageContent)
    links.foreach {
      link =>
        val future = Future(get(link))
        future.onSuccess {
          // recurse only into pages whose Content-Type says they are HTML
          case data if isTextPage(data._3) =>
            crawlPageLinks(link, new String(data._2))
        }
        future.onFailure {
          case e: Exception =>
            println(s"visit $link error!")
            e.printStackTrace()
        }
    }
  }
 
  private def getFullUrl(parentUrl: String, link: String) = {
    val baseHost = getHostBase(parentUrl)
    link match {
      // root-relative link: prepend protocol://host[:port]
      case link if link.startsWith("/") => baseHost + link
      // already absolute
      case link if link.startsWith("http:") || link.startsWith("https:") => link
      // relative link: resolve against the parent page's directory
      case _ =>
        val index = parentUrl.lastIndexOf("/")
        parentUrl.substring(0, index) + "/" + link
    }
  }
 
  private def parseCrawlLinks(parentUrl: String, html: String) = {
    val links = fetchLinks(html).map {
      link => getFullUrl(parentUrl, link)
    }.filter {
      link => !crawledPool.contains(link) && this.filter(link)
    }
    println("find " + links.size + " links at page " + parentUrl)
    links
  }
  def get(url: String) = {
    val uri = new URL(url)
    val conn = uri.openConnection().asInstanceOf[HttpURLConnection]
    conn.setConnectTimeout(100000)
    conn.setReadTimeout(1000000)
    // read the whole response body into a byte array
    val stream = conn.getInputStream()
    val buf = Array.fill[Byte](1024)(0)
    var len = stream.read(buf)
    val out = new ByteArrayOutputStream
    while (len > -1) {
      out.write(buf, 0, len)
      len = stream.read(buf)
    }

    val data = out.toByteArray()
    val status = conn.getResponseCode()

    // JavaConversions turns the Java header map into a Scala Map;
    // multi-valued headers are joined with commas
    val headers = conn.getHeaderFields().toMap.map {
      head => (head._1, head._2.mkString(","))
    }
    conn.disconnect()
    crawledPool.add(url)
    this.onDataLoaded(url, status, data, headers)
    (status, data, headers)
  }
  private def fetchLinks(html: String) = {
    // group 2 holds double-quoted attribute values, group 4 single-quoted ones
    val list = for (m <- linkRegex.findAllIn(html).matchData if (m.group(1) != null || m.group(3) != null)) yield {
      if (m.group(1) != null) m.group(2) else m.group(4)
    }
    list.filter {
      link => !link.startsWith("#") && !link.startsWith("javascript:") && link != "" && !link.startsWith("mailto:")
    }.toSet
  }
 
  private def getHostBase(url: String) = {
    val uri = new URL(url)
    val portPart = if (uri.getPort() == -1 || uri.getPort() == 80) "" else ":" + uri.getPort()
    uri.getProtocol() + "://" + uri.getHost() + portPart
  }
 
  private def isTextPage(headers: Map[String, String]) = {
    val contentType = if (headers contains "Content-Type") headers("Content-Type") else null
    contentType match {
      case null => false
      case contentType if contentType.isEmpty => false
      case contentType if Pattern.compile(htmlTypeRegex).matcher(contentType).find => true
      case _ => false
    }
  }
 
}
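
The default onDataLoaded callback only prints a line per downloaded page. As a minimal sketch of a custom handler (the saveTo directory and the fileNameFor helper below are illustrative names of my own, not part of the original code), crawled HTML pages could instead be written to disk through the same (url, status, data, headers) signature the Crawler expects:

package som.fun.crawl

import java.io.FileOutputStream
import java.net.URLEncoder

object SavePagesTest extends App {

  // hypothetical output directory; change as needed
  val saveTo = "/tmp/crawl"
  new java.io.File(saveTo).mkdirs()

  // derive a flat, filesystem-safe file name from the URL (illustrative helper)
  def fileNameFor(url: String) = URLEncoder.encode(url, "UTF-8") + ".html"

  val saveHandler = (url: String, status: Int, data: Array[Byte], headers: Map[String, String]) => {
    if (status == 200) {
      val out = new FileOutputStream(s"$saveTo/${fileNameFor(url)}")
      try out.write(data) finally out.close()
      println(s"saved $url (${data.length} bytes)")
    } else {
      println(s"skipped $url, status $status")
    }
  }

  new Crawler("http://www.qq.com",
    filter = (url: String) => url.contains(".qq.com"),
    onDataLoaded = saveHandler).crawl()
}

Note that onDataLoaded is invoked from multiple Future threads, so a real handler would need to be thread-safe; this sketch gets away with it because each call writes to its own file.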