A simple web crawler in Scala



A simple web crawler implemented in Scala in under 100 lines of code, with a configurable crawl filter, a callback invoked after each download, and so on.
The code is on GitHub:
https://github.com/rock117/simple-crawler
A few small details are still unfinished; I will polish them when I have time.
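
For example, here is a minimal usage sketch (the /tmp output directory and the file-naming scheme are only illustrations) that crawls *.qq.com pages and saves each downloaded response to disk through the onDataLoaded callback:

import java.nio.file.{Files, Paths}
import som.fun.crawl.Crawler

object SaveToDiskExample extends App {
  new Crawler(
    startPage = "http://www.qq.com",
    filter = (url: String) => url.contains(".qq.com"),
    onDataLoaded = (url: String, status: Int, data: Array[Byte], headers: Map[String, String]) => {
      // Illustrative naming scheme: flatten the URL into a safe file name.
      val fileName = url.replaceAll("[^a-zA-Z0-9]", "_")
      Files.write(Paths.get("/tmp", fileName), data)
      println(s"saved $url (status $status, ${data.length} bytes)")
    }).crawl
}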
The crawler code:
    
package som.fun.crawl
 
import scala.concurrent._
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL
import java.net.HttpURLConnection
import scala.collection.JavaConversions._
import java.io.ByteArrayOutputStream
import java.util.concurrent.CountDownLatch
import java.util.HashSet
import java.io.PrintStream
import java.io.FileOutputStream
import java.util.regex.Pattern
import scala.collection.mutable.ArrayBuffer
 
object CrawlerTest extends App {
 
new Crawler("http://www.qq.com", filter = (url: String) => url.contains(".qq.com")).crawl
 
}
 
/**
 * @param startPage the page the crawler starts crawling from
 * @param filter only URLs accepted by this predicate are crawled
 * @param onDataLoaded callback invoked after a page has been downloaded
 */
class Crawler(startPage: String,
  filter: (String => Boolean) = (url: String) => true,
  onDataLoaded: (String, Int, Array[Byte], Map[String, String]) => Any = (url: String, status: Int, data: Array[Byte], headers: Map[String, String]) => { println(s"download $url done") }) {
  private val latch = new CountDownLatch(1)
  private val linkRegex = """ (src|href)="([^"]+)"|(src|href)='([^']+)' """.trim.r
  private val htmlTypeRegex = "\\btext/html\\b" // in a plain string literal "\b" is a backspace, so the word-boundary backslashes must be escaped
  private val crawledPool = new HashSet[String] // URLs already fetched; note it is shared by the futures without synchronization
 
  def crawl {
    crawlPageLinks(startPage, new String(get(startPage)._2))
    // The latch is never counted down, so this blocks indefinitely,
    // which keeps the JVM alive while the asynchronous futures run.
    latch.await()
  }
 
  private def crawlPageLinks(pageUrl: String, pageContent: String) {
    val links = parseCrawlLinks(pageUrl, pageContent)
    links.foreach {
      link =>
        val future = Future(get(link))
        future.onSuccess {
          case data if isTextPage(data._3) =>
            crawlPageLinks(link, new String(data._2))
        }
        future.onFailure {
          case e: Exception =>
            println(s"visit $link error!")
            e.printStackTrace
        }
    }
  }
 
  private def getFullUrl(parentUrl: String, link: String) = {
    val baseHost = getHostBase(parentUrl)
    link match {
      case link if link.startsWith("/") => baseHost + link
      case link if link.startsWith("http:") || link.startsWith("https:") => link
      case _ =>
        val index = parentUrl.lastIndexOf("/")
        parentUrl.substring(0, index) + "/" + link
    }
  }
 
  private def parseCrawlLinks(parentUrl: String, html: String) = {
    // Resolve every link on the page to an absolute URL, then drop URLs that
    // were already crawled or that the user-supplied filter rejects.
    val links = fetchLinks(html).map {
      link => getFullUrl(parentUrl, link)
    }.filter {
      link => !crawledPool.contains(link) && this.filter(link)
    }
    println("find " + links.size + " links at page " + parentUrl)
    links
  }
  // Download the given URL and return (status code, body bytes, response headers).
  // Note that the whole body is buffered in memory before onDataLoaded is invoked.
  def get(url: String) = {
    val uri = new URL(url)
    val conn = uri.openConnection().asInstanceOf[HttpURLConnection]
    conn.setConnectTimeout(100000)
    conn.setReadTimeout(1000000)
    val stream = conn.getInputStream()
    val buf = Array.fill[Byte](1024)(0)
    var len = stream.read(buf)
    val out = new ByteArrayOutputStream
    while (len > -1) {
      out.write(buf, 0, len)
      len = stream.read(buf)
    }

    val data = out.toByteArray()
    val status = conn.getResponseCode()

    // Flatten multi-valued response headers into comma-separated strings.
    val headers = conn.getHeaderFields().toMap.map {
      head => (head._1, head._2.mkString(","))
    }
    conn.disconnect
    crawledPool.add(url)
    this.onDataLoaded(url, status, data, headers)
    (status, data, headers)
  }
  private def fetchLinks(html: String) = {
    val list = for (m <- linkRegex.findAllIn(html).matchData if (m.group(1) != null || m.group(3) != null)) yield {
      if (m.group(1) != null) m.group(2) else m.group(4)
    }
    list.filter {
      link => !link.startsWith("#") && !link.startsWith("javascript:") && link != "" && !link.startsWith("mailto:")
    }.toSet
 
  }
 
  private def getHostBase(url: String) = {
    val uri = new URL(url)
    val portPart = if (uri.getPort() == -1 || uri.getPort() == 80) "" else ":" + uri.getPort()
    uri.getProtocol() + "://" + uri.getHost() + portPart
  }
 
  private def isTextPage(headers: Map[String, String]) = {
    // Only HTML responses (Content-Type matching text/html) are crawled recursively.
    headers.get("Content-Type") match {
      case Some(contentType) if Pattern.compile(htmlTypeRegex).matcher(contentType).find => true
      case _ => false
    }
  }
 

}


Suggested improvement:


When there are many tasks or the downloaded files are large, the crawler may run out of memory, because onDataLoaded is only invoked after a file has been completely downloaded and buffered in memory. One way to improve this is to replace it with three callbacks, as sketched after this list:
1. beforeDataLoaded
    Invoked once the connection is established (the response headers can already be read, but the body has not started downloading). This stage can do initialization, such as creating a file and opening an output stream.
2. onDataReceived
    Invoked each time a chunk of data arrives (every read of the input stream into the buffer triggers one callback). This stage can write the received bytes to the file.
3. afterDataLoaded
    Invoked when the connection is closed. This stage can do cleanup work, such as closing the file handle.
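
Below is a rough sketch of what a streaming download routine could look like with these three callbacks. The class name and the callback signatures are my own assumptions, not part of the original code:

import java.net.{HttpURLConnection, URL}
import scala.collection.JavaConversions._

// Hypothetical streaming downloader: hands the body to the caller chunk by chunk
// instead of buffering it in memory.
class StreamingDownloader(
  beforeDataLoaded: (String, Int, Map[String, String]) => Any = (url, status, headers) => (),
  onDataReceived: (String, Array[Byte], Int) => Any = (url, chunk, len) => (),
  afterDataLoaded: String => Any = url => ()) {

  def get(url: String): (Int, Map[String, String]) = {
    val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection]
    val status = conn.getResponseCode()
    val headers = conn.getHeaderFields().toMap.map(h => (h._1, h._2.mkString(",")))
    // Headers are known but the body has not been read yet: the caller can open a file here.
    beforeDataLoaded(url, status, headers)
    val stream = conn.getInputStream()
    val buf = new Array[Byte](1024)
    var len = stream.read(buf)
    while (len > -1) {
      // Pass each chunk to the caller as soon as it is read.
      onDataReceived(url, buf, len)
      len = stream.read(buf)
    }
    conn.disconnect()
    // The connection is done: the caller can flush and close its file handle here.
    afterDataLoaded(url)
    (status, headers)
  }
}

With this shape, a file-saving caller would open a FileOutputStream in beforeDataLoaded, write each chunk in onDataReceived, and close the stream in afterDataLoaded.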
