网页内容的抓取使用了Scala标准库的Source,网页内容的提取使用了正则表达式。中间练习了集合类的一些操作和文件读写操作,对正则表达式也重温了一下,特别是对跨行匹配(多行匹配)花了一些时间研究。提取后的文本写在了文件中。在Ubuntu 10.04 和 Scala 2.10.0 下运行成功。
代码如下:
import
java.io._
/** Parses the index page HTML and collects every chapter link.
  *
  * Matches anchors of the form
  *   <a target="_blank" href="xxx.html" title="..." >chapter title</a>
  * where group(1) is the page URL and group(2) is the link text.
  *
  * @return List[(url: String, title: String)]
  */
def getIndex() = {
  val indexRegex = """<a target="_blank" href="(.+\.html)" title=".+" >(.+)</a>""".r
  // NOTE(review): `indexSource` (the raw HTML of the index page) is defined
  // elsewhere in the script — confirm it is in scope before this is called.
  // Fix: the original folded with `/:` but the fold lambda had an empty body,
  // so nothing was ever accumulated; map the matches to (url, title) directly.
  indexRegex.findAllMatchIn(indexSource).map(m => (m.group(1), m.group(2))).toList
}
/** Downloads one article page and extracts its plain-text body.
  *
  * The article body sits between <div class="artcontent"> and
  * <div id="zhanwei">; `(?s)` makes `.` span newlines so the match
  * works across the whole page.
  *
  * @param url page to fetch
  * @return Some(plain text) when the markers are found, None otherwise
  */
def getContent(url: String) = {
  // Fix: the original never closed the Source (leaked the underlying stream).
  val source = scala.io.Source.fromURL(url)
  val raw = try source.mkString finally source.close()
  val reg = """(?s).*<div class="artcontent">(.*)<div id="zhanwei">.*""".r
  reg.findFirstMatchIn(raw).map { m =>
    m.group(1)
      .replaceAll("<br>", "\r\n")        // keep paragraph breaks as CRLF
      .replaceAll("""(?s)</?.*?>""", "") // strip every remaining HTML tag
      .replaceAll("""^\s+""", "")        // trim leading whitespace
      // NOTE(review): this literal was garbled by extraction; presumably the
      // "&nbsp;" entity (or U+00A0) was replaced by a normal space — confirm.
      .replaceAll("&nbsp;", " ")
  }
}
/** Writes the extracted article text to "<title>.txt".
  *
  * Prints a notice either way: "Write ..." on success, "Not write ..."
  * when extraction produced no content (None).
  *
  * @param content article text from getContent, or None if extraction failed
  * @param title   file name stem; ".txt" is appended
  */
def writeContent(content: Option[String], title: String) {
  // Idiom: pattern match instead of isEmpty + .get on the Option.
  content match {
    case None =>
      println("Not write " + title + ".txt")
    case Some(text) =>
      val writer = new PrintWriter(new File(title + ".txt"))
      // Fix: the original leaked the writer if `write` threw; close in finally.
      try writer write text
      finally writer.close()
      println("Write " + title + ".txt")
  }
}
/** Drives the whole scrape: for every (url, title) pair on the index
  * page, fetch the article and write it to "<title>.txt".
  */
def getIt() {
  for ((url, title) <- getIndex())
    writeContent(getContent(url), title)
}
getIt()