网页内容的抓取使用了Scala标准库的Source,网页内容的提取使用了正则表达式。中间练习了集合类的一些操作和文件读写操作,对正则表达式也重温了一下,特别是对跨行匹配(多行匹配)花了一些时间研究。提取后的文本写在了文件中。在Ubuntu 10.04 和 Scala 2.10.0 下运行成功。
代码如下:
import
java.io._
/** Parses the index page HTML and collects every chapter link.
  *
  * Matches anchors of the form
  *   <a target="_blank" href="xxx.html" title="..." >chapter title</a>
  * where group(1) is the page URL and group(2) is the link text.
  *
  * @return List[(url: String, title: String)]
  */
def getIndex() = {
  val indexRegex = """<a target="_blank" href="(.+\.html)" title=".+" >(.+)</a>""".r
  // NOTE(review): `indexSource` (the raw HTML of the index page) is defined
  // elsewhere in the script — confirm it is in scope before this is called.
  // Fix: the original folded with `/:` but the fold lambda had an empty body,
  // so nothing was ever accumulated; map the matches to (url, title) directly.
  indexRegex.findAllMatchIn(indexSource).map(m => (m.group(1), m.group(2))).toList
}
/** Downloads one article page and extracts its plain-text body.
  *
  * The article body sits between <div class="artcontent"> and
  * <div id="zhanwei">; `(?s)` makes `.` span newlines so the match
  * works across the whole page.
  *
  * @param url page to fetch
  * @return Some(plain text) when the markers are found, None otherwise
  */
def getContent(url: String) = {
  // Fix: the original never closed the Source (leaked the underlying stream).
  val source = scala.io.Source.fromURL(url)
  val raw = try source.mkString finally source.close()
  val reg = """(?s).*<div class="artcontent">(.*)<div id="zhanwei">.*""".r
  reg.findFirstMatchIn(raw).map { m =>
    m.group(1)
      .replaceAll("<br>", "\r\n")        // keep paragraph breaks as CRLF
      .replaceAll("""(?s)</?.*?>""", "") // strip every remaining HTML tag
      .replaceAll("""^\s+""", "")        // trim leading whitespace
      // NOTE(review): this literal was garbled by extraction; presumably the
      // "&nbsp;" entity (or U+00A0) was replaced by a normal space — confirm.
      .replaceAll("&nbsp;", " ")
  }
}
/** Writes the extracted article text to "<title>.txt".
  *
  * Prints a notice either way: "Write ..." on success, "Not write ..."
  * when extraction produced no content (None).
  *
  * @param content article text from getContent, or None if extraction failed
  * @param title   file name stem; ".txt" is appended
  */
def writeContent(content: Option[String], title: String) {
  // Idiom: pattern match instead of isEmpty + .get on the Option.
  content match {
    case None =>
      println("Not write " + title + ".txt")
    case Some(text) =>
      val writer = new PrintWriter(new File(title + ".txt"))
      // Fix: the original leaked the writer if `write` threw; close in finally.
      try writer write text
      finally writer.close()
      println("Write " + title + ".txt")
  }
}
/** Drives the whole scrape: for every (url, title) pair on the index
  * page, fetch the article and write it to "<title>.txt".
  */
def getIt() {
  for ((url, title) <- getIndex())
    writeContent(getContent(url), title)
}
getIt()