文本分析与提取:Scala 正则表达式
持久化:Anorm
Web:Play 2
Scala 正则关键代码示例:
// Example: look-behind / look-ahead so that only the text between <title> and </title> is matched.
val regex1="(?<=<title>)(.*?)(?=</title>)".r
// findFirstIn yields an Option[String] holding the first match, if there is one.
val html_node= regex1.findFirstIn( htmlContent)
package service.spider.htmlparser
import scala.collection.mutable.{ListBuffer, ArrayBuffer}
/**
 * Regex-based helpers that extract common fragments from raw HTML text.
 *
 * Every `parse*` method returns the first match as a plain string, or an empty
 * string when the pattern does not occur in the input.
 */
object HtmlDocParser {

  // Patterns are compiled once instead of on every call.
  // (?i) = case-insensitive tags, (?s) = '.' also matches line breaks.
  private val TitlePattern = "(?is)(?<=<title>)(.*?)(?=</title>)".r
  private val KeywordPattern = "(?<=keyword=\")(.*?)(?=\")".r
  private val DescriptionPattern = "(?<=description=\")(.*?)(?=\")".r
  // A look-behind cannot express "<body ...attributes...>", so the content is taken from group 1.
  private val BodyPattern = "(?is)<body(?:\\s[^>]*)?>(.*?)</body>".r
  private val ImagePattern = "(?<=img src=\")(.*?)(?=\")".r
  private val LinkPattern = "(?<=href=\")(.*?)(?=\")".r

  /** First match of `pattern` in `htmlContent`, or "" when there is none. */
  private def firstMatch(pattern: scala.util.matching.Regex, htmlContent: String): String =
    pattern.findFirstIn(htmlContent).getOrElse("")

  /**
   * Extracts the text between `<title>` and `</title>`.
   *
   * @param htmlContent raw HTML text
   * @return the title text, or "" when no title element is found
   */
  def parsehtmlTitle(htmlContent: String): String =
    firstMatch(TitlePattern, htmlContent)

  /**
   * Extracts the value of the first `keyword="..."` attribute.
   *
   * @param htmlContent raw HTML text
   * @return the attribute value, or "" when it is absent
   */
  def parsehtmlKeyWord(htmlContent: String): String =
    firstMatch(KeywordPattern, htmlContent)

  /**
   * Extracts the value of the first `description="..."` attribute.
   *
   * @param htmlContent raw HTML text
   * @return the attribute value, or "" when it is absent
   */
  def parsehtmlDescription(htmlContent: String): String =
    firstMatch(DescriptionPattern, htmlContent)

  /**
   * Extracts everything between the opening `<body ...>` tag and `</body>`.
   * The body may span several lines and the opening tag may carry attributes.
   *
   * @param htmlContent raw HTML text
   * @return the body markup, or "" when no body element is found
   */
  def parsehtmlBody(htmlContent: String): String =
    BodyPattern.findFirstMatchIn(htmlContent).map(_.group(1)).getOrElse("")

  /**
   * Extracts the `src` value of the first `img src="..."` occurrence.
   *
   * @param htmlContent raw HTML text
   * @return the image URL, or "" when there is none
   */
  def parseImages(htmlContent: String): String =
    firstMatch(ImagePattern, htmlContent)

  /**
   * Extracts the value of the first `href="..."` attribute.
   * Despite the name, only the first link is returned.
   *
   * @param htmlContent raw HTML text
   * @return the link target, or "" when there is none
   */
  def parseLinkList(htmlContent: String): String =
    firstMatch(LinkPattern, htmlContent)

  /**
   * Collects every fragment located between a `start` and an `end` delimiter.
   *
   * Both delimiters are treated as literal text: they are quoted before being
   * embedded in the pattern, so characters such as `(`, `[` or `.` are safe.
   * The delimiters themselves are not part of the returned fragments.
   *
   * @param start       literal text that precedes each fragment
   * @param end         literal text that follows each fragment
   * @param htmlContent raw HTML text
   * @return the fragments in document order; empty when nothing matches
   */
  def getCodeBlock(start: String, end: String, htmlContent: String): List[String] = {
    val quotedStart = java.util.regex.Pattern.quote(start)
    val quotedEnd = java.util.regex.Pattern.quote(end)
    val blockPattern = ("(?s)(?<=" + quotedStart + ")(.*?)(?=" + quotedEnd + ")").r
    // Kept from the original so the returned fragments stay the same: strips LF+CR pairs.
    // NOTE(review): possibly meant "\r\n"; (?s) already lets matches span line breaks.
    val normalized = htmlContent.replaceAll("\n\r", "")
    blockPattern.findAllIn(normalized).toList
  }
}
网页分析示例:
package service.spider.entryParser.alibaba
/**
 * List-page data analysis.
 * Parses the HTML text with Scala regular expressions;
 * the results are passed to the client app through a RESTful API.
 *
 * Created by Administrator on 15-11-18.
 */
import service.spider.HtmlDocLoader
import service.spider.htmlparser.HtmlDocParser
import scala.collection.mutable
import scala.collection.mutable.{ListBuffer, HashMap}
/**
 * Extracts advertiser (company) links and names from an Alibaba list page.
 *
 * The page is cut into per-company code blocks, each block is checked for the
 * identification marker, and the company link and name are read from the
 * title anchor of the blocks that carry it.
 */
object AliAdListPageSpiderParser {
  val urlpath: String = ""

  /**
   * Downloads a list page.
   *
   * @param url address of the page
   * @return the page text as returned by `HtmlDocLoader.getPageByURLConnetion` with charset "GBK"
   */
  def docLisPageContentLoad(url: String): String =
    HtmlDocLoader.getPageByURLConnetion(url, "GBK")

  /**
   * Collects page-level information.
   *
   * @param htmlContent raw HTML of the list page
   * @return a map with `html_title` (the parsed title) and `html_content` (the input text)
   */
  def listPageInfoParse(htmlContent: String): Map[String, String] = {
    val htmlTitle = HtmlDocParser.parsehtmlTitle(htmlContent)
    // The original left both inserts commented out and always returned an empty map.
    Map(
      "html_title" -> htmlTitle,
      "html_content" -> htmlContent
    )
  }

  /**
   * Parses every advertiser entry of a list page and prints the links found.
   *
   * @param htmlContent raw HTML of the list page
   * @return one map per advertiser with the keys `cmpLink` and `cmpName`
   */
  def adLinkParse(htmlContent: String): List[Map[String, String]] = {
    val codeBlocks = getAdCodeBlockList(htmlContent)
    val companies = getCompanyInfoByCodeBlockList(codeBlocks)
    println("##############parse result###########")
    companies.foreach(company => println(company.getOrElse("cmpLink", "")))
    companies
  }

  /**
   * Parses each code block and keeps the ones that produced company data.
   *
   * @param cbList code blocks cut from the list page
   * @return the non-empty company maps, in input order
   */
  def getCompanyInfoByCodeBlockList(cbList: List[String]): List[Map[String, String]] =
    cbList.map(parseCompany).filter(_.nonEmpty)

  /**
   * Parses one code block.
   *
   * @param codeblock HTML fragment describing one list entry
   * @return `cmpLink` and `cmpName` when the block carries the identification
   *         marker, otherwise an empty map
   */
  def parseCompany(codeblock: String): Map[String, String] = {
    println("Ali Company Info parse:")
    val isAdCompany = isadCmpinfo(codeblock)
    println("is cyt cmp:" + isAdCompany)
    val entry: Map[String, String] =
      if (isAdCompany) {
        val anchorCode = parseadlinkCode(codeblock)
        println("cmp link code " + anchorCode.length)
        val companyLink = parseAdLink(anchorCode)
        println("cmp link parse:" + companyLink.length)
        val companyName = parseadcmpName(anchorCode)
        println("cmp name psrse:" + companyName.length)
        Map("cmpLink" -> companyLink, "cmpName" -> companyName)
      } else {
        println("is`nt ad info")
        Map.empty[String, String]
      }
    // getOrElse: the map is empty for non-ad blocks, a plain lookup would throw.
    println("result :" + entry.getOrElse("cmpLink", ""))
    entry
  }

  /**
   * Cuts the page into the fragments found between `<div class="wrap">` and
   * `<div class="bg"></div>`.
   *
   * @param htmlContent raw HTML of the list page
   * @return the fragments in document order
   */
  def getAdCodeBlockList(htmlContent: String): List[String] = {
    val start = "<div class=\"wrap\">"
    val end = "<div class=\"bg\"></div>"
    HtmlDocParser.getCodeBlock(start, end, htmlContent)
  }

  /**
   * Tells whether a code block carries the identification marker, i.e. the CSS
   * class prefix `icons-identification` (labelled "cyt" in the original notes,
   * full class: `icons-identification sw-ui-icon-cxt16x16`).
   *
   * @param codeBlock HTML fragment describing one list entry
   * @return true when the marker occurs in the fragment
   */
  def isadCmpinfo(codeBlock: String): Boolean = {
    val markerPrefix = "icons-identification"
    val position = codeBlock.indexOf(markerPrefix)
    println("user is cyt:" + position)
    position != -1
  }

  /*
  parse ad link
  <a class="list-item-title-text" offer-id="offer31" rel="nofollow" offerId="" offer-stat="com" title="衡水滨湖新区圣康医疗器械厂" target="_blank" href="http://hbshengkang.1688.com" gotodetail="2" >衡水滨湖新区圣康<font color=red>医疗器械</font>厂</a>
  */
  /**
   * Locates the opening tag of the title anchor (see the sample above).
   *
   * @param codeBlock HTML fragment describing one list entry
   * @return the matched anchor markup, or "" when the block has no such anchor
   */
  def parseadlinkCode(codeBlock: String): String = {
    val anchorPattern = "<a class=\"list-item-title-text\" .*?title=\".*?\" target=\"_blank\" href=(.*?).*gotodetail=\"2\"".r
    anchorPattern.findFirstIn(codeBlock).getOrElse("")
  }

  /**
   * Reads the first `href="..."` value.
   *
   * @param codeBlock anchor markup
   * @return the link, or "" when no href is present
   */
  def parseAdLink(codeBlock: String): String = {
    val hrefPattern = "(?<=href=\")(.*?)(?=\")".r
    hrefPattern.findFirstIn(codeBlock) match {
      case Some(link) =>
        println(link)
        link
      case None =>
        println("no link")
        ""
    }
  }

  /**
   * Reads the first `title="..."` value, which holds the company name.
   *
   * @param codeBlock anchor markup
   * @return the company name, or "" when no title attribute is present
   */
  def parseadcmpName(codeBlock: String): String = {
    val titlePattern = "(?<=title=\")(.*?)(?=\")".r
    titlePattern.findFirstIn(codeBlock).getOrElse("")
  }

  /** Manual smoke test: downloads one page. */
  def main(args: Array[String]): Unit = {
    AliAdListPageSpiderParser.docLisPageContentLoad("http://www.sina.com.cn")
  }
}
分析结果的持久化
基于 Play Anorm 开发分析结果的入库处理,示例代码如下:
package models
import anorm._
import play.api.db.DB
import scala.collection.JavaConverters._
import scala.collection.immutable.List
import java.util.Date
import play.api.Play.current
/*
db lib: anorm
*/
/**
 * Anorm-based data access for the `ad_spider_info` table.
 *
 * Every method borrows a connection through `DB.withConnection`, which also
 * releases it when the block finishes.
 */
object AdSpiderInfoDao {

  // NOTE(review): the readers below pass the two date columns to AdSpiderInfoView
  // in different orders (getAdById / getAdByUrlPath: updateTime then addTime;
  // getListByPageIndex / getListAll: addTime then updateTime). Both orders are
  // kept exactly as they were; confirm against the AdSpiderInfoView constructor.
  // NOTE(review): add() stores updateTime as NULL while the readers request a
  // non-optional Date for that column — verify how such rows should be read.

  /** Maps a row, passing `updateTime` before `addTime`. */
  private def toViewUpdateTimeFirst(row: Row): AdSpiderInfoView =
    AdSpiderInfoView(
      row[Long]("infoid"), row[String]("urlPath"), row[Long]("visitSalerId"),
      row[String]("visitSaler"), row[String]("comForm"),
      row[Date]("updateTime"), row[Date]("addTime"))

  /** Maps a row, passing `addTime` before `updateTime`. */
  private def toViewAddTimeFirst(row: Row): AdSpiderInfoView =
    AdSpiderInfoView(
      row[Long]("infoid"), row[String]("urlPath"), row[Long]("visitSalerId"),
      row[String]("visitSaler"), row[String]("comForm"),
      row[Date]("addTime"), row[Date]("updateTime"))

  /** Not implemented: does nothing. */
  def save(entry: AdSpiderInfo): Unit = {
  }

  /** Prints the catalog name of the default database. */
  def show: Unit =
    // withConnection closes the connection; the original leaked it.
    DB.withConnection { conn =>
      println("dbName:" + conn.getCatalog)
    }

  /**
   * Counts the rows of `ad_spider_info`.
   *
   * @return the number of rows
   */
  def count: Long =
    DB.withConnection { implicit c =>
      // count(*) always yields exactly one row, so head is safe here.
      val firstRow = SQL("Select count(*) as c from ad_spider_info").apply().head
      firstRow[Long]("c")
    }

  /**
   * Inserts a record; `updateTime` is stored as NULL.
   *
   * @param e record to insert (`urlPath`, `comForm` and `addTime` are used)
   * @return true when exactly one row was inserted
   */
  def add(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      // executeUpdate returns the affected row count. The original compared the
      // Option returned by executeInsert with 1, which is never true.
      val affectedRows = SQL(
        """insert into
          ad_spider_info(urlPath, comForm,addTime,updateTime)
          values ({urlPath}, {comForm} ,{addTime},{updateTime})
        """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> None
        )
        .executeUpdate()
      affectedRows == 1
    }

  /**
   * Updates the record identified by `e.infoid`.
   *
   * @param e record carrying the new values
   * @return true when exactly one row was updated
   */
  def update(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      // Column is comForm (as in add and the readers); every placeholder has a
      // binding of the same name.
      val affectedRows = SQL(
        """update
          ad_spider_info set urlPath={urlPath}, comForm={comForm},addTime={addTime},updateTime={updateTime} where infoid={infoid}
        """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> e.updateTime,
          "infoid" -> e.infoid
        )
        .executeUpdate()
      affectedRows == 1
    }

  /**
   * Deletes the record with the given id.
   *
   * @param infoid id of the record
   * @return true when a row was deleted, false when no row had that id
   */
  def delete(infoid: Long): Boolean =
    DB.withConnection { implicit c =>
      val affectedRows = SQL(
        """delete from ad_spider_info where infoid={infoid}
        """)
        .on("infoid" -> infoid)
        .executeUpdate()
      affectedRows > 0
    }

  /**
   * Loads the record with the given id.
   *
   * @param infoid id of the record
   * @return the matching record
   * @throws NoSuchElementException when no record has that id
   */
  def getAdById(infoid: Long): AdSpiderInfoView =
    DB.withConnection { implicit conn =>
      // .on returns a new query; the original dropped it and ran the unbound one.
      val query = SQL("Select * from ad_spider_info where infoid={infoid} order by infoid desc")
        .on("infoid" -> infoid)
      query().map(toViewUpdateTimeFirst).toList.headOption.getOrElse(
        throw new NoSuchElementException("ad_spider_info: no record with infoid " + infoid))
    }

  /**
   * Looks up a record by its URL path.
   *
   * @param urlpath value of the `urlPath` column
   * @return the last row of the result, which is ordered by `infoid` descending
   *         (i.e. the match with the smallest id), or None when nothing matches
   */
  def getAdByUrlPath(urlpath: String): Option[AdSpiderInfoView] =
    DB.withConnection { implicit conn =>
      val query = SQL("Select * from ad_spider_info where urlPath = {urlpath} order by infoid desc")
        .on("urlpath" -> urlpath)
      query().map(toViewUpdateTimeFirst).toList.lastOption
    }

  /**
   * Loads one page of records for a source.
   *
   * @param index     1-based page number; values below 1 are treated as page 1
   * @param pageCount number of records per page
   * @param comForm   value of the `comForm` column to filter on
   * @return the records of the page, newest id first
   */
  def getListByPageIndex(index: Int, pageCount: Int, comForm: String): List[AdSpiderInfoView] = {
    val start: Int = math.max(index - 1, 0) * pageCount
    DB.withConnection { implicit c =>
      // WHERE must precede LIMIT; an explicit order keeps the pages stable.
      val query = SQL("Select * from ad_spider_info where comForm={comForm} order by infoid desc limit {start}, {pageCount}")
        .on(
          "start" -> start,
          "pageCount" -> pageCount,
          "comForm" -> comForm
        )
      query().map(toViewAddTimeFirst).toList
    }
  }

  /**
   * Loads every record.
   *
   * @return all records, ordered by `infoid` descending
   */
  def getListAll(): List[AdSpiderInfoView] =
    DB.withConnection { implicit c =>
      val query = SQL("Select * from ad_spider_info order by infoid desc")
      query().map(toViewAddTimeFirst).toList
    }

  /** Manual smoke test: inserts one sample record. */
  def main(argvs: Array[String]): Unit = {
    val info = new AdSpiderInfo()
    info.urlPath = "http://aabbcc.com/test.html"
    info.comForm = "ali"
    this.add(info)
    println("aabbcc")
  }
}