基于Scala与正则表达式的Html文本分析应用



文本分析与提取:scala正则
持久化:anorm

web:play 2

scala 正则关键代码示例:


    // Regex using lookbehind/lookahead so that only the text between <title> and </title> is matched.
    val regex1="(?<=<title>)(.*?)(?=</title>)".r
 
     // Take the first match only; findFirstIn yields an Option[String].
     // NOTE: htmlContent is not defined in this snippet; it stands for the page source being analysed.
     val html_node= regex1.findFirstIn( htmlContent)

    

package service.spider.htmlparser

import scala.collection.mutable.{ListBuffer, ArrayBuffer}


 object HtmlDocParser {

   //title parse
    def parsehtmlTitle(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=<title>)(.*?)(?=</title>)".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }

  def parsehtmlKeyWord(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=keyword=\")(.*?)(?=\")".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

  def parsehtmlDescription(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=description=\")(.*?)(?=\")".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

   //body parse

  def parsehtmlBody(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=<body>)(.*?)(?=</body>)".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

   //img parse
   def parseImages(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=img src=\")(.*?)(?=\")".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }
   //js parse

   //import js parse

   //import css parse

   //css parse

   //code block parse

   //link parse
   def parseLinkList(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=href=\")(.*?)(?=\")".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }

  //code block

  def getCodeBlock(start:String ,end:String,htmlContent:String):List[String]={

    //var list=List[String]()
    var list=ListBuffer[String]()
    val regex1=("(?s)(?<="+start+")(.*?)(?="+end+")").r

     val h2=htmlContent.replaceAll("\n\r","")
      var i:Int=0
    val html_nodeList= regex1.findAllMatchIn(h2)
    for(matchstr <- html_nodeList){
      i+=1
      println("getCodeBlock "+" c:"+i+ " match g:"+matchstr.groupCount )//+ ""+matchstr.group(0))
    // println("##########\n"+s)
      list.append(matchstr.toString())
      //s::list
    }

    //转不可变集合
    list.toList
  }

   //ul list parse

   //flash parse

   //video parse

   //data parse

   //email parse

   //mobile parse
  //tel parse

 }
 
 
 网页分析示例:
 
 package service.spider.entryParser.alibaba

/**
 * 列表页数据分析
 * 通过 scala 正则表达式分析Html文本
 * 分析结果通过 RESTful API 传递给客户端APP
 *
 * Created by Administrator on 15-11-18.
 */

import service.spider.HtmlDocLoader

import service.spider.htmlparser.HtmlDocParser

import scala.collection.mutable
import scala.collection.mutable.{ListBuffer, HashMap}

/**
 * Parses an Alibaba list page: the page is cut into per-company code blocks
 * and, for every block carrying the "icons-identification" marker, the company
 * link and company name are extracted with regular expressions.
 *
 * Progress is reported on standard output via println.
 */
object AliAdListPageSpiderParser {

  val urlpath: String = ""

  // Marker looked for in a code block to decide whether it is an ad company entry.
  private val CytMarker = "icons-identification"

  // Matches the company anchor tag, from its class attribute through gotodetail="2".
  private val AdAnchorPattern =
    "<a class=\"list-item-title-text\" .*?title=\".*?\" target=\"_blank\" href=(.*?).*gotodetail=\"2\"".r
  private val HrefPattern = "(?<=href=\")(.*?)(?=\")".r
  private val TitleAttrPattern = "(?<=title=\")(.*?)(?=\")".r

  /**
   * Downloads the page at `url` through HtmlDocLoader, requesting GBK decoding.
   *
   * @param url address of the list page
   * @return the page content as returned by the loader
   */
  def docLisPageContentLoad(url: String): String =
    HtmlDocLoader.getPageByURLConnetion(url, "GBK")

  /**
   * Placeholder for page-level information (title, keywords, description, body).
   *
   * NOTE(review): the title is parsed but not stored and the returned map is
   * always empty, exactly as before; the code that filled the map had been
   * commented out. Confirm whether "html_title" / "html_content" should be added.
   *
   * @param htmlContent raw HTML of the list page
   * @return currently always an empty map
   */
  def listPageInfoParse(htmlContent: String): Map[String, String] = {
    HtmlDocParser.parsehtmlTitle(htmlContent)
    Map.empty[String, String]
  }

  /**
   * Extracts the company entries of a list page and prints each company link.
   *
   * @param htmlContent raw HTML of the list page
   * @return one map per ad company with the keys "cmpLink" and "cmpName"
   */
  def adLinkParse(htmlContent: String): List[Map[String, String]] = {
    val companies = getCompanyInfoByCodeBlockList(getAdCodeBlockList(htmlContent))
    println("##############parse result###########")
    companies.foreach(company => println(company.getOrElse("cmpLink", "")))
    companies
  }

  /**
   * Runs parseCompany over every code block and keeps the non-empty results.
   *
   * @param cbList code blocks as produced by getAdCodeBlockList
   * @return company maps for the blocks recognised as ad entries
   */
  def getCompanyInfoByCodeBlockList(cbList: List[String]): List[Map[String, String]] =
    cbList.map(parseCompany).filter(_.nonEmpty)

  /**
   * Extracts link and name from a single code block.
   *
   * @param codeblock HTML fragment of one list entry
   * @return Map("cmpLink" -> ..., "cmpName" -> ...) for an ad entry,
   *         an empty map otherwise
   */
  def parseCompany(codeblock: String): Map[String, String] = {
    println("Ali Company Info parse:")
    val isAd = isadCmpinfo(codeblock)
    println("is cyt cmp:" + isAd)

    val entry: Map[String, String] =
      if (isAd) {
        val anchor = parseadlinkCode(codeblock)
        println("cmp link  code " + anchor.length)
        val link = parseAdLink(anchor)
        println("cmp link parse:" + link.length)
        val name = parseadcmpName(anchor)
        println("cmp name psrse:" + name.length)
        Map("cmpLink" -> link, "cmpName" -> name)
      } else {
        println("is`nt ad info")
        Map.empty[String, String]
      }

    // The map is empty for non-ad blocks, so the key must not be read with apply().
    println("result :" + entry.getOrElse("cmpLink", ""))
    entry
  }

  /**
   * Cuts the page into the fragments found between `<div class="wrap">`
   * and `<div class="bg"></div>`.
   *
   * @param htmlContent raw HTML of the list page
   * @return the fragments returned by HtmlDocParser.getCodeBlock
   */
  def getAdCodeBlockList(htmlContent: String): List[String] = {
    val start = "<div class=\"wrap\">"
    val end = "<div class=\"bg\"></div>"
    HtmlDocParser.getCodeBlock(start, end, htmlContent)
  }

  /**
   * Tells whether a code block contains the "icons-identification" marker,
   * e.g. class="icons-identification sw-ui-icon-cxt16x16".
   *
   * @param codeBlock HTML fragment of one list entry
   * @return true when the marker occurs in the fragment
   */
  def isadCmpinfo(codeBlock: String): Boolean = {
    val position = codeBlock.indexOf(CytMarker)
    println("user is cyt:" + position)
    position != -1
  }

  /*
   Example of the anchor tag the following helpers work on:

   <a class="list-item-title-text" offer-id="offer31"  rel="nofollow" offerId="" offer-stat="com" title="衡水滨湖新区圣康医疗器械厂" target="_blank" href="http://hbshengkang.1688.com" gotodetail="2" >衡水滨湖新区圣康<font color=red>医疗器械</font>厂</a>
   */

  /**
   * Finds the company anchor tag inside a code block.
   *
   * @param codeBlock HTML fragment of one list entry
   * @return the matched anchor markup, or "" when the block has none
   */
  def parseadlinkCode(codeBlock: String): String =
    AdAnchorPattern.findFirstIn(codeBlock).getOrElse("")

  /**
   * Extracts the first href value and prints it (or "no link").
   *
   * @param codeBlock markup to search, typically the result of parseadlinkCode
   * @return the link target, or "" when there is no href attribute
   */
  def parseAdLink(codeBlock: String): String =
    HrefPattern.findFirstIn(codeBlock) match {
      case Some(link) =>
        println(link)
        link
      case None =>
        println("no link")
        ""
    }

  /**
   * Extracts the first title attribute value, which holds the company name
   * in the anchor tag shown above, and prints the lookup result.
   *
   * @param codeBlock markup to search, typically the result of parseadlinkCode
   * @return the title value, or "" when there is no title attribute
   */
  def parseadcmpName(codeBlock: String): String = {
    val name = TitleAttrPattern.findFirstIn(codeBlock)
    println(name)
    name.getOrElse("")
  }

  /** Manual smoke test: downloads one page through docLisPageContentLoad. */
  def main(args: Array[String]): Unit = {
    AliAdListPageSpiderParser.docLisPageContentLoad("http://www.sina.com.cn")
  }

}

分析结果的持久化

基于 Play anorm 开发分析结果的入库处理,示例代码如:

package models

import anorm._

import play.api.db.DB

import scala.collection.JavaConverters._
import scala.collection.immutable.List
import java.util.Date


import play.api.Play.current
/*

db lib: anorm


 */


/**
 * Data access object for the `ad_spider_info` table, built on anorm and the
 * Play `DB` helper. Every method borrows a connection with `DB.withConnection`,
 * which closes it when the block finishes.
 *
 * NOTE(review): the single-row queries pass (updateTime, addTime) to
 * AdSpiderInfoView while the list queries pass (addTime, updateTime). The
 * definition of AdSpiderInfoView is not visible here, so each call site keeps
 * its original order; one of the two is presumably wrong and should be aligned.
 */
object AdSpiderInfoDao {

  /** Not implemented; does nothing. Use add to insert a record. */
  def save(entry: AdSpiderInfo): Unit = {
  }

  /** Prints the catalog name of the connected database. */
  def show: Unit =
    // withConnection returns the connection after use; a bare getConnection() never did.
    DB.withConnection { conn =>
      println("dbName:" + conn.getCatalog)
    }

  /**
   * Counts the rows of ad_spider_info.
   *
   * @return number of rows
   */
  def count: Long =
    DB.withConnection { implicit c =>
      val firstRow = SQL("Select count(*) as c from ad_spider_info").apply().head
      firstRow[Long]("c")
    }

  /**
   * Inserts a record; updateTime is stored as NULL.
   *
   * @param e record to insert (urlPath, comForm and addTime are written)
   * @return true when the database reported a generated key for the new row
   */
  def add(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      val generatedKey = SQL(
        """insert into
     ad_spider_info(urlPath, comForm,addTime,updateTime)
     values ({urlPath}, {comForm} ,{addTime},{updateTime})
        """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> None
        )
        .executeInsert()
      // executeInsert yields the generated key as an Option, not a row count,
      // so comparing it with 1 was always false.
      generatedKey.isDefined
    }

  /**
   * Updates the record identified by e.infoid.
   *
   * @param e record carrying the new values
   * @return true when exactly one row was updated
   */
  def update(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      // Column is comForm (as in the insert and the selects), and the bound
      // name must match the {infoid} placeholder.
      val affected = SQL(
        """update
     ad_spider_info set urlPath={urlPath}, comForm={comForm},addTime={addTime},updateTime={updateTime} where infoid={infoid}
        """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> e.updateTime,
          "infoid" -> e.infoid
        )
        .executeUpdate()
      affected == 1
    }

  /**
   * Deletes the record with the given id.
   *
   * @param infoid primary key value
   * @return true when a row was deleted, false when no row had that id
   */
  def delete(infoid: Long): Boolean =
    DB.withConnection { implicit c =>
      val affected = SQL(
        """delete from  ad_spider_info where infoid={infoid}
        """)
        .on("infoid" -> infoid)
        .executeUpdate()
      affected > 0
    }

  /**
   * Builds a view object from a result row. The names of the two date columns
   * are passed in so each query keeps the constructor-argument order it used
   * before (see the note on the object).
   */
  private def buildView(row: Row, sixthColumn: String, seventhColumn: String): AdSpiderInfoView =
    AdSpiderInfoView(
      row[Long]("infoid"),
      row[String]("urlPath"),
      row[Long]("visitSalerId"),
      row[String]("visitSaler"),
      row[String]("comForm"),
      row[Date](sixthColumn),
      row[Date](seventhColumn)
    )

  /**
   * Loads the record with the given id.
   *
   * @param infoid primary key value
   * @return the matching record
   * @throws NoSuchElementException when no record has that id
   */
  def getAdById(infoid: Long): AdSpiderInfoView =
    DB.withConnection { implicit conn =>
      // on(...) returns a new query object; its result has to be the one executed.
      val query = SQL("Select * from  ad_spider_info where infoid={infoid} order by infoid desc")
        .on("infoid" -> infoid)

      query()
        .map(row => buildView(row, "updateTime", "addTime"))
        .toList
        .headOption
        .getOrElse(throw new NoSuchElementException("no ad_spider_info row with infoid " + infoid))
    }

  /**
   * Looks a record up by its URL and prints the lookup result.
   *
   * @param urlpath value of the urlPath column
   * @return the last row of the result ordered by infoid descending, or None
   */
  def getAdByUrlPath(urlpath: String): Option[AdSpiderInfoView] =
    DB.withConnection { implicit conn =>
      val query = SQL("Select * from  ad_spider_info where urlPath = {urlpath} order by infoid desc")
        .on("urlpath" -> urlpath)

      val found = query()
        .map(row => buildView(row, "updateTime", "addTime"))
        .toList
        .lastOption

      println(found)
      found
    }

  /**
   * Returns one page of records for a given source.
   *
   * @param index     1-based page number; values below 1 are treated as 1
   * @param pageCount page size
   * @param comForm   value of the comForm column to filter on
   * @return the records of the requested page, newest id first
   */
  def getListByPageIndex(index: Int, pageCount: Int, comForm: String): List[AdSpiderInfoView] = {
    val start: Int = math.max(index - 1, 0) * pageCount

    DB.withConnection { implicit c =>
      // WHERE has to precede LIMIT; ORDER BY makes the paging deterministic.
      val query: SimpleSql[Row] = SQL(
        "Select  * from  ad_spider_info where comForm={comForm} order by infoid desc limit {start}, {pageCount}")
        .on(
          "start" -> start,
          "pageCount" -> pageCount,
          "comForm" -> comForm
        )

      query().map(row => buildView(row, "addTime", "updateTime")).toList
    }
  }

  /**
   * Returns every record, newest id first.
   *
   * @return all rows of ad_spider_info
   */
  def getListAll(): List[AdSpiderInfoView] =
    DB.withConnection { implicit c =>
      val query: SqlQuery = SQL("Select  * from  ad_spider_info  order by infoid desc")
      query().map(row => buildView(row, "addTime", "updateTime")).toList
    }

  /** Manual smoke test: inserts one sample record. Uses the current Play application for the database. */
  def main(argvs: Array[String]): Unit = {
    val ifo = new AdSpiderInfo()
    ifo.urlPath = "http://aabbcc.com/test.html"
    ifo.comForm = "ali"

    this.add(ifo)

    println("aabbcc")
  }
}

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值