基于Scala与正则表达式的Html文本分析应用

转载 2015年12月16日 17:26:32


文本分析与提取:scala正则
持久化:anorm

web:play 2

scala 正则关键代码示例:


    // Look-behind / look-ahead pattern: matches only the text between <title> and </title>, not the tags.
    val regex1="(?<=<title>)(.*?)(?=</title>)".r
 
     // findFirstIn yields an Option[String]: Some(text) for the first match, None when nothing matches.
     val html_node= regex1.findFirstIn( htmlContent)

    

package service.spider.htmlparser

import scala.collection.mutable.{ListBuffer, ArrayBuffer}


 object HtmlDocParser {

   //title parse
    def parsehtmlTitle(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=<title>)(.*?)(?=</title>)".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }

  def parsehtmlKeyWord(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=keyword=\")(.*?)(?=\")".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

  def parsehtmlDescription(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=description=\")(.*?)(?=\")".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

   //body parse

  def parsehtmlBody(htmlContent:String): String ={
    //标题查找正则表达式
    val regex1="(?<=<body>)(.*?)(?=</body>)".r
    //匹配一次
    val html_node= regex1.findFirstIn( htmlContent)

    println(html_node)
    html_node.toString
  }

   //img parse
   def parseImages(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=img src=\")(.*?)(?=\")".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }
   //js parse

   //import js parse

   //import css parse

   //css parse

   //code block parse

   //link parse
   def parseLinkList(htmlContent:String): String ={
     //标题查找正则表达式
     val regex1="(?<=href=\")(.*?)(?=\")".r
     //匹配一次
     val html_node= regex1.findFirstIn( htmlContent)

     println(html_node)
     html_node.toString
   }

  //code block

  def getCodeBlock(start:String ,end:String,htmlContent:String):List[String]={

    //var list=List[String]()
    var list=ListBuffer[String]()
    val regex1=("(?s)(?<="+start+")(.*?)(?="+end+")").r

     val h2=htmlContent.replaceAll("\n\r","")
      var i:Int=0
    val html_nodeList= regex1.findAllMatchIn(h2)
    for(matchstr <- html_nodeList){
      i+=1
      println("getCodeBlock "+" c:"+i+ " match g:"+matchstr.groupCount )//+ ""+matchstr.group(0))
    // println("##########\n"+s)
      list.append(matchstr.toString())
      //s::list
    }

    //转不可变集合
    list.toList
  }

   //ul list parse

   //flash parse

   //video parse

   //data parse

   //email parse

   //mobile parse
  //tel parse

 }
 
 
 网页分析示例:
 
 package service.spider.entryParser.alibaba

/**
 * 列表页数据分析
 * 通过 scala 正则表达式分析Html文本
 * 分析结果通过 RESTful API 传递给客户端APP
 *
 * Created by Administrator on 15-11-18.
 */

import service.spider.HtmlDocLoader

import service.spider.htmlparser.HtmlDocParser

import scala.collection.mutable
import scala.collection.mutable.{ListBuffer, HashMap}

/**
 * Regex-based parser for an Alibaba advertisement list page.
 *
 * The page is split into code blocks with [[HtmlDocParser.getCodeBlock]]; each
 * block that carries the "icons-identification" marker is turned into a map with
 * the keys "cmpLink" and "cmpName". Progress is traced to stdout with println.
 */
object AliAdListPageSpiderParser {

  val urlpath: String = ""

  /**
   * Downloads a list page.
   *
   * @param url address of the page to load
   * @return the page text as returned by `HtmlDocLoader.getPageByURLConnetion`,
   *         requested with the "GBK" charset
   */
  def docLisPageContentLoad(url: String): String =
    HtmlDocLoader.getPageByURLConnetion(url, "GBK")

  /**
   * Extracts page-level information from a list page.
   *
   * The title is parsed (which also prints it), but the returned map is
   * currently always empty: keyword, description and body extraction are not
   * implemented and the title is not stored.
   *
   * @param htmlContent raw HTML text of the list page
   * @return an empty map
   */
  def listPageInfoParse(htmlContent: String): Map[String, String] = {
    // Kept for its debug output; the value itself is not stored yet.
    HtmlDocParser.parsehtmlTitle(htmlContent)
    Map[String, String]()
  }

  /**
   * Parses every advertisement block of a list page and prints the company links found.
   *
   * @param htmlContent raw HTML text of the list page
   * @return one map ("cmpLink", "cmpName") per advertisement block
   */
  def adLinkParse(htmlContent: String): List[Map[String, String]] = {
    val codeBlocks = getAdCodeBlockList(htmlContent)
    val cmpInfoList = getCompanyInfoByCodeBlockList(codeBlocks)

    println("##############parse result###########")
    // Every map in the list is non-empty and therefore contains "cmpLink".
    cmpInfoList.foreach(info => println(info("cmpLink")))
    cmpInfoList
  }

  /**
   * Extracts company information from each code block, dropping blocks that
   * yielded nothing (i.e. blocks that are not advertisements).
   *
   * @param cbList code blocks cut out of the list page
   * @return the non-empty company maps, in block order
   */
  def getCompanyInfoByCodeBlockList(cbList: List[String]): List[Map[String, String]] =
    cbList.map(parseCompany).filter(_.nonEmpty)

  /**
   * Extracts the company link and name from one code block.
   *
   * @param codeblock HTML fragment of a single list entry
   * @return a map with "cmpLink" and "cmpName" when the block is an
   *         advertisement entry, otherwise an empty map
   */
  def parseCompany(codeblock: String): Map[String, String] = {
    println("Ali Company Info parse:")

    val isAd = isadCmpinfo(codeblock)
    println("is cyt cmp:" + isAd)

    val entryMap: Map[String, String] =
      if (isAd) {
        val adLinkCode = parseadlinkCode(codeblock)
        println("cmp link  code " + adLinkCode.length)

        val cmpLink = parseAdLink(adLinkCode)
        println("cmp link parse:" + cmpLink.length)

        val cmpName = parseadcmpName(adLinkCode)
        println("cmp name psrse:" + cmpName.length)

        Map("cmpLink" -> cmpLink, "cmpName" -> cmpName)
      } else {
        println("is`nt ad info")
        Map.empty
      }

    // The map is empty for non-advertisement blocks, so the key must not be
    // looked up with apply(): that threw NoSuchElementException.
    println("result :" + entryMap.getOrElse("cmpLink", ""))
    entryMap
  }

  /**
   * Cuts the list page into advertisement code blocks.
   *
   * @param htmlContent raw HTML text of the list page
   * @return the fragments between `<div class="wrap">` and `<div class="bg"></div>`
   */
  def getAdCodeBlockList(htmlContent: String): List[String] = {
    val start = "<div class=\"wrap\">"
    val end = "<div class=\"bg\"></div>"
    HtmlDocParser.getCodeBlock(start, end, htmlContent)
  }

  /**
   * Tells whether a code block carries the identification icon, e.g.
   * `class="icons-identification sw-ui-icon-cxt16x16"`
   * (presumably the marker of a certified "cyt" member; confirm with the page owner).
   *
   * @param codeBlock HTML fragment of a single list entry
   * @return true when the marker class occurs anywhere in the block
   */
  def isadCmpinfo(codeBlock: String): Boolean = {
    val cytPrefix = "icons-identification"
    val pos = codeBlock.indexOf(cytPrefix)
    println("user is cyt:" + pos)
    pos != -1
  }

  /*
  parse ad link

  <a class="list-item-title-text" offer-id="offer31"  rel="nofollow" offerId="" offer-stat="com" title="衡水滨湖新区圣康医疗器械厂" target="_blank" href="http://hbshengkang.1688.com" gotodetail="2" >衡水滨湖新区圣康<font color=red>医疗器械</font>厂</a>
   */

  /**
   * Finds the opening part of the title anchor (see the sample above) inside a code block.
   *
   * @param codeBlock HTML fragment of a single list entry
   * @return the matched anchor markup, or "" when the block has no such anchor
   *         (previously the `Option.toString` text `Some(...)` / `None`)
   */
  def parseadlinkCode(codeBlock: String): String = {
    val regex = "<a class=\"list-item-title-text\" .*?title=\".*?\" target=\"_blank\" href=(.*?).*gotodetail=\"2\"".r
    regex.findFirstIn(codeBlock).getOrElse("")
  }

  /**
   * Extracts the value of the first `href="..."` attribute.
   *
   * @param codeBlock anchor markup
   * @return the link target, or "" when there is no `href`
   */
  def parseAdLink(codeBlock: String): String = {
    val regex = "(?<=href=\")(.*?)(?=\")".r

    regex.findFirstIn(codeBlock) match {
      case Some(link) =>
        println(link)
        link
      case None =>
        println("no link")
        ""
    }
  }

  /**
   * Extracts the value of the first `title="..."` attribute, which holds the company name.
   *
   * @param codeBlock anchor markup
   * @return the company name, or "" when there is no `title`
   *         (previously the `Option.toString` text `Some(...)` / `None`)
   */
  def parseadcmpName(codeBlock: String): String = {
    val regex = "(?<=title=\")(.*?)(?=\")".r

    val found = regex.findFirstIn(codeBlock)
    println(found)
    found.getOrElse("")
  }

  /** Manual smoke test: downloads one page and discards the result. */
  def main(args: Array[String]): Unit = {
    val adparser = AliAdListPageSpiderParser
    // The closing quote was missing here, which made the whole object fail to compile.
    adparser.docLisPageContentLoad("http://www.sina.com.cn")
  }

}

分析结果的持久化

基于 Play Anorm 开发分析结果的入库处理,示例代码如下:

package models

import anorm._

import play.api.db.DB

import scala.collection.JavaConverters._
import scala.collection.immutable.List
import java.util.Date


import play.api.Play.current
/*

db lib: anorm


 */


/**
 * Anorm-based data access object for the `ad_spider_info` table.
 *
 * Every method borrows a connection through `DB.withConnection`, which hands the
 * connection back when the block finishes.
 */
object AdSpiderInfoDao {

  /** Not implemented: currently a no-op. */
  def save(entry: AdSpiderInfo): Unit = {
  }

  /**
   * Prints the catalog (database) name of the default connection.
   *
   * The connection is borrowed with `withConnection` so that it is released
   * again; the previous `DB.getConnection()` call never closed it.
   */
  def show: Unit =
    DB.withConnection { conn =>
      println("dbName:" + conn.getCatalog)
    }

  /**
   * Counts the rows of `ad_spider_info`.
   *
   * @return the number of stored rows
   */
  def count: Long =
    DB.withConnection { implicit c =>
      // count(*) always yields exactly one row, so head is safe here.
      val firstRow = SQL("Select count(*) as c from ad_spider_info").apply().head
      firstRow[Long]("c")
    }

  /**
   * Inserts a new row. `updateTime` is stored as NULL for a fresh row.
   *
   * @param e the entry to insert; `urlPath`, `comForm` and `addTime` are written
   * @return true when exactly one row was inserted
   */
  def add(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      // executeUpdate() returns the affected row count. The previous
      // executeInsert() returned an Option[Long] key, so `rs == 1` was never true.
      val affected = SQL(
      """insert into
     ad_spider_info(urlPath, comForm,addTime,updateTime)
     values ({urlPath}, {comForm} ,{addTime},{updateTime})
      """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> None
        )
        .executeUpdate()
      affected == 1
    }

  /**
   * Updates the row identified by `e.infoid`.
   *
   * @param e the entry carrying the new column values and the row id
   * @return true when exactly one row was updated
   */
  def update(e: AdSpiderInfo): Boolean =
    DB.withConnection { implicit c =>
      // Column is `comForm` (as in the insert and the selects), and the bound
      // parameter name must match the {infoid} placeholder.
      val affected = SQL(
        """update
     ad_spider_info set urlPath={urlPath}, comForm={comForm},addTime={addTime},updateTime={updateTime} where infoid={infoid}
        """)
        .on(
          "urlPath" -> e.urlPath,
          "comForm" -> e.comForm,
          "addTime" -> e.addTime,
          "updateTime" -> e.updateTime,
          "infoid" -> e.infoid
        )
        .executeUpdate()
      affected == 1
    }

  /**
   * Deletes the row with the given id.
   *
   * @param infoid id of the row to delete
   * @return true when a row was deleted, false when no row had that id
   */
  def delete(infoid: Long): Boolean =
    DB.withConnection { implicit c =>
      val affected = SQL(
        """delete from  ad_spider_info where infoid={infoid}
        """)
        .on(
          // Must match the {infoid} placeholder (it was bound as "id" before).
          "infoid" -> infoid
        )
        .executeUpdate()
      affected > 0
    }

  /**
   * Loads one row by id.
   *
   * @param infoid id of the row
   * @return the matching row
   * @throws NoSuchElementException when no row has that id
   */
  def getAdById(infoid: Long): AdSpiderInfoView =
    DB.withConnection { implicit conn =>
      // `on` returns a new query object; its result has to be the one executed.
      val sql = SQL("Select * from  ad_spider_info where infoid={infoid} order by infoid desc")
        .on("infoid" -> infoid)

      // NOTE(review): updateTime is passed before addTime here and in getAdByUrlPath,
      // while getListByPageIndex / getListAll pass addTime first. Confirm against the
      // AdSpiderInfoView constructor.
      val list: List[AdSpiderInfoView] = sql().map(row =>
        AdSpiderInfoView(row[Long]("infoid"), row[String]("urlPath"),
          row[Long]("visitSalerId"),
          row[String]("visitSaler"), row[String]("comForm"), row[Date]("updateTime"), row[Date]("addTime"))
      ).toList

      list.headOption.getOrElse(
        throw new NoSuchElementException("ad_spider_info row not found, infoid=" + infoid))
    }

  /**
   * Loads a row by its URL path.
   *
   * Rows are ordered by `infoid` descending and the last one is taken, i.e. the
   * match with the smallest id.
   *
   * @param urlpath value of the `urlPath` column to look for
   * @return the matching row, or None when there is none
   */
  def getAdByUrlPath(urlpath: String): Option[AdSpiderInfoView] =
    DB.withConnection { implicit conn =>
      val sql = SQL("Select * from  ad_spider_info where urlPath = {urlpath} order by infoid desc")
        .on("urlpath" -> urlpath)

      // NOTE(review): updateTime is passed before addTime here and in getAdById,
      // while getListByPageIndex / getListAll pass addTime first. Confirm against the
      // AdSpiderInfoView constructor.
      val list: List[AdSpiderInfoView] = sql().map(row =>
        AdSpiderInfoView(row[Long]("infoid"), row[String]("urlPath"),
          row[Long]("visitSalerId"),
          row[String]("visitSaler"), row[String]("comForm"), row[Date]("updateTime"), row[Date]("addTime"))
      ).toList

      val found = list.lastOption
      println(found)
      found
    }

  /**
   * Loads one page of rows for a given source.
   *
   * @param index     1-based page number
   * @param pageCount number of rows per page
   * @param comForm   value of the `comForm` column to filter on
   * @return the rows of the requested page; empty when the page is past the end
   */
  def getListByPageIndex(index: Int, pageCount: Int, comForm: String): List[AdSpiderInfoView] = {
    val start: Int = (index - 1) * pageCount

    DB.withConnection { implicit c =>
      // WHERE has to come before LIMIT; the previous clause order was invalid SQL.
      val sql: SimpleSql[Row] = SQL("Select  * from  ad_spider_info  where comForm={comForm} limit {start}, {pageCount}")
        .on(
          "start" -> start,
          "pageCount" -> pageCount,
          "comForm" -> comForm
        )

      sql().map(row =>
        AdSpiderInfoView(row[Long]("infoid"), row[String]("urlPath"),
          row[Long]("visitSalerId"),
          row[String]("visitSaler"), row[String]("comForm"), row[Date]("addTime"), row[Date]("updateTime"))
      ).toList
    }
  }

  /**
   * Loads every row, newest id first.
   *
   * @return all rows ordered by `infoid` descending
   */
  def getListAll(): List[AdSpiderInfoView] =
    DB.withConnection { implicit c =>
      val sql: SqlQuery = SQL("Select  * from  ad_spider_info  order by infoid desc")

      sql().map(row =>
        AdSpiderInfoView(row[Long]("infoid"), row[String]("urlPath"),
          row[Long]("visitSalerId"),
          row[String]("visitSaler"), row[String]("comForm"), row[Date]("addTime"), row[Date]("updateTime"))
      ).toList
    }

  /** Manual smoke test: inserts one sample row. */
  def main(argvs: Array[String]): Unit = {
    val ifo = new AdSpiderInfo()
    ifo.urlPath = "http://aabbcc.com/test.html"
    ifo.comForm = "ali"

    this.add(ifo)

    println("aabbcc")
  }
}

相关文章推荐

Play FrameWork scala.html页面关于变量的一些运算操作

1.简单四则运算 @(jorder.getPopBill().getGoodsPrice() - jorder.getPopBill().getAbroadTransferPrice()) ...

HTMLParser解析HTMl标签的实例

public class HtmlParserTest { /** * @param args * @throws ParserException */ /** * @param...

利用HtmlParse获取Html内容并提取

一.      网上获取html内容 1.利用url获取html内容: public static String getHtmlContent(String urlstr){ /*思路: 1...

Delphi7高级应用开发随书源码

  • 2003年04月30日 00:00
  • 676KB
  • 下载

正则表达式的小应用(按空白分割文本但是保留"\n")

正则表达式从一段文字中提取想要的东西: 比如说从一篇英语阅读中提取除了空格以外的东西包括单词换行符标点下滑线: Small Schools Rising   This year's list o...

html转换为json字符串,正则表达式的应用

最近在做app,需要做一个查看物流的接口,从第三方去申请物流API,由于申请的是免费的,所以只能返回html(这个有点坑,免费的和收费的区别太大了吧),谁让咱是做技术的,即使是html,也可以把它变成...

基于正则表达式的文本抽取软件

  • 2008年10月23日 14:42
  • 7.96MB
  • 下载

一个HTML超链接正则表达式的分析过程

http://www.mbaobao.com/about-us.html?l=help_1_1" http://www.mbaobao.com/contact-us.html?l=help_1_2"...

融合正则表达式的文本统计工具

  • 2012年12月05日 14:48
  • 163KB
  • 下载

正则表达式处理html文本例子

  • 2012年06月06日 09:46
  • 40KB
  • 下载
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:基于Scala与正则表达式的Html文本分析应用
举报原因:
原因补充:

(最多只允许输入30个字)