Octopus和Humphrey PDF解析


  我最开始看到Octopus和Humphrey PDF报告里的文字可以复制的时候是十分开心的,因为直接提取文字的正确率几乎为100%,而OCR即使做得再好,也难免会有识别错误的时候。

  说干就干,我满怀期待地用github.com/ledongthuc/pdf包读取了一下Octopus和Humphrey PDF报告,结果什么都没有读取到。查询了一下才知道:github.com/ledongthuc/pdf包难以从结构复杂的PDF中提取文本信息。而由Java语言写成的jar包tika则擅长解析结构复杂的PDF,于是我尝试用tika来实现我们伟大的目标:自动化提取Octopus和Humphrey PDF报告中的信息,从而解放双手。幸运的是,这次尝试成功了。

1. 需要的工具

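  工欲善其事必先利其器,我们需要使用的工具如下(链接见文末参考资料):

  • tika服务器jar包(源码中使用的是tika-server-standard-2.3.0.jar)[1]
  • Go语言用来与tika交互的程序包go-tika [2]
  • Go语言用来解析HTML的程序包goquery(github.com/PuerkitoBio/goquery)
  • Java环境 [3]
  • Go环境 [4]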

2. 源码

  配置好以上环境之后,就已经成功一大半了,接下来只需要复制、粘贴,而后运行即可。

  程序执行流程如下:

  1. 开启tika本地服务器
  2. 读取Names和IDs
  3. 用tika把PDF解析为HTML,再从生成的HTML中提取所需的数据
  4. 保存数据

  源码如下:

  2.1 CrackOctopus.go

package main

import (
    "bufio"
    "io"
    "io/ioutil"
    "os/exec"
    "context"
    "fmt"
    "encoding/csv"
    "github.com/google/go-tika/tika"
    "github.com/PuerkitoBio/goquery"
    "strings"
    "regexp"
    "log"
    "os"
    "time"
)

// Patterns applied by HtmlParser to the text of each <div> in the Tika output.
// Each pattern is unanchored: it matches the first occurrence of its label
// anywhere in the text and runs to the end of that line ('.' does not match a
// newline by default).
// NOTE(review): the labels look like the Octopus summary indices (MS, MD,
// sLV) — confirm against a sample report.
var (
    reg0 = regexp.MustCompile(`MS.+`)  // "MS" label up to end of line
    reg1 = regexp.MustCompile(`MD.+`)  // "MD" label up to end of line
    reg2 = regexp.MustCompile(`sLV.+`) // "sLV" label up to end of line
)


// Eye is one eye's entry in an output CSV row.
type Eye struct {
    // whichEye is the label written to the CSV for this eye.
    whichEye string
    // result is the text extracted from the report for this eye.
    result string
}

// ReadFile reads fileName and returns its lines with leading and trailing
// whitespace (including any '\r') removed.
//
// Blank lines inside the file are kept as empty strings so that the index of
// each returned entry still equals its line position; callers pair the entries
// of two files by index. A newline at the very end of the file does not
// produce an extra empty entry.
//
// If the file cannot be opened, an error message is printed and nil is
// returned. If reading fails part-way, the lines read so far are returned.
func ReadFile(fileName string) (res []string) {
    // Read-only access is all that is needed, and it also works for
    // write-protected files.
    file, err := os.Open(fileName)
    if err != nil {
        fmt.Println("Open file error!", err)
        return nil
    }
    defer file.Close()

    // The size is purely informational, so a Stat failure is reported rather
    // than treated as fatal.
    if stat, err := file.Stat(); err != nil {
        fmt.Println("Stat file error!", err)
    } else {
        fmt.Println("file size=", stat.Size())
    }

    buf := bufio.NewReader(file)
    for {
        line, err := buf.ReadString('\n')
        // ReadString hands back whatever it read together with the error, so
        // a last line without a trailing newline is still delivered here. An
        // empty read means the previous line ended the file; it is not a line.
        if len(line) > 0 {
            res = append(res, strings.TrimSpace(line))
        }
        if err == io.EOF {
            fmt.Println("File read ok!")
            return res
        }
        if err != nil {
            fmt.Println("Read file error!", err)
            return res
        }
    }
}

// PathExists reports whether path exists.
//
// It returns (true, nil) when os.Stat succeeds and (false, nil) when the
// failure is a "does not exist" error. Any other Stat failure is returned as
// (false, err) so the caller can tell "missing" apart from "could not check".
func PathExists(path string) (bool, error) {
    switch _, statErr := os.Stat(path); {
    case statErr == nil:
        return true, nil
    case os.IsNotExist(statErr):
        return false, nil
    default:
        return false, statErr
    }
}

// ScanFiles returns the paths of the regular entries (everything that is not a
// directory) directly inside fileDir, in the filename order produced by
// ioutil.ReadDir. Each path is fileDir joined with the entry name; a separator
// is inserted when fileDir does not already end with one.
//
// A missing directory yields nil without any message. Any other failure to
// list the directory is printed and also yields nil.
func ScanFiles(fileDir string) []string {
    entries, err := ioutil.ReadDir(fileDir)
    if err != nil {
        // "Does not exist" is an expected situation (no exams of this kind
        // for the patient); everything else is worth reporting.
        if !os.IsNotExist(err) {
            fmt.Printf("get dir error![%v]\n", err)
        }
        return nil
    }

    // Paths are built by concatenation, so make sure there is a separator
    // between the directory and the file name.
    if fileDir != "" && !os.IsPathSeparator(fileDir[len(fileDir)-1]) {
        fileDir += "/"
    }

    var fileNameList []string
    for _, entry := range entries {
        if !entry.IsDir() {
            fileNameList = append(fileNameList, fileDir+entry.Name())
        }
    }
    return fileNameList
}

// SaveFile appends one record to ./OctopusData.csv, creating the file if it
// does not exist. The record has six fields: Name, ID, then the label and
// result of OD followed by the label and result of OS. Records are terminated
// with CRLF.
//
// Failure to open the file is fatal. Failures while writing or closing are
// logged, because a silently dropped row would otherwise go unnoticed.
func SaveFile(Name string, ID string, OD Eye, OS Eye) {
    // O_APPEND makes every write land at the end of the file, so no explicit
    // seek is needed.
    nfs, err := os.OpenFile("./OctopusData.csv", os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
    if err != nil {
        log.Fatalf("can not create file, err is %+v", err)
    }
    defer func() {
        if err := nfs.Close(); err != nil {
            log.Printf("can not close file, err is %+v", err)
        }
    }()

    w := csv.NewWriter(nfs)
    w.Comma = ','
    w.UseCRLF = true

    row := []string{Name, ID, OD.whichEye, OD.result, OS.whichEye, OS.result}
    if err := w.Write(row); err != nil {
        log.Printf("can not write, err is %+v", err)
        return
    }
    // The csv writer buffers; Flush pushes the row to the file and Error
    // reports anything that went wrong while doing so.
    w.Flush()
    if err := w.Error(); err != nil {
        log.Printf("can not write, err is %+v", err)
    }
}
// startServer runs the Tika server jar through "cmd /c" and blocks until that
// command exits, then prints what it wrote to standard output. main starts it
// in its own goroutine so the rest of the program can talk to the server while
// it is running.
//
// The jar is looked up relative to the working directory, and the use of cmd
// means this only works where a cmd shell is available (Windows).
//
// A failing command is logged rather than treated as fatal: this function runs
// in the background, and the command also fails when a server from an earlier
// run already holds the port, in which case the program can carry on using
// that server.
func startServer() {
    cmdOutput, err := exec.Command("cmd", "/c", "java -jar tika-server-standard-2.3.0.jar").Output()
    if err != nil {
        log.Printf("tika server command failed: %v", err)
        return
    }

    fmt.Printf("%s", cmdOutput)
}

// runClient sends the file at path to the Tika server on localhost:9998 and
// returns the values extracted from its HTML output.
//
// HtmlParser produces one line per <div>; the first line is stored as the
// right-eye result (OD, label "右") and the second as the left-eye result
// (OS, label "左").
// NOTE(review): this assumes the report's first section is the right eye —
// confirm against a sample Octopus report.
//
// If the server cannot parse the file, the error is printed and both results
// are left empty. A document that yields fewer than two lines leaves the
// missing results empty instead of panicking. Failure to open the file is
// fatal.
func runClient(path string) (OD Eye, OS Eye) {
    OD.whichEye, OS.whichEye = "右", "左"

    f, err := os.Open(path)
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    client := tika.NewClient(nil, "http://localhost:9998")
    body, err := client.Parse(context.Background(), f)
    if err != nil {
        // Without a body there is nothing to extract.
        fmt.Println(err)
        return OD, OS
    }

    // Split always returns at least one element, so only the second index
    // needs a guard.
    result := strings.Split(HtmlParser(body), "\n")
    OD.result = result[0]
    if len(result) > 1 {
        OS.result = result[1]
    }
    return OD, OS
}

// HtmlParser extracts the report values from an HTML document.
//
// For every <div> in the document it gathers the text of the <p> elements
// inside it, takes the first match of reg0, reg1 and reg2 in that text (an
// empty string when a pattern does not match) and emits them as one line,
// separated by single spaces and terminated by "\n". The concatenation of all
// lines is printed to standard output and returned.
//
// A document that cannot be parsed terminates the program.
func HtmlParser(html string) (result string) {
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        log.Fatal(err)
    }

    var lines strings.Builder
    doc.Find("div").Each(func(_ int, div *goquery.Selection) {
        text := div.Find("p").Text()
        fields := []string{
            reg0.FindString(text),
            reg1.FindString(text),
            reg2.FindString(text),
        }
        lines.WriteString(strings.Join(fields, " "))
        lines.WriteString("\n")
    })

    result = lines.String()
    fmt.Printf("%s", result)
    return result
}

// main starts the Tika server in the background, reads the patient names and
// IDs (paired by line position) and appends one CSV row per entry of
// Names.txt.
//
// For each patient the last file listed in ./Exams/<Name><ID>/Octopus/ is
// parsed. An empty row is written when the name is blank, when IDs.txt has no
// matching line, or when the directory holds no files, so that output rows
// stay aligned with the lines of Names.txt.
func main() {
    go startServer()
    Names := ReadFile("./Names.txt")
    IDs := ReadFile("./IDs.txt")
    title := Eye{"眼别", "result"}
    noData := Eye{"", ""}
    // The header row is appended on every run.
    SaveFile("Name", "ID", title, title)

    // Give the server a moment to come up.
    // NOTE(review): a fixed one-second pause may be too short for the JVM to
    // start; consider polling the server until it answers.
    time.Sleep(time.Second)

    for i, Name := range Names {
        // IDs.txt may be shorter than Names.txt; never index past its end.
        if Name == "" || i >= len(IDs) {
            SaveFile("", "", noData, noData)
            continue
        }

        Dir := "./Exams/" + Name + IDs[i] + "/Octopus/"
        files := ScanFiles(Dir)
        if len(files) == 0 {
            SaveFile("", "", noData, noData)
            continue
        }

        OD, OS := runClient(files[len(files)-1])
        SaveFile(Name, IDs[i], OD, OS)
    }
}

  2.2 CrackHumphrey.go

package main

import (
    "bufio"
    "io"
    "io/ioutil"
    "os/exec"
    "context"
    "fmt"
    "encoding/csv"
    "github.com/google/go-tika/tika"
    "github.com/PuerkitoBio/goquery"
    "strings"
    "regexp"
    "log"
    "os"
    "time"
)

// Patterns applied by HtmlParser to the text of each <div> in the Tika output.
// Each pattern is unanchored: it matches the first occurrence of its label
// anywhere in the text and runs to the end of that line ('.' does not match a
// newline by default).
// NOTE(review): the labels look like the Humphrey summary fields (GHT, VFI,
// MD and PSD for the 30-2 test) — confirm against a sample report.
var (
    reg0 = regexp.MustCompile(`GHT:.+`)     // "GHT:" label up to end of line
    reg1 = regexp.MustCompile(`VFI:.+`)     // "VFI:" label up to end of line
    reg2 = regexp.MustCompile(`MD30-2:.+`)  // "MD30-2:" label up to end of line
    reg3 = regexp.MustCompile(`PSD30-2:.+`) // "PSD30-2:" label up to end of line
)

// Eye is one eye's entry in an output CSV row.
type Eye struct {
    // whichEye is the label written to the CSV for this eye.
    whichEye string
    // result is the text extracted from the report for this eye.
    result string
}

// ReadFile reads fileName and returns its lines with leading and trailing
// whitespace (including any '\r') removed.
//
// Blank lines inside the file are kept as empty strings so that the index of
// each returned entry still equals its line position; callers pair the entries
// of two files by index. A newline at the very end of the file does not
// produce an extra empty entry.
//
// If the file cannot be opened, an error message is printed and nil is
// returned. If reading fails part-way, the lines read so far are returned.
func ReadFile(fileName string) (res []string) {
    // Read-only access is all that is needed, and it also works for
    // write-protected files.
    file, err := os.Open(fileName)
    if err != nil {
        fmt.Println("Open file error!", err)
        return nil
    }
    defer file.Close()

    // The size is purely informational, so a Stat failure is reported rather
    // than treated as fatal.
    if stat, err := file.Stat(); err != nil {
        fmt.Println("Stat file error!", err)
    } else {
        fmt.Println("file size=", stat.Size())
    }

    buf := bufio.NewReader(file)
    for {
        line, err := buf.ReadString('\n')
        // ReadString hands back whatever it read together with the error, so
        // a last line without a trailing newline is still delivered here. An
        // empty read means the previous line ended the file; it is not a line.
        if len(line) > 0 {
            res = append(res, strings.TrimSpace(line))
        }
        if err == io.EOF {
            fmt.Println("File read ok!")
            return res
        }
        if err != nil {
            fmt.Println("Read file error!", err)
            return res
        }
    }
}
// PathExists reports whether path exists.
//
// It returns (true, nil) when os.Stat succeeds and (false, nil) when the
// failure is a "does not exist" error. Any other Stat failure is returned as
// (false, err) so the caller can tell "missing" apart from "could not check".
func PathExists(path string) (bool, error) {
    switch _, statErr := os.Stat(path); {
    case statErr == nil:
        return true, nil
    case os.IsNotExist(statErr):
        return false, nil
    default:
        return false, statErr
    }
}

// ScanFiles returns the paths of the regular entries (everything that is not a
// directory) directly inside fileDir, in the filename order produced by
// ioutil.ReadDir. Each path is fileDir joined with the entry name; a separator
// is inserted when fileDir does not already end with one.
//
// A missing directory yields nil without any message. Any other failure to
// list the directory is printed and also yields nil.
func ScanFiles(fileDir string) []string {
    entries, err := ioutil.ReadDir(fileDir)
    if err != nil {
        // "Does not exist" is an expected situation (no exams of this kind
        // for the patient); everything else is worth reporting.
        if !os.IsNotExist(err) {
            fmt.Printf("get dir error![%v]\n", err)
        }
        return nil
    }

    // Paths are built by concatenation, so make sure there is a separator
    // between the directory and the file name.
    if fileDir != "" && !os.IsPathSeparator(fileDir[len(fileDir)-1]) {
        fileDir += "/"
    }

    var fileNameList []string
    for _, entry := range entries {
        if !entry.IsDir() {
            fileNameList = append(fileNameList, fileDir+entry.Name())
        }
    }
    return fileNameList
}

// SaveFile appends one record to ./HumphreyData.csv, creating the file if it
// does not exist. The record has six fields: Name, ID, then the label and
// result of OD followed by the label and result of OS. Records are terminated
// with CRLF.
//
// Failure to open the file is fatal. Failures while writing or closing are
// logged, because a silently dropped row would otherwise go unnoticed.
func SaveFile(Name string, ID string, OD Eye, OS Eye) {
    // O_APPEND makes every write land at the end of the file, so no explicit
    // seek is needed.
    nfs, err := os.OpenFile("./HumphreyData.csv", os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
    if err != nil {
        log.Fatalf("can not create file, err is %+v", err)
    }
    defer func() {
        if err := nfs.Close(); err != nil {
            log.Printf("can not close file, err is %+v", err)
        }
    }()

    w := csv.NewWriter(nfs)
    w.Comma = ','
    w.UseCRLF = true

    row := []string{Name, ID, OD.whichEye, OD.result, OS.whichEye, OS.result}
    if err := w.Write(row); err != nil {
        log.Printf("can not write, err is %+v", err)
        return
    }
    // The csv writer buffers; Flush pushes the row to the file and Error
    // reports anything that went wrong while doing so.
    w.Flush()
    if err := w.Error(); err != nil {
        log.Printf("can not write, err is %+v", err)
    }
}
// startServer runs the Tika server jar through "cmd /c" and blocks until that
// command exits, then prints what it wrote to standard output. main starts it
// in its own goroutine so the rest of the program can talk to the server while
// it is running.
//
// The jar is looked up relative to the working directory, and the use of cmd
// means this only works where a cmd shell is available (Windows).
//
// A failing command is logged rather than treated as fatal: this function runs
// in the background, and the command also fails when a server from an earlier
// run already holds the port, in which case the program can carry on using
// that server.
func startServer() {
    cmdOutput, err := exec.Command("cmd", "/c", "java -jar tika-server-standard-2.3.0.jar").Output()
    if err != nil {
        log.Printf("tika server command failed: %v", err)
        return
    }

    fmt.Printf("%s", cmdOutput)
}


// runClient sends the file at path to the Tika server on localhost:9998 and
// returns the values extracted from its HTML output.
//
// HtmlParser produces one line per <div>; the second line is stored as the
// right-eye result (OD, label "右") and the first as the left-eye result
// (OS, label "左").
// NOTE(review): this assumes the report's first section is the left eye —
// confirm against a sample Humphrey report.
//
// If the server cannot parse the file, the error is printed and both results
// are left empty. A document that yields fewer than two lines leaves the
// missing results empty instead of panicking. Failure to open the file is
// fatal.
func runClient(path string) (OD Eye, OS Eye) {
    OD.whichEye, OS.whichEye = "右", "左"

    f, err := os.Open(path)
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    client := tika.NewClient(nil, "http://localhost:9998")
    body, err := client.Parse(context.Background(), f)
    if err != nil {
        // Without a body there is nothing to extract.
        fmt.Println(err)
        return OD, OS
    }

    // Split always returns at least one element, so only the second index
    // needs a guard.
    result := strings.Split(HtmlParser(body), "\n")
    OS.result = result[0]
    if len(result) > 1 {
        OD.result = result[1]
    }
    return OD, OS
}

// HtmlParser extracts the report values from an HTML document.
//
// For every <div> in the document it gathers the text of the <p> elements
// inside it, takes the first match of reg0, reg1, reg2 and reg3 in that text
// (an empty string when a pattern does not match) and emits them as one line,
// separated by single spaces and terminated by "\n". The concatenation of all
// lines is printed to standard output and returned.
//
// A document that cannot be parsed terminates the program.
func HtmlParser(html string) (result string) {
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
    if err != nil {
        log.Fatal(err)
    }

    var lines strings.Builder
    doc.Find("div").Each(func(_ int, div *goquery.Selection) {
        text := div.Find("p").Text()
        fields := []string{
            reg0.FindString(text),
            reg1.FindString(text),
            reg2.FindString(text),
            reg3.FindString(text),
        }
        lines.WriteString(strings.Join(fields, " "))
        lines.WriteString("\n")
    })

    result = lines.String()
    fmt.Printf("%s", result)
    return result
}


// main starts the Tika server in the background, reads the patient names and
// IDs (paired by line position) and appends one CSV row per entry of
// Names.txt.
//
// For each patient the last file listed in ./Exams/<Name><ID>/Humphrey/ is
// parsed. An empty row is written when the name is blank, when IDs.txt has no
// matching line, or when the directory holds no files, so that output rows
// stay aligned with the lines of Names.txt.
func main() {
    go startServer()
    Names := ReadFile("./Names.txt")
    IDs := ReadFile("./IDs.txt")
    title := Eye{"眼别", "result"}
    noData := Eye{"", ""}
    // The header row is appended on every run.
    SaveFile("Name", "ID", title, title)

    // Give the server a moment to come up.
    // NOTE(review): a fixed one-second pause may be too short for the JVM to
    // start; consider polling the server until it answers.
    time.Sleep(time.Second)

    for i, Name := range Names {
        // IDs.txt may be shorter than Names.txt; never index past its end.
        if Name == "" || i >= len(IDs) {
            SaveFile("", "", noData, noData)
            continue
        }

        Dir := "./Exams/" + Name + IDs[i] + "/Humphrey/"
        files := ScanFiles(Dir)
        if len(files) == 0 {
            SaveFile("", "", noData, noData)
            continue
        }

        OD, OS := runClient(files[len(files)-1])
        SaveFile(Name, IDs[i], OD, OS)
    }
}

  用Markdown [5] 记笔记、排版公众号以及写Blog真的是太方便了,文末推荐一波Markdown的软件:

  • Joplin [6]:记笔记十分方便,支持各种平台并且可以同步数据
  • Onemark [7]:Onenote的插件,重新定义了Onenote,让你在Onenote中也能用Markdown语法记笔记
  • Markdown Nice [8]:不要被它看似不相关的网页吓唬到,它的产品包括网页版、Chrome插件版和桌面版程序(前两者免费,桌面版则需要付费)。网页版注册即可免费使用,多种主题可选,排版完成后可以直接复制到微信公众号及知乎等平台,用Markdown排版出来的微信公众号及知乎推文十分漂亮。

3. 参考资料


  1. 由java写成的PDF文件解析包:https://tika.apache.org/download.html ↩︎

  2. Go语言用来与tika交互的程序包:https://github.com/google/go-tika ↩︎

  3. 参考CSDN配置Java环境 ↩︎

  4. 参考CSDN配置Go环境:https://golang.google.cn/doc/install?download=go1.12.6.windows-amd64.msi ↩︎

  5. Markdown是一种轻量级的标记语言,非常易学,几分钟就能学会基本操作。 ↩︎

  6. 下载链接:https://joplinapp.org/ ↩︎

  7. 下载链接:https://neuxlab.cn/ ↩︎

  8. 排版链接:https://editor.mdnice.com/ ↩︎

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值