Python Golang 解析web日志正则一例

有一部分日志文件解析的需求,目前是用 Python 实现的,想看看 Golang 下的表现怎么样。由于对 Golang 没那么熟悉,所以没有做什么优化,这里只对比两种语言用正则提取字段的代码和性能,仅供参考和思考。

环境

  • macOS 10.13.x 15年版本
  • python2.7 brew直接安装的
  • go 1.8.1
  • 都使用各自标准库内置的正则库(Python 的 re,Go 的 regexp)

日志格式

188.24.51.81 - - [01/Feb/2018:14:49:16 CST] "GET http://udn-plus.cedexis-test.com/img/35062/iuni2.html?rnd=-1-1-13960-0-0-35062-3705136164-_CgJqMRAUGEYiBQgBEIhtKKTI3-YNMJvPXDjp8MrTBUDW_tnzDEoQCAMQtAEYhEQgACirjoCgBFAAWgoIABAAGAAgACgAYABqGmJ1dHRvbi13b3JrZXIyLmFtcy5odi5wcm9kggEQCAMQtAEYhEQgACiwjoCgBIgBvLHMiQU HTTP/1.1" 200 0 1008 1412 "http://stardust-rain.tumblr.com/ask" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" "-" "-" "-" "-" LLNW
195.142.179.194 - - [01/Feb/2018:14:54:25 CST] "GET http://udn-plus.cedexis-test.com/img/35935/r20.gif?rnd=0-1-13960-0-0-35935-2572944071-_CgJqMRAUGEYiBQgBEIhtKMeF8MoJMJzPXDie88rTBUDc2MgyShEIBBDWARiokQIgACiwkYCgBFAAWgoIABAAGAAgACgAYABqGmJ1dHRvbi13b3JrZXIxLmFtcy5odi5wcm9kggERCAQQ1gEYqJECIAAosJGAoASIAcOdzpUM HTTP/1.1" 200 0 43 445 "http://bigboy1977.tumblr.com/post/153677022974" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-" "-" "-" "-" LLNW

regex.py

import re

# Regular expression for a single access-log line. Every named group captures
# one field; the adjacent literals are concatenated into one pattern string.
# Raw strings are used so the regex backslashes (\d, \., \[) are not treated
# as Python string escapes.
t = (r'^(?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) .* '
     r'\[(?P<time_local>.*?)\] '
     r'"(?P<request>.*?)" '
     r'(?P<status>[^ ]*) '
     r'(?P<request_time>[^ ]*) '
     r'(?P<body_bytes_sent>[^ ]*) '
     r'(?P<bytes_sent>[^ ]*) '
     r'"(?P<http_referer>[^"]*)" '
     r'"(?P<http_user_agent>[^"]*)" '
     r'"(?P<http_x_forwarded_for>[^"]*)" '
     r'(?P<connection>[^ ]*) '
     r'"(?P<hit>[^"]*)" '
     r'"(?P<server_addr>[^"]*)" '
     r'(?P<cdn>.*)')

def parser(filename):
    """Run the log regex against every line of ``filename``.

    The match results are discarded and nothing is returned; the function
    only exercises the regex over the file.
    """
    regex = re.compile(t)
    with open(filename, 'r') as f:
        # Iterate the file object directly so lines are read lazily instead
        # of loading the whole file into a list with readlines().
        for line in f:
            # Use the compiled pattern's own method rather than passing it
            # back through the module-level re.search().
            regex.search(line)

# Parse test.log ten times in a row.
for _ in range(10):
    parser('test.log')

regex.go

package main 

import (
    "fmt"
    "regexp"
    "os"
    "bufio"
)

// myRegexp embeds *regexp.Regexp so that helper methods can be added while
// every method of the standard Regexp type stays available.
type myRegexp struct {
	*regexp.Regexp
}

// FindStringSubmatchMap matches s against the pattern and returns the text
// captured by each named group, keyed by the group's name.
//
// The returned map is never nil; it is empty when s does not match. Unnamed
// groups are left out, because they would otherwise all collide on the ""
// key. A named group that took no part in the match maps to "".
func (r *myRegexp) FindStringSubmatchMap(s string) map[string]string {
	match := r.FindStringSubmatch(s)
	if match == nil {
		return map[string]string{}
	}

	names := r.SubexpNames()
	captures := make(map[string]string, len(names))
	for i, name := range names {
		// Index 0 is the whole match and always has an empty name, so this
		// check skips it together with any unnamed groups.
		if name == "" {
			continue
		}
		captures[name] = match[i]
	}
	return captures
}

// parseFile scans the file at path line by line and runs re against every
// line. The capture maps are discarded. It returns any error from opening
// or reading the file.
func parseFile(re *myRegexp, path string) error {
	inFile, err := os.Open(path)
	if err != nil {
		return err
	}
	// Deferring inside this helper closes the file at the end of each pass;
	// a defer placed in main's loop would hold every handle open until exit.
	defer inFile.Close()

	// bufio.Scanner splits on lines by default.
	scanner := bufio.NewScanner(inFile)
	for scanner.Scan() {
		re.FindStringSubmatchMap(scanner.Text())
	}
	// Scan returns false on both EOF and failure (for example an over-long
	// line), so the error has to be checked explicitly.
	return scanner.Err()
}

// main compiles the access-log pattern once and runs it over test.log ten
// times, exiting with status 1 on the first file error.
func main() {
	re2str := `^(?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) .* ` +
		`\[(?P<time_local>.*?)\] ` +
		`"(?P<request>.*?)" ` +
		`(?P<status>[^ ]*) ` +
		`(?P<request_time>[^ ]*) ` +
		`(?P<body_bytes_sent>[^ ]*) ` +
		`(?P<bytes_sent>[^ ]*) ` +
		`"(?P<http_referer>[^"]*)" ` +
		`"(?P<http_user_agent>[^"]*)" ` +
		`"(?P<http_x_forwarded_for>[^"]*)" ` +
		`(?P<connection>[^ ]*) ` +
		`"(?P<hit>[^"]*)" ` +
		`"(?P<server_addr>[^"]*)" ` +
		`(?P<cdn>.*)`

	re2 := myRegexp{regexp.MustCompile(re2str)}

	for i := 0; i < 10; i++ {
		if err := parseFile(&re2, "test.log"); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
}

测试

$ time python regex.py
python regex.py  11.55s user 0.53s system 99% cpu 12.197 total

$ time ./regex
./regex  53.85s user 1.16s system 97% cpu 56.379 total

看到这个结果还是有点惊讶的,Golang 使用的时间是 Python 的 4 倍多。于是 Google 了一下,发现有多个 issue 提到这个问题,例如 https://github.com/golang/go/issues/19629 。Golang 的 regexp 库是基于 https://github.com/google/re2 的设计实现的,但是很多优化工作还没做完,看起来也在努力改进。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值