python解析url的关键字

近期刚接触python,主要用于分析网站用户访问的日志,其中涉及到解析日志中的关键字。该业务主要需要解决以下几个问题:
        1、访客使用的搜索引擎关键字标志不同,如百度中搜索‘大数据’
           https://www.baidu.com/s?f=8&rsv_bp=1&rsv_idx=1&word=%E5%A4%A7%E6%95%B0%E6%8D%AE&tn=91483420_s_hao_pg
           关键词标志为word
           在谷歌中搜索‘大数据’
            https://www.google.com.hk/?gws_rd=ssl#safe=strict&q=%E5%A4%A7%E6%95%B0%E6%8D%AE
            关键词标志为q
        2、相同搜索引擎的不同搜索方式,产生的关键字标志也未必相同,百度中就可能以wd作为关键字的标志。
            https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=91483420_s_hao_pg&wd=%E5%A4%A7%E6%95%B0%E6%8D%AE&oq=%E5%A4%A7%E6%95%B0%E6%8D%AE&rsv_pq=eaf35f8e00003566&rsv_t=17aa2xBQlaQi4DN3AX5SdzIaXVFjARA4pfZnag9PfymOMaWmUFdUUgIuqyJNp3ItorvpNPFN3%2FU&rqlang=cn&rsv_enter=1&rsv_sug3=2&rsv_sug1=2&rsv_sug7=101&rsv_sug2=0&inputT=1432&rsv_sug4=1432
        3、不同搜索引擎的编码方式不同,如国内的搜搜网页
           http://www.soso.cn/search.asp?want=&search=%B4%F3%CA%FD%BE%DD&engine=
           采用的是gbk编码
针对前两个问题,将常用的搜索引擎及关键字构建成一个搜索引擎字典:
# Maps a search engine name to a tuple. The leading items are the query
# parameter names that may carry the search keyword; the LAST item is the
# encoding the engine is expected to use for that keyword.
# 'wd' is included for Baidu because Baidu result URLs commonly carry the
# keyword as wd=... (in addition to word=...).
engineList = {
    "Baidu":('q','word','wd','kw','utf8'),
    "Google":('q','query','keywords','utf8'),
    "Sogou":('query','keyword','utf8'),
    "Chinaso":('q','utf8'),
    "Yahoo":('p','q','utf8'),
    "Soso":('search','q','gb2312'),
    "Youdao":('q','utf8'),
    "Bing":('q','utf8'),
    "Easou":('q','utf8'),
    "360search":('q','utf8'),
    "sm.cn":('q','utf8')
}

对于第三个问题,使用python的异常机制来判断URL中关键词的编码:

def decode_keyword(keyword):
    """Percent-decode a query-string value and decode it to text.

    The value is treated as form-encoded, so '+' is turned into a space
    (search engines encode "big data" as "big+data"); a literal plus sign
    arrives as %2B and is preserved.

    The resulting bytes are decoded as UTF-8 first and as GBK second.

    :param keyword: raw (still percent-encoded) parameter value.
    :return: the decoded text, or the undecoded byte string when neither
             UTF-8 nor GBK can decode it.
    """
    try:
        # Python 2: str in, byte string out.
        from urllib import unquote_plus as unquote_raw
    except ImportError:
        # Python 3: unquote straight to bytes so the encoding can be probed.
        from urllib.parse import unquote_to_bytes

        def unquote_raw(text):
            return unquote_to_bytes(text.replace('+', ' '))

    raw = unquote_raw(keyword)
    # Order matters: GBK accepts most byte sequences, so UTF-8 is tried first.
    for encoding in ('utf-8', 'gbk'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw

完整代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Set the encoding used while parsing the log (see the sys calls below).

import sys
import urllib
from urlparse import urlparse
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back. The default
# encoding for implicit str/unicode conversions is then switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Maps a search engine name to a tuple. The leading items are the query
# parameter names that may carry the search keyword; the LAST item is the
# encoding the engine is expected to use for that keyword (parseParam drops
# it before matching parameter names).
# 'wd' is included for Baidu because Baidu result URLs commonly carry the
# keyword as wd=... (in addition to word=...).
engineList = {
    "Baidu":('q','word','wd','kw','utf8'),
    "Google":('q','query','keywords','utf8'),
    "Sogou":('query','keyword','utf8'),
    "Chinaso":('q','utf8'),
    "Yahoo":('p','q','utf8'),
    "Soso":('search','q','gb2312'),
    "Youdao":('q','utf8'),
    "Bing":('q','utf8'),
    "Easou":('q','utf8'),
    "360search":('q','utf8'),
    "sm.cn":('q','utf8')
}

# Shared, mutable parse state that parseLine hands to parseUrl and parseKey.
# NOTE(review): this name shadows the builtin `dict` for the whole module.
dict = {}
def main():
    """Parse every line of keyword.txt (one URL per line).

    The file is read from the current working directory and each line is
    handed to parseLine unchanged (including its trailing newline).
    """
    # Iterating the file stops at EOF; the previous readline() loop never
    # left `while True` once the file was exhausted. `with` also guarantees
    # the handle is closed.
    with open("keyword.txt") as logFile:
        for line in logFile:
            parseLine(line)

def parseLine(line):
    """Run both parsing passes over one log line.

    The passes share the module-level ``dict``: parseUrl runs first and
    parseKey second, each receiving the same line.
    """
    for parseStep in (parseUrl, parseKey):
        parseStep(line, dict)

def parseUrl(line,dict):
    """Identify the search engine a URL belongs to.

    Writes two entries into ``dict``:
      * 'searchname' - the engine name (a key of engineList) when the host
        belongs to a known engine, otherwise the host name itself
        (the string 'None' when the URL has no host);
      * 'issearch'   - 1 for a known engine, -1 otherwise.

    :param line: the URL to inspect.
    :param dict: mutable mapping that receives the two entries above.
    """
    # Checked in order; the first matching domain wins.
    engineDomains = (
        ("baidu.com", "Baidu"),
        ("google.com", "Google"),
        ("sogou.com", "Sogou"),
        ("chinaso.com", "Chinaso"),
        ("yahoo.com", "Yahoo"),
        ("soso.cn", "Soso"),
        ("youdao.com", "Youdao"),
        ("bing.com", "Bing"),
        ("easou.com", "Easou"),
        ("so.com", "360search"),
        ("sm.cn", "sm.cn"),
    )
    url = urlparse(line)
    searchName = str(url.hostname)
    # Surround host and domain with dots so a domain only matches whole DNS
    # labels: "so.com" must not match "www.picasso.com", while "google.com"
    # still matches regional hosts such as "www.google.com.hk".
    paddedHost = "." + searchName + "."
    for domain, engine in engineDomains:
        if "." + domain + "." in paddedHost:
            dict['searchname'] = engine
            dict['issearch'] = 1
            return
    dict['searchname'] = searchName
    dict['issearch'] = -1

def parseKey(line,dict):
    """Split a URL line into tokens and pass each candidate to parseParam.

    '/', '?' and '&' are all treated as separators. The first four tokens
    are skipped; for a URL shaped like ``scheme://host/path?a=1&b=2`` those
    are the scheme, the empty string between the two slashes, the host and
    the first path segment.

    :param line: the URL line, possibly ending in a newline.
    :param dict: parse state forwarded to parseParam.
    """
    # Strip first: otherwise the line's trailing newline/spaces end up inside
    # the value of the last parameter (and therefore inside the keyword).
    tokens = line.strip().replace('/','&').replace('?','&').split('&')
    for token in tokens[4:]:
        parseParam(token,dict)

def parseParam(l, dict):
    """Record the search keyword if ``l`` is a keyword parameter.

    Does nothing unless dict['issearch'] == 1. Otherwise ``l`` is read as
    ``name=value``; when ``name`` is one of the keyword parameter names that
    engineList lists for dict['searchname'], the decoded value is stored in
    dict['keyword'] and printed.

    :param l: a single ``name=value`` token from the URL.
    :param dict: parse state; reads 'issearch' and 'searchname', writes
                 'keyword'.
    """
    if dict['issearch'] != 1:
        return
    # partition splits on the FIRST '=' only, so values that themselves
    # contain '=' are kept whole, and a token without '=' no longer raises
    # IndexError.
    name, separator, value = l.partition('=')
    if not separator:
        return
    keys = engineList[dict['searchname']]
    # The last tuple item is the engine's encoding, not a parameter name.
    if name in keys[:-1]:
        dict['keyword']=decode_keyword(value)
        print(dict['keyword'])

def decode_keyword(keyword):
    """Percent-decode a query-string value and decode it to text.

    The value is treated as form-encoded, so '+' is turned into a space
    (search engines encode "big data" as "big+data"); a literal plus sign
    arrives as %2B and is preserved.

    The resulting bytes are decoded as UTF-8 first and as GBK second.

    :param keyword: raw (still percent-encoded) parameter value.
    :return: the decoded text, or the undecoded byte string when neither
             UTF-8 nor GBK can decode it.
    """
    try:
        # Python 2: str in, byte string out.
        from urllib import unquote_plus as unquote_raw
    except ImportError:
        # Python 3: unquote straight to bytes so the encoding can be probed.
        from urllib.parse import unquote_to_bytes

        def unquote_raw(text):
            return unquote_to_bytes(text.replace('+', ' '))

    raw = unquote_raw(keyword)
    # Order matters: GBK accepts most byte sequences, so UTF-8 is tried first.
    for encoding in ('utf-8', 'gbk'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#keyword.txt
https://www.baidu.com/s?wd=%E6%AD%A6%E5%BF%A0%E5%81%A5&rsv_spt=1&rsv_iqid=0xbe3298bf0001da8c&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=1
https://www.google.com.hk/?gws_rd=ssl#safe=strict&q=%E5%A4%A7%E6%95%B0%E6%8D%AE
https://www.sogou.com/web?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&_asf=www.sogou.com&_ast=&w=01019900&p=40040100&ie=utf8&from=index-nologin&sut=1487&sst0=1482993265601&lkt=0%2C0%2C0
http://www.chinaso.com/search/pagesearch.htm?q=%E5%A4%A7%E6%95%B0%E6%8D%AE
https://search.yahoo.com/search;_ylc=X3oDMTFiN25laTRvBF9TAzIwMjM1MzgwNzUEaXRjAzEEc2VjA3NyY2hfcWEEc2xrA3NyY2h3ZWI-?p=%E5%A4%A7%E6%95%B0%E6%8D%AE&fr=yfp-t&fp=1&toggle=1&cop=mss&ei=UTF-8
http://www.soso.cn/search.asp?want=&search=%B4%F3%CA%FD%BE%DD&engine=   
http://www.youdao.com/search?keyfrom=navindex.normal.searchbox&T1=1482992913989&q=%E5%A4%A7%E6%95%B0%E6%8D%AE
http://cn.bing.com/search?q=%E5%A4%A7%E6%95%B0%E6%8D%AE&go=%E6%8F%90%E4%BA%A4&qs=n&form=QBLH&sp=-1&pq=%E5%A4%A7%E6%95%B0%E6%8D%AE&sc=8-3&sk=&cvid=02BF61C729E04CB39B16D91602092F44
http://i.easou.com/s.m?idx=1&sty=1&q=%E5%A4%A7%E6%95%B0%E6%8D%AE&prefix=100&cid=paw&fr=9.1005.2.2&esid=HeCvH5j3kDA&wver=dsp
https://www.so.com/s?ie=utf-8&shb=1&src=360sou_newhome&q=%E5%A4%A7%E6%95%B0%E6%8D%AE
http://m.sm.cn/s?q=%E5%A4%A7%E6%95%B0%E6%8D%AE&uc_param_str=dnntnwvepffrgibijbprsvdsme&from=smor&safe=1&snum=0
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值