解析url用的类库:
python2版本:
from urlparse import urlparse; import urllib
python3版本:
from urllib.parse import urlparse; import urllib.request
研究了不同的url规则后发现:如果搜索关键字是用“=”拼接的,关键字就在解析后的query里;
如果不是用“=”拼接的,关键字就在解析后的path里。
解析的规则都是一样的,正则如下:(6种不同情况的组合)
另外host为‘s.weibo.com’的url编码与其他不同要另做处理。
代码如下:有些网站的规则还不是很清楚,需要花大量时间找规则,规则越清晰,关键字就越清楚,如下规则已适合绝大部分网站,酌情参考。
-
# -*- coding:utf-8 -*-
-
-
import re
import urllib

try:  # Python 3
    from urllib.parse import unquote, urlparse
except ImportError:  # Python 2
    from urllib import unquote
    from urlparse import urlparse
-
-
# Path of the text file that holds the URLs to analyse.
source_txt = "E:\\python_Anaconda_code\\url.txt"

# Pattern for one keyword: the longest run of word characters and %XX escapes
# in any order. This covers the six word/escape orderings (word-escape-word,
# escape-word-escape, word-escape, escape-word, escape only, word only) and,
# unlike a plain alternation of those six, does not stop after the first
# cycle -- so "%E4%BD%A0123mm%20%E5%97%AF%20mm11" is captured in full instead
# of being cut before the trailing "mm11".
# The outer capturing group (group 1) holds the whole keyword.
regular = r'((?:%\w\w|\w)+)'

# Container for extracted keywords.
# NOTE(review): nothing in the visible code appends to it -- confirm whether
# it is still needed.
kw_list = list()
-
-
# Lookup table: key is the host of a site to analyse, value is the marker that
# the search keyword is attached to in that site's URLs. Markers containing
# "=" are query parameters; the others are path fragments.
# NOTE: the name shadows the builtin ``dict``; it is kept because other code
# in this file refers to it by this name.
dict = {
    "www.baidu.com": "wd=",
    "news.baidu.com": "word=",
    "www.sogou.com": "query=",
    "tieba.baidu.com": "kw=",
    "wenku.baidu.com": "word=",
    "music.sina.com.cn": "k=",
    "www.haosou.com": "q=",
    "www.lagou.com": "list_",
    "www.chunyuyisheng.com": "query=",
    "s.weibo.com": "weibo/",
}
-
-
def Main(path=None, rules=None, pattern=None):
    """Print the search keyword carried by each recognised URL in a text file.

    Every non-blank line of the file is treated as one URL. Lines whose host
    is not a key of ``rules`` (including lines that are not absolute URLs)
    are skipped. For a recognised host the marker from ``rules`` decides
    where the keyword is looked up:

    * marker contains ``=``: it is a query parameter, searched in the query
      string and only at a parameter boundary (so ``q=`` does not match
      inside ``pq=``);
    * otherwise: it is a path fragment, searched in the path after turning
      ``%25`` back into ``%`` (undoes double percent-encoding).

    ``+`` characters are removed before matching, so words separated by
    ``+`` are joined. The matched keyword is percent-decoded and printed.

    Args:
        path: File to read. Defaults to the module-level ``source_txt``.
        rules: Mapping of host name to keyword marker. Defaults to the
            module-level ``dict``.
        pattern: Regular expression matching one keyword. Defaults to the
            module-level ``regular``.

    Returns:
        None. Keywords are written to standard output, one per line.
    """
    # Module-level defaults are resolved at call time, not definition time.
    if path is None:
        path = source_txt
    if rules is None:
        rules = dict
    if pattern is None:
        pattern = regular

    with open(path, 'r') as f_source_txt:
        for line in f_source_txt:
            url = line.strip()
            if not url:
                continue

            try:
                parsed = urlparse(url)
                host = parsed.hostname or ''
            except ValueError:
                # Malformed URL (e.g. unbalanced IPv6 brackets): skip it.
                continue

            flag = rules.get(host)
            if flag is None:
                continue

            if '=' in flag:
                text = parsed.query.replace('+', '')
                # Anchor on a parameter boundary so a short marker cannot
                # match the tail of a longer parameter name.
                boundary = r'(?:^|&)'
            else:
                text = parsed.path.replace('+', '').replace('%25', '%')
                boundary = ''

            # The named group isolates the keyword itself, so it is never
            # re-split on the marker (a keyword may contain the marker text).
            found = re.search(
                boundary + re.escape(flag) + '(?P<_kw>' + pattern + ')',
                text,
                re.I,
            )
            if found:
                print(unquote(found.group('_kw')))


if __name__ == '__main__':
    Main()
url.txt的内容如下:
-
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&ch=&tn=baidu&bar=&wd=python&rn=&oq=&rsv_pq=ece0867c0002c793&rsv_t=edeaQq7DDvZnxq%2FZVra5K%2BEUanlTIUXhGIhvuTaqdfOECLuXR25XKDp%2Bi0I&rqlang=cn&rsv_enter=1&inputT=218
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python%E9%87%8C%E7%9A%84%E5%AD%97%E5%85%B8dict&oq=python&rsv_pq=96c160e70003f332&rsv_t=0880NkOvMIr3TvOdDP1t8EbloD8qwr4yeP6CfPjQihQNNhdExfuwyOFMrx0&rqlang=cn&rsv_enter=0&inputT=10411
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python%E9%87%8C%E7%9A%84urlprese&oq=python%25E9%2587%258C%25E7%259A%2584re%25E9%2587%258C%25E7%259A%2584%257C%25E6%2580%258E%25E4%25B9%2588%25E7%2594%25A8&rsv_pq=d1d4e7b90003d391&rsv_t=5ff4Vok4EELK1PgJ4oSk8L0VvKAn51%2BL8ns%2FjSubexg7Lb7znKcTvnVtn8M&rqlang=cn&rsv_enter=1&inputT=2797
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python++wo+%E7%88%B1urlprese&oq=python%25E9%2587%258C%25E7%259A%2584urlprese&rsv_pq=eecf45e900033e87&rsv_t=1c70xAYhrvw5JOZA7lpVgt4pw%2BW1TO8hqTejTh67JgEQfqAGyDydd25HAmU&rqlang=cn&rsv_enter=0&inputT=10884
http://news.baidu.com/ns?word=%E8%B6%B3%E7%90%83&tn=news&from=news&cl=2&rn=20&ct=1
http://news.baidu.com/ns?ct=1&rn=20&ie=utf-8&bs=%E8%B6%B3%E7%90%83&rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=++++++%E8%B6%B3++%E7%90%83+++++%E4%BD%A0%E5%A5%BD+%E5%98%9B%EF%BC%9F&rsv_sug3=14&rsv_sug4=912&rsv_sug1=4&inputT=8526
http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BA%A2%E6%B5%B7%E8%A1%8C%E5%8A%A8&fr=search&red_tag=q0224393377
https://www.sogou.com/web?query=ni+zai+%E6%88%91+%E5%BF%83li&_asf=www.sogou.com&_ast=1520388441&w=01019900&p=40040100&ie=utf8&from=index-nologin&s_from=index&sut=9493&sst0=1520388440692&lkt=8%2C1520388431200%2C1520388436842&sugsuv=1498714959961744&sugtime=1520388440692
https://www.lagou.com/jobs/list_python%E5%A4%A7%E6%95%B0%E6%8D%AEmr?labelWords=&fromSearch=true&suginput=
https://www.chunyuyisheng.com/pc/search/?query=%E6%85%A2%E6%80%A7%E4%B9%99%E8%82%9D%
http://s.weibo.com/weibo/%25E5%2594%2590%25E4%25BA%25BA%25E8%25A1%2597%25E6%258E%25A2%25E6%25A1%25882&Refer=index
http://s.weibo.com/weibo/%25E4%25BD%25A0%25E5%25A5%25BD123mm%2520%25E5%2597%25AF%2520mm11&Refer=STopic_box
结果如下:
如果要研究其他host,可以加到字典dict里。
备注:以上代码和思路仅供参考,如有更好的方法敬请留言!