百度指数的数值经过图片化处理,很难直接抓取,因此下面的代码改为抓取 index.so.com 的指数数据。
爬取代码基于 Python 3.x:
#!/usr/bin/env python
#-*- encoding: utf-8 -*-
# refer to http://blog.csdn.net/wangtaoking1/article/details/18308635
import http.cookiejar
from urllib import request
from urllib.parse import quote
HTTP_PROXY = '10.13.61.118:6666'
def getOpener(head, enable_proxy=False, proxy=None):
    """Build a urllib opener with cookie support and default request headers.

    Args:
        head: Mapping of header name to header value. Its items become the
            opener's default headers (``opener.addheaders``).
        enable_proxy: When true, send http and https requests through a proxy.
        proxy: ``host:port`` of the proxy to use when ``enable_proxy`` is true.
            Defaults to the module-level ``HTTP_PROXY`` constant.

    Returns:
        The configured ``urllib.request.OpenerDirector``.
    """
    # Each opener gets its own in-memory cookie jar.
    handlers = [request.HTTPCookieProcessor(http.cookiejar.CookieJar())]

    # Only build the proxy handler when it is actually going to be used.
    if enable_proxy:
        address = HTTP_PROXY if proxy is None else proxy
        handlers.append(
            request.ProxyHandler({"http": address, "https": address})
        )

    # debuglevel=1 makes http.client echo plain-http traffic to stdout;
    # set it to 0 to silence the output.
    handlers.append(request.HTTPHandler(debuglevel=1))

    opener = request.build_opener(*handlers)
    # request.install_opener(opener) is intentionally not called: the opener
    # is returned to the caller instead of being installed globally.
    opener.addheaders = list(head.items())
    return opener
import os,json
def _decode_body(raw, content_encoding=None):
    """Undo the HTTP Content-Encoding of *raw* and decode it as UTF-8 text.

    Raises:
        ValueError: If the body cannot be decompressed or is not valid UTF-8
            (``UnicodeDecodeError`` is a ``ValueError`` subclass).
    """
    import gzip
    import zlib

    encoding = (content_encoding or '').strip().lower()
    try:
        if encoding in ('gzip', 'x-gzip'):
            raw = gzip.decompress(raw)
        elif encoding == 'deflate':
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # header; a negative wbits value tells zlib to expect that.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    except (OSError, EOFError, zlib.error) as err:
        raise ValueError(
            'cannot decompress %r response body' % encoding
        ) from err
    return raw.decode('utf-8')


def _parse_month_index(raw, content_encoding=None):
    """Return the integer ``month_index`` found in a response body, or -1."""
    try:
        payload = json.loads(_decode_body(raw, content_encoding))
        index = payload['data'][0]['data']['month_index']
    except (ValueError, LookupError, TypeError):
        # Undecodable body, invalid JSON, or an unexpected JSON layout.
        return -1
    # bool is a subclass of int, so rule it out explicitly.
    if isinstance(index, bool) or not isinstance(index, int):
        return -1
    return index


def main(school="江苏经贸职业技术学院", opener=None):
    """Fetch the monthly index of *school* from index.so.com.

    Args:
        school: Keyword to look up; it is URL-quoted into the query string.
        opener: Optional object with an ``open(url)`` method (for example an
            opener returned by ``getOpener``). When omitted, a fresh opener
            carrying browser-like default headers is created for this call.

    Returns:
        The string ``"<school>=<month_index>"``. The index is ``-1`` when the
        response cannot be decoded or parsed, or when ``month_index`` is
        missing or not an integer.

    Raises:
        urllib.error.URLError: Network failures from the opener are not caught.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        # Only advertise encodings that _decode_body can actually undo.
        'Accept-Encoding': 'gzip, deflate',
    }
    if opener is None:
        opener = getOpener(header)

    url = "http://index.so.com/index.php?a=overviewJson&q=%s&area=%s" % (
        quote(school),
        quote("全国"),
    )
    # Close the response once the body and its encoding header are read.
    with opener.open(url) as response:
        raw = response.read()
        content_encoding = response.headers.get('Content-Encoding')

    return '%s=%d' % (school, _parse_month_index(raw, content_encoding))
import time
if __name__ == '__main__':
    # Read the keyword list first, so index.txt is not truncated when the
    # input file is missing or unreadable. The output is written as UTF-8,
    # so the input is read as UTF-8 too ('utf-8-sig' also accepts a BOM).
    with open("school_list.txt", encoding='utf-8-sig') as source:
        schools = [name.strip() for name in source.read().splitlines()]

    with open("index.txt", 'w', encoding='utf-8') as fp:
        for school in schools:
            if not school:
                # A blank line would send a query with an empty keyword.
                continue
            time.sleep(1)  # pause one second before each request
            fp.write(main(school) + '\n')
            # Flush per line so finished results survive a later failure.
            fp.flush()
查看排行(按指数从高到低排序):
cat index.txt | sort -t= -k2 -nr | less