【seo工具】关键词挖掘,获取关键词数据比较基础的部分,渠道很多,这次我们来采集凤巢的关键词数据,对关键词的分析,以及使用才是重点!我们先解决基础部分:
每次使用脚本需抓包获取cookies数据,登录百度凤巢后台
找到返回json关键词数据的链接作为请求对象
构造header信息,假装你是个人类
构造data信息,用作post
使用pycurl模块请求;返回的是json数据,本可用json模块解析,这边直接用正则提取
写入到csv文件当中,就酱了
#coding:utf-8
import pycurl,StringIO,json,urllib,re,time,random,csv
#找到返回json数据的链接
url = 'https://fengchao.baidu.com/nirvana/request.ajax?path=jupiter/GET/kr/word&reqid=1473662256226_43'
# filename = raw_input('input your filename\t')
#轮换ua
def getUA():#随机取ua
uaList = [
'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
]
newUa = random.choice(uaList)
return newUa
#构造头部信息
headers = [
'Accept:*/*',
'Accept-Encoding:gzip, deflate',
'Accept-Language:zh-CN,zh;q=0.8',
'Connection:keep-alive',
# 'Content-Length:857',
'Content-Type:application/x-www-form-urlencoded',
# 'Cookie:-----自定义cookie--------',
'Cookie:FC-FE-TERMINUS=fc_terminus_user; PSTM=1470278993; BIDUPSID=68D179B9795C9500BE7ECCE65F4DABDE; __cfduid=d76a2eae0d2d244e95526665c082a83c21470281708; BAIDUID=D845C1483B574B75268F3B55DD7C3E99:FG=1; BDUSS=RQQkxEOE5XNVZEdlBjTnpiTVQwdHI1YX5IdDJnQkJ-UnBvMEMtRmpuTjFqUDFYQUFBQUFBJCQAAAAAAAAAAAEAAAABgNQ2Qmlnd2F5c2VvAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHX~1Vd1~9VXUT; H_PS_PSSID=1457_18280_21097_20856_20732; SFSSID=1854195357ed9983fd81f60449bb8f68; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02236744899; uc_login_unique=2fd154d0e97cc43a168b297ce0a3b280; __cas__st__3=0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a98f6689d9cd346297babc34f158f94392; __cas__id__3=21291948; __cas__rn__=223674489; SAMPLING_USER_ID=21291948',
'Host:fengchao.baidu.com',
'Origin:https://fengchao.baidu.com',
'Referer:https://fengchao.baidu.com/nirvana/main.html?userid=21291948',
#'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
'User-Agent: %s' %getUA(),
]
#打开fengchao.csv用来保存关键词数据
fengchao = open('fengchao.csv','a')
fengchao.write('关键词,Total,PC,Mobile\n')
# n=0
for query in [word.strip() for word in open('word.txt').readlines()]:
# frist = 'Word:%s\tTotal\tPc\tMobile\n\n' % query #表头
# fengchao.write(frist)
# n+=1
for pagelink in range(1,5):
#构造需要post的data
data = urllib.urlencode({
'params': '{"entry":"kr_station","query":"%s","querytype":1,"pageNo":%d,"pageSize":300}' % (query,pagelink),
'source':'',
'path':'jupiter/GET/kr/word',
'userid':21291948,
'token':'0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a56f6689d9cd346297babc34f158f94392',
#'eventId':'1471855302096_47',
#'reqId':'1471855302096_30',
'Name':'',
})
time.sleep(1)
c = pycurl.Curl()#通过curl方法构造一个对象
c.setopt(pycurl.FOLLOWLOCATION, True)#自动进行跳转抓取
c.setopt(pycurl.MAXREDIRS,5)#设置最多跳转多少次
c.setopt(pycurl.CONNECTTIMEOUT, 60)#设置链接超时
c.setopt(pycurl.TIMEOUT,120)#下载超时
c.setopt(pycurl.ENCODING, 'gzip,deflate')#处理gzip内容,有些傻逼网站,就算你给的请求没有gzip,它还是会返回一个gzip压缩后的网页
# c.setopt(c.PROXY,ip)# 代理
c.fp = StringIO.StringIO()
c.setopt(pycurl.URL, url)#设置要访问的URL
c.setopt(pycurl.HTTPHEADER,headers)#传入请求头
c.setopt(pycurl.POST, 1)
c.setopt(pycurl.POSTFIELDS, data)#传入POST数据
c.setopt(c.WRITEFUNCTION, c.fp.write)#回调写入字符串缓存
c.perform()
code = c.getinfo(c.HTTP_CODE)#返回状态码
html = c.fp.getvalue()#返回源代码
# print n
for word,total,pc,mobile in re.findall(r'rd":"(.*?)","pv":(\d+),"pvPc":(\d+),"pvWise":(\d+),',html):
print word,total,pc,mobile
# fengchao.writelines(word + '\t' + total + '\t' + pc + '\t' + mobile + '\n')
fengchao.writelines('%s,%s,%s,%s\n'%(word,total,pc,mobile))
fengchao.close()
脚本中很多地方是需要修改成自己的,注册一个凤巢的账号(免费的),抓包修改信息即可;
脚本写的有一段时间了,能不能用,不知道,思路清晰就好!