# -*- coding: utf-8 -*-
#https://www.baidu.com/sugrec?&prod=pc&from=pc_web&wd=%E5%87%8F%E8%82%A5
from threading import Thread
from queue import Queue
import requests
import json
from pymongo import MongoClient
class Xiala(Thread):
    """Worker thread that collects Baidu drop-down suggestions for keywords.

    Each worker repeatedly takes a keyword from ``queue``, asks the Baidu
    ``sugrec`` endpoint for its suggestions and records every suggested word
    in the collection passed as ``db_config``.
    """

    # Suggestion endpoint; the query string is supplied through ``params``
    # so that requests escapes the keyword correctly.
    SUGGEST_URL = 'https://www.baidu.com/sugrec'

    def __init__(self, queue, db_config):
        """Store the work queue and the target collection.

        Args:
            queue: Queue of keyword strings; ``task_done`` is called once
                for every item taken from it.
            db_config: Collection object offering ``update_one`` (a pymongo
                collection in the script entry point).
        """
        super().__init__()
        self.queue = queue
        self.config = db_config
        self.headers = {
            # The HTTP header is named "User-Agent"; the previous key
            # "Agent-User" was sent as an unrelated, ignored header.
            "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        }

    def run(self) -> None:
        """Consume keywords from the queue forever.

        A failure while handling one keyword is reported and the loop moves
        on; otherwise the thread would die and ``queue.join()`` in the
        caller would block forever on the unprocessed items.
        """
        while True:
            # Fetch outside the try block so task_done() is only ever paired
            # with an item that was actually taken from the queue.
            wd = self.queue.get()
            try:
                json_res = self.fetch_json(wd)
                res_word = self.parse_json(json_res)
                print(f'{wd}相关词有{len(res_word)}')
                for word in res_word:
                    self.save_mongo(word)
            except Exception as err:
                # Top-level worker boundary: keep the thread alive for the
                # remaining keywords (e.g. after a database error).
                print(f'{wd}处理错误{err}')
            finally:
                self.queue.task_done()

    @staticmethod
    def parse_json(json_res):
        """Extract the set of suggested words from a decoded response.

        Args:
            json_res: Decoded response; expected to be a dict whose ``'g'``
                entry is a list of dicts each carrying the word under ``'q'``.

        Returns:
            A set of suggested words. Empty when ``json_res`` is not a dict,
            has no ``'g'`` entry, or contains no usable items.
        """
        if not isinstance(json_res, dict):
            return set()
        suggestions = json_res.get('g') or []
        return {
            item['q']
            for item in suggestions
            if isinstance(item, dict) and 'q' in item
        }

    def fetch_json(self, wd):
        """Request the suggestions for ``wd``.

        Args:
            wd: Keyword to look up.

        Returns:
            The decoded JSON body, the raw response text when the body is
            not valid JSON, or ``None`` when the request itself failed.
        """
        params = {'prod': 'pc', 'from': 'pc_web', 'wd': wd}
        try:
            r = requests.get(
                self.SUGGEST_URL,
                params=params,
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as err:
            print(f'request错误{err}')
            return None
        r.encoding = "utf-8"
        try:
            return r.json()
        except ValueError:
            # Every JSON decode error requests can raise derives from
            # ValueError, whichever json backend is in use.
            return r.text

    def save_mongo(self, res_word):
        """Count one more occurrence of ``res_word`` in the collection.

        A single upsert with ``$inc`` creates the document with ``count`` 1
        when the word is new and increments it otherwise, so concurrent
        workers cannot lose updates or insert duplicates between a separate
        read and write.

        Args:
            res_word: Suggested word to record.
        """
        self.config.update_one(
            {'word': res_word},
            {'$inc': {'count': 1}},
            upsert=True,
        )
if __name__ == '__main__':
    # Script entry point: queue every keyword from keywords.txt, start the
    # worker thread(s) and wait until all queued keywords were processed.
    query = Queue()
    with open("keywords.txt", encoding="utf-8") as f:
        # Iterate the file lazily; blank lines are skipped so they do not
        # turn into requests for an empty keyword.
        for line in f:
            keyword = line.strip()
            if keyword:
                query.put(keyword)

    client = MongoClient()
    try:
        db = client['xiala']
        config = db['cc41']
        for _ in range(1):
            xl = Xiala(query, config)
            # Daemon threads: the workers loop forever, so they must not
            # keep the interpreter alive once the queue has been drained.
            xl.daemon = True
            xl.start()
        # Returns once task_done() was called for every queued keyword.
        query.join()
    finally:
        # Release the MongoDB connections even when interrupted.
        client.close()
    print('done')
获取下拉搜索词有多种接口:
第一种:
源代码搜索 sugHost
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su
获取相关搜索词接口
增加?wd=关键词
即获取 关键词 下拉搜索词
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=seo
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=seo&json=1 返回json格式
第二种:
Network - XHR - Headers
https://www.baidu.com/sugrec?&prod=pc&from=pc_web&wd=