requests 版本
#!/usr/bin/env python
# coding=utf-8
import requests
import pandas as pd
from pandas import DataFrame, Series
import json
from datetime import timedelta
from datetime import datetime
from initengine import engine_localhost
from functools import reduce
from urllib.parse import unquote
import re
import chompjs
# Take a single snapshot of "now" so both constants describe the same instant
# (two separate now() calls could straddle midnight).
_NOW = datetime.now()

# Crawl date and default end of the query window, formatted YYYY-MM-DD.
TODAY = _NOW.strftime("%Y-%m-%d")

# Default start of the query window: the same day-of-month three calendar
# months back. The previous expression glued today's day onto the month of
# "now - 90 days", which yields impossible dates (e.g. Jan 31 -> "YYYY-11-31")
# and sometimes a month that is only two months back.
# NOTE(review): the day is clamped to the last day of the target month
# (May 31 -> Feb 28/29); confirm this matches how Google resolves "today 3-m".
_MONTH_INDEX = _NOW.year * 12 + (_NOW.month - 1) - 3
_START_YEAR, _START_MONTH = _MONTH_INDEX // 12, _MONTH_INDEX % 12 + 1
# Last day of the target month = first day of the following month minus one day.
_MONTH_END = datetime(
    _START_YEAR + _START_MONTH // 12, _START_MONTH % 12 + 1, 1
) - timedelta(days=1)
THREE_MONTHS_AGO = (
    f"{_START_YEAR:04d}-{_START_MONTH:02d}-{min(_NOW.day, _MONTH_END.day):02d}"
)
def get_token(keywords, country, time='today 3-m'):
    """Fetch the widget token needed to query Google Trends timeline data.

    Sends a GET request to the ``/trends/api/explore`` endpoint with one
    comparison item per keyword, parses the response body with ``chompjs``
    and returns the ``token`` of the first widget in the response.

    Args:
        keywords: Iterable of search terms to compare.
        country: Geo code sent as ``geo`` for every keyword (e.g. ``'NG'``).
        time: Time-range expression sent for every keyword. It was previously
            ignored in favour of a hard-coded ``'today 3-m'``; the default
            keeps the old behaviour.

    Returns:
        The token string of ``widgets[0]`` in the explore response.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
        KeyError, IndexError: If the parsed response has no widget token.
    """
    # NOTE(review): the session cookies below are hard-coded. They will expire
    # and should not live in source control -- consider loading them from
    # configuration instead.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCNzyywEI8PLLAQjv98sBCLT4ywEInvnLAQiv+ssBCLH6ywEYuvLLARiQ9csB',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626949297.1627024060.4; __utmc=10102256; __utmt=1; __utmb=10102256.1.10.1627024060; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-23-03; NID=219=o5ojNMPZk-VDvmoEpRCGbhARHKeFUd7SUI8jV5feTLZSHZ1jZYzDbaPOfyZQyfd1H3_HQbdaSwZNNdmrUh6jDYFGLL_KLeaGxXJpDtH-JFH9DYWMDoV8MxaaOTaIcNAyvSA5n92HF-txYfkbmjK7EZJpgLzhhk84QTNCcSSi-7Wzx2bc34X45sKe3MGVLGEREFhLWpaUtKBAvNhLwg7ndhrPjGe90jpNpyTeIy4XkJBdGGB-SZWDWtWrOPtl3WEHl8HoIXI5; SIDCC=AJi4QfGxAl6XeAfEqVwTUQEK5YprfGuMyvM0wz3HaXJntjSPk5owPm0f_LongSeWD5NJzwglXg; __Secure-3PSIDCC=AJi4QfGAtKHhFIhaNqN72X4V93Ra39nFF4CFzZ84xOoITLIUvD6Sn0R3zLq9c5zhYcpZNMv7Bw',
    }
    req = {
        # Honour the caller-supplied range instead of a hard-coded literal.
        "comparisonItem": [
            {"keyword": keyword, "geo": country, "time": time}
            for keyword in keywords
        ],
        "category": 0,
        "property": "",
    }
    params = (
        ('hl', 'zh-CN'),
        # A list value makes requests emit the ``tz`` parameter twice, as in
        # the captured browser request.
        ('tz', ['-480', '-480']),
        ('req', json.dumps(req)),
    )
    response = requests.get(
        'https://trends.google.com/trends/api/explore',
        headers=headers,
        params=params,
        timeout=30,  # never hang forever on a stalled connection
    )
    # Fail with a clear HTTP error instead of an opaque parse error later on.
    response.raise_for_status()
    print(unquote(response.url))
    # chompjs is used rather than json.loads, presumably because the body is
    # not strict JSON -- TODO confirm against a live response.
    token = chompjs.parse_js_object(response.content.decode())['widgets'][0]['token']
    return token
def get_keywords_trend(keywords, country, token, start_date=THREE_MONTHS_AGO, end_date=TODAY):
    """Download daily Google Trends values for several keywords in one country.

    Sends a GET request to the ``/trends/api/widgetdata/multiline`` endpoint
    and flattens ``default.timelineData`` of the response into a long-format
    table with one row per (timeline point, keyword).

    Args:
        keywords: Search terms to compare; order must match the order of the
            numbers in each timeline point's ``value`` list.
        country: Country code used as the geo restriction for every keyword.
        token: Widget token, e.g. as returned by ``get_token``.
        start_date: First day of the range, ``YYYY-MM-DD``.
        end_date: Last day of the range, ``YYYY-MM-DD``.

    Returns:
        DataFrame with columns ``crawl_time``, ``date``, ``country``,
        ``keyword`` and ``value``. ``date`` is the point's ``formattedTime``
        with the trailing '日' stripped and '年'/'月' replaced by '-'. An empty
        frame with the same columns is returned when the timeline is empty.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
        KeyError: If the parsed response lacks the expected keys.
    """
    # NOTE(review): the session cookies below are hard-coded. They will expire
    # and should not live in source control -- consider loading them from
    # configuration instead.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCN3yywEI7/LLAQjv98sBCLT4ywEInvnLAQj7+csBCK/6ywEIsfrLARi68ssBGJD1ywE=',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626429382.1626948944.2; __utmc=10102256; __utmt=1; __utmb=10102256.4.9.1626949249014; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-22-07; NID=219=TfRj2AS_SBj3ZmflUiBaPqeunwsz5qziMRTT_km4WLGO0dZf9I0QgFIHrdgE5J-u6OqY7wYrL6KP3eJHt1WG-9wn3y12jD6iS5ivusWy9hAzSzvkusw7kRlBR2Rm19JWcoVMLUyZHV-qaJgEdtzteQ5dLemhQRSr1GxOogEQmg1DJiVmwGK4EJ3eFV2M1D0KviMEd7ua0XKYCCdPqNQOV-EMMGQUGZsiQdXBvTLy9jJbRpZBBP7vo6zyOIjjDymaMba0pIfx; SIDCC=AJi4QfH0UI6JQnI1iTTDKwWJGZjES9eGS6e8zAKNOniD4dXIF-D_4ZuXRKU5ZekywDUd0WUuLA; __Secure-3PSIDCC=AJi4QfE5XNvUJKrD5t2G6qscZfAJghVqoXG-Yk6bHxOFBZpCcREo9SuwZ7dZC1IVyRhGGazlhg',
    }
    req = {
        "time": f"{start_date} {end_date}",
        "resolution": "DAY",
        "locale": "zh-CN",
        "comparisonItem": [
            {
                "geo": {"country": country},
                "complexKeywordsRestriction": {
                    "keyword": [{"type": "BROAD", "value": keyword}]
                },
            }
            for keyword in keywords
        ],
        "requestOptions": {"property": "", "backend": "IZG", "category": 0},
    }
    params = (
        ('hl', 'zh-CN'),
        # A list value makes requests emit the ``tz`` parameter twice, as in
        # the captured browser request.
        ('tz', ['-480', '-480']),
        ('req', json.dumps(req)),
        ('token', token),
    )
    res = requests.get(
        'https://trends.google.com/trends/api/widgetdata/multiline',
        headers=headers,
        params=params,
        timeout=30,  # never hang forever on a stalled connection
    )
    # Fail with a clear HTTP error instead of an opaque parse error later on.
    res.raise_for_status()
    response = res.content.decode()
    print(unquote(res.url))
    data_list = chompjs.parse_js_object(response)['default']['timelineData']

    columns = ['crawl_time', 'date', 'country', 'keyword', 'value']
    if not data_list:
        # The old reduce(list.__add__, ...) raised TypeError on an empty
        # timeline; return an empty, correctly-shaped frame instead.
        return DataFrame(columns=columns)

    keyword_list = list(keywords)
    # Flatten in linear time: every point carries one value per keyword.
    values = [value for point in data_list for value in point['value']]
    dates = []
    for point in data_list:
        # e.g. "2021年4月22日" -> "2021-4-22"; repeated once per keyword.
        label = re.sub('[年月]', '-', point['formattedTime'].strip('日'))
        dates.extend([label] * len(keyword_list))

    df = DataFrame(columns=columns)
    df['keyword'] = Series(keyword_list * len(data_list))
    df['value'] = Series(values)
    df['date'] = Series(dates)
    df['crawl_time'] = TODAY
    df['country'] = country
    return df
#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%222021-04-22+2021-07-22%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Jumia%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22jiji%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Kilimall%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Naivas%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Avechi%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token=APP6_UEAAAAAYPqYMLtcPo0zmfbw2j2hGFjDcsvQAu2m&tz=-480', headers=headers)
def main():
    """Fetch trend data for a fixed keyword set in Nigeria and store it.

    Obtains a widget token, downloads the trend table, appends it to the
    ``google_keyword_trend`` table through ``engine_localhost`` and prints
    both the token and the table.
    """
    search_terms = 'Jumia,jiji,Kilimall,Naivas,Avechi'.split(',')
    geo_code = 'NG'

    widget_token = get_token(keywords=search_terms, country=geo_code)
    print(widget_token)

    trend_frame = get_keywords_trend(
        keywords=search_terms,
        country=geo_code,
        token=widget_token,
    )
    trend_frame.to_sql(
        'google_keyword_trend',
        engine_localhost,
        index=False,
        if_exists='append',
    )
    print(trend_frame)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
asyncio+aiohttp 异步版本
#!/usr/bin/env python
# coding=utf-8
# import requests
import aiohttp
import pandas as pd
from pandas import DataFrame, Series
import json
from datetime import timedelta
from datetime import datetime
from initengine import engine_localhost
from functools import reduce
import re
from urllib.parse import urlencode
from yarl import URL
import chompjs
# Take a single snapshot of "now" so both constants describe the same instant
# (two separate now() calls could straddle midnight).
_NOW = datetime.now()

# Crawl date and default end of the query window, formatted YYYY-MM-DD.
TODAY = _NOW.strftime("%Y-%m-%d")

# Default start of the query window: the same day-of-month three calendar
# months back. The previous expression glued today's day onto the month of
# "now - 90 days", which yields impossible dates (e.g. Jan 31 -> "YYYY-11-31")
# and sometimes a month that is only two months back.
# NOTE(review): the day is clamped to the last day of the target month
# (May 31 -> Feb 28/29); confirm this matches how Google resolves "today 3-m".
_MONTH_INDEX = _NOW.year * 12 + (_NOW.month - 1) - 3
_START_YEAR, _START_MONTH = _MONTH_INDEX // 12, _MONTH_INDEX % 12 + 1
# Last day of the target month = first day of the following month minus one day.
_MONTH_END = datetime(
    _START_YEAR + _START_MONTH // 12, _START_MONTH % 12 + 1, 1
) - timedelta(days=1)
THREE_MONTHS_AGO = (
    f"{_START_YEAR:04d}-{_START_MONTH:02d}-{min(_NOW.day, _MONTH_END.day):02d}"
)
import asyncio
from urllib.parse import unquote
async def get_token(keywords, country, time='today 3-m'):
    """Asynchronously fetch the widget token for Google Trends timeline data.

    Sends a GET request to the ``/trends/api/explore`` endpoint with one
    comparison item per keyword, parses the response body with ``chompjs``
    and returns the ``token`` of the first widget in the response.

    Args:
        keywords: Iterable of search terms to compare.
        country: Geo code sent as ``geo`` for every keyword (e.g. ``'NG'``).
        time: Time-range expression sent for every keyword. It was previously
            ignored in favour of a hard-coded ``'today 3-m'``; the default
            keeps the old behaviour.

    Returns:
        The token string of ``widgets[0]`` in the explore response.

    Raises:
        aiohttp.ClientError: On connection failure or a non-2xx HTTP status.
        KeyError, IndexError: If the parsed response has no widget token.
    """
    # NOTE(review): the session cookies below are hard-coded. They will expire
    # and should not live in source control -- consider loading them from
    # configuration instead.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCNzyywEI8PLLAQjv98sBCLT4ywEInvnLAQiv+ssBCLH6ywEYuvLLARiQ9csB',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626949297.1627024060.4; __utmc=10102256; __utmt=1; __utmb=10102256.1.10.1627024060; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-23-03; NID=219=o5ojNMPZk-VDvmoEpRCGbhARHKeFUd7SUI8jV5feTLZSHZ1jZYzDbaPOfyZQyfd1H3_HQbdaSwZNNdmrUh6jDYFGLL_KLeaGxXJpDtH-JFH9DYWMDoV8MxaaOTaIcNAyvSA5n92HF-txYfkbmjK7EZJpgLzhhk84QTNCcSSi-7Wzx2bc34X45sKe3MGVLGEREFhLWpaUtKBAvNhLwg7ndhrPjGe90jpNpyTeIy4XkJBdGGB-SZWDWtWrOPtl3WEHl8HoIXI5; SIDCC=AJi4QfGxAl6XeAfEqVwTUQEK5YprfGuMyvM0wz3HaXJntjSPk5owPm0f_LongSeWD5NJzwglXg; __Secure-3PSIDCC=AJi4QfGAtKHhFIhaNqN72X4V93Ra39nFF4CFzZ84xOoITLIUvD6Sn0R3zLq9c5zhYcpZNMv7Bw',
    }
    req = {
        # Honour the caller-supplied range instead of a hard-coded literal.
        "comparisonItem": [
            {"keyword": keyword, "geo": country, "time": time}
            for keyword in keywords
        ],
        "category": 0,
        "property": "",
    }
    params = {
        'hl': 'zh-CN',
        'tz': ['-480', '-480'],
        'req': json.dumps(req),
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get('https://trends.google.com/trends/api/explore', params=params) as res:
            # Fail with a clear HTTP error instead of an opaque parse error.
            res.raise_for_status()
            text = await res.text()
    # chompjs is used rather than json.loads, presumably because the body is
    # not strict JSON -- TODO confirm against a live response.
    token = chompjs.parse_js_object(text)['widgets'][0]['token']
    return token
async def get_keywords_trend(keywords, country, token, start_date=THREE_MONTHS_AGO, end_date=TODAY):
    """Asynchronously download daily Google Trends values for several keywords.

    Sends a GET request to the ``/trends/api/widgetdata/multiline`` endpoint
    and flattens ``default.timelineData`` of the response into a long-format
    table with one row per (timeline point, keyword).

    Args:
        keywords: Search terms to compare; order must match the order of the
            numbers in each timeline point's ``value`` list.
        country: Country code used as the geo restriction for every keyword.
        token: Widget token, e.g. as returned by ``get_token``.
        start_date: First day of the range, ``YYYY-MM-DD``.
        end_date: Last day of the range, ``YYYY-MM-DD``.

    Returns:
        DataFrame with columns ``crawl_time``, ``date``, ``country``,
        ``keyword`` and ``value``. ``date`` is the point's ``formattedTime``
        with the trailing '日' stripped and '年'/'月' replaced by '-'. An empty
        frame with the same columns is returned when the timeline is empty.

    Raises:
        aiohttp.ClientError: On connection failure or a non-2xx HTTP status.
        KeyError: If the parsed response lacks the expected keys.
    """
    # NOTE(review): the session cookies below are hard-coded. They will expire
    # and should not live in source control -- consider loading them from
    # configuration instead.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCN3yywEI7/LLAQjv98sBCLT4ywEInvnLAQj7+csBCK/6ywEIsfrLARi68ssBGJD1ywE=',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626429382.1626948944.2; __utmc=10102256; __utmt=1; __utmb=10102256.4.9.1626949249014; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-22-07; NID=219=TfRj2AS_SBj3ZmflUiBaPqeunwsz5qziMRTT_km4WLGO0dZf9I0QgFIHrdgE5J-u6OqY7wYrL6KP3eJHt1WG-9wn3y12jD6iS5ivusWy9hAzSzvkusw7kRlBR2Rm19JWcoVMLUyZHV-qaJgEdtzteQ5dLemhQRSr1GxOogEQmg1DJiVmwGK4EJ3eFV2M1D0KviMEd7ua0XKYCCdPqNQOV-EMMGQUGZsiQdXBvTLy9jJbRpZBBP7vo6zyOIjjDymaMba0pIfx; SIDCC=AJi4QfH0UI6JQnI1iTTDKwWJGZjES9eGS6e8zAKNOniD4dXIF-D_4ZuXRKU5ZekywDUd0WUuLA; __Secure-3PSIDCC=AJi4QfE5XNvUJKrD5t2G6qscZfAJghVqoXG-Yk6bHxOFBZpCcREo9SuwZ7dZC1IVyRhGGazlhg',
    }
    req = {
        "time": f"{start_date} {end_date}",
        "resolution": "DAY",
        "locale": "zh-CN",
        "comparisonItem": [
            {
                "geo": {"country": country},
                "complexKeywordsRestriction": {
                    "keyword": [{"type": "BROAD", "value": keyword}]
                },
            }
            for keyword in keywords
        ],
        "requestOptions": {"property": "", "backend": "IZG", "category": 0},
    }
    params = {
        'hl': 'zh-CN',
        'tz': '-480',
        'req': json.dumps(req),
        'token': token,
    }
    # The query string is pre-encoded with urlencode and appended to the URL
    # instead of being passed via ``params=`` -- presumably to control the
    # exact encoding of the JSON payload; keep as is.
    url = 'https://trends.google.com/trends/api/widgetdata/multiline' + f'?{urlencode(params)}'
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as res:
            # Fail with a clear HTTP error instead of an opaque parse error.
            res.raise_for_status()
            response = await res.text()
    data_list = chompjs.parse_js_object(response)['default']['timelineData']
    print(data_list)

    columns = ['crawl_time', 'date', 'country', 'keyword', 'value']
    if not data_list:
        # The old reduce(list.__add__, ...) raised TypeError on an empty
        # timeline; return an empty, correctly-shaped frame instead.
        return DataFrame(columns=columns)

    keyword_list = list(keywords)
    # Flatten in linear time: every point carries one value per keyword.
    values = [value for point in data_list for value in point['value']]
    dates = []
    for point in data_list:
        # e.g. "2021年4月22日" -> "2021-4-22"; repeated once per keyword.
        label = re.sub('[年月]', '-', point['formattedTime'].strip('日'))
        dates.extend([label] * len(keyword_list))

    df = DataFrame(columns=columns)
    df['keyword'] = Series(keyword_list * len(data_list))
    df['value'] = Series(values)
    df['date'] = Series(dates)
    df['crawl_time'] = TODAY
    df['country'] = country
    return df
#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%222021-04-22+2021-07-22%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Jumia%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22jiji%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Kilimall%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Naivas%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Avechi%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token=APP6_UEAAAAAYPqYMLtcPo0zmfbw2j2hGFjDcsvQAu2m&tz=-480', headers=headers)
async def main():
    """Fetch trend data for a fixed keyword set in Nigeria and store it.

    Awaits a widget token, awaits the trend table, appends it to the
    ``google_keyword_trend`` table through ``engine_localhost`` and prints
    both the token and the table.
    """
    search_terms = 'Jumia,jiji,Kilimall,Naivas,Avechi'.split(',')
    geo_code = 'NG'

    widget_token = await get_token(keywords=search_terms, country=geo_code)
    print(widget_token)

    trend_frame = await get_keywords_trend(
        keywords=search_terms,
        country=geo_code,
        token=widget_token,
    )
    trend_frame.to_sql(
        'google_keyword_trend',
        engine_localhost,
        index=False,
        if_exists='append',
    )
    print(trend_frame)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    asyncio.run(main())
可以看到，同步版本与异步版本的代码基本一致。
借助 asyncio 和 aiohttp，基本可以把同步代码“无痛转换”为异步代码，当然其中也有不少小坑。
PS：运行代码需要科学上网（即需要能够访问 trends.google.com 的网络环境）。