Google 关键词趋势数据下载:同步版本与异步版本

requests 版本

#!/usr/bin/env python
# coding=utf-8
import requests
import pandas as pd
from pandas import DataFrame, Series
import json
from datetime import timedelta
from datetime import datetime
from initengine import engine_localhost
from functools import reduce
from urllib.parse import unquote
import re
import chompjs
def _shift_months_back(moment, months):
    """
    Return `moment` moved back by `months` calendar months.

    The day of month is clamped to the last day of the target month, so
    e.g. May 31 minus three months yields the last day of February instead
    of an impossible date.
    """
    month_index = moment.year * 12 + (moment.month - 1) - months
    year, month_zero_based = divmod(month_index, 12)
    # First day of the month after the target month, minus one day, is the
    # last day of the target month.
    next_year, next_month_zero_based = divmod(month_index + 1, 12)
    last_day = (datetime(next_year, next_month_zero_based + 1, 1) - timedelta(days=1)).day
    return moment.replace(year=year, month=month_zero_based + 1, day=min(moment.day, last_day))


# Take a single clock reading so both constants describe the same instant.
_NOW = datetime.now()
TODAY = _NOW.strftime("%Y-%m-%d")
# The previous expression combined the year-month of "90 days ago" with
# today's day of month, which could yield dates such as "2021-06-31".
# NOTE(review): clamping to the month end is an assumption about how the
# "today 3-m" window should start on such days -- confirm against the API.
THREE_MONTHS_AGO = _shift_months_back(_NOW, 3).strftime("%Y-%m-%d")

def get_token(keywords, country, time='today 3-m'):
    """
    Fetch the widget token from the Google Trends explore endpoint.

    Args:
        keywords: iterable of keyword strings; one comparison item is
            generated per keyword.
        country: value sent as the "geo" field of every comparison item
            (the script's main() passes 'NG').
        time: value sent as the "time" field of every comparison item.
            Defaults to 'today 3-m'.

    Returns:
        The 'token' field of the first entry in the response's 'widgets'
        list.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if no response arrives within the timeout.
    """

    # NOTE(review): the cookie below looks like a captured browser session
    # hard-coded into the source; it is kept verbatim, but it is presumably
    # expired and would be better loaded from configuration.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCNzyywEI8PLLAQjv98sBCLT4ywEInvnLAQiv+ssBCLH6ywEYuvLLARiQ9csB',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626949297.1627024060.4; __utmc=10102256; __utmt=1; __utmb=10102256.1.10.1627024060; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-23-03; NID=219=o5ojNMPZk-VDvmoEpRCGbhARHKeFUd7SUI8jV5feTLZSHZ1jZYzDbaPOfyZQyfd1H3_HQbdaSwZNNdmrUh6jDYFGLL_KLeaGxXJpDtH-JFH9DYWMDoV8MxaaOTaIcNAyvSA5n92HF-txYfkbmjK7EZJpgLzhhk84QTNCcSSi-7Wzx2bc34X45sKe3MGVLGEREFhLWpaUtKBAvNhLwg7ndhrPjGe90jpNpyTeIy4XkJBdGGB-SZWDWtWrOPtl3WEHl8HoIXI5; SIDCC=AJi4QfGxAl6XeAfEqVwTUQEK5YprfGuMyvM0wz3HaXJntjSPk5owPm0f_LongSeWD5NJzwglXg; __Secure-3PSIDCC=AJi4QfGAtKHhFIhaNqN72X4V93Ra39nFF4CFzZ84xOoITLIUvD6Sn0R3zLq9c5zhYcpZNMv7Bw',
    }
    # Honour the caller-supplied time window; it used to be hard-coded to
    # "today 3-m", which made the `time` parameter a no-op.
    req = {
        "comparisonItem": [
            {"keyword": keyword, "geo": country, "time": time}
            for keyword in keywords
        ],
        "category": 0,
        "property": "",
    }

    params = (
        ('hl', 'zh-CN'),
        ('tz', ['-480', '-480']),
        ('req', json.dumps(req)),
    )

    response = requests.get(
        'https://trends.google.com/trends/api/explore',
        headers=headers,
        params=params,
        timeout=30,  # avoid hanging forever on an unreachable endpoint
    )
    # Fail with a clear HTTP error instead of a confusing parse/KeyError
    # further down when the request is rejected.
    response.raise_for_status()
    print(unquote(response.url))
    # chompjs extracts the object from the response text before the
    # 'widgets' lookup.
    payload = chompjs.parse_js_object(response.content.decode())
    return payload['widgets'][0]['token']



def get_keywords_trend(keywords, country, token, start_date=THREE_MONTHS_AGO, end_date=TODAY):
    """
    Download the daily trend timeline for `keywords` in `country`.

    Args:
        keywords: sequence of keyword strings, one comparison item each.
        country: country code placed in every comparison item's "geo".
        token: token returned by get_token().
        start_date: first day of the window, formatted "YYYY-MM-DD".
        end_date: last day of the window, formatted "YYYY-MM-DD".

    Returns:
        A DataFrame with the columns crawl_time, date, country, keyword and
        value, holding one row per (timeline point, keyword) pair. The
        frame is empty (columns only) when the timeline has no points.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if no response arrives within the timeout.
    """

    # NOTE(review): the cookie below looks like a captured browser session
    # hard-coded into the source; it is kept verbatim, but it is presumably
    # expired and would be better loaded from configuration.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCN3yywEI7/LLAQjv98sBCLT4ywEInvnLAQj7+csBCK/6ywEIsfrLARi68ssBGJD1ywE=',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626429382.1626948944.2; __utmc=10102256; __utmt=1; __utmb=10102256.4.9.1626949249014; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-22-07; NID=219=TfRj2AS_SBj3ZmflUiBaPqeunwsz5qziMRTT_km4WLGO0dZf9I0QgFIHrdgE5J-u6OqY7wYrL6KP3eJHt1WG-9wn3y12jD6iS5ivusWy9hAzSzvkusw7kRlBR2Rm19JWcoVMLUyZHV-qaJgEdtzteQ5dLemhQRSr1GxOogEQmg1DJiVmwGK4EJ3eFV2M1D0KviMEd7ua0XKYCCdPqNQOV-EMMGQUGZsiQdXBvTLy9jJbRpZBBP7vo6zyOIjjDymaMba0pIfx; SIDCC=AJi4QfH0UI6JQnI1iTTDKwWJGZjES9eGS6e8zAKNOniD4dXIF-D_4ZuXRKU5ZekywDUd0WUuLA; __Secure-3PSIDCC=AJi4QfE5XNvUJKrD5t2G6qscZfAJghVqoXG-Yk6bHxOFBZpCcREo9SuwZ7dZC1IVyRhGGazlhg',
    }

    req = {
        "time": f"{start_date} {end_date}",
        "resolution": "DAY",
        "locale": "zh-CN",
        "comparisonItem": [
            {
                "geo": {"country": country},
                "complexKeywordsRestriction": {"keyword": [{"type": "BROAD", "value": keyword}]},
            }
            for keyword in keywords
        ],
        "requestOptions": {"property": "", "backend": "IZG", "category": 0},
    }
    params = (
        ('hl', 'zh-CN'),
        ('tz', ['-480', '-480']),
        ('req', json.dumps(req)),
        ('token', token),
    )

    res = requests.get(
        'https://trends.google.com/trends/api/widgetdata/multiline',
        headers=headers,
        params=params,
        timeout=30,  # avoid hanging forever on an unreachable endpoint
    )
    # Surface rejected requests as HTTP errors rather than parse failures.
    res.raise_for_status()
    response = res.content.decode()
    print(unquote(res.url))
    data_list = chompjs.parse_js_object(response)['default']['timelineData']

    # Build one row per (timeline point, keyword). Pairing with zip keeps
    # every value next to its keyword and also copes with an empty timeline,
    # where reduce() without an initial value used to raise TypeError.
    # Assumes each point's 'value' list follows the order of `keywords`
    # (the original code relied on the same ordering) -- TODO confirm.
    rows = []
    for point in data_list:
        # With locale zh-CN the label looks like "2021年4月22日"; turn the
        # year/month markers into dashes and drop the trailing day marker.
        date = re.sub('[年月]', '-', point['formattedTime'].strip('日'))
        rows.extend(
            (TODAY, date, country, keyword, value)
            for keyword, value in zip(keywords, point['value'])
        )
    return DataFrame(rows, columns=['crawl_time', 'date', 'country', 'keyword', 'value'])

#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%222021-04-22+2021-07-22%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Jumia%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22jiji%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Kilimall%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Naivas%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Avechi%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token=APP6_UEAAAAAYPqYMLtcPo0zmfbw2j2hGFjDcsvQAu2m&tz=-480', headers=headers)
def main():
    """
    Fetch a token, download the trend data for the sample keywords in
    Nigeria ('NG'), append it to the google_keyword_trend table and print it.
    """
    keywords = 'Jumia,jiji,Kilimall,Naivas,Avechi'.split(',')
    country = 'NG'

    token = get_token(keywords=keywords, country=country)
    print(token)

    df = get_keywords_trend(keywords=keywords, country=country, token=token)
    df.to_sql('google_keyword_trend', engine_localhost, index=False, if_exists='append')
    print(df)


if __name__ == '__main__':
    main()

asyncio+aiohttp 异步版本

#!/usr/bin/env python
# coding=utf-8
# import requests
import aiohttp
import pandas as pd
from pandas import DataFrame, Series
import json
from datetime import timedelta
from datetime import datetime
from initengine import engine_localhost
from functools import reduce
import re
from urllib.parse import urlencode
from yarl import URL
import chompjs
def _shift_months_back(moment, months):
    """
    Return `moment` moved back by `months` calendar months.

    The day of month is clamped to the last day of the target month, so
    e.g. May 31 minus three months yields the last day of February instead
    of an impossible date.
    """
    month_index = moment.year * 12 + (moment.month - 1) - months
    year, month_zero_based = divmod(month_index, 12)
    # First day of the month after the target month, minus one day, is the
    # last day of the target month.
    next_year, next_month_zero_based = divmod(month_index + 1, 12)
    last_day = (datetime(next_year, next_month_zero_based + 1, 1) - timedelta(days=1)).day
    return moment.replace(year=year, month=month_zero_based + 1, day=min(moment.day, last_day))


# Take a single clock reading so both constants describe the same instant.
_NOW = datetime.now()
TODAY = _NOW.strftime("%Y-%m-%d")
# The previous expression combined the year-month of "90 days ago" with
# today's day of month, which could yield dates such as "2021-06-31".
# NOTE(review): clamping to the month end is an assumption about how the
# "today 3-m" window should start on such days -- confirm against the API.
THREE_MONTHS_AGO = _shift_months_back(_NOW, 3).strftime("%Y-%m-%d")
import asyncio
from urllib.parse import unquote

async def get_token(keywords, country, time='today 3-m'):
    """
    Fetch the widget token from the Google Trends explore endpoint.

    Args:
        keywords: iterable of keyword strings; one comparison item is
            generated per keyword.
        country: value sent as the "geo" field of every comparison item
            (the script's main() passes 'NG').
        time: value sent as the "time" field of every comparison item.
            Defaults to 'today 3-m'.

    Returns:
        The 'token' field of the first entry in the response's 'widgets'
        list.

    Raises:
        aiohttp.ClientResponseError: if the endpoint answers with an error
            status.
    """

    # NOTE(review): the cookie below looks like a captured browser session
    # hard-coded into the source; it is kept verbatim, but it is presumably
    # expired and would be better loaded from configuration.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCNzyywEI8PLLAQjv98sBCLT4ywEInvnLAQiv+ssBCLH6ywEYuvLLARiQ9csB',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626949297.1627024060.4; __utmc=10102256; __utmt=1; __utmb=10102256.1.10.1627024060; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-23-03; NID=219=o5ojNMPZk-VDvmoEpRCGbhARHKeFUd7SUI8jV5feTLZSHZ1jZYzDbaPOfyZQyfd1H3_HQbdaSwZNNdmrUh6jDYFGLL_KLeaGxXJpDtH-JFH9DYWMDoV8MxaaOTaIcNAyvSA5n92HF-txYfkbmjK7EZJpgLzhhk84QTNCcSSi-7Wzx2bc34X45sKe3MGVLGEREFhLWpaUtKBAvNhLwg7ndhrPjGe90jpNpyTeIy4XkJBdGGB-SZWDWtWrOPtl3WEHl8HoIXI5; SIDCC=AJi4QfGxAl6XeAfEqVwTUQEK5YprfGuMyvM0wz3HaXJntjSPk5owPm0f_LongSeWD5NJzwglXg; __Secure-3PSIDCC=AJi4QfGAtKHhFIhaNqN72X4V93Ra39nFF4CFzZ84xOoITLIUvD6Sn0R3zLq9c5zhYcpZNMv7Bw',
    }
    # Honour the caller-supplied time window; it used to be hard-coded to
    # "today 3-m", which made the `time` parameter a no-op.
    req = {
        "comparisonItem": [
            {"keyword": keyword, "geo": country, "time": time}
            for keyword in keywords
        ],
        "category": 0,
        "property": "",
    }

    params = {
        'hl': 'zh-CN',
        'tz': ['-480', '-480'],
        'req': json.dumps(req),
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get('https://trends.google.com/trends/api/explore', params=params) as res:
            # Fail with a clear HTTP error instead of a confusing
            # parse/KeyError further down when the request is rejected.
            res.raise_for_status()
            text = await res.text()
    # chompjs extracts the object from the response text before the
    # 'widgets' lookup.
    return chompjs.parse_js_object(text)['widgets'][0]['token']



async def get_keywords_trend(keywords, country, token, start_date=THREE_MONTHS_AGO, end_date=TODAY):
    """
    Download the daily trend timeline for `keywords` in `country`.

    Args:
        keywords: sequence of keyword strings, one comparison item each.
        country: country code placed in every comparison item's "geo".
        token: token returned by get_token().
        start_date: first day of the window, formatted "YYYY-MM-DD".
        end_date: last day of the window, formatted "YYYY-MM-DD".

    Returns:
        A DataFrame with the columns crawl_time, date, country, keyword and
        value, holding one row per (timeline point, keyword) pair. The
        frame is empty (columns only) when the timeline has no points.

    Raises:
        aiohttp.ClientResponseError: if the endpoint answers with an error
            status.
    """

    # NOTE(review): the cookie below looks like a captured browser session
    # hard-coded into the source; it is kept verbatim, but it is presumably
    # expired and would be better loaded from configuration.
    headers = {
        'authority': 'trends.google.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'x-client-data': 'CJC2yQEIo7bJAQjEtskBCKmdygEIi/3KAQiMnssBCKegywEI8PDLAQis8ssBCN3yywEI7/LLAQjv98sBCLT4ywEInvnLAQj7+csBCK/6ywEIsfrLARi68ssBGJD1ywE=',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://trends.google.com/trends/explore?date=today%203-m&geo=NG&q=Jumia,jiji,Kilimall,Naivas,Avechi',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '__utmz=10102256.1626429382.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=10102256.343699485.1626429382.1626429382.1626948944.2; __utmc=10102256; __utmt=1; __utmb=10102256.4.9.1626949249014; HSID=AzbAhLGjWNBnaeSg5; SSID=AbCtluR_Fke88ijZ_; APISID=wWv3y4ORv3Nfei-i/AC3BTttxU99t5IOQA; SAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; __Secure-3PAPISID=M7WAvaTw62jm9rCx/AJOFjbbTIM353MMbT; SEARCH_SAMESITE=CgQIr5IB; SID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vqqWO-BsPjem9iahGfYugXw.; __Secure-3PSID=_Ac1s4eJ00tERsfuV7j4h9BIM-7q_eSR5ITGiKPDHuYhMe-vaU1E9dNhztOYYpMT0xNdIw.; 1P_JAR=2021-07-22-07; NID=219=TfRj2AS_SBj3ZmflUiBaPqeunwsz5qziMRTT_km4WLGO0dZf9I0QgFIHrdgE5J-u6OqY7wYrL6KP3eJHt1WG-9wn3y12jD6iS5ivusWy9hAzSzvkusw7kRlBR2Rm19JWcoVMLUyZHV-qaJgEdtzteQ5dLemhQRSr1GxOogEQmg1DJiVmwGK4EJ3eFV2M1D0KviMEd7ua0XKYCCdPqNQOV-EMMGQUGZsiQdXBvTLy9jJbRpZBBP7vo6zyOIjjDymaMba0pIfx; SIDCC=AJi4QfH0UI6JQnI1iTTDKwWJGZjES9eGS6e8zAKNOniD4dXIF-D_4ZuXRKU5ZekywDUd0WUuLA; __Secure-3PSIDCC=AJi4QfE5XNvUJKrD5t2G6qscZfAJghVqoXG-Yk6bHxOFBZpCcREo9SuwZ7dZC1IVyRhGGazlhg',
    }

    req = {
        "time": f"{start_date} {end_date}",
        "resolution": "DAY",
        "locale": "zh-CN",
        "comparisonItem": [
            {
                "geo": {"country": country},
                "complexKeywordsRestriction": {"keyword": [{"type": "BROAD", "value": keyword}]},
            }
            for keyword in keywords
        ],
        "requestOptions": {"property": "", "backend": "IZG", "category": 0},
    }
    params = {
        'hl': 'zh-CN',
        'tz': '-480',
        'req': json.dumps(req),
        'token': token,
    }

    # The query string is encoded by hand and appended to the URL here,
    # rather than passed through aiohttp's `params` argument.
    url = 'https://trends.google.com/trends/api/widgetdata/multiline' + f'?{urlencode(params)}'
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as res:
            # Surface rejected requests as HTTP errors rather than parse
            # failures.
            res.raise_for_status()
            response = await res.text()
    data_list = chompjs.parse_js_object(response)['default']['timelineData']
    print(data_list)

    # Build one row per (timeline point, keyword). Pairing with zip keeps
    # every value next to its keyword and also copes with an empty timeline,
    # where reduce() without an initial value used to raise TypeError.
    # Assumes each point's 'value' list follows the order of `keywords`
    # (the original code relied on the same ordering) -- TODO confirm.
    rows = []
    for point in data_list:
        # With locale zh-CN the label looks like "2021年4月22日"; turn the
        # year/month markers into dashes and drop the trailing day marker.
        date = re.sub('[年月]', '-', point['formattedTime'].strip('日'))
        rows.extend(
            (TODAY, date, country, keyword, value)
            for keyword, value in zip(keywords, point['value'])
        )
    return DataFrame(rows, columns=['crawl_time', 'date', 'country', 'keyword', 'value'])

#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%222021-04-22+2021-07-22%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Jumia%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22jiji%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Kilimall%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Naivas%22%7D%5D%7D%7D,%7B%22geo%22:%7B%22country%22:%22NG%22%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22Avechi%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token=APP6_UEAAAAAYPqYMLtcPo0zmfbw2j2hGFjDcsvQAu2m&tz=-480', headers=headers)
async def main():
    """
    Fetch a token, download the trend data for the sample keywords in
    Nigeria ('NG'), append it to the google_keyword_trend table and print it.
    """
    keywords = 'Jumia,jiji,Kilimall,Naivas,Avechi'.split(',')
    country = 'NG'

    token = await get_token(keywords=keywords, country=country)
    print(token)

    df = await get_keywords_trend(keywords=keywords, country=country, token=token)
    df.to_sql('google_keyword_trend', engine_localhost, index=False, if_exists='append')
    print(df)


if __name__ == '__main__':
    asyncio.run(main())

可以看到,同步版本与异步版本的代码基本一致。
借助 asyncio 和 aiohttp,基本可以把同步代码“无痛”转换为异步代码,当然其中也有不少小坑。
PS:运行代码需要科学上网(能够访问 Google)。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值