Twitter 热门搜索结果文本抓取

"""
twitter 热门频道搜索,数据动态加载,前两页都是20条,往下是新接口,一次刷新2条新数据

推荐个工具,复制curl就能快速生成requests请求样例代码,自动解析header,cookie参数,非常实用
https://spidertools.cn/#/formatDict
"""


import requests
import time
from threading import Thread
from queue import Queue

def get_data():
    """
    Fetch the search listing for ``"submit office"`` without a cursor.

    Sends a single GET request to the ``adaptive.json`` search endpoint using
    a fixed set of headers, cookies and query parameters (no ``cursor``
    parameter is sent, so every call issues the same request).

    Returns:
        tuple: ``(cursor, tweet_ids)``. ``cursor`` is the ``value`` of the
        cursor operation found at index 21 of the first instruction's
        ``addEntries.entries`` list; ``tweet_ids`` is the list of keys of
        ``globalObjects.tweets``.

    Raises:
        KeyError, IndexError: if the decoded JSON does not have the shape
            described above.
        requests.exceptions.RequestException: on a network failure or when
            the request times out.
    """
    # NOTE(review): the token/cookie values below look like they were captured
    # from one browser session (the module docstring mentions converting a
    # copied curl command) and will presumably expire -- confirm before reuse.
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/search?q=%22submit%20office%22&src=typed_query&f=top",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/2/search/adaptive.json"
    params = {
        "include_profile_interstitial_type": "1",
        "include_blocking": "1",
        "include_blocked_by": "1",
        "include_followed_by": "1",
        "include_want_retweets": "1",
        "include_mute_edge": "1",
        "include_can_dm": "1",
        "include_can_media_tag": "1",
        "include_ext_has_nft_avatar": "1",
        "include_ext_is_blue_verified": "1",
        "skip_status": "1",
        "cards_platform": "Web-12",
        "include_cards": "1",
        "include_ext_alt_text": "true",
        "include_ext_limited_action_results": "false",
        "include_quote_count": "true",
        "include_reply_count": "1",
        "tweet_mode": "extended",
        "include_ext_collab_control": "true",
        "include_entities": "true",
        "include_user_entities": "true",
        "include_ext_media_color": "true",
        "include_ext_media_availability": "true",
        "include_ext_sensitive_media_warning": "true",
        "include_ext_trusted_friends_metadata": "true",
        "send_error_codes": "true",
        "simple_quoted_tweet": "true",
        "q": "\"submit office\"",
        "count": "20",
        "query_source": "typed_query",
        "pc": "1",
        "spelling_corrections": "1",
        "include_ext_edit_control": "true",
        "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(
        url, headers=headers, cookies=cookies, params=params, timeout=30
    )

    # Decode the body once and reuse it for both return values.
    payload = response.json()

    # Index 21 is the hard-coded position at which this script expects the
    # pagination cursor entry (presumably right after 20 tweet entries and
    # one other entry -- TODO confirm against a live response).
    entries = payload["timeline"]["instructions"][0]["addEntries"]["entries"]
    cursor = entries[21]["content"]["operation"]["cursor"]["value"]
    tweet_ids = list(payload["globalObjects"]["tweets"].keys())
    return cursor, tweet_ids



def get_detail(tw_id_lis):
    """
    Fetch each tweet's detail page and collect its full text.

    For every id in ``tw_id_lis`` one GET request is sent to the
    ``TweetDetail`` GraphQL endpoint. The stripped ``full_text`` of the first
    entry of the first instruction is printed and pushed onto the
    module-level queue ``que``.

    Args:
        tw_id_lis: iterable of tweet ids; each id is concatenated into the
            request's ``variables`` JSON string, so it must be a ``str``.

    Returns:
        None. Results are delivered through ``que``.

    Raises:
        requests.exceptions.RequestException: on a network failure or when
            a request times out. Errors while decoding or navigating the
            response are printed and the tweet is skipped.
    """
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/BMo3JQdAcg/status/402671294547648512",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/graphql/BoHLKeBvibdYDiJON1oqTg/TweetDetail"

    # Both pieces below are identical for every tweet, so build them once
    # instead of on each loop iteration.
    variables_tail = "\"referrer\":\"search\",\"with_rux_injections\":false,\"includePromotedContent\":true,\"withCommunity\":true,\"withQuickPromoteEligibilityTweetFields\":true,\"withBirdwatchNotes\":false,\"withSuperFollowsUserFields\":true,\"withDownvotePerspective\":false,\"withReactionsMetadata\":false,\"withReactionsPerspective\":false,\"withSuperFollowsTweetFields\":true,\"withVoice\":true,\"withV2Timeline\":true}"
    features = "{\"responsive_web_twitter_blue_verified_badge_is_enabled\":true,\"verified_phone_label_enabled\":false,\"responsive_web_graphql_timeline_navigation_enabled\":true,\"unified_cards_ad_metadata_container_dynamic_card_content_query_enabled\":true,\"tweetypie_unmention_optimization_enabled\":true,\"responsive_web_uc_gql_enabled\":true,\"vibe_api_enabled\":true,\"responsive_web_edit_tweet_api_enabled\":true,\"graphql_is_translatable_rweb_tweet_is_translatable_enabled\":true,\"standardized_nudges_misinfo\":true,\"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled\":false,\"interactive_text_enabled\":true,\"responsive_web_text_conversations_enabled\":false,\"responsive_web_enhance_cards_enabled\":true}"

    for twid in tw_id_lis:
        params = {
            "variables": "{\"focalTweetId\":\"" + twid + "\"," + variables_tail,
            "features": features
        }
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(
            url, headers=headers, cookies=cookies, params=params, timeout=30
        )
        try:
            # Decode the body once; the text is both printed and queued.
            payload = response.json()
            instructions = payload["data"]["threaded_conversation_with_injections_v2"]["instructions"]
            first_entry = instructions[0]["entries"][0]
            result = first_entry["content"]["itemContent"]["tweet_results"]["result"]
            full_text = result["legacy"]["full_text"].strip()
            print(full_text)
            que.put(full_text)
        except Exception as e:
            # Best effort per tweet: report the problem (with the line that
            # raised) and move on to the next id.
            print(e, e.__traceback__.tb_lineno)


def get_data2(cursor):
    """
    Fetch a follow-up page of the ``"submit office"`` search listing.

    Sends the same request as the cursor-less listing call, plus a ``cursor``
    query parameter, to the ``adaptive.json`` search endpoint.

    Args:
        cursor: pagination cursor returned by the previous listing request;
            it is converted to a string before being sent.

    Returns:
        tuple: ``(next_cursor, tweet_ids)``. ``next_cursor`` is the ``value``
        of the cursor operation in the ``replaceEntry`` of the third
        instruction (index 2); ``tweet_ids`` is the list of keys of
        ``globalObjects.tweets``.

    Raises:
        KeyError, IndexError: if the decoded JSON does not have the shape
            described above.
        requests.exceptions.RequestException: on a network failure or when
            the request times out.
    """
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/search?q=%22submit%20office%22&src=typed_query&f=top",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/2/search/adaptive.json"
    params = {
        "include_profile_interstitial_type": "1",
        "include_blocking": "1",
        "include_blocked_by": "1",
        "include_followed_by": "1",
        "include_want_retweets": "1",
        "include_mute_edge": "1",
        "include_can_dm": "1",
        "include_can_media_tag": "1",
        "include_ext_has_nft_avatar": "1",
        "include_ext_is_blue_verified": "1",
        "skip_status": "1",
        "cards_platform": "Web-12",
        "include_cards": "1",
        "include_ext_alt_text": "true",
        "include_ext_limited_action_results": "false",
        "include_quote_count": "true",
        "include_reply_count": "1",
        "tweet_mode": "extended",
        "include_ext_collab_control": "true",
        "include_entities": "true",
        "include_user_entities": "true",
        "include_ext_media_color": "true",
        "include_ext_media_availability": "true",
        "include_ext_sensitive_media_warning": "true",
        "include_ext_trusted_friends_metadata": "true",
        "send_error_codes": "true",
        "simple_quoted_tweet": "true",
        "q": "\"submit office\"",
        "count": "20",
        "query_source": "typed_query",
        "cursor": str(cursor),
        "pc": "1",
        "spelling_corrections": "1",
        "include_ext_edit_control": "true",
        "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(
        url, headers=headers, cookies=cookies, params=params, timeout=30
    )

    # Decode the body once and reuse it for both return values.
    payload = response.json()

    # On these pages the script reads the next cursor from the replaceEntry
    # of the third instruction rather than from an addEntries list.
    replaced = payload["timeline"]["instructions"][2]["replaceEntry"]["entry"]
    next_cursor = replaced["content"]["operation"]["cursor"]["value"]
    tweet_ids = list(payload["globalObjects"]["tweets"].keys())
    return next_cursor, tweet_ids


if __name__ == '__main__':
    # Crawl the listing pages, fetch every tweet's text, then append the
    # collected texts to ./res.txt.

    # Shared with get_detail(), which pushes each tweet text onto this queue.
    que = Queue()

    cursor = ""
    num = 0
    seen_ids = set()   # tweet ids already handed to a detail worker
    worker = None      # the detail thread currently running, if any
    while True:
        num += 1
        if num > 2:
            cursor, tw_id_lis = get_data2(cursor)
            if not tw_id_lis:
                # An empty page marks the end of the listing.
                break
        else:
            cursor, tw_id_lis = get_data()
        time.sleep(1)

        # get_data() sends no cursor, so its second call returns the same
        # listing again; skip ids that were already processed so the output
        # does not contain duplicates.
        new_ids = [tw_id for tw_id in tw_id_lis if tw_id not in seen_ids]
        seen_ids.update(new_ids)
        if not new_ids:
            continue

        # Thread.run() would execute get_detail in this thread; start() is
        # what actually spawns one. Waiting for the previous worker first
        # keeps at most one detail worker alive and preserves result order,
        # while still overlapping it with the listing request above.
        if worker is not None:
            worker.join()
        worker = Thread(target=get_detail, args=(new_ids,))
        worker.start()

    # Every queued result must be in place before the file is written.
    if worker is not None:
        worker.join()

    # Save the results; the context manager guarantees the file is closed.
    with open("./res.txt", "a", encoding="utf-8") as fp:
        while not que.empty():
            fp.write(que.get() + "\n")
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值