【WB】Weibo crawler case study: headless-browser collection via selenium / playwright / requests


Standing in the evening breeze under the moonlight
Strolling alone along the long corridor
The moon steeped in the pond
Joy filling the heart
Quietly listening to the lilting music
Feeling ever more delighted
Night birds on high branches singing in chorus
Moonlight falling on the rosy clouds
A warm breeze brushing lightly past
As if entering the mountain shade the heart longs for
                     🎵 苏妙玲《彩云追月》 (Su Miaoling, "Colourful Clouds Chasing the Moon")


import time

import requests
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from loguru import logger
from scrapy import Selector
import re
from urllib.parse import urlparse


def extract_html_text(html):
    logger.info(f"Worker-0-run-extract_html_text")
    selector = Selector(text=html)
    # drop whitespace and newlines, keeping only the non-blank text nodes
    text = selector.xpath('//body//text()').re(r'\S+|\n+')
    text = ''.join(text)
    # keep only Chinese characters (CJK unified ideographs)
    charlist = re.findall('[\u4e00-\u9fa5]+', text)
    text = ''.join(charlist)
    if text:
        return text.strip()
    else:
        return ''


def extract_html(url):
    # launch Playwright
    with sync_playwright() as p:
        # pick a browser engine (chromium, firefox, webkit)
        browser = p.chromium.launch(
            headless=True,  # set to False to launch the browser with a visible window
            executable_path=r'/Applications/Chromium.app/Contents/MacOS/Chromium'  # path to the local browser binary
            # executable_path=r'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'  # or point to a local Chrome install
        )
        # headers = {
        #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        # }

        context = browser.new_context()
        page = context.new_page()
        page.set_default_timeout(15000)
        page.goto(url)
        # wait 5 seconds for the dynamically rendered content to load
        time.sleep(5)
        html = page.content()
        # print(html)
        page.close()
        browser.close()
        return html


def crawl(url):
    html = ''
    try:
        html = extract_html(url)
    except PlaywrightTimeoutError:
        logger.info("Page load timed out; check the network connection or whether the site is reachable")
    except Exception as e:
        logger.error(f"Unexpected error while fetching the page: {e}")
    if not html:
        return
    response = Selector(text=html)
    # the post footer's aria-label holds the repost, comment and like counts as one comma-separated string
    values = response.xpath("//footer/@aria-label").extract_first()
    if not values:
        return
    share, comment, like = values.split(',')
    print(share, comment, like)


def is_valid_http_url(url):
    parsed_url = urlparse(url)
    return parsed_url.scheme in ("http", "https")


def request_url(wId):
    # Weibo's ajax "statuses/show" endpoint returns the post's stats as JSON
    url = f'https://weibo.com/ajax/statuses/show?id={wId}&locale=zh-CN'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        "Cookie": "SUB=_2AkMRHTAuf8NxqwFRmfscyW7na4Rzzw3EieKnQcH1JRMxHRl-yT9kqnBetRB6Op0ewXqJGg99xI9PHf9GLxIl4ywMtbjK;"
    }

    resp = requests.get(url, headers=headers)
    data = resp.json()
    reposts_count = data['reposts_count']
    comments_count = data['comments_count']
    attitudes_count = data['attitudes_count']
    print(reposts_count, comments_count, attitudes_count)


if __name__ == '__main__':
    url = 'https://weibo.com/1731986465/Oe4y0u9Pn?refer_flag=1001030103_'
    crawl(url)
    # the post's short id is the last path segment of the URL, before any query string
    wId = url.split('/')[-1].split('?')[0]
    request_url(wId)
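
The title lists selenium alongside playwright and requests, but the script above only implements the latter two. Below is a minimal selenium sketch of the same page-fetching step, assuming Selenium 4+ with a locally installed Chrome; the function name extract_html_selenium and the --headless=new flag are illustrative additions, not part of the original code.

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def extract_html_selenium(url):
    # hypothetical selenium counterpart to extract_html(); Selenium 4+ resolves the driver on its own
    options = Options()
    options.add_argument('--headless=new')  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        time.sleep(5)  # wait for the dynamically rendered content, as in the playwright version
        return driver.page_source
    finally:
        driver.quit()

The HTML returned this way can be fed to the same Selector(text=html) parsing used in crawl(). Note that extract_html_text() and is_valid_http_url() are defined but never called in __main__; they can be applied to the HTML or URL handled by any of the three fetchers.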