Playwright in practice: public-opinion scraping of a certain website

Site link (base64-encoded):

'aHR0cDovL3d3dy5jdXN0b21zLmdvdi5jbi9jdXN0b21zLzMwMjI0OS8zMDIyNzAvMzAyMjcyL2luZGV4Lmh0bWw='

Anti-scraping stack: jsl (加速乐) plus Ruishu gen-5 (瑞数5)
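Driving a real Chromium instance sidesteps most of this: the anti-bot JavaScript runs natively in the browser, so the clearance cookies are issued automatically after the first navigation instead of having to be reverse-engineered. Below is a minimal sanity-check sketch, not part of the original code; the target URL is a placeholder, and the "__jsl*" cookie names are typical for 加速乐 but not guaranteed for this site (Ruishu cookies use randomized, site-specific names).

from playwright.sync_api import sync_playwright

# Hypothetical check: navigate once and confirm the anti-bot cookies were set by the real browser.
with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=False,
        args=['--disable-blink-features=AutomationControlled'],
    )
    page = browser.new_page()
    page.goto('https://example.com/')  # placeholder for the decoded target URL
    cookies = {c['name']: c['value'] for c in page.context.cookies()}
    # Typical jsl cookie names start with "__jsl"; print whichever were issued.
    print([name for name in cookies if name.startswith('__jsl')])
    browser.close()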

I'm just an intern taking a crack at this, so no more preamble; straight to the browser-automation code.

Code:

import hashlib
import logging
import re

import redis
from lxml import etree
from datetime import datetime
from pymongo import MongoClient
from playwright.sync_api import Playwright, sync_playwright

logging.basicConfig(level=logging.INFO)  # make the INFO-level logs below visible

client = MongoClient("mongodb://xx:xx/")
db = client["xx"]
collection = db["xx"]


def run(playwright: Playwright) -> None:
    browser = playwright.chromium.launch(
        headless=False,
        # Hide the navigator.webdriver automation hint from the anti-bot scripts.
        args=['--disable-blink-features=AutomationControlled'],
    )
    page = browser.new_page()
    r = redis.Redis(host='localhost', port=6379)  # connect once, not on every iteration
    while True:
        url = r.rpop('detail_urls')
        if url is None:
            break  # queue drained, stop the worker
        url = url.decode('utf-8')
        try:
            page.goto(url)
            page.wait_for_selector('xpath=//div[@class=" easysite-news-title"]/h2')
            # Skip pages that failed to render or came back as "not found".
            if not page.is_visible('xpath=//div[@class=" easysite-news-title"]/h2'):
                continue
            if 'was not found on this' in page.content():
                continue
            resp = page.content()
            tree = etree.HTML(resp)
            title = None
            title_nodes = tree.xpath('//div[@class=" easysite-news-title"]/h2/text()')
            if title_nodes:
                title = title_nodes[0]
            content_parts = tree.xpath('//div[@class="easysite-news-text"]//text()')
            content_str = ''.join(content_parts).replace('\u3000', '').replace('\n', '').replace('\t', '')
            content = ''.join(filter(str.isprintable, content_str))
            # Normalize the publication date: "2023年1月2日" -> "2023-1-2", or keep an existing "Y-m-d[ H:M]" match.
            pubTime = str(tree.xpath('//*[@class="hgfg_list"]/text()'))
            if re.search(r'\d+年\d+月\d+日', pubTime):
                pubTime = re.search(r'\d+年\d+月\d+日', pubTime).group(0).replace('年', '-').replace('月', '-').replace('日', '')
            if re.search(r'\d+-\d+-\d+ \d+:\d+', pubTime):
                pubTime = re.search(r'\d+-\d+-\d+ \d+:\d+', pubTime).group(0)
            elif re.search(r'\d+-\d+-\d+', pubTime):
                pubTime = re.search(r'\d+-\d+-\d+', pubTime).group(0)
            # The page usually only gives a date, so pad it with the current time of day.
            now = datetime.now()
            pubtime = f'{pubTime} {now.hour}:{now.minute}:{now.second}'
            fetchtime = now.strftime("%Y-%m-%d %H:%M:%S")
            # Use the MD5 of the URL as a stable document id for deduplication.
            doc_id = hashlib.md5(url.encode('utf-8')).hexdigest()
            dic = {
                'title': title,
                'content': content,
                'pubtime': pubtime,
                'url': url,
                'id': doc_id,
                'fetchtime': fetchtime,
            }
            logging.info(f'insert_one: {dic}')
            collection.insert_one(dic)
        except Exception as e:
            logging.error(f'Failed to scrape {url}: {e}')
            continue
    browser.close()


with sync_playwright() as playwright:
    run(playwright)
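For context, the worker above only consumes URLs: it pops them from the Redis list detail_urls. A minimal sketch of the producer side that fills that list is below; the listing-page URL pattern, page count, and link selector are illustrative assumptions, not taken from the real site.

import redis
from playwright.sync_api import sync_playwright

r = redis.Redis(host='localhost', port=6379)

with sync_playwright() as p:
    browser = p.chromium.launch(
        headless=False,
        args=['--disable-blink-features=AutomationControlled'],
    )
    page = browser.new_page()
    for i in range(1, 11):  # hypothetical number of listing pages
        # Hypothetical listing-page URL pattern and link selector.
        page.goto(f'https://example.com/list_{i}.html')
        page.wait_for_selector('xpath=//ul[@class="news-list"]//a')
        for a in page.query_selector_all('xpath=//ul[@class="news-list"]//a'):
            href = a.get_attribute('href')
            if href:
                r.lpush('detail_urls', href)  # queue the detail-page URL for the workers
    browser.close()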


Summary:

Playwright is far more efficient than Selenium. The workflow here is to first walk the listing pages once and store every detail-page URL in Redis, then scrape with a multi-process, loosely distributed setup in which each worker pops URLs from the shared queue (sketched below). This project was a good taste of what Playwright can offer, but problems remain: Playwright leaks memory over long runs, so the program's stability is poor.
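A sketch of the multi-process side, under the assumption that run() is the worker defined above and is importable: each process starts its own Playwright and browser, and because Redis rpop is atomic, no two workers ever receive the same URL. The worker count of 4 is arbitrary. To contain the memory growth mentioned above, run() could additionally close and relaunch the browser every few hundred URLs; that change is not shown here.

from multiprocessing import Process

from playwright.sync_api import sync_playwright


def worker() -> None:
    # Each process gets its own Playwright/browser instance; work is shared
    # implicitly because rpop on the Redis list is atomic.
    with sync_playwright() as playwright:
        run(playwright)


if __name__ == '__main__':
    processes = [Process(target=worker) for _ in range(4)]  # 4 workers is an arbitrary choice
    for proc in processes:
        proc.start()
    for proc in processes:
        proc.join()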
