网站链接:
'aHR0cDovL3d3dy5jdXN0b21zLmdvdi5jbi9jdXN0b21zLzMwMjI0OS8zMDIyNzAvMzAyMjcyL2luZGV4Lmh0bWw='
反爬技术:加速乐 + 数5
我是一名实习生,不多废话了,直接展示自动化爬取的实现:
import hashlib
import logging
import re
import time
from datetime import datetime

import redis
from lxml import etree
from playwright.sync_api import Playwright, sync_playwright
from pymongo import MongoClient
# MongoDB connection — the "xx" placeholders must be replaced with real
# credentials, database name, and collection name before running.
client = MongoClient("mongodb://xx:xx/")
db = client["xx"]
# Target collection that receives one document per scraped article.
collection = db["xx"]
def run(playwright: Playwright) -> None:
    """Consume detail-page URLs from Redis and scrape each with Playwright.

    Runs forever: pops URLs from the Redis list ``detail_urls``, loads each
    page, extracts title / body text / publish time, and inserts one document
    per page into the module-level MongoDB ``collection``.

    Args:
        playwright: An active sync Playwright instance.
    """
    browser = playwright.chromium.launch(
        headless=False,
        # Hide the webdriver automation flag to reduce bot detection.
        args=['--disable-blink-features=AutomationControlled'],
    )
    page = browser.new_page()
    # Create the Redis connection once, not on every loop iteration.
    r = redis.Redis(host='localhost', port=6379)
    # NOTE: the leading space inside class="..." is present in the site's HTML.
    title_xpath = '//div[@class=" easysite-news-title"]/h2'
    while True:
        url = r.rpop('detail_urls')
        if url is None:
            # Queue drained — back off instead of busy-spinning the CPU.
            time.sleep(1)
            continue
        url = url.decode('utf-8')
        try:
            page.goto(url)
            page.wait_for_selector(f'xpath={title_xpath}')
            if not page.is_visible(f'xpath={title_xpath}'):
                # Title missing: go back and skip this URL (the original code
                # fell through and parsed the wrong page's content).
                page.go_back()
                continue
            if 'was not found on this' in page.content():
                # 404-style page: nothing to parse here.
                page.go_back()
                continue
            tree = etree.HTML(page.content())
            title_nodes = tree.xpath(title_xpath + '/text()')
            title = title_nodes[0] if title_nodes else None
            content_parts = tree.xpath('//div[@class="easysite-news-text"]//text()')
            # Strip U+3000 (full-width ideographic space); the original code
            # replaced the literal text '\\u3000' instead of the character.
            content_str = ''.join(content_parts).replace('\u3000', '').replace('\n', '').replace('\t', '')
            content = ''.join(filter(str.isprintable, content_str))
            # The xpath result is a list; its str() repr is searched for a date
            # in any of the formats the site uses (most specific wins last).
            pub_time = str(tree.xpath('//*[@class="hgfg_list"]/text()'))
            m = re.search(r'\d+年\d+月\d+日', pub_time)
            if m:
                pub_time = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
            m = re.search(r'\d+-\d+-\d+', pub_time)
            if m:
                pub_time = m.group(0)
            m = re.search(r'\d+-\d+-\d+ \d+:\d+', pub_time)
            if m:
                pub_time = m.group(0)
            now = datetime.now()
            # Zero-padded time-of-day, consistent with fetchtime (the original
            # emitted unpadded values like "9:5:3").
            pubtime = f'{pub_time} {now.strftime("%H:%M:%S")}'
            fetchtime = now.strftime("%Y-%m-%d %H:%M:%S")
            # Stable document id derived from the URL; renamed from `id`,
            # which shadowed the builtin.
            doc_id = hashlib.md5(url.encode('utf-8')).hexdigest()
            doc = {
                'title': title,
                'content': content,
                'pubtime': pubtime,
                'url': url,
                'id': doc_id,
                'fetchtime': fetchtime,
            }
            logging.info(f'insert_one:{doc}')
            collection.insert_one(doc)
        except Exception as e:
            # Best-effort scraping: log the failure and move to the next URL.
            logging.error(f'爬取 {url} 失败:{str(e)}')
            continue
# Launch the scraper only when executed as a script, not when imported —
# the original started a browser session as a module-import side effect.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)
总结:
Playwright 的效率比 Selenium 高多了。做法是先遍历一遍详情页 URL 存到 Redis 里,再利用多进程兼分布式的思想对网站进行爬取。在实战过程中体验了一把 Playwright 的魅力,但仍然存在一些问题:Playwright 会内存泄漏,程序稳定性不好。