网站链接:
'aHR0cDovL3d3dy5jdXN0b21zLmdvdi5jbi9jdXN0b21zLzMwMjI0OS8zMDIyNzAvMzAyMjcyL2luZGV4Lmh0bWw='
反爬技术:加速乐 + 数5
我是一名实习生,不多废话了,直接展示自动化爬取的实现:
import hashlib
import logging
import re
import time
from datetime import datetime

import redis
from lxml import etree
from playwright.sync_api import Playwright, sync_playwright
from pymongo import MongoClient
# MongoDB connection — the "xx" placeholders must be replaced with real
# credentials, database name, and collection name before running.
client = MongoClient("mongodb://xx:xx/")
db = client["xx"]
# Target collection that receives one document per scraped article.
collection = db["xx"]
def run(playwright: Playwright) -> None:
    """Consume detail-page URLs from Redis and scrape each with Playwright.

    Runs forever: pops URLs from the Redis list ``detail_urls``, loads each
    page, extracts title / body text / publish time, and inserts one document
    per page into the module-level MongoDB ``collection``.

    Args:
        playwright: An active sync Playwright instance.
    """
    browser = playwright.chromium.launch(
        headless=False,
        # Hide the webdriver automation flag to reduce bot detection.
        args=['--disable-blink-features=AutomationControlled'],
    )
    page = browser.new_page()
    # Create the Redis connection once, not on every loop iteration.
    r = redis.Redis(host='localhost', port=6379)
    # NOTE: the leading space inside class="..." is present in the site's HTML.
    title_xpath = '//div[@class=" easysite-news-title"]/h2'
    while True:
        url = r.rpop('detail_urls')
        if url is None:
            # Queue drained — back off instead of busy-spinning the CPU.
            time.sleep(1)
            continue
        url = url.decode('utf-8')
        try:
            page.goto(url)
            page.wait_for_selector(f'xpath={title_xpath}')
            if not page.is_visible(f'xpath={title_xpath}'):
                # Title missing: go back and skip this URL (the original code
                # fell through and parsed the wrong page's content).
                page.go_back()
                continue
            if 'was not found on this' in page.content():
                # 404-style page: nothing to parse here.
                page.go_back()
                continue
            tree = etree.HTML(page.content())
            title_nodes = tree.xpath(title_xpath + '/text()')
            title = title_nodes[0] if title_nodes else None
            content_parts = tree.xpath('//div[@class="easysite-news-text"]//text()')
            # Strip U+3000 (full-width ideographic space); the original code
            # replaced the literal text '\\u3000' instead of the character.
            content_str = ''.join(content_parts).replace('\u3000', '').replace('\n', '').replace('\t', '')
            content = ''.join(filter(str.isprintable, content_str))
            # The xpath result is a list; its str() repr is searched for a date
            # in any of the formats the site uses (most specific wins last).
            pub_time = str(tree.xpath('//*[@class="hgfg_list"]/text()'))
            m = re.search(r'\d+年\d+月\d+日', pub_time)
            if m:
                pub_time = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
            m = re.search(r'\d+-\d+-\d+', pub_time)
            if m:
                pub_time = m.group(0)
            m = re.search(r'\d+-\d+-\d+ \d+:\d+', pub_time)
            if m:
                pub_time = m.group(0)
            now = datetime.now()
            # Zero-padded time-of-day, consistent with fetchtime (the original
            # emitted unpadded values like "9:5:3").
            pubtime = f'{pub_time} {now.strftime("%H:%M:%S")}'
            fetchtime = now.strftime("%Y-%m-%d %H:%M:%S")
            # Stable document id derived from the URL; renamed from `id`,
            # which shadowed the builtin.
            doc_id = hashlib.md5(url.encode('utf-8')).hexdigest()
            doc = {
                'title': title,
                'content': content,
                'pubtime': pubtime,
                'url': url,
                'id': doc_id,
                'fetchtime': fetchtime,
            }
            logging.info(f'insert_one:{doc}')
            collection.insert_one(doc)
        except Exception as e:
            # Best-effort scraping: log the failure and move to the next URL.
            logging.error(f'爬取 {url} 失败:{str(e)}')
            continue
# Launch the scraper only when executed as a script, not when imported —
# the original started a browser session as a module-import side effect.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)
总结:
Playwright 的效率比 Selenium 高多了。做法是先遍历一遍详情页 URL 存到 Redis 里,再利用多进程兼分布式的思想对网站进行爬取。在实战过程中体验了一把 Playwright 的魅力,但仍然存在一些问题:Playwright 会内存泄漏,程序稳定性不好。