import logging
import asyncio
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
from motor.motor_asyncio import AsyncIOMotorClient
# --- MongoDB setup (motor async driver) ---
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movie'        # fixed typo: was 'momgodb_name'
MONGO_COLLECTION_NAME = 'data'

client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# --- Scrape-target configuration ---
index_url = 'https://spa2.scrape.center/page/{page}'  # list-page URL template
total_page = 10    # number of list pages to crawl
timeout = 10       # selector wait timeout, in seconds (pyppeteer takes ms)
window_width, window_height = 1366, 768  # browser viewport size
headless = False   # show the browser window while scraping

# Shared browser and tab handles, populated by init().
browser, tap = None, None
async def init():
    """Launch the browser and open the single tab shared by all scrapers."""
    global browser, tap
    launch_args = [
        f'--window-size={window_width},{window_height}',
        '--disable-infobars',
    ]
    browser = await launch(headless=headless, args=launch_args)
    tap = await browser.newPage()
    await tap.setViewport({'width': window_width, 'height': window_height})
async def scrape_index(url, selector):
    """Open *url* in the shared tab and block until *selector* renders.

    Timeouts are logged and swallowed so the crawl can continue.
    """
    logging.info('爬取 %s', url)
    wait_options = {'timeout': timeout * 1000}  # pyppeteer expects milliseconds
    try:
        await tap.goto(url)
        await tap.waitForSelector(selector, options=wait_options)
    except TimeoutError:
        logging.error('错误 %s', url)
async def scrape_page(page):
    """Load list page number *page* and wait for the movie-name links."""
    list_url = index_url.format(page=page)
    await scrape_index(list_url, selector='.item .name')
async def parse_index():
    """Collect every detail-page href from the list page currently loaded."""
    extract_hrefs = 'nodes => nodes.map(node => node.href)'
    return await tap.querySelectorAllEval('.item .name', extract_hrefs)
async def scrape_detail(url):
    """Load one movie's detail page and wait for its name block to render."""
    detail_selector = '.item .name'
    await scrape_index(url, selector=detail_selector)
async def parse_detail():
    """Parse the detail page currently loaded in the shared tab.

    Returns:
        dict: the movie's url, name, categories, cover image URL, score
        and plot synopsis (drama).
    """
    # page.url is a plain property in pyppeteer, not a coroutine.
    url = tap.url
    name = await tap.querySelectorEval('.item .name h2', 'node=>node.innerText')
    categories = await tap.querySelectorAllEval('.categories button span', 'nodes => nodes.map(node => node.innerText)')
    cover = await tap.querySelectorEval('.cover', 'node=>node.src')
    score = await tap.querySelectorEval('.score', 'node=>node.innerText')
    drama = await tap.querySelectorEval('.drama p', 'node=>node.innerText')
    return {
        'url': url,
        'name': name,
        # BUG FIX: key was misspelled 'catagories'; aligned with the
        # '.categories' CSS class the value is scraped from.
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama,
    }
async def main():
    """Crawl every list page, then each movie's detail page, upserting each
    record into MongoDB keyed on the movie name."""
    await init()
    try:
        for page in range(1, total_page + 1):
            await scrape_page(page)
            detail_urls = await parse_index()
            for detail_url in detail_urls:
                await scrape_detail(detail_url)
                data = await parse_detail()
                # Upsert on name so re-runs refresh records instead of duplicating.
                await collection.update_one(
                    {'name': data.get('name')}, {'$set': data}, upsert=True)
    finally:
        # init() may fail before assigning browser; guard so we don't mask
        # the original exception with an AttributeError on None.
        if browser is not None:
            await browser.close()
if __name__ == '__main__':
    # asyncio.run() creates, runs and closes its own event loop;
    # asyncio.get_event_loop() for this purpose is deprecated since 3.10.
    asyncio.run(main())
# pyppeteer 实战 (pyppeteer in practice) — blog-post title carried over from the source article
# 最新推荐文章于 2024-05-31 14:21:08 发布 (article footer: latest recommended post published 2024-05-31)