[Async Crawler] Scraping rmsj Asynchronously
Preface
We all know that web scraping is an I/O-bound task: after sending a request, the program has to sit and wait for the response before it can continue. So we can use an asynchronous crawler to speed things up by overlapping those waits.
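To make the gain concrete, here is a minimal, self-contained sketch (not part of the crawler itself; the one-second `asyncio.sleep` is just a stand-in for a network request) showing how `asyncio.gather` overlaps the waiting:

```python
import asyncio
import time

async def fake_request(i):
    await asyncio.sleep(1)  # stand-in for waiting on an HTTP response
    return i

async def main():
    start = time.perf_counter()
    await asyncio.gather(*(fake_request(i) for i in range(10)))
    # ten 1-second "requests" complete in roughly 1 second, not 10,
    # because the waits run concurrently instead of back to back
    print(f'elapsed: {time.perf_counter() - start:.1f}s')

asyncio.run(main())
```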
II. Usage Steps
1. Import the libraries
The code is as follows (example):
```python
from lxml import etree
from fake_useragent import UserAgent
import asyncio
import aiohttp
import logging
from motor.motor_asyncio import AsyncIOMotorClient
```
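These are all third-party packages; if any are missing, they can be installed from PyPI (package names only, versions left to your environment):

```
pip install aiohttp motor fake-useragent lxml
```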
2. Configuration
The code is as follows (example):
```python
client = AsyncIOMotorClient('mongodb://localhost:27017')  # MongoDB connection
db = client.test
collection = db.tjf
url_index = 'https://www.renminshangjin.com/criminal_screen.html?page={}'
url_delay = 'https://www.renminshangjin.com'
semaphore = asyncio.Semaphore(10)  # maximum concurrency
session = None
logging.basicConfig(level=logging.INFO)
page_number = 10
```
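A note on the semaphore: `asyncio.Semaphore(10)` lets at most 10 coroutines hold it at once, so no more than 10 requests are ever in flight at the same time. A minimal sketch of the pattern (the task names and delays here are illustrative, not taken from the crawler):

```python
import asyncio

semaphore = asyncio.Semaphore(10)  # at most 10 tasks inside at a time

async def limited_task(i):
    async with semaphore:       # the 11th task waits here until a slot frees up
        await asyncio.sleep(1)  # stand-in for an HTTP request
        return i

async def main():
    # 30 tasks sharing 10 slots finish in about 3 seconds
    await asyncio.gather(*(limited_task(i) for i in range(30)))

asyncio.run(main())
```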
3. Complete script
```python
from lxml import etree
from fake_useragent import UserAgent
import asyncio
import aiohttp
import logging
from motor.motor_asyncio import AsyncIOMotorClient

client = AsyncIOMotorClient('mongodb://localhost:27017')  # MongoDB connection
db = client.test
collection = db.tjf
url_index = 'https://www.renminshangjin.com/criminal_screen.html?page={}'
url_delay = 'https://www.renminshangjin.com'
semaphore = asyncio.Semaphore(10)  # maximum concurrency
session = None
logging.basicConfig(level=logging.INFO)
page_number = 10


async def scrape_api(url):
    """Fetch a URL with a random User-Agent, limited by the semaphore."""
    headers = {'User-Agent': UserAgent().random}
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url, headers=headers) as response:
                return await response.text()
        except aiohttp.ClientError:
            logging.error('scrape_api error url: %s', url, exc_info=True)


async def scrape_index(page):
    """Fetch one page of the listing."""
    url = url_index.format(page)
    return await scrape_api(url)


async def scrape_detail(detail_path):
    """Fetch one detail page; detail_path is the href taken from the listing."""
    url = url_delay + detail_path
    return await scrape_api(url)


async def save_data(data):
    logging.info('saving data %s', data)
    if data:
        await collection.insert_one(data)


async def parse_details(detail_pages):
    """Extract the name and address from each detail page and save them."""
    for page in detail_pages:
        if not page:  # skip pages whose request failed
            continue
        tree = etree.HTML(page)
        name = tree.xpath('/html/body/section/div/div[1]/div[1]/div[2]/div[1]/div[1]/span/text()')[0]
        address = tree.xpath('/html/body/section/div/div[1]/div[1]/div[2]/div[last()-1]/span/text()')[0]
        await save_data({'name': name, 'address': address})


async def main():
    global session
    session = aiohttp.ClientSession()
    tasks = [asyncio.ensure_future(scrape_index(i)) for i in range(1, 130)]
    # asyncio.gather returns results in task order, which is handier than asyncio.wait
    results = await asyncio.gather(*tasks)
    for result in results:
        if not result:  # skip listing pages whose request failed
            continue
        tree = etree.HTML(result)
        anchors = tree.xpath('//div[@class="screen_criminal_list"]/a')
        ids = [a.xpath('@href')[0] for a in anchors]
        logging.info('URL: %s', ids)
        detail_tasks = [asyncio.ensure_future(scrape_detail(i)) for i in ids]
        detail_pages = await asyncio.gather(*detail_tasks)
        await parse_details(detail_pages)
    await session.close()


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
```
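Two closing notes. On Python 3.7+, `asyncio.run(main())` is the more modern way to launch the coroutine than `get_event_loop().run_until_complete(...)`. And once the run finishes, you can check what actually landed in MongoDB; a minimal sketch, assuming the same localhost instance and the `test.tjf` collection configured above:

```python
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def check():
    collection = AsyncIOMotorClient('mongodb://localhost:27017').test.tjf
    print('documents saved:', await collection.count_documents({}))
    async for doc in collection.find().limit(3):  # peek at a few records
        print(doc)

asyncio.run(check())
```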