Saving to MySQL with aiohttp: An Asynchronous Crawler Based on asyncio, aiohttp, and XPath

This article describes how to write an asynchronous crawler with the asyncio and aiohttp libraries. The crawler reads its crawl rules from a CSV file, scrapes health-related information from several websites according to those rules, and saves the data to a MySQL database. Parsing the HTML with XPath lets it cope with the structural differences between sites, and the number of items to fetch per site is configurable. The code also includes a method for stripping style attributes from HTML strings, along with the database connection configuration.

Today I'd like to walk through an asynchronous crawler built on asyncio and aiohttp, with XPath used to parse the HTML.

The crawler implements the following features:

1. It reads crawl rules from a csv file and crawls according to them; the code includes extraction rules for 3 different sites, and more can be added as needed;

2. It saves the crawled data to a MySQL database.

Given a search query, the crawler fetches health-related content for it.
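Before running the code, the third-party libraries it imports (aiohttp, aiomysql, and lxml) need to be installed; assuming a Python 3.5+ environment (async/await syntax requires it), pip will do:

pip install aiohttp aiomysql lxml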

The full code is as follows:

# coding:utf-8
"""
async spider - xpath
"""
from lxml import etree
import csv
import re
import os
import asyncio
import aiohttp
import aiomysql
from datetime import datetime

from config import Config

class HealthSpider(object):

    def __init__(self, user_id, keyword, url, hrule, drule, count, trule):
        self.user_id = user_id
        self.keyword = keyword
        self.url = url            # entry URL of the site to crawl
        self.hrule = hrule        # xpath rule for extracting item URLs
        self.drule = drule        # xpath rule for extracting item content
        self.count = count        # number of items to fetch from this site
        self.trule = trule        # xpath rule for extracting item titles
        self.headers = {}
        self.urls_done = []       # URLs already crawled
        self.urls_will = []       # URLs waiting to be crawled
        self.spider_data = {}     # crawled results

    @staticmethod
    def handle_flag(html):
        """
        Strip inline style attributes from an HTML string.
        :param html:
        :return:
        """
        pattern = re.compile(r' style=".*?;"', re.S)
        return pattern.sub('', html)

    async def get_html(self, url, session):
        """
        Fetch the given url and return the HTML text.
        :param url:
        :return:
        """
        try:
            async with session.get(url, headers=self.headers, timeout=5) as resp:
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception:
            raise Exception("Failed to fetch search results")

    def get_url(self, resp):
        """
        Extract the url of each item from the HTML.
        :param resp:
        :return:
        """
        root = etree.HTML(str(resp))
        items = root.xpath(self.hrule)
        # The HTML structure differs between sites, so the URLs
        # are assembled differently as well
        if 5 == self.count:
            # dxy.com pages use relative links
            self.urls_will = ['https://dxy.com' + i for i in items[:5]]
        elif 3 == self.count:
            self.urls_will = [i for i in items[:3]]
        elif 2 == self.count:
            self.urls_will = [i for i in items[:2]]

    async def get_data(self, url, session, pool):
        """
        Fetch and parse one item from its url, then save it to MySQL.
        :return:
        """
        # Fetch the item's HTML
        html = await self.get_html(url, session)
        root = etree.HTML(str(html))
        html_data = ''
        try:
            title = root.xpath(self.trule)
            title = ''.join(title)
        except Exception:
            title = ''
        try:
            data = root.xpath(self.drule)
            if data:
                # The HTML structure differs between sites, so the data
                # is extracted differently as well
                if 3 == self.count:
                    html_data = ''.join(map(etree.tounicode, data))
                else:
                    html_data = etree.tounicode(data[0])
                # Strip style attributes from the result
                html_data = HealthSpider.handle_flag(html_data)
        except Exception:
            html_data = ''
        self.urls_done.append(url)
        # Save to the database: user id, keyword, date, main URL, sub URL, title, HTML data
        if html_data:
            self.spider_data["data"].append({"title": title, "html_data": html_data})
            spide_date = datetime.now()
            data = (self.user_id, self.keyword, spide_date, self.url, url, title, html_data)
            stmt = ("INSERT INTO spider_data (user_id, keyword, spide_date, main_url, sub_url, title, html_data) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            try:
                async with pool.acquire() as conn:
                    async with conn.cursor() as cur:
                        await cur.execute(stmt, data)
            except Exception:
                pass

    async def start_spider(self, pool):
        """
        Run the crawl for this site.
        :return:
        """
        async with aiohttp.ClientSession() as session:
            self.spider_data["user_id"] = self.user_id
            self.spider_data["keyword"] = self.keyword
            self.spider_data["data"] = []
            while True:
                # Stop when the URL queue is empty or `count` items have been crawled
                if (len(self.urls_will) == 0) or len(self.spider_data["data"]) == self.count:
                    break
                # Take the next URL to crawl
                url = self.urls_will.pop()
                if url not in self.urls_done:
                    await self.get_data(url, session, pool)
            return self.spider_data

    async def main(self, loop):
        # Request headers
        self.headers = {'Accept': 'text/html, application/xhtml+xml, application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
                        }
        # Connect to the MySQL database
        pool = await aiomysql.create_pool(host=Config.DB_HOST, port=Config.DB_PORT,
                                          user=Config.DB_USER, password=Config.DB_PASSWORD,
                                          db=Config.DB_NAME, loop=loop, charset="utf8",
                                          autocommit=True)
        async with aiohttp.ClientSession() as session:
            # Fetch the entry page
            html = await self.get_html(self.url, session)
            # Extract the item URLs from it
            self.get_url(html)
        data = await self.start_spider(pool)
        pool.close()
        await pool.wait_closed()
        return data

def get_rules(keyword):
    """
    Read the xpath crawl rules from the csv file.
    :return:
    """
    csv_dict = []
    path = os.path.join(os.path.dirname(__file__), 'rules.csv')
    with open(path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for line in reader:
            url = line['url'].format(keyword)
            hrule = line['hrule']
            drule = line['drule']
            count = int(line['count'])
            title = line['trule']
            csv_dict.append({"url": url, "hrule": hrule, "drule": drule,
                             "count": count, "trule": title})
    return csv_dict

def start_spider(keyword):
    """
    Crawl data for the given keyword.
    :param keyword:
    :return:
    """
    try:
        data_list = get_rules(keyword)
    except Exception:
        raise Exception("Failed to load the crawl rules")
    spider_data = []
    tasks = []
    loop = asyncio.get_event_loop()
    for i in data_list:
        spider = HealthSpider(1, keyword, i['url'], i['hrule'], i['drule'], i['count'], i['trule'])
        # One task per site
        tasks.append(asyncio.ensure_future(spider.main(loop)))
    # Run all tasks on the loop
    loop.run_until_complete(asyncio.wait(tasks))
    try:
        for task in tasks:
            spider_data.extend(task.result()["data"])
    except Exception:
        pass
    # Wait briefly so the underlying connections can close
    loop.run_until_complete(asyncio.sleep(0.250))
    loop.close()
    return spider_data

if __name__ == '__main__':
    # Crawl content about "感冒了怎么办" ("what to do about a cold")
    start_spider("感冒了怎么办")

A few notes on what some of these methods do:

1. handle_flag() removes the style attributes from an HTML string while keeping all the other tags, which makes the result easier to render on the front end; see the short demo below.
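A quick sanity check of handle_flag() (the sample HTML here is made up for illustration):

html = '<p style="color:red;">How to treat a cold</p>'
print(HealthSpider.handle_flag(html))
# prints: <p>How to treat a cold</p>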

2. get_data() crawls the individual items and uses aiomysql to save the crawled data to the database.
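The post never shows the definition of the spider_data table, but a schema along the following lines would match the INSERT statement in get_data(); the column types here are my assumption:

CREATE TABLE spider_data (
    id         INT AUTO_INCREMENT PRIMARY KEY,
    user_id    INT NOT NULL,
    keyword    VARCHAR(255) NOT NULL,
    spide_date DATETIME NOT NULL,
    main_url   VARCHAR(512) NOT NULL,
    sub_url    VARCHAR(512) NOT NULL,
    title      VARCHAR(255),
    html_data  TEXT
) CHARACTER SET utf8mb4;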

The database configuration file, config.py:

# coding=utf-8

class Config(object):
    DB_ENGINE = 'mysql'
    DB_HOST = '127.0.0.1'
    DB_PORT = 3306
    DB_USER = 'root'
    DB_PASSWORD = 'wyzane'
    DB_NAME = 'db_tornado'
    DB_OPTIONS = {
        'init_command': "SET sql_mode='STRICT_TRANS_TABLES'",
        'charset': 'utf8mb4',
    }

3. get_rules() reads the crawl rules from the rules.csv file. Since three different sites are crawled at once, each site needs its own XPath rules for parsing the HTML and extracts a different number of items, so all of these rules live in rules.csv (a plain-text CSV file, which also opens in Excel). The rules are read first, and then the crawl begins.
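For reference, rules.csv would look something like the sketch below: one row per site, with {} in the url column as the placeholder for the search keyword. The URLs and XPath expressions here are illustrative placeholders, not the actual rules:

url,hrule,drule,count,trule
https://search.dxy.com/?keyword={},//div[@class='item']/a/@href,//div[@class='article'],5,//h1/text()
https://www.example-health.com/search?q={},//li[@class='result']/a/@href,//div[@id='content'],3,//h1/text()
https://www.example-ask.com/s?wd={},//h3/a/@href,//div[@class='answer'],2,//title/text()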

That's all there is to this asyncio-based asynchronous crawler. If you spot any mistakes, feedback and corrections are welcome!
