# --- Synchronous version ---
import requests
from lxml import etree
from urllib.request import urlretrieve
import os
class DoutulaSpider(object):
    """Synchronous scraper that downloads emoji images from pkdoutu.com.

    Fetches list pages 1-10 and saves every image found on each page into
    the local ``emoji`` directory.
    """

    def __init__(self):
        # List-page URL template; ``page`` is filled in by run().
        self.base_url = "https://www.pkdoutu.com/article/list/?page={page}"
        # Fixed typo: "hearders" -> "headers" (attribute is internal to the class).
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
        }

    def parser_page(self, url):
        """Download every emoji image found on one list page.

        :param url: fully formatted list-page URL.
        :raises requests.HTTPError: if the list page returns an error status.
        """
        resp = requests.get(url, headers=self.headers)
        resp.raise_for_status()  # fail loudly instead of parsing an error page
        parser = etree.HTML(resp.text)
        # BUGFIX: the original expression was empty (""), which raises
        # lxml.etree.XPathEvalError, so the spider could never download
        # anything. The selector below targets pkdoutu's lazy-loaded image
        # URLs -- TODO(review): confirm against the live page markup.
        emoji_url_list = parser.xpath(
            "//img[@class='img-responsive lazy image_dta']/@data-original"
        )
        # Ensure the target directory exists; urlretrieve does not create it.
        os.makedirs("emoji", exist_ok=True)
        for emoji_url in emoji_url_list:
            emoji_name = emoji_url.split("/")[-1]
            emoji_path = os.path.join("emoji", emoji_name)
            urlretrieve(emoji_url, emoji_path)
            print("%s表情下载成功!" % emoji_name)

    def run(self):
        """Crawl list pages 1 through 10 sequentially."""
        for x in range(1, 11):
            page_url = self.base_url.format(page=x)
            self.parser_page(page_url)
if __name__ == '__main__':
    # Entry point: build the spider and crawl all list pages.
    emoji_spider = DoutulaSpider()
    emoji_spider.run()
# --- Asynchronous (asyncio) version ---
from lxml import etree
import os
import aiohttp
import aiofiles
import asyncio
class DoutulaSpider(object):
    """Asynchronous (aiohttp) scraper that downloads emoji images from pkdoutu.com.

    Fetches list pages 1-10 over one shared ClientSession and saves every
    image found on each page into the local ``emoji`` directory.
    """

    def __init__(self):
        # List-page URL template; ``page`` is filled in by run().
        self.base_url = "https://www.pkdoutu.com/article/list/?page={page}"
        # Fixed typo: "hearders" -> "headers" (attribute is internal to the class).
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
        }

    async def parser_page(self, url):
        """Download every emoji image found on one list page.

        :param url: fully formatted list-page URL.
        """
        async with self.session.get(url) as resp:
            html = await resp.text()
        parser = etree.HTML(html)
        # BUGFIX: the original expression was empty (""), which raises
        # lxml.etree.XPathEvalError, so the spider could never download
        # anything. The selector below targets pkdoutu's lazy-loaded image
        # URLs -- TODO(review): confirm against the live page markup.
        emoji_url_list = parser.xpath(
            "//img[@class='img-responsive lazy image_dta']/@data-original"
        )
        # Ensure the target directory exists before writing files into it.
        os.makedirs("emoji", exist_ok=True)
        for emoji_url in emoji_url_list:
            # Read the whole image inside the response context, then write it.
            async with self.session.get(emoji_url) as emoji_resp:
                data = await emoji_resp.read()
            emoji_name = emoji_url.split("/")[-1]
            emoji_path = os.path.join("emoji", emoji_name)
            async with aiofiles.open(emoji_path, 'wb') as fp:
                await fp.write(data)
            print("%s表情下载成功!" % emoji_name)

    async def run(self):
        """Crawl list pages 1 through 10, always closing the HTTP session.

        BUGFIX: the original created the session and called close() outside
        any try/finally, leaking the session (and its connections) whenever a
        page raised. ``async with`` guarantees cleanup on every exit path.
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            self.session = session
            for x in range(1, 11):
                await self.parser_page(self.base_url.format(page=x))
if __name__ == '__main__':
    # Entry point: drive the async crawl to completion on a fresh event loop.
    emoji_spider = DoutulaSpider()
    asyncio.run(emoji_spider.run())