I. Multithreading Review
1. Two ways to implement multithreading
① Create a thread with t = Thread(target=func) and start it with t.start()
② Write a class that inherits from Thread
2. Code example
from threading import Thread

class My(Thread):
    def run(self):
        for a in range(100):
            print(a)

if __name__ == '__main__':
    t1 = My()
    t1.start()
    for a in range(100):
        print('main', a)
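For completeness, a minimal sketch of method ① named above (Thread(target=func) plus t.start()); the function func and its loop are just placeholders mirroring the class-based example:

from threading import Thread

def func():
    for a in range(100):
        print(a)

if __name__ == '__main__':
    t = Thread(target=func)  # pass the function itself, do not call it here
    t.start()                # start the child thread
    for a in range(100):
        print('main', a)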
II. Web Scraping Review
Main data-parsing methods
① re parsing:
Data is parsed and extracted mainly with regular expressions.
Example:
obj1 = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
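A minimal usage sketch of the compiled pattern above; the html string here is a made-up stand-in for page source fetched elsewhere:

import re

html = "2021必看热片<ul><li>电影1</li></ul>"  # placeholder page source
obj1 = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)  # re.S lets . match newlines too
for it in obj1.finditer(html):  # finditer lazily walks every match
    print(it.group("ul"))       # pull out the named capture group "ul"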
② bs4 parsing
Main steps:
1. Hand the page source to BeautifulSoup for processing, producing a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # specify the html parser
2. Look up data on the bs object: find(tag, attribute=value), find_all(tag, attribute=value)
page.find("table", class_="hq_table")
# class is a Python keyword; the trailing underscore keeps it from clashing
# table = page.find("table", attrs={"class": "hq_table"})  # equivalent to the line above
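Joining the two steps into one runnable sketch; the URL is a placeholder and hq_table is simply the class used above, so both would be adjusted to the real target page:

import requests
from bs4 import BeautifulSoup

resp = requests.get("http://example.com/prices")   # placeholder URL
page = BeautifulSoup(resp.text, "html.parser")     # step 1: build the bs object
table = page.find("table", class_="hq_table")      # step 2: locate the target tag
if table is not None:
    for tr in table.find_all("tr"):                # find_all returns every matching tag
        print(tr.get_text(strip=True))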
③ xpath parsing
Key points:
tree = etree.parse("b.html")  # load an html file directly
result = tree.xpath("/html/body/ul/li[1]/a/text()")  # li[1] takes the content of the first li under ul
# xpath indexing starts at 1
result1 = tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")
# a[@href='dapao'] picks, under li, the a tag whose href attribute equals 'dapao'; [@XXX=XXX] is an attribute filter
Note: all three parsing approaches need the corresponding package imported, listed below.
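For reference, the three imports are:

import re                      # re parsing
from bs4 import BeautifulSoup  # bs4 parsing
from lxml import etree         # xpath parsing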
III. Thread Pools
Explanation: a batch of threads is opened up front; we simply submit tasks to the pool, and scheduling those tasks onto threads is left to the pool itself.
Using a thread pool requires the import:
from concurrent.futures import ThreadPoolExecutor
A simple illustration:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def fun(name):
    for a in range(500):
        print(name, a)

if __name__ == '__main__':
    # create the thread pool
    with ThreadPoolExecutor(10) as t:  # a pool of 10 threads
        for i in range(100):
            t.submit(fun, name=f"thread{i}")
    # the with block waits until every task in the pool has finished before continuing
    print("over")
Using a thread pool to extract and store produce data from a website
Steps:
1. Extract the data from a single page
2. Add a thread pool so that many pages are fetched at the same time
import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor

f = open("data.csv", mode="w", encoding="utf-8", newline="")  # newline="" keeps csv from inserting blank rows
cswriter = csv.writer(f)

def download_one_page(url):
    # fetch the page source
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    table = html.xpath("/html/body/div[2]/div[4]/div[1]/table")[0]
    # trs = table.xpath("./tr")[1:]  # skip the header row by slicing from index 1, or:
    trs = table.xpath("./tr[position()>1]")
    # process each tr
    for tr in trs:
        txt = tr.xpath("./td/text()")
        # light cleanup: strip out the \\ and / characters
        txt = (item.replace("\\", "").replace("/", "") for item in txt)
        # write the row to the csv file
        cswriter.writerow(txt)
    print(url, "extracted")

if __name__ == '__main__':
    # for i in range(1, 16721):  # far too slow when done one page at a time
    #     download_one_page(f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")
    with ThreadPoolExecutor(50) as t:
        for i in range(200):
            t.submit(download_one_page, url=f"http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml")
    print("all downloads finished")
    f.close()
IV. Coroutines

import asyncio

async def fun():
    print("4556")
    # the async keyword makes calling this function return a coroutine object

if __name__ == '__main__':
    g = fun()
    # fun is now an asynchronous coroutine function; calling it only produces a coroutine object
    asyncio.run(g)  # actually run the coroutine object g
    # running coroutines needs the support of the asyncio module
1. Converting time.sleep() to a coroutine
Synchronous time.sleep() --> the asynchronous await asyncio.sleep()
import asyncio
import time

async def f1():
    print("101")
    # time.sleep(3)  # a synchronous call here would block and break the asynchrony
    await asyncio.sleep(3)  # suspend this coroutine so other work can use the thread
    print("102")

async def f2():
    print("201")
    await asyncio.sleep(2)
    print("202")

async def f3():
    print("301")
    await asyncio.sleep(4)
    print("302")

if __name__ == '__main__':
    a1 = f1()
    a2 = f2()
    a3 = f3()
    # asyncio.run(a1)
    # asyncio.run(a2)
    # asyncio.run(a3)
    # started separately like this, each call has to finish before the next one can begin
    tasks = [a1, a2, a3]  # collect the coroutines so they can be launched together
    t1 = time.time()
    asyncio.run(asyncio.wait(tasks))
    t2 = time.time()
    print(t2 - t1)  # about 4 seconds (the longest sleep), not 3 + 2 + 4, because the waits overlap
import asyncio

async def f1():
    print("101")
    # time.sleep(3)  # a synchronous call here would block and break the asynchrony
    await asyncio.sleep(3)  # suspend this coroutine so other work can use the thread
    print("102")

async def f2():
    print("201")
    await asyncio.sleep(2)
    print("202")

async def f3():
    print("301")
    await asyncio.sleep(4)
    print("302")

# wrap everything in a main() coroutine to keep the __main__ block light
async def main():
    # # option 1
    # a1 = f1()
    # await a1  # await is normally placed directly in front of a coroutine object
    # option 2
    tasks = [
        asyncio.create_task(f1()),
        asyncio.create_task(f2()),
        asyncio.create_task(f3())
    ]  # from Python 3.8 on, wrap the coroutines into tasks yourself, otherwise asyncio.wait() warns or raises
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
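As a side note (not from the original), asyncio.gather is a common alternative to asyncio.wait: it accepts coroutines directly, wraps them into tasks itself, and returns their results in order, so main() above could be shortened to:

async def main():
    await asyncio.gather(f1(), f2(), f3())  # schedules all three and waits for them concurrently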
A simple coroutine template for scraping

import asyncio

async def download(url):
    print("start downloading")
    await asyncio.sleep(4)  # stands in for a network request such as requests.get()
    print("download finished")

async def main():
    urls = [
        "http://www.baidu.com",
        "http://www.bilibili.com"
    ]
    tasks = []
    for url in urls:
        d = download(url)
        tasks.append(asyncio.create_task(d))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
Note: all of this asynchronous coroutine work runs within a single thread.
2. Converting requests.get() to a coroutine
Synchronous requests.get() --> the asynchronous aiohttp
Example: downloading three images with asynchronous coroutines
import asyncio
import aiohttp

urls = [
    "http://kr.shanghai-jiuxin.com/file/2021/0702/662c6fbaa13160d20122c30f97de8bd8.jpg",
    "https://kr.wzh3c.com/file/2020/0608/smalldf980505591cc79141141fc361e98e49.jpg",
    "http://kr.shanghai-jiuxin.com/file/2021/0702/be7e8685eb8808c1c0809227760e5016.jpg"
]

async def aiodownload(url):
    """1. send the request  2. get the image bytes  3. save them to a file"""
    # s = aiohttp.ClientSession()  <==>  requests
    # requests.get()  <==>  s.get()
    name = url.rsplit("/", 1)[1]  # use the part after the last "/" as the file name
    async with aiohttp.ClientSession() as s:
        async with s.get(url) as resp:
            # request succeeded, write the bytes to a file
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # reading the body is asynchronous, so it must be awaited
    print(name, "ok")

async def main():
    tasks = []
    for url in urls:
        t = aiodownload(url)
        tasks.append(asyncio.create_task(t))
        # tasks.append(asyncio.create_task(aiodownload(url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
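The open()/f.write() call above is still synchronous and briefly blocks the event loop. A small variation (a sketch, using the aiofiles library that the next example also relies on) makes the file write asynchronous as well:

import aiofiles

async def save(name, data):
    # aiofiles hands the file operations to a worker thread, so awaiting the write
    # lets other coroutines keep running while the bytes are flushed to disk
    async with aiofiles.open(name, mode="wb") as f:
        await f.write(data)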
One of the downloaded images: (image not included here)
Scraping the content of a novel
Steps:
1. Synchronous step: request getCatalog to get the cid and title of every chapter
2. Asynchronous step: request getChapterContent to download the content of every chapter
import json
import requests
import asyncio
import aiohttp
import aiofiles

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}

async def aiodownload(cid, b_id, title):
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"
    async with aiohttp.ClientSession() as s:
        async with s.get(url, headers=head) as resp:
            dic = await resp.json()
            async with aiofiles.open(title, mode="w", encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])

async def getCatalog(url):
    resp = requests.get(url, headers=head)
    dic = resp.json()
    tasks = []
    for item in dic['data']['novel']['items']:  # each item holds one chapter's title and cid
        title = item['title']
        cid = item['cid']
        # print(title, cid)
        # queue up one asynchronous download task per chapter
        t = aiodownload(cid, b_id, title)
        tasks.append(asyncio.create_task(t))
    await asyncio.wait(tasks)
    resp.close()

if __name__ == '__main__':
    b_id = "4306063500"
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    asyncio.run(getCatalog(url))