异步爬虫

异步爬虫

  • 多线程
  • 多进程
  • 协程

多线程与多进程

进程:运行中的程序,每次我们执行一个程序,操作系统会自动地为这个程序准备一些必要的资源(如:分配内存,创建一个能够执行的线程)

线程:程序内,可以直接被CPU调度的执行过程,是操作系统能够进行运算调度的最小单位,它被包含在进程之中,是进程中的实际运作单位

# 两种写法

# 第一种
from threading import Thread


# 创建任务
def func(name):
    """Print ``name`` followed by each integer from 0 through 9, one pair per line."""
    counter = 0
    while counter < 10:
        print(name, counter)
        counter += 1


def main():
    """Create three threads that each run ``func`` with a different name, then start them."""
    # Build every thread object before any of them is started.
    workers = [
        Thread(target=func, args=(label,))
        for label in ("周杰伦", "马斯克", "周星驰")
    ]

    # Launch the threads in the order they were created.
    for worker in workers:
        worker.start()

main()

# 第二种 用方法来写 不会

多线程写法

# 两种写法

# 第一种
from threading import Thread


# 创建任务
def func(name):
    """Write ten lines to stdout, each holding ``name`` and a counter running from 0 to 9."""
    for pair in ((name, step) for step in range(10)):
        print(*pair)


def main():
    """Run ``func`` on three separate threads, one per name."""
    names = ["周杰伦", "马斯克", "周星驰"]

    # Create all thread objects up front.
    pool = []
    for who in names:
        pool.append(Thread(target=func, args=(who,)))

    # Start them in creation order.
    for thread in pool:
        thread.start()

main()

多线程练习

from threading import Thread
import requests
from bs4 import BeautifulSoup

def func(index):
    """Download every thumbnail image found on listing page ``index``.

    Fetches ``http://2chsck.cc/vodtype/1-{index}.html``, collects the ``<a>``
    tags whose class is ``stui-vodlist__thumb lazyload`` and saves the image
    referenced by each tag's ``data-original`` attribute to
    ``images/{index}- {title}.jpg``.

    Args:
        index: Page number (or its string form) substituted into the page URL
            and into the output file names.
    """
    import os  # local import: only this function needs it

    url = f"http://2chsck.cc/vodtype/1-{index}.html"
    # A timeout keeps a stalled server from blocking this thread forever.
    resp = requests.get(url, timeout=10)
    html = resp.text

    page = BeautifulSoup(html, "html.parser")

    # Anchor tags that carry the image URL and the title.
    jpg = page.findAll("a", attrs={"class": "stui-vodlist__thumb lazyload"})

    # open() does not create missing directories, so make sure the target exists.
    os.makedirs("images", exist_ok=True)

    for item in jpg:
        jpgHref = item.get("data-original")
        if not jpgHref:
            # No image URL on this tag; requests.get() would raise on it.
            continue
        photo = requests.get(jpgHref, timeout=10)
        # Path separators in the title would redirect the file into a
        # (non-existent) subdirectory, so neutralise them.
        jpgName = str(item.get("title")).replace("/", "_").replace("\\", "_")
        with open(f"images/{index}- {jpgName}.jpg", mode="wb") as f:
            f.write(photo.content)


def main():
    """Start one thread per listing page, for pages 101 through 200."""
    page_no = 101
    while page_no <= 200:
        # The page number is handed to func as a string.
        Thread(target=func, args=(str(page_no),)).start()
        page_no += 1
main()

线程池

# import requests
# from bs4 import BeautifulSoup
# from concurrent.futures import ThreadPoolExecutor
#
#
# def func(page):
#     url = f"https://pic.netbian.com/4kmeinv/index_{page}.html"
#     resp = requests.get(url)
#     resp.encoding = "gbk"
#     html = resp.text
#     page = BeautifulSoup(html, "html.parser")
#     ul = page.find("div", attrs={"class": "slist"})
#     img = ul.findAll("img")
#
#     for item in img:
#         src = "https://pic.netbian.com" + item.get("src")
#         photo = requests.get(src)
#         # print(src)
#         try:
#             with open(f"{src}.jpg", mode="wb") as f:
#                 f.write(photo.content)
#         except:
#             print("error")
#
#
# def main():
#     # 线程池
#     with ThreadPoolExecutor(3) as t:
#         for i in range(2, 10):
#             t.submit(func, i)
#
#
# if __name__ == "__main__":
#     main()


# ----------------------       ^待查询^        -----------------------------

from concurrent.futures import ThreadPoolExecutor

线程池案例

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Request headers passed to requests.get() in download(); the User-Agent is
# a desktop browser identification string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
}

# Shared output file, opened in append mode as UTF-8; download() writes one
# comma-separated row per line to it.
f = open("新发地.csv", mode="a", encoding="utf-8")


def download(url):
    """Fetch one listing page and append its rows to the shared CSV file.

    For every ``<a>`` directly under an ``<li class="market-list-item">``, the
    first four text nodes found under its ``<span>`` children are joined with
    commas and written as one line to the module-level file ``f``.

    Args:
        url: Address of the page to fetch.
    """
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    a_list = tree.xpath("//li[@class='market-list-item']/a")

    # This function is submitted to a thread pool and every task writes to the
    # same file object. Writing the row and its newline in two separate calls
    # lets another thread's output land between them, so assemble the whole
    # page first and hand it to the file in a single write.
    rows = [",".join(a.xpath("./span/text()")[:4]) + "\n" for a in a_list]
    if rows:
        f.write("".join(rows))
    print(f"{url} 已完成")

# Fetch listing pages 1-99 on a pool of 10 worker threads.
# (The former outer ``for i in range(1, 2)`` ran exactly once and its loop
# variable was shadowed by the inner loop, so it has been dropped.)
try:
    with ThreadPoolExecutor(10) as t:
        for i in range(1, 100):

            # Xinfadi: too many requests trigger a verification step;
            # still unresolved (2023-04-18 13:16:28).
            url = f"https://www.cnhnb.com/hangqing/cdlist-2003192-0-0-0-0-{i}/"
            t.submit(download, url)
finally:
    # Leaving the ``with`` block waits for all submitted tasks, so closing
    # here flushes every buffered row and releases the file handle.
    f.close()

多进程

from multiprocessing import Process


def func(name):
    """Print ``name`` next to every integer from 0 to 999, one pair per line."""
    tick = 0
    while tick < 1000:
        print(name, tick)
        tick += 1


if __name__ == '__main__':
    # Build both worker processes first, then launch them in creation order.
    workers = [
        Process(target=func, args=(who,))
        for who in ("周杰伦", "林俊杰")
    ]
    for worker in workers:
        worker.start()


"""
何时使用多线程 何时使用多进程
1. 多线程:任务相对统一,互相特别的相似
2. 多进程:多个任务相互独立,很少有交集
"""
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值