多线程爬虫案例

1.目标网站:https://www.1point3acres.com/bbs/forum-28-1.html ,从此页开始的若干页

2.首先创建两个队列,一个页面队列和一个用于I/O的队列。顺便创建个锁,防止写的时候出问题

# URL queue consumed by the spider (scraper) threads.
page_queue = Queue()
# (title, link) tuples waiting to be written to CSV by writer threads.
joke_queue = Queue()
# Lock serialising CSV writes across the writer threads.
gLock = threading.Lock()

3.用CSV存储数据

# Append mode; newline='' is required by the csv module to avoid blank rows
# on Windows.
# NOTE(review): fp is never closed in this snippet -- buffered rows may be
# lost if the process exits early; confirm a close/flush happens elsewhere.
fp = open('asd.csv','a+',newline='',encoding='utf-8')
url = 'https://www.1point3acres.com/bbs/forum-28-1.html'
writer = csv.writer(fp)
# CSV header row: title, link.
writer.writerow(('标题','链接'))

4.寻找最大页码

max_page = find_max_page(url)
def find_max_page(url):
    """Return the forum's last page number parsed from its pager, or None.

    NOTE(review): relies on comp(), which is commented out in this file --
    calling this as-is raises NameError; restore comp() first.
    """
    selector = comp(url)
    # The pager <span> text contains the total page count (digits inside it).
    pages = selector.xpath('//div[@class="pg"]//span/text()')
    if not pages:
        return None
    # Raw string avoids Python's invalid '\d' escape warning; guard against
    # a span with no digits (the original raised IndexError there).
    digits = re.findall(r'\d+', pages[0])
    return int(digits[0]) if digits else None

5.循环入队,把每一页的 URL 压入页面队列内

    # Enqueue one URL per forum page.
    # NOTE(review): range(1, max_page) skips the last page -- probably
    # intended to be range(1, max_page + 1); confirm.
    for x in range(1,max_page):
        url = 'https://www.1point3acres.com/bbs/forum-28-{}.html'.format(x)
        page_queue.put(url)

    # Four scraper threads share the page queue.
    for x in range(4):
        t = BSSpider(page_queue,joke_queue)
        t.start()

    # Four writer threads drain the result queue into the CSV.
    for x in range(4):
        t = BSWriter(joke_queue,writer,gLock)
        t.start()

6.解析线程代码如下:

class BSSpider(threading.Thread):
    """Scraper thread: takes forum page URLs from page_queue, extracts
    thread titles and links via XPath, and puts (title, link) tuples
    onto joke_queue for the writer threads."""

    # Fixed pool of browser User-Agents; one is picked when the class is
    # created. (The original called get_ua(), which is commented out in
    # this file and raised NameError at class-definition time.)
    _USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
    ]
    # HTTP header names must not contain spaces: the original keys such as
    # 'accept - encoding' were invalid header names.
    headers = {
        'User-Agent': random.choice(_USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }
    # Proxy pool. The original wrote 15 duplicate 'http' keys in one dict,
    # so only the final proxy survived; a list keeps all of them and one is
    # chosen at random per request.
    # NOTE(review): these are plain-http proxies while the target URLs are
    # https -- requests only applies a proxy whose key matches the URL
    # scheme, so as configured the proxy may never be used; confirm intent.
    _PROXY_POOL = [
        '123.54.44.60:9999',
        '182.101.207.11:8080',
        '121.232.148.231:9000',
        '183.166.163.61:9999',
        '175.44.108.179:9999',
        '175.43.155.36:9999',
        '39.108.59.34:8118',
        '219.159.38.207:56210',
        '113.194.48.14:9999',
        '163.125.220.175:8118',
        '123.149.136.180:9999',
        '121.232.194.37:9000',
        '1.85.5.66:8060',
        '125.108.100.20:9000',
        '114.101.252.37:3000',
    ]

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """page_queue holds page URLs to scrape; joke_queue receives results."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Forum hrefs in the page are relative; this prefix makes them absolute.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        # Hoisted out of the loop: raise urllib3's connection retry count
        # once instead of re-assigning it per page.
        requests.adapters.DEFAULT_RETRIES = 30
        while True:
            try:
                # get_nowait() avoids the empty()/get() race the original
                # had between competing spider threads.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            print(url)
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    proxies={'http': random.choice(self._PROXY_POOL)},
                    timeout=100,
                ).text
            except requests.RequestException:
                # Skip a failing page instead of letting the exception
                # kill the whole thread.
                continue
            html = etree.HTML(response)
            titles = html.xpath('//a[contains(@class,"xst")]/text()')
            urls = html.xpath('//a[contains(@class,"xst")]/@href')
            for title, link in zip(titles, urls):
                # Push the absolute link with its title to the I/O queue.
                self.joke_queue.put((title, self.base_domain + link))
            print('完成一页')

7.I/O线程代码如下:

class BSWriter(threading.Thread):
    """Writer thread: drains (title, link) tuples from joke_queue and
    appends them as CSV rows; exits after 40s without new data."""

    def __init__(self, joke_queue, writer, gLock, *args, **kwargs):
        """joke_queue supplies (title, link) tuples; writer is the shared
        csv.writer; gLock serialises writes across writer threads."""
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # csv.writer is not documented as thread-safe, so writes are locked.
        self.lock = gLock

    def run(self):
        while True:
            try:
                # Block up to 40s for new data; a timeout means the
                # producers have finished.
                title, link = self.joke_queue.get(timeout=40)
            except Empty:
                # The original bare `except:` also silently swallowed real
                # errors (e.g. writing to a closed file); only the queue
                # timeout should end the thread.
                break
            # `with` guarantees the lock is released even if writerow raises.
            with self.lock:
                self.writer.writerow((title, link))

8.完整代码如下:

# -*- encoding: utf-8 -*-
#@Time: 15:40
#@Software:PyCharm
import csv
import random
import re
import ssl
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Disable HTTPS certificate verification globally to work around SSL errors
# when going through the proxy pool.
# NOTE(review): this weakens security for every HTTPS request the process
# makes; confirm it is really needed.
ssl._create_default_https_context = ssl._create_unverified_context
# def get_ua():
#     user_agents = [
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
#         "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
#         "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
#         "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
#         "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
#         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
#     ]
#     user_agent = random.choice(user_agents)  # random.choice(),从列表中随机抽取一个对象
#     return user_agent

class BSSpider(threading.Thread):
    """Scraper thread: takes forum page URLs from page_queue, extracts
    thread titles and links via XPath, and puts (title, link) tuples
    onto joke_queue for the writer threads."""

    # Fixed pool of browser User-Agents; one is picked when the class is
    # created. (The original called get_ua(), which is commented out in
    # this file and raised NameError at class-definition time.)
    _USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
    ]
    # HTTP header names must not contain spaces: the original keys such as
    # 'accept - encoding' were invalid header names.
    headers = {
        'User-Agent': random.choice(_USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }
    # Proxy pool. The original wrote 15 duplicate 'http' keys in one dict,
    # so only the final proxy survived; a list keeps all of them and one is
    # chosen at random per request.
    # NOTE(review): these are plain-http proxies while the target URLs are
    # https -- requests only applies a proxy whose key matches the URL
    # scheme, so as configured the proxy may never be used; confirm intent.
    _PROXY_POOL = [
        '123.54.44.60:9999',
        '182.101.207.11:8080',
        '121.232.148.231:9000',
        '183.166.163.61:9999',
        '175.44.108.179:9999',
        '175.43.155.36:9999',
        '39.108.59.34:8118',
        '219.159.38.207:56210',
        '113.194.48.14:9999',
        '163.125.220.175:8118',
        '123.149.136.180:9999',
        '121.232.194.37:9000',
        '1.85.5.66:8060',
        '125.108.100.20:9000',
        '114.101.252.37:3000',
    ]

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """page_queue holds page URLs to scrape; joke_queue receives results."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Forum hrefs in the page are relative; this prefix makes them absolute.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        # Hoisted out of the loop: raise urllib3's connection retry count
        # once instead of re-assigning it per page.
        requests.adapters.DEFAULT_RETRIES = 30
        while True:
            try:
                # get_nowait() avoids the empty()/get() race the original
                # had between competing spider threads.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            print(url)
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    proxies={'http': random.choice(self._PROXY_POOL)},
                    timeout=100,
                ).text
            except requests.RequestException:
                # Skip a failing page instead of letting the exception
                # kill the whole thread.
                continue
            html = etree.HTML(response)
            titles = html.xpath('//a[contains(@class,"xst")]/text()')
            urls = html.xpath('//a[contains(@class,"xst")]/@href')
            for title, link in zip(titles, urls):
                # Push the absolute link with its title to the I/O queue.
                self.joke_queue.put((title, self.base_domain + link))
            print('完成一页')


class BSWriter(threading.Thread):
    """Writer thread: drains (title, link) tuples from joke_queue and
    appends them as CSV rows; exits after 40s without new data."""

    def __init__(self, joke_queue, writer, gLock, *args, **kwargs):
        """joke_queue supplies (title, link) tuples; writer is the shared
        csv.writer; gLock serialises writes across writer threads."""
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # csv.writer is not documented as thread-safe, so writes are locked.
        self.lock = gLock

    def run(self):
        while True:
            try:
                # Block up to 40s for new data; a timeout means the
                # producers have finished.
                title, link = self.joke_queue.get(timeout=40)
            except Empty:
                # The original bare `except:` also silently swallowed real
                # errors (e.g. writing to a closed file); only the queue
                # timeout should end the thread.
                break
            # `with` guarantees the lock is released even if writerow raises.
            with self.lock:
                self.writer.writerow((title, link))


def main():
    """Set up the CSV output, fill the page queue, and run 4 scraper plus
    4 writer threads; joins them all before closing the output file."""
    page_queue = Queue()
    joke_queue = Queue()
    gLock = threading.Lock()
    fp = open('asd.csv', 'a+', newline='', encoding='utf-8')
    writer = csv.writer(fp)
    writer.writerow(('标题', '链接'))
    url = 'https://www.1point3acres.com/bbs/forum-28-1.html'
    max_page = find_max_page(url)
    # find_max_page returns None when the pager cannot be parsed; the
    # original then crashed with TypeError inside range().
    if not max_page:
        fp.close()
        return
    # Off-by-one fix: the original range(1, max_page) skipped the last page.
    for page in range(1, max_page + 1):
        page_queue.put('https://www.1point3acres.com/bbs/forum-28-{}.html'.format(page))

    threads = []
    for _ in range(4):
        t = BSSpider(page_queue, joke_queue)
        t.start()
        threads.append(t)

    for _ in range(4):
        t = BSWriter(joke_queue, writer, gLock)
        t.start()
        threads.append(t)

    # Join everything so the file handle is not leaked (the original never
    # closed fp and relied on interpreter shutdown to flush it).
    for t in threads:
        t.join()
    fp.close()

def find_max_page(url):
    """Return the forum's last page number parsed from its pager, or None.

    NOTE(review): relies on comp(), which is commented out in this file --
    calling this as-is raises NameError; restore comp() first.
    """
    selector = comp(url)
    # The pager <span> text contains the total page count (digits inside it).
    pages = selector.xpath('//div[@class="pg"]//span/text()')
    if not pages:
        return None
    # Raw string avoids Python's invalid '\d' escape warning; guard against
    # a span with no digits (the original raised IndexError there).
    digits = re.findall(r'\d+', pages[0])
    return int(digits[0]) if digits else None
# def comp(url):
#     ua = get_ua()
#     headers = {
#         'User-Agent': ua,
#         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
#         'accept - encoding': 'gzip, deflate, br',
#         'accept - language': 'zh - CN, zh;q = 0.9',
#         'referer': 'https://www.1point3acres.com/bbs/',
#         'upgrade - insecure - requests': '1',
#         'Connection': 'keep-alive',
#     }
#     proxies = {
#         'http': '123.54.44.60:9999',
#         'http': '182.101.207.11:8080',
#         'http': '121.232.148.231:9000',
#         'http': '183.166.163.61:9999',
#         'http': '175.44.108.179:9999',
#         'http': '175.43.155.36:9999',
#         'http': '39.108.59.34:8118',
#         'http': '219.159.38.207:56210',
#         'http': '113.194.48.14:9999',
#         'http': '163.125.220.175:8118',
#         'http': '123.149.136.180:9999',
#         'http': '121.232.194.37:9000',
#         'http': '1.85.5.66:8060',
#         'http': '125.108.100.20:9000',
#         'http': '114.101.252.37:3000',
#     }
#     requests.adapters.DEFAULT_RETRIES = 30
#     html_data = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
#     # html_data.encoding = html_data.apparent_encoding
#     html = html_data.text
#     selector = etree.HTML(html)
#     return selector
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值