python自定义多线程爬虫例子,3dmax学习资料,在爬虫中使用try,避免运行不了

这个是不用多线程的版本,可以看看它卡在了哪里,为什么会卡半天




def dmax():
    """Crawl the 3dmax8.com tutorial index page, print "<title>:<url>" for
    every list item that contains a link, then crawl each linked page
    sequentially and print its entries too (the slow, single-threaded variant).

    No parameters, no return value — output goes to stdout.
    """
    import requests
    import re
    from bs4 import BeautifulSoup

    url = 'http://www.3dmax8.com/3dmax/peixun/3dmax2020/'
    # url = 'http://www.3dmax8.com/3dmax/peixun/3dmax2019/'

    # BUG FIX: the original line-continued string embedded long runs of
    # spaces inside the User-Agent value; build it cleanly instead.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36 Edg/79.0.309.71'}

    # Compile both patterns once instead of once per <li>.
    zh_re = re.compile(r'[\u4e00-\u9fa5]+')  # runs of Chinese characters
    url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def _fetch(page_url, timeout=10):
        """GET page_url with our UA, decode as gbk, return parsed soup.

        BUG FIX: the original called requests.get(url, headers), which passes
        the dict as the `params` argument (query string), so the User-Agent
        header was never actually sent. It must be a keyword argument.
        """
        response = requests.get(str(page_url), headers=headers, timeout=timeout)
        response.encoding = 'gbk'
        return BeautifulSoup(response.text, 'html.parser')

    def _extract(tag):
        """Return (title, link) from a <li> tag, or None when it has no URL."""
        text = str(tag)
        links = url_re.findall(text)
        if not links:  # list items without a link are skipped, not errors
            return None
        title = ''.join(zh_re.findall(text))  # join Chinese fragments
        return title, links[0]

    # First pass: the index page — print each entry and collect its URL.
    kk = []
    for li in _fetch(url).find_all('li'):
        item = _extract(li)
        if item is None:
            continue
        title, link = item
        kk.append(link)
        print(title + ':' + link)

    def pa(page_url):
        """Crawl one article page and print every title:url pair found.

        BUG FIX: in the original, the printing block sat OUTSIDE the <li>
        loop, so only the last list item of each page was ever printed.
        """
        try:
            soup = _fetch(page_url)
        except requests.RequestException:
            return  # best-effort crawl: skip pages that fail to load
        for li in soup.find_all('li'):
            item = _extract(li)
            if item is not None:
                print(item[0] + ':' + item[1])

    # Sequential crawl: each page blocks until the previous one finished —
    # this is exactly where the single-threaded version "hangs" for so long.
    for link in kk:
        pa(link)

# Guard the script entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    dmax()

下面的这个是利用了多线程的代码,看看速度是多么的流畅

def dmax():
    """Crawl the 3dmax8.com tutorial index page, print "<title>:<url>" for
    every list item that contains a link, then crawl all linked pages
    concurrently using one thread per page (the fast, multi-threaded variant).

    No parameters, no return value — output goes to stdout.
    """
    import requests
    import re
    from threading import Thread
    from bs4 import BeautifulSoup

    url = 'http://www.3dmax8.com/3dmax/peixun/3dmax2020/'
    # url = 'http://www.3dmax8.com/3dmax/peixun/3dmax2019/'

    # BUG FIX: the original line-continued string embedded long runs of
    # spaces inside the User-Agent value; build it cleanly instead.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36 Edg/79.0.309.71'}

    # Compile both patterns once instead of once per <li>.
    zh_re = re.compile(r'[\u4e00-\u9fa5]+')  # runs of Chinese characters
    url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def _fetch(page_url, timeout=10):
        """GET page_url with our UA, decode as gbk, return parsed soup.

        BUG FIX: the original called requests.get(url, headers), which passes
        the dict as the `params` argument (query string), so the User-Agent
        header was never actually sent. It must be a keyword argument.
        """
        response = requests.get(str(page_url), headers=headers, timeout=timeout)
        response.encoding = 'gbk'
        return BeautifulSoup(response.text, 'html.parser')

    def _extract(tag):
        """Return (title, link) from a <li> tag, or None when it has no URL."""
        text = str(tag)
        links = url_re.findall(text)
        if not links:  # list items without a link are skipped, not errors
            return None
        title = ''.join(zh_re.findall(text))  # join Chinese fragments
        return title, links[0]

    # First pass: the index page — print each entry and collect its URL.
    kk = []
    for li in _fetch(url).find_all('li'):
        item = _extract(li)
        if item is None:
            continue
        title, link = item
        kk.append(link)
        print(title + ':' + link)

    def pa(page_url):
        """Crawl one article page and print every title:url pair found.

        BUG FIX: in the original, the printing block sat OUTSIDE the <li>
        loop, so only the last list item of each page was ever printed.
        """
        try:
            soup = _fetch(page_url)
        except requests.RequestException:
            return  # best-effort crawl: skip pages that fail to load
        for li in soup.find_all('li'):
            item = _extract(li)
            if item is not None:
                print(item[0] + ':' + item[1])

    # Concurrent crawl: the downloads overlap, which is why this version is
    # so much faster — the GIL is released while each thread waits on I/O.
    threads = []
    for link in kk:
        t = Thread(target=pa, args=(link,))
        t.start()
        threads.append(t)
    # BUG FIX: the original never joined, so dmax() returned while worker
    # threads were still running; wait for all pages to finish.
    for t in threads:
        t.join()

# Guard the script entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    dmax()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值