# 多线程爬取美图网照片 — multithreaded scraper for mmonly.cc photo galleries

import requests,re,threading
from bs4 import BeautifulSoup
def getPicUrl1(url):
    """Fetch one gallery page and extract the photo (.jpg) URL from it.

    Parameters
    ----------
    url : str
        URL of a gallery page on mmonly.cc.

    Returns
    -------
    str | None
        The image URL on success, otherwise ``None``.
    """
    resp = requests.get(url)
    resp.encoding = 'GBK'  # the site serves GBK-encoded pages
    html = resp.text
    # Escape the literal dots in the host (the original pattern let '.'
    # match any character) and use a non-greedy '.+?' so the match stops
    # at the first 'jpg' instead of swallowing everything up to the last
    # one on the line.
    match = re.search(r'http://t1\.mmonly\.cc/uploads/tu/\d{6}/\d{4}/.+?jpg', html)
    if match is None:
        # Narrow failure path: only "no match", not every possible error,
        # as the original bare `except:` did.
        print("失败")
        return None
    print("成功")  # only report success once the match actually exists
    return match.group(0)


def getPic(src):
    """Download every image of a paginated gallery starting at *src*.

    Pagination on mmonly.cc is: page 1 is the base URL, later pages are
    ``<base>_<i>.html`` starting at i=2 (there is no ``_1`` page).  Each
    image is written to the current directory as ``<title><i>.jpg``,
    where the title is the first run of Chinese characters in the page's
    ``<h1>``.

    Parameters
    ----------
    src : str
        URL of the gallery's first page.
    """
    baseSrc = src  # keep the unmodified first-page URL for building page URLs
    for i in range(50):
        if i == 0:
            src = baseSrc
        elif i == 1:
            # The site numbers pages base, _2, _3, ... — no _1 page exists.
            continue
        else:
            src = baseSrc.replace(".html", '') + '_' + str(i) + '.html'
        print(src)
        page = requests.get(src)
        # Past the last page the site stops answering 200; the original
        # compared str(response) against '<Response [200]>', which is a
        # fragile repr check — use status_code directly.
        if page.status_code != 200:
            break
        page.encoding = 'GBK'
        text = page.text
        jpgUrl = getPicUrl1(src)
        if jpgUrl is None:
            # Original passed None to requests.get and crashed; skip instead.
            continue
        jpg = requests.get(jpgUrl)
        soup = BeautifulSoup(text, 'lxml')
        h1 = soup.find_all('h1')
        title = re.search(r'[\u4e00-\u9fa5]+', str(h1))
        if title is None:
            # No Chinese title found — original raised AttributeError here.
            continue
        name = title.group() + str(i) + '.jpg'
        print(name)
        # `with` closes the file; the original's explicit f.close() inside
        # the with-block was redundant.
        with open(name, 'wb') as f:
            f.write(jpg.content)
        print("%s写入成功" % name)
        
def main():
    """Collect gallery URLs from the tag index page and download each
    gallery on its own thread.

    The threaded version existed in the source only as a dead
    triple-quoted draft with two typos (``htreading`` for ``threading``,
    ``strat`` for ``start``), leaving the "multithreaded" script fully
    serial; the draft is fixed and enabled here.
    """
    ths = []
    mainUrl = 'http://www.mmonly.cc/tag/cs/'
    html = requests.get(mainUrl)
    html.encoding = 'GBK'  # index page is GBK-encoded too
    html = html.text
    # Gallery links on the index page all contain '/mmtp' and end in .html.
    urls = re.findall(r'http:.+/mmtp.+html', html)
    for url in urls:
        th = threading.Thread(target=getPic, args=[url])
        th.start()
        ths.append(th)
    # Wait for all downloads before returning.
    for th in ths:
        th.join()


if __name__ == "__main__":
    main()


# --- Non-code residue scraped from the hosting web page (CSDN comment /
# --- red-packet payment widget); kept verbatim below, commented out so
# --- the file parses as Python. Not part of the program.
# 评论
# 添加红包
#
# 请填写红包祝福语或标题
#
# 红包个数最小为10个
#
# 红包金额最低5元
#
# 当前余额3.43前往充值 >
# 需支付:10.00
# 成就一亿技术人!
# 领取后你会自动成为博主和红包主的粉丝 规则
# hope_wisdom
# 发出的红包
# 实付
# 使用余额支付
# 点击重新获取
# 扫码支付
# 钱包余额 0
#
# 抵扣说明:
#
# 1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
# 2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。
#
# 余额充值