import requests
import re
import threading
from bs4 import BeautifulSoup
def getPicUrl1(url):  # extract the direct .jpg link from a photo page
    html = requests.get(url)
    html.encoding = 'GBK'
    html = html.text
    try:
        jpg = re.search(r'http://t1.mmonly.cc/uploads/tu/\d{6}/\d{4}/.+jpg', html)
        src = jpg.group(0)
        print("success")
        return src
    except AttributeError:  # re.search returned None: no image link on this page
        print("failed")
def getPic(src):
    for i in range(50):
        if i == 0:
            startSrc = src  # remember the first page URL of the album
        elif i == 1:
            continue  # page 2 is '_2.html'; there is no '_1.html'
        else:
            src = src.replace(".html", '') + '_' + str(i) + '.html'
        print(src)
        html = requests.get(src)
        if html.status_code != 200:  # past the last page of this album
            break
        html.encoding = 'GBK'
        html = html.text
        jpgUrl = getPicUrl1(src)
        jpg = requests.get(jpgUrl)
        soup = BeautifulSoup(html, 'lxml')
        h1 = soup.find_all('h1')
        strH1 = str(h1)
        name = re.search(r'[\u4e00-\u9fa5]+', strH1)  # Chinese title from the <h1> tag
        name = name.group()
        name = name + str(i) + '.jpg'
        # path = r'E/pics/%s' % name
        print(name)
        with open(name, 'wb') as f:
            f.write(jpg.content)
            print("%s written successfully" % name)
        # print("%s write failed" % name)
        src = startSrc  # reset to the first page URL before building the next one
def main():
    ths = []
    mainUrl = 'http://www.mmonly.cc/tag/cs/'
    html = requests.get(mainUrl)
    html.encoding = 'GBK'
    html = html.text
    urls = re.findall(r'http:.+/mmtp.+html', html)
    for url in urls:
        getPic(url)
    # Threaded alternative to the serial loop above:
    """
    th = threading.Thread(target=getPic, args=[url])
    th.start()
    ths.append(th)
    for i in ths:
        i.join()
    """

main()
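
The title advertises multi-threading, but the threaded block above is left commented out, so the albums are downloaded one after another. Below is a minimal sketch of a threaded entry point that reuses the getPic function defined above; it swaps the raw threading.Thread calls for concurrent.futures.ThreadPoolExecutor, and the worker count of 4 is an assumption chosen only to avoid flooding the site.

from concurrent.futures import ThreadPoolExecutor

def main_threaded():
    # Same listing page and regex as main() above.
    mainUrl = 'http://www.mmonly.cc/tag/cs/'
    html = requests.get(mainUrl)
    html.encoding = 'GBK'
    urls = re.findall(r'http:.+/mmtp.+html', html.text)
    # Four workers download several albums in parallel;
    # pool.map blocks until every getPic call has finished.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(getPic, urls)

The pool handles starting and joining the threads, which replaces the manual th.start() / th.join() bookkeeping in the commented-out block.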
Scraping photos from 美图网 (mmonly.cc) with multiple threads