唯美少女网页爬虫源码


# -*- coding: utf-8 -*-
"""
@author: tanderick
"""
import requests
import re
import time
import os
import urllib.parse
# Root directory for downloaded images.
filepath = r'C:/ml/data/image/vm/'
# makedirs creates missing parent directories too; plain os.mkdir would
# raise FileNotFoundError if e.g. C:/ml/data/image did not already exist.
os.makedirs(filepath, exist_ok=True)

keyword = '校园'  # search term
kw = urllib.parse.quote(keyword)
# Desktop-browser User-Agent so the site does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
}
t = requests.get('https://www.vmgirls.com/?s=' + kw, headers=headers)
html = t.text

# Each search result is an <a href="..." title="..."> link: (album_url, album_title).
m_urls = re.findall('<a href="(.*?)" title="(.*?)" ', html)

for m_url, name in m_urls:

    t = requests.get(m_url, headers=headers)
    html = t.text
    # Image URLs appear in the album page as plain http...jpg references;
    # the capture holds the middle of the URL, re-assembled below.
    urls = re.findall('http(.*?)jpg', html)
    # NOTE(review): `name` comes straight from the HTML title attribute and
    # may contain characters invalid in Windows paths — confirm before reuse.
    filepath1 = filepath + name + '/'
    os.makedirs(filepath1, exist_ok=True)
    time.sleep(5)  # throttle between album pages to be polite to the server
    for url in urls:
        url = urllib.parse.unquote('http' + url + 'jpg')
        filename = url.split('/')[-1]
        response = requests.get(url, headers=headers)
        # Skip non-200 responses instead of saving an error page as a .jpg.
        if response.status_code != 200:
            continue
        with open(filepath1 + filename, 'wb') as f:
            f.write(response.content)
        print(url + '下载完成')

这个是优化了一下结构的,看着舒服点

# -*- coding: utf-8 -*-
"""
@author: tanderick
"""
import requests
import re
import time
import os
import urllib.parse
#调用包
def search(keyword, headers, filepath):
    """Search vmgirls.com for *keyword* and return album (url, title) pairs.

    Also ensures the download root *filepath* exists.

    keyword:  search term (will be percent-encoded).
    headers:  HTTP headers dict passed straight to requests.get.
    filepath: root directory for downloads, created if missing.
    Returns a list of (album_url, album_title) tuples scraped from the
    search-result links.
    """
    kw = urllib.parse.quote(keyword)
    t = requests.get('https://www.vmgirls.com/?s=' + kw, headers=headers)
    m_html = t.text
    # Each result link carries both the album URL and its title.
    m_urls = re.findall('<a href="(.*?)" title="(.*?)" ', m_html)
    # makedirs creates missing parents too; the original os.mkdir raised
    # FileNotFoundError when any intermediate directory did not exist,
    # and FileExistsError on a re-run race.
    os.makedirs(filepath, exist_ok=True)
    return m_urls
#搜索内容并返回二级目录网址    
def download(urls, filepath1, headers=None):
    """Download every image referenced in *urls* into directory *filepath1*.

    urls:      list of URL fragments as produced by
               re.findall('http(.*?)jpg', ...) — re-assembled here.
    filepath1: existing target directory path ending with '/'.
    headers:   optional HTTP headers; defaults to a desktop-browser
               User-Agent. Previously this function read a module-level
               `headers` global, which raised NameError when the module
               was imported instead of run as a script.
    Returns None.
    """
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'}
    for url in urls:
        url = urllib.parse.unquote('http' + url + 'jpg')
        filename = url.split('/')[-1]
        response = requests.get(url, headers=headers)
        # Only save (and report) successful responses — the original wrote
        # error pages to disk as .jpg and printed success regardless.
        if response.status_code == 200:
            with open(filepath1 + filename, 'wb') as f:
                f.write(response.content)
            print(url + '下载完成')
        time.sleep(1)  # throttle per image to be polite to the server
    return
#命名下载图片
def get_url(m_urls, filepath, headers):
    """Fetch each album page, scrape its image URLs and download them.

    m_urls:   list of (album_url, album_title) tuples from search().
    filepath: root download directory; one subdirectory is created per
              album title.
    headers:  HTTP headers dict for the page requests.
    Returns (urls, filepath1) of the LAST album processed, or
    ([], filepath) when m_urls is empty — the original raised NameError
    on an empty list because both names were unbound at the return.
    """
    urls, filepath1 = [], filepath
    for m_url, name in m_urls:
        t = requests.get(m_url, headers=headers)
        html = t.text
        # Image URLs appear as plain http...jpg references in the page.
        urls = re.findall('http(.*?)jpg', html)
        # NOTE(review): `name` is raw HTML title text and may contain
        # characters invalid in Windows paths — confirm before reuse.
        filepath1 = filepath + name + '/'
        # makedirs tolerates an existing directory and creates parents;
        # os.mkdir raised on both.
        os.makedirs(filepath1, exist_ok=True)

        download(urls, filepath1)
    return (urls, filepath1)
#需找图片下载地址
if __name__ == '__main__':
    # Script entry point: search for the keyword, then download every
    # album found. These names intentionally stay at module level:
    # download() reads `headers` as a global.
    keyword = '校园'  # search term
    filepath = r'C:/ml/data/image/vm/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }
    m_urls = search(keyword, headers, filepath)
    get_url(m_urls, filepath, headers)
# main program
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值